def test_multiple_attr_filter(self):
    nc = EnhancedDataset(self.input_file)
    grid_spacing_vars = nc.get_variables_by_attributes(
        grid_spacing='4.0 km',
        standard_name='projection_y_coordinate')

    y = nc.variables.get('y')

    self.assertEqual(len(grid_spacing_vars), 1)
    assert y in grid_spacing_vars
def test_single_attr_filter(self):
    nc = EnhancedDataset(self.input_file)
    grid_spacing_vars = nc.get_variables_by_attributes(grid_spacing='4.0 km')

    x = nc.variables.get('x')
    y = nc.variables.get('y')

    self.assertEqual(len(grid_spacing_vars), 2)
    assert x in grid_spacing_vars
    assert y in grid_spacing_vars
def test_int64_dtypes(data, expected_dtype):
    assert get_dtype(data) == expected_dtype
    with EnhancedDataset('foo.nc', 'w') as ncd:
        ncd.createDimension('three', 3)
        v = ncd.createVariable('foo', expected_dtype, ('three',))
        v[:] = data
    os.remove('foo.nc')
def create_file(output, ncfile, varname, df):
    with EnhancedDataset(ncfile) as ncd:
        var = ncd[varname]
        latitude = ncd.get_variables_by_attributes(standard_name='latitude')[0][:]
        longitude = ncd.get_variables_by_attributes(standard_name='longitude')[0][:]
        project = ncd.original_folder
        feature_name = '{}_{}'.format(project, ncd.MOORING).lower()
        station_urn = IoosUrn(authority=ncd.naming_authority,
                              label=feature_name,
                              asset_type='station').urn

        discriminant = ncd.id.replace('-', '_')
        output_filename = '{0}_{1}-{2}_{3}_TO_{4}.nc'.format(
            feature_name,
            var.name,
            discriminant,
            df['time'].min().strftime("%Y%m%dT%H%M%SZ"),
            df['time'].max().strftime("%Y%m%dT%H%M%SZ"))
        output_directory = os.path.join(output, feature_name)

        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)

        file_global_attributes = {k: getattr(ncd, k) for k in ncd.ncattrs()}
        # original_folder is the project name
        file_global_attributes.update(
            dict(title='{} - {}'.format(project, ncd.MOORING),
                 id=feature_name))

        variable_attributes = {k: getattr(var, k) for k in var.ncattrs()}
        # Add the specific sensor as a discriminant
        variable_attributes.update(dict(discriminant=discriminant))

        fillvalue = -9999.9
        if hasattr(var, "_FillValue"):
            fillvalue = var._FillValue

        vertical_datum = None
        if 'crs' in ncd.variables and hasattr(ncd.variables['crs'], 'vertical_datum'):
            vertical_datum = ncd.variables['crs'].vertical_datum

        ts = TimeSeries.from_dataframe(df,
                                       output_directory,
                                       output_filename,
                                       latitude,
                                       longitude,
                                       station_urn,
                                       file_global_attributes,
                                       var.standard_name,
                                       variable_attributes,
                                       sensor_vertical_datum=vertical_datum,
                                       fillvalue=fillvalue,
                                       vertical_axis_name='height',
                                       vertical_positive='down')
        ts.add_instrument_variable(variable_name=var.standard_name)
        del ts
def normalize_vectors(netcdf_file):
    with EnhancedDataset(netcdf_file, 'a') as nc:
        east = None
        north = None
        for v in nc.variables:
            nc_var = nc.variables.get(v)
            if hasattr(nc_var, 'standard_name') and nc_var.standard_name == 'eastward_sea_water_velocity':
                east = nc_var
                continue
            if hasattr(nc_var, 'standard_name') and nc_var.standard_name == 'northward_sea_water_velocity':
                north = nc_var
                continue

        std_names = []
        for varname in nc.variables:
            var = nc.variables.get(varname)
            if hasattr(var, 'standard_name'):
                std_names.append(var.standard_name)

        # Only add the variables if they don't already exist
        if east is not None and north is not None and \
                'sea_water_speed' not in std_names and \
                'direction_of_sea_water_velocity' not in std_names:
            # We have vectors... create the speed and direction variables
            speed = np.sqrt(np.square(east[:]) + np.square(north[:]))
            direction = np.degrees(np.arctan2(north[:], east[:]))

            east_fill_value = east._FillValue if hasattr(east, '_FillValue') else np.nan

            spd = nc.createVariable('CS_300', east.dtype, east.dimensions, fill_value=east_fill_value)
            spd.standard_name = 'sea_water_speed'
            spd.long_name = "Current speed"
            spd.units = 'm/s'
            spd.epic_code = 300
            spd[:] = speed

            drc = nc.createVariable('CD_310', east.dtype, east.dimensions, fill_value=east_fill_value)
            drc.standard_name = 'direction_of_sea_water_velocity'
            drc.long_name = "Current direction"
            drc.units = 'degree'
            drc.epic_code = 310
            drc[:] = direction
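# Worked example (illustrative, not part of the module): the speed/direction
# math used above for a single u/v pair. np.arctan2(north, east) yields the
# mathematical angle measured counterclockwise from east; a compass-style
# "direction toward" (clockwise from true north), which is how
# direction_of_sea_water_velocity is typically reported, would swap the
# arguments. The sample numbers below are made up.
import numpy as np

east_component = np.array([0.0])   # m/s, eastward
north_component = np.array([0.5])  # m/s, northward (flow toward the north)

speed = np.sqrt(np.square(east_component) + np.square(north_component))       # 0.5 m/s
math_angle = np.degrees(np.arctan2(north_component, east_component))          # 90.0 (CCW from east)
compass_to = np.degrees(np.arctan2(east_component, north_component)) % 360.0  # 0.0 (CW from north)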
def normalize_units(netcdf_file):
    with EnhancedDataset(netcdf_file, 'a') as nc:
        for v in nc.variables:
            nc_var = nc.variables.get(v)
            if hasattr(nc_var, 'units') and nc_var.units == "K":
                # Convert kelvin to Celsius
                nc_var[:] = nc_var[:] - 273.15
                nc_var.units = "degree_Celsius"
            elif hasattr(nc_var, 'standard_name') and nc_var.standard_name == 'sea_surface_wave_from_direction':
                # Convert "From" to "To" direction
                nc_var[:] = (nc_var[:] + 180) % 360
                nc_var.standard_name = 'sea_surface_wave_to_direction'
                nc_var.long_name = "Wave Direction (to TN)"
def test_generic_masked_bad_min_max_value(self):
    _, tpath = tempfile.mkstemp(suffix='.nc', prefix='pyaxiom-test')
    shutil.copy2(self.input_file, tpath)

    with EnhancedDataset(tpath, 'a') as ncd:
        v = ncd.variables['v_component_wind_true_direction_all_geometries']
        v.valid_min = 0.1
        v.valid_max = 0.1
        r = generic_masked(v[:], attrs=ncd.vatts(v.name))
        rflat = r.flatten()
        assert rflat[~rflat.mask].size == 0

        # Create a byte variable with a float valid_min and valid_max
        # to make sure it doesn't error
        b = ncd.createVariable('imabyte', 'b')
        b.valid_min = 0
        b.valid_max = 600  # this is over the maximum for a byte and thus invalid
        b[:] = 3
        r = generic_masked(b[:], attrs=ncd.vatts(b.name))
        assert np.all(r.mask == False)  # noqa

        b.valid_min = 0
        b.valid_max = 2
        r = generic_masked(b[:], attrs=ncd.vatts(b.name))
        assert np.all(r.mask == True)  # noqa

        c = ncd.createVariable('imanotherbyte', 'f4')
        c.setncattr('valid_min', '0b')
        c.setncattr('valid_max', '9b')
        c[:] = 3
        r = generic_masked(c[:], attrs=ncd.vatts(c.name))
        assert np.all(r.mask == False)  # noqa

        c = ncd.createVariable('imarange', 'f4')
        c.valid_range = [0.0, 2.0]
        c[:] = 3.0
        r = generic_masked(c[:], attrs=ncd.vatts(c.name))
        assert np.all(r.mask == True)  # noqa

        c.valid_range = [0.0, 2.0]
        c[:] = 1.0
        r = generic_masked(c[:], attrs=ncd.vatts(c.name))
        assert np.all(r.mask == False)  # noqa

    if os.path.exists(tpath):
        os.remove(tpath)
def add_time_bounds(self, delta=None, position=None):
    with EnhancedDataset(self.out_file, 'a') as nc:
        nc.createDimension("bounds", 2)
        time_bounds = nc.createVariable('{}_bounds'.format(self.time_axis_name),
                                        "f8",
                                        ("time", "bounds",),
                                        chunksizes=(1000, 2,))
        time_bounds.units = "seconds since 1970-01-01T00:00:00Z"
        time_bounds.calendar = "gregorian"

        time_objs = netCDF4.num2date(self.time[:],
                                     units=self.time.units,
                                     calendar=self.time.calendar)
        bounds_kwargs = dict(units=time_bounds.units,
                             calendar=time_bounds.calendar)

        if position == "start":
            time_bounds[:] = np.asarray(list(zip(
                self.time[:],
                netCDF4.date2num(time_objs + delta, **bounds_kwargs))))
        elif position == "middle":
            time_bounds[:] = np.asarray(list(zip(
                netCDF4.date2num(time_objs - delta / 2, **bounds_kwargs),
                netCDF4.date2num(time_objs + delta / 2, **bounds_kwargs))))
        elif position == "end":
            time_bounds[:] = np.asarray(list(zip(
                netCDF4.date2num(time_objs - delta, **bounds_kwargs),
                self.time[:])))
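# Hypothetical usage sketch (not from the source): adding cell bounds to an
# already-created TimeSeries instance `ts` for data that represent hourly
# averages stamped at the end of each averaging window. Only `delta` and
# `position` are real parameters of the method above; everything else here is
# illustrative.
from datetime import timedelta

ts.add_time_bounds(delta=timedelta(hours=1), position="end")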
def normalize_time(netcdf_file):
    epoch_units = 'seconds since 1970-01-01T00:00:00Z'
    millisecond_units = 'milliseconds since 1858-11-17T00:00:00Z'

    with EnhancedDataset(netcdf_file, 'a') as nc:
        # Signell said this works, any problems and we can all blame him!
        time_data = netCDF4.num2date(
            (np.int64(nc.variables['time'][:]) - 2400001) * 3600 * 24 * 1000 + nc.variables['time2'][:].__array__(),
            units=millisecond_units)  # noqa
        nc.renameVariable("time", "old_time")
        nc.sync()

        time = nc.createVariable('time', 'f8', ('time'))
        time.units = epoch_units
        time.standard_name = "time"
        time.long_name = "time of measurement"
        time.calendar = "gregorian"
        time[:] = netCDF4.date2num(time_data, units=epoch_units).round()
        return time_data[0]
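# Worked example (illustrative, not from the source): the two-variable EPIC
# time convention assumed above stores 'time' as an integer true Julian day
# and 'time2' as milliseconds within that day. Subtracting 2400001 shifts the
# Julian day onto the Modified Julian Day epoch (1858-11-17T00:00:00Z) named
# in the units string. The example values are made up.
import numpy as np
import netCDF4

epic_time = np.int64(2456968)    # Julian day
epic_time2 = np.int64(43200000)  # 12:00:00 expressed as milliseconds into the day

millis = (epic_time - 2400001) * 3600 * 24 * 1000 + epic_time2
when = netCDF4.num2date(millis, units='milliseconds since 1858-11-17T00:00:00Z')
# expect 2014-11-06 12:00:00 for the example values above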
def add_variable_object(self, varobject, dimension_map=None, reduce_dims=None):
    dimension_map = dimension_map or {}
    reduce_dims = reduce_dims or False

    with EnhancedDataset(self.out_file, 'a') as nc:
        fillvalue = -9999.99
        if hasattr(varobject, '_FillValue'):
            fillvalue = varobject._FillValue

        dims = []
        for n in varobject.dimensions:
            d = dimension_map.get(n, n)
            dim_size = varobject.shape[list(varobject.dimensions).index(n)]
            if reduce_dims is True and dim_size in [0, 1]:
                continue
            if d not in nc.dimensions:
                nc.createDimension(d, dim_size)
            dims.append(d)

        var = nc.createVariable(varobject.name,
                                varobject.dtype,
                                dims,
                                fill_value=fillvalue,
                                zlib=True)
        for k in varobject.ncattrs():
            if k not in ['name', '_FillValue']:
                var.setncattr(k, varobject.getncattr(k))

        if reduce_dims:
            var[:] = varobject[:].squeeze()
        else:
            var[:] = varobject[:]
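# Hypothetical usage sketch (not from the source): copying a variable object
# from a source dataset into a TimeSeries output file, renaming its 'depth'
# dimension to 'z' and dropping length-0/1 dimensions. The file name
# 'source.nc', the variable name 'brange', and the `ts` instance are assumed
# for illustration; the call mirrors how add_variable_object is used later in
# this collection.
with EnhancedDataset('source.nc') as src:
    ts.add_variable_object(src.variables['brange'],
                           dimension_map=dict(depth='z'),
                           reduce_dims=True)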
def download(folder, project_metadata, filesubset, since):
    # Use thredds_crawler to find DAP endpoints of the RAW data.
    total_datasets = []
    skips = Crawl.SKIPS + ['.*OTHER.*', '.*ancillary.*', '.*OLD_VERSIONS.*']

    try:
        for k, v in project_metadata.items():
            # http://regexr.com/3conn
            datasets = Crawl(v['catalog_xml'], select=['(.*)'], skip=skips, after=since).datasets
            logger.info("Found {0} datasets in {1}!".format(len(datasets), k))
            total_datasets += datasets
        logger.info("Found {0} TOTAL datasets!".format(len(total_datasets)))
    except KeyboardInterrupt:
        logger.info("Breaking out of crawling loop.")
        total_datasets = []

    try:
        os.makedirs(folder)
    except OSError:
        pass

    # Save datasets to download directory
    saved_files = []
    for num, d in enumerate(total_datasets):

        if filesubset and d.name.lower() not in filesubset:
            continue

        try:
            http_url = next(s["url"] for s in d.services
                            if s["service"].lower() == "httpserver")
            project_name = http_url.split("/")[-2]
        except StopIteration:
            logger.error("No HTTPServer endpoint found, skipping")
            continue

        # Make download folder
        save_file = os.path.join(folder, project_name, d.name)
        if not os.path.isdir(os.path.dirname(save_file)):
            os.makedirs(os.path.dirname(save_file))

        logger.info("Downloading {0}".format(http_url))
        try:
            with open(save_file, "wb") as f:
                r = requests.get(http_url, stream=True)
                if not r.ok:
                    logger.error("Could not download '{!s}' from '{!s}', skipping".format(d.name, http_url))
                    break
                for block in r.iter_content(1024):
                    if not block:
                        break
                    f.write(block)
        except KeyboardInterrupt:
            logger.info("Breaking out of download loop.")
            raise
        except BaseException:
            logger.error("Could not download... error with HTTP endpoint. Skipping.")
            continue

        # Try to open file; if it fails, writing failed.
        try:
            with EnhancedDataset(save_file, 'a') as nc:
                name, _ = os.path.splitext(d.name)
                nc.id = "{0}/{1}".format(project_name, name)
        except BaseException:
            os.remove(save_file)
            raise
        else:
            logger.info("{!s} saved ({!s}/{!s})".format(d.name, num + 1, len(total_datasets)))
            saved_files.append(save_file)

    return saved_files
def main(output, download_folder, projects, do_download, filesubset=None, since=None):

    if do_download:
        try:
            downloaded_files = download(download_folder, projects, filesubset, since)
        except KeyboardInterrupt:
            logger.exception('Interrupted while downloading datasets from THREDDS')
            downloaded_files = []
    else:
        downloaded_files = glob(os.path.join(download_folder, '**', '*'))
        if since is not None:
            def should_keep(d):
                modt = datetime.utcfromtimestamp(os.path.getmtime(d)).replace(tzinfo=pytz.utc)
                return modt >= since
            downloaded_files = [dl for dl in downloaded_files if should_keep(dl)]

    epic_skips = metadata_codes + voltage_codes + location_codes + time_codes
    downloaded_files = sorted(downloaded_files)

    # Take the downloaded_files and split them up into arrays of related files.
    # This is needed because some of the files need to be combined... they
    # represent the same station/mooring/variables, but at different depths.
    i = 0
    combinations = []
    while i < len(downloaded_files):
        combo = []
        nc_file = os.path.abspath(downloaded_files[i])
        combo.append(nc_file)
        try:
            if filesubset is not None:
                if os.path.basename(nc_file).lower() not in filesubset:
                    # aka "9631ecp-a.nc"
                    # Skip this file!
                    continue

            with EnhancedDataset(nc_file) as tmpnc:
                if projects:
                    if hasattr(tmpnc, 'original_folder') and tmpnc.original_folder.upper() not in projects:
                        continue
                logger.info("Scanned {}".format(nc_file))

            # Now search for files that are of the same var, but with a different depth
            thisbase = os.path.basename(nc_file).lower().split("_d")[0]
            for j in range(i + 1, len(downloaded_files)):
                nextout = os.path.abspath(downloaded_files[j])
                nextbase = os.path.basename(nextout).lower().split("_d")[0]
                if thisbase == nextbase:
                    # Found a match
                    logger.info("Scanned {}".format(nextout))
                    combo.append(nextout)
                    # Now skip file because we added it already
                    i += 1
                else:
                    # Doesn't match, move on to the next outfile
                    break

            # Add to combinations so it is processed
            combinations.append(combo)
        except BaseException:
            logger.exception("Error. Skipping {0}.".format(nc_file))
            continue
        finally:
            i += 1

    # Now iterate over each set of files and combine as necessary
    for c in combinations:

        dataframes_to_create = dict()

        for f in c:
            try:
                with EnhancedDataset(f) as ncd:
                    for var in ncd.get_variables_by_attributes(coordinates=lambda v: v is not None):

                        if not hasattr(var, 'standard_name'):
                            logger.warning("{}: Skipping variable {} because it has no standard_name".format(f, var.name))
                            continue

                        if hasattr(var, 'epic_code') and var.epic_code in epic_skips:
                            logger.warning("{}: Skipping metadata variable {}".format(f, var.standard_name))
                            continue

                        df = get_dataframe_from_variable(ncd, var)

                        if var.name in dataframes_to_create:
                            logger.info("Combining variable {}".format(var.name))
                            old_df = dataframes_to_create[var.name]['frame']
                            dataframes_to_create[var.name]['frame'] = old_df.combine_first(df)
                        else:
                            logger.info("New variable {}".format(var.name))
                            df_dict = dict(frame=df, varname=var.name, ncfile=f)
                            dataframes_to_create[var.name] = df_dict
            except BaseException:
                logger.exception("Error. Skipping {0}.".format(f))
                continue

        # Create a file for each dataframe
        for varname, creation in dataframes_to_create.items():
            create_file(output, creation['ncfile'], creation['varname'], creation['frame'])
def add_variable(self, variable_name, values, times=None, verticals=None,
                 sensor_vertical_datum=None, attributes=None, unlink_from_profile=None,
                 fillvalue=None, raise_on_error=False):

    if isinstance(values, (list, tuple,)) and values:
        values = np.asarray(values)
    if isinstance(times, (list, tuple,)) and times:
        times = np.asarray(times)
    if isinstance(verticals, (list, tuple,)) and verticals:
        verticals = np.asarray(verticals)

    # Set vertical datum on the CRS variable
    if sensor_vertical_datum is not None:
        try:
            self.crs.geoid_name = sensor_vertical_datum
            self.crs.vertical_datum = sensor_vertical_datum
            self.crs.water_surface_reference_datum = sensor_vertical_datum
        except AttributeError:
            pass

    # Set default fillvalue for new variables
    if fillvalue is None:
        fillvalue = -9999.9

    used_values = None
    try:
        if unlink_from_profile is True:
            used_values = np.ma.reshape(values, (self.time.size, ))
            used_values = used_values[self.time_indexes]
        # These next two cases should work for all but a few cases, which are caught below
        elif self.z.size == 1:
            used_values = np.ma.reshape(values, (self.time.size, ))
            used_values = used_values[self.time_indexes]
        else:
            used_values = np.ma.reshape(values, (self.time.size, self.z.size, ))
            used_values = used_values[self.time_indexes]
            try:
                used_values = used_values[:, self.vertical_indexes]
            except IndexError:
                # The vertical values most likely had duplicates. Ignore the
                # faulty index here and try to save the values as is.
                pass
    except ValueError:
        if raise_on_error is True:
            raise
        else:
            logger.warning("Could not do a simple reshape of data, trying to match manually! Time:{!s}, Heights:{!s}, Values:{!s}".format(self.time.size, self.z.size, values.size))
            if self.z.size > 1:
                if times is not None and verticals is not None:
                    # Hmmm, we have two actual height values for this station.
                    # Not cool man, not cool.
                    # Reindex the entire values array. This is slow.
                    indexed = ((bisect.bisect_left(self.time[:], times[i]),
                                bisect.bisect_left(self.z[:], verticals[i]),
                                values[i]) for i in range(values.size))
                    used_values = np.ndarray((self.time.size, self.z.size, ), dtype=values.dtype)
                    used_values.fill(float(fillvalue))
                    for (tzi, zzi, vz) in indexed:
                        if zzi < self.z.size and tzi < self.time.size:
                            used_values[tzi, zzi] = vz
                else:
                    raise ValueError("You need to pass in both 'times' and 'verticals' parameters that match the size of the 'values' parameter.")
            else:
                if times is not None:
                    # Ugh, find the time indexes manually
                    indexed = ((bisect.bisect_left(self.time[:], times[i]),
                                values[i]) for i in range(values.size))
                    used_values = np.ndarray((self.time.size, ), dtype=values.dtype)
                    used_values.fill(float(fillvalue))
                    for (tzi, vz) in indexed:
                        if tzi < self.time.size:
                            used_values[tzi] = vz
                else:
                    raise ValueError("You need to pass in a 'times' parameter that matches the size of the 'values' parameter.")

    with EnhancedDataset(self.out_file, 'a') as nc:
        logger.info("Setting values for {}...".format(variable_name))
        if len(used_values.shape) == 1:
            var = nc.createVariable(variable_name,
                                    used_values.dtype,
                                    ("time", ),
                                    fill_value=fillvalue,
                                    chunksizes=(1000, ),
                                    zlib=True)
            if self.z.size == 1:
                var.coordinates = "{} {} latitude longitude".format(self.time_axis_name, self.vertical_axis_name)
            else:
                # This is probably a bottom sensor on an ADCP or something, don't add the height coordinate
                var.coordinates = "{} latitude longitude".format(self.time_axis_name)

            if unlink_from_profile is True:
                # Create metadata variable for the sensor_depth
                if nc.variables.get('sensor_depth') is None:
                    logger.info("Setting the special case 'sensor_depth' metadata variable")
                    inst_depth = nc.createVariable('sensor_depth', 'f4')
                    inst_depth.units = 'm'
                    inst_depth.standard_name = 'surface_altitude'
                    inst_depth.positive = self.vertical_positive
                    if self.vertical_positive.lower() == 'down':
                        inst_depth.long_name = 'sensor depth below datum'
                    elif self.vertical_positive.lower() == 'up':
                        inst_depth.long_name = 'sensor height above datum'
                    inst_depth.datum = sensor_vertical_datum or 'Unknown'
                    if verticals and verticals.size > 0:
                        inst_depth[:] = verticals[0]
                    else:
                        inst_depth[:] = self.vertical_fill

        elif len(used_values.shape) == 2:
            var = nc.createVariable(variable_name,
                                    used_values.dtype,
                                    ("time", "z", ),
                                    fill_value=fillvalue,
                                    chunksizes=(1000, self.z.size, ),
                                    zlib=True)
            var.coordinates = "{} {} latitude longitude".format(self.time_axis_name, self.vertical_axis_name)
        else:
            raise ValueError("Could not create variable. Shape of data is {!s}. Expected a dimension of 1 or 2, not {!s}.".format(used_values.shape, len(used_values.shape)))

        # Set the variable attributes as passed in
        if attributes:
            for k, v in attributes.items():

                if k == 'vertical_datum' and sensor_vertical_datum is None and v is not None:
                    # Use this as the vertical datum if it is specified and we didn't already have one
                    try:
                        self.crs.geoid_name = v
                        self.crs.vertical_datum = v
                        self.crs.water_surface_reference_datum = v
                    except AttributeError:
                        pass

                if k not in ['name', 'coordinates', '_FillValue'] and v is not None:
                    try:
                        var.setncattr(k, v)
                    except BaseException:
                        logger.info('Could not add attribute {}: {}, skipping.'.format(k, v))

        var.grid_mapping = 'crs'
        var[:] = used_values

        return var
def main(output, download_folder, do_download, projects, csv_metadata_file, filesubset=None, since=None): project_metadata = dict() with open(csv_metadata_file, 'r') as f: reader = csv.DictReader(f) for row in reader: project_name = row['project_name'] if isinstance(project_name, str) and project_name[0] == '#': continue if projects and project_name.lower() not in projects: # Skip projects if a subset was defined continue project_metadata[project_name] = dict() for k, v in row.items(): project_metadata[project_name][k] = v if do_download: try: downloaded_files = download(download_folder, project_metadata, filesubset, since) except KeyboardInterrupt: logger.exception('Error downloading datasets from THREDDS') downloaded_files = [] else: downloaded_files = glob(os.path.join(download_folder, '**', '*')) if since is not None: def should_keep(d): modt = datetime.utcfromtimestamp( os.path.getmtime(d)).replace(tzinfo=pytz.utc) return modt >= since downloaded_files = [ dl for dl in downloaded_files if should_keep(dl) ] for down_file in sorted(downloaded_files): temp_fd, temp_file = tempfile.mkstemp(prefix='cmg_collector', suffix='nc') try: if filesubset is not None: if os.path.basename(down_file).lower() not in filesubset: # aka "9631ecp-a.nc" # Skip this file! continue project_name = os.path.basename(os.path.dirname(down_file)) if projects: if project_name.lower() not in projects: # Skip this project! continue shutil.copy(down_file, temp_file) # Cleanup to CF-1.6 try: first_time = normalize_time(temp_file) except (TypeError, ValueError, IndexError): logger.exception( "Could not normalize the time variable. Skipping {0}.". format(down_file)) continue except OverflowError: logger.error( "Dates out of range. Skipping {0}.".format(down_file)) continue normalize_epic_codes(temp_file, down_file) normalize_vectors(temp_file) normalize_units(temp_file) # Create list of variables that we want to save. mooring_id = None latitude = None longitude = None fname = os.path.basename(down_file) feature_name, file_ext = os.path.splitext( os.path.basename(down_file)) try: mooring_id = int(9999) except ValueError: logger.exception( "Could not create a suitable station_id. Skipping {0}.". format(down_file)) continue file_name = os.path.basename(down_file) output_directory = os.path.join(output, project_name) logger.info("Translating {0} into CF1.6 format: {1}".format( down_file, os.path.abspath(os.path.join(output_directory, file_name)))) with EnhancedDataset(temp_file) as nc: try: latitude = nc.variables.get("lat")[0] longitude = nc.variables.get("lon")[0] except IndexError: latitude = nc.variables.get("lat")[:] longitude = nc.variables.get("lon")[:] except TypeError: logger.error( "Could not find lat/lon variables. Skipping {0}.". 
format(down_file)) file_global_attributes = { k: getattr(nc, k) for k in nc.ncattrs() } file_global_attributes.update(global_attributes) file_global_attributes['id'] = feature_name file_global_attributes['MOORING'] = mooring_id file_global_attributes['original_filename'] = fname file_global_attributes['original_folder'] = project_name no_override = [ 'id', 'MOORING', 'original_filename', 'original_folder', 'catalog_xml', 'project_name' ] if project_name in project_metadata: for k, v in project_metadata[project_name].items(): if v and k.lower() not in no_override: file_global_attributes[k] = v if 'summary' in file_global_attributes: # Save the original summary file_global_attributes[ 'WHOI_Buoy_Group_summary'] = file_global_attributes[ 'summary'] # Better title/summary for discovery via catalogs project_title = file_global_attributes.get( 'project_title', project_name).strip() project_summary = file_global_attributes.get( 'project_summary', '').strip() file_global_attributes[ 'title'] = 'USGS-CMG time-series data: {0} - {1} - {2}'.format( project_name, mooring_id, feature_name) file_global_attributes[ 'summary'] = 'USGS-CMG time-series data from the {} project, mooring {} and package {}. {}'.format( project_title, mooring_id, feature_name, project_summary).strip() times = nc.variables.get('time')[:] # Get all depth values depth_variables = [] for dv in nc.variables: depth_variables += [ x for x in nc.variables.get(dv).dimensions if 'depth' in x ] depth_variables = sorted(list(set(depth_variables))) try: assert depth_variables depth_values = np.asarray([ nc.variables.get(x)[:] for x in depth_variables ]).flatten() except (AssertionError, TypeError): logger.warning( "No depth variables found in {}, skipping.".format( down_file)) continue # Convert everything to positive up, unless it is specifically specified as "up" already depth_conversion = -1.0 if depth_variables: pull_positive = nc.variables.get(depth_variables[0]) if hasattr(pull_positive, 'positive' ) and pull_positive.positive.lower() == 'up': depth_conversion = 1.0 depth_values = depth_values * depth_conversion if not os.path.isdir(output_directory): os.makedirs(output_directory) ts = TimeSeries(output_directory, latitude, longitude, feature_name, file_global_attributes, times=times, verticals=depth_values, output_filename=file_name, vertical_positive='up') # Set the platform type from the global attribute 'platform_type', defaulting to 'fixed' with EnhancedDataset(ts.out_file, 'a') as onc: platform_type = getattr(onc, 'platform_type', 'fixed').lower() onc.variables['platform'].setncattr('type', platform_type) onc.variables['platform'].setncattr( 'nodc_name', "FIXED PLATFORM, MOORINGS") # Add ERDDAP variables onc.cdm_data_type = "TimeSeries" onc.cdm_timeseries_variables = "latitude,longitude,z,feature_type_instance" v = [] depth_files = [] for other in sorted( nc.variables): # Sorted for a reason... don't change! 
try: if other in coord_vars: continue ovsd = None # old var sensor depth old_var = nc.variables.get(other) variable_attributes = { k: getattr(old_var, k) for k in old_var.ncattrs() } # Remove/rename some attributes # https://github.com/USGS-CMG/usgs-cmg-portal/issues/67 if 'valid_range' in variable_attributes: del variable_attributes['valid_range'] if 'minimum' in variable_attributes: variable_attributes[ 'actual_min'] = variable_attributes['minimum'] del variable_attributes['minimum'] if 'maximum' in variable_attributes: variable_attributes[ 'actual_max'] = variable_attributes['maximum'] del variable_attributes['maximum'] if 'sensor_depth' in variable_attributes: # sensor_depth is ALWAYS positive "down", so don't convert! # This is contrary to the "positive" attribute on the Z axis. # variable_attributes['sensor_depth'] = variable_attributes['sensor_depth'] * -1 # Round the sensor_depth attribute variable_attributes['sensor_depth'] = np.around( variable_attributes['sensor_depth'], decimals=4) ovsd = np.around(old_var.sensor_depth * depth_conversion, decimals=4) fillvalue = None if hasattr(old_var, "_FillValue"): fillvalue = old_var._FillValue # Figure out if this is a variable that is repeated at different depths # as different variable names. Assumes sorted. new_var_name = other.split('_')[0] if new_var_name in ts.ncd.variables: # Already in new file (processed when the first was encountered in the loop below) continue # Get the depth index depth_variable = [ x for x in old_var.dimensions if 'depth' in x ] if depth_variable and len( old_var.dimensions ) > 1 and 'time' in old_var.dimensions: depth_index = np.squeeze( np.where(depth_values == ( nc.variables.get(depth_variable[0])[:] * depth_conversion))) # Find other variable names like this one depth_indexes = [(other, depth_index)] for search_var in sorted(nc.variables): # If they have different depth dimension names we need to combine them into one variable if search_var != other and search_var.split('_')[0] == new_var_name and \ depth_variable[0] != [ x for x in nc.variables[search_var].dimensions if 'depth' in x ][0]: # Found a match at a different depth search_depth_variable = [ x for x in nc.variables.get( search_var).dimensions if 'depth' in x ] depth_index = np.squeeze( np.where(depth_values == ( nc.variables.get( search_depth_variable[0])[:] * depth_conversion))) depth_indexes.append( (search_var, depth_index)) logger.info( "Combining '{}' with '{}' as '{}' (different variables at different depths but are the same parameter)" .format(search_var, other, new_var_name)) values = np.ma.empty( (times.size, len(depth_values)), dtype=old_var.dtype) values.fill_value = fillvalue values.mask = True inconsistent = False for nm, index in depth_indexes: try: values[:, index] = np.squeeze( nc.variables.get(nm)[:]) except ValueError: inconsistent = True break # If we just have one index we want to use the original name if len(depth_indexes) == 1: # Just use the original variable name new_var_name = other if inconsistent is True: # Incorrect array size, most likely a strange variable ts.add_variable_object( old_var, dimension_map=dict(depth='z'), reduce_dims=True) else: # Create this one, should be the first we encounter for this type ts.add_variable(new_var_name, values=values, times=times, fillvalue=fillvalue, attributes=variable_attributes) elif len(old_var.dimensions ) == 1 and old_var.dimensions[0] == 'time': # A single time dimensioned variable, like pitch, roll, record count, etc. 
ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, fillvalue=fillvalue, attributes=variable_attributes) elif old_var.ndim <= 3 and ovsd and \ ((depth_values.size == 1 and not depth_variable and 'time' in old_var.dimensions) or (depth_values.size > 1 and not depth_variable and 'time' in old_var.dimensions and 'sensor_depth' in ts.ncd.variables)): if 'sensor_depth' in ts.ncd.variables and np.isclose( ts.ncd.variables['sensor_depth'][:], ovsd): ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes) else: # Search through secondary files that have been created for detached variables at a certain depth and # try to match this variable with one of the depths. found_df = False for dfts in depth_files: if isinstance(ovsd, np.ndarray): # Well, this is a bad file. raise ValueError( "The sensor_depth attribute has more than one value, please fix the source NetCDF: {}" .format(down_file)) if np.isclose( dfts.ncd.variables[ ts.vertical_axis_name][:], ovsd): dfts.add_variable( other, values=old_var[:], times=times, unlink_from_profile=True, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes) found_df = True break # If we couldn't match the current or one of the existing secondary depth files, create a new one. if found_df is False: new_file_name = file_name.replace( file_ext, '_z{}{}'.format( len(depth_files) + 1, file_ext)) fga = copy(file_global_attributes) fga['id'] = os.path.splitext( new_file_name)[0] new_ts = TimeSeries( output_directory, latitude, longitude, feature_name, fga, times=times, verticals=[ovsd], output_filename=new_file_name, vertical_positive='up') new_ts.add_variable( other, values=old_var[:], times=times, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes) depth_files.append(new_ts) elif old_var.ndim <= 3 and ( depth_values.size > 1 and not depth_variable and 'time' in old_var.dimensions): if ovsd: # An ADCP or profiling dataset, but this variable is measued at a single depth. # Example: Bottom Temperature on an ADCP # Skip things with a dimension over 3 (some beam variables like `brange`) ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes) else: ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, fillvalue=fillvalue, attributes=variable_attributes) else: if 'time' in old_var.dimensions and old_var.ndim <= 3: ts.add_variable(other, values=old_var[:], times=times, fillvalue=fillvalue, attributes=variable_attributes) else: ts.add_variable_object( old_var, dimension_map=dict(depth='z'), reduce_dims=True) except BaseException: logger.exception( "Error processing variable {0} in {1}. Skipping it." .format(other, down_file)) except KeyboardInterrupt: logger.info("Breaking out of Translate loop!") break except BaseException: logger.exception("Error. Skipping {0}.".format(down_file)) continue finally: try: for df in depth_files: del df except NameError: pass try: del ts except NameError: pass os.close(temp_fd) if os.path.isfile(temp_file): os.remove(temp_file)
def add_instrument_metadata(self, urn):
    with EnhancedDataset(self.out_file, 'a') as nc:
        instrument = nc.createVariable("instrument", "i4")
        instrument.definition = "http://mmisw.org/ont/ioos/definition/sensorID"
        instrument.long_name = urn
        instrument.ioos_code = urn
def __init__(self, output_directory, latitude, longitude, station_name, global_attributes,
             times=None, verticals=None, vertical_fill=None, output_filename=None,
             vertical_axis_name=None, vertical_positive=None):
    if output_filename is None:
        output_filename = '{}_{}.nc'.format(station_name, int(random.random() * 100000))
        logger.info("No output filename specified, saving as {}".format(output_filename))

    self.vertical_positive = vertical_positive or 'down'
    self.vertical_axis_name = vertical_axis_name or 'z'
    self.time_axis_name = 'time'

    # Make directory
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    self.time = None
    self.out_file = os.path.abspath(os.path.join(output_directory, output_filename))
    if os.path.isfile(self.out_file):
        os.remove(self.out_file)

    with EnhancedDataset(self.out_file, 'w') as nc:
        # Global attributes
        # These are set by this script; we don't want someone to be able to set them manually
        global_skips = ["time_coverage_start", "time_coverage_end", "time_coverage_duration",
                        "time_coverage_resolution", "featureType", "geospatial_vertical_positive",
                        "geospatial_vertical_min", "geospatial_vertical_max", "geospatial_lat_min",
                        "geospatial_lon_min", "geospatial_lat_max", "geospatial_lon_max",
                        "geospatial_bounds", "geospatial_vertical_resolution",
                        "geospatial_lat_resolution", "geospatial_lon_resolution",
                        "Conventions", "date_created", "date_modified", "date_issued"]
        for k, v in global_attributes.items():
            if v is None:
                v = "None"
            if k not in global_skips:
                nc.setncattr(k, v)

        now_date = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:00Z")
        nc.setncattr("Conventions", "CF-1.6,ACDD-1.3")
        nc.setncattr("date_created", now_date)
        nc.setncattr("date_modified", now_date)
        nc.setncattr("date_issued", now_date)
        if not hasattr(nc, "date_metadata_modified"):
            nc.setncattr("date_metadata_modified", now_date)

        # Allow the customization of this attribute
        if 'cdm_data_type' not in global_attributes:
            nc.setncattr('cdm_data_type', 'Station')

        old_history = getattr(nc, 'history', '')
        new_history = '{} - {} - {}'.format(now_date, 'pyaxiom', 'File created using pyaxiom')
        if old_history:
            nc.setncattr('history', '{}\n{}'.format(old_history, new_history))
        else:
            nc.setncattr('history', new_history)

        # Station name
        nc.createDimension("feature_type_instance", len(station_name))
        name = nc.createVariable("feature_type_instance", "S1", ("feature_type_instance",))
        name.cf_role = "timeseries_id"
        name.long_name = "Identifier for each feature type instance"
        name[:] = list(station_name)

        # Location
        lat = nc.createVariable("latitude", get_type(latitude))
        lat.units = "degrees_north"
        lat.standard_name = "latitude"
        lat.long_name = "sensor latitude"
        lat.axis = "Y"
        lat.valid_min = latitude
        lat.valid_max = latitude
        lat[:] = latitude
        nc.setncattr("geospatial_lat_min", latitude)
        nc.setncattr("geospatial_lat_max", latitude)
        nc.setncattr("geospatial_lat_resolution", 0)
        nc.setncattr("geospatial_lat_units", "degrees_north")

        lon = nc.createVariable("longitude", get_type(longitude))
        lon.units = "degrees_east"
        lon.standard_name = "longitude"
        lon.long_name = "sensor longitude"
        lon.axis = "X"
        lon.valid_min = longitude
        lon.valid_max = longitude
        lon[:] = longitude
        nc.setncattr("geospatial_lon_min", longitude)
        nc.setncattr("geospatial_lon_max", longitude)
        nc.setncattr("geospatial_lon_resolution", 0)
        nc.setncattr("geospatial_lon_units", "degrees_east")

        nc.setncattr("geospatial_bounds", "POINT({} {})".format(longitude, latitude))
        if not hasattr(nc, "geospatial_bounds_crs"):
            nc.setncattr("geospatial_bounds_crs", "EPSG:4326")

        # Metadata variables
        self.crs = nc.createVariable("crs", "i4")
        self.crs.long_name = "http://www.opengis.net/def/crs/EPSG/0/4326"
        self.crs.grid_mapping_name = "latitude_longitude"
        self.crs.epsg_code = "EPSG:4326"
        self.crs.semi_major_axis = float(6378137.0)
        self.crs.inverse_flattening = float(298.257223563)

        platform = nc.createVariable("platform", "i4")
        platform.definition = "http://mmisw.org/ont/ioos/definition/stationID"

        urn = IoosUrn.from_string(station_name)
        if urn.valid() is True:
            platform.short_name = global_attributes.get("title", urn.label)
            platform.long_name = global_attributes.get('summary', 'Station {}'.format(urn.label))
            platform.ioos_code = urn.urn
        else:
            platform.short_name = global_attributes.get("title", station_name)
            platform.long_name = global_attributes.get("summary", station_name)
            platform.ioos_code = station_name

    if vertical_fill is None:
        vertical_fill = -9999.9
    self.vertical_fill = vertical_fill

    self._nc = EnhancedDataset(self.out_file, 'a')
    self.setup_times_and_verticals(times, verticals)
    logger.info("Created file at '{}'".format(self.out_file))
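# Hypothetical usage sketch (not from the source): building a small station
# file with the constructor above and attaching one data variable. The output
# path, coordinates, attribute values, and sample data are made up for
# illustration.
import numpy as np

times = np.arange(0, 6 * 3600, 3600)   # six hourly samples, seconds since 1970-01-01
verticals = [2.0]                      # a single sensor depth in meters
gas = dict(naming_authority='example.org', id='example_station',
           title='Example station', summary='An example time series station')

ts = TimeSeries('./output', latitude=41.0, longitude=-70.5,
                station_name='example_station', global_attributes=gas,
                times=times, verticals=verticals,
                output_filename='example_station.nc')
ts.add_variable('sea_water_temperature',
                values=np.asarray([10.1, 10.2, 10.3, 10.2, 10.1, 10.0]),
                attributes=dict(units='degree_Celsius'))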
class TimeSeries(object): @staticmethod def from_dataframe(df, output_directory, output_filename, latitude, longitude, station_name, global_attributes, variable_name, variable_attributes, sensor_vertical_datum=None, fillvalue=None, data_column=None, vertical_axis_name=None, vertical_positive=None, create_instrument_variable=False, attempts=None): # Attempts is how many files to try to build a NetCDF files from a # dataframe. For backwards compatibility purposes, we always try # everything (even manual matching which takes forever and is a memory # hog). attempts = attempts or 5 if fillvalue is None: fillvalue = -9999.9 if data_column is None: data_column = 'value' data_fillvalue = df[data_column].values.dtype.type(fillvalue) vertical_fillvalue = df['depth'].values.dtype.type(fillvalue) df[data_column] = df[data_column].fillna(data_fillvalue) times = np.asarray([ calendar.timegm(x.utctimetuple()) for x in df['time'] ]) df['depth'] = df['depth'].fillna(vertical_fillvalue) depths = df['depth'].values try: ts = TimeSeries(output_directory, latitude, longitude, station_name, global_attributes, times=times, verticals=depths, output_filename=output_filename, vertical_fill=vertical_fillvalue, vertical_axis_name=vertical_axis_name, vertical_positive=vertical_positive) ts.add_variable(variable_name, df[data_column].values, attributes=variable_attributes, sensor_vertical_datum=sensor_vertical_datum, raise_on_error=True, fillvalue=data_fillvalue, create_instrument_variable=create_instrument_variable) except ValueError: if attempts < 2: raise logger.warning("Attempt 2: using unique times") try: # Try uniquing time newtimes = np.unique(times) ts = TimeSeries(output_directory, latitude, longitude, station_name, global_attributes, times=newtimes, verticals=depths, output_filename=output_filename, vertical_fill=vertical_fillvalue, vertical_axis_name=vertical_axis_name, vertical_positive=vertical_positive) ts.add_variable(variable_name, df[data_column].values, attributes=variable_attributes, sensor_vertical_datum=sensor_vertical_datum, raise_on_error=True, fillvalue=data_fillvalue, create_instrument_variable=create_instrument_variable) except ValueError: if attempts < 3: raise logger.warning("Attempt 3: using unique depths") try: # Try uniquing depths newdepths = np.unique(df['depth'].values) ts = TimeSeries(output_directory, latitude, longitude, station_name, global_attributes, times=times, verticals=newdepths, output_filename=output_filename, vertical_fill=vertical_fillvalue, vertical_axis_name=vertical_axis_name, vertical_positive=vertical_positive) ts.add_variable(variable_name, df[data_column].values, attributes=variable_attributes, sensor_vertical_datum=sensor_vertical_datum, raise_on_error=True, fillvalue=data_fillvalue, create_instrument_variable=create_instrument_variable) except ValueError: if attempts < 4: raise logger.warning("Attempt 4: using unique time and depth") try: # Unique both time and depth newdepths = np.unique(df['depth'].values) ts = TimeSeries(output_directory, latitude, longitude, station_name, global_attributes, times=newtimes, verticals=newdepths, output_filename=output_filename, vertical_fill=vertical_fillvalue, vertical_axis_name=vertical_axis_name, vertical_positive=vertical_positive) ts.add_variable(variable_name, df[data_column].values, attributes=variable_attributes, sensor_vertical_datum=sensor_vertical_datum, raise_on_error=True, fillvalue=data_fillvalue, create_instrument_variable=create_instrument_variable) except ValueError: if attempts < 5: raise 
logger.warning("Attempt 5: manually matching (this is SLOW)") # Manually match ts = TimeSeries(output_directory, latitude, longitude, station_name, global_attributes, times=times, verticals=depths, output_filename=output_filename, vertical_fill=vertical_fillvalue, vertical_axis_name=vertical_axis_name, vertical_positive=vertical_positive) ts.add_variable(variable_name, df[data_column].values, attributes=variable_attributes, times=times, verticals=depths, sensor_vertical_datum=sensor_vertical_datum, raise_on_error=False, fillvalue=data_fillvalue, create_instrument_variable=create_instrument_variable) return ts def __init__(self, output_directory, latitude, longitude, station_name, global_attributes, times=None, verticals=None, vertical_fill=None, output_filename=None, vertical_axis_name=None, vertical_positive=None): if output_filename is None: output_filename = '{}_{}.nc'.format(station_name, int(random.random() * 100000)) logger.info("No output filename specified, saving as {}".format(output_filename)) self.vertical_positive = vertical_positive or 'down' self.vertical_axis_name = vertical_axis_name or 'z' self.time_axis_name = 'time' # Make directory if not os.path.exists(output_directory): os.makedirs(output_directory) self.time = None self.out_file = os.path.abspath(os.path.join(output_directory, output_filename)) if os.path.isfile(self.out_file): os.remove(self.out_file) with EnhancedDataset(self.out_file, 'w') as nc: # Global attributes # These are set by this script, we don't someone to be able to set them manually global_skips = ["time_coverage_start", "time_coverage_end", "time_coverage_duration", "time_coverage_resolution", "featureType", "geospatial_vertical_positive", "geospatial_vertical_min", "geospatial_vertical_max", "geospatial_lat_min", "geospatial_lon_min", "geospatial_lat_max", "geospatial_lon_max", "geospatial_bounds" "geospatial_vertical_resolution", "geospatial_lat_resolution", "geospatial_lon_resolution", "Conventions", "date_created", "date_modified", "date_issued"] for k, v in global_attributes.items(): if v is None: v = "None" if k not in global_skips: nc.setncattr(k, v) now_date = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:00Z") nc.setncattr("Conventions", "CF-1.6,ACDD-1.3") nc.setncattr("date_created", now_date) nc.setncattr("date_modified", now_date) nc.setncattr("date_issued", now_date) if not hasattr(nc, "date_metadata_modified"): nc.setncattr("date_metadata_modified", now_date) # Allow the customization of this attribute if 'cdm_data_type' not in global_attributes: nc.setncattr('cdm_data_type', 'Station') old_history = getattr(nc, 'history', '') new_history = '{} - {} - {}'.format(now_date, 'pyaxiom', 'File created using pyaxiom') if old_history: nc.setncattr('history', '{}\n{}'.format(old_history, new_history)) else: nc.setncattr('history', new_history) # Station name nc.createDimension("feature_type_instance", len(station_name)) name = nc.createVariable("feature_type_instance", "S1", ("feature_type_instance",)) name.cf_role = "timeseries_id" name.long_name = "Identifier for each feature type instance" name[:] = list(station_name) # Location lat = nc.createVariable("latitude", get_type(latitude)) lat.units = "degrees_north" lat.standard_name = "latitude" lat.long_name = "sensor latitude" lat.axis = "Y" lat.valid_min = latitude lat.valid_max = latitude lat[:] = latitude nc.setncattr("geospatial_lat_min", latitude) nc.setncattr("geospatial_lat_max", latitude) nc.setncattr("geospatial_lat_resolution", 0) nc.setncattr("geospatial_lat_units", "degrees_north") 
lon = nc.createVariable("longitude", get_type(longitude)) lon.units = "degrees_east" lon.standard_name = "longitude" lon.long_name = "sensor longitude" lon.axis = "X" lon.valid_min = longitude lon.valid_max = longitude lon[:] = longitude nc.setncattr("geospatial_lon_min", longitude) nc.setncattr("geospatial_lon_max", longitude) nc.setncattr("geospatial_lon_resolution", 0) nc.setncattr("geospatial_lon_units", "degrees_east") nc.setncattr("geospatial_bounds", "POINT({} {})".format(longitude, latitude)) if not hasattr(nc, "geospatial_bounds_crs"): nc.setncattr("geospatial_bounds_crs", "EPSG:4326") # Metadata variables self.crs = nc.createVariable("crs", "i4") self.crs.long_name = "http://www.opengis.net/def/crs/EPSG/0/4326" self.crs.grid_mapping_name = "latitude_longitude" self.crs.epsg_code = "EPSG:4326" self.crs.semi_major_axis = float(6378137.0) self.crs.inverse_flattening = float(298.257223563) platform = nc.createVariable("platform", "i4") platform.definition = "http://mmisw.org/ont/ioos/definition/stationID" urn = IoosUrn.from_string(station_name) if urn.valid() is True: platform.short_name = global_attributes.get("title", urn.label) platform.long_name = global_attributes.get('summary', 'Station {}'.format(urn.label)) platform.ioos_code = urn.urn else: platform.short_name = global_attributes.get("title", station_name) platform.long_name = global_attributes.get("summary", station_name) platform.ioos_code = station_name if vertical_fill is None: vertical_fill = -9999.9 self.vertical_fill = vertical_fill self._nc = EnhancedDataset(self.out_file, 'a') self.setup_times_and_verticals(times, verticals) logger.info("Created file at '{}'".format(self.out_file)) def add_instrument_metadata(self, urn): instrument = self._nc.createVariable("instrument", "i4") instrument.definition = "http://mmisw.org/ont/ioos/definition/sensorID" instrument.long_name = urn instrument.ioos_code = urn self._nc.instrument = 'instrument' self._nc.sync() def add_instrument_variable(self, variable_name): if variable_name not in self._nc.variables: logger.error("Variable {} not found in file, cannot create instrument metadata variable") return elif 'id' not in self._nc.ncattrs() or 'naming_authority' not in self._nc.ncattrs(): logger.error("Global attributes 'id' and 'naming_authority' are required to create an instrument variable") return instr_var_name = "{}_instrument".format(variable_name) instrument = self._nc.createVariable(instr_var_name, "i4") datavar = self._nc.variables[variable_name] vats = { k: getattr(datavar, k) for k in datavar.ncattrs() } instrument_urn = urnify(self._nc.naming_authority, self._nc.id, vats) inst_urn = IoosUrn.from_string(instrument_urn) instrument.long_name = 'Instrument measuring {} from {}'.format(inst_urn.component, inst_urn.label) instrument.ioos_code = instrument_urn instrument.short_name = inst_urn.component instrument.definition = "http://mmisw.org/ont/ioos/definition/sensorID" datavar.instrument = instr_var_name # Append the instrument to the ancilary variables av = getattr(datavar, 'ancillary_variables', '') av += ' {}'.format(instr_var_name) datavar.ancillary_variables = av.strip() self._nc.sync() def add_time_bounds(self, delta=None, position=None): self._nc.createDimension("bounds", 2) time_bounds = self._nc.createVariable('{}_bounds'.format(self.time_axis_name), "f8", ("time", "bounds",), chunksizes=(self.time_chunk, 2,)) time_bounds.units = "seconds since 1970-01-01T00:00:00Z" time_bounds.calendar = "gregorian" time_objs = netCDF4.num2date(self.time[:], units=self.time.units, 
calendar=self.time.calendar) bounds_kwargs = dict(units=time_bounds.units, calendar=time_bounds.calendar) if position == "start": time_bounds[:] = np.asarray(list(zip(self.time[:], netCDF4.date2num(time_objs + delta, **bounds_kwargs)))) elif position == "middle": time_bounds[:] = np.asarray(list(zip(netCDF4.date2num(time_objs - delta / 2, **bounds_kwargs), netCDF4.date2num(time_objs + delta / 2, **bounds_kwargs)))) elif position == "end": time_bounds[:] = np.asarray(list(zip(netCDF4.date2num(time_objs - delta, **bounds_kwargs), self.time[:]))) self._nc.sync() def add_variable(self, variable_name, values, times=None, verticals=None, sensor_vertical_datum=None, attributes=None, unlink_from_profile=None, fillvalue=None, raise_on_error=False, create_instrument_variable=False): if isinstance(values, (list, tuple,)) and values: values = np.asarray(values) if get_type(values) == np.int64: # Create values as int32 because DAP does not support int64 until DAP4. values = values.astype(np.int32) if isinstance(times, (list, tuple,)) and times: times = np.asarray(times) if get_type(times) == np.int64: # Create time as int32 because DAP does not support int64 until DAP4. times = times.astype(np.int32) if isinstance(verticals, (list, tuple,)) and verticals: verticals = np.asarray(verticals) if get_type(verticals) == np.int64: # Create verticals as int32 because DAP does not support int64 until DAP4. verticals = verticals.astype(np.int32) # Set vertical datum on the CRS variable if sensor_vertical_datum is not None: try: self.crs.geoid_name = sensor_vertical_datum self.crs.vertical_datum = sensor_vertical_datum self.crs.water_surface_reference_datum = sensor_vertical_datum if not hasattr(self._nc, "geospatial_bounds_vertical_crs"): self._nc.setncattr("geospatial_bounds_vertical_crs", sensor_vertical_datum) except AttributeError: pass # Set default fillvalue for new variables if fillvalue is None: fillvalue = -9999.9 fillvalue = values.dtype.type(fillvalue) used_values = None vertical_axis = self._nc.variables.get(self.vertical_axis_name) try: if unlink_from_profile is True: used_values = np.ma.reshape(values, (self.time.size, )) used_values = used_values[self.time_indexes] # These next two cases should work for all but a few cases, which are caught below elif vertical_axis.size == 1: used_values = np.ma.reshape(values, (self.time.size, )) used_values = used_values[self.time_indexes] else: used_values = np.ma.reshape(values, (self.time.size, vertical_axis.size, )) used_values = used_values[self.time_indexes] try: used_values = used_values[:, self.vertical_indexes] except IndexError: # The vertical values most likely had duplicates. Ignore the # falty index here and try to save the values as is. pass except ValueError: if raise_on_error is True: raise else: logger.warning("Could not do a simple reshape of data, trying to match manually! Time:{!s}, Heights:{!s}, Values:{!s}".format(self.time.size, vertical_axis.size, values.size)) if vertical_axis.size > 1: if times is not None and verticals is not None: # Hmmm, we have two actual height values for this station. # Not cool man, not cool. # Reindex the entire values array. This is slow. 
indexed = ((bisect.bisect_left(self.time[:], times[i]), bisect.bisect_left(vertical_axis[:], verticals[i]), values[i]) for i in range(values.size)) used_values = np.ndarray((self.time.size, vertical_axis.size, ), dtype=get_type(values)) used_values.fill(fillvalue) for (tzi, zzi, vz) in indexed: if zzi < vertical_axis.size and tzi < self.time.size: used_values[tzi, zzi] = vz del indexed else: raise ValueError("You need to pass in both 'times' and 'verticals' parameters that matches the size of the 'values' parameter.") else: if times is not None: # Ugh, find the time indexes manually indexed = ((bisect.bisect_left(self.time[:], times[i]), values[i]) for i in range(values.size)) used_values = np.ndarray((self.time.size, ), dtype=get_type(values)) used_values.fill(fillvalue) for (tzi, vz) in indexed: if tzi < self.time.size: used_values[tzi] = vz del indexed else: raise ValueError("You need to pass in a 'times' parameter that matches the size of the 'values' parameter.") logger.info("Setting values for {}...".format(variable_name)) if len(used_values.shape) == 1: var = self._nc.createVariable(variable_name, get_type(used_values), ("time",), fill_value=fillvalue, chunksizes=(self.time_chunk,), zlib=True) self._nc.setncattr('ncei_template_version', 'NCEI_NetCDF_TimeSeries_Orthogonal_Template_v2.0') if vertical_axis.size == 1: var.coordinates = "{} {} latitude longitude".format(self.time_axis_name, self.vertical_axis_name) else: # This is probably a bottom sensor on an ADCP or something, don't add the height coordinate var.coordinates = "{} latitude longitude".format(self.time_axis_name) if unlink_from_profile is True: # Create metadata variable for the sensor_depth if verticals is not None and self._nc.variables.get('sensor_depth') is None: logger.info("Setting the special case 'sensor_depth' metadata variable") inst_depth = self._nc.createVariable('sensor_depth', get_type(verticals)) inst_depth.units = 'm' inst_depth.standard_name = 'surface_altitude' inst_depth.positive = self.vertical_positive if self.vertical_positive.lower() == 'down': inst_depth.long_name = 'sensor depth below datum' elif self.vertical_positive.lower() == 'up': inst_depth.long_name = 'sensor height above datum' inst_depth.datum = sensor_vertical_datum or 'Unknown' if verticals and verticals.size > 0: inst_depth[:] = verticals[0] else: inst_depth[:] = self.vertical_fill elif len(used_values.shape) == 2: var = self._nc.createVariable(variable_name, get_type(used_values), ("time", "z",), fill_value=fillvalue, chunksizes=(self.time_chunk, vertical_axis.size,), zlib=True) var.coordinates = "{} {} latitude longitude".format(self.time_axis_name, self.vertical_axis_name) self._nc.setncattr('ncei_template_version', 'NCEI_NetCDF_TimeSeriesProfile_Orthogonal_Template_v2.0') else: raise ValueError("Could not create variable. Shape of data is {!s}. 
Expected a dimension of 1 or 2, not {!s}.".format(used_values.shape, len(used_values.shape))) # Set missing_value as well attributes = attributes or {} attributes['missing_value'] = fillvalue # Set the variable attributes as passed in if attributes: for k, v in attributes.items(): if k == 'vertical_datum' and sensor_vertical_datum is None and v is not None: # Use this as the vertical datum if it is specified and we didn't already have one try: self.crs.geoid_name = v self.crs.vertical_datum = v self.crs.water_surface_reference_datum = v if not hasattr(self._nc, "geospatial_bounds_vertical_crs"): self._nc.setncattr("geospatial_bounds_vertical_crs", v) except AttributeError: pass if k not in ['name', 'coordinates', '_FillValue'] and v is not None: try: var.setncattr(k, v) except BaseException: logger.info('Could not add attribute {}: {}, skipping.'.format(k, v)) # Add a long name if it doesn't exist if not hasattr(var, 'long_name'): varunits = getattr(var, 'units', None) vartitle = getattr(var, 'standard_name', getattr(var, 'name')) vartitle = vartitle.title().replace('_', ' ') if varunits is not None: vartitle = '{} ({})'.format(vartitle, varunits) var.long_name = vartitle var.grid_mapping = 'crs' var.platform = 'platform' var.ancillary_variables = 'platform' var.coverage_content_type = 'physicalMeasurement' var[:] = used_values if create_instrument_variable is True: self.add_instrument_variable(variable_name) self._nc.sync() del used_values return var def add_variable_object(self, varobject, dimension_map=None, reduce_dims=None): dimension_map = dimension_map or {} reduce_dims = reduce_dims or False fillvalue = -9999.99 if hasattr(varobject, '_FillValue'): fillvalue = varobject._FillValue fillvalue = varobject.dtype.type(fillvalue) dims = [] for n in varobject.dimensions: d = dimension_map.get(n, n) dim_size = varobject.shape[list(varobject.dimensions).index(n)] if reduce_dims is True and dim_size in [0, 1]: continue if d not in self._nc.dimensions: self._nc.createDimension(d, dim_size) dims.append(d) var = self._nc.createVariable(varobject.name, get_type(varobject), dims, fill_value=fillvalue, zlib=True) for k in varobject.ncattrs(): if k not in ['name', '_FillValue']: var.setncattr(k, varobject.getncattr(k)) if reduce_dims: var[:] = varobject[:].squeeze() else: var[:] = varobject[:] self._nc.sync() def setup_times_and_verticals(self, times, verticals): if isinstance(times, (list, tuple,)): times = np.asarray(times) # Create time as int32 or float64 because DAP does not support int64 until DAP4. if get_type(times) == np.int64: if times[-1] < 2147483647: # We can fit inside of an int32 times = times.astype(np.int32) else: # Create time as float32 because of int32 overflow times = times.astype(np.float64) # If nothing is passed in, set to the vertical_fill value. if not isinstance(verticals, np.ndarray) and not verticals: verticals = np.ma.masked_values([self.vertical_fill], self.vertical_fill) # Convert to masked array if isinstance(verticals, (list, tuple)): verticals = np.ma.masked_values(verticals, self.vertical_fill) elif isinstance(verticals, np.ndarray): self.vertical_fill = verticals.dtype.type(self.vertical_fill) verticals = np.ma.masked_values(verticals, self.vertical_fill) if get_type(verticals) == np.int64: # Create time as int32 because DAP does not support int64 until DAP4. verticals = verticals.astype(np.int32) # Don't unique Time... rely on the person submitting the data correctly. # That means we allow duplicate times, as long as the data contains duplicate times as well. 
        self.time_indexes = np.argsort(times)
        full_times = times[self.time_indexes]

        # Unique the vertical values
        # Special case for all zeros. Added here for greater readability.
        if np.isclose(verticals, 0).all():
            save_mask = verticals.mask
            verticals.mask = False
            unique_verticals, self.vertical_indexes = np.ma.unique(verticals, return_index=True)
            if save_mask.size > 1:
                unique_verticals.mask = save_mask[self.vertical_indexes]
        elif verticals is not None and verticals.any():
            save_mask = verticals.mask
            verticals.mask = False
            unique_verticals, self.vertical_indexes = np.ma.unique(verticals, return_index=True)
            if save_mask.size > 1:
                unique_verticals.mask = save_mask[self.vertical_indexes]
        else:
            unique_verticals = verticals
            self.vertical_indexes = np.arange(len(verticals))

        # Calculate time stats based on a unique time array
        unique_times = np.unique(full_times)

        starting = datetime.utcfromtimestamp(unique_times[0])
        ending = datetime.utcfromtimestamp(unique_times[-1])

        logger.debug("Setting up time...")

        # Time extents
        self._nc.setncattr("time_coverage_start", starting.isoformat())
        self._nc.setncattr("time_coverage_end", ending.isoformat())
        # Duration (ISO 8601 format)
        self._nc.setncattr("time_coverage_duration", "PT{0:d}S".format(int(round((ending - starting).total_seconds()))))
        # Resolution (ISO 8601 format)
        # Subtract adjacent times to produce an array of differences, then use the most common occurrence
        diffs = unique_times[1:] - unique_times[:-1]
        uniqs, inverse = np.unique(diffs, return_inverse=True)
        if uniqs.size > 1:
            time_diffs = uniqs[np.bincount(inverse).argmax()]
            self._nc.setncattr("time_coverage_resolution", "PT{0:d}S".format(int(round(time_diffs))))

        # Time
        self.time_chunk = min(full_times.size, 1000)
        self._nc.createDimension("time", full_times.size)
        self.time = self._nc.createVariable(self.time_axis_name, get_type(full_times), ("time",), chunksizes=(self.time_chunk,))
        self.time.units = "seconds since 1970-01-01T00:00:00Z"
        self.time.standard_name = "time"
        self.time.long_name = "time of measurement"
        self.time.calendar = "gregorian"
        self.time.axis = "T"
        self.time[:] = full_times

        logger.debug("Setting up {}...".format(self.vertical_axis_name))

        # Figure out if we are creating a Profile or just a TimeSeries
        self._nc.setncattr("geospatial_vertical_units", "meters")
        self._nc.setncattr("geospatial_vertical_positive", self.vertical_positive)
        if unique_verticals.size <= 1:
            # TIMESERIES
            self._nc.setncattr("featureType", "timeSeries")
            # Fill in the variable if we have an actual height. Else, the fillvalue remains.
self._nc.setncattr("geospatial_vertical_resolution", '0') self.z = self._nc.createVariable(self.vertical_axis_name, get_type(unique_verticals), fill_value=self.vertical_fill) if unique_verticals.size == 1 and not np.isnan(unique_verticals[0]) and unique_verticals[0] != self.vertical_fill: # Vertical extents self._nc.setncattr("geospatial_vertical_min", unique_verticals[0]) self._nc.setncattr("geospatial_vertical_max", unique_verticals[0]) self.z.valid_min = unique_verticals[0] self.z.valid_max = unique_verticals[0] elif unique_verticals.size > 1: # TIMESERIES PROFILE self._nc.setncattr("featureType", "timeSeriesProfile") # Vertical extents non_nan_verticals = unique_verticals[ (~np.isnan(unique_verticals)) & (unique_verticals != self.vertical_fill) ] minvertical = float(np.min(non_nan_verticals)) maxvertical = float(np.max(non_nan_verticals)) vertical_diffs = non_nan_verticals[1:] - non_nan_verticals[:-1] self._nc.setncattr("geospatial_vertical_min", minvertical) self._nc.setncattr("geospatial_vertical_max", maxvertical) if vertical_diffs.size >= 1: self._nc.setncattr("geospatial_vertical_resolution", " ".join([ str(x) for x in list(vertical_diffs) if not np.isnan(x) ])) else: self._nc.setncattr("geospatial_vertical_resolution", '0') # There is more than one vertical value for this variable, we need to create a vertical dimension self._nc.createDimension("z", unique_verticals.size) self.z = self._nc.createVariable(self.vertical_axis_name, get_type(unique_verticals), ("z", ), fill_value=self.vertical_fill) self.z.valid_min = minvertical self.z.valid_max = maxvertical self.z.grid_mapping = 'crs' self.z.long_name = "{} of the sensor relative to the water surface".format(self.vertical_axis_name) if self.vertical_positive == 'up': self.z.standard_name = 'height' elif self.vertical_positive == 'down': self.z.standard_name = 'depth' self.z.positive = self.vertical_positive self.z.units = "m" self.z.axis = "Z" self.z[:] = unique_verticals self._nc.sync() @property def ncd(self): return self._nc def __del__(self): if hasattr(self, '_nc') and self._nc: self._nc.close()
    def setup_times_and_verticals(self, times, verticals):

        if isinstance(times, (list, tuple,)):
            times = np.asarray(times)

        # If nothing is passed in, set to the vertical_fill value.
        if not isinstance(verticals, np.ndarray) and not verticals:
            verticals = np.ma.masked_values([self.vertical_fill], self.vertical_fill)

        # Convert to masked array
        if isinstance(verticals, (list, tuple,)) or isinstance(verticals, np.ndarray):
            verticals = np.ma.masked_values(verticals, self.vertical_fill)

        # Don't unique Time... rely on the person submitting the data correctly.
        # That means we allow duplicate times, as long as the data contains duplicate times as well.
        self.time_indexes = np.argsort(times)
        unique_times = times[self.time_indexes]

        # Unique the vertical values
        # Special case for all zeros. Added here for greater readability.
        if np.isclose(verticals, 0).all():
            save_mask = verticals.mask
            verticals.mask = False
            unique_verticals, self.vertical_indexes = np.ma.unique(verticals, return_index=True)
            if save_mask.size > 1:
                unique_verticals.mask = save_mask[self.vertical_indexes]
        elif verticals is not None and verticals.any():
            save_mask = verticals.mask
            verticals.mask = False
            unique_verticals, self.vertical_indexes = np.ma.unique(verticals, return_index=True)
            if save_mask.size > 1:
                unique_verticals.mask = save_mask[self.vertical_indexes]
        else:
            unique_verticals = verticals
            self.vertical_indexes = np.arange(len(verticals))

        starting = datetime.utcfromtimestamp(unique_times[0])
        ending = datetime.utcfromtimestamp(unique_times[-1])

        with EnhancedDataset(self.out_file, 'a') as nc:
            logger.debug("Setting up time...")

            # Time extents
            nc.setncattr("time_coverage_start", starting.isoformat())
            nc.setncattr("time_coverage_end", ending.isoformat())
            # Duration (ISO 8601 format)
            nc.setncattr("time_coverage_duration", "PT%sS" % str(int(round((ending - starting).total_seconds()))))
            # Resolution (ISO 8601 format)
            # Subtract adjacent times to produce an array of differences, then use the most common occurrence
            diffs = unique_times[1:] - unique_times[:-1]
            uniqs, inverse = np.unique(diffs, return_inverse=True)
            if uniqs.size > 1:
                time_diffs = uniqs[np.bincount(inverse).argmax()]
                nc.setncattr("time_coverage_resolution", "PT%sS" % str(int(round(time_diffs))))

            # Time - stored as 64-bit floats on an unlimited dimension
            nc.createDimension("time")
            self.time = nc.createVariable(self.time_axis_name, "f8", ("time",), chunksizes=(1000,))
            self.time.units = "seconds since 1970-01-01T00:00:00Z"
            self.time.standard_name = "time"
            self.time.long_name = "time of measurement"
            self.time.calendar = "gregorian"
            self.time[:] = unique_times

            logger.debug("Setting up {}...".format(self.vertical_axis_name))

            # Figure out if we are creating a Profile or just a TimeSeries
            nc.setncattr("geospatial_vertical_units", "meters")
            nc.setncattr("geospatial_vertical_positive", self.vertical_positive)
            if unique_verticals.size <= 1:
                # TIMESERIES
                nc.setncattr("featureType", "timeSeries")
                # Fill in the variable if we have an actual height. Else, the fillvalue remains.
nc.setncattr("geospatial_vertical_resolution", '0') if unique_verticals.size == 1 and not np.isnan( unique_verticals[0] ) and unique_verticals[0] != self.vertical_fill: # Vertical extents nc.setncattr("geospatial_vertical_min", unique_verticals[0]) nc.setncattr("geospatial_vertical_max", unique_verticals[0]) self.z = nc.createVariable(self.vertical_axis_name, "f8", fill_value=self.vertical_fill) elif unique_verticals.size > 1: # TIMESERIES PROFILE nc.setncattr("featureType", "timeSeriesProfile") # Vertical extents non_nan_verticals = unique_verticals[ (~np.isnan(unique_verticals)) & (unique_verticals != self.vertical_fill)] minvertical = float(np.min(non_nan_verticals)) maxvertical = float(np.max(non_nan_verticals)) vertical_diffs = non_nan_verticals[1:] - non_nan_verticals[:-1] nc.setncattr("geospatial_vertical_min", minvertical) nc.setncattr("geospatial_vertical_max", maxvertical) if vertical_diffs.size >= 1: nc.setncattr( "geospatial_vertical_resolution", " ".join([ str(x) for x in list(vertical_diffs) if not np.isnan(x) ])) else: nc.setncattr("geospatial_vertical_resolution", '0') # There is more than one vertical value for this variable, we need to create a vertical dimension nc.createDimension("z", unique_verticals.size) self.z = nc.createVariable(self.vertical_axis_name, "f8", ("z", ), fill_value=self.vertical_fill) self.z.grid_mapping = 'crs' self.z.long_name = "{} of the sensor relative to the water surface".format( self.vertical_axis_name) if self.vertical_positive == 'up': self.z.standard_name = 'height' elif self.vertical_positive == 'down': self.z.standard_name = 'depth' self.z.positive = self.vertical_positive self.z.units = "m" self.z.axis = "Z" self.z[:] = unique_verticals
    def ncd(self):
        # Return an open read-only handle to the output file. Returning from
        # inside a `with` block would hand back an already-closed dataset, so
        # the caller is responsible for closing the returned object.
        return EnhancedDataset(self.out_file, 'r')
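# --- Usage note (illustrative) ------------------------------------------------
# Because the accessor above returns an open handle, the caller should close
# it, for example by using it as a context manager (assuming `ts` is an
# instance of the class above):
#
#     with ts.ncd() as nc:
#         print(nc.ncattrs())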
def normalize_epic_codes(netcdf_file, original_filename):
    with EnhancedDataset(netcdf_file, 'a') as nc:
        for v in nc.variables:
            nc_var = nc.variables.get(v)
            if v in variable_name_overrides:
                ec = variable_name_overrides.get(v).get('epic_code', None)
                if ec is not None:
                    nc_var.epic_code = ec
                overrides = variable_name_overrides.get(v).get('overrides', dict())
                for k, d in overrides.items():
                    if k == 'convert':
                        nc_var[:] = d(nc_var[:])
                    elif k != 'original_units':
                        nc_var.setncattr(k, d)

            if hasattr(nc_var, 'long_name'):
                if not hasattr(nc_var, 'epic_code') or (hasattr(nc_var, 'epic_code') and nc_var.epic_code in IGNORABLE_CODES):
                    lookup_long_name = nc_var.long_name.lower().strip()
                    if lookup_long_name in long_name_overrides:
                        ec = long_name_overrides.get(lookup_long_name).get('epic_code', None)
                        if ec is not None:
                            nc_var.epic_code = ec
                        overrides = long_name_overrides.get(lookup_long_name).get('overrides', dict())
                        for k, d in overrides.items():
                            if k == 'convert':
                                nc_var[:] = d(nc_var[:])
                            elif k != 'original_units':
                                nc_var.setncattr(k, d)

            if hasattr(nc_var, "epic_code") and nc_var.epic_code:
                try:
                    epic_code = int(nc_var.epic_code)
                except ValueError:
                    logger.debug("No EPIC code specified on {0}".format(v))
                else:
                    # Specialized cases for generic EPIC codes
                    if epic_code in special_map:
                        attribs = special_map.get(epic_code)(nc_var, original_filename)
                    else:
                        attribs = epic2cf.mapping.get(epic_code)

                    # Special case for 'Onset weather stations'.
                    # https://github.com/USGS-CMG/usgs-cmg-portal/issues/69
                    if epic_code in [905, 908] and 'hml' in netcdf_file.lower():
                        attribs.standard_name = 'surface_downwelling_photosynthetic_radiative_flux_in_air'

                    if attribs is not None and attribs.standard_name is not None:
                        # Convert data to CF units
                        nc_var[:] = attribs.convert(nc_var[:])
                        # Set attributes
                        nc_var.standard_name = attribs.standard_name
                        nc_var.long_name = attribs.long_name
                        nc_var.units = attribs.cf_units
                        nc_var.epic_code = epic_code  # Set it again to be sure it is an int
                        if attribs.cell_methods is not None:
                            nc_var.cell_methods = attribs.cell_methods
                    else:
                        logger.debug("Could not find CF mapping for EPIC code {!s}".format(epic_code))
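# --- Hypothetical override tables (not from the original source) -------------
# normalize_epic_codes() above reads module-level dictionaries named
# variable_name_overrides and long_name_overrides whose real contents live
# elsewhere in the project. Entries shaped like the following would satisfy
# the access pattern used above: an optional 'epic_code' plus an 'overrides'
# dict whose 'convert' callable is applied to the data and whose other keys,
# except 'original_units', become variable attributes. All values are made up.
example_variable_name_overrides = {
    'u_1205': {
        'epic_code': 1205,
        'overrides': dict(
            units='m/s',
            convert=lambda x: x / 100.0,   # e.g. cm/s -> m/s
            original_units='cm/s',         # kept for reference, never written to the variable
        ),
    },
}

example_long_name_overrides = {
    'salinity': {
        'epic_code': 40,
        'overrides': dict(units='1'),
    },
}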
    def __init__(self, output_directory, latitude, longitude, station_name, global_attributes, times=None, verticals=None, vertical_fill=None, output_filename=None, vertical_axis_name=None, vertical_positive=None):
        if output_filename is None:
            output_filename = '{}_{}.nc'.format(station_name, int(random.random() * 100000))
            logger.info("No output filename specified, saving as {}".format(output_filename))

        self.vertical_positive = vertical_positive or 'down'
        self.vertical_axis_name = vertical_axis_name or 'z'
        self.time_axis_name = 'time'

        # Make directory
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

        self.time = None
        self.out_file = os.path.abspath(os.path.join(output_directory, output_filename))
        if os.path.isfile(self.out_file):
            os.remove(self.out_file)

        with EnhancedDataset(self.out_file, 'w') as nc:
            # Global attributes
            # These are set by this script; we don't want anyone to be able to set them manually.
            global_skips = ["time_coverage_start", "time_coverage_end", "time_coverage_duration", "time_coverage_resolution",
                            "featureType", "geospatial_vertical_positive", "geospatial_vertical_min", "geospatial_vertical_max",
                            "geospatial_lat_min", "geospatial_lon_min", "geospatial_lat_max", "geospatial_lon_max",
                            "geospatial_vertical_resolution", "Conventions", "date_created"]
            for k, v in global_attributes.items():
                if v is None:
                    v = "None"
                if k not in global_skips:
                    nc.setncattr(k, v)

            nc.setncattr("Conventions", "CF-1.6")
            nc.setncattr("date_created", datetime.utcnow().strftime("%Y-%m-%dT%H:%M:00Z"))
            nc.setncattr("date_issued", datetime.utcnow().strftime("%Y-%m-%dT%H:%M:00Z"))
            nc.setncattr('cdm_data_type', 'Station')

            # Station name
            nc.createDimension("feature_type_instance", len(station_name))
            name = nc.createVariable("feature_type_instance", "S1", ("feature_type_instance",))
            name.cf_role = "timeseries_id"
            name.long_name = "Identifier for each feature type instance"
            name[:] = list(station_name)

            # Location
            lat = nc.createVariable("latitude", "f8")
            lat.units = "degrees_north"
            lat.standard_name = "latitude"
            lat.long_name = "sensor latitude"
            lat[:] = latitude
            nc.setncattr("geospatial_lat_min", latitude)
            nc.setncattr("geospatial_lat_max", latitude)
            nc.setncattr("geospatial_lat_units", "degrees_north")

            lon = nc.createVariable("longitude", "f8")
            lon.units = "degrees_east"
            lon.standard_name = "longitude"
            lon.long_name = "sensor longitude"
            lon[:] = longitude
            nc.setncattr("geospatial_lon_min", longitude)
            nc.setncattr("geospatial_lon_max", longitude)
            nc.setncattr("geospatial_lon_units", "degrees_east")

            # Metadata variables
            self.crs = nc.createVariable("crs", "i4")
            self.crs.long_name = "http://www.opengis.net/def/crs/EPSG/0/4326"
            self.crs.grid_mapping_name = "latitude_longitude"
            self.crs.epsg_code = "EPSG:4326"
            self.crs.semi_major_axis = float(6378137.0)
            self.crs.inverse_flattening = float(298.257223563)

            platform = nc.createVariable("platform", "i4")
            platform.ioos_code = station_name
            platform.short_name = global_attributes.get("title", station_name)
            platform.long_name = global_attributes.get("description", station_name)
            platform.definition = "http://mmisw.org/ont/ioos/definition/stationID"
            nc.setncattr('platform', 'platform')

        if vertical_fill is None:
            vertical_fill = -9999.9
        self.vertical_fill = vertical_fill

        self.setup_times_and_verticals(times, verticals)
        logger.info("Created file at '{}'".format(self.out_file))