def test_timeseries_profile_duplicate_heights(self):
    filename = 'test_timeseries_profile_duplicate_heights.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = [0, 0, 0, 1, 1, 1]
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = np.repeat([20, 21, 22, 23, 24, 25], 2)
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    self.assertEqual(nc.geospatial_vertical_resolution, '1')
    self.assertEqual(nc.geospatial_vertical_min, 0)
    self.assertEqual(nc.geospatial_vertical_max, 1)
    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('z').size == len(list(set(verticals)))
    assert nc.variables.get('temperature').size == len(times) * len(list(set(verticals)))
    assert (nc.variables.get('temperature')[:] == values.reshape((len(times), 2))).all()

def test_timeseries_profile_with_shape(self):
    filename = 'test_timeseries_profile_with_shape.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = [0, 1, 2]
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = np.repeat([20, 21, 22, 23, 24, 25], len(verticals)).reshape((len(times), len(verticals)))
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    self.assertEqual(nc.geospatial_vertical_resolution, '1 1')
    self.assertEqual(nc.geospatial_vertical_min, 0)
    self.assertEqual(nc.geospatial_vertical_max, 2)
    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('time')[:].dtype == np.int32
    assert nc.variables.get('z').size == len(verticals)
    assert nc.variables.get('temperature').size == len(times) * len(verticals)
    assert (nc.variables.get('temperature')[:] == values.reshape((len(times), len(verticals)))).all()

def test_instrument_metadata_variable(self):
    filename = 'test_timeseries.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = None
    gats = copy(self.global_attributes)
    gats['naming_authority'] = 'pyaxiom'
    gats['geospatial_bounds_vertical_crs'] = 'NAVD88'
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=gats,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = [20, 21, 22, 23, 24, 25]
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature',
                    values=values,
                    attributes=attrs,
                    create_instrument_variable=True,
                    sensor_vertical_datum='bar')

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    assert nc.geospatial_bounds_vertical_crs == 'NAVD88'  # First one set

    datavar = nc.variables.get('temperature')
    instrument_var_name = datavar.instrument
    instvar = nc.variables[instrument_var_name]
    assert instvar.short_name == 'sea_water_temperature'
    assert instvar.ioos_code == urnify(gats['naming_authority'], gats['id'], attrs)

def test_extracting_dataframe_some_masked_heights(self):
    filename = 'test_extracting_dataframe_some_masked_heights.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = [-9999.9, 7.8, 7.9]
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals,
                    vertical_fill=-9999.9)
    values = np.repeat([20, 21, 22, 23, 24, 25], len(verticals))
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('time')[:].dtype == np.int32
    assert nc.variables.get('z').size == len(verticals)
    assert nc.variables.get('z')[:].dtype == np.float64
    assert np.allclose(nc.variables.get('z')[:], np.ma.array([np.nan, 7.8, 7.9], mask=[1, 0, 0]))
    assert nc.variables.get('temperature').size == len(times) * len(verticals)

    df = get_dataframe_from_variable(nc, nc.variables.get('temperature'))
    assert not df['depth'].dropna().empty

def test_history_append_to_list(self):
    filename = 'test_history_append.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = None
    gats = copy(self.global_attributes)
    gats['history'] = 'this is some history\nsome other history\nsome more'
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=gats,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = [20, 21, 22, 23, 24, 25]
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    history = nc.history.split('\n')
    assert len(history) == 4
    assert history[0] == 'this is some history'
    assert history[1] == 'some other history'
    assert history[2] == 'some more'
    assert 'File created using pyaxiom' in history[3]

def test_station_name_as_urn_override_with_globals(self):
    filename = 'test_station_name_as_urn_override_with_globals.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = None
    gats = copy(self.global_attributes)
    gats['title'] = "My Title Override"
    gats['summary'] = "My Summary Override"
    urn = 'urn:ioos:station:myauthority:mylabel'
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=urn,
                    global_attributes=gats,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = [20, 21, 22, 23, 24, 25]
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    assert nc.variables['platform'].ioos_code == urn
    assert nc.variables['platform'].short_name == gats['title']
    assert nc.variables['platform'].long_name == gats['summary']

def test_timeseries_profile_with_bottom_temperature(self):
    filename = 'test_timeseries_profile_with_bottom_temperature.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = [0, 1, 2]
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = np.repeat([20, 21, 22, 23, 24, 25], len(verticals))
    bottom_values = [30, 31, 32, 33, 34, 35]
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs)
    ts.add_variable('bottom_temperature',
                    values=bottom_values,
                    verticals=[60],
                    unlink_from_profile=True,
                    attributes=attrs)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    self.assertEqual(nc.geospatial_vertical_resolution, '1 1')
    self.assertEqual(nc.geospatial_vertical_min, 0)
    self.assertEqual(nc.geospatial_vertical_max, 2)
    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('z').size == len(verticals)
    assert nc.variables.get('temperature').size == len(times) * len(verticals)
    assert nc.variables.get('sensor_depth') is not None
    assert nc.variables.get('bottom_temperature').size == len(times)
    assert (nc.variables.get('temperature')[:] == values.reshape((len(times), len(verticals)))).all()
    assert (nc.variables.get('bottom_temperature')[:] == np.asarray(bottom_values)).all()

def test_timeseries_profile_fill_value_in_z(self):
    filename = 'test_timeseries_profile_fill_value_in_z.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    # Vertical fills MUST be at the BEGINNING of the array!!!!
    verticals = [self.fillvalue, 0]
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = [
        self.fillvalue, 20,
        self.fillvalue, 21,
        self.fillvalue, 22,
        self.fillvalue, 23,
        self.fillvalue, 24,
        self.fillvalue, 25
    ]
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs, fillvalue=self.fillvalue)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    self.assertEqual(nc.geospatial_vertical_resolution, '0')
    self.assertEqual(nc.geospatial_vertical_min, 0)
    self.assertEqual(nc.geospatial_vertical_max, 0)
    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('time')[:].dtype == np.int32
    assert nc.variables.get('z').size == len(verticals)
    assert nc.variables.get('z')[:].dtype == np.float64
    assert nc.variables.get('temperature').size == len(times) * len(verticals)
    assert nc.variables.get('temperature')[:][0][1] == 20
    assert nc.variables.get('temperature')[:].mask[0][0] == True  # noqa
    assert nc.variables.get('temperature')[:][1][1] == 21
    assert nc.variables.get('temperature')[:].mask[1][0] == True  # noqa
    assert nc.variables.get('temperature')[:][2][1] == 22
    assert nc.variables.get('temperature')[:].mask[2][0] == True  # noqa
    assert nc.variables.get('temperature')[:][3][1] == 23
    assert nc.variables.get('temperature')[:].mask[3][0] == True  # noqa
    assert nc.variables.get('temperature')[:][4][1] == 24
    assert nc.variables.get('temperature')[:].mask[4][0] == True  # noqa
    assert nc.variables.get('temperature')[:][5][1] == 25
    assert nc.variables.get('temperature')[:].mask[5][0] == True  # noqa
    assert (nc.variables.get('temperature')[:] == np.asarray(values).reshape((len(times), len(verticals)))).all()

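# The test above relies on an ordering constraint: vertical fill values must
# sit at the BEGINNING of the `verticals` array. A minimal helper sketch for
# pre-ordering an arbitrary verticals array before constructing a TimeSeries.
# This is not part of the original suite: the name `fills_first` is
# hypothetical, a finite fill value (like self.fillvalue) is assumed, and
# numpy is assumed imported as `np` as elsewhere in this module.
def fills_first(verticals, fillvalue):
    v = np.asarray(verticals, dtype='float64')
    is_fill = np.isclose(v, fillvalue)
    # Move every fill entry to the front, keeping the real depths in order
    return np.concatenate([v[is_fill], v[~is_fill]])
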
def test_timeseries_profile_unsorted_time_and_z(self):
    filename = 'test_timeseries_profile_unsorted_time_and_z.nc'
    times = [5000, 1000, 2000, 3000, 4000, 0]
    verticals = [0, 50]
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = np.repeat([20, 21, 22, 23, 24, 25], len(verticals))
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs, fillvalue=self.fillvalue)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    self.assertEqual(nc.geospatial_vertical_resolution, '50')
    self.assertEqual(nc.geospatial_vertical_min, 0)
    self.assertEqual(nc.geospatial_vertical_max, 50)
    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('time')[:].dtype == np.int32
    assert nc.variables.get('z').size == len(verticals)
    assert nc.variables.get('z')[:].dtype == np.int32
    assert nc.variables.get('temperature').size == len(times) * len(verticals)
    # The time axis is sorted on write, so rows are reindexed: the value
    # supplied for time=5000 (20) lands in the last row and the value
    # supplied for time=0 (25) lands in the first.
    assert nc.variables.get('temperature')[:][0][0] == 25
    assert nc.variables.get('temperature')[:][0][1] == 25
    assert nc.variables.get('temperature')[:][1][0] == 21
    assert nc.variables.get('temperature')[:][1][1] == 21
    assert nc.variables.get('temperature')[:][2][0] == 22
    assert nc.variables.get('temperature')[:][2][1] == 22
    assert nc.variables.get('temperature')[:][3][0] == 23
    assert nc.variables.get('temperature')[:][3][1] == 23
    assert nc.variables.get('temperature')[:][4][0] == 24
    assert nc.variables.get('temperature')[:][4][1] == 24
    assert nc.variables.get('temperature')[:][5][0] == 20
    assert nc.variables.get('temperature')[:][5][1] == 20

def test_timeseries_profile_extra_values(self):
    """
    This will map directly to the time variable and ignore any time
    indexes that are not found. The 'times' parameter to add_variable
    should be the same length as the values parameter.
    """
    filename = 'test_timeseries_profile_extra_values.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = [0, 1, 2]
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = np.repeat([20, 21, 22, 23, 24, 25, 26, 27, 28], len(verticals))
    new_times = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]
    values_times = np.repeat(new_times, len(verticals))
    values_verticals = np.repeat(verticals, len(new_times))
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature',
                    values=values,
                    attributes=attrs,
                    times=values_times,
                    verticals=values_verticals)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    self.assertEqual(nc.geospatial_vertical_resolution, '1 1')
    self.assertEqual(nc.geospatial_vertical_min, 0)
    self.assertEqual(nc.geospatial_vertical_max, 2)
    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('time')[:].dtype == np.int32
    assert nc.variables.get('z').size == len(verticals)
    assert nc.variables.get('temperature').size == len(times) * len(verticals)
    assert (nc.variables.get('temperature')[:] == np.repeat([20, 21, 22, 23, 24, 25], len(verticals)).reshape((len(times), len(verticals)))).all()

def test_extracting_dataframe_ordered_masked_heights(self):
    filename = 'test_extracting_dataframe_ordered_masked_heights.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = [np.nan, 7.8]
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals,
                    vertical_fill=np.nan)
    values = np.asarray([[20, 21], [22, 23], [24, 25], [30, 31], [32, 33], [34, 35]])
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('time')[:].dtype == np.int32
    assert nc.variables.get('z').size == len(verticals)
    assert nc.variables.get('z')[:].dtype == np.float64
    # The height order is sorted!
    assert np.allclose(nc.variables.get('z')[:], np.ma.array([7.8, np.nan], mask=[0, 1]))
    assert nc.variables.get('temperature').size == len(times) * len(verticals)
    # Be sure the values are re-arranged because the height order is sorted!
    assert np.isclose(nc.variables.get('temperature')[:][0][0], 21)
    assert np.isclose(nc.variables.get('temperature')[:][1][0], 23)
    assert np.isclose(nc.variables.get('temperature')[:][2][0], 25)
    assert np.isclose(nc.variables.get('temperature')[:][3][0], 31)
    assert np.isclose(nc.variables.get('temperature')[:][4][0], 33)
    assert np.isclose(nc.variables.get('temperature')[:][5][0], 35)

    df = get_dataframe_from_variable(nc, nc.variables.get('temperature'))
    assert not df['depth'].dropna().empty

def test_timeseries_extra_values(self):
    """
    This will map directly to the time variable and ignore any time
    indexes that are not found. The 'times' parameter to add_variable
    should be the same length as the values parameter.
    """
    filename = 'test_timeseries_extra_values.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = None
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = [20, 21, 22, 23, 24, 25, 26, 27, 28]
    value_times = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs, times=value_times)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    self.assertEqual(nc.geospatial_vertical_resolution, '0')
    # No verticals, so these were not set
    with self.assertRaises(AttributeError):
        nc.geospatial_vertical_min
    with self.assertRaises(AttributeError):
        nc.geospatial_vertical_max
    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('time')[:].dtype == np.int32
    assert nc.variables.get('temperature').size == len(times)
    assert (nc.variables.get('temperature')[:] == np.asarray(values[0:6])).all()

def test_timeseries_profile(self):
    filename = 'test_timeseries_profile.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = [0, 1, 2]
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = np.repeat([20, 21, 22, 23, 24, 25], len(verticals))
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None

    # Basic metadata on all timeseries
    self.assertEqual(nc.cdm_data_type, 'Station')
    self.assertEqual(nc.geospatial_lat_units, 'degrees_north')
    self.assertEqual(nc.geospatial_lon_units, 'degrees_east')
    self.assertEqual(nc.geospatial_vertical_units, 'meters')
    self.assertEqual(nc.geospatial_vertical_positive, 'down')
    self.assertEqual(nc.featureType, 'timeSeriesProfile')
    self.assertEqual(nc.geospatial_vertical_resolution, '1 1')
    self.assertEqual(nc.geospatial_vertical_min, 0)
    self.assertEqual(nc.geospatial_vertical_max, 2)

    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('time')[:].dtype == np.int32
    assert nc.variables.get('z').size == len(verticals)
    assert nc.variables.get('z').positive == 'down'
    assert nc.variables.get('z')[:].dtype == np.int32
    assert nc.variables.get('temperature').size == len(times) * len(verticals)
    assert (nc.variables.get('temperature')[:] == values.reshape((len(times), len(verticals)))).all()

def test_timeseries(self):
    filename = 'test_timeseries.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = None
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = [20, 21, 22, 23, 24, 25]
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None

    # Basic metadata on all timeseries
    self.assertEqual(nc.cdm_data_type, 'Station')
    self.assertEqual(nc.geospatial_lat_units, 'degrees_north')
    self.assertEqual(nc.geospatial_lon_units, 'degrees_east')
    self.assertEqual(nc.geospatial_vertical_units, 'meters')
    self.assertEqual(nc.geospatial_vertical_positive, 'down')
    self.assertEqual(nc.featureType, 'timeSeries')
    self.assertEqual(nc.geospatial_vertical_resolution, '0')
    # No verticals, so these were not set
    with self.assertRaises(AttributeError):
        nc.geospatial_vertical_min
    with self.assertRaises(AttributeError):
        nc.geospatial_vertical_max

    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('time')[:].dtype == np.int32
    assert nc.variables.get('temperature').size == len(times)
    assert (nc.variables.get('temperature')[:] == np.asarray(values)).all()

def test_from_variable(self):
    filename = 'test_urn_from_variable.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = None
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)

    values = [20, 21, 22, 23, 24, 25]
    attrs = dict(standard_name='lwe_thickness_of_precipitation_amount',
                 vertical_datum='NAVD88')
    ts.add_variable('temperature', values=values, attributes=attrs)
    ts.ncd.sync()
    urn = urnify('axiom', 'foo', ts.ncd.variables['temperature'])
    assert urn == 'urn:ioos:sensor:axiom:foo:lwe_thickness_of_precipitation_amount#vertical_datum=navd88'

    values = [20, 21, 22, 23, 24, 25]
    attrs = dict(standard_name='lwe_thickness_of_precipitation_amount',
                 cell_methods='time: variance (interval: PT1H comment: sampled instantaneously)')
    ts.add_variable('temperature2', values=values, attributes=attrs)
    ts.ncd.sync()
    urn = urnify('axiom', 'foo', ts.ncd.variables['temperature2'])
    assert urn == 'urn:ioos:sensor:axiom:foo:lwe_thickness_of_precipitation_amount#cell_methods=time:variance;interval=pt1h'

    values = [20, 21, 22, 23, 24, 25]
    attrs = dict(standard_name='lwe_thickness_of_precipitation_amount',
                 cell_methods='time: variance time: mean (interval: PT1H comment: sampled instantaneously)')
    ts.add_variable('temperature3', values=values, attributes=attrs)
    ts.ncd.sync()
    urn = urnify('axiom', 'foo', ts.ncd.variables['temperature3'])
    assert urn == 'urn:ioos:sensor:axiom:foo:lwe_thickness_of_precipitation_amount#cell_methods=time:mean,time:variance;interval=pt1h'

    values = [20, 21, 22, 23, 24, 25]
    attrs = dict(standard_name='lwe_thickness_of_precipitation_amount',
                 cell_methods='time: variance time: mean (interval: PT1H comment: sampled instantaneously)',
                 discriminant='2')
    ts.add_variable('temperature4', values=values, attributes=attrs)
    ts.ncd.sync()
    urn = urnify('axiom', 'foo', ts.ncd.variables['temperature4'])
    assert urn == 'urn:ioos:sensor:axiom:foo:lwe_thickness_of_precipitation_amount-2#cell_methods=time:mean,time:variance;interval=pt1h'

    ts.close()

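# For reference, the sensor URNs asserted above follow this shape (a summary
# of the cases exercised in this test, not an exhaustive spec of urnify):
#
#   urn:ioos:sensor:<naming_authority>:<label>:<standard_name>[-<discriminant>][#<modifiers>]
#
# Modifiers such as cell_methods and interval are lower-cased, sorted, and
# stripped of whitespace and parenthetical comments, and a 'discriminant'
# attribute is appended to the standard_name with a hyphen.
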
def test_timeseries_many_variables(self):
    filename = 'test_timeseries_many_variables.nc'
    times = [0, 1000, 2000, 3000, 4000, 5000]
    verticals = [0, 1, 2]
    ts = TimeSeries(output_directory=self.output_directory,
                    latitude=self.latitude,
                    longitude=self.longitude,
                    station_name=self.station_name,
                    global_attributes=self.global_attributes,
                    output_filename=filename,
                    times=times,
                    verticals=verticals)
    values = np.repeat([20, 21, 22, 23, 24, 25], len(verticals))
    bottom_values = [30, 31, 32, 33, 34, 35]
    full_masked = values.view(np.ma.MaskedArray)
    full_masked.mask = True
    attrs = dict(standard_name='sea_water_temperature')
    ts.add_variable('temperature', values=values, attributes=attrs)
    ts.add_variable('salinity', values=values.reshape((len(times), len(verticals))))
    ts.add_variable('dissolved_oxygen', values=full_masked, fillvalue=full_masked.fill_value)
    ts.add_variable('bottom_temperature',
                    values=bottom_values,
                    verticals=[60],
                    unlink_from_profile=True,
                    attributes=attrs)

    nc = netCDF4.Dataset(os.path.join(self.output_directory, filename))
    assert nc is not None
    self.assertEqual(nc.geospatial_vertical_resolution, '1 1')
    self.assertEqual(nc.geospatial_vertical_min, 0)
    self.assertEqual(nc.geospatial_vertical_max, 2)
    assert nc.variables.get('time').size == len(times)
    assert nc.variables.get('time')[:].dtype == np.int32
    assert nc.variables.get('z').size == len(verticals)
    assert nc.variables.get('z')[:].dtype == np.int32
    assert nc.variables.get('temperature').size == len(times) * len(verticals)
    assert (nc.variables.get('temperature')[:] == values.reshape((len(times), len(verticals)))).all()
    assert (nc.variables.get('salinity')[:] == values.reshape((len(times), len(verticals)))).all()
    assert nc.variables.get('dissolved_oxygen')[:].mask.all()

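# Note on the test above: `values.view(np.ma.MaskedArray)` reinterprets the
# plain ndarray as a masked array sharing the same data buffer, and setting
# `mask = True` then masks every element. That is why `dissolved_oxygen` is
# expected to round-trip as fully masked.
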
def main(output, download_folder, do_download, projects, csv_metadata_file, filesubset=None):
    project_metadata = dict()
    with open(csv_metadata_file, "r") as f:
        reader = csv.DictReader(f)
        for row in reader:
            project_name = row["project_name"]
            if isinstance(project_name, str) and project_name[0] == "#":
                continue
            if projects and project_name.lower() not in projects:
                # Skip projects if a subset was defined
                continue
            project_metadata[project_name] = dict()
            for k, v in row.items():
                project_metadata[project_name][k] = v

    if do_download:
        try:
            downloaded_files = download(download_folder, project_metadata, filesubset)
        except KeyboardInterrupt:
            logger.exception("Error downloading datasets from THREDDS")
            downloaded_files = []
    else:
        downloaded_files = glob(os.path.join(download_folder, "**", "*"))

    for down_file in sorted(downloaded_files):
        _, temp_file = tempfile.mkstemp(prefix="cmg_collector", suffix="nc")

        try:
            if filesubset is not None:
                if os.path.basename(down_file).lower() not in filesubset:  # aka "9631ecp-a.nc"
                    # Skip this file!
                    continue

            project_name = os.path.basename(os.path.dirname(down_file))
            if projects:
                if project_name.lower() not in projects:
                    # Skip this project!
                    continue

            shutil.copy(down_file, temp_file)

            # Cleanup to CF-1.6
            try:
                first_time = normalize_time(temp_file)
            except (TypeError, ValueError, IndexError):
                logger.error("Could not normalize the time variable. Skipping {0}.".format(down_file))
                continue
            except OverflowError:
                logger.error("Dates out of range. Skipping {0}.".format(down_file))
                continue
            normalize_epic_codes(temp_file)
            normalize_vectors(temp_file)
            normalize_units(temp_file)

            # Create list of variables that we want to save.
            mooring_id = None
            latitude = None
            longitude = None

            fname = os.path.basename(down_file)
            feature_name, file_ext = os.path.splitext(os.path.basename(down_file))
            try:
                if int(fname[0]) <= 9 and int(fname[0]) >= 2:
                    # 1.) everything with first char between 2-9 is 3-digit
                    mooring_id = int(fname[0:3])
                elif int(fname[0]) == 1:
                    # 2.) if MOORING starts with 1, and data is newer than 2014, it's 4 digit, otherwise 3 digit.
                    if first_time > datetime(2014, 1, 1, 0):
                        # 4 digit if after Jan 1, 2014
                        mooring_id = int(fname[0:4])
                    else:
                        # 3 digit if before
                        mooring_id = int(fname[0:3])
            except ValueError:
                logger.exception("Could not create a suitable station_id. Skipping {0}.".format(down_file))
                continue

            file_name = os.path.basename(down_file)
            output_directory = os.path.join(output, project_name)
            logger.info("Translating {0} into CF1.6 format: {1}".format(
                down_file, os.path.abspath(os.path.join(output_directory, file_name))))

            with EnhancedDataset(temp_file) as nc:
                try:
                    latitude = nc.variables.get("lat")[0]
                    longitude = nc.variables.get("lon")[0]
                except IndexError:
                    latitude = nc.variables.get("lat")[:]
                    longitude = nc.variables.get("lon")[:]
                except TypeError:
                    logger.error("Could not find lat/lon variables. Skipping {0}.".format(down_file))
                    continue  # as logged: nothing to locate this station, move on to the next file

                file_global_attributes = {k: getattr(nc, k) for k in nc.ncattrs()}
                file_global_attributes.update(global_attributes)
                file_global_attributes["id"] = feature_name
                file_global_attributes["title"] = os.path.basename(down_file)
                file_global_attributes["description"] = "{0} - {1}".format(project_name, os.path.basename(down_file))
                file_global_attributes["MOORING"] = mooring_id
                file_global_attributes["original_filename"] = fname
                file_global_attributes["original_folder"] = project_name

                if project_name in project_metadata:
                    for k, v in project_metadata[project_name].items():
                        if v and k.lower() not in ["id", "title", "catalog_xml", "project_name"]:
                            file_global_attributes[k] = v

                times = nc.variables.get("time")[:]

                # Get all depth values
                depth_variables = []
                for dv in nc.variables:
                    depth_variables += [x for x in nc.variables.get(dv).dimensions if "depth" in x]
                depth_variables = sorted(list(set(depth_variables)))

                try:
                    assert depth_variables
                    depth_values = np.asarray([nc.variables.get(x)[:] for x in depth_variables]).flatten()
                except (AssertionError, TypeError):
                    logger.warning("No depth variables found in {}, skipping.".format(down_file))
                    continue

                # Convert everything to positive up, unless it is specifically specified as "up" already
                depth_conversion = -1.0
                if depth_variables:
                    pull_positive = nc.variables.get(depth_variables[0])
                    if hasattr(pull_positive, "positive") and pull_positive.positive.lower() == "up":
                        depth_conversion = 1.0
                depth_values = depth_values * depth_conversion

                if not os.path.isdir(output_directory):
                    os.makedirs(output_directory)

                ts = TimeSeries(output_directory, latitude, longitude, feature_name, file_global_attributes,
                                times=times, verticals=depth_values, output_filename=file_name,
                                vertical_positive="up")

                # Set the platform type from the global attribute 'platform_type', defaulting to 'fixed'
                with EnhancedDataset(ts.out_file, "a") as onc:
                    platform_type = getattr(onc, "platform_type", "fixed").lower()
                    onc.variables["platform"].setncattr("type", platform_type)
                    onc.variables["platform"].setncattr("nodc_name", "FIXED PLATFORM, MOORINGS")

                v = []
                depth_files = []
                for other in sorted(nc.variables):  # Sorted for a reason... don't change!
                    try:
                        if other in coord_vars:
                            continue

                        old_var = nc.variables.get(other)
                        variable_attributes = {k: getattr(old_var, k) for k in old_var.ncattrs()}
                        # Remove/rename some attributes
                        # https://github.com/USGS-CMG/usgs-cmg-portal/issues/67
                        if "valid_range" in variable_attributes:
                            del variable_attributes["valid_range"]
                        if "minimum" in variable_attributes:
                            variable_attributes["actual_min"] = variable_attributes["minimum"]
                            del variable_attributes["minimum"]
                        if "maximum" in variable_attributes:
                            variable_attributes["actual_max"] = variable_attributes["maximum"]
                            del variable_attributes["maximum"]
                        if "sensor_depth" in variable_attributes:
                            # Convert to the correct positive "up" or "down"
                            variable_attributes["sensor_depth"] = variable_attributes["sensor_depth"] * depth_conversion

                        fillvalue = None
                        if hasattr(old_var, "_FillValue"):
                            fillvalue = old_var._FillValue

                        # Figure out if this is a variable that is repeated at different depths
                        # as different variable names. Assumes sorted.
                        new_var_name = other.split("_")[0]
                        if new_var_name in ts.ncd.variables:
                            # Already in new file (processed when the first was encountered in the loop below)
                            continue

                        # Get the depth index
                        depth_variable = [x for x in old_var.dimensions if "depth" in x]
                        if depth_variable and len(old_var.dimensions) > 1 and "time" in old_var.dimensions:
                            depth_index = np.squeeze(np.where(depth_values == (nc.variables.get(depth_variable[0])[:] * depth_conversion)))
                            # Find other variable names like this one
                            depth_indexes = [(other, depth_index)]
                            for search_var in sorted(nc.variables):
                                # If they have different depth dimension names we need to combine them into one variable
                                if (search_var != other and
                                        search_var.split("_")[0] == new_var_name and
                                        depth_variable[0] != [x for x in nc.variables[search_var].dimensions if "depth" in x][0]):
                                    # Found a match at a different depth
                                    search_depth_variable = [x for x in nc.variables.get(search_var).dimensions if "depth" in x]
                                    depth_index = np.squeeze(np.where(depth_values == (nc.variables.get(search_depth_variable[0])[:] * depth_conversion)))
                                    depth_indexes.append((search_var, depth_index))
                                    logger.info("Combining '{}' with '{}' as '{}' (different variables at different depths but are the same parameter)".format(search_var, other, new_var_name))

                            values = np.ma.empty((times.size, len(depth_values)), dtype=old_var.dtype)
                            values.fill_value = fillvalue
                            values.mask = True
                            inconsistent = False
                            for nm, index in depth_indexes:
                                try:
                                    values[:, index] = np.squeeze(nc.variables.get(nm)[:])
                                except ValueError:
                                    inconsistent = True
                                    break

                            # If we just have one index we want to use the original name
                            if len(depth_indexes) == 1:
                                # Just use the original variable name
                                new_var_name = other

                            if inconsistent is True:
                                # Incorrect array size, most likely a strange variable
                                ts.add_variable_object(old_var, dimension_map=dict(depth="z"), reduce_dims=True)
                            else:
                                # Create this one, should be the first we encounter for this type
                                ts.add_variable(new_var_name, values=values, times=times,
                                                fillvalue=fillvalue, attributes=variable_attributes)

                        elif len(old_var.dimensions) == 1 and old_var.dimensions[0] == "time":
                            # A single time dimensioned variable, like pitch, roll, record count, etc.
                            ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True,
                                            fillvalue=fillvalue, attributes=variable_attributes)

                        elif (old_var.ndim <= 3 and hasattr(old_var, "sensor_depth") and
                                ((depth_values.size == 1 and not depth_variable and "time" in old_var.dimensions) or
                                 (depth_values.size > 1 and not depth_variable and "time" in old_var.dimensions and
                                  "sensor_depth" in ts.ncd.variables))):
                            if "sensor_depth" in ts.ncd.variables and np.isclose(
                                    ts.ncd.variables["sensor_depth"][:], old_var.sensor_depth * depth_conversion):
                                ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True,
                                                verticals=[old_var.sensor_depth * depth_conversion],
                                                fillvalue=fillvalue, attributes=variable_attributes)
                            else:
                                # Search through secondary files that have been created for detached variables
                                # at a certain depth and try to match this variable with one of the depths.
                                found_df = False
                                for dfts in depth_files:
                                    if isinstance(old_var.sensor_depth, np.ndarray):
                                        # Well, this is a bad file.
                                        raise ValueError("The sensor_depth attribute has more than one value, please fix the source NetCDF: {}".format(down_file))
                                    if np.isclose(dfts.ncd.variables[ts.vertical_axis_name][:],
                                                  old_var.sensor_depth * depth_conversion):
                                        dfts.add_variable(other, values=old_var[:], times=times,
                                                          unlink_from_profile=True,
                                                          verticals=[old_var.sensor_depth * depth_conversion],
                                                          fillvalue=fillvalue, attributes=variable_attributes)
                                        found_df = True
                                        break

                                # If we couldn't match the current or one of the existing secondary depth files, create a new one.
                                if found_df is False:
                                    new_file_name = file_name.replace(file_ext, "_z{}{}".format(len(depth_files) + 1, file_ext))
                                    fga = copy(file_global_attributes)
                                    fga["id"] = os.path.splitext(new_file_name)[0]
                                    fga["title"] = "{0} - {1}".format(os.path.basename(down_file), other)
                                    fga["description"] = "{0} - {1} - {2}".format(project_name, os.path.basename(down_file), other)
                                    new_ts = TimeSeries(output_directory, latitude, longitude, feature_name, fga,
                                                        times=times,
                                                        verticals=[old_var.sensor_depth * depth_conversion],
                                                        output_filename=new_file_name, vertical_positive="up")
                                    new_ts.add_variable(other, values=old_var[:], times=times,
                                                        verticals=[old_var.sensor_depth * depth_conversion],
                                                        fillvalue=fillvalue, attributes=variable_attributes)
                                    depth_files.append(new_ts)

                        elif old_var.ndim <= 3 and (depth_values.size > 1 and not depth_variable and "time" in old_var.dimensions):
                            if hasattr(old_var, "sensor_depth"):
                                # An ADCP or profiling dataset, but this variable is measured at a single depth.
                                # Example: Bottom Temperature on an ADCP
                                # Skip things with a dimension over 3 (some beam variables like `brange`)
                                ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True,
                                                verticals=[old_var.sensor_depth * depth_conversion],
                                                fillvalue=fillvalue, attributes=variable_attributes)
                            else:
                                ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True,
                                                fillvalue=fillvalue, attributes=variable_attributes)

                        else:
                            if "time" in old_var.dimensions and old_var.ndim <= 3:
                                ts.add_variable(other, values=old_var[:], times=times,
                                                fillvalue=fillvalue, attributes=variable_attributes)
                            else:
                                ts.add_variable_object(old_var, dimension_map=dict(depth="z"), reduce_dims=True)

                    except BaseException:
                        logger.exception("Error processing variable {0} in {1}. Skipping it.".format(other, down_file))

        except KeyboardInterrupt:
            logger.info("Breaking out of Translate loop!")
            break
        except BaseException:
            logger.exception("Error. Skipping {0}.".format(down_file))
            continue
        finally:
            if os.path.isfile(temp_file):
                os.remove(temp_file)

# ### Add data variables # In[10]: df.columns.tolist() # In[11]: for c in df.columns: if c in ts._nc.variables: print("Skipping '{}' (already in file)".format(c)) continue if c in ['time', 'lat', 'lon', 'depth', 'cpm_date_time_string']: print("Skipping axis '{}' (already in file)".format(c)) continue print("Adding {}".format(c)) try: ts.add_variable(c, df[c].values) except Exception: print("Skipping '{}' (non-numeric object column)".format(c)) # In[ ]:
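# A hedged alternative to the broad except in the cell above: detect object-dtype
# (non-numeric) columns up front instead of catching the failure. The DataFrame
# here is a stand-in, not the notebook's df.
import pandas as pd

demo = pd.DataFrame({'temperature': [20.1, 20.3], 'notes': ['calm', 'windy']})
numeric_columns = [c for c in demo.columns if demo[c].dtype != object]
# numeric_columns -> ['temperature']; 'notes' is the kind of column the
# except clause was silently skipping.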
def main(output, download_folder, do_download, projects, csv_metadata_file, filesubset=None, since=None): project_metadata = dict() with open(csv_metadata_file, 'r') as f: reader = csv.DictReader(f) for row in reader: project_name = row['project_name'] if isinstance(project_name, str) and project_name[0] == '#': continue if projects and project_name.lower() not in projects: # Skip projects if a subset was defined continue project_metadata[project_name] = dict() for k, v in row.items(): project_metadata[project_name][k] = v if do_download: try: downloaded_files = download(download_folder, project_metadata, filesubset, since) except KeyboardInterrupt: logger.exception('Error downloading datasets from THREDDS') downloaded_files = [] else: downloaded_files = glob(os.path.join(download_folder, '**', '*')) if since is not None: def should_keep(d): modt = datetime.utcfromtimestamp(os.path.getmtime(d)).replace(tzinfo=pytz.utc) return modt >= since downloaded_files = [ dl for dl in downloaded_files if should_keep(dl) ] for down_file in sorted(downloaded_files): temp_fd, temp_file = tempfile.mkstemp(prefix='cmg_collector', suffix='nc') try: if filesubset is not None: if os.path.basename(down_file).lower() not in filesubset: # aka "9631ecp-a.nc" # Skip this file! continue project_name = os.path.basename(os.path.dirname(down_file)) if projects: if project_name.lower() not in projects: # Skip this project! continue shutil.copy(down_file, temp_file) # Cleanup to CF-1.6 try: first_time = normalize_time(temp_file) except (TypeError, ValueError, IndexError): logger.exception("Could not normalize the time variable. Skipping {0}.".format(down_file)) continue except OverflowError: logger.error("Dates out of range. Skipping {0}.".format(down_file)) continue normalize_variable_attribute_types(temp_file) normalize_epic_codes(temp_file, down_file) normalize_vectors(temp_file) normalize_units(temp_file) # Create list of variables that we want to save. mooring_id = None latitude = None longitude = None fname = os.path.basename(down_file) feature_name, file_ext = os.path.splitext(os.path.basename(down_file)) try: if int(fname[0]) <= 9 and int(fname[0]) >= 2: # 1.) everything with first char between 2-9 is 3-digit mooring_id = int(fname[0:3]) elif int(fname[0]) == 1: # 2.) if MOORING starts with 1, and data is newer than 2014, it's 4 digit, otherwise 3 digit. if first_time > datetime(2014, 1, 1, 0): # 4 digit if after Jan 1, 2014 mooring_id = int(fname[0:4]) else: # 3 digit if before mooring_id = int(fname[0:3]) except ValueError: logger.exception("Could not create a suitable station_id. Skipping {0}.".format(down_file)) continue file_name = os.path.basename(down_file) output_directory = os.path.join(output, project_name) logger.info("Translating {0} into CF1.6 format: {1}".format(down_file, os.path.abspath(os.path.join(output_directory, file_name)))) with EnhancedDataset(temp_file) as nc: try: latitude = nc.variables.get("lat")[0] longitude = nc.variables.get("lon")[0] except IndexError: latitude = nc.variables.get("lat")[:] longitude = nc.variables.get("lon")[:] except TypeError: logger.error("Could not find lat/lon variables. 
Skipping {0}.".format(down_file)) file_global_attributes = { k : getattr(nc, k) for k in nc.ncattrs() } file_global_attributes.update(global_attributes) file_global_attributes['id'] = feature_name file_global_attributes['MOORING'] = mooring_id file_global_attributes['original_filename'] = fname file_global_attributes['original_folder'] = project_name no_override = ['id', 'MOORING', 'original_filename', 'original_folder', 'catalog_xml', 'project_name'] if project_name in project_metadata: for k, v in project_metadata[project_name].items(): if v and k.lower() not in no_override: file_global_attributes[k] = v if 'summary' in file_global_attributes: # Save the original summary file_global_attributes['WHOI_Buoy_Group_summary'] = file_global_attributes['summary'] # Better title/summary for discovery via catalogs project_title = file_global_attributes.get('project_title', project_name).strip() project_summary = file_global_attributes.get('project_summary', '').strip() file_global_attributes['title'] = 'USGS-CMG time-series data: {0} - {1} - {2}'.format(project_name, mooring_id, feature_name) file_global_attributes['summary'] = 'USGS-CMG time-series data from the {} project, mooring {} and package {}. {}'.format(project_title, mooring_id, feature_name, project_summary).strip() times = nc.variables.get('time')[:] # Get all depth values depth_variables = [] for dv in nc.variables: depth_variables += [ x for x in nc.variables.get(dv).dimensions if 'depth' in x ] depth_variables = sorted(list(set(depth_variables))) try: assert depth_variables depth_values = np.asarray([ nc.variables.get(x)[:] for x in depth_variables ]).flatten() except (AssertionError, TypeError): logger.warning("No depth variables found in {}, skipping.".format(down_file)) continue # Convert everything to positive up, unless it is specifically specified as "up" already depth_conversion = -1.0 if depth_variables: pull_positive = nc.variables.get(depth_variables[0]) if hasattr(pull_positive, 'positive') and pull_positive.positive.lower() == 'up': depth_conversion = 1.0 depth_values = depth_values * depth_conversion if not os.path.isdir(output_directory): os.makedirs(output_directory) ts = TimeSeries(output_directory, latitude, longitude, feature_name, file_global_attributes, times=times, verticals=depth_values, output_filename=file_name, vertical_positive='up') # Set the platform type from the global attribute 'platform_type', defaulting to 'fixed' with EnhancedDataset(ts.out_file, 'a') as onc: platform_type = getattr(onc, 'platform_type', 'fixed').lower() onc.variables['platform'].setncattr('type', platform_type) onc.variables['platform'].setncattr('nodc_name', "FIXED PLATFORM, MOORINGS") # Add ERDDAP variables onc.cdm_data_type = "TimeSeries" onc.cdm_timeseries_variables = "latitude,longitude,z,feature_type_instance" v = [] depth_files = [] for other in sorted(nc.variables): # Sorted for a reason... don't change! 
try: if other in coord_vars: continue ovsd = None # old var sensor depth old_var = nc.variables.get(other) variable_attributes = { k : getattr(old_var, k) for k in old_var.ncattrs() } # Remove/rename some attributes # https://github.com/USGS-CMG/usgs-cmg-portal/issues/67 if 'valid_range' in variable_attributes: del variable_attributes['valid_range'] if 'minimum' in variable_attributes: variable_attributes['actual_min'] = variable_attributes['minimum'] del variable_attributes['minimum'] if 'maximum' in variable_attributes: variable_attributes['actual_max'] = variable_attributes['maximum'] del variable_attributes['maximum'] if 'sensor_depth' in variable_attributes: # sensor_depth is ALWAYS positive "down", so don't convert! # This is contrary to the "positive" attribute on the Z axis. # variable_attributes['sensor_depth'] = variable_attributes['sensor_depth'] * -1 # Round the sensor_depth attribute variable_attributes['sensor_depth'] = np.around(variable_attributes['sensor_depth'], decimals=4) ovsd = np.around(old_var.sensor_depth * depth_conversion, decimals=4) fillvalue = None if hasattr(old_var, "_FillValue"): fillvalue = old_var._FillValue # Figure out if this is a variable that is repeated at different depths # as different variable names. Assumes sorted. new_var_name = other.split('_')[0] if new_var_name in ts.ncd.variables: # Already in new file (processed when the first was encountered in the loop below) continue # Get the depth index depth_variable = [ x for x in old_var.dimensions if 'depth' in x ] if depth_variable and len(old_var.dimensions) > 1 and 'time' in old_var.dimensions: depth_index = np.squeeze(np.where(depth_values == (nc.variables.get(depth_variable[0])[:] * depth_conversion))) # Find other variable names like this one depth_indexes = [(other, depth_index)] for search_var in sorted(nc.variables): # If they have different depth dimension names we need to combine them into one variable if search_var != other and search_var.split('_')[0] == new_var_name and \ depth_variable[0] != [ x for x in nc.variables[search_var].dimensions if 'depth' in x ][0]: # Found a match at a different depth search_depth_variable = [ x for x in nc.variables.get(search_var).dimensions if 'depth' in x ] depth_index = np.squeeze(np.where(depth_values == (nc.variables.get(search_depth_variable[0])[:] * depth_conversion))) depth_indexes.append((search_var, depth_index)) logger.info("Combining '{}' with '{}' as '{}' (different variables at different depths but are the same parameter)".format(search_var, other, new_var_name)) values = np.ma.empty((times.size, len(depth_values)), dtype=old_var.dtype) values.fill_value = values.dtype.type(fillvalue) fillvalue = values.dtype.type(fillvalue) values.mask = True inconsistent = False for nm, index in depth_indexes: try: values[:, index] = np.squeeze(nc.variables.get(nm)[:]) except ValueError: inconsistent = True break # If we just have one index we want to use the original name if len(depth_indexes) == 1: # Just use the original variable name new_var_name = other if inconsistent is True: # Incorrect array size, most likely a strange variable ts.add_variable_object(old_var, dimension_map=dict(depth='z'), reduce_dims=True) else: # Create this one, should be the first we encounter for this type ts.add_variable(new_var_name, values=values, times=times, fillvalue=fillvalue, attributes=variable_attributes) elif len(old_var.dimensions) == 1 and old_var.dimensions[0] == 'time': # A single time dimensioned variable, like pitch, roll, record count, etc. 
ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, fillvalue=fillvalue, attributes=variable_attributes) elif old_var.ndim <= 3 and ovsd and \ ((depth_values.size == 1 and not depth_variable and 'time' in old_var.dimensions) or (depth_values.size > 1 and not depth_variable and 'time' in old_var.dimensions and 'sensor_depth' in ts.ncd.variables)): if 'sensor_depth' in ts.ncd.variables and np.isclose(ts.ncd.variables['sensor_depth'][:], ovsd): ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes) else: # Search through secondary files that have been created for detached variables at a certain depth and # try to match this variable with one of the depths. found_df = False for dfts in depth_files: if isinstance(ovsd, np.ndarray): # Well, this is a bad file. raise ValueError("The sensor_depth attribute has more than one value, please fix the source NetCDF: {}".format(down_file)) if np.isclose(dfts.ncd.variables[ts.vertical_axis_name][:], ovsd): dfts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes) found_df = True break # If we couldn't match the current or one of the existing secondary depth files, create a new one. if found_df is False: new_file_name = file_name.replace(file_ext, '_z{}{}'.format(len(depth_files) + 1, file_ext)) fga = copy(file_global_attributes) fga['id'] = os.path.splitext(new_file_name)[0] new_ts = TimeSeries(output_directory, latitude, longitude, feature_name, fga, times=times, verticals=[ovsd], output_filename=new_file_name, vertical_positive='up') new_ts.add_variable(other, values=old_var[:], times=times, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes) depth_files.append(new_ts) elif old_var.ndim <= 3 and (depth_values.size > 1 and not depth_variable and 'time' in old_var.dimensions): if ovsd: # An ADCP or profiling dataset, but this variable is measured at a single depth. # Example: Bottom Temperature on an ADCP # Skip things with a dimension over 3 (some beam variables like `brange`) ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes) else: ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, fillvalue=fillvalue, attributes=variable_attributes) else: if 'time' in old_var.dimensions and old_var.ndim <= 3: ts.add_variable(other, values=old_var[:], times=times, fillvalue=fillvalue, attributes=variable_attributes) else: ts.add_variable_object(old_var, dimension_map=dict(depth='z'), reduce_dims=True) except BaseException: logger.exception("Error processing variable {0} in {1}. Skipping it.".format(other, down_file)) except KeyboardInterrupt: logger.info("Breaking out of Translate loop!") break except BaseException: logger.exception("Error. Skipping {0}.".format(down_file)) continue finally: try: for df in depth_files: del df except NameError: pass try: del ts except NameError: pass os.close(temp_fd) if os.path.isfile(temp_file): os.remove(temp_file)
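# Sketch of the secondary depth-file lookup above (find_depth_file is a
# hypothetical helper, not in the original): a detached variable's rounded
# sensor depth (ovsd) is compared against the single z value of each
# already-created "_z<n>" file, and a new file is created only on a miss.
import numpy as np

def find_depth_file(existing_z_values, ovsd):
    # existing_z_values: one z value per secondary TimeSeries file
    for i, z in enumerate(existing_z_values):
        if np.isclose(z, ovsd):
            return i  # reuse this file for the variable
    return None  # caller appends a new TimeSeries to depth_files

# find_depth_file([1.25, 7.5], 7.5) -> 1; find_depth_file([1.25], 7.5) -> None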
def parse_type_1(output_format, site_id, contents, output, csv_link): """ # ---------------------------------- WARNING ---------------------------------------- # The data you have obtained from this automated U.S. Geological Survey database # have not received Director's approval and as such are provisional and subject to # revision. The data are released on the condition that neither the USGS nor the # United States Government may be held liable for any damages resulting from its use. # Additional info: http://waterdata.usgs.gov/ga/nwis/help/?provisional # # File-format description: http://waterdata.usgs.gov/nwis/?tab_delimited_format_info # Automated-retrieval info: http://waterdata.usgs.gov/nwis/?automated_retrieval_info # # Contact: [email protected] # retrieved: 2012-11-20 12:05:22 EST (caww01) # # Data for the following 1 site(s) are contained in this file # USGS 395740074482628 South Branch Rancocas Cr at S Main St nr Lumberton # ----------------------------------------------------------------------------------- # # Data provided for site 395740074482628 # DD parameter Description # 03 00035 Wind speed, miles per hour # 07 00025 Barometric pressure, millimeters of mercury # 09 00045 Precipitation, total, inches # 19 63160 Stream water level elevation above NAVD 1988, in feet # # Data-value qualification codes included in this output: # P Provisional data subject to revision. # agency_cd site_no datetime tz_cd 03_00035 03_00035_cd 07_00025 07_00025_cd 09_00045 09_00045_cd 19_63160 19_63160_cd 5s 15s 20d 6s 14n 10s 14n 10s 14n 10s 14n 10s USGS 395740074482628 2012-10-28 13:00 EST 4.2 P 755 P 3.22 P USGS 395740074482628 2012-10-28 13:15 EST 6.4 P 754 P 0.00 P 3.36 P USGS 395740074482628 2012-10-28 13:30 EST 3.6 P 754 P 0.00 P 3.50 P USGS 395740074482628 2012-10-28 13:45 EST 3.2 P 754 P 0.00 P 3.63 P USGS 395740074482628 2012-10-28 14:00 EST 7.0 P 754 P 0.00 P 3.76 P USGS 395740074482628 2012-10-28 14:15 EST 4.0 P 754 P 0.00 P 3.87 P ... 
""" # lat/lon point: http://waterservices.usgs.gov/nwis/site/?sites=395740074482628 variable_map = { '01_00065' : {'long_name' : 'Gage height', 'geoid_name' : 'NAVD88', 'vertical_datum' : 'NAVD88', 'water_surface_reference_datum' : 'NAVD88', 'standard_name' : 'water_surface_height_above_reference_datum', 'units': 'feet'}, '03_00035' : {'long_name' : 'Wind Speed', 'standard_name' : 'wind_speed', 'units': 'mph'}, '04_00035' : {'long_name' : 'Wind Gust', 'standard_name' : 'wind_speed_of_gust', 'units': 'mph'}, '05_00035' : {'long_name' : 'Wind Speed', 'standard_name' : 'wind_speed', 'units': 'mph'}, '06_00035' : {'long_name' : 'Wind Gust', 'standard_name' : 'wind_speed_of_gust', 'units': 'mph'}, '04_00036' : {'long_name' : 'Wind Direction', 'standard_name' : 'wind_from_direction', 'units': 'degrees'}, '02_00036' : {'long_name' : 'Wind Direction', 'standard_name' : 'wind_from_direction', 'units': 'degrees'}, '05_00025' : {'long_name' : 'Air Pressure', 'standard_name' : 'air_pressure', 'units': 'mm of mercury'}, '07_00025' : {'long_name' : 'Air Pressure', 'standard_name' : 'air_pressure', 'units': 'mm of mercury'}, '09_00025' : {'long_name' : 'Air Pressure', 'standard_name' : 'air_pressure', 'units': 'mm of mercury'}, '03_00045' : {'long_name' : 'Total Precipitation', 'standard_name' : 'lwe_thickness_of_precipitation_amount', 'units': 'inches'}, '08_00045' : {'long_name' : 'Total Precipitation', 'standard_name' : 'lwe_thickness_of_precipitation_amount', 'units': 'inches'}, '09_00045' : {'long_name' : 'Total Precipitation', 'standard_name' : 'lwe_thickness_of_precipitation_amount', 'units': 'inches'}, '06_00052' : {'long_name' : 'Relative Humidity', 'standard_name' : 'relative_humidity', 'units': 'percent'}, '07_00052' : {'long_name' : 'Relative Humidity', 'standard_name' : 'relative_humidity', 'units': 'percent'}, '08_00052' : {'long_name' : 'Relative Humidity', 'standard_name' : 'relative_humidity', 'units': 'percent'}, '05_00020' : {'long_name' : 'Air Temperature', 'standard_name' : 'air_temperature', 'units': 'degrees_Celsius'}, '06_00020' : {'long_name' : 'Air Temperature', 'standard_name' : 'air_temperature', 'units': 'degrees_Celsius'}, '07_00020' : {'long_name' : 'Air Temperature', 'standard_name' : 'air_temperature', 'units': 'degrees_Celsius'}, '19_63160' : {'long_name' : 'Water Surface Height Above Reference Datum (NAVD88)', 'geoid_name' : 'NAVD88', 'vertical_datum' : 'NAVD88', 'water_surface_reference_datum' : 'NAVD88', 'standard_name' : 'water_surface_height_above_reference_datum', 'units': 'feet'}, '01_63160' : {'long_name' : 'Water Surface Height Above Reference Datum (NAVD88)', 'geoid_name' : 'NAVD88', 'vertical_datum' : 'NAVD88', 'water_surface_reference_datum' : 'NAVD88', 'standard_name' : 'water_surface_height_above_reference_datum', 'units': 'feet'}, } # Get metadata from a seperate endpoint. d = requests.get("http://waterservices.usgs.gov/nwis/site/?sites={!s}".format(site_id)) try: d.raise_for_status() except requests.exceptions.HTTPError: logger.error("Could not find lat/lon endpoint for station {!s}, skipping. Status code: {!s}".format(site_id, d.status_code)) return _, hz, dz = split_file(d.text, "agency_cd") # Strip off the one line after the headers dz = dz[1:] dfz = pd.DataFrame(dz, columns=hz) lat = float(dfz["dec_lat_va"][0]) lon = float(dfz["dec_long_va"][0]) sensor_vertical_datum = dfz["alt_datum_cd"][0] or "NAVD88" try: z = float(dfz["alt_va"][0]) except ValueError: z = 0. 
loc = "POINT({!s} {!s} {!s})".format(lon, lat, z) name = dfz["station_nm"][0] comments, headers, data = split_file(contents, "agency_cd") df = pd.DataFrame(data, columns=headers) fillvalue = -9999.9 # Combine date columns dates = df["datetime"] tz = df["tz_cd"] new_dates = list() for i in range(len(dates)): try: new_dates.append(parse(dates[i] + " " + tz[i]).astimezone(pytz.utc)) except BaseException: # Remove row. Bad date. df.drop(i, axis=0, inplace=True) continue df['time'] = new_dates df['depth'] = [ z for x in range(len(df['time'])) ] # Strip out "_cd" columns (quality checks for USGS) for h in headers: if "_cd" in h: df.drop(h, axis=1, inplace=True) # Add global attributes to appear in the resulting NetCDF file global_attributes = dict( title=name, summary='USGS Hurricane Sandy Rapid Response Stations. Data acquired from "http://ga.water.usgs.gov/flood/hurricane/sandy/datafiles/.', keywords="usgs, waterdata, elevation, water, waterlevel, sandy, hurricane, rapid, response, %s" % site_id, keywords_vocaublary="None", naming_authority='gov.usgs', id=site_id, cdm_data_type="Station", history="NetCDF file generated from {!s}".format(csv_link), creator="USGS", creator_url="http://waterdata.usgs.gov", creator_institution="USGS", creator_urn="gov.usgs", publisher="Axiom Data Science", publisher_uri="http://axiomdatascience.com", processing_level="None", acknowledgement="None", geospatial_bounds=loc, geospatial_lat_min=lat, geospatial_lat_max=lat, geospatial_lon_min=lon, geospatial_lon_max=lon, license="Freely Distributed", date_created=datetime.utcnow().replace(second=0, microsecond=0).isoformat() ) def to_floats(x): try: return float(x) except ValueError: return fillvalue min_time = df['time'].min() max_time = df['time'].max() full_station_urn = "urn:ioos:station:{!s}:{!s}".format(global_attributes["naming_authority"], site_id) if output_format == 'cf16': output_filename = '{}_{}-{}.nc'.format(site_id, min_time.strftime('%Y%m%dT%H%M%S'), max_time.strftime('%Y%m%dT%H%M%S')) times = [ calendar.timegm(x.timetuple()) for x in df["time"] ] verticals = df['depth'].values ts = TimeSeries(output, latitude=lat, longitude=lon, station_name=full_station_urn, global_attributes=global_attributes, output_filename=output_filename, times=times, verticals=verticals, vertical_axis_name='z') for var in df.columns: if var in ['datetime', 'time', 'depth', 'tz_cd', 'site_no', 'agency_cd']: continue try: var_meta = variable_map[var] except KeyError: logger.error("Variable {!s} was not found in variable map!".format(var)) continue # Convert to floats df[var] = df[var].map(to_floats) # Change feet to meters if var_meta["units"] in ["feet", "ft"]: df[var] = np.asarray([ v * 0.3048 if v != fillvalue else v for v in df[var] ]) var_meta["units"] = "meters" if output_format == 'axiom': full_sensor_urn = "urn:ioos:sensor:{!s}:{!s}:{!s}".format(global_attributes["naming_authority"], site_id, var_meta["standard_name"]) output_directory = os.path.join(output, full_sensor_urn) output_filename = '{}_{}-{}.nc'.format(var, min_time.strftime('%Y%m%dT%H%M%S'), max_time.strftime('%Y%m%dT%H%M%S')) ts = TimeSeries.from_dataframe(df, output_directory, output_filename, lat, lon, full_station_urn, global_attributes, var_meta["standard_name"], var_meta, sensor_vertical_datum=sensor_vertical_datum, fillvalue=fillvalue, data_column=var, vertical_axis_name='height') ts.add_instrument_metadata(urn=full_sensor_urn) ts.close() elif output_format == 'cf16': # Variable names shouldn't start with a number try: int(var[0]) variable_name = 
'v_{}'.format(var) except: variable_name = var ts.add_variable(variable_name, values=df[var].values, attributes=var_meta, fillvalue=fillvalue, sensor_vertical_datum=sensor_vertical_datum) if output_format == 'cf16': ts.close()
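# The leading-digit guard above, pulled out as a tiny helper (a sketch, not
# part of the original script): CF/NetCDF variable names may not begin with a
# digit, so numeric NWIS parameter codes get a 'v_' prefix.
def safe_variable_name(var):
    return 'v_{}'.format(var) if var[:1].isdigit() else var

# safe_variable_name('03_00035') -> 'v_03_00035'; safe_variable_name('elevation') -> 'elevation'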
def parse_type_2(output_format, site_id, contents, output, csv_link): """ # These data are provisional and subject to revision. # Data processed as of 12/05/2012 11:54:29. # Data collected as part of Hurricane Sandy (2012) Storm Tide project. # Data are archived at http://water.usgs.gov/floods/events/2012/isaac/index.php # Elevation determined from GPS surveys (NAVD 88). # Time datum is GMT (Greenwich Mean Time). # Water density estimated on basis of sensor location # where saltwater = 63.989 lb/ft3 (Saltwater = dissolved solids concentration greater than 20000 milligrams per liter) # where brackish water = 63.052 lb/ft3 (Brackish water = dissolved solids concentration between 1000 and 20000 milligrams per liter) # where freshwater = 62.428 lb/ft3 (Freshwater = dissolved solids concentration less than 1000 milligrams per liter) # The equation used to compute elevation from recorded pressure is # (((sp-bp)*144)/d)+e # Where sp = surge pressure in psi; bp = barometric pressure in psi; # d = water density in lb/ft3; and e = elevation of sensor in ft above NAVD 88. # Barometric data from nearest pressure sensor. Location for the barometric sensor is listed below. # Elevation is computer-rounded to two decimal places. # Sensor information # Site id = SSS-NY-WES-001WL # Site type = water level # Horizontal datum used is NAD 83 # Sensor location latitude 40.942755 # Sensor location longitude -73.719828 # Sensor elevation above NAVD 88 = -3.97 ft # Lowest recordable water elevation is -3.90 ft # Water density value used = 63.989 lb/ft3 # Barometric sensor site (source of bp) = SSS-NY-WES-002BP # Barometric sensor location latitude 40.90754368 # Barometric sensor location longitude -73.8692184 date_time_GMT elevation nearest_barometric_sensor_psi 10-28-2012 06:00:00 0.88 14.5145 10-28-2012 06:00:30 0.86 14.5145 10-28-2012 06:01:00 0.85 14.5170 10-28-2012 06:01:30 0.85 14.5145 10-28-2012 06:02:00 0.84 14.5170 10-28-2012 06:02:30 0.81 14.5145 10-28-2012 06:03:00 0.76 14.5145 ... """ variable_map = { 'elevation' : {'long_name' : 'Water Level Elevation above Reference Datum (NAVD88)', 'geoid_name' : 'NAVD88', 'vertical_datum' : 'NAVD88', 'water_surface_reference_datum' : 'NAVD88', 'standard_name' : 'water_surface_height_above_reference_datum', 'units': 'feet'}, } def to_floats(x): try: return float(x) except ValueError: return fillvalue comments, headers, data = split_file(contents, "date_time_GMT") df = pd.DataFrame(data, columns=headers) fillvalue = -9999.9 lat = None lon = None z = 0 name = site_id sensor_vertical_datum = "NAVD88" for c in comments: if "Sensor location latitude" in c: lat = float(filter(None, map(lambda x: x.strip(), c.split(" ")))[-1]) elif "Sensor location longitude" in c: lon = float(filter(None, map(lambda x: x.strip(), c.split(" ")))[-1]) elif "Site id" in c: site_id = filter(None, map(lambda x: x.strip(), c.split(" ")))[-1] name = site_id elif "Sensor elevation" in c: sensor_vertical_datum = "".join(c.split("=")[0].split(" ")[4:6]) l = filter(None, map(lambda x: x.strip(), c.split(" "))) z = float(l[-2]) if l[-1] in ["feet", "ft"]: z *= 0.3048 loc = "POINT({!s} {!s} {!s})".format(lon, lat, z) df['time'] = df["date_time_GMT"].map(lambda x: parse(x + " UTC")) df['depth'] = [ z for x in range(len(df['time'])) ] # Add global attributes to appear in the resulting NetCDF file global_attributes = dict( title=name, summary='USGS Hurricane Sandy Rapid Response Stations. 
Data acquired from http://ga.water.usgs.gov/flood/hurricane/sandy/datafiles/.', keywords="usgs, waterdata, elevation, water, waterlevel, sandy, hurricane, rapid, response, %s" % site_id, keywords_vocabulary="None", naming_authority='gov.usgs', id=site_id, cdm_data_type="Station", history="NetCDF file generated from {!s}".format(csv_link), creator="USGS", creator_url="http://waterdata.usgs.gov", creator_institution="USGS", creator_urn="gov.usgs", publisher="Axiom Data Science", publisher_uri="http://axiomdatascience.com", processing_level="None", acknowledgement="None", geospatial_bounds=loc, geospatial_lat_min=lat, geospatial_lat_max=lat, geospatial_lon_min=lon, geospatial_lon_max=lon, license="Freely Distributed", date_created=datetime.utcnow().replace(second=0, microsecond=0).isoformat() ) full_station_urn = "urn:ioos:station:{!s}:{!s}".format(global_attributes["naming_authority"], site_id) min_time = df["time"].min() max_time = df["time"].max() if output_format == 'cf16': times = [ calendar.timegm(x.timetuple()) for x in df['time'] ] verticals = df['depth'].values output_filename = '{}_{}-{}.nc'.format(site_id, min_time.strftime('%Y%m%dT%H%M%S'), max_time.strftime('%Y%m%dT%H%M%S')) ts = TimeSeries(output, latitude=lat, longitude=lon, station_name=full_station_urn, global_attributes=global_attributes, output_filename=output_filename, times=times, verticals=verticals) for var in df.columns: if var in ['date_time_GMT', 'time', 'depth']: continue try: int(var[0]) variable_name = 'v_{}'.format(var) except ValueError: variable_name = var try: var_meta = variable_map[var] except KeyError: logger.error("Variable {!s} was not found in variable map!".format(var)) continue # Convert to floats df[var] = df[var].map(to_floats) if var_meta["units"] in ["feet", "ft"]: df[var] = [ v * 0.3048 if v != fillvalue else v for v in df[var] ] var_meta["units"] = "meters" if output_format == 'axiom': full_sensor_urn = "urn:ioos:sensor:{!s}:{!s}:{!s}".format(global_attributes["naming_authority"], site_id, var_meta["standard_name"]) output_directory = os.path.join(output, full_sensor_urn) output_filename = '{}_{}-{}.nc'.format(var, min_time.strftime('%Y%m%dT%H%M%S'), max_time.strftime('%Y%m%dT%H%M%S')) ts = TimeSeries.from_dataframe(df, output_directory, output_filename, lat, lon, full_station_urn, global_attributes, var_meta["standard_name"], var_meta, sensor_vertical_datum=sensor_vertical_datum, fillvalue=fillvalue, data_column=var) ts.add_instrument_metadata(urn=full_sensor_urn) ts.close() elif output_format == 'cf16': ts.add_variable(variable_name, values=df[var].values, attributes=var_meta, fillvalue=fillvalue, sensor_vertical_datum=sensor_vertical_datum) if output_format == 'cf16': ts.close()
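# One caveat in the comment parsing used by parse_type_2: on Python 3, filter()
# returns an iterator, so expressions like filter(None, ...)[-1] raise TypeError.
# A version-safe sketch of the same "last token" extraction:
def last_token(comment_line):
    return [tok.strip() for tok in comment_line.split(" ") if tok.strip()][-1]

# last_token("# Sensor location latitude 40.942755") -> '40.942755'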
class TestTimeseriesTimeBounds(unittest.TestCase): def setUp(self): self.output_directory = os.path.join(os.path.dirname(__file__), "output") self.latitude = 34 self.longitude = -72 self.station_name = "PytoolsTestStation" self.global_attributes = dict(id='this.is.the.id') self.filename = 'test_timeseries_bounds.nc' self.times = [0, 1000, 2000, 3000, 4000, 5000] verticals = [0] self.ts = TimeSeries(output_directory=self.output_directory, latitude=self.latitude, longitude=self.longitude, station_name=self.station_name, global_attributes=self.global_attributes, output_filename=self.filename, times=self.times, verticals=verticals) self.values = [20, 21, 22, 23, 24, 25] attrs = dict(standard_name='sea_water_temperature') self.ts.add_variable('temperature', values=self.values, attributes=attrs) def tearDown(self): os.remove(os.path.join(self.output_directory, self.filename)) def test_time_bounds_start(self): delta = timedelta(seconds=1000) self.ts.add_time_bounds(delta=delta, position='start') nc = netCDF4.Dataset(os.path.join(self.output_directory, self.filename)) assert nc.variables.get('time_bounds').shape == (len(self.times), 2,) assert (nc.variables.get('time_bounds')[:] == np.asarray([ [0, 1000], [1000, 2000], [2000, 3000], [3000, 4000], [4000, 5000], [5000, 6000] ])).all() nc.close() def test_time_bounds_middle(self): delta = timedelta(seconds=1000) self.ts.add_time_bounds(delta=delta, position='middle') nc = netCDF4.Dataset(os.path.join(self.output_directory, self.filename)) assert nc.variables.get('time_bounds').shape == (len(self.times), 2,) assert (nc.variables.get('time_bounds')[:] == np.asarray([ [ -500, 500], [ 500, 1500], [ 1500, 2500], [ 2500, 3500], [ 3500, 4500], [ 4500, 5500] ])).all() nc.close() def test_time_bounds_end(self): delta = timedelta(seconds=1000) self.ts.add_time_bounds(delta=delta, position='end') nc = netCDF4.Dataset(os.path.join(self.output_directory, self.filename)) assert nc.variables.get('time_bounds').shape == (len(self.times), 2,) assert (nc.variables.get('time_bounds')[:] == np.asarray([ [-1000, 0], [ 0, 1000], [ 1000, 2000], [ 2000, 3000], [ 3000, 4000], [ 4000, 5000] ])).all() nc.close()
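# The bounds arithmetic these tests exercise, in one place (a sketch assuming
# integer-second times like the fixtures above): for a delta d, 'start' yields
# [t, t+d], 'middle' yields [t-d/2, t+d/2], and 'end' yields [t-d, t].
def expected_time_bounds(times, d, position):
    offsets = {'start': (0, d), 'middle': (-d // 2, d // 2), 'end': (-d, 0)}
    lo, hi = offsets[position]
    return [[t + lo, t + hi] for t in times]

# expected_time_bounds([0, 1000], 1000, 'middle') -> [[-500, 500], [500, 1500]]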
return re.sub(r'[^_a-zA-Z0-9]', "_", name) return name # <codecell> import os out_file = os.path.join(output_dir, output_file) if os.path.isfile(out_file): os.remove(out_file) from pyaxiom.netcdf.sensors import TimeSeries ts = TimeSeries(output_dir, latitude=0.39, longitude=36.7, station_name='urn:ioos:station:edu.princeton.ecohydrolab:MainTower', global_attributes={}, times=pd_to_secs(df), verticals=[10], output_filename=output_file) # <codecell> for c in df.columns[::-1]: # Add units based on column name? var_attributes = dict() ts.add_variable(cf_safe_name(c), df[c].values, attributes=var_attributes, fillvalue=-9999.9) # <codecell>
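# The fragment above begins mid-function; a plausible reconstruction of that
# CF-safe name helper (an assumption based on the two visible return statements,
# not the original source) looks like:
import re

def cf_safe_name(name):
    if isinstance(name, str):
        if re.match('^[0-9_]', name):
            name = 'v_{}'.format(name)  # names may not begin with a digit or underscore
        return re.sub(r'[^_a-zA-Z0-9]', '_', name)
    return name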
def main(output_format, output, do_download, download_folder, filesubset=None): if do_download is True: try: os.makedirs(download_folder) except OSError: pass waf = 'http://ga.water.usgs.gov/flood/hurricane/sandy/datafiles/' r = requests.get(waf) soup = BeautifulSoup(r.text, "lxml") for link in soup.find_all('a'): # Skip non .txt files site_id, ext = os.path.splitext(link['href']) if ext != ".txt": continue if filesubset and site_id.lower() not in filesubset: # Skip this file! continue csv_link = waf + link['href'] logger.info("Downloading '{}'".format(csv_link)) d = requests.get(csv_link) try: d.raise_for_status() except requests.exceptions.HTTPError: logger.error( "Could not download: {!s}, skipping. Status code: {!s}". format(csv_link, d.status_code)) continue with open( os.path.join(download_folder, os.path.basename(csv_link)), 'wt') as f: f.write(d.text) # Yes, this uses lots of RAM, but we need to match up lon/lat positions later on. results = [] for datafile in os.listdir(download_folder): site_id = os.path.splitext(os.path.basename(datafile))[0] if filesubset and site_id.lower() not in filesubset: # Skip this file! continue with open(os.path.join(download_folder, datafile)) as d: contents = d.read() r = None for line in contents.split("\n"): if "agency_cd" in line: r = parse_type_1(output_format, site_id, contents, output) break elif "date_time_GMT" in line: r = parse_type_2(output_format, site_id, contents, output) break else: continue if r is None: logger.error('Could not process file: {}'.format(datafile)) else: logger.info("Processed {}".format(datafile)) results.append(r) results = sorted(results, key=attrgetter('lon', 'lat')) gresults = groupby(results, attrgetter('lon', 'lat')) for (glon, glat), group in gresults: groups = [x for x in list(group) if x] # Strip off the variable type if need be gsite = groups[0].site if gsite[-2:] in ['WV', 'BP', 'WL']: gsite = gsite[:-2] for result in groups: gas = get_globals(glat, glon, result.z, result.name, gsite) station_urn = IoosUrn(asset_type='station', authority=gas['naming_authority'], label=gsite) if output_format == 'cf16': # If CF, a file for each result dataframe times = [ calendar.timegm(x.timetuple()) for x in result.df['time'] ] verticals = result.df['depth'].values output_filename = '{}.nc'.format(result.site) ts = TimeSeries(output, latitude=glat, longitude=glon, station_name=gsite, global_attributes=gas, output_filename=output_filename, times=times, verticals=verticals) for var in result.df.columns: if var in [ 'date_time_GMT', 'datetime', 'time', 'depth', 'tz_cd', 'site_no', 'agency_cd' ]: continue try: var_meta = copy(variable_map[var]) except KeyError: logger.error( "Variable {!s} was not found in variable map!".format( var)) continue # Convert to floats result.df[var] = result.df[var].map(to_floats) if var_meta["units"].lower() in ["feet", "ft"]: result.df[var] = result.df[var].apply( lambda x: None if pd.isnull(x) else x * 0.3048) var_meta["units"] = "meters" elif var_meta["units"].lower() in ["psi"]: result.df[var] = result.df[var].apply( lambda x: None if pd.isnull(x) else x * 68.9476) var_meta["units"] = "mbar" elif var_meta["units"].lower() in ['millimeters of mercury']: result.df[var] = result.df[var].apply( lambda x: None if pd.isnull(x) else x * 1.33322) var_meta["units"] = "mbar" # Now put the fillvalue we want to be interpreted result.df.fillna(fillvalue, inplace=True) if output_format == 'axiom': # If Axiom, a file for each variable output_directory = os.path.join(output, gsite) output_filename = 
'{}_{}.nc'.format( result.site, var_meta['standard_name']) ts = TimeSeries.from_dataframe( result.df, output_directory, output_filename, glat, glon, station_urn.urn, gas, var_meta["standard_name"], var_meta, sensor_vertical_datum='NAVD88', fillvalue=fillvalue, data_column=var, vertical_axis_name='height') sensor_urn = urnify(station_urn.authority, station_urn.label, var_meta) ts.add_instrument_metadata(urn=sensor_urn) elif output_format == 'cf16': # If CF, add variable to existing TimeSeries try: int(var[0]) variable_name = 'v_{}'.format(var) except BaseException: variable_name = var ts.add_variable(variable_name, values=result.df[var].values, attributes=var_meta, fillvalue=fillvalue, sensor_vertical_datum='NAVD88')
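# The unit normalizations applied in the loop above, collected into one table
# (a sketch; the conversion factors are the ones the script uses): feet to
# meters, psi to mbar, and millimeters of mercury to mbar.
UNIT_CONVERSIONS = {
    'feet': ('meters', 0.3048),
    'ft': ('meters', 0.3048),
    'psi': ('mbar', 68.9476),
    'millimeters of mercury': ('mbar', 1.33322),
}

def convert_units(value, units):
    new_units, factor = UNIT_CONVERSIONS.get(units.lower(), (units, 1.0))
    return value * factor, new_units

# convert_units(14.5145, 'psi') -> (~1000.7, 'mbar')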
def main(output, download_folder, do_download, projects, csv_metadata_file): project_metadata = dict() with open(csv_metadata_file, 'r') as f: reader = csv.DictReader(f) for row in reader: project_name = row['project_name'] if projects and project_name.lower() not in projects: # Skip projects if a subset was defined continue project_metadata[project_name] = dict() for k, v in row.items(): project_metadata[project_name][k] = v if do_download: try: downloaded_files = download(download_folder, project_metadata) except KeyboardInterrupt: downloaded_files = [] else: downloaded_files = glob(os.path.join(download_folder, "*")) temp_folder = os.path.abspath(os.path.join(".", "temp")) shutil.rmtree(temp_folder, ignore_errors=True) try: os.makedirs(temp_folder) except OSError: pass # Exists for down_file in downloaded_files: # For debugging #if os.path.basename(down_file) != "8451met-a.nc": # continue nc = None try: temp_file = os.path.join(temp_folder, os.path.basename(down_file)) shutil.copy(down_file, temp_file) if projects: tmpnc = netCDF4.Dataset(temp_file) project_name, _ = tmpnc.id.split("/") nc_close(tmpnc) if project_name.lower() not in projects: # Skip this project! continue # Cleanup to CF-1.6 normalize_time(temp_file) normalize_epic_codes(temp_file) normalize_vectors(temp_file) normalize_units(temp_file) # Create list of variables that we want to save. station_id = None latitude = None longitude = None nc = netCDF4.Dataset(temp_file) # Default station_id project_name, _ = nc.id.split("/") # Now try to come up with a better one. if hasattr(nc, 'MOORING') and hasattr(nc, 'id'): mooring_id = str(nc.MOORING).replace(':', '').strip() station_id = "{0}_{1}".format(project_name, mooring_id[0:3]).lower() else: try: # Mooring ID is the first three numbers of the file station_id = int(os.path.basename(down_file)[0:3]) station_id = "{0}_mooring_{1}".format(project_name, station_id) except BaseException: logger.error("Could not create a suitable station_id. 
Skipping {0}.".format(down_file)) continue try: latitude = nc.variables.get("lat")[0] longitude = nc.variables.get("lon")[0] except IndexError: latitude = nc.variables.get("lat")[:] longitude = nc.variables.get("lon")[:] file_name = os.path.basename(down_file) output_directory = os.path.join(output, project_name) logger.info("Translating {0} into CF1.6 format: {1}".format(down_file, os.path.abspath(os.path.join(output_directory, file_name)))) if not os.path.isdir(output_directory): os.makedirs(output_directory) file_global_attributes = { k : getattr(nc, k) for k in nc.ncattrs() } file_global_attributes.update(global_attributes) file_global_attributes['id'] = station_id file_global_attributes['title'] = '{0} - {1}'.format(project_name, os.path.basename(down_file)) if project_name in project_metadata: for k, v in project_metadata[project_name].items(): if v and k.lower() not in ['id', 'title', 'catalog_xml', 'project_name']: file_global_attributes[k] = v times = nc.variables.get('time')[:] feature_name, _ = os.path.splitext(os.path.basename(down_file)) # Get all depth values depth_variables = [] for dv in nc.variables: depth_variables += [ x for x in nc.variables.get(dv).dimensions if 'depth' in x ] depth_variables = sorted(list(set(depth_variables))) depth_values = np.asarray([ nc.variables.get(x)[:] for x in depth_variables ]).flatten() ts = TimeSeries(output_directory, latitude, longitude, feature_name, file_global_attributes, times=times, verticals=depth_values, output_filename=file_name) v = [] for other in sorted(nc.variables): # Sorted for a reason... don't change! if other in coord_vars: continue old_var = nc.variables.get(other) variable_attributes = { k : getattr(old_var, k) for k in old_var.ncattrs() } fillvalue = None if hasattr(old_var, "_FillValue"): fillvalue = old_var._FillValue # Figure out if this is a variable that is repeated at different depths # as different variable names. Assumes sorted. 
new_var_name = other.split('_')[0] if new_var_name in ts.ncd.variables: # Already in new file (processed when the first was encountered in the loop below) continue # Get the depth index depth_variable = [ x for x in old_var.dimensions if 'depth' in x ] if depth_variable and len(old_var.dimensions) > 1 and 'time' in old_var.dimensions: depth_index = np.squeeze(np.where(depth_values == nc.variables.get(depth_variable[0])[:])) # Find other variable names like this one depth_indexes = [(other, depth_index)] for search_var in sorted(nc.variables): # If they have different depth dimension names we need to combine them into one variable if search_var != other and search_var.split('_')[0] == new_var_name and \ depth_variable[0] != [ x for x in nc.variables[search_var].dimensions if 'depth' in x ][0]: # Found a match at a different depth search_depth_variable = [ x for x in nc.variables.get(search_var).dimensions if 'depth' in x ] depth_index = np.squeeze(np.where(depth_values == nc.variables.get(search_depth_variable[0])[:])) depth_indexes.append((search_var, depth_index)) logger.info("Combining '{}' with '{}' as '{}' (different variables at different depths but are the same parameter)".format(search_var, other, new_var_name)) values = np.ma.empty((times.size, len(depth_values))) values.fill_value = fillvalue values.mask = True for nm, index in depth_indexes: values[:, index] = np.squeeze(nc.variables.get(nm)[:]) # If we just have one index we want to use the original name if len(depth_indexes) == 1: # Just use the original variable name new_var_name = other # Create this one, should be the first we encounter for this type ts.add_variable(new_var_name, values=values, times=times, fillvalue=fillvalue, attributes=variable_attributes) elif depth_variable and 'time' not in old_var.dimensions: # elif (depth_variable and len(old_var.dimensions) == 1 and 'depth' == old_var.dimensions[0]) or \ # Metadata variable like bin distance meta_var = ts.ncd.createVariable(other, old_var.dtype, ('z',), fill_value=fillvalue) for k, v in variable_attributes.iteritems(): if k != '_FillValue': setattr(meta_var, k, v) meta_var[:] = old_var[:] else: values = old_var[:] if len(old_var.dimensions) == 1 and old_var.dimensions[0] == 'time': # Metadata variables like pitch, roll, record count, etc. ts.add_variable(other, values=values, times=times, unlink_from_profile=True, fillvalue=fillvalue, attributes=variable_attributes) elif depth_values.size > 1: # No Z variables in a profile dataset, aka Bottom Temperature ts.add_variable(other, values=values, times=times, verticals=[old_var.sensor_depth], unlink_from_profile=True, fillvalue=fillvalue, attributes=variable_attributes) else: ts.add_variable(other, values=values, times=times, fillvalue=fillvalue, attributes=variable_attributes) ts.ncd.sync() ts.ncd.close() except BaseException: logger.exception("Error. Skipping {0}.".format(down_file)) continue finally: nc_close(nc) os.remove(temp_file) shutil.rmtree(temp_folder, ignore_errors=True)
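# Sketch of the z-axis metadata copy above (copy_z_variable is a hypothetical
# helper): variables with a depth dimension but no time dimension, such as bin
# distance, are copied onto the output file's 'z' axis instead of being added
# as time series. Uses .items() since dict.iteritems() is Python 2 only.
def copy_z_variable(ncd, name, old_var, attrs, fillvalue=None):
    meta_var = ncd.createVariable(name, old_var.dtype, ('z',), fill_value=fillvalue)
    for k, v in attrs.items():
        if k != '_FillValue':  # the fill value is set at creation time
            setattr(meta_var, k, v)
    meta_var[:] = old_var[:]
    return meta_var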
def parse_type_2(output_format, site_id, contents, output, csv_link): """ # These data are provisional and subject to revision. # Data processed as of 12/05/2012 11:54:29. # Data collected as part of Hurricane Sandy (2012) Storm Tide project. # Data are archived at http://water.usgs.gov/floods/events/2012/isaac/index.php # Elevation determined from GPS surveys (NAVD 88). # Time datum is GMT (Greenwich Mean Time). # Water density estimated on basis of sensor location # where saltwater = 63.989 lb/ft3 (Saltwater = dissolved solids concentration greater than 20000 milligrams per liter) # where brackish water = 63.052 lb/ft3 (Brackish water = dissolved solids concentration between 1000 and 20000 milligrams per liter) # where freshwater = 62.428 lb/ft3 (Freshwater = dissolved solids concentration less than 1000 milligrams per liter) # The equation used to compute elevation from recorded pressure is # (((sp-bp)*144)/d)+e # Where sp = surge pressure in psi; bp = barometric pressure in psi; # d = water density in lb/ft3; and e = elevation of sensor in ft above NAVD 88. # Barometric data from nearest pressure sensor. Location for the barometric sensor is listed below. # Elevation is computer-rounded to two decimal places. # Sensor information # Site id = SSS-NY-WES-001WL # Site type = water level # Horizontal datum used is NAD 83 # Sensor location latitude 40.942755 # Sensor location longitude -73.719828 # Sensor elevation above NAVD 88 = -3.97 ft # Lowest recordable water elevation is -3.90 ft # Water density value used = 63.989 lb/ft3 # Barometric sensor site (source of bp) = SSS-NY-WES-002BP # Barometric sensor location latitude 40.90754368 # Barometric sensor location longitude -73.8692184 date_time_GMT elevation nearest_barometric_sensor_psi 10-28-2012 06:00:00 0.88 14.5145 10-28-2012 06:00:30 0.86 14.5145 10-28-2012 06:01:00 0.85 14.5170 10-28-2012 06:01:30 0.85 14.5145 10-28-2012 06:02:00 0.84 14.5170 10-28-2012 06:02:30 0.81 14.5145 10-28-2012 06:03:00 0.76 14.5145 ... """ variable_map = { 'elevation': { 'long_name': 'Water Level Elevation above Reference Datum (NAVD88)', 'geoid_name': 'NAVD88', 'vertical_datum': 'NAVD88', 'water_surface_reference_datum': 'NAVD88', 'standard_name': 'water_surface_height_above_reference_datum', 'units': 'feet' }, } def to_floats(x): try: return float(x) except ValueError: return fillvalue comments, headers, data = split_file(contents, "date_time_GMT") df = pd.DataFrame(data, columns=headers) fillvalue = -9999.9 lat = None lon = None z = 0 name = site_id sensor_vertical_datum = "NAVD88" for c in comments: if "Sensor location latitude" in c: lat = float( list(filter(None, map(lambda x: x.strip(), c.split(" "))))[-1]) elif "Sensor location longitude" in c: lon = float( list(filter(None, map(lambda x: x.strip(), c.split(" "))))[-1]) elif "Site id" in c: site_id = list(filter(None, map(lambda x: x.strip(), c.split(" "))))[-1] name = site_id elif "Sensor elevation" in c: sensor_vertical_datum = "".join(c.split("=")[0].split(" ")[4:6]) l = list(filter(None, map(lambda x: x.strip(), c.split(" ")))) z = float(l[-2]) if l[-1] in ["feet", "ft"]: z *= 0.3048 loc = "POINT({!s} {!s} {!s})".format(lon, lat, z) df['time'] = df["date_time_GMT"].map(lambda x: parse(x + " UTC")) df['depth'] = [z for x in range(len(df['time']))] # Add global attributes to appear in the resulting NetCDF file global_attributes = dict( title=name, summary= 'USGS Hurricane Sandy Rapid Response Stations. 
Data acquired from http://ga.water.usgs.gov/flood/hurricane/sandy/datafiles/.', keywords="usgs, waterdata, elevation, water, waterlevel, sandy, hurricane, rapid, response, %s" % site_id, keywords_vocabulary="None", naming_authority='gov.usgs', id=site_id, cdm_data_type="Station", history="NetCDF file generated from {!s}".format(csv_link), creator="USGS", creator_url="http://waterdata.usgs.gov", creator_institution="USGS", creator_urn="gov.usgs", publisher="Axiom Data Science", publisher_uri="http://axiomdatascience.com", processing_level="None", acknowledgement="None", geospatial_bounds=loc, geospatial_lat_min=lat, geospatial_lat_max=lat, geospatial_lon_min=lon, geospatial_lon_max=lon, license="Freely Distributed", date_created=datetime.utcnow().replace(second=0, microsecond=0).isoformat()) full_station_urn = "urn:ioos:station:{!s}:{!s}".format( global_attributes["naming_authority"], site_id) min_time = df["time"].min() max_time = df["time"].max() if output_format == 'cf16': times = [calendar.timegm(x.timetuple()) for x in df['time']] verticals = df['depth'].values output_filename = '{}_{}-{}.nc'.format( site_id, min_time.strftime('%Y%m%dT%H%M%S'), max_time.strftime('%Y%m%dT%H%M%S')) ts = TimeSeries(output, latitude=lat, longitude=lon, station_name=full_station_urn, global_attributes=global_attributes, output_filename=output_filename, times=times, verticals=verticals) for var in df.columns: if var in ['date_time_GMT', 'time', 'depth']: continue try: int(var[0]) variable_name = 'v_{}'.format(var) except ValueError: variable_name = var try: var_meta = variable_map[var] except KeyError: logger.error( "Variable {!s} was not found in variable map!".format(var)) continue # Convert to floats df[var] = df[var].map(to_floats) if var_meta["units"] in ["feet", "ft"]: df[var] = [v * 0.3048 if v != fillvalue else v for v in df[var]] var_meta["units"] = "meters" if output_format == 'axiom': full_sensor_urn = "urn:ioos:sensor:{!s}:{!s}:{!s}".format( global_attributes["naming_authority"], site_id, var_meta["standard_name"]) output_directory = os.path.join(output, full_sensor_urn) output_filename = '{}_{}-{}.nc'.format( var, min_time.strftime('%Y%m%dT%H%M%S'), max_time.strftime('%Y%m%dT%H%M%S')) ts = TimeSeries.from_dataframe( df, output_directory, output_filename, lat, lon, full_station_urn, global_attributes, var_meta["standard_name"], var_meta, sensor_vertical_datum=sensor_vertical_datum, fillvalue=fillvalue, data_column=var) ts.add_instrument_metadata(urn=full_sensor_urn) elif output_format == 'cf16': ts.add_variable(variable_name, values=df[var].values, attributes=var_meta, fillvalue=fillvalue, sensor_vertical_datum=sensor_vertical_datum)
def main(output, download_folder, do_download, projects, csv_metadata_file, filesubset=None, since=None): project_metadata = dict() with open(csv_metadata_file, 'r') as f: reader = csv.DictReader(f) for row in reader: project_name = row['project_name'] if isinstance(project_name, str) and project_name[0] == '#': continue if projects and project_name.lower() not in projects: # Skip projects if a subset was defined continue project_metadata[project_name] = dict() for k, v in row.items(): project_metadata[project_name][k] = v if do_download: try: downloaded_files = download(download_folder, project_metadata, filesubset, since) except KeyboardInterrupt: logger.exception('Error downloading datasets from THREDDS') downloaded_files = [] else: downloaded_files = glob(os.path.join(download_folder, '**', '*')) if since is not None: def should_keep(d): modt = datetime.utcfromtimestamp( os.path.getmtime(d)).replace(tzinfo=pytz.utc) return modt >= since downloaded_files = [ dl for dl in downloaded_files if should_keep(dl) ] for down_file in sorted(downloaded_files): temp_fd, temp_file = tempfile.mkstemp(prefix='cmg_collector', suffix='nc') try: if filesubset is not None: if os.path.basename(down_file).lower() not in filesubset: # aka "9631ecp-a.nc" # Skip this file! continue project_name = os.path.basename(os.path.dirname(down_file)) if projects: if project_name.lower() not in projects: # Skip this project! continue shutil.copy(down_file, temp_file) # Cleanup to CF-1.6 try: first_time = normalize_time(temp_file) except (TypeError, ValueError, IndexError): logger.exception( "Could not normalize the time variable. Skipping {0}.". format(down_file)) continue except OverflowError: logger.error( "Dates out of range. Skipping {0}.".format(down_file)) continue normalize_epic_codes(temp_file, down_file) normalize_vectors(temp_file) normalize_units(temp_file) # Create list of variables that we want to save. mooring_id = None latitude = None longitude = None fname = os.path.basename(down_file) feature_name, file_ext = os.path.splitext( os.path.basename(down_file)) try: mooring_id = int(9999) except ValueError: logger.exception( "Could not create a suitable station_id. Skipping {0}.". format(down_file)) continue file_name = os.path.basename(down_file) output_directory = os.path.join(output, project_name) logger.info("Translating {0} into CF1.6 format: {1}".format( down_file, os.path.abspath(os.path.join(output_directory, file_name)))) with EnhancedDataset(temp_file) as nc: try: latitude = nc.variables.get("lat")[0] longitude = nc.variables.get("lon")[0] except IndexError: latitude = nc.variables.get("lat")[:] longitude = nc.variables.get("lon")[:] except TypeError: logger.error( "Could not find lat/lon variables. Skipping {0}.". 
format(down_file)) file_global_attributes = { k: getattr(nc, k) for k in nc.ncattrs() } file_global_attributes.update(global_attributes) file_global_attributes['id'] = feature_name file_global_attributes['MOORING'] = mooring_id file_global_attributes['original_filename'] = fname file_global_attributes['original_folder'] = project_name no_override = [ 'id', 'MOORING', 'original_filename', 'original_folder', 'catalog_xml', 'project_name' ] if project_name in project_metadata: for k, v in project_metadata[project_name].items(): if v and k.lower() not in no_override: file_global_attributes[k] = v if 'summary' in file_global_attributes: # Save the original summary file_global_attributes[ 'WHOI_Buoy_Group_summary'] = file_global_attributes[ 'summary'] # Better title/summary for discovery via catalogs project_title = file_global_attributes.get( 'project_title', project_name).strip() project_summary = file_global_attributes.get( 'project_summary', '').strip() file_global_attributes[ 'title'] = 'USGS-CMG time-series data: {0} - {1} - {2}'.format( project_name, mooring_id, feature_name) file_global_attributes[ 'summary'] = 'USGS-CMG time-series data from the {} project, mooring {} and package {}. {}'.format( project_title, mooring_id, feature_name, project_summary).strip() times = nc.variables.get('time')[:] # Get all depth values depth_variables = [] for dv in nc.variables: depth_variables += [ x for x in nc.variables.get(dv).dimensions if 'depth' in x ] depth_variables = sorted(list(set(depth_variables))) try: assert depth_variables depth_values = np.asarray([ nc.variables.get(x)[:] for x in depth_variables ]).flatten() except (AssertionError, TypeError): logger.warning( "No depth variables found in {}, skipping.".format( down_file)) continue # Convert everything to positive up, unless it is specifically specified as "up" already depth_conversion = -1.0 if depth_variables: pull_positive = nc.variables.get(depth_variables[0]) if hasattr(pull_positive, 'positive' ) and pull_positive.positive.lower() == 'up': depth_conversion = 1.0 depth_values = depth_values * depth_conversion if not os.path.isdir(output_directory): os.makedirs(output_directory) ts = TimeSeries(output_directory, latitude, longitude, feature_name, file_global_attributes, times=times, verticals=depth_values, output_filename=file_name, vertical_positive='up') # Set the platform type from the global attribute 'platform_type', defaulting to 'fixed' with EnhancedDataset(ts.out_file, 'a') as onc: platform_type = getattr(onc, 'platform_type', 'fixed').lower() onc.variables['platform'].setncattr('type', platform_type) onc.variables['platform'].setncattr( 'nodc_name', "FIXED PLATFORM, MOORINGS") # Add ERDDAP variables onc.cdm_data_type = "TimeSeries" onc.cdm_timeseries_variables = "latitude,longitude,z,feature_type_instance" v = [] depth_files = [] for other in sorted( nc.variables): # Sorted for a reason... don't change! 
try: if other in coord_vars: continue ovsd = None # old var sensor depth old_var = nc.variables.get(other) variable_attributes = { k: getattr(old_var, k) for k in old_var.ncattrs() } # Remove/rename some attributes # https://github.com/USGS-CMG/usgs-cmg-portal/issues/67 if 'valid_range' in variable_attributes: del variable_attributes['valid_range'] if 'minimum' in variable_attributes: variable_attributes[ 'actual_min'] = variable_attributes['minimum'] del variable_attributes['minimum'] if 'maximum' in variable_attributes: variable_attributes[ 'actual_max'] = variable_attributes['maximum'] del variable_attributes['maximum'] if 'sensor_depth' in variable_attributes: # sensor_depth is ALWAYS positive "down", so don't convert! # This is contrary to the "positive" attribute on the Z axis. # variable_attributes['sensor_depth'] = variable_attributes['sensor_depth'] * -1 # Round the sensor_depth attribute variable_attributes['sensor_depth'] = np.around( variable_attributes['sensor_depth'], decimals=4) ovsd = np.around(old_var.sensor_depth * depth_conversion, decimals=4) fillvalue = None if hasattr(old_var, "_FillValue"): fillvalue = old_var._FillValue # Figure out if this is a variable that is repeated at different depths # as different variable names. Assumes sorted. new_var_name = other.split('_')[0] if new_var_name in ts.ncd.variables: # Already in new file (processed when the first was encountered in the loop below) continue # Get the depth index depth_variable = [ x for x in old_var.dimensions if 'depth' in x ] if depth_variable and len( old_var.dimensions ) > 1 and 'time' in old_var.dimensions: depth_index = np.squeeze( np.where(depth_values == ( nc.variables.get(depth_variable[0])[:] * depth_conversion))) # Find other variable names like this one depth_indexes = [(other, depth_index)] for search_var in sorted(nc.variables): # If they have different depth dimension names we need to combine them into one variable if search_var != other and search_var.split('_')[0] == new_var_name and \ depth_variable[0] != [ x for x in nc.variables[search_var].dimensions if 'depth' in x ][0]: # Found a match at a different depth search_depth_variable = [ x for x in nc.variables.get( search_var).dimensions if 'depth' in x ] depth_index = np.squeeze( np.where(depth_values == ( nc.variables.get( search_depth_variable[0])[:] * depth_conversion))) depth_indexes.append( (search_var, depth_index)) logger.info( "Combining '{}' with '{}' as '{}' (different variables at different depths but are the same parameter)" .format(search_var, other, new_var_name)) values = np.ma.empty( (times.size, len(depth_values)), dtype=old_var.dtype) values.fill_value = fillvalue values.mask = True inconsistent = False for nm, index in depth_indexes: try: values[:, index] = np.squeeze( nc.variables.get(nm)[:]) except ValueError: inconsistent = True break # If we just have one index we want to use the original name if len(depth_indexes) == 1: # Just use the original variable name new_var_name = other if inconsistent is True: # Incorrect array size, most likely a strange variable ts.add_variable_object( old_var, dimension_map=dict(depth='z'), reduce_dims=True) else: # Create this one, should be the first we encounter for this type ts.add_variable(new_var_name, values=values, times=times, fillvalue=fillvalue, attributes=variable_attributes) elif len(old_var.dimensions ) == 1 and old_var.dimensions[0] == 'time': # A single time dimensioned variable, like pitch, roll, record count, etc. 
                        ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, fillvalue=fillvalue, attributes=variable_attributes)

                    elif old_var.ndim <= 3 and ovsd and \
                            ((depth_values.size == 1 and not depth_variable and 'time' in old_var.dimensions) or
                             (depth_values.size > 1 and not depth_variable and 'time' in old_var.dimensions and 'sensor_depth' in ts.ncd.variables)):

                        if 'sensor_depth' in ts.ncd.variables and np.isclose(ts.ncd.variables['sensor_depth'][:], ovsd):
                            ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes)
                        else:
                            # Search through secondary files that have been created for detached variables
                            # at a certain depth and try to match this variable with one of the depths.
                            found_df = False
                            for dfts in depth_files:
                                if isinstance(ovsd, np.ndarray):
                                    # Well, this is a bad file.
                                    raise ValueError("The sensor_depth attribute has more than one value, please fix the source NetCDF: {}".format(down_file))
                                if np.isclose(dfts.ncd.variables[ts.vertical_axis_name][:], ovsd):
                                    dfts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes)
                                    found_df = True
                                    break

                            # If we couldn't match the current or one of the existing secondary depth files, create a new one.
                            if found_df is False:
                                new_file_name = file_name.replace(file_ext, '_z{}{}'.format(len(depth_files) + 1, file_ext))
                                fga = copy(file_global_attributes)
                                fga['id'] = os.path.splitext(new_file_name)[0]
                                new_ts = TimeSeries(output_directory, latitude, longitude, feature_name, fga, times=times, verticals=[ovsd], output_filename=new_file_name, vertical_positive='up')
                                new_ts.add_variable(other, values=old_var[:], times=times, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes)
                                depth_files.append(new_ts)

                    # Skip things with a dimension over 3 (some beam variables like `brange`)
                    elif old_var.ndim <= 3 and (depth_values.size > 1 and not depth_variable and 'time' in old_var.dimensions):
                        if ovsd:
                            # An ADCP or profiling dataset, but this variable is measured at a single depth.
                            # Example: Bottom Temperature on an ADCP
                            ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, verticals=[ovsd], fillvalue=fillvalue, attributes=variable_attributes)
                        else:
                            ts.add_variable(other, values=old_var[:], times=times, unlink_from_profile=True, fillvalue=fillvalue, attributes=variable_attributes)

                    else:
                        if 'time' in old_var.dimensions and old_var.ndim <= 3:
                            ts.add_variable(other, values=old_var[:], times=times, fillvalue=fillvalue, attributes=variable_attributes)
                        else:
                            ts.add_variable_object(old_var, dimension_map=dict(depth='z'), reduce_dims=True)

                except BaseException:
                    logger.exception("Error processing variable {0} in {1}. Skipping it.".format(other, down_file))

        except KeyboardInterrupt:
            logger.info("Breaking out of Translate loop!")
            break
        except BaseException:
            logger.exception("Error. Skipping {0}.".format(down_file))
            continue
        finally:
            try:
                for df in depth_files:
                    del df
            except NameError:
                pass
            try:
                del ts
            except NameError:
                pass
            os.close(temp_fd)
            if os.path.isfile(temp_file):
                os.remove(temp_file)
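

# --- Illustrative sketch, not part of the original scripts ---
# The translate loop above folds variables like 'T_1' and 'T_2' (the same
# parameter measured at different depths, each carrying its own depth
# dimension) into one (time, z) masked array, leaving unmeasured cells masked.
# The helper below is a hypothetical, minimal version of that idea using
# plain numpy arrays instead of netCDF variables.
def _combine_depth_series_sketch(n_times, depth_values, per_depth_series):
    """per_depth_series maps a variable name to (z_index, series of length n_times)."""
    combined = np.ma.masked_all((n_times, len(depth_values)))
    for _name, (z_index, series) in per_depth_series.items():
        combined[:, z_index] = series  # cells never assigned stay masked
    return combined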
def parse_type_1(output_format, site_id, contents, output, csv_link):
    """
    # ---------------------------------- WARNING ----------------------------------------
    # The data you have obtained from this automated U.S. Geological Survey database
    # have not received Director's approval and as such are provisional and subject to
    # revision. The data are released on the condition that neither the USGS nor the
    # United States Government may be held liable for any damages resulting from its use.
    # Additional info: http://waterdata.usgs.gov/ga/nwis/help/?provisional
    #
    # File-format description:  http://waterdata.usgs.gov/nwis/?tab_delimited_format_info
    # Automated-retrieval info: http://waterdata.usgs.gov/nwis/?automated_retrieval_info
    #
    # Contact:   [email protected]
    # retrieved: 2012-11-20 12:05:22 EST       (caww01)
    #
    # Data for the following 1 site(s) are contained in this file
    #    USGS 395740074482628 South Branch Rancocas Cr at S Main St nr Lumberton
    # -----------------------------------------------------------------------------------
    #
    # Data provided for site 395740074482628
    #    DD   parameter   Description
    #    03   00035       Wind speed, miles per hour
    #    07   00025       Barometric pressure, millimeters of mercury
    #    09   00045       Precipitation, total, inches
    #    19   63160       Stream water level elevation above NAVD 1988, in feet
    #
    # Data-value qualification codes included in this output:
    #    P  Provisional data subject to revision.
    #
    agency_cd   site_no          datetime          tz_cd  03_00035  03_00035_cd  07_00025  07_00025_cd  09_00045  09_00045_cd  19_63160  19_63160_cd
    5s          15s              20d               6s     14n       10s          14n       10s          14n       10s          14n       10s
    USGS        395740074482628  2012-10-28 13:00  EST    4.2       P            755       P                                   3.22      P
    USGS        395740074482628  2012-10-28 13:15  EST    6.4       P            754       P            0.00      P            3.36      P
    USGS        395740074482628  2012-10-28 13:30  EST    3.6       P            754       P            0.00      P            3.50      P
    USGS        395740074482628  2012-10-28 13:45  EST    3.2       P            754       P            0.00      P            3.63      P
    USGS        395740074482628  2012-10-28 14:00  EST    7.0       P            754       P            0.00      P            3.76      P
    USGS        395740074482628  2012-10-28 14:15  EST    4.0       P            754       P            0.00      P            3.87      P
    ...
""" # lat/lon point: http://waterservices.usgs.gov/nwis/site/?sites=395740074482628 variable_map = { '01_00065': { 'long_name': 'Gage height', 'geoid_name': 'NAVD88', 'vertical_datum': 'NAVD88', 'water_surface_reference_datum': 'NAVD88', 'standard_name': 'water_surface_height_above_reference_datum', 'units': 'feet' }, '03_00035': { 'long_name': 'Wind Speed', 'standard_name': 'wind_speed', 'units': 'mph' }, '04_00035': { 'long_name': 'Wind Gust', 'standard_name': 'wind_speed_of_gust', 'units': 'mph' }, '05_00035': { 'long_name': 'Wind Speed', 'standard_name': 'wind_speed', 'units': 'mph' }, '06_00035': { 'long_name': 'Wind Gust', 'standard_name': 'wind_speed_of_gust', 'units': 'mph' }, '04_00036': { 'long_name': 'Wind Direction', 'standard_name': 'wind_from_direction', 'units': 'degrees' }, '02_00036': { 'long_name': 'Wind Direction', 'standard_name': 'wind_from_direction', 'units': 'degrees' }, '05_00025': { 'long_name': 'Air Pressure', 'standard_name': 'air_pressure', 'units': 'mm of mercury' }, '07_00025': { 'long_name': 'Air Pressure', 'standard_name': 'air_pressure', 'units': 'mm of mercury' }, '09_00025': { 'long_name': 'Air Pressure', 'standard_name': 'air_pressure', 'units': 'mm of mercury' }, '03_00045': { 'long_name': 'Total Precipitation', 'standard_name': 'lwe_thickness_of_precipitation_amount', 'units': 'inches' }, '08_00045': { 'long_name': 'Total Precipitation', 'standard_name': 'lwe_thickness_of_precipitation_amount', 'units': 'inches' }, '09_00045': { 'long_name': 'Total Precipitation', 'standard_name': 'lwe_thickness_of_precipitation_amount', 'units': 'inches' }, '06_00052': { 'long_name': 'Relative Humidity', 'standard_name': 'relative_humidity', 'units': 'percent' }, '07_00052': { 'long_name': 'Relative Humidity', 'standard_name': 'relative_humidity', 'units': 'percent' }, '08_00052': { 'long_name': 'Relative Humidity', 'standard_name': 'relative_humidity', 'units': 'percent' }, '05_00020': { 'long_name': 'Air Temperature', 'standard_name': 'air_temperature', 'units': 'degrees_Celsius' }, '06_00020': { 'long_name': 'Air Temperature', 'standard_name': 'air_temperature', 'units': 'degrees_Celsius' }, '07_00020': { 'long_name': 'Air Temperature', 'standard_name': 'air_temperature', 'units': 'degrees_Celsius' }, '19_63160': { 'long_name': 'Water Surface Height Above Reference Datum (NAVD88)', 'geoid_name': 'NAVD88', 'vertical_datum': 'NAVD88', 'water_surface_reference_datum': 'NAVD88', 'standard_name': 'water_surface_height_above_reference_datum', 'units': 'feet' }, '01_63160': { 'long_name': 'Water Surface Height Above Reference Datum (NAVD88)', 'geoid_name': 'NAVD88', 'vertical_datum': 'NAVD88', 'water_surface_reference_datum': 'NAVD88', 'standard_name': 'water_surface_height_above_reference_datum', 'units': 'feet' }, } # Get metadata from a seperate endpoint. d = requests.get( "http://waterservices.usgs.gov/nwis/site/?sites={!s}".format(site_id)) try: d.raise_for_status() except requests.exceptions.HTTPError: logger.error( "Could not find lat/lon endpoint for station {!s}, skipping. Status code: {!s}" .format(site_id, d.status_code)) return _, hz, dz = split_file(d.text, "agency_cd") # Strip off the one line after the headers dz = dz[1:] dfz = pd.DataFrame(dz, columns=hz) lat = float(dfz["dec_lat_va"][0]) lon = float(dfz["dec_long_va"][0]) sensor_vertical_datum = dfz["alt_datum_cd"][0] or "NAVD88" try: z = float(dfz["alt_va"][0]) except ValueError: z = 0. 
loc = "POINT({!s} {!s} {!s})".format(lon, lat, z) name = dfz["station_nm"][0] comments, headers, data = split_file(contents, "agency_cd") df = pd.DataFrame(data, columns=headers) fillvalue = -9999.9 # Combine date columns dates = df["datetime"] tz = df["tz_cd"] new_dates = list() for i in range(len(dates)): try: new_dates.append( parse(dates[i] + " " + tz[i]).astimezone(pytz.utc)) except BaseException: # Remove row. Bad date. df.drop(i, axis=0, inplace=True) continue df['time'] = new_dates df['depth'] = [z for x in range(len(df['time']))] # Strip out "_cd" columns (quality checks for USGS) for h in headers: if "_cd" in h: df.drop(h, axis=1, inplace=True) # Add global attributes to appear in the resulting NetCDF file global_attributes = dict( title=name, summary= 'USGS Hurricane Sandy Rapid Response Stations. Data acquired from "http://ga.water.usgs.gov/flood/hurricane/sandy/datafiles/.', keywords= "usgs, waterdata, elevation, water, waterlevel, sandy, hurricane, rapid, response, %s" % site_id, keywords_vocaublary="None", naming_authority='gov.usgs', id=site_id, cdm_data_type="Station", history="NetCDF file generated from {!s}".format(csv_link), creator="USGS", creator_url="http://waterdata.usgs.gov", creator_institution="USGS", creator_urn="gov.usgs", publisher="Axiom Data Science", publisher_uri="http://axiomdatascience.com", processing_level="None", acknowledgement="None", geospatial_bounds=loc, geospatial_lat_min=lat, geospatial_lat_max=lat, geospatial_lon_min=lon, geospatial_lon_max=lon, license="Freely Distributed", date_created=datetime.utcnow().replace(second=0, microsecond=0).isoformat()) def to_floats(x): try: return float(x) except ValueError: return fillvalue min_time = df['time'].min() max_time = df['time'].max() full_station_urn = "urn:ioos:station:{!s}:{!s}".format( global_attributes["naming_authority"], site_id) if output_format == 'cf16': output_filename = '{}_{}-{}.nc'.format( site_id, min_time.strftime('%Y%m%dT%H%M%S'), max_time.strftime('%Y%m%dT%H%M%S')) times = [calendar.timegm(x.timetuple()) for x in df["time"]] verticals = df['depth'].values ts = TimeSeries(output, latitude=lat, longitude=lon, station_name=full_station_urn, global_attributes=global_attributes, output_filename=output_filename, times=times, verticals=verticals, vertical_axis_name='height', vertical_positive='down') for var in df.columns: if var in [ 'datetime', 'time', 'depth', 'tz_cd', 'site_no', 'agency_cd' ]: continue try: var_meta = variable_map[var] except KeyError: logger.error( "Variable {!s} was not found in variable map!".format(var)) continue # Convert to floats df[var] = df[var].map(to_floats) # Change feet to meters if var_meta["units"] in ["feet", "ft"]: df[var] = np.asarray( [v * 0.3048 if v != fillvalue else v for v in df[var]]) var_meta["units"] = "meters" if output_format == 'axiom': full_sensor_urn = "urn:ioos:sensor:{!s}:{!s}:{!s}".format( global_attributes["naming_authority"], site_id, var_meta["standard_name"]) output_directory = os.path.join(output, full_sensor_urn) output_filename = '{}_{}-{}.nc'.format( var, min_time.strftime('%Y%m%dT%H%M%S'), max_time.strftime('%Y%m%dT%H%M%S')) ts = TimeSeries.from_dataframe( df, output_directory, output_filename, lat, lon, full_station_urn, global_attributes, var_meta["standard_name"], var_meta, sensor_vertical_datum=sensor_vertical_datum, fillvalue=fillvalue, data_column=var, vertical_axis_name='height', vertical_positive='down') ts.add_instrument_metadata(urn=full_sensor_urn) elif output_format == 'cf16': # Variable names shouldn't start with 
a number try: int(var[0]) variable_name = 'v_{}'.format(var) except: variable_name = var ts.add_variable(variable_name, values=df[var].values, attributes=var_meta, fillvalue=fillvalue, sensor_vertical_datum=sensor_vertical_datum)
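

# --- Illustrative usage, not part of the original script ---
# parse_type_1() expects the raw text of one NWIS tab-delimited file plus the
# link it came from. A hypothetical driver for a single already-downloaded
# file (the file name and paths below are made up for the example):
#
# with open('395740074482628.txt') as f:
#     parse_type_1('cf16', '395740074482628', f.read(), output='./output',
#                  csv_link='http://ga.water.usgs.gov/flood/hurricane/sandy/datafiles/395740074482628.txt')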
def main(output_format, output, do_download, download_folder, filesubset=None):

    if do_download is True:
        try:
            os.makedirs(download_folder)
        except OSError:
            pass

        waf = 'http://ga.water.usgs.gov/flood/hurricane/sandy/datafiles/'
        r = requests.get(waf)
        soup = BeautifulSoup(r.text, "lxml")
        for link in soup.find_all('a'):
            # Skip non .txt files
            site_id, ext = os.path.splitext(link['href'])
            if ext != ".txt":
                continue
            if filesubset and site_id.lower() not in filesubset:
                # Skip this file!
                continue
            csv_link = waf + link['href']
            logger.info("Downloading '{}'".format(csv_link))
            d = requests.get(csv_link)
            try:
                d.raise_for_status()
            except requests.exceptions.HTTPError:
                logger.error("Could not download: {!s}, skipping. Status code: {!s}".format(csv_link, d.status_code))
                continue
            with open(os.path.join(download_folder, os.path.basename(csv_link)), 'wt') as f:
                f.write(d.text)

    # Yes, this uses lots of RAM, but we need to match up lon/lat positions later on.
    results = []
    for datafile in os.listdir(download_folder):
        site_id = os.path.splitext(os.path.basename(datafile))[0]
        if filesubset and site_id.lower() not in filesubset:
            # Skip this file!
            continue
        with open(os.path.join(download_folder, datafile)) as d:
            contents = d.read()
        r = None
        for line in contents.split("\n"):
            if "agency_cd" in line:
                r = parse_type_1(output_format, site_id, contents, output)
                break
            elif "date_time_GMT" in line:
                r = parse_type_2(output_format, site_id, contents, output)
                break
            else:
                continue
        if r is None:
            logger.error('Could not process file: {}'.format(datafile))
        else:
            logger.info("Processed {}".format(datafile))
            results.append(r)

    results = sorted(results, key=attrgetter('lon', 'lat'))
    gresults = groupby(results, attrgetter('lon', 'lat'))

    for (glon, glat), group in gresults:
        groups = [x for x in list(group) if x]

        # Strip off the variable type if need be
        gsite = groups[0].site
        if gsite[-2:] in ['WV', 'BP', 'WL']:
            gsite = gsite[:-2]

        for result in groups:

            gas = get_globals(glat, glon, result.z, result.name, gsite)
            station_urn = IoosUrn(asset_type='station', authority=gas['naming_authority'], label=gsite)

            if output_format == 'cf16':
                # If CF, a file for each result dataframe
                times = [calendar.timegm(x.timetuple()) for x in result.df['time']]
                verticals = result.df['depth'].values
                output_filename = '{}.nc'.format(result.site)
                ts = TimeSeries(output, latitude=glat, longitude=glon, station_name=gsite, global_attributes=gas, output_filename=output_filename, times=times, verticals=verticals)

            for var in result.df.columns:
                if var in ['date_time_GMT', 'datetime', 'time', 'depth', 'tz_cd', 'site_no', 'agency_cd']:
                    continue

                try:
                    var_meta = copy(variable_map[var])
                except KeyError:
                    logger.error("Variable {!s} was not found in variable map!".format(var))
                    continue

                # Convert to floats
                result.df[var] = result.df[var].map(to_floats)

                if var_meta["units"].lower() in ["feet", "ft"]:
                    result.df[var] = result.df[var].apply(lambda x: None if pd.isnull(x) else x * 0.3048)
                    var_meta["units"] = "meters"
                elif var_meta["units"].lower() in ["psi"]:
                    result.df[var] = result.df[var].apply(lambda x: None if pd.isnull(x) else x * 68.9476)
                    var_meta["units"] = "mbar"
                elif var_meta["units"].lower() in ['millimeters of mercury']:
                    result.df[var] = result.df[var].apply(lambda x: None if pd.isnull(x) else x * 1.33322)
                    var_meta["units"] = "mbar"

                # Now put the fillvalue we want to be interpreted
                result.df.fillna(fillvalue, inplace=True)

                if output_format == 'axiom':
                    # If Axiom, a file for each variable
                    output_directory = os.path.join(output, gsite)
                    output_filename = '{}_{}.nc'.format(result.site, var_meta['standard_name'])
                    ts = TimeSeries.from_dataframe(result.df, output_directory, output_filename, glat, glon, station_urn.urn, gas, var_meta["standard_name"], var_meta, sensor_vertical_datum='NAVD88', fillvalue=fillvalue, data_column=var, vertical_axis_name='height')
                    sensor_urn = urnify(station_urn.authority, station_urn.label, var_meta)
                    ts.add_instrument_metadata(urn=sensor_urn)
                elif output_format == 'cf16':
                    # If CF, add variable to existing TimeSeries
                    try:
                        int(var[0])
                        variable_name = 'v_{}'.format(var)
                    except ValueError:
                        variable_name = var
                    ts.add_variable(variable_name, values=result.df[var].values, attributes=var_meta, fillvalue=fillvalue, sensor_vertical_datum='NAVD88')
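

# --- Illustrative sketch, not part of the original script ---
# The unit handling in main() applies a fixed multiplier per source unit
# (feet -> meters, psi -> mbar, millimeters of mercury -> mbar) while leaving
# nulls untouched. A hypothetical table-driven version of the same conversion:
_UNIT_FACTORS = {
    'feet': ('meters', 0.3048),
    'ft': ('meters', 0.3048),
    'psi': ('mbar', 68.9476),
    'millimeters of mercury': ('mbar', 1.33322),
}


def _convert_units_sketch(series, units):
    """Return (converted pandas Series, new unit string)."""
    new_units, factor = _UNIT_FACTORS.get(units.lower(), (units, 1.0))
    return series.apply(lambda x: None if pd.isnull(x) else x * factor), new_units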
                verticals=df.depth.values,
                output_filename=ofile,
                vertical_positive='down')


# ### Add data variables

# In[9]:

df.columns.tolist()


# In[10]:

for c in df.columns:
    if c in ts._nc.variables:
        print("Skipping '{}' (already in file)".format(c))
        continue
    if c in ['time', 'lat', 'lon', 'depth', 'cpm_date_time_string']:
        print("Skipping axis '{}' (already in file)".format(c))
        continue
    print("Adding {}".format(c))
    try:
        ts.add_variable(c, df[c].values)
    except Exception:
        print('skipping, hit object')


# In[13]:

df['error_flag1'].dtype.name


# In[ ]:
# In[11]:

for c in df.columns:
    if c in ts._nc.variables:
        print("Skipping '{}' (already in file)".format(c))
        continue
    if c in ['time', 'lat', 'lon', 'depth', 'cpm_date_time_string']:
        print("Skipping axis '{}' (already in file)".format(c))
        continue
    if 'object' in df[c].dtype.name:
        print("Skipping object {}".format(c))
        continue
    print("Adding {}".format(c))
    # add variable values and variable attributes here
    ts.add_variable(c, df[c].values, attributes=atts.get(c))


# In[12]:

df['error_flag3'][0]


# In[13]:

ts.ncd


# In[14]:

import netCDF4
nc = netCDF4.Dataset(outfile)


# In[15]: