def create_netcdf(self, storage_indices, data_descriptor):
    '''
    Function to create netCDF-CF file for specified storage indices
    '''
    temp_storage_path = self.get_temp_storage_path(storage_indices)
    storage_path = self.get_storage_path(self.storage_type, storage_indices)
    make_dir(os.path.dirname(storage_path))

    if self.dryrun:
        return storage_path

    if os.path.isfile(storage_path) and not self.force:
        logger.warning('Skipping existing storage unit %s' % storage_path)
        return
        # return storage_path #TODO: Remove this temporary debugging hack

    t_indices = np.array([dt2secs(record_dict['end_datetime']) for record_dict in data_descriptor])

    gdfnetcdf = GDFNetCDF(storage_config=self.storage_config[self.storage_type])

    logger.debug('Creating temporary storage unit %s with %d timeslices',
                 temp_storage_path, len(data_descriptor))
    gdfnetcdf.create(netcdf_filename=temp_storage_path,
                     index_tuple=storage_indices,
                     dimension_index_dict={'T': t_indices},
                     netcdf_format=None)
    del t_indices

    # Set georeferencing from first or second tile for fault tolerance
    try:
        gdfnetcdf.georeference_from_file(data_descriptor[0]['tile_pathname'])
    except:
        gdfnetcdf.georeference_from_file(data_descriptor[1]['tile_pathname'])

    variable_dict = self.storage_config[self.storage_type]['measurement_types']
    variable_names = variable_dict.keys()

    # Expected shape is (bands, <regular dimension sizes>)
    array_shape = tuple([len(variable_dict)] +
                        [dim['dimension_elements']
                         for dim in self.storage_config[self.storage_type]['dimensions'].values()
                         if dim['indexing_type'] == 'regular'])

    # All data types and no-data values should be the same - just use the first one
    array_dtype = variable_dict[variable_names[0]]['numpy_datatype_name']
    nodata_value = variable_dict[variable_names[0]]['nodata_value']
    if nodata_value is None:
        nodata_value = np.nan

    slice_index = 0
    for record_dict in data_descriptor:
        try:
            tile_dataset = gdal.Open(record_dict['tile_pathname'])
            assert tile_dataset, 'Failed to open tile file %s' % record_dict['tile_pathname']

            logger.debug('Reading array data from tile file %s (%d/%d)',
                         record_dict['tile_pathname'], slice_index + 1, len(data_descriptor))
            data_array = tile_dataset.ReadAsArray()

            assert data_array.shape == array_shape, 'Tile array shape is not %s' % (array_shape,)
        except Exception, e:
            # Can't read data_array from GeoTIFF - create an empty (no-data) array instead
            logger.warning('WARNING: Unable to read array from tile - empty array created: %s', e.message)
            data_array = np.ones(array_shape, array_dtype) * nodata_value

        logger.debug('data_array.shape = %s', data_array.shape)

        #TODO: Set up proper mapping between AGDC & GDF bands so this works with non-contiguous ranges
        for variable_index in range(len(variable_dict)):
            variable_name = variable_names[variable_index]
            logger.debug('Writing array to variable %s', variable_name)
            if len(data_array.shape) == 3:
                gdfnetcdf.write_slice(variable_name, data_array[variable_index], {'T': slice_index})
            elif len(data_array.shape) == 2:
                gdfnetcdf.write_slice(variable_name, data_array, {'T': slice_index})
            gdfnetcdf.sync()  # Write cached data to disk

        slice_index += 1
def write_gdf_data(self, storage_indices, data_descriptor, storage_unit_path):
    '''
    Function to write records to database. Must occur in a single transaction
    '''
    def get_storage_key(record, storage_unit_path):
        '''
        Function to write storage unit record if required and return storage unit key
        (tuple containing storage_type_id, storage_id & storage_version)
        '''
        SQL = '''-- Attempt to insert a storage record and return storage key
insert into storage(storage_type_id, storage_id, storage_version, storage_location, md5_checksum, storage_bytes, spatial_footprint_id)
select
    %(storage_type_id)s,
    nextval('storage_id_seq'::regclass),
    0, -- storage_version
    %(storage_location)s,
    NULL, NULL, NULL
where not exists (
    select storage_type_id, storage_id, storage_version from storage
    where storage_type_id = %(storage_type_id)s
    and storage_location = %(storage_location)s
    );

select storage_type_id, storage_id, storage_version from storage
where storage_type_id = %(storage_type_id)s
and storage_location = %(storage_location)s;
'''
        params = {'storage_type_id': self.storage_type_config['storage_type_id'],
                  'storage_location': self.get_storage_filename(self.storage_type, storage_indices)
                  }

        log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')

        if self.dryrun:
            return (None, None, None)

        storage_id_result = self.database.submit_query(SQL, params)
        assert storage_id_result.record_count == 1, \
            '%d records retrieved for storage_id query' % storage_id_result.record_count

        return (storage_id_result.field_values['storage_type_id'][0],
                storage_id_result.field_values['storage_id'][0],
                storage_id_result.field_values['storage_version'][0])

    def get_observation_key(record):
        '''
        Function to write observation (acquisition) record if required and return observation key
        (tuple containing observation_type_id and observation_id)
        '''
        SQL = '''-- Attempt to insert an observation record and return observation key
insert into observation(observation_type_id, observation_id, observation_start_datetime, observation_end_datetime, instrument_type_id, instrument_id)
select
    1, -- Optical Satellite
    nextval('observation_id_seq'::regclass),
    %(observation_start_datetime)s,
    %(observation_end_datetime)s,
    1, -- Passive Satellite-borne
    (select instrument_id from instrument where instrument_tag = %(instrument_tag)s)
where not exists (
    select observation_id from observation
    where observation_type_id = 1 -- Optical Satellite
    and instrument_type_id = 1 -- Passive Satellite-borne
    and instrument_id = (select instrument_id from instrument where instrument_tag = %(instrument_tag)s)
    and observation_start_datetime = %(observation_start_datetime)s
    and observation_end_datetime = %(observation_end_datetime)s
    );

select observation_type_id, observation_id from observation
where observation_type_id = 1 -- Optical Satellite
and instrument_type_id = 1 -- Passive Satellite-borne
and instrument_id = (select instrument_id from instrument where instrument_tag = %(instrument_tag)s)
and observation_start_datetime = %(observation_start_datetime)s
and observation_end_datetime = %(observation_end_datetime)s;
'''
        params = {'instrument_tag': record['sensor_name'],
                  'observation_start_datetime': record['start_datetime'],
                  'observation_end_datetime': record['end_datetime']
                  }

        log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')

        if self.dryrun:
            return (None, None)

        observation_id_result = self.database.submit_query(SQL, params)
        assert observation_id_result.record_count == 1, \
            '%d records retrieved for observation_id query' % observation_id_result.record_count

        return (observation_id_result.field_values['observation_type_id'][0],
                observation_id_result.field_values['observation_id'][0])

    def get_dataset_key(record, observation_key):
        '''
        Function to write dataset record if required and return dataset key
        (tuple containing dataset_type_id & dataset_id)
        '''
        SQL = '''-- Attempt to insert a dataset record and return dataset key
insert into dataset(dataset_type_id, dataset_id, observation_type_id, observation_id, dataset_location, creation_datetime)
select
    (select dataset_type_id from dataset_type where dataset_type_tag = %(dataset_type_tag)s),
    nextval('dataset_id_seq'::regclass),
    %(observation_type_id)s,
    %(observation_id)s,
    %(dataset_location)s,
    %(creation_datetime)s
where not exists (
    select dataset_id from dataset
    where observation_type_id = %(observation_type_id)s
    and observation_id = %(observation_id)s
    and dataset_location = %(dataset_location)s
    );

select dataset_type_id, dataset_id from dataset
where observation_type_id = %(observation_type_id)s
and observation_id = %(observation_id)s
and dataset_location = %(dataset_location)s;
'''
        params = {'dataset_type_tag': 'PQ' if record['level_name'] == 'PQA' else record['level_name'],
                  'observation_type_id': observation_key[0],
                  'observation_id': observation_key[1],
                  'dataset_location': record['dataset_path'],
                  'creation_datetime': record['datetime_processed'].replace(tzinfo=pytz.UTC)  # Convert naive time to UTC
                  }

        log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')

        if self.dryrun:
            return (None, None)  # Placeholder key, consistent with the other get_*_key functions

        dataset_id_result = self.database.submit_query(SQL, params)
        assert dataset_id_result.record_count == 1, \
            '%d records retrieved for dataset_id query' % dataset_id_result.record_count

        return (dataset_id_result.field_values['dataset_type_id'][0],
                dataset_id_result.field_values['dataset_id'][0])

    def set_dataset_metadata(record, dataset_key):
        '''
        Function to write dataset_metadata record if required
        '''
        SQL = '''-- Attempt to insert dataset_metadata records
insert into dataset_metadata(dataset_type_id, dataset_id, metadata_xml)
select
    %(dataset_type_id)s,
    %(dataset_id)s,
    %(xml_text)s::xml
where not exists (
    select * from dataset_metadata
    where dataset_type_id = %(dataset_type_id)s
    and dataset_id = %(dataset_id)s
    )
and xml_is_well_formed(%(xml_text)s);
'''
        params = {'dataset_type_id': dataset_key[0],
                  'dataset_id': dataset_key[1],
                  'xml_text': record['xml_text']
                  }

        log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')

        if self.dryrun:
            return

        self.database.submit_query(SQL, params)

    def set_dataset_dimensions(dataset_key, dimension_key, min_index_max_tuple):
        '''
        Function to write dataset_dimension record if required
        '''
        SQL = '''-- Attempt to insert dataset_dimension records
insert into dataset_dimension(dataset_type_id, dataset_id, domain_id, dimension_id, min_value, max_value, indexing_value)
select
    %(dataset_type_id)s,
    %(dataset_id)s,
    %(domain_id)s,
    %(dimension_id)s,
    %(min_value)s,
    %(max_value)s,
    %(indexing_value)s
where not exists (
    select * from dataset_dimension
    where dataset_type_id = %(dataset_type_id)s
    and dataset_id = %(dataset_id)s
    and domain_id = %(domain_id)s
    and dimension_id = %(dimension_id)s
    );
'''
        params = {'dataset_type_id': dataset_key[0],
                  'dataset_id': dataset_key[1],
                  'domain_id': dimension_key[0],
                  'dimension_id': dimension_key[1],
                  'min_value': min_index_max_tuple[0],
                  'indexing_value': min_index_max_tuple[1],
                  'max_value': min_index_max_tuple[2]
                  }

        log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')

        if self.dryrun:
            return

        self.database.submit_query(SQL, params)

    def set_storage_dataset(storage_key, dataset_key):
        '''
        Function to write storage_dataset record if required
        '''
        SQL = '''-- Attempt to insert storage_dataset record
insert into storage_dataset(storage_type_id, storage_id, storage_version, dataset_type_id, dataset_id)
select
    %(storage_type_id)s,
    %(storage_id)s,
    %(storage_version)s,
    %(dataset_type_id)s,
    %(dataset_id)s
where not exists (
    select * from storage_dataset
    where storage_type_id = %(storage_type_id)s
    and storage_id = %(storage_id)s
    and storage_version = %(storage_version)s
    and dataset_type_id = %(dataset_type_id)s
    and dataset_id = %(dataset_id)s
    );
'''
        params = {'storage_type_id': storage_key[0],
                  'storage_id': storage_key[1],
                  'storage_version': storage_key[2],
                  'dataset_type_id': dataset_key[0],
                  'dataset_id': dataset_key[1]
                  }

        log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')

        if self.dryrun:
            return

        self.database.submit_query(SQL, params)

    def set_storage_dimension(storage_key, dimension_key, min_index_max_tuple):
        '''
        Function to write storage_dimension record if required
        '''
        SQL = '''-- Attempt to insert storage_dimension record
insert into storage_dimension(storage_type_id, storage_id, storage_version, domain_id, dimension_id, storage_dimension_index, storage_dimension_min, storage_dimension_max)
select
    %(storage_type_id)s,
    %(storage_id)s,
    %(storage_version)s,
    %(domain_id)s,
    %(dimension_id)s,
    %(storage_dimension_index)s,
    %(storage_dimension_min)s,
    %(storage_dimension_max)s
where not exists (
    select * from storage_dimension
    where storage_type_id = %(storage_type_id)s
    and storage_id = %(storage_id)s
    and storage_version = %(storage_version)s
    and domain_id = %(domain_id)s
    and dimension_id = %(dimension_id)s
    );
'''
        params = {'storage_type_id': storage_key[0],
                  'storage_id': storage_key[1],
                  'storage_version': storage_key[2],
                  'domain_id': dimension_key[0],
                  'dimension_id': dimension_key[1],
                  'storage_dimension_min': min_index_max_tuple[0],
                  'storage_dimension_index': min_index_max_tuple[1],
                  'storage_dimension_max': min_index_max_tuple[2]
                  }

        log_multiline(logger.debug, SQL, 'SQL', '\t')
        log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')

        if self.dryrun:
            return

        self.database.submit_query(SQL, params)

    # Start of write_gdf_data(self, storage_indices, data_descriptor, storage_unit_path) definition
    assert os.path.isfile(storage_unit_path), 'Storage unit file %s does not exist' % storage_unit_path

    # Keep all database operations in the same transaction
    self.database.keep_connection = True
    self.database.autocommit = False

    try:
        # Get storage unit key - this doesn't change from record to record
        record = data_descriptor[0]
        storage_key = get_storage_key(record, storage_unit_path)
        logger.debug('storage_key = %s', storage_key)

        # Set storage_dimension record for each dimension
        logger.debug('self.dimensions = %s', self.dimensions)
        for dimension_index in range(len(self.dimensions)):
            logger.debug('dimension_index = %d', dimension_index)
            dimension = self.dimensions.keys()[dimension_index]
            logger.debug('dimension = %s', dimension)
            dimension_key = (self.dimensions[dimension]['domain_id'],
                             self.dimensions[dimension]['dimension_id']
                             )
            logger.debug('dimension_key = %s', dimension_key)
            min_index_max_tuple = (self.index2ordinate(self.storage_type, dimension, storage_indices[dimension_index]),
                                   storage_indices[dimension_index],  # Indexing value
                                   self.index2ordinate(self.storage_type, dimension, storage_indices[dimension_index] + 1)
                                   )
            set_storage_dimension(storage_key, dimension_key, min_index_max_tuple)

        # Process each tile record
        for record in data_descriptor:
            observation_key = get_observation_key(record)
            logger.debug('observation_key = %s', observation_key)
            dataset_key = get_dataset_key(record, observation_key)
            logger.debug('dataset_key = %s', dataset_key)

            set_dataset_metadata(record, dataset_key)

            # Set dataset_dimension record for each dimension
            for dimension in self.dimensions:
                dimension_key = (self.storage_type_config['dimensions'][dimension]['domain_id'],
                                 self.storage_type_config['dimensions'][dimension]['dimension_id']
                                 )
                if dimension == 'X':
                    min_index_max_tuple = (min(record['ul_x'], record['ll_x']),
                                           None,  # No indexing value for regular dimension
                                           max(record['ur_x'], record['lr_x'])
                                           )
                elif dimension == 'Y':
                    min_index_max_tuple = (min(record['ll_y'], record['lr_y']),
                                           None,  # No indexing value for regular dimension
                                           max(record['ul_y'], record['ur_y'])
                                           )
                elif dimension == 'T':
                    min_value = dt2secs(record['start_datetime'])
                    max_value = dt2secs(record['end_datetime'])
                    min_index_max_tuple = (min_value,
                                           int((min_value + max_value) / 2.0 + 0.5),  # Mid-point as indexing value
                                           max_value
                                           )

                set_dataset_dimensions(dataset_key, dimension_key, min_index_max_tuple)

            set_storage_dataset(storage_key, dataset_key)

        self.database.commit()  # Commit transaction
    except Exception, caught_exception:
        try:
            self.database.rollback()  # Rollback transaction
        except:
            pass
        raise caught_exception
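Every helper inside write_gdf_data above relies on the same idempotent pattern: an "insert ... where not exists" guarded by the record's natural key, followed by a select that returns the key whether or not the insert fired, all inside the enclosing transaction. The sketch below illustrates that pattern in isolation using plain psycopg2; the connection string, simplified table columns and function name are illustrative assumptions, not part of the GDF module.

# Minimal sketch of the insert-if-missing-then-select pattern used by the helpers above.
# Assumes a psycopg2 connection and a simplified 'storage' table/sequence; illustrative only.
import psycopg2


def get_or_create_storage_key(connection, storage_type_id, storage_location):
    sql = '''insert into storage(storage_type_id, storage_id, storage_version, storage_location)
select %(storage_type_id)s, nextval('storage_id_seq'::regclass), 0, %(storage_location)s
where not exists (
    select 1 from storage
    where storage_type_id = %(storage_type_id)s
    and storage_location = %(storage_location)s
    );

select storage_type_id, storage_id, storage_version from storage
where storage_type_id = %(storage_type_id)s
and storage_location = %(storage_location)s;
'''
    params = {'storage_type_id': storage_type_id,
              'storage_location': storage_location}
    cursor = connection.cursor()
    # Both statements are sent in one round trip within the current transaction;
    # psycopg2 exposes the result set of the final statement (the select) for fetching.
    cursor.execute(sql, params)
    return cursor.fetchone()  # (storage_type_id, storage_id, storage_version)


# Hypothetical usage - connection details are placeholders:
# connection = psycopg2.connect('dbname=agdc')
# storage_key = get_or_create_storage_key(connection, 100, '/g/data/LS5TM_148_-036_2010.nc')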
import calendar
from datetime import datetime, date, timedelta
from gdf import GDF, dt2secs, secs2dt
from pprint import pprint
import matplotlib.pyplot as plt


# In[4]:

g = GDF()
g.debug = False


# In[5]:

start_date = dt2secs(date(year=2010, month=1, day=1))
end_date = dt2secs(date(year=2010, month=1, day=18))

data_request_descriptor = {'storage_type': 'LS5TM',
                           'variables': ('B40', 'B30'),
                           'dimensions': {'X': {'range': (147.875, 148.125)},
                                          'Y': {'range': (-37.0 + 0.875, -36.0 + 0.125)},
#                                          'T': {'range': (start_date, end_date),
#                                                'array_range': (0, 4),
#                                                'crs': 'SSE',  # Seconds since epoch
#                                                'grouping_function': g.null_grouping
#                                                }
                                          }
                           }
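The descriptor built in the cell above is the kind of argument the GDF query interface consumes. The commented-out cell below is a hedged sketch of how it might be used; the get_descriptor/get_data method names and the shape of the returned structure are assumptions about the GDF API, not taken from this document.

# In[ ]:

# Hedged usage sketch - method names and result structure are assumed, not confirmed here
# descriptor = g.get_descriptor(data_request_descriptor)  # summarise the data available for the request
# pprint(descriptor)
# result = g.get_data(data_request_descriptor)            # fetch B40/B30 arrays for the requested extent
# plt.imshow(result['arrays']['B40'][0])                  # e.g. plot the first timeslice of B40
# plt.show()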
def create_netcdf(self, storage_indices, data_descriptor):
    '''
    Function to create netCDF-CF file for specified storage indices
    '''
    temp_storage_path = self.get_temp_storage_path(storage_indices)
    storage_path = self.get_storage_path(self.storage_type, storage_indices)
    make_dir(os.path.dirname(storage_path))

    if self.dryrun:
        return storage_path

    if os.path.isfile(storage_path) and not self.force:
        logger.warning('Skipping existing storage unit %s' % storage_path)
        return
        # return storage_path #TODO: Remove this temporary debugging hack

    t_indices = np.array([dt2secs(record_dict['end_datetime']) for record_dict in data_descriptor])

    gdfnetcdf = GDFNetCDF(storage_config=self.storage_config[self.storage_type])

    logger.debug('Creating temporary storage unit %s with %d timeslices',
                 temp_storage_path, len(data_descriptor))
    gdfnetcdf.create(netcdf_filename=temp_storage_path,
                     index_tuple=storage_indices,
                     dimension_index_dict={'T': t_indices},
                     netcdf_format=None)
    del t_indices

    # Set georeferencing from first tile
    gdfnetcdf.georeference_from_file(data_descriptor[0]['tile_pathname'])

    variable_dict = self.storage_config[self.storage_type]['measurement_types']
    variable_names = variable_dict.keys()

    slice_index = 0
    for record_dict in data_descriptor:
        tile_dataset = gdal.Open(record_dict['tile_pathname'])
        assert tile_dataset, 'Failed to open tile file %s' % record_dict['tile_pathname']

        logger.debug('Reading array data from tile file %s (%d/%d)',
                     record_dict['tile_pathname'], slice_index + 1, len(data_descriptor))
        data_array = tile_dataset.ReadAsArray()
        logger.debug('data_array.shape = %s', data_array.shape)

        #TODO: Set up proper mapping between AGDC & GDF bands so this works with non-contiguous ranges
        for variable_index in range(len(variable_dict)):
            variable_name = variable_names[variable_index]
            logger.debug('Writing array to variable %s', variable_name)
            if len(data_array.shape) == 3:
                gdfnetcdf.write_slice(variable_name, data_array[variable_index], {'T': slice_index})
            elif len(data_array.shape) == 2:
                gdfnetcdf.write_slice(variable_name, data_array, {'T': slice_index})
            gdfnetcdf.sync()  # Write cached data to disk

        slice_index += 1

    del gdfnetcdf  # Close the netCDF file

    logger.debug('Moving temporary storage unit %s to %s', temp_storage_path, storage_path)
    if os.path.isfile(storage_path):
        logger.debug('Removing existing storage unit %s' % storage_path)
        os.remove(storage_path)
    shutil.move(temp_storage_path, storage_path)

    return storage_path