Example #1
    def create_netcdf(self, storage_indices, data_descriptor):
        '''
        Function to create netCDF-CF file for specified storage indices
        '''
        temp_storage_path = self.get_temp_storage_path(storage_indices)
        storage_path = self.get_storage_path(self.storage_type, storage_indices)
        make_dir(os.path.dirname(storage_path))        
        
        if self.dryrun:
            return storage_path
        
        if os.path.isfile(storage_path) and not self.force: 
            logger.warning('Skipping existing storage unit %s' % storage_path)
            return 
#            return storage_path #TODO: Remove this temporary debugging hack
        
        t_indices = np.array([dt2secs(record_dict['end_datetime']) for record_dict in data_descriptor])
        
        gdfnetcdf = GDFNetCDF(storage_config=self.storage_config[self.storage_type])
        
        logger.debug('Creating temporary storage unit %s with %d timeslices', temp_storage_path, len(data_descriptor))
        gdfnetcdf.create(netcdf_filename=temp_storage_path, 
                         index_tuple=storage_indices, 
                         dimension_index_dict={'T': t_indices}, netcdf_format=None)
        del t_indices
        
        # Set georeferencing from first or second tile for fault tolerance
        try:
            gdfnetcdf.georeference_from_file(data_descriptor[0]['tile_pathname'])
        except Exception:
            gdfnetcdf.georeference_from_file(data_descriptor[1]['tile_pathname'])

        variable_dict = self.storage_config[self.storage_type]['measurement_types']
        variable_names = variable_dict.keys()
                
        array_shape = tuple([len(variable_dict)] +
                            [dim['dimension_elements'] 
                             for dim in self.storage_config[self.storage_type]['dimensions'].values() 
                             if dim['indexing_type'] == 'regular']
                            )
        
        # All data types and no-data values should be the same - just use first one
        array_dtype = variable_dict[variable_dict.keys()[0]]['numpy_datatype_name']

        nodata_value = variable_dict[variable_dict.keys()[0]]['nodata_value']
        if nodata_value is None:
            nodata_value = np.nan
                    
        slice_index = 0
        for record_dict in data_descriptor:
            try:
                tile_dataset = gdal.Open(record_dict['tile_pathname'])
                assert tile_dataset, 'Failed to open tile file %s' % record_dict['tile_pathname']
            
                logger.debug('Reading array data from tile file %s (%d/%d)', record_dict['tile_pathname'], slice_index + 1, len(data_descriptor))
                data_array = tile_dataset.ReadAsArray()

                assert data_array.shape == array_shape, 'Tile array shape is not %s' % array_shape
            except Exception, e:
                # Can't read data_array from GeoTIFF - create empty data_array instead
                logger.warning('WARNING: Unable to read array from tile - empty array created: %s', e.message)

                data_array = np.ones(array_shape, array_dtype) * nodata_value
                
            logger.debug('data_array.shape = %s', data_array.shape)
            
            #TODO: Set up proper mapping between AGDC & GDF bands so this works with non-contiguous ranges
            for variable_index in range(len(variable_dict)):
                variable_name = variable_names[variable_index]
                logger.debug('Writing array to variable %s', variable_name)
                if len(data_array.shape) == 3:
                    gdfnetcdf.write_slice(variable_name, data_array[variable_index], {'T': slice_index})
                elif len(data_array.shape) == 2:
                    gdfnetcdf.write_slice(variable_name, data_array, {'T': slice_index})

            gdfnetcdf.sync() # Write cached data to disk      
            slice_index += 1
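
The except branch above keeps the ingest going when a tile cannot be read: it logs a warning and substitutes a slice filled with the no-data value so the storage unit keeps its full time axis. A minimal standalone sketch of that fallback, with a placeholder path, shape, dtype and no-data value (none of them taken from the example), might look like this:

# Minimal sketch: read a tile with GDAL, fall back to an all-nodata slice on failure.
# The path, shape, dtype and no-data value below are placeholders for illustration.
import numpy as np
from osgeo import gdal

tile_pathname = 'tile.tif'        # hypothetical tile path
array_shape = (6, 4000, 4000)     # hypothetical (bands, rows, cols)
array_dtype = 'int16'             # hypothetical datatype
nodata_value = -999               # hypothetical no-data value

try:
    tile_dataset = gdal.Open(tile_pathname)
    assert tile_dataset is not None, 'Failed to open tile file %s' % tile_pathname
    data_array = tile_dataset.ReadAsArray()
    assert data_array.shape == array_shape, 'Tile array shape is not %s' % (array_shape,)
except Exception as e:
    # Could not read the GeoTIFF - use an all-nodata slice so the time axis stays complete
    print('Unable to read array from tile - empty array created: %s' % e)
    data_array = np.full(array_shape, nodata_value, dtype=array_dtype)

np.full is a one-step alternative to the np.ones(...) * nodata_value idiom used above for numeric fill values.
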
Example #2
    def write_gdf_data(self, storage_indices, data_descriptor, storage_unit_path):
        '''
        Function to write records to database. Must occur in a single transaction
        '''

        def get_storage_key(record, storage_unit_path):
            '''
            Function to write storage unit record if required and return storage unit ID (tuple containing storage_type_id & storage_id)
            '''
            SQL ='''-- Attempt to insert a storage record and return storage key 
insert into storage(
    storage_type_id,
    storage_id,
    storage_version,
    storage_location,
    md5_checksum,
    storage_bytes,
    spatial_footprint_id
    )  
select
    %(storage_type_id)s,
    nextval('storage_id_seq'::regclass),
    0, -- storage_version
    %(storage_location)s,
    NULL,
    NULL,
    NULL
where not exists (
    select storage_type_id, storage_id, storage_version from storage 
    where storage_type_id =%(storage_type_id)s
    and storage_location = %(storage_location)s
    );
            
select storage_type_id, storage_id, storage_version from storage
where storage_type_id =%(storage_type_id)s
    and storage_location = %(storage_location)s;
'''            
            params = {'storage_type_id': self.storage_type_config['storage_type_id'],
                      'storage_location': self.get_storage_filename(self.storage_type, storage_indices)
                      }
            
            log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')
            
            if self.dryrun:
                return (None, None, None)
            
            storage_id_result = self.database.submit_query(SQL, params)
            assert storage_id_result.record_count == 1, '%d records retrieved for storage_id query' % storage_id_result.record_count
            return (storage_id_result.field_values['storage_type_id'][0], 
                    storage_id_result.field_values['storage_id'][0],
                    storage_id_result.field_values['storage_version'][0])
            
        def get_observation_key(record):
            '''
            Function to write observation (acquisition) record if required and return observation ID (tuple containing observation_type_id and observation_id)
            '''
            SQL = '''-- Attempt to insert an observation record and return observation key
insert into observation(
    observation_type_id,
    observation_id,
    observation_start_datetime,
    observation_end_datetime,
    instrument_type_id,
    instrument_id
    )
select
    1, -- Optical Satellite
    nextval('observation_id_seq'::regclass),
    %(observation_start_datetime)s,
    %(observation_end_datetime)s,
    1, -- Passive Satellite-borne
    (select instrument_id from instrument where instrument_tag = %(instrument_tag)s)
where not exists (
    select observation_id from observation
    where observation_type_id = 1 -- Optical Satellite
    and instrument_type_id = 1 -- Passive Satellite-borne
    and instrument_id = (select instrument_id from instrument where instrument_tag = %(instrument_tag)s)
    and observation_start_datetime = %(observation_start_datetime)s
    and observation_end_datetime = %(observation_end_datetime)s
    );

select observation_type_id, observation_id from observation
where observation_type_id = 1 -- Optical Satellite
    and instrument_type_id = 1 -- Passive Satellite-borne
    and instrument_id = (select instrument_id from instrument where instrument_tag = %(instrument_tag)s)
    and observation_start_datetime = %(observation_start_datetime)s
    and observation_end_datetime = %(observation_end_datetime)s;
'''
            params = {'instrument_tag': record['sensor_name'],
                      'observation_start_datetime': record['start_datetime'],
                      'observation_end_datetime': record['end_datetime']
                      }
            
            log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')
            
            if self.dryrun:
                return (None, None)
            
            observation_id_result = self.database.submit_query(SQL, params)
            assert observation_id_result.record_count == 1, '%d records retrieved for observation_id query' % observation_id_result.record_count
            return (observation_id_result.field_values['observation_type_id'][0], 
                    observation_id_result.field_values['observation_id'][0])
           
        
        def get_dataset_key(record, observation_key):
            '''
            Function to write observation (acquisition) record if required and return dataset ID (tuple containing dataset_type_id & dataset_id)
            '''
            SQL = '''-- Attempt to insert a dataset record and return dataset_id
insert into dataset(
    dataset_type_id,
    dataset_id,
    observation_type_id,
    observation_id,
    dataset_location,
    creation_datetime
    )
select
    (select dataset_type_id from dataset_type where dataset_type_tag = %(dataset_type_tag)s),
    nextval('dataset_id_seq'::regclass),
    %(observation_type_id)s,
    %(observation_id)s,
    %(dataset_location)s,
    %(creation_datetime)s
where not exists (
    select dataset_id from dataset
    where observation_type_id = %(observation_type_id)s
        and observation_id = %(observation_id)s
        and dataset_location = %(dataset_location)s
    );

select dataset_type_id, dataset_id from dataset
where observation_type_id = %(observation_type_id)s
    and observation_id = %(observation_id)s
    and dataset_location = %(dataset_location)s
'''
            params = {'dataset_type_tag': 'PQ' if record['level_name'] == 'PQA' else record['level_name'],
                      'observation_type_id': observation_key[0],
                      'observation_id': observation_key[1],
                      'dataset_location': record['dataset_path'],
                      'creation_datetime': record['datetime_processed'].replace(tzinfo=pytz.UTC) # Convert naive time to UTC
                      }
            
            log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')
            
            if self.dryrun:
                return (None, None) # Match the tuple shape returned by a real query
            
            dataset_id_result = self.database.submit_query(SQL, params)
            assert dataset_id_result.record_count == 1, '%d records retrieved for dataset_id query' % dataset_id_result.record_count
            return (dataset_id_result.field_values['dataset_type_id'][0], 
                    dataset_id_result.field_values['dataset_id'][0])
        
        
        def set_dataset_metadata(record, dataset_key):
            SQL = '''-- Attempt to insert dataset_metadata records
insert into dataset_metadata(
    dataset_type_id,
    dataset_id,
    metadata_xml
    )
select
  %(dataset_type_id)s,
  %(dataset_id)s,
  %(xml_text)s::xml
where not exists (
    select * from dataset_metadata
    where dataset_type_id = %(dataset_type_id)s
        and dataset_id = %(dataset_id)s
        )
    and xml_is_well_formed(%(xml_text)s)
'''
            params = {'dataset_type_id': dataset_key[0],
                      'dataset_id': dataset_key[1],
                      'xml_text': record['xml_text']
                      }
        
            log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')
            
            if self.dryrun:
                return
            
            self.database.submit_query(SQL, params)
            

        def set_dataset_dimensions(dataset_key, dimension_key, min_index_max_tuple):
            '''
            Function to write dataset_dimension record if required
            '''
            SQL = '''-- Attempt to insert dataset_dimension records
insert into dataset_dimension(
    dataset_type_id,
    dataset_id,
    domain_id,
    dimension_id,
    min_value,
    max_value,
    indexing_value
    )
select
  %(dataset_type_id)s,
  %(dataset_id)s,
  %(domain_id)s,
  %(dimension_id)s,
  %(min_value)s,
  %(max_value)s,
  %(indexing_value)s
where not exists (
    select * from dataset_dimension
    where dataset_type_id = %(dataset_type_id)s
        and dataset_id = %(dataset_id)s
        and domain_id = %(domain_id)s
        and dimension_id = %(dimension_id)s
    );
'''
            params = {'dataset_type_id': dataset_key[0],
                      'dataset_id': dataset_key[1],
                      'domain_id': dimension_key[0],
                      'dimension_id': dimension_key[1],
                      'min_value': min_index_max_tuple[0],
                      'indexing_value': min_index_max_tuple[1],
                      'max_value': min_index_max_tuple[2]
                      }
            
            log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')
            
            if self.dryrun:
                return
            
            self.database.submit_query(SQL, params)
        
        
        def set_storage_dataset(storage_key, dataset_key):
            '''
            Function to write storage_dataset record if required
            '''
            SQL = '''-- Attempt to insert storage_dataset record
insert into storage_dataset(
    storage_type_id,
    storage_id,
    storage_version,
    dataset_type_id,
    dataset_id
    )
select
    %(storage_type_id)s,
    %(storage_id)s,
    %(storage_version)s,
    %(dataset_type_id)s,
    %(dataset_id)s
where not exists (
    select * from storage_dataset
    where storage_type_id = %(storage_type_id)s
        and storage_id = %(storage_id)s
        and storage_version = %(storage_version)s
        and dataset_type_id = %(dataset_type_id)s
        and dataset_id = %(dataset_id)s
    );
'''
            params = {'storage_type_id': storage_key[0],
                      'storage_id': storage_key[1],
                      'storage_version': storage_key[2],
                      'dataset_type_id': dataset_key[0],
                      'dataset_id': dataset_key[1],
                      }
            
            log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')
            
            if self.dryrun:
                return
            
            self.database.submit_query(SQL, params)
        
        
        def set_storage_dimension(storage_key, dimension_key, min_index_max_tuple):
            '''
            Function to write storage_dimension record if required
            '''
            SQL = '''-- Attempt to insert storage_dimension record
insert into storage_dimension(
    storage_type_id,
    storage_id,
    storage_version,
    domain_id,
    dimension_id,
    storage_dimension_index,
    storage_dimension_min,
    storage_dimension_max
    )
select
    %(storage_type_id)s,
    %(storage_id)s,
    %(storage_version)s,
    %(domain_id)s,
    %(dimension_id)s,
    %(storage_dimension_index)s,
    %(storage_dimension_min)s,
    %(storage_dimension_max)s
where not exists (
    select * from storage_dimension
    where storage_type_id = %(storage_type_id)s
        and storage_id = %(storage_id)s
        and storage_version = %(storage_version)s
        and domain_id = %(domain_id)s
        and dimension_id = %(dimension_id)s
    );
'''
            params = {'storage_type_id': storage_key[0],
                      'storage_id': storage_key[1],
                      'storage_version': storage_key[2],
                      'domain_id': dimension_key[0],
                      'dimension_id': dimension_key[1],
                      'storage_dimension_min': min_index_max_tuple[0],
                      'storage_dimension_index': min_index_max_tuple[1],
                      'storage_dimension_max': min_index_max_tuple[2]
                      }

            log_multiline(logger.debug, SQL, 'SQL', '\t')
            log_multiline(logger.debug, self.database.default_cursor.mogrify(SQL, params), 'Mogrified SQL', '\t')
            
            if self.dryrun:
                return
            
            self.database.submit_query(SQL, params)
        
        
        # Start of write_gdf_data(self, storage_indices, data_descriptor, storage_unit_path) definition
        assert os.path.isfile(storage_unit_path), 'Storage unit file %s does not exist' % storage_unit_path
        
        # Keep all database operations in the same transaction
        self.database.keep_connection = True
        self.database.autocommit = False
        
        try:
            # Get storage unit ID - this doesn't change from record to record
            record = data_descriptor[0]
            storage_key = get_storage_key(record, storage_unit_path)
            logger.debug('storage_key = %s', storage_key)

            # Set storage_dimension record for each dimension
            logger.debug('self.dimensions = %s', self.dimensions)
            for dimension_index in range(len(self.dimensions)):
                logger.debug('dimension_index = %d', dimension_index)
                dimension = self.dimensions.keys()[dimension_index]
                logger.debug('dimension = %s', dimension)
                dimension_key = (self.dimensions[dimension]['domain_id'],
                                 self.dimensions[dimension]['dimension_id']
                                 )
                logger.debug('dimension_key = %s', dimension_key)

                min_index_max_tuple = (self.index2ordinate(self.storage_type, dimension, storage_indices[dimension_index]),
                                       storage_indices[dimension_index], # Indexing value
                                       self.index2ordinate(self.storage_type, dimension, storage_indices[dimension_index] + 1)
                                       )

                set_storage_dimension(storage_key, dimension_key, min_index_max_tuple)
                
            # Process each tile record
            for record in data_descriptor:
                observation_key = get_observation_key(record)
                logger.debug('observation_key = %s', observation_key)

                dataset_key = get_dataset_key(record, observation_key)
                logger.debug('dataset_key = %s', dataset_key)
                
                set_dataset_metadata(record, dataset_key)
                
                # Set dataset_dimension record for each dimension
                for dimension in self.dimensions:
                    dimension_key = (self.storage_type_config['dimensions'][dimension]['domain_id'],
                                     self.storage_type_config['dimensions'][dimension]['dimension_id']
                                     )

                    if dimension == 'X':
                        min_index_max_tuple = (min(record['ul_x'], record['ll_x']),
                                               None, # No indexing value for regular dimension
                                               max(record['ur_x'], record['lr_x'])
                                               )
                    elif dimension == 'Y':
                        min_index_max_tuple = (min(record['ll_y'], record['lr_y']),
                                               None, # No indexing value for regular dimension
                                               max(record['ul_y'], record['ur_y'])
                                               )
                    elif dimension == 'T':
                        min_value = dt2secs(record['start_datetime'])
                        max_value = dt2secs(record['end_datetime'])
                        min_index_max_tuple = (min_value,
                                               int((min_value + max_value) / 2.0 + 0.5),
                                               max_value
                                               )
                        
                    set_dataset_dimensions(dataset_key, dimension_key, min_index_max_tuple)
                
                set_storage_dataset(storage_key, dataset_key)
                
            self.database.commit() # Commit transaction    
        except Exception:
            try:
                self.database.rollback() # Rollback transaction
            except Exception:
                pass
            raise # Re-raise the original exception with its traceback preserved
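
Every helper in this example uses the same idempotent pattern: an INSERT ... WHERE NOT EXISTS guarded by the natural key, followed by a SELECT that returns the key of the row whether it already existed or was just inserted, so re-running the ingest cannot create duplicates. A minimal sketch of that idiom against a hypothetical widget table, assuming the database wrapper sits on psycopg2 (suggested by the use of cursor.mogrify), might look like this:

# Minimal sketch of the insert-if-missing-then-select idiom used by the helpers above.
# The widget table, sequence, DSN and psycopg2 usage are assumptions for illustration.
import psycopg2

SQL = '''
insert into widget(widget_id, widget_name)
select nextval('widget_id_seq'::regclass), %(widget_name)s
where not exists (
    select widget_id from widget where widget_name = %(widget_name)s
    );

select widget_id from widget where widget_name = %(widget_name)s;
'''

conn = psycopg2.connect('dbname=example')      # hypothetical DSN
cursor = conn.cursor()
cursor.execute(SQL, {'widget_name': 'blue'})   # psycopg2 exposes the result of the last statement
widget_id = cursor.fetchone()[0]               # key of the existing or newly inserted row
conn.commit()

Note that the pattern is only safe against concurrent writers if the natural key (widget_name here; storage_location and the observation/dataset keys above) carries a unique constraint, otherwise two simultaneous ingests could both pass the NOT EXISTS check.
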
Example #3
import calendar
from datetime import datetime, date, timedelta
from gdf import GDF
from pprint import pprint
from gdf import dt2secs, secs2dt
import matplotlib.pyplot as plt

# In[4]:

g = GDF()
g.debug = False


# In[5]:

start_date = dt2secs(date(year=2010, month=1, day=1))
end_date = dt2secs(date(year=2010, month=1, day=18))
data_request_descriptor = {'storage_type': 'LS5TM',
                           'variables': ('B40', 'B30',),
                           'dimensions': {'X': {'range': (147.875, 148.125)},
                                          'Y': {'range': (-37.0 + 0.875, -36.0 + 0.125)},
                                          # 'T': {'range': (start_date, end_date),
                                          #      'array_range': (0, 4)
                                          # 'crs': 'SSE', # Seconds since epoch
                                          # 'grouping_function': g.null_grouping
                                          #     }
                                          }
                           }
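
dt2secs converts the query dates into the seconds-since-epoch values that the commented-out 'T' dimension block expects ('crs': 'SSE'). A minimal standard-library equivalent, assuming dt2secs simply maps a naive UTC date or datetime to integer Unix seconds (an assumption; check the gdf implementation), might look like this:

# Rough standard-library equivalent of what dt2secs appears to do here (an assumption).
import calendar
from datetime import date

def to_epoch_seconds(d):
    '''Convert a naive UTC date or datetime to integer seconds since 1970-01-01.'''
    return calendar.timegm(d.timetuple())

start_date = to_epoch_seconds(date(year=2010, month=1, day=1))   # 1262304000
end_date = to_epoch_seconds(date(year=2010, month=1, day=18))    # 1263772800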


Example #4
    def create_netcdf(self, storage_indices, data_descriptor):
        '''
        Function to create netCDF-CF file for specified storage indices
        '''
        temp_storage_path = self.get_temp_storage_path(storage_indices)
        storage_path = self.get_storage_path(self.storage_type, storage_indices)
        make_dir(os.path.dirname(storage_path))        
        
        if self.dryrun:
            return storage_path
        
        if os.path.isfile(storage_path) and not self.force: 
            logger.warning('Skipping existing storage unit %s' % storage_path)
            return 
#            return storage_path #TODO: Remove this temporary debugging hack
        
        t_indices = np.array([dt2secs(record_dict['end_datetime']) for record_dict in data_descriptor])
        
        gdfnetcdf = GDFNetCDF(storage_config=self.storage_config[self.storage_type])
        
        logger.debug('Creating temporary storage unit %s with %d timeslices', temp_storage_path, len(data_descriptor))
        gdfnetcdf.create(netcdf_filename=temp_storage_path, 
                         index_tuple=storage_indices, 
                         dimension_index_dict={'T': t_indices}, netcdf_format=None)
        del t_indices
        
        # Set georeferencing from first tile
        gdfnetcdf.georeference_from_file(data_descriptor[0]['tile_pathname'])

        variable_dict = self.storage_config[self.storage_type]['measurement_types']
        variable_names = variable_dict.keys()
                
        slice_index = 0
        for record_dict in data_descriptor:
            tile_dataset = gdal.Open(record_dict['tile_pathname'])
            assert tile_dataset, 'Failed to open tile file %s' % record_dict['tile_pathname']
            
            logger.debug('Reading array data from tile file %s (%d/%d)', record_dict['tile_pathname'], slice_index + 1, len(data_descriptor))
            data_array = tile_dataset.ReadAsArray()
            logger.debug('data_array.shape = %s', data_array.shape)
            
            #TODO: Set up proper mapping between AGDC & GDF bands so this works with non-contiguous ranges
            for variable_index in range(len(variable_dict)):
                variable_name = variable_names[variable_index]
                logger.debug('Writing array to variable %s', variable_name)
                if len(data_array.shape) == 3:
                    gdfnetcdf.write_slice(variable_name, data_array[variable_index], {'T': slice_index})
                elif len(data_array.shape) == 2:
                    gdfnetcdf.write_slice(variable_name, data_array, {'T': slice_index})

            gdfnetcdf.sync() # Write cached data to disk      
            slice_index += 1
            
        del gdfnetcdf # Close the netCDF
        
        logger.debug('Moving temporary storage unit %s to %s', temp_storage_path, storage_path)
        if os.path.isfile(storage_path):
            logger.debug('Removing existing storage unit %s' % storage_path)
            os.remove(storage_path)
        shutil.move(temp_storage_path, storage_path)
        
        return storage_path
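
After the write loop, this example closes the netCDF (del gdfnetcdf) and moves the completed file from the temporary path to its final storage path, so a crash mid-write never leaves a partial storage unit at the published location. A standalone sketch of that write-then-publish step, with placeholder paths, might look like this:

# Minimal sketch of the write-to-temp-then-publish pattern used above. Paths are placeholders.
import os
import shutil

def publish(temp_path, final_path):
    '''Move a completed file into place, replacing any stale copy at the destination.'''
    target_dir = os.path.dirname(final_path)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    if os.path.isfile(final_path):
        os.remove(final_path)            # remove a stale or partial copy first
    shutil.move(temp_path, final_path)

publish('/tmp/work/unit_partial.nc', '/data/storage/unit.nc')   # hypothetical paths

shutil.move is only atomic when the temporary and final paths sit on the same filesystem; across filesystems it degrades to a copy plus delete, so readers could still observe a partially copied file.
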
Example #5
    def write_gdf_data(self, storage_indices, data_descriptor,
                       storage_unit_path):
        '''
        Function to write records to database. Must occur in a single transaction
        '''
        def get_storage_key(record, storage_unit_path):
            '''
            Function to write storage unit record if required and return storage unit ID (tuple containing storage_type_id & storage_id)
            '''
            SQL = '''-- Attempt to insert a storage record and return storage key 
insert into storage(
    storage_type_id,
    storage_id,
    storage_version,
    storage_location,
    md5_checksum,
    storage_bytes,
    spatial_footprint_id
    )  
select
    %(storage_type_id)s,
    nextval('storage_id_seq'::regclass),
    0, -- storage_version
    %(storage_location)s,
    NULL,
    NULL,
    NULL
where not exists (
    select storage_type_id, storage_id, storage_version from storage 
    where storage_type_id =%(storage_type_id)s
    and storage_location = %(storage_location)s
    );
            
select storage_type_id, storage_id, storage_version from storage
where storage_type_id =%(storage_type_id)s
    and storage_location = %(storage_location)s;
'''
            params = {
                'storage_type_id':
                self.storage_type_config['storage_type_id'],
                'storage_location':
                self.get_storage_filename(self.storage_type, storage_indices)
            }

            log_multiline(logger.debug,
                          self.database.default_cursor.mogrify(SQL, params),
                          'Mogrified SQL', '\t')

            if self.dryrun:
                return (None, None, None)

            storage_id_result = self.database.submit_query(SQL, params)
            assert storage_id_result.record_count == 1, '%d records retrieved for storage_id query' % storage_id_result.record_count
            return (storage_id_result.field_values['storage_type_id'][0],
                    storage_id_result.field_values['storage_id'][0],
                    storage_id_result.field_values['storage_version'][0])

        def get_observation_key(record):
            '''
            Function to write observation (acquisition) record if required and return observation ID (tuple containing observation_type_id and observation_id)
            '''
            SQL = '''-- Attempt to insert an observation record and return observation key
insert into observation(
    observation_type_id,
    observation_id,
    observation_start_datetime,
    observation_end_datetime,
    instrument_type_id,
    instrument_id
    )
select
    1, -- Optical Satellite
    nextval('observation_id_seq'::regclass),
    %(observation_start_datetime)s,
    %(observation_end_datetime)s,
    1, -- Passive Satellite-borne
    (select instrument_id from instrument where instrument_tag = %(instrument_tag)s)
where not exists (
    select observation_id from observation
    where observation_type_id = 1 -- Optical Satellite
    and instrument_type_id = 1 -- Passive Satellite-borne
    and instrument_id = (select instrument_id from instrument where instrument_tag = %(instrument_tag)s)
    and observation_start_datetime = %(observation_start_datetime)s
    and observation_end_datetime = %(observation_end_datetime)s
    );

select observation_type_id, observation_id from observation
where observation_type_id = 1 -- Optical Satellite
    and instrument_type_id = 1 -- Passive Satellite-borne
    and instrument_id = (select instrument_id from instrument where instrument_tag = %(instrument_tag)s)
    and observation_start_datetime = %(observation_start_datetime)s
    and observation_end_datetime = %(observation_end_datetime)s;
'''
            params = {
                'instrument_tag': record['sensor_name'],
                'observation_start_datetime': record['start_datetime'],
                'observation_end_datetime': record['end_datetime']
            }

            log_multiline(logger.debug,
                          self.database.default_cursor.mogrify(SQL, params),
                          'Mogrified SQL', '\t')

            if self.dryrun:
                return (None, None)

            observation_id_result = self.database.submit_query(SQL, params)
            assert observation_id_result.record_count == 1, '%d records retrieved for observation_id query' % observation_id_result.record_count
            return (
                observation_id_result.field_values['observation_type_id'][0],
                observation_id_result.field_values['observation_id'][0])

        def get_dataset_key(record, observation_key):
            '''
            Function to write observation (acquisition) record if required and return dataset ID (tuple containing dataset_type_id & dataset_id)
            '''
            SQL = '''-- Attempt to insert a dataset record and return dataset_id
insert into dataset(
    dataset_type_id,
    dataset_id,
    observation_type_id,
    observation_id,
    dataset_location,
    creation_datetime
    )
select
    (select dataset_type_id from dataset_type where dataset_type_tag = %(dataset_type_tag)s),
    nextval('dataset_id_seq'::regclass),
    %(observation_type_id)s,
    %(observation_id)s,
    %(dataset_location)s,
    %(creation_datetime)s
where not exists (
    select dataset_id from dataset
    where observation_type_id = %(observation_type_id)s
        and observation_id = %(observation_id)s
        and dataset_location = %(dataset_location)s
    );

select dataset_type_id, dataset_id from dataset
where observation_type_id = %(observation_type_id)s
    and observation_id = %(observation_id)s
    and dataset_location = %(dataset_location)s
'''
            params = {
                'dataset_type_tag':
                'PQ'
                if record['level_name'] == 'PQA' else record['level_name'],
                'observation_type_id':
                observation_key[0],
                'observation_id':
                observation_key[1],
                'dataset_location':
                record['dataset_path'],
                'creation_datetime':
                record['datetime_processed'].replace(
                    tzinfo=pytz.UTC)  # Convert naive time to UTC
            }

            log_multiline(logger.debug,
                          self.database.default_cursor.mogrify(SQL, params),
                          'Mogrified SQL', '\t')

            if self.dryrun:
                return (None, None)  # Match the tuple shape returned by a real query

            dataset_id_result = self.database.submit_query(SQL, params)
            assert dataset_id_result.record_count == 1, '%d records retrieved for dataset_id query' % dataset_id_result.record_count
            return (dataset_id_result.field_values['dataset_type_id'][0],
                    dataset_id_result.field_values['dataset_id'][0])

        def set_dataset_metadata(record, dataset_key):
            SQL = '''-- Attempt to insert dataset_metadata records
insert into dataset_metadata(
    dataset_type_id,
    dataset_id,
    metadata_xml
    )
select
  %(dataset_type_id)s,
  %(dataset_id)s,
  %(xml_text)s::xml
where not exists (
    select * from dataset_metadata
    where dataset_type_id = %(dataset_type_id)s
        and dataset_id = %(dataset_id)s
        )
    and xml_is_well_formed(%(xml_text)s)
'''
            params = {
                'dataset_type_id': dataset_key[0],
                'dataset_id': dataset_key[1],
                'xml_text': record['xml_text']
            }

            log_multiline(logger.debug,
                          self.database.default_cursor.mogrify(SQL, params),
                          'Mogrified SQL', '\t')

            if self.dryrun:
                return

            self.database.submit_query(SQL, params)

        def set_dataset_dimensions(dataset_key, dimension_key,
                                   min_index_max_tuple):
            '''
            Function to write dataset_dimension record if required
            '''
            SQL = '''-- Attempt to insert dataset_dimension records
insert into dataset_dimension(
    dataset_type_id,
    dataset_id,
    domain_id,
    dimension_id,
    min_value,
    max_value,
    indexing_value
    )
select
  %(dataset_type_id)s,
  %(dataset_id)s,
  %(domain_id)s,
  %(dimension_id)s,
  %(min_value)s,
  %(max_value)s,
  %(indexing_value)s
where not exists (
    select * from dataset_dimension
    where dataset_type_id = %(dataset_type_id)s
        and dataset_id = %(dataset_id)s
        and domain_id = %(domain_id)s
        and dimension_id = %(dimension_id)s
    );
'''
            params = {
                'dataset_type_id': dataset_key[0],
                'dataset_id': dataset_key[1],
                'domain_id': dimension_key[0],
                'dimension_id': dimension_key[1],
                'min_value': min_index_max_tuple[0],
                'indexing_value': min_index_max_tuple[1],
                'max_value': min_index_max_tuple[2]
            }

            log_multiline(logger.debug,
                          self.database.default_cursor.mogrify(SQL, params),
                          'Mogrified SQL', '\t')

            if self.dryrun:
                return

            self.database.submit_query(SQL, params)

        def set_storage_dataset(storage_key, dataset_key):
            '''
            Function to write storage_dataset record if required
            '''
            SQL = '''-- Attempt to insert storage_dataset record
insert into storage_dataset(
    storage_type_id,
    storage_id,
    storage_version,
    dataset_type_id,
    dataset_id
    )
select
    %(storage_type_id)s,
    %(storage_id)s,
    %(storage_version)s,
    %(dataset_type_id)s,
    %(dataset_id)s
where not exists (
    select * from storage_dataset
    where storage_type_id = %(storage_type_id)s
        and storage_id = %(storage_id)s
        and storage_version = %(storage_version)s
        and dataset_type_id = %(dataset_type_id)s
        and dataset_id = %(dataset_id)s
    );
'''
            params = {
                'storage_type_id': storage_key[0],
                'storage_id': storage_key[1],
                'storage_version': storage_key[2],
                'dataset_type_id': dataset_key[0],
                'dataset_id': dataset_key[1],
            }

            log_multiline(logger.debug,
                          self.database.default_cursor.mogrify(SQL, params),
                          'Mogrified SQL', '\t')

            if self.dryrun:
                return

            self.database.submit_query(SQL, params)

        def set_storage_dimension(storage_key, dimension_key,
                                  min_index_max_tuple):
            '''
            Function to write storage_dimension record if required
            '''
            SQL = '''-- Attempt to insert storage_dimension record
insert into storage_dimension(
    storage_type_id,
    storage_id,
    storage_version,
    domain_id,
    dimension_id,
    storage_dimension_index,
    storage_dimension_min,
    storage_dimension_max
    )
select
    %(storage_type_id)s,
    %(storage_id)s,
    %(storage_version)s,
    %(domain_id)s,
    %(dimension_id)s,
    %(storage_dimension_index)s,
    %(storage_dimension_min)s,
    %(storage_dimension_max)s
where not exists (
    select * from storage_dimension
    where storage_type_id = %(storage_type_id)s
        and storage_id = %(storage_id)s
        and storage_version = %(storage_version)s
        and domain_id = %(domain_id)s
        and dimension_id = %(dimension_id)s
    );
'''
            params = {
                'storage_type_id': storage_key[0],
                'storage_id': storage_key[1],
                'storage_version': storage_key[2],
                'domain_id': dimension_key[0],
                'dimension_id': dimension_key[1],
                'storage_dimension_min': min_index_max_tuple[0],
                'storage_dimension_index': min_index_max_tuple[1],
                'storage_dimension_max': min_index_max_tuple[2]
            }

            log_multiline(logger.debug, SQL, 'SQL', '\t')
            log_multiline(logger.debug,
                          self.database.default_cursor.mogrify(SQL, params),
                          'Mogrified SQL', '\t')

            if self.dryrun:
                return

            self.database.submit_query(SQL, params)

        # Start of write_gdf_data(self, storage_indices, data_descriptor, storage_unit_path) definition
        assert os.path.isfile(storage_unit_path), \
            'Storage unit file %s does not exist' % storage_unit_path

        # Keep all database operations in the same transaction
        self.database.keep_connection = True
        self.database.autocommit = False

        try:
            # Get storage unit ID - this doesn't change from record to record
            record = data_descriptor[0]
            storage_key = get_storage_key(record, storage_unit_path)
            logger.debug('storage_key = %s', storage_key)

            # Set storage_dimension record for each dimension
            logger.debug('self.dimensions = %s', self.dimensions)
            for dimension_index in range(len(self.dimensions)):
                logger.debug('dimension_index = %d', dimension_index)
                dimension = self.dimensions.keys()[dimension_index]
                logger.debug('dimension = %s', dimension)
                dimension_key = (self.dimensions[dimension]['domain_id'],
                                 self.dimensions[dimension]['dimension_id'])
                logger.debug('dimension_key = %s', dimension_key)

                min_index_max_tuple = (
                    self.index2ordinate(self.storage_type, dimension,
                                        storage_indices[dimension_index]),
                    storage_indices[dimension_index],  # Indexing value
                    self.index2ordinate(self.storage_type, dimension,
                                        storage_indices[dimension_index] + 1))

                set_storage_dimension(storage_key, dimension_key,
                                      min_index_max_tuple)

            # Process each tile record
            for record in data_descriptor:
                observation_key = get_observation_key(record)
                logger.debug('observation_key = %s', observation_key)

                dataset_key = get_dataset_key(record, observation_key)
                logger.debug('dataset_key = %s', dataset_key)

                set_dataset_metadata(record, dataset_key)

                # Set dataset_dimension record for each dimension
                for dimension in self.dimensions:
                    dimension_key = (self.storage_type_config['dimensions']
                                     [dimension]['domain_id'],
                                     self.storage_type_config['dimensions']
                                     [dimension]['dimension_id'])

                    if dimension == 'X':
                        min_index_max_tuple = (
                            min(record['ul_x'], record['ll_x']),
                            None,  # No indexing value for regular dimension
                            max(record['ur_x'], record['lr_x']))
                    elif dimension == 'Y':
                        min_index_max_tuple = (
                            min(record['ll_y'], record['lr_y']),
                            None,  # No indexing value for regular dimension
                            max(record['ul_y'], record['ur_y']))
                    elif dimension == 'T':
                        min_value = dt2secs(record['start_datetime'])
                        max_value = dt2secs(record['end_datetime'])
                        min_index_max_tuple = (
                            min_value, int((min_value + max_value) / 2.0 +
                                           0.5), max_value)

                    set_dataset_dimensions(dataset_key, dimension_key,
                                           min_index_max_tuple)

                set_storage_dataset(storage_key, dataset_key)

            self.database.commit()  # Commit transaction
        except Exception:
            try:
                self.database.rollback()  # Rollback transaction
            except Exception:
                pass
            raise  # Re-raise the original exception with its traceback preserved
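
write_gdf_data wraps every insert in one transaction: autocommit is turned off, commit() runs only after all records succeed, and any exception triggers a rollback before being re-raised. A minimal sketch of the same pattern directly on psycopg2 (an assumption about the underlying driver, suggested by cursor.mogrify; the widget table and DSN are placeholders) might look like this:

# Minimal sketch of the single-transaction pattern used by write_gdf_data.
# Connection details and the example statements are placeholders.
import psycopg2

conn = psycopg2.connect('dbname=example')   # hypothetical DSN
conn.autocommit = False                     # group all statements into one transaction
cursor = conn.cursor()

try:
    cursor.execute('insert into widget(widget_name) values (%s)', ('blue',))
    cursor.execute('insert into widget(widget_name) values (%s)', ('green',))
    conn.commit()        # both rows become visible together
except Exception:
    conn.rollback()      # neither row is written if anything failed
    raise                # re-raise with the original traceback
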
Example #6
    def create_netcdf(self, storage_indices, data_descriptor):
        '''
        Function to create netCDF-CF file for specified storage indices
        '''
        temp_storage_path = self.get_temp_storage_path(storage_indices)
        storage_path = self.get_storage_path(self.storage_type,
                                             storage_indices)
        make_dir(os.path.dirname(storage_path))

        if self.dryrun:
            return storage_path

        if os.path.isfile(storage_path) and not self.force:
            logger.warning('Skipping existing storage unit %s' % storage_path)
            return


#            return storage_path #TODO: Remove this temporary debugging hack

        t_indices = np.array([
            dt2secs(record_dict['end_datetime'])
            for record_dict in data_descriptor
        ])

        gdfnetcdf = GDFNetCDF(
            storage_config=self.storage_config[self.storage_type])

        logger.debug('Creating temporary storage unit %s with %d timeslices',
                     temp_storage_path, len(data_descriptor))
        gdfnetcdf.create(netcdf_filename=temp_storage_path,
                         index_tuple=storage_indices,
                         dimension_index_dict={'T': t_indices},
                         netcdf_format=None)
        del t_indices

        # Set georeferencing from first or second tile for fault tolerance
        try:
            gdfnetcdf.georeference_from_file(
                data_descriptor[0]['tile_pathname'])
        except Exception:
            gdfnetcdf.georeference_from_file(
                data_descriptor[1]['tile_pathname'])

        variable_dict = self.storage_config[
            self.storage_type]['measurement_types']
        variable_names = variable_dict.keys()

        array_shape = tuple([len(variable_dict)] + [
            dim['dimension_elements']
            for dim in self.storage_config[self.storage_type]
            ['dimensions'].values() if dim['indexing_type'] == 'regular'
        ])

        # All data types and no-data values should be the same - just use first one
        array_dtype = variable_dict[variable_dict.keys()
                                    [0]]['numpy_datatype_name']

        nodata_value = variable_dict[variable_dict.keys()[0]]['nodata_value']
        if nodata_value is None:
            nodata_value = np.nan

        slice_index = 0
        for record_dict in data_descriptor:
            try:
                tile_dataset = gdal.Open(record_dict['tile_pathname'])
                assert tile_dataset, 'Failed to open tile file %s' % record_dict[
                    'tile_pathname']

                logger.debug('Reading array data from tile file %s (%d/%d)',
                             record_dict['tile_pathname'], slice_index + 1,
                             len(data_descriptor))
                data_array = tile_dataset.ReadAsArray()

                assert data_array.shape == array_shape, 'Tile array shape is not %s' % array_shape
            except Exception, e:
                # Can't read data_array from GeoTIFF - create empty data_array instead
                logger.warning(
                    'WARNING: Unable to read array from tile - empty array created: %s',
                    e.message)

                data_array = np.ones(array_shape, array_dtype) * nodata_value

            logger.debug('data_array.shape = %s', data_array.shape)

            #TODO: Set up proper mapping between AGDC & GDF bands so this works with non-contiguous ranges
            for variable_index in range(len(variable_dict)):
                variable_name = variable_names[variable_index]
                logger.debug('Writing array to variable %s', variable_name)
                if len(data_array.shape) == 3:
                    gdfnetcdf.write_slice(variable_name,
                                          data_array[variable_index],
                                          {'T': slice_index})
                elif len(data_array.shape) == 2:
                    gdfnetcdf.write_slice(variable_name, data_array,
                                          {'T': slice_index})

            gdfnetcdf.sync()  # Write cached data to disk
            slice_index += 1
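
GDFNetCDF hides the netCDF details, but the per-timeslice write_slice/sync calls above correspond to ordinary netCDF4-python operations: create the file with a time dimension, then assign one 2-D array per time index and flush. A rough sketch with the netCDF4 library (an illustration of the general idea, not of GDFNetCDF's internals; names, sizes and the fill value are placeholders) might look like this:

# Rough sketch of writing one 2-D slice per time index with netCDF4-python.
import numpy as np
from netCDF4 import Dataset

nc = Dataset('unit.nc', 'w', format='NETCDF4')      # hypothetical output file
nc.createDimension('time', None)                    # unlimited time axis
nc.createDimension('y', 400)
nc.createDimension('x', 400)
band = nc.createVariable('B40', 'i2', ('time', 'y', 'x'), fill_value=-999)

for slice_index in range(3):
    data = np.zeros((400, 400), dtype='int16')      # stand-in for one tile slice
    band[slice_index, :, :] = data                  # write one timeslice
    nc.sync()                                       # flush cached data to disk

nc.close()
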
Example #7
    def create_netcdf(self, storage_indices, data_descriptor):
        '''
        Function to create netCDF-CF file for specified storage indices
        '''
        temp_storage_path = self.get_temp_storage_path(storage_indices)
        storage_path = self.get_storage_path(self.storage_type,
                                             storage_indices)
        make_dir(os.path.dirname(storage_path))

        if self.dryrun:
            return storage_path

        if os.path.isfile(storage_path) and not self.force:
            logger.warning('Skipping existing storage unit %s' % storage_path)
            return


#            return storage_path #TODO: Remove this temporary debugging hack

        t_indices = np.array([
            dt2secs(record_dict['end_datetime'])
            for record_dict in data_descriptor
        ])

        gdfnetcdf = GDFNetCDF(
            storage_config=self.storage_config[self.storage_type])

        logger.debug('Creating temporary storage unit %s with %d timeslices',
                     temp_storage_path, len(data_descriptor))
        gdfnetcdf.create(netcdf_filename=temp_storage_path,
                         index_tuple=storage_indices,
                         dimension_index_dict={'T': t_indices},
                         netcdf_format=None)
        del t_indices

        # Set georeferencing from first tile
        gdfnetcdf.georeference_from_file(data_descriptor[0]['tile_pathname'])

        variable_dict = self.storage_config[
            self.storage_type]['measurement_types']
        variable_names = variable_dict.keys()

        slice_index = 0
        for record_dict in data_descriptor:
            tile_dataset = gdal.Open(record_dict['tile_pathname'])
            assert tile_dataset, 'Failed to open tile file %s' % record_dict[
                'tile_pathname']

            logger.debug('Reading array data from tile file %s (%d/%d)',
                         record_dict['tile_pathname'], slice_index + 1,
                         len(data_descriptor))
            data_array = tile_dataset.ReadAsArray()
            logger.debug('data_array.shape = %s', data_array.shape)

            #TODO: Set up proper mapping between AGDC & GDF bands so this works with non-contiguous ranges
            for variable_index in range(len(variable_dict)):
                variable_name = variable_names[variable_index]
                logger.debug('Writing array to variable %s', variable_name)
                if len(data_array.shape) == 3:
                    gdfnetcdf.write_slice(variable_name,
                                          data_array[variable_index],
                                          {'T': slice_index})
                elif len(data_array.shape) == 2:
                    gdfnetcdf.write_slice(variable_name, data_array,
                                          {'T': slice_index})

            gdfnetcdf.sync()  # Write cached data to disk
            slice_index += 1

        del gdfnetcdf  # Close the netCDF

        logger.debug('Moving temporary storage unit %s to %s',
                     temp_storage_path, storage_path)
        if os.path.isfile(storage_path):
            logger.debug('Removing existing storage unit %s' % storage_path)
            os.remove(storage_path)
        shutil.move(temp_storage_path, storage_path)

        return storage_path
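
All of these examples rely on gdal.Open followed by ReadAsArray to pull a whole tile into a numpy array; multi-band files come back as a 3-D (bands, rows, cols) array and single-band files as a 2-D array, which is why the write loop checks len(data_array.shape). A small sketch of that read step, with a placeholder path, might look like this:

# Minimal sketch of reading a tile into a numpy array with GDAL. 'tile.tif' is a placeholder.
from osgeo import gdal

tile_dataset = gdal.Open('tile.tif')
if tile_dataset is None:
    raise IOError('Failed to open tile file')

data_array = tile_dataset.ReadAsArray()
# Multi-band files come back as (bands, rows, cols); single-band files as (rows, cols)
if data_array.ndim == 3:
    print('bands=%d rows=%d cols=%d' % data_array.shape)
else:
    print('rows=%d cols=%d' % data_array.shape)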