Example #1
def test_validate_uri_opendap_does_not_exist(self):
    uri = 'http://www.ifremer.fr/opendap/cerdap1/globcurrent/' \
          'v2.0/global_012_deg/geostrophic/2014/001/' \
          '20140101000000-GLOBCURRENT-L4-CURgeo_0m-ALT_OI-v02.0-fv01.0.nc.tull'
    with self.assertRaises(ConnectionError) as cm:
        utils.validate_uri(uri)
    self.assertEqual(uri, cm.exception.args[0])
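These tests exercise `validate_uri` from `geospaas.utils`, whose implementation is not part of this listing. The following is a minimal sketch of the behaviour the tests expect, assuming the helper checks HTTP(S) URIs with urllib3 and local `file://` URIs with `os.path.isfile` (Example #8 below suggests the real code may also open OPeNDAP URLs with netCDF4; that path is omitted here):

import os
import urllib3
from urllib.parse import urlparse

def validate_uri(uri):
    """Raise an exception if uri does not point to a reachable file or stream.

    Hypothetical sketch; the actual geospaas.utils implementation may differ.
    """
    parsed = urlparse(uri)
    if parsed.scheme in ('http', 'https'):
        # Treat any HTTP error status as an unreachable resource
        response = urllib3.PoolManager().request('HEAD', uri)
        if response.status >= 400:
            raise ConnectionError(uri)
    elif parsed.scheme == 'file':
        if not os.path.isfile(parsed.path):
            raise FileNotFoundError(parsed.path)
    else:
        raise ValueError('Invalid URI: %s' % uri)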
Example #2
    def get_or_create(self, uri, force):
        # Validate uri - this should raise an exception if the uri doesn't
        # point to a valid file or stream
        validate_uri(uri)
        # Several datasets can refer to the same uri (e.g., scatterometers
        # and svp drifters)
        uris = DatasetURI.objects.filter(uri=uri)
        # If the uri is already in the database and ingestion is not forced, stop
        if uris.exists() and not force:
            return uris[0].dataset, False
        elif uris.exists() and force:
            uris[0].dataset.delete()
        # Open file with Nansat
        n = Nansat(nansat_filename(uri))
        # get metadata from Nansat and get objects from vocabularies
        n_metadata = n.get_metadata()
        # set compulsory metadata (source)
        platform, _ = Platform.objects.get_or_create(
            json.loads(n_metadata['platform']))
        instrument, _ = Instrument.objects.get_or_create(
            json.loads(n_metadata['instrument']))
        specs = n_metadata.get('specs', '')
        source, _ = Source.objects.get_or_create(platform=platform,
                                                 instrument=instrument,
                                                 specs=specs)
        footprint = Polygon(list(zip(*n.get_border())))
        geolocation = GeographicLocation.objects.get_or_create(
            geometry=footprint)[0]
        data_center = DataCenter.objects.get_or_create(
            json.loads(n_metadata['Data Center']))[0]
        iso_category = ISOTopicCategory.objects.get_or_create(
            pti.get_iso19115_topic_category('Oceans'))[0]
        location = Location.objects.get_or_create(
            json.loads(n_metadata['gcmd_location']))[0]
        # create dataset
        ds, created = Dataset.objects.get_or_create(
            time_coverage_start=make_aware(n.time_coverage_start),
            time_coverage_end=make_aware(
                n.time_coverage_start +
                timedelta(hours=23, minutes=59, seconds=59)),
            source=source,
            geographic_location=geolocation,
            ISO_topic_category=iso_category,
            data_center=data_center,
            summary='',
            gcmd_location=location,
            access_constraints='',
            entry_id='NERSC_' + str(uuid.uuid4()))

        ds_uri, _ = DatasetURI.objects.get_or_create(
            name=FILE_SERVICE_NAME,
            service=LOCAL_FILE_SERVICE,
            uri=uri,
            dataset=ds)
        return ds, created
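A possible invocation of the manager method above, assuming it is attached to the nansat_ingestor Dataset model (the file path is illustrative only):

# Hypothetical usage; the path is illustrative, not a real dataset
ds, created = Dataset.objects.get_or_create(
    'file://localhost/data/20140101_globcurrent.nc', force=False)
print('created' if created else 'already ingested')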
Example #3
def crawl(url, **options):
    validate_uri(url)

    skips = Crawl.SKIPS + ['.*ncml']
    c = Crawl(url, skip=skips, debug=True)
    added = 0
    for ds in c.datasets:
        url = [s.get('url') for s in ds.services if
                s.get('service').lower()=='opendap'][0]
        metno_obs_stat, cr = MetObsStation.objects.get_or_create(url)
        if cr:
            added += 1
            print('Added %s, no. %d/%d'%(url, added, len(c.datasets)))
    return added
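`crawl` builds on thredds_crawler's `Crawl` class, which walks a THREDDS catalog and exposes each dataset's service endpoints. A minimal sketch of what the loop above consumes (the catalog URL is illustrative):

from thredds_crawler.crawl import Crawl

# Illustrative catalog URL; Crawl walks it recursively
c = Crawl('http://thredds.met.no/thredds/catalog/catalog.xml',
          skip=Crawl.SKIPS + ['.*ncml'], debug=True)
for ds in c.datasets:
    # each entry of ds.services is a dict with 'name', 'service' and 'url' keys
    print(ds.id, [s.get('service') for s in ds.services])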
Example #4
def crawl(url, **options):
    validate_uri(url)

    date = options.get('date', None)
    filename = options.get('filename', None)
    if date:
        select = [r'(.*%s.*\.nc)' % date]
    elif filename:
        select = [r'(.*%s)' % filename]
    else:
        select = None

    skips = Crawl.SKIPS + ['.*ncml']
    c = Crawl(url, select=select, skip=skips, debug=True)
    added = 0
    for ds in c.datasets:
        url = [s.get('url') for s in ds.services if
                s.get('service').lower()=='opendap'][0]
        try:
            gds, cr = NansatDataset.objects.get_or_create(url)
        except (IOError, AttributeError):
            # Skip datasets that cannot be opened
            continue
        else:
            if cr:
                added += 1
                print('Added %s, no. %d/%d'%(url, added, len(c.datasets)))
        # Connect all service uris to the dataset
        for s in ds.services:
            try:
                ds_uri, _ = DatasetURI.objects.get_or_create(name=s.get('name'),
                    service=s.get('service'), uri=s.get('url'), dataset=gds)
            except IntegrityError:
                # There is no standard for the name (and possibly the service). This means that the
                # naming defined by geospaas.catalog.managers.DAP_SERVICE_NAME (and assigned to the
                # DatasetURI in geospaas.nansat_ingestor.managers.DatasetManager.get_or_create) may
                # be different from s.get('name').
                # Solution: ignore the error and continue the loop
                continue
    return added
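The `select` patterns above are regular expressions passed to `Crawl` to restrict which catalog entries are visited. A small sketch of how the two options translate (the values are illustrative):

# Illustrative values for the two filtering options
date, filename = '20140101', 'myfile.nc'
select_by_date = [r'(.*%s.*\.nc)' % date]    # any .nc file containing the date
select_by_name = [r'(.*%s)' % filename]      # any path ending with the filename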
Example #5
def crawl_and_ingest(url, **options):
    validate_uri(url)

    date = options.get('date', None)
    filename = options.get('filename', None)
    if date:
        select = [r'(.*%s.*\.nc)' % date]
    elif filename:
        select = [r'(.*%s)' % filename]
    else:
        select = None

    skips = Crawl.SKIPS + ['.*ncml']
    c = Crawl(url, select=select, skip=skips, debug=True)
    added = 0
    for ds in c.datasets:
        # Find the OPeNDAP endpoint for this dataset, if any
        url = name = service = None
        for s in ds.services:
            if s.get('service').lower() == 'opendap':
                url = s.get('url')
                name = s.get('name')
                service = s.get('service')
        if url is None:
            continue
        try:
            # Create Dataset from OPeNDAP url - this is necessary to get all metadata
            gds, cr = NansatDataset.objects.get_or_create(
                url, uri_service_name=name, uri_service_type=service)
        except (IOError, AttributeError, ValueError):
            # Skip datasets that cannot be opened
            continue
        if cr:
            added += 1
            print('Added %s, no. %d/%d' % (url, added, len(c.datasets)))
        # Connect all service uris to the dataset
        for s in ds.services:
            ds_uri, _ = DatasetURI.objects.get_or_create(
                name=s.get('name'),
                service=s.get('service'),
                uri=s.get('url'),
                dataset=gds)
    return added
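The inner loop above attaches every THREDDS service endpoint of a dataset to the same catalog entry. A standalone sketch of the resulting DatasetURI rows (names, services and URLs are illustrative):

# Illustrative endpoints for a single dataset `ds`
endpoints = [
    ('odap', 'OPENDAP', 'http://server/thredds/dodsC/f.nc'),
    ('http', 'HTTPServer', 'http://server/thredds/fileServer/f.nc'),
]
for name, service, u in endpoints:
    DatasetURI.objects.get_or_create(name=name, service=service,
                                     uri=u, dataset=ds)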
Example #6
    def get_or_create(self,
                      uri,
                      n_points=10,
                      uri_filter_args=None,
                      *args,
                      **kwargs):
        ''' Create dataset and corresponding metadata

        Parameters
        ----------
        uri : str
            URI to file or stream openable by Nansat
        n_points : int
            Number of border points (default is 10)
        uri_filter_args : dict
            Extra DatasetURI filter arguments if several datasets can refer
            to the same URI

        Returns
        -------
        dataset : Dataset
            Created or existing dataset
        created : bool
            True if a new dataset was created
        '''
        if not uri_filter_args:
            uri_filter_args = {}

        # Validate uri - this should raise an exception if the uri doesn't point to a valid
        # file or stream
        validate_uri(uri)

        # Several datasets can refer to the same uri (e.g., scatterometers and svp drifters), so we
        # need to pass uri_filter_args
        uris = DatasetURI.objects.filter(uri=uri, **uri_filter_args)
        if len(uris) > 0:
            return uris[0].dataset, False

        # Open file with Nansat
        n = Nansat(nansat_filename(uri), **kwargs)

        # get metadata from Nansat and get objects from vocabularies
        n_metadata = n.get_metadata()

        # set compulsory metadata (source)
        platform, _ = Platform.objects.get_or_create(
            json.loads(n_metadata['platform']))
        instrument, _ = Instrument.objects.get_or_create(
            json.loads(n_metadata['instrument']))
        specs = n_metadata.get('specs', '')
        source, _ = Source.objects.get_or_create(platform=platform,
                                                 instrument=instrument,
                                                 specs=specs)

        default_char_fields = {
            'entry_id': lambda: 'NERSC_' + str(uuid.uuid4()),
            'entry_title': lambda: 'NONE',
            'summary': lambda: 'NONE',
        }

        # set optional CharField metadata from Nansat or from default_char_fields
        options = {}
        for name in default_char_fields:
            if name not in n_metadata:
                warnings.warn('%s is not provided in Nansat metadata!' % name)
                options[name] = default_char_fields[name]()
            else:
                options[name] = n_metadata[name]

        default_foreign_keys = {
            'gcmd_location': {
                'model': Location,
                'value': pti.get_gcmd_location('SEA SURFACE')
            },
            'data_center': {
                'model': DataCenter,
                'value': pti.get_gcmd_provider('NERSC')
            },
            'ISO_topic_category': {
                'model': ISOTopicCategory,
                'value': pti.get_iso19115_topic_category('Oceans')
            },
        }

        # set optional ForeignKey metadata from Nansat or from default_foreign_keys
        for name in default_foreign_keys:
            value = default_foreign_keys[name]['value']
            model = default_foreign_keys[name]['model']
            if name not in n_metadata:
                warnings.warn('%s is not provided in Nansat metadata!' % name)
            else:
                try:
                    value = json.loads(n_metadata[name])
                except (ValueError, TypeError):
                    warnings.warn(
                        '%s value of %s metadata provided in Nansat is wrong!'
                        % (n_metadata[name], name))
            options[name], _ = model.objects.get_or_create(value)

        # Find coverage to set number of points in the geolocation
        if len(n.vrt.dataset.GetGCPs()) > 0:
            n.reproject_gcps()
        geolocation = GeographicLocation.objects.get_or_create(
            geometry=WKTReader().read(n.get_border_wkt(nPoints=n_points)))[0]

        # create dataset
        ds, created = Dataset.objects.get_or_create(
            time_coverage_start=n.get_metadata('time_coverage_start'),
            time_coverage_end=n.get_metadata('time_coverage_end'),
            source=source,
            geographic_location=geolocation,
            **options)
        # create dataset URI
        ds_uri, _ = DatasetURI.objects.get_or_create(uri=uri, dataset=ds)

        return ds, created
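The default_char_fields mapping stores callables so that a default (such as a fresh UUID) is only produced when the metadata key is actually missing. A self-contained sketch of the pattern:

import uuid
import warnings

defaults = {
    'entry_id': lambda: str(uuid.uuid4()),  # fresh UUID only when needed
    'summary': lambda: 'NONE',
}
metadata = {'summary': 'Sea surface geostrophic currents'}

options = {}
for name, make_default in defaults.items():
    if name in metadata:
        options[name] = metadata[name]
    else:
        warnings.warn('%s is not provided in metadata!' % name)
        options[name] = make_default()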
Example #7
    def get_or_create(self,
                      uri,
                      n_points=10,
                      uri_filter_args=None,
                      uri_service_name=FILE_SERVICE_NAME,
                      uri_service_type=LOCAL_FILE_SERVICE,
                      *args,
                      **kwargs):
        """ Create dataset and corresponding metadata

        Parameters:
        ----------
            uri : str
                  URI to file or stream openable by Nansat
            n_points : int
                  Number of border points (default is 10)
            uri_filter_args : dict
                Extra DatasetURI filter arguments if several datasets can refer to the same URI
            uri_service_name : str
                name of the service which is used  ('dapService', 'fileService', 'http' or 'wms')
            uri_service_type : str
                type of the service which is used  ('OPENDAP', 'local', 'HTTPServer' or 'WMS')

        Returns:
        -------
            dataset and flag
        """
        if not uri_filter_args:
            uri_filter_args = {}

        # Validate uri - this should raise an exception if the uri doesn't point to a valid
        # file or stream
        validate_uri(uri)

        # Several datasets can refer to the same uri (e.g., scatterometers and svp drifters), so we
        # need to pass uri_filter_args
        uris = DatasetURI.objects.filter(uri=uri, **uri_filter_args)
        if len(uris) > 0:
            return uris[0].dataset, False

        # Open file with Nansat
        n = Nansat(nansat_filename(uri), **kwargs)

        # get metadata from Nansat and get objects from vocabularies
        n_metadata = n.get_metadata()

        entry_id = n_metadata.get('entry_id', None)
        # set compulsory metadata (source)
        platform, _ = Platform.objects.get_or_create(
            json.loads(n_metadata['platform']))
        instrument, _ = Instrument.objects.get_or_create(
            json.loads(n_metadata['instrument']))
        specs = n_metadata.get('specs', '')
        source, _ = Source.objects.get_or_create(platform=platform,
                                                 instrument=instrument,
                                                 specs=specs)

        default_char_fields = {
            # Adding NERSC_ in front of the id violates the string representation of the uuid
            #'entry_id': lambda: 'NERSC_' + str(uuid.uuid4()),
            'entry_id': lambda: str(uuid.uuid4()),
            'entry_title': lambda: 'NONE',
            'summary': lambda: 'NONE',
        }

        # set optional CharField metadata from Nansat or from default_char_fields
        options = {}
        try:
            existing_ds = Dataset.objects.get(entry_id=entry_id)
        except Dataset.DoesNotExist:
            existing_ds = None
        for name in default_char_fields:
            if name not in n_metadata:
                warnings.warn('%s is not provided in Nansat metadata!' % name)
                # prevent overwriting of existing values by defaults
                if existing_ds:
                    options[name] = existing_ds.__getattribute__(name)
                else:
                    options[name] = default_char_fields[name]()
            else:
                options[name] = n_metadata[name]

        default_foreign_keys = {
            'gcmd_location': {
                'model': Location,
                'value': pti.get_gcmd_location('SEA SURFACE')
            },
            'data_center': {
                'model': DataCenter,
                'value': pti.get_gcmd_provider('NERSC')
            },
            'ISO_topic_category': {
                'model': ISOTopicCategory,
                'value': pti.get_iso19115_topic_category('Oceans')
            },
        }

        # set optional ForeignKey metadata from Nansat or from default_foreign_keys
        for name in default_foreign_keys:
            value = default_foreign_keys[name]['value']
            model = default_foreign_keys[name]['model']
            if name not in n_metadata:
                warnings.warn('%s is not provided in Nansat metadata!' % name)
            else:
                try:
                    value = json.loads(n_metadata[name])
                except (ValueError, TypeError):
                    warnings.warn(
                        '%s value of %s metadata provided in Nansat is wrong!'
                        % (n_metadata[name], name))
            if existing_ds:
                options[name] = existing_ds.__getattribute__(name)
            else:
                options[name], _ = model.objects.get_or_create(value)

        # Find coverage to set number of points in the geolocation
        if len(n.vrt.dataset.GetGCPs()) > 0:
            n.reproject_gcps()
        geolocation = GeographicLocation.objects.get_or_create(
            geometry=WKTReader().read(n.get_border_wkt(nPoints=n_points)))[0]

        # create dataset
        # - this manager method is named get_or_create but calls
        #   update_or_create below; either it should use get_or_create
        #   or be renamed - see issue #127
        ds, created = Dataset.objects.update_or_create(
            entry_id=options['entry_id'],
            defaults={
                'time_coverage_start': n.get_metadata('time_coverage_start'),
                'time_coverage_end': n.get_metadata('time_coverage_end'),
                'source': source,
                'geographic_location': geolocation,
                'gcmd_location': options["gcmd_location"],
                'ISO_topic_category': options["ISO_topic_category"],
                "data_center": options["data_center"],
                'entry_title': options["entry_title"],
                'summary': options["summary"]
            })

        # create parameter
        all_band_meta = n.bands()
        for band_id in range(1, len(all_band_meta) + 1):
            band_meta = all_band_meta[band_id]
            standard_name = band_meta.get('standard_name', None)
            short_name = band_meta.get('short_name', None)
            units = band_meta.get('units', None)
            if standard_name in ['latitude', 'longitude', None]:
                continue
            params = Parameter.objects.filter(standard_name=standard_name)
            if params.count() > 1 and short_name is not None:
                params = params.filter(short_name=short_name)
            if params.count() > 1 and units is not None:
                params = params.filter(units=units)
            if params.count() >= 1:
                ds.parameters.add(params[0])

        # create dataset URI
        DatasetURI.objects.get_or_create(name=uri_service_name,
                                         service=uri_service_type,
                                         uri=uri,
                                         dataset=ds)

        return ds, created
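Unlike Example #6, this version keys the dataset on entry_id and uses Django's update_or_create, so re-ingesting a file refreshes the stored metadata instead of creating a duplicate. A minimal illustration of the semantics (the model and values are hypothetical):

# Hypothetical model: the lookup kwargs identify the row; `defaults`
# is applied both when creating and when updating it
obj, created = MyModel.objects.update_or_create(
    entry_id='abc-123',
    defaults={'summary': 'refreshed on every ingestion'})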
Example #8
def test_validate_uri_opendap_does_not_exist(self):
    uri = 'http://www.ifremer.fr/opendap/cerdap1/cersat/' \
          '20140101000000-GLOBCURRENT-L4-CURgeo_0m-ALT_OI-v02.0-fv01.0.nc.tull'
    with self.assertRaises(OSError) as cm:
        utils.validate_uri(uri)
    self.assertEqual('NetCDF: file not found', cm.exception.args[1])
Example #9
def test_fail_invalid_uri(self):
    uri = '/this/is/some/file/but/not/an/uri'
    with self.assertRaises(ValueError):
        utils.validate_uri(uri)
Example #10
def test__validate_uri__opendap_exists(self, mock_PoolManager):
    # Mock request.status so it returns 200, meaning successful connection.
    mock_PoolManager.return_value.request.return_value = PropertyMock(
        status=200)
    uri = 'http://nbstds.met.no/thredds/catalog/NBS/S2A/test_catalog.html'
    self.assertEqual(utils.validate_uri(uri), None)
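The mock_PoolManager argument implies a patch decorator that is not shown in the snippet. Assuming validate_uri instantiates urllib3.PoolManager inside geospaas.utils, the test would be set up roughly like this (the patch target path is an assumption):

from unittest.mock import patch, PropertyMock

# Assumed patch target; the real module path may differ
@patch('geospaas.utils.urllib3.PoolManager')
def test__validate_uri__opendap_exists(self, mock_PoolManager):
    ...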
Example #11
def test_validate_uri_local_does_not_exist(self):
    uri = 'file://localhost/some/folder/filename.ext'
    with self.assertRaises(FileNotFoundError) as cm:
        utils.validate_uri(uri)
    expected_path = '/some/folder/filename.ext'
    self.assertEqual(expected_path, cm.exception.args[0])
Example #12
def test_validate_uri_local(self, mock_isfile):
    mock_isfile.return_value = True
    uri = 'file://localhost/some/folder/filename.ext'
    self.assertEqual(utils.validate_uri(uri), None)
Example #13
def save(self, *args, **kwargs):
    validate_uri(self.uri)
    # Validation is not usually done in the models but rather via form
    # validation. We should discuss if we want it here or not.
    super(DatasetURI, self).save(*args, **kwargs)
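The conventional Django alternative mentioned in the comment is to run the model's own field validators before saving. A sketch of that approach (not the geospaas code):

def save(self, *args, **kwargs):
    # full_clean() runs field, unique and model validators before saving
    self.full_clean()
    super().save(*args, **kwargs)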