Exemplo n.º 1
0
 def test_GetRecords_summary(self):
     """Paged GetRecords at the 'summary' element-set level fills csw.records."""
     csw = CatalogueServiceWeb(service)
     csw.getrecords(esn="summary",
                    outputschema=GMD,
                    startposition=1,
                    maxrecords=5)
     nrecords = len(csw.records)
Exemplo n.º 2
0
 def test_GetRecords_dataset(self):
     """GetRecords restricted to qtype='dataset' should populate csw.records."""
     csw = CatalogueServiceWeb(service)
     csw.getrecords(startposition=1,
                    maxrecords=5,
                    qtype="dataset",
                    outputschema=GMD)
     nrecords = len(csw.records)
Exemplo n.º 3
0
 def test_GetRecords(self):
     """Fetch five ISO records; each must parse into an MD_Metadata object."""
     csw = CatalogueServiceWeb(service)
     csw.getrecords(outputschema=GMD, startposition=1, maxrecords=5)
     nrecords = len(csw.records)
     assert nrecords == 5, nrecords
     for ident, md_record in csw.records.items():
         identifiers.append(ident)
         assert isinstance(md_record, MD_Metadata), (ident, md_record)
Exemplo n.º 4
0
def _harvest_csw(csw, maxrecords=10, totalrecords=float('inf')):
    """
    Step through CSW results, and if one seems to be a WMS or Arc REST
    service then register it.

    :param csw: catalogue object whose ``base_url`` points at the source CSW
    :param maxrecords: page size for each GetRecords request
    :param totalrecords: stop once this many records have been scanned
    """
    stop = 0
    flag = 0

    src = CatalogueServiceWeb(csw.base_url)

    while stop == 0:
        if flag == 0:  # first run, start from 0
            startposition = 0
        else:  # subsequent run, startposition is now paged
            startposition = src.results['nextrecord']

        src.getrecords(
            esn='summary', startposition=startposition, maxrecords=maxrecords)

        # Do not scan past the server's match count or the caller's cap.
        # (Renamed from ``max`` to avoid shadowing the builtin.)
        limit = min(src.results['matches'], totalrecords)

        if src.results['nextrecord'] == 0 \
                or src.results['returned'] == 0 \
                or src.results['nextrecord'] > limit:  # end the loop, exhausted all records or max records to process
            stop = 1
            break

        # harvest each record to destination CSW
        for record in list(src.records):
            record = src.records[record]
            known_types = {}
            for ref in record.references:
                if ref["scheme"] == "OGC:WMS" or \
                        "service=wms&request=getcapabilities" in urllib.unquote(ref["url"]).lower():
                    print("WMS:%s" % ref["url"])
                    known_types["WMS"] = ref["url"]
                if ref["scheme"] == "OGC:WFS" or \
                        "service=wfs&request=getcapabilities" in urllib.unquote(ref["url"]).lower():
                    print("WFS:%s" % ref["url"])
                    known_types["WFS"] = ref["url"]
                if ref["scheme"] == "ESRI":
                    print("ESRI:%s" % ref["url"])
                    known_types["REST"] = ref["url"]

            if "WMS" in known_types:
                # A record exposing both WMS and WFS is registered as OWS.
                service_type = "OWS" if "WFS" in known_types else "WMS"
                try:
                    _process_wms_service(
                        known_types["WMS"], service_type, None, None, parent=csw)
                except Exception as e:
                    logger.error("Error registering %s:%s" %
                                 (known_types["WMS"], str(e)))
            elif "REST" in known_types:
                try:
                    # BUG FIX: was ref["url"] — the *last* reference iterated,
                    # which need not be the ESRI endpoint.
                    _register_arcgis_url(known_types["REST"], None, None, None, parent=csw)
                except Exception as e:
                    logger.error("Error registering %s:%s" %
                                 (known_types["REST"], str(e)))

        # BUG FIX: flag was never set, so every iteration re-fetched page 0
        # and the loop never advanced. Mark the first page as done.
        flag = 1
Exemplo n.º 5
0
def _harvest_csw(csw, maxrecords=10, totalrecords=float("inf")):
    """
    Step through CSW results, and if one seems to be a WMS or Arc REST
    service then register it.

    :param csw: catalogue object whose ``base_url`` points at the source CSW
    :param maxrecords: page size for each GetRecords request
    :param totalrecords: stop once this many records have been scanned
    """
    stop = 0
    flag = 0

    src = CatalogueServiceWeb(csw.base_url)

    while stop == 0:
        if flag == 0:  # first run, start from 0
            startposition = 0
        else:  # subsequent run, startposition is now paged
            startposition = src.results["nextrecord"]

        src.getrecords(esn="summary", startposition=startposition, maxrecords=maxrecords)

        # Renamed from ``max`` to avoid shadowing the builtin.
        limit = min(src.results["matches"], totalrecords)

        if (
            src.results["nextrecord"] == 0 or src.results["returned"] == 0 or src.results["nextrecord"] > limit
        ):  # end the loop, exhausted all records or max records to process
            stop = 1
            break

        # harvest each record to destination CSW
        for record in list(src.records):
            record = src.records[record]
            known_types = {}
            for ref in record.references:
                if (
                    ref["scheme"] == "OGC:WMS"
                    or "service=wms&request=getcapabilities" in urllib.unquote(ref["url"]).lower()
                ):
                    print("WMS:%s" % ref["url"])
                    known_types["WMS"] = ref["url"]
                if (
                    ref["scheme"] == "OGC:WFS"
                    or "service=wfs&request=getcapabilities" in urllib.unquote(ref["url"]).lower()
                ):
                    print("WFS:%s" % ref["url"])
                    known_types["WFS"] = ref["url"]
                if ref["scheme"] == "ESRI":
                    print("ESRI:%s" % ref["url"])
                    known_types["REST"] = ref["url"]

            if "WMS" in known_types:
                # A record exposing both WMS and WFS is registered as OWS.
                service_type = "OWS" if "WFS" in known_types else "WMS"
                try:
                    _process_wms_service(known_types["WMS"], service_type, None, None, parent=csw)
                except Exception as e:
                    logger.error("Error registering %s:%s" % (known_types["WMS"], str(e)))
            elif "REST" in known_types:
                try:
                    # BUG FIX: was ref["url"] — the *last* reference iterated,
                    # which need not be the ESRI endpoint.
                    _register_arcgis_url(known_types["REST"], None, None, None, parent=csw)
                except Exception as e:
                    logger.error("Error registering %s:%s" % (known_types["REST"], str(e)))

        # BUG FIX: flag was never set, so every iteration re-fetched page 0
        # and the loop never advanced. Mark the first page as done.
        flag = 1
Exemplo n.º 6
0
class HarvestNode(NgdsDataObject):
    """Stores information about harvest endpoints"""
    csw = None  # lazily-created owslib CatalogueServiceWeb client

    def __init__(self, url, **kwargs):
        """Normalize the endpoint URL and record bookkeeping attributes."""
        # A URL must be given
        p = urlparse(url)
        self.url = urlunparse((p.scheme, p.netloc, p.path, "", "", ""))  # Strip URL to just domain + path
        self.frequency = kwargs.get('frequency', 'manual')  # one of manual|daily|weekly|monthly
        self.title = kwargs.get('title', 'No Title Was Given')  # A title for bookkeeping
        self.node_admin_id = kwargs.get('node_admin_id', None)  # Foreign Key to a responsible_party who maintains the remote node

    def setup_csw(self):
        """Create the owslib CSW client for this node's URL."""
        self.csw = CatalogueServiceWeb(self.url)

    def do_harvest(self):
        """Perform a harvest from another CSW server"""
        if self.csw is None:  # FIX: identity check, not "== None"
            self.setup_csw()
        self.get_records()  # Do the first GetRecords request
        # FIX: materialize keys() so "+=" below also works on Python 3,
        # where dict.keys() is a view that does not support concatenation.
        ids = list(self.csw.records.keys())  # Accumulates all record ids
        print("next: %s, total: %s" % (self.csw.results["nextrecord"], self.csw.results["matches"]))

        # Once next_record > number_matched (or drops to 0), we've gotten everything
        while self.csw.results["nextrecord"] < self.csw.results["matches"] and self.csw.results["nextrecord"] != 0:
            # Get another set, starting from next_record from previous response
            self.get_records(self.csw.results["nextrecord"], self.csw.results["returned"])
            ids += list(self.csw.records.keys())  # Add new ids to the array
            print("next: %s, total: %s" % (self.csw.results["nextrecord"], self.csw.results["matches"]))

        self.parse_records(ids)  # Gather the records themselves

    def parse_records(self, ids):
        """Perform as many GetRecordById requests as needed"""
        print("Gathered %s IDs" % str(len(ids)))
        for record_id in ids:
            self.get_record_by_id(record_id)
            rec = HarvestedRecord.from_md_metadata(self.csw.records[record_id], self)

    def get_record_by_id(self, record_id):
        """Get a single record, by ID"""
        params = {
            "id": [record_id],
            "outputschema": "http://www.isotc211.org/2005/gmd"
        }
        self.csw.getrecordbyid(**params)  # Puts response in self.csw.records

    def get_records(self, start_position=1, max_records=1000):
        """Perform a GetRecords request"""
        params = {
            "typenames": "gmd:MD_Metadata",
            "outputschema": "http://www.isotc211.org/2005/gmd",
            "startposition": start_position,
            "maxrecords": max_records,
            "esn": "brief"
        }
        self.csw.getrecords(**params)  # Puts results in self.csw.records
Exemplo n.º 7
0
 def test_GetRecords(self):
     """Skipped: the target service has no records loaded yet."""
     raise SkipTest()  # everything below is intentionally unreachable
     csw = CatalogueServiceWeb(service)
     csw.getrecords(outputschema=GMD, startposition=1, maxrecords=5)
     nrecords = len(csw.records)
     assert nrecords == 5, nrecords
     for ident, md_record in csw.records.items():
         identifiers.append(ident)
         assert isinstance(md_record, MD_Metadata), (ident, md_record)
Exemplo n.º 8
0
def getDataSetURI(anyText, CSWURL, BBox):
    """
    Searches a given CSW server and returns metadata content for the
    datasets found.

    Arguments
    ---------
    - anyText - A string that will be submitted to the CSW search. (Optional, default is empty which will return all records.)
    - CSWURL - A base URL for the CSW server to be searched. (Optional, defaults to the CDIA/GDP CSW server.)
    - BBox - A lat/lon bounding box in [minx,miny,maxx,maxy] that will be used to limit results to datasets that atleast partially intersect. (Optional)

    Returns a list whose first row is the header
    ['title', 'abstract', ['urls']], followed by one
    [title, abstract, urls] entry per record found.
    """
    csw = CatalogueServiceWeb(CSWURL, skip_caps=True)
    # Works with owslib version 0.8.6.
    csw.getrecords(keywords=[anyText],
                   outputschema='http://www.isotc211.org/2005/gmd',
                   esn='full',
                   maxrecords=100)
    dataSetURIs = [['title', 'abstract', ['urls']]]
    for rec in csw.records:
        title = csw.records[rec].identification.title
        abstract = csw.records[rec].identification.abstract
        urls = []
        try:
            for onlineresource in range(
                    len(csw.records[rec].distribution.online)):
                urls.append(
                    csw.records[rec].distribution.online[onlineresource].url)
        except AttributeError:
            # FIX: was a bare "print" (commented "#pass") that emitted a
            # stray blank line; record simply has no distribution section.
            pass
        for ident in range(len(csw.records[rec].identificationinfo)):
            try:
                for operation in range(
                        len(csw.records[rec].identificationinfo[ident].
                            operations)):
                    # BUG FIX: previously indexed operations[0] on every
                    # pass, appending the same URL once per operation.
                    urls.append(csw.records[rec].identificationinfo[ident].
                                operations[operation]['connectpoint'][0].url)
            except AttributeError:
                pass  # identification entry exposes no operations
        entry = [title, abstract, urls]
        dataSetURIs.append(entry)
    # Rewrite THREDDS HTTP links to their OPeNDAP (dods) equivalents.
    for i, dataset in enumerate(dataSetURIs):
        dataSetURIs[i][2] = [
            uri.replace("http", "dods") if "/dodsC/" in uri else uri
            for uri in dataset[2]
        ]
    return dataSetURIs
Exemplo n.º 9
0
 def test_GetRecords(self):
     """Skipped until the remote service has records to return."""
     raise SkipTest()  # the assertions below never run
     csw = CatalogueServiceWeb(service)
     csw.getrecords(outputschema=GMD, startposition=1, maxrecords=5)
     nrecords = len(csw.records)
     assert nrecords == 5, nrecords
     for ident in csw.records:
         identifiers.append(ident)
         record = csw.records[ident]
         assert isinstance(record, MD_Metadata), (ident, record)
Exemplo n.º 10
0
 def run(self, csw=None, offset=0):
     """Crawl one page of a CSW catalogue: queue a dataset crawl per record,
     then queue a follow-up catalogue crawl starting at the next record.

     :param csw: existing CatalogueServiceWeb client, or None to build one
         from self.config['endpoint']
     :param offset: startposition for this page
     """
     if csw is None:
         endpoint = self.config['endpoint']
         csw = CatalogueServiceWeb(endpoint)
     outputschema = self.config.get('outputschema', csw_ns['gmd'])
     typenames = self.config.get('typenames', 'csw:dataset')
     csw.getrecords(esn="full", outputschema=outputschema,
             typenames=typenames, startposition=offset,
             maxrecords=PAGE_SIZE)
     for record in csw.records.values():
         self.queue(CSWDatasetCrawler, record=record)

     nextrecord = csw.results.get('nextrecord')
     # BUG FIX: owslib reports nextrecord == 0 when the result set is
     # exhausted; the old "<= matches" test then re-queued page 0 forever.
     if nextrecord and nextrecord <= csw.results.get('matches'):
         print("OFFSET %s" % offset)
         self.queue(CSWCatalogCrawler, csw=csw, offset=nextrecord)
Exemplo n.º 11
0
def getDataSetURI(anyText, CSWURL, BBox):
    """
    Searches a given CSW server and returns metadata content for the
    datasets found.

    Arguments
    ---------
    - anyText - A string that will be submitted to the CSW search. (Optional, default is empty which will return all records.)
    - CSWURL - A base URL for the CSW server to be searched. (Optional, defaults to the CDIA/GDP CSW server.)
    - BBox - A lat/lon bounding box in [minx,miny,maxx,maxy] that will be used to limit results to datasets that atleast partially intersect. (Optional)

    Returns a list whose first row is the header
    ['title', 'abstract', ['urls']], followed by one
    [title, abstract, urls] entry per record found.
    """
    # NOTE: body re-indented to standard 4 spaces; the original was
    # indented ~28 columns deep, matching its docstring.
    csw = CatalogueServiceWeb(CSWURL, skip_caps=True)
    # Works with owslib version 0.8.6.
    csw.getrecords(keywords=[anyText], outputschema='http://www.isotc211.org/2005/gmd', esn='full', maxrecords=100)
    dataSetURIs = [['title', 'abstract', ['urls']]]
    for rec in csw.records:
        title = csw.records[rec].identification.title
        abstract = csw.records[rec].identification.abstract
        urls = []
        try:
            for onlineresource in range(len(csw.records[rec].distribution.online)):
                urls.append(csw.records[rec].distribution.online[onlineresource].url)
        except AttributeError:
            # FIX: was a bare "print" (commented "#pass") that emitted a
            # stray blank line; record simply has no distribution section.
            pass
        for ident in range(len(csw.records[rec].identificationinfo)):
            try:
                for operation in range(len(csw.records[rec].identificationinfo[ident].operations)):
                    # BUG FIX: previously indexed operations[0] on every
                    # pass, appending the same URL once per operation.
                    urls.append(csw.records[rec].identificationinfo[ident].operations[operation]['connectpoint'][0].url)
            except AttributeError:
                pass  # identification entry exposes no operations
        entry = [title, abstract, urls]
        dataSetURIs.append(entry)
    # Rewrite THREDDS HTTP links to their OPeNDAP (dods) equivalents.
    for i, dataset in enumerate(dataSetURIs):
        dataSetURIs[i][2] = [uri.replace("http", "dods") if "/dodsC/" in uri else uri for uri in dataset[2]]
    return dataSetURIs
Exemplo n.º 12
0
def harvest(source, dst):
    """Page through every record on the ``source`` CSW and harvest each one
    into the ``dst`` CSW via GetRecordById.

    :param source: base URL of the CSW to read from
    :param dst: base URL of the CSW to harvest into
    """
    maxrecords = options["max"]
    # BUG FIX: the old test was ``options["max"] == 0 or None`` — the
    # ``or None`` arm is always falsy, so a missing (None) value never
    # fell back to the default page size.
    if maxrecords in (0, None):
        maxrecords = 10
    stop = 0
    flag = 0

    src = CatalogueServiceWeb(source)
    dest = CatalogueServiceWeb(dst)

    while stop == 0:
        if flag == 0:  # first run, start from 0
            startposition = 0
        else:  # subsequent run, startposition is now paged
            startposition = src.results["nextrecord"]

        src.getrecords(esn="brief",
                       startposition=startposition,
                       maxrecords=maxrecords)

        print(src.results)

        if (src.results["nextrecord"] == 0 or src.results["returned"] == 0
                or src.results["nextrecord"] >
                src.results["matches"]):  # end the loop, exhausted all records
            stop = 1
            break

        # harvest each record to destination CSW
        for i in list(src.records):
            # Renamed from ``source`` to stop clobbering the parameter.
            # NOTE(review): the URL is built from sys.argv[1], not from the
            # ``source`` argument — confirm they are always the same value.
            record_url = "%s?service=CSW&version=2.0.2&request=GetRecordById&id=%s" % (
                sys.argv[1],
                i,
            )
            dest.harvest(source=record_url,
                         resourcetype="http://www.isotc211.org/2005/gmd")

        flag = 1
Exemplo n.º 13
0
def getResource(endpoint = 'http://www.nodc.noaa.gov/geoportal/csw', bbox=None, keywords=None, maxrecords=1, service_type='opendap', verbose=None):
    """Query a CSW endpoint and map record titles to service URLs.

    Returns {record_id: {title: url, ...}} where each URL's reference
    scheme matches the requested service_type.
    """
    # Map the short service_type name onto the ESRI geoportal scheme URN.
    # NOTE(review): any value other than 'opendap'/'wms' leaves
    # service_string unbound and raises NameError in the loop below.
    if service_type == 'opendap':
        service_string='urn:x-esri:specification:ServiceType:OPeNDAP'
    if service_type == 'wms':
        service_string='urn:x-esri:specification:ServiceType:WMS'
    csw = CatalogueServiceWeb(endpoint,timeout=30)
    if keywords is not None:
        csw.getrecords(keywords=keywords, bbox=bbox, maxrecords=maxrecords)
    else :
        csw.getrecords(bbox=bbox, maxrecords=maxrecords)
    csw.records.keys()  # NOTE(review): no-op; the result is discarded
    result = {}
    for i in csw.records.keys():
        records=csw.records[i]  # NOTE(review): unused local
        resource = {}
        # NOTE(review): this inner pass scans ALL records for every i, so
        # every result[i] gets the same dict (O(n^2) work and n^2 prints).
        for key,rec in csw.records.iteritems():  # iteritems: Python-2-only
            url = next((d['url'] for d in rec.references if d['scheme'] == service_string), None)
            print rec.references[0]['url']
            if url is not None:
                resource[rec.title] = url
        result[i] = resource
    if verbose is not None:
        print 'endpoint: ', endpoint, '\n' , 'bbox: ', bbox, '\n' , 'keywords: ', keywords, '\n', 'maxrecords: ', maxrecords , '\n', 'service_type: ' , service_type
    return result
Exemplo n.º 14
0
    def search(self):
        """Search GeoCat for layers matching a keyword and return them as a
        dict of layer property dicts under the 'results' key.

        Request params: keyword (required), lang (required, must be in
        supported_langs), limit (optional int, default 150), query
        (optional extra AnyText CQL filter).
        """
        keyword = request.params.get('keyword')
        if keyword is None:
            abort(400, 'no keyword')

        lang = request.params.get('lang')
        if not lang or lang not in supported_langs:
            abort(400, 'unknown lang')
        self._lang = lang

        try:
            limit = int(request.params.get('limit', 150))
        except ValueError:
            abort(400, 'limit param cannot be parsed to an integer')

        csw = CatalogueServiceWeb(geocat_url, timeout=60)
        cql = "keyword = '" + keyword + "'"

        query = request.params.get('query')
        if query is not None:
            cql = cql + " AND AnyText = '" + query + "'"

        csw.getrecords(cql=cql, maxrecords=limit,
                       outputschema='http://www.isotc211.org/2005/gmd')

        # FIX: dict.has_key() is Python-2-only; "in" works on 2 and 3.
        if 'matches' not in csw.results:
            abort(502, 'invalid response from GeoCat')

        # initialize the results object
        results = {'results': []}

        if csw.results['matches'] <= 0:
            return results

        for record in csw.records.values():

            # verify that the record is associated with a supported
            # language, and skip it if it's not
            if record.language not in supported_langs.values():
                log.debug('record language not supported, skip record')
                continue

            # read layer type, layer name, and layer url first. If the record
            # does not represent a supported OGC resource, or if it doesn't
            # have a valid name and url, we skip it
            online_resource = self._read_layer_ogc_online_resource(record)

            if online_resource is None or None in online_resource:
                log.debug('record does not represent a supported OGC ' \
                          'resource, skip record')
                continue
            layer_type, layer_param, layer_name, layer_url = online_resource
            parsed_url = urlparse(layer_url)
            if parsed_url.scheme is None or \
                    not parsed_url.scheme.startswith('http'):
                log.debug('layer url scheme is not HTTP, skip record')
                continue

            # read other properties
            id = record.identifier
            name = self._read_layer_name(record)
            alternate_title = self._read_layer_alternate_title(record)
            extent = self._read_layer_extent(record)
            data_provider, data_provider_link = \
                    self._read_layer_data_provider(record)
            abstract = self._read_layer_abstract(record)
            resolution_distance = \
                    self._read_layer_resolution_distance(record)
            equivalent_scales = \
                    self._read_layer_equivalent_scales(record)
            web_links = self._read_layer_online(record, 'Webaddresse (URL)')
            thematic_geoportals = \
                    self._read_layer_online(record, 'specialised geoportal')
            downloads = \
                    self._read_layer_online(record, 'Webaddresse zum Download')
            date = self._read_layer_date(record)
            legal_constraints = self._read_legal_constraints(record)
            copyright, copyright_link = \
                    self._read_layer_copyright(record)

            # create a dict with the properties and append
            # it to the results list
            layer = dict(id=id,
                         name=name,
                         layertype=layer_type,
                         url=layer_url,
                         alternate_title=alternate_title,
                         extent=extent,
                         data_provider=data_provider,
                         data_provider_link=data_provider_link,
                         abstract=abstract,
                         resolution_distance=resolution_distance,
                         equivalent_scales=equivalent_scales,
                         web_links=web_links,
                         thematic_geoportals=thematic_geoportals,
                         downloads=downloads,
                         date=date,
                         legal_constraints=legal_constraints,
                         copyright=copyright,
                         copyright_link=copyright_link)
            layer[layer_param] = layer_name
            results['results'].append(layer)

        return results
Exemplo n.º 15
0
class CkanMetadata(object):
    """ Provides general access to CSW for CKAN """
    def __init__(self, url, schema, version='2.0.2', lang='en-US'):
        self.schema = schema
        self.catalog = CatalogueServiceWeb(url,
                                           lang,
                                           version,
                                           timeout=10,
                                           skip_caps=True)
        # CKAN attribute keys this mapper knows how to fill in.
        self.metadata = dict.fromkeys([
            'id',
            'name',
            'title',
            'url',
            'author',
            'maintainer',
            'maintainer_email',
            'license_url',
            'version',
            'service_url',
            'service_type',
            'notes',
            'tags',
            'metadata_url',
            'metadata_raw',
        ])

    def get_by_search(self, searchterm, propertyname='csw:AnyText'):
        """ Returns the found csw dataset with the given searchterm """
        self.catalog.getrecords(keywords=[searchterm],
                                propertyname=propertyname)
        if (self.catalog.response is None
                or self.catalog.results['matches'] == 0):
            raise DatasetNotFoundError(
                "No dataset for the given searchterm '%s' (%s) found" %
                (searchterm, propertyname))
        return self.catalog.records

    def get_by_id(self, id):
        """ Returns the csw dataset with the given id """
        self.catalog.getrecordbyid(id=[id], outputschema=self.schema)
        return self.catalog.response

    def get_id_by_dataset_name(self, dataset_name):
        """
            Returns the id of a dataset identified by it's name.
            If there are multiple datasets with the given name,
            only the id of the first one is returned.
        """
        dataset_list = self.get_by_search(dataset_name, 'title')
        # FIX: dict.itervalues().next() is Python-2-only;
        # next(iter(...)) behaves identically on Python 2 and 3.
        return next(iter(dataset_list.values())).identifier

    def get_attribute(self, ckan_attribute, dataset_name=None):
        """
        Abstract method to define the mapping
        of a ckan attribute to a csw attribute
        """
        raise NotImplementedError

    def get_xml(self, id):
        """ Returns the raw XML of the dataset, raising if it is missing """
        dataset_xml_string = self.get_by_id(id)
        if dataset_xml_string is None:
            raise DatasetNotFoundError("Dataset with id %s not found" % id)
        return dataset_xml_string

    def get_ckan_metadata_by_id(self, id, language='de'):
        """ Maps the dataset with the given id to CKAN attributes """
        log.debug("Dataset ID: %s" % id)

        dataset_xml = etree.fromstring(self.get_xml(id))
        for key in self.metadata:
            log.debug("Metadata key: %s" % key)
            attribute = self.get_attribute(key)
            self.metadata[key] = attribute.get_value(xml=dataset_xml,
                                                     lang=language)
        return self.metadata

    def get_ckan_metadata(self, dataset_name, language='de'):
        """ Returns the requested dataset mapped to CKAN attributes """
        id = self.get_id_by_dataset_name(dataset_name)
        return self.get_ckan_metadata_by_id(id, language)
Exemplo n.º 16
0
# -*- coding: utf-8 -*-
"""
Created on Sun May 19 23:44:09 2013

Download every record from the Trentino geoportal CSW and accumulate the
record objects, keyed by record id, for the CSV dump below.

@author: maurizio napolitano
"""
import csv
from owslib.csw import CatalogueServiceWeb
csw = CatalogueServiceWeb('http://www.territorio.provincia.tn.it/geoportlet/srv/eng/csw')
csw.getrecords()  # first page, default page size
data = {}
records = csw.records
for r in records:
    data[r]=records[r]
matches = csw.results['matches']
returned = csw.results['returned']
startposition = 0  # NOTE(review): never used after this assignment
# Page through the catalogue until a request returns no records.
# NOTE(review): each iteration re-copies the PREVIOUS page before fetching
# the next one, so the first page is copied twice — harmless, since dict
# assignment just overwrites the same keys.
while (returned != 0):
    records = csw.records
    for r in records:
        data[r]=records[r]
    csw.getrecords(startposition=csw.results['nextrecord'])
    returned = csw.results['returned']

with open('datafromgeocatalogpat.csv', 'wb') as csvfile:
    csvoutput = csv.writer(csvfile, delimiter=';',quoting=csv.QUOTE_ALL)
    firstrow = True
    for d in data:
        fields = {}
        fields = data[d].__dict__
        if (firstrow):
Exemplo n.º 17
0

# <codecell>

#This cell examines and provides information on the number of model records
#   available via each catalog endpoint
records1 = []     # per-query csw.results dicts (or 'Error')
titles1 = []      # record titles (or 'Error')
lenrecords1 = []  # len() of the latest records1 entry
lentitles1 = []   # len() of the latest titles1 entry
for endpoint in endpoints[:3]:
    csw = CatalogueServiceWeb(endpoint, timeout=60)
    for model_string in model_strings:
        try:
            csw.getrecords(keywords=[model_string], maxrecords=60, esn='full')
            records1.append(csw.results)
        # FIX: "except Exception, ex1" is Python-2-only syntax;
        # "as" works on Python 2.6+ and 3.
        except Exception as ex1:
            records1.append('Error')
        try:
            for rec in csw.records:
                titles1.append(csw.records[rec].title)
        except Exception as ex1:
            titles1.append('Error')
        # NOTE(review): when the last entry is the string 'Error', len()
        # yields 5 (its character count), not a record count.
        lentitles1.append(len(titles1[-1]))
        lenrecords1.append(len(records1[-1]))

# zip truncates to the shortest input, so at most len(endpoints) rows.
zipvar1 = zip(endpoints, records1, lenrecords1, titles1, lentitles1)
df = DataFrame(
    data=zipvar1,
    columns=['endpoints', 'records1', 'lenrecords1', 'titles1', 'lentitles1'])
Exemplo n.º 18
0
class CkanMetadata(object):
    """ Provides general access to CSW for CKAN """
    def __init__(self, url, schema, version='2.0.2', lang='en-US'):
        self.schema = schema
        self.catalog = CatalogueServiceWeb(
            url,
            lang,
            version,
            timeout=10,
            skip_caps=True
        )
        # CKAN attribute keys this mapper knows how to fill in.
        self.metadata = dict.fromkeys([
            'id',
            'name',
            'title',
            'url',
            'author',
            'maintainer',
            'maintainer_email',
            'license_url',
            'version',
            'service_url',
            'service_type',
            'notes',
            'tags',
            'metadata_url',
            'metadata_raw',
        ])

    def get_by_search(self, searchterm, propertyname='csw:AnyText'):
        """ Returns the found csw dataset with the given searchterm """
        self.catalog.getrecords(
            keywords=[searchterm],
            propertyname=propertyname
        )
        if (self.catalog.response is None or
                self.catalog.results['matches'] == 0):
            raise DatasetNotFoundError(
                "No dataset for the given searchterm '%s' (%s) found"
                % (searchterm, propertyname)
            )
        return self.catalog.records

    def get_by_id(self, id):
        """ Returns the csw dataset with the given id """
        self.catalog.getrecordbyid(id=[id], outputschema=self.schema)
        return self.catalog.response

    def get_id_by_dataset_name(self, dataset_name):
        """
            Returns the id of a dataset identified by it's name.
            If there are multiple datasets with the given name,
            only the id of the first one is returned.
        """
        dataset_list = self.get_by_search(dataset_name, 'title')
        # FIX: dict.itervalues().next() is Python-2-only;
        # next(iter(...)) behaves identically on Python 2 and 3.
        return next(iter(dataset_list.values())).identifier

    def get_attribute(self, ckan_attribute, dataset_name=None):
        """
        Abstract method to define the mapping
        of a ckan attribute to a csw attribute
        """
        raise NotImplementedError

    def get_xml(self, id):
        """ Returns the raw XML of the dataset, raising if it is missing """
        dataset_xml_string = self.get_by_id(id)
        if dataset_xml_string is None:
            raise DatasetNotFoundError("Dataset with id %s not found" % id)
        return dataset_xml_string

    def get_ckan_metadata_by_id(self, id, language='de'):
        """ Maps the dataset with the given id to CKAN attributes """
        log.debug("Dataset ID: %s" % id)

        dataset_xml = etree.fromstring(self.get_xml(id))
        for key in self.metadata:
            log.debug("Metadata key: %s" % key)
            attribute = self.get_attribute(key)
            self.metadata[key] = attribute.get_value(
                xml=dataset_xml,
                lang=language
            )
        return self.metadata

    def get_ckan_metadata(self, dataset_name, language='de'):
        """ Returns the requested dataset mapped to CKAN attributes """
        id = self.get_id_by_dataset_name(dataset_name)
        return self.get_ckan_metadata_by_id(id, language)
Exemplo n.º 19
0
def _parse_csw(context, repos, record, identifier, pagesize=10):
    """Harvest a remote CSW endpoint into a list of repository records.

    ``record`` is the CSW base URL.  A service-level record describing the
    CSW itself is generated first, then every catalogue record is fetched
    in pages of ``pagesize`` and parsed as Dublin Core.

    Returns the list of record objects (bug fix: the list was previously
    built but never returned).
    """

    from owslib.csw import CatalogueServiceWeb

    recobjs = []  # records
    serviceobj = repos.dataset()

    # if init raises error, this might not be a CSW
    md = CatalogueServiceWeb(record)

    # generate record of service instance
    _set(context, serviceobj, 'pycsw:Identifier', identifier)
    _set(context, serviceobj, 'pycsw:Typename', 'csw:Record')
    _set(context, serviceobj, 'pycsw:Schema', 'http://www.opengis.net/cat/csw/2.0.2')
    _set(context, serviceobj, 'pycsw:MdSource', record)
    _set(context, serviceobj, 'pycsw:InsertDate', util.get_today_and_now())
    _set(context, serviceobj, 'pycsw:XML', md.response)
    _set(context, serviceobj, 'pycsw:AnyText', util.get_anytext(md._exml))
    _set(context, serviceobj, 'pycsw:Type', 'service')
    _set(context, serviceobj, 'pycsw:Title', md.identification.title)
    _set(context, serviceobj, 'pycsw:Abstract', md.identification.abstract)
    _set(context, serviceobj, 'pycsw:Keywords', ','.join(md.identification.keywords))
    _set(context, serviceobj, 'pycsw:Creator', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:Publisher', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:Contributor', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:OrganizationName', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:AccessConstraints', md.identification.accessconstraints)
    _set(context, serviceobj, 'pycsw:OtherConstraints', md.identification.fees)
    _set(context, serviceobj, 'pycsw:Source', record)
    _set(context, serviceobj, 'pycsw:Format', md.identification.type)

    _set(context, serviceobj, 'pycsw:ServiceType', md.identification.type)
    _set(context, serviceobj, 'pycsw:ServiceTypeVersion', md.identification.version)
    _set(context, serviceobj, 'pycsw:Operation', ','.join([d.name for d in md.operations]))
    _set(context, serviceobj, 'pycsw:CouplingType', 'tight')

    recobjs.append(serviceobj)

    # get all supported typenames of metadata
    # so we can harvest the entire CSW

    csw_typenames = 'csw:Record'

    for op in md.operations:
        if op.name == 'GetRecords':
            try:
                csw_typenames = ' '.join(op.parameters['typeNames']['values'])
            except Exception:  # stay with default csw:Record
                pass

    # now get all records
    # get total number of records to loop against

    try:
        md.getrecords(typenames=csw_typenames, resulttype='hits')
        matches = md.results['matches']
    except Exception:  # this is a CSW, but server rejects query
        raise RuntimeError(md.response)

    if pagesize > matches:
        pagesize = matches

    LOGGER.debug('Harvesting %d CSW records' % matches)

    # bug fix: with zero matches pagesize becomes 0 and
    # range(1, 0, 0) raises "range() arg 3 must not be zero"
    if matches == 0:
        return recobjs

    # loop over all catalogue records incrementally
    for r in range(1, matches, pagesize):
        try:
            md.getrecords(typenames=csw_typenames, startposition=r,
                          maxrecords=pagesize)
        except Exception:  # this is a CSW, but server rejects query
            raise RuntimeError(md.response)
        for k, v in md.records.iteritems():
            recobjs.append(_parse_dc(context, repos, etree.fromstring(v.xml)))

    return recobjs
Exemplo n.º 20
0
def _parse_csw(context, repos, record, identifier, pagesize=10):
    """Harvest a remote CSW endpoint into a list of repository records.

    Generates one service record (including capability links), then pages
    through the catalogue ``pagesize`` records at a time, parsing each
    returned entry as Dublin Core.

    Returns the list of record objects (bug fix: the list was previously
    built but never returned).
    """

    from owslib.csw import CatalogueServiceWeb

    recobjs = []  # records
    serviceobj = repos.dataset()

    # if init raises error, this might not be a CSW
    md = CatalogueServiceWeb(record)

    # generate record of service instance
    _set(context, serviceobj, 'pycsw:Identifier', identifier)
    _set(context, serviceobj, 'pycsw:Typename', 'csw:Record')
    _set(context, serviceobj, 'pycsw:Schema',
         'http://www.opengis.net/cat/csw/2.0.2')
    _set(context, serviceobj, 'pycsw:MdSource', record)
    _set(context, serviceobj, 'pycsw:InsertDate', util.get_today_and_now())
    _set(context, serviceobj, 'pycsw:XML', md.response)
    _set(context, serviceobj, 'pycsw:AnyText', util.get_anytext(md._exml))
    _set(context, serviceobj, 'pycsw:Type', 'service')
    _set(context, serviceobj, 'pycsw:Title', md.identification.title)
    _set(context, serviceobj, 'pycsw:Abstract', md.identification.abstract)
    _set(context, serviceobj, 'pycsw:Keywords',
         ','.join(md.identification.keywords))
    _set(context, serviceobj, 'pycsw:Creator', md.provider.contact.name)
    # NOTE(review): Publisher uses provider.name while every other contact
    # field uses provider.contact.name — confirm this is intentional.
    _set(context, serviceobj, 'pycsw:Publisher', md.provider.name)
    _set(context, serviceobj, 'pycsw:Contributor', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:OrganizationName',
         md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:AccessConstraints',
         md.identification.accessconstraints)
    _set(context, serviceobj, 'pycsw:OtherConstraints', md.identification.fees)
    _set(context, serviceobj, 'pycsw:Source', record)
    _set(context, serviceobj, 'pycsw:Format', md.identification.type)

    _set(context, serviceobj, 'pycsw:ServiceType', md.identification.type)
    _set(context, serviceobj, 'pycsw:ServiceTypeVersion',
         md.identification.version)
    _set(context, serviceobj, 'pycsw:Operation',
         ','.join([d.name for d in md.operations]))
    _set(context, serviceobj, 'pycsw:CouplingType', 'tight')

    links = [
        '%s,OGC-CSW Catalogue Service for the Web,OGC:CSW,%s' %
        (identifier, md.url),
        '%s,OGC-CSW Capabilities service (ver 2.0.2),OGC:CSW-2.0.2-http-get-capabilities,%s'
        % (identifier, md.url),
    ]

    _set(context, serviceobj, 'pycsw:Links', '^'.join(links))

    recobjs.append(serviceobj)

    # get all supported typenames of metadata
    # so we can harvest the entire CSW

    csw_typenames = 'csw:Record'

    # now get all records
    # get total number of records to loop against

    try:
        md.getrecords(typenames=csw_typenames, resulttype='hits')
        matches = md.results['matches']
    except Exception:  # this is a CSW, but server rejects query
        raise RuntimeError(md.response)

    if pagesize > matches:
        pagesize = matches

    LOGGER.debug('Harvesting %d CSW records' % matches)

    # bug fix: with zero matches pagesize becomes 0 and
    # range(1, 0, 0) raises "range() arg 3 must not be zero"
    if matches == 0:
        return recobjs

    # loop over all catalogue records incrementally
    for r in range(1, matches, pagesize):
        try:
            md.getrecords(typenames=csw_typenames,
                          startposition=r,
                          maxrecords=pagesize)
        except Exception:  # this is a CSW, but server rejects query
            raise RuntimeError(md.response)
        for k, v in md.records.iteritems():
            recobjs.append(_parse_dc(context, repos, etree.fromstring(v.xml)))

    return recobjs
Exemplo n.º 21
0
def main():
    """Harvest the ONEMA CSW catalogue and mirror its datasets into CKAN.

    Reads CKAN credentials from the configuration file given on the
    command line, pages through the source catalogue (skipping records
    the server cannot serialize) and upserts one CKAN package per
    harvested CSW record.  Returns 0 on success.
    """
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')

    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'onm',
        supplier_title = u"Office national de l'eau et des milieux aquatiques",
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_site_url = u'http://opendata-sie-back.brgm-rec.fr/geosource/srv/eng/csw'  # Recette environment

    if not args.dry_run:
        harvester.retrieve_target()

    # Retrieve short infos of packages in source.
    csw = CatalogueServiceWeb(source_site_url)

    bad_indexes = []
    index = 0
    limit = 50
    record_by_id = {}
    while True:
        try:
            csw.getrecords(maxrecords = limit, startposition = index)
        except Exception:  # deliberate best-effort: probe for the bad record
            if limit == 1:
                # Bad record found. Skip it.
                bad_indexes.append(index)
                index += 1
                limit = 50
            else:
                # Retry one by one to find bad record and skip it.
                limit = 1
        else:
            for id, record in csw.records.iteritems():
                record_by_id[id] = record
            next_index = csw.results['nextrecord']
            if next_index <= index:
                break
            index = next_index

    # Retrieve packages from source.
    formats = set()
    groups = [
        harvester.upsert_group(dict(
            title = u'Environnement',
            )),
        ]
    temporals = set()
    types = set()
    for record_id in record_by_id.iterkeys():
        csw.getrecordbyid(id = [record_id])
        record = csw.records[record_id]

        formats.add(record.format)
        temporals.add(record.temporal)
        types.add(record.type)

        if not args.dry_run:
            package = dict(
                license_id = u'fr-lo',
                notes = u'\n\n'.join(
                    fragment
                    for fragment in (
                        record.abstract,
                        record.source,
                        )
                    if fragment
                    ),
                resources = [
                    dict(
                        description = uri.get('description') or None,
                        format = {
                            'CSV': 'CSV',
                            'ESRI Shapefile': 'SHP',
                            'MIF / MID': 'MIF / MID',  # TODO?
                            'RDF': 'RDF',
                            'SHP': 'SHP',
                            'Txt': 'TXT',
                            'WMS': 'WMS',
                            }.get(record.format, record.format),
                        name = uri.get('name'),
                        url = uri['url'],
                        )
                    for uri in record.uris
                    ],
                tags = [
                    dict(name = strings.slugify(subject))
                    for subject in record.subjects
                    ],
#                territorial_coverage = TODO
                # Datasets have a granularity of either "commune" or "poi". Since the both are indexed the same way, use
                # "poi".
                territorial_coverage_granularity = 'poi',
                title = record.title,
#                url = u'URL TODO',
                )

            log.info(u'Harvested package: {}'.format(package['title']))
            # Bug fix: the package dict defines no 'url' key (the field is
            # commented out above), so package['url'] raised KeyError on
            # every harvested record.  Use .get() until the URL is wired in.
            harvester.add_package(package, harvester.supplier, record.title, package.get('url'), groups = groups)

    if not args.dry_run:
        harvester.update_target()

    log.info(u'Formats: {}'.format(sorted(formats)))
    log.info(u'Temporals: {}'.format(sorted(temporals)))
    log.info(u'Types: {}'.format(sorted(types)))

    return 0
Exemplo n.º 22
0
def _parse_csw(context, repos, record, identifier, pagesize=10):
    """Harvest a remote CSW endpoint into a list of repository records.

    Generates one service-level record, then pages through the catalogue
    ``pagesize`` records at a time, parsing each entry as Dublin Core.
    Returns the list of record objects.
    """

    from owslib.csw import CatalogueServiceWeb

    recobjs = []  # records
    serviceobj = repos.dataset()

    # if init raises, this might not be a CSW endpoint at all
    md = CatalogueServiceWeb(record)

    # generate record of service instance
    _set(context, serviceobj, 'pycsw:Identifier', identifier)
    _set(context, serviceobj, 'pycsw:Typename', 'csw:Record')
    _set(context, serviceobj, 'pycsw:Schema',
         'http://www.opengis.net/cat/csw/2.0.2')
    _set(context, serviceobj, 'pycsw:MdSource', record)
    _set(context, serviceobj, 'pycsw:InsertDate', util.get_today_and_now())
    _set(context, serviceobj, 'pycsw:XML', md.response)
    _set(context, serviceobj, 'pycsw:AnyText', util.get_anytext(md._exml))
    _set(context, serviceobj, 'pycsw:Type', 'service')
    _set(context, serviceobj, 'pycsw:Title', md.identification.title)
    _set(context, serviceobj, 'pycsw:Abstract', md.identification.abstract)
    _set(context, serviceobj, 'pycsw:Keywords',
         ','.join(md.identification.keywords))
    _set(context, serviceobj, 'pycsw:Creator', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:Publisher', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:Contributor', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:OrganizationName',
         md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:AccessConstraints',
         md.identification.accessconstraints)
    _set(context, serviceobj, 'pycsw:OtherConstraints', md.identification.fees)
    _set(context, serviceobj, 'pycsw:Source', record)
    _set(context, serviceobj, 'pycsw:Format', md.identification.type)

    _set(context, serviceobj, 'pycsw:ServiceType', md.identification.type)
    _set(context, serviceobj, 'pycsw:ServiceTypeVersion',
         md.identification.version)
    _set(context, serviceobj, 'pycsw:Operation',
         ','.join([d.name for d in md.operations]))
    _set(context, serviceobj, 'pycsw:CouplingType', 'tight')

    recobjs.append(serviceobj)

    # get all supported typenames of metadata
    # so we can harvest the entire CSW

    csw_typenames = 'csw:Record'

    for op in md.operations:
        if op.name == 'GetRecords':
            try:
                csw_typenames = ' '.join(op.parameters['typeNames']['values'])
            except Exception:  # stay with default csw:Record
                pass

    # now get all records
    # get total number of records to loop against

    md.getrecords(typenames=csw_typenames, resulttype='hits')
    matches = md.results['matches']

    if pagesize > matches:
        pagesize = matches

    # bug fix: with zero matches pagesize becomes 0 and
    # range(1, 0, 0) raises "range() arg 3 must not be zero"
    if matches == 0:
        return recobjs

    # loop over all catalogue records incrementally
    for r in range(1, matches, pagesize):
        md.getrecords(typenames=csw_typenames,
                      startposition=r,
                      maxrecords=pagesize)
        for k, v in md.records.iteritems():
            recobjs.append(_parse_dc(context, repos, etree.fromstring(v.xml)))

    return recobjs
Exemplo n.º 23
0
 def test_GetRecords_summary(self):
     """GetRecords with esn='summary' against the GMD output schema."""
     csw = CatalogueServiceWeb(service)
     csw.getrecords(outputschema=GMD, startposition=1, maxrecords=5, esn="summary")
     # NOTE(review): nrecords is computed but never asserted — consider
     # asserting the expected count as the sibling test_GetRecords does.
     nrecords = len(csw.records)
Exemplo n.º 24
0
# Inspect the result summary of the previous GetRecords call.
csw.results

# Search for keywords like 'birds' or 'fowl'
birds_query_like = PropertyIsLike('dc:subject', '%birds%')
fowl_query_like = PropertyIsLike('dc:subject', '%fowl%')
csw.getrecords2(constraints=[birds_query_like, fowl_query_like])
csw.results

# Search for a specific record:
csw.getrecordbyid(id=['9250AA67-F3AC-6C12-0CB9-0662231AA181'])
# Bug fix: the catalogue object is named ``csw``; ``c`` was undefined here.
csw.records['9250AA67-F3AC-6C12-0CB9-0662231AA181'].title

# Search with a CQL query

csw.getrecords(cql='csw:AnyText like "%birds%"')

# Insert a metadata record read from a local file
csw.transaction(ttype='insert',
                typename='gmd:MD_Metadata',
                record=open("file.xml").read())

# update ALL records
csw.transaction(ttype='update',
                typename='csw:Record',
                propertyname='dc:title',
                propertyvalue='New Title')
# update records satisfying keywords filter
csw.transaction(ttype='update',
                typename='csw:Record',
                propertyname='dc:title',
                propertyvalue='New Title',
Exemplo n.º 25
0
    print("Usage: %s <source_catalogue_url> <destination_catalogue_url> [maxrecords]" % sys.argv[0])
    sys.exit(1)

# Source and destination catalogues come from the command line.
src = CatalogueServiceWeb(sys.argv[1])
dest = CatalogueServiceWeb(sys.argv[2])

if len(sys.argv) == 4:
    # Bug fix: argv entries are strings; the page size must be an int.
    maxrecords = int(sys.argv[3])

while stop == 0:
    if flag == 0:  # first run, start from 0
        startposition = 0
        flag = 1  # bug fix: without this the loop re-reads page 0 forever
    else:  # subsequent run, startposition is now paged
        startposition = src.results["nextrecord"]

    src.getrecords(esn="brief", startposition=startposition, maxrecords=maxrecords)

    print(src.results)

    if (
        src.results["nextrecord"] == 0
        or src.results["returned"] == 0
        or src.results["nextrecord"] > src.results["matches"]
    ):  # end the loop, exhausted all records
        stop = 1
        break

    # harvest each record to destination CSW
    for i in list(src.records):
        source = "%s?service=CSW&version=2.0.2&request=GetRecordById&id=%s" % (sys.argv[1], i)
        dest.harvest(source=source, resourcetype="http://www.isotc211.org/2005/gmd")
Exemplo n.º 26
0
from owslib.csw import CatalogueServiceWeb
# Connect to the IBGE GeoNetwork CSW endpoint.
csw = CatalogueServiceWeb(
    'http://www.metadados.geo.ibge.gov.br/geonetwork_ibge/srv/por/csw')
# Interactive transcript: the next expression evaluates to 'CSW'.
csw.identification.type
'CSW'
# Operations advertised by the capabilities document.
[op.name for op in csw.operations]
# Allowed values for the GetRecords resultType parameter.
csw.getdomain('GetRecords.resultType')
# Fetch every record matching the 'bcim' keyword.
csw.getrecords(keywords=[('bcim')], maxrecords=10000)
csw.results
for rec in csw.records:
    print csw.records[rec].title
Exemplo n.º 27
0
# Request ISO 19139 output when the 'iso' schema was selected.
# NOTE(review): outputschema is assigned but not used in this excerpt;
# getrecords below receives the raw ``schema`` value — confirm upstream.
if schema == 'iso':
    outputschema = 'http://www.isotc211.org/2005/gmd'

# init
# url, lang, version and the request parameters below are defined
# earlier in the script (outside this excerpt).
c = CatalogueServiceWeb(url, lang, version)

# Dispatch on the requested CSW operation.
if request == 'GetCapabilities':
    pass
elif request == 'DescribeRecord':
    c.describerecord(typename)
elif request == 'GetRecordById':
    c.getrecordbyid([id])
elif request == 'GetDomain':
    c.getdomain(dname, dtype)
elif request == 'GetRecords':
    c.getrecords(qtype, [keyword], bbox, esn, sortby, schema)

if print_request is True:  # print the request
    print c.request

if validate is True:  # do XML validation
    print 'Validating request XML'
    if util.xmlvalid(c.request, csw.schema_location.split()[1]) is True:
        print 'request is valid XML'
    else:
        print 'request is NOT valid XML'

# print response
print c.response
Exemplo n.º 28
0

# In[5]:

#This cell examines and provides information on the number of model records
#   available via each catalog endpoint
records1 = []
titles1 = []
lenrecords1 = []
lentitles1 = []
k = 0  # NOTE(review): this counter is never used below
for endpoint in endpoints[:3]:
    csw = CatalogueServiceWeb(endpoint,timeout=60)
    for model_string in model_strings:
        try:
            # 'full' element set name requests complete metadata records
            csw.getrecords(keywords = [model_string], maxrecords = 60, esn = 'full')
            records1.append(csw.results)
        except Exception, ex1:
            records1.append('Error')
        try:
            for rec in csw.records:
                titles1.append(csw.records[rec].title)
        except Exception, ex1:
                titles1.append('Error')
        # lengths are appended per keyword, using only the LAST entry
        lentitles1.append(len(titles1[-1]))
        lenrecords1.append(len(records1[-1]))

# NOTE(review): list lengths differ (per-keyword vs per-endpoint);
# zip silently truncates to the shortest.
zipvar1 = zip(endpoints, records1,lenrecords1, titles1,lentitles1)
df = DataFrame(data = zipvar1, columns = ['endpoints', 'records1','lenrecords1', 'titles1','lentitles1'])
df.head()
Exemplo n.º 29
0
csw = CatalogueServiceWeb(endpoint)
csw.version

# <codecell>

import iris

# <codecell>

# Operations advertised by the catalogue's capabilities document.
[op.name for op in csw.operations]

# <codecell>

# bbox order per owslib getrecords: [minx, miny, maxx, maxy] in lon/lat
bbox=[-141,42,-52,84]
#bbox=[-71.5, 39.5, -63.0, 46]
csw.getrecords(keywords=['sea_water_temperature'],bbox=bbox,maxrecords=20)
#csw.getrecords(keywords=['sea_water_temperature'],maxrecords=20)
csw.results

# <codecell>

# Show the identifier and abstract of each returned record.
for rec,item in csw.records.iteritems():
    print rec
    print item.abstract

# <codecell>

# Pick out one OceanSITES record by its identifier.
a=csw.records['data/oceansites/DATA/STATION-M/OS_STATION-M-1_194810_D_CTD.nc']

# <codecell>
Exemplo n.º 30
0
        % sys.argv[0]
    sys.exit(1)

src = CatalogueServiceWeb(sys.argv[1])
dest = CatalogueServiceWeb(sys.argv[2])

if len(sys.argv) == 4:
    maxrecords = sys.argv[3]

while stop == 0:
    if flag == 0:  # first run, start from 0
        startposition = 0
    else:  # subsequent run, startposition is now paged
        startposition = src.results['nextrecord']

    src.getrecords(esn='brief', startposition=startposition, maxrecords=maxrecords)

    print src.results

    if src.results['nextrecord'] == 0 \
        or src.results['returned'] == 0 \
        or src.results['nextrecord'] > src.results['matches']:  # end the loop, exhausted all records
        stop = 1
        break

    # harvest each record to destination CSW
    for i in list(src.records):
        source = '%s?service=CSW&version=2.0.2&request=GetRecordById&id=%s' % \
            (sys.argv[1], i)
        dest.harvest(source=source, \
            resourcetype='http://www.isotc211.org/2005/gmd')
Exemplo n.º 31
0
csw = CatalogueServiceWeb(endpoint)
csw.version

# <codecell>

import iris

# <codecell>

# Operations advertised by the catalogue's capabilities document.
[op.name for op in csw.operations]

# <codecell>

# bbox order per owslib getrecords: [minx, miny, maxx, maxy] in lon/lat
bbox = [-141, 42, -52, 84]
#bbox=[-71.5, 39.5, -63.0, 46]
csw.getrecords(keywords=['sea_water_temperature'], bbox=bbox, maxrecords=20)
#csw.getrecords(keywords=['sea_water_temperature'],maxrecords=20)
csw.results

# <codecell>

# Show the identifier and abstract of each returned record.
for rec, item in csw.records.iteritems():
    print rec
    print item.abstract

# <codecell>

# Pick out one OceanSITES record by its identifier.
a = csw.records[
    'data/oceansites/DATA/STATION-M/OS_STATION-M-1_194810_D_CTD.nc']

# <codecell>
Exemplo n.º 32
0
# Request ISO 19139 output when the 'iso' schema was selected.
# NOTE(review): outputschema is assigned but not used in this excerpt;
# getrecords below receives the raw ``schema`` value — confirm upstream.
if schema == 'iso':
  outputschema = 'http://www.isotc211.org/2005/gmd'

# init
# url, lang, version and the request parameters below are defined
# earlier in the script (outside this excerpt).
c = CatalogueServiceWeb(url, lang, version)

# Dispatch on the requested CSW operation.
if request == 'GetCapabilities':
    pass
elif request == 'DescribeRecord':
    c.describerecord(typename)
elif request == 'GetRecordById':
    c.getrecordbyid([id])
elif request == 'GetDomain':
    c.getdomain(dname, dtype)
elif request == 'GetRecords':
    c.getrecords(qtype, [keyword], bbox, esn, sortby, schema)

if print_request is True: # print the request
    print c.request

if validate is True: # do XML validation
    print 'Validating request XML'
    if util.xmlvalid(c.request, csw.schema_location.split()[1]) is True:
        print 'request is valid XML'
    else:
        print 'request is NOT valid XML'

# print response
print c.response


Exemplo n.º 33
0
# In[10]:

#This cell searches the endpoints using the revised list of variable strings from the previous cell and returns basic information on the number of records and the titles of data sets available via each endpoint

variables1 = []  # NOTE(review): never populated in this cell
records1 = []
titles1 = []
lenrecords1 = []
lentitles1 = []

for endpoint in endpoints[:3]:
    csw = CatalogueServiceWeb(endpoint,timeout=60)
    for v in RecVarsMerg[:2]:
        try:
            # 'full' element set name requests complete metadata records
            csw.getrecords(keywords = [v], maxrecords = 60, esn = 'full')
            records1.append(csw.results)
        except Exception, ex1:
            records1.append('Error')
        try:
            for rec in csw.records:
                titles1.append(csw.records[rec].title)
        except Exception, ex1:
            titles1.append('Error')
    # lengths recorded once per endpoint, from the LAST appended entry only
    lentitles1.append(len(titles1[-1]))
    lenrecords1.append(len(records1[-1]))

# NOTE(review): list lengths differ; zip truncates to the shortest list.
zipvar1 = zip(endpoints, records1,lenrecords1, titles1,lentitles1)
df = DataFrame(data = zipvar1, columns = ['endpoints','records1','lenrecords1', 'titles1','lentitles1'])
df.head()
Exemplo n.º 34
0
# In[11]:

#This cell examines and provides information on the number of model records
#   available via each catalog endpoint
records1 = []
records2 = []  # NOTE(review): never populated in this cell
titles1 = []
titles2=[]  # NOTE(review): never populated in this cell
lenrecords1 = []
lentitles1 = []

for endpoint in endpoints:
    csw = CatalogueServiceWeb(endpoint,timeout=100)
    for model_string in model_strings:
        try:
            csw.getrecords(keywords = [model_string], maxrecords = 100)
            records1.append(csw.results)
        except Exception, ex1:
            records1.append('Error')
        try:
            for rec in csw.records:
                titles1.append(csw.records[rec].title)
        except Exception, ex1:
                titles1.append('Error')
    # lengths recorded once per endpoint, from the LAST appended entry only
    lentitles1.append(len(titles1[-1]))
    lenrecords1.append(len(records1[-1]))

# NOTE(review): list lengths differ; zip truncates to the shortest list.
zipvar1 = zip(endpoints, records1,lenrecords1, titles1,lentitles1)
df = DataFrame(data = zipvar1, columns = ['endpoints', 'records1','lenrecords1', 'titles1','lentitles1'])
df.head()
Exemplo n.º 35
0
 def test_GetRecords_dataset(self):
     """GetRecords constrained to qtype='dataset' in the GMD schema."""
     csw = CatalogueServiceWeb(service)
     csw.getrecords(qtype="dataset", outputschema=GMD, startposition=1, maxrecords=5)
     # NOTE(review): nrecords is computed but never asserted — consider
     # asserting the expected count as the sibling test_GetRecords does.
     nrecords = len(csw.records)
Exemplo n.º 36
0
args = parser.parse_args()

# optional proxy handler for urllib2 and owslib
if bool(args.http_proxy):
    logging.debug('using proxy %s'%args.http_proxy)
    proxyHandler = urllib2.ProxyHandler({"http" : args.http_proxy})
    opener = urllib2.build_opener(proxyHandler)
    urllib2.install_opener(opener)

# csw connection
# --extent is a comma-separated "minx,miny,maxx,maxy" string
bbox = [float(x) for x in args.extent.split(',')]
logging.debug('connecting to %s'%args.csw)
csw = CatalogueServiceWeb(args.csw)
logging.info('retrieving %s datasets, bbox=%s'%(args.maxrecords, bbox))
csw.getrecords(qtype="dataset", keywords=[], esn="full", maxrecords=args.maxrecords, bbox=bbox)
logging.info('retrieved %s records'%len(csw.records))
# warn when the page limit was hit: there may be more matching datasets
if len(csw.records)>=args.maxrecords:
    logging.warn('maxrecords reached')


# shapefile preparation
# one polygon per record; attribute table declared below
shp = shapefile.Writer(shapefile.POLYGON)
shp.autoBalance = 1
shp.field('id')
shp.field('title')
shp.field('layerurl')
shp.field('layername')
shp.field('layertitle')
shp.field('subjects')
Exemplo n.º 37
0
class HarvestNode(NgdsDataObject):
    """Stores information about harvest endpoints"""
    csw = None

    def __init__(self, url, **kwargs):
        # A URL must be given
        p = urlparse(url)
        self.url = urlunparse((p.scheme, p.netloc, p.path, "", "",
                               ""))  # Strip URL to just domain + path
        self.frequency = kwargs.get(
            'frequency',
            'manual')  # frequency should be one of manual|daily|weekly|monthly
        self.title = kwargs.get(
            'title', 'No Title Was Given')  # A title for bookkeeping
        self.node_admin_id = kwargs.get(
            'node_admin_id', None
        )  # Foreign Key to a responsible_party who maintains the remote node
        #self.csw = CatalogueServiceWeb(self.url) # owslib CSW class provides mechanisms for making CSW requests

    def setup_csw(self):
        self.csw = CatalogueServiceWeb(self.url)

    def do_harvest(self):
        """Perform a harvest from another CSW server"""
        if self.csw == None:
            self.setup_csw()
        self.get_records()  # Do the first GetRecords request
        ids = self.csw.records.keys()  # Start an array to house all of the ids
        print "next: %s, total: %s" % (self.csw.results["nextrecord"],
                                       self.csw.results["matches"])

        while self.csw.results["nextrecord"] < self.csw.results[
                "matches"] and self.csw.results[
                    "nextrecord"] != 0:  # Once next_record > number_matched, we've gotten everything
            self.get_records(
                self.csw.results["nextrecord"], self.csw.results["returned"]
            )  # Get another set, starting from next_record from previous response
            ids += self.csw.records.keys()  # Add new ids to the array
            print "next: %s, total: %s" % (self.csw.results["nextrecord"],
                                           self.csw.results["matches"])

        self.parse_records(ids)  # Gather the records themselves

    def parse_records(self, ids):
        """Perform as many GetRecordById requests as needed"""
        print "Gathered %s IDs" % str(len(ids))
        for record_id in ids:
            self.get_record_by_id(record_id)
            rec = HarvestedRecord.from_md_metadata(self.csw.records[record_id],
                                                   self)

    def get_record_by_id(self, record_id):
        """Get a single record, by ID"""
        params = {
            "id": [record_id],
            "outputschema": "http://www.isotc211.org/2005/gmd"
        }
        self.csw.getrecordbyid(**params)  # Puts response in self.csw.records

    def get_records(self, start_position=1, max_records=1000):
        """Perform a GetRecords request"""
        params = {
            "typenames": "gmd:MD_Metadata",
            "outputschema": "http://www.isotc211.org/2005/gmd",
            "startposition": start_position,
            "maxrecords": max_records,
            "esn": "brief"
        }
        self.csw.getrecords(**params)  # Puts results in self.csw.records
Exemplo n.º 38
0
def main():
    """Harvest the Grand Lyon CSW catalogue into a CKAN repository.

    Parses command-line arguments, loads and validates the harvester
    configuration, pages through every record of the source CSW service
    (skipping records the server fails to serve), converts each one to a
    CKAN package and uploads it unless ``--dry-run`` was given.

    Returns 0 on success, usable as the process exit status.
    """
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')

    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    # Read and validate the [Etalab-CKAN-Harvesters] configuration section.
    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'gl',
        supplier_title = u"Grand Lyon",
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_site_url = u'http://catalogue.data.grandlyon.com/geosource/srv/fr/csw'

    if not args.dry_run:
        harvester.retrieve_target()

    # Retrieve short infos of packages in source.
    csw = CatalogueServiceWeb(source_site_url)

    # Page through the catalogue. When a page request fails, retry record by
    # record (limit = 1) to locate the offending entry, remember its index
    # and skip it, then resume normal-sized pages.
    bad_indexes = []
    index = 0
    limit = 50
    record_by_id = {}
    while True:
        try:
            csw.getrecords(maxrecords = limit, startposition = index)
        except Exception:
            # Narrowed from a bare "except:" so that KeyboardInterrupt and
            # SystemExit still abort the harvest instead of being swallowed.
            if limit == 1:
                # Bad record found. Skip it.
                bad_indexes.append(index)
                index += 1
                limit = 50
            else:
                # Retry one by one to find bad record and skip it.
                limit = 1
        else:
            for rec_id, record in csw.records.iteritems():
                record_by_id[rec_id] = record
            next_index = csw.results['nextrecord']
            if next_index <= index:
                break
            index = next_index

    # Retrieve packages from source.
    formats = set()
    licenses_url = set()
    protocols = set()
    rights = set()
    temporals = set()
    types = set()
    for record_id in record_by_id.iterkeys():
        # Fetch each record twice: once as Dublin Core, once as ISO 19139
        # (gmd) -- some information is only present in one of the schemas.
        csw.getrecordbyid(id = [record_id])
        dc_record = csw.records[record_id]
        csw.getrecordbyid(id = [record_id], outputschema = 'http://www.isotc211.org/2005/gmd')
        gmd_record = csw.records.get(record_id)

        format = dc_record.format
        if format is not None:
            # Keep only the bare format name, dropping any " (...)" suffix.
            format = format.split(u' (', 1)[0]
        formats.add(format)

        copyright = dc_record.rights
        if copyright and isinstance(copyright, list):
            copyright = tuple(copyright)
            rights.add(copyright)

        # Extract the user-defined maintenance frequency from the ISO
        # metadata, when present.
        if gmd_record is None:
            frequency = None
        else:
            for frequency_xml in etree.fromstring(gmd_record.xml).xpath('./gmd:identificationInfo'
                    '/gmd:MD_DataIdentification/gmd:resourceMaintenance/gmd:MD_MaintenanceInformation'
                    '/gmd:userDefinedMaintenanceFrequency/gts:TM_PeriodDuration',
                    namespaces = namespaces):
                frequency = frequency_xml.text
                break
            else:
                frequency = None
        if frequency is not None:
            assert frequency in frequency_by_code, 'Unknown frequency: {}'.format(frequency)
            frequency = frequency_by_code[frequency]
            # NOTE(review): "frequency" is translated here via
            # frequency_by_code, yet the package dict below maps it again
            # from the raw ISO code 'P0Y0M0DT0H1M0S' -- one of the two
            # mappings looks redundant or dead. TODO confirm.

        for uri in dc_record.uris:
            if uri['url'].startswith('http://opendata.data.grandlyon.com/Licence'):
                licenses_url.add(uri['url'])
            protocols.add(uri['protocol'])

        # First subject becomes a CKAN group; all groups get the common
        # "Territoires et Transports" group appended.
        subjects = [
            subject
            for subject in dc_record.subjects
            if subject != 'OpenData'
            ]
        groups = [
            harvester.upsert_group(dict(
                title = subjects[0],
                )),
            ] if subjects else []
        groups.append(harvester.upsert_group(dict(
            title = u'Territoires et Transports',
            )))
        # NOTE(review): "groups" and this "tags" list are never used below
        # (the package rebuilds tags from dc_record.subjects, unfiltered).
        # Kept as-is because upsert_group has side effects on the target.
        tags = [
            dict(name = strings.slugify(subject))
            for subject in subjects
            ]

        # Build the package resources, preferring the richer ISO metadata.
        related = []
        if gmd_record is None:
            # Dublin Core only: keep direct download links hosted on the
            # open-data server, excluding the licence document itself.
            resources = [
                dict(
                    description = uri.get('description') or None,
                    format = {
                        'application/pdf': 'PDF',
                        'application/zip': 'ZIP',
                        'pdf': 'PDF',
                        'text/csv': 'CSV',
                        'text/plain': 'TXT',
                        }.get(format, format),
                    name = uri.get('name') or None,
                    url = uri['url'],
                    )
                for uri in dc_record.uris
                if uri.get('protocol') in ('WWW:DOWNLOAD-1.0-http--download', 'WWW:LINK-1.0-http--link')
                    and uri['url'].startswith('http://opendata.data.grandlyon.com/')
                    and uri['url'] != 'http://opendata.data.grandlyon.com/Licence_ODbL_Grand_Lyon.pdf'
                ]
        else:
            kml_resource = False
            resources = []
            for online in gmd_record.distribution.online:
                # Skip internal catalogue, file-system, JDBC and licence URLs.
                if online.url.startswith((
                        'http://catalogue.data.grandlyon.com/geosource/srv/en/resources.get?id=',
                        'file:',
                        'jdbc:',
                        )) \
                        or online.url == 'http://opendata.data.grandlyon.com/Licence_ODbL_Grand_Lyon.pdf':
                    continue
                if online.protocol == 'OGC:WFS':
                    # Derive a KML listing resource (only once per record) and
                    # a GML GetFeature resource from the WFS endpoint.
                    if not kml_resource:
                        resources.append(dict(
                            description = online.description or None,
                            format = 'KML',
                            name = online.name or None,
                            url = 'http://kml.data.grandlyon.com/grandlyon/?request=list&typename={}'.format(
                                online.name),
                            ))
                        kml_resource = True
                    if '?' not in online.url:
                        resources.append(dict(
                            description = online.description or None,
                            format = 'GML',
                            name = online.name or None,
                            url = u'{}?SERVICE={}&REQUEST=GetFeature&VERSION=1.1.0&typename={}'.format(online.url,
                                online.protocol.split(':', 1)[1], online.name),
                            ))
                elif online.protocol == 'OGC:WMS':
                    if '?' not in online.url:
                        # Use the WMS GetMap endpoint to build a thumbnail
                        # "related" item for the package.
                        bounding_box = gmd_record.identification.extent.boundingBox
                        related.append(dict(
                            image_url = u'{}?SERVICE={}&REQUEST=GetMap&VERSION=1.1.1&LAYERS={}&FORMAT=image/png'
                                u'&SRS=EPSG:4326&BBOX={},{},{},{}&WIDTH=400&HEIGHT=300'.format(online.url,
                                online.protocol.split(':', 1)[1], online.name, bounding_box.minx, bounding_box.miny,
                                bounding_box.maxx, bounding_box.maxy),
                            title = u'Vignette',
                            type = u'visualization',
                            # url = None,
                            ))
                resources.append(dict(
                    description = online.description or None,
                    format = {
                        'DB:POSTGIS': 'POSTGIS',
                        'FILE:RASTER': 'RASTER',
                        'OGC:WCS': 'WCS',
                        'OGC:WFS': 'WFS',
                        'OGC:WMS': 'WMS',
                        'WWW:DOWNLOAD-1.0-http--download': None,
                        'WWW:LINK-1.0-http--link': None,
                        }[online.protocol],
                    name = online.name or None,
                    url = online.url,
                    ))
        temporals.add(dc_record.temporal)
        types.add(dc_record.type)

        if args.dry_run:
            log.info(u'Harvested package: {}'.format(dc_record.title))
        else:
            package = dict(
                frequency = {
                    'P0Y0M0DT0H1M0S': u"ponctuelle",
                    }.get(frequency),
                license_id = {
                    'copyright': None,
                    ('Licence ODbL GRAND LYON', u"Pas de restriction d'accès public"): u'odc-odbl',
                    'license': None,
                    }.get(copyright),
                notes = u'\n\n'.join(
                    fragment
                    for fragment in (
                        dc_record.abstract,
                        dc_record.source,
                        )
                    if fragment
                    ),
                resources = resources,
                tags = [
                    dict(name = strings.slugify(subject))
                    for subject in dc_record.subjects
                    ],
#                territorial_coverage = TODO
                title = dc_record.title,
#                TODO: Use this URL once Grand Lyon is ready to use it. Before end of year.
#                url = u'http://smartdata.grandlyon.com/single/{}'.format(record_id),
                url = u'http://smartdata.grandlyon.com/',
                )

#            if gmd_record is not None:
#                for graphic_filename_xml in etree.fromstring(gmd_record.xml).xpath('./gmd:identificationInfo'
#                        '/gmd:MD_DataIdentification/gmd:graphicOverview'
#                        '/gmd:MD_BrowseGraphic[gmd:fileDescription/gco:CharacterString="large_thumbnail"]'
#                        '/gmd:fileName/gco:CharacterString',
#                        namespaces = namespaces):
#                    related.append(dict(
#                        image_url = urlparse.urljoin(base_url, unicode(graphic_filename_xml.text)),
#                        title = u'Vignette',
#                        type = u'visualization',
#                        # url = TODO,
#                        ))

            log.info(u'Harvested package: {}'.format(package['title']))
            harvester.add_package(package, harvester.supplier, dc_record.title, package['url'],
                related = related or None)

    if not args.dry_run:
        harvester.update_target()

    # Log the distinct values observed across the whole harvest, to help
    # spot new formats/licenses/protocols that need a mapping above.
    log.info(u'Formats: {}'.format(sorted(formats)))
    log.info(u'Licenses: {}'.format(sorted(licenses_url)))
    log.info(u'Protocols: {}'.format(sorted(protocols)))
    log.info(u'Rights: {}'.format(sorted(rights)))
    log.info(u'Temporals: {}'.format(sorted(temporals)))
    log.info(u'Types: {}'.format(sorted(types)))

    return 0