class GeocatCatalogueServiceWeb(object):
    """Thin wrapper around an OWSLib CSW client for the geocat catalogue.

    Records are retrieved in the CHE metadata schema (module-level
    ``CHE_SCHEMA``).
    """

    def __init__(self, url):
        self.csw = CatalogueServiceWeb(url)
        self.schema = CHE_SCHEMA

    def get_geocat_id_from_csw(self,
                               cqlquery=CQL_QUERY_DEFAULT,
                               cqlterm=CQL_SEARCH_TERM_DEFAULT):
        """Page through GetRecords results and collect all record ids.

        Raises:
            CswNotFoundError: when the query matches no datasets.
        """
        harvest_query = PropertyIsEqualTo(cqlquery, cqlterm)
        nextrecord = 0
        record_ids = []
        while nextrecord is not None:
            self.csw.getrecords2(constraints=[harvest_query],
                                 maxrecords=50,
                                 startposition=nextrecord)
            if self.csw.response is None or self.csw.results['matches'] == 0:
                raise CswNotFoundError(
                    "No dataset found for harvest query {}".format(
                        harvest_query))
            if self.csw.results['returned'] > 0:
                if self.csw.results['nextrecord'] > 0:
                    nextrecord = self.csw.results['nextrecord']
                else:
                    nextrecord = None
                # avoid shadowing the builtin `id` with a loop variable
                record_ids.extend(self.csw.records.keys())
            else:
                # BUGFIX: a page reporting matches > 0 but returned == 0
                # previously left `nextrecord` unchanged, re-issuing the
                # same request forever; terminate the loop instead.
                nextrecord = None
        return record_ids

    def get_record_by_id(self, geocat_id):
        """Fetch one record by id; return the raw XML response or None."""
        self.csw.getrecordbyid(id=[geocat_id], outputschema=self.schema)
        # empty/None responses both map to None, as before
        return self.csw.response or None
Пример #2
0
 def mdcount(self, cswurl, constraints=None, startrecord=0, maxharvest=10):
     """Queries the csw and count md matching constraints.

     Issues a resulttype="hits" GetRecords request (no record bodies are
     transferred) and returns the OWSLib results dict
     (matches/returned/nextrecord).
     """
     # BUGFIX: `constraints=[]` was a shared mutable default argument;
     # use the None sentinel instead (behaviour is unchanged).
     constraints = [] if constraints is None else constraints
     csw = CatalogueServiceWeb(cswurl, skip_caps=True)
     csw.getrecords2(
         esn="brief", constraints=constraints, startposition=startrecord, maxrecords=maxharvest, resulttype="hits"
     )
     return csw.results
 def test_GetRecords_summary(self):
     """GetRecords with the 'summary' element-set name returns records."""
     catalogue = CatalogueServiceWeb(service)
     catalogue.getrecords2(
         outputschema=GMD, startposition=1, maxrecords=5, esn="summary")
     nrecords = len(catalogue.records)
def get_datasets_from_csw(csw_endpoint, extended):
    """Fetch up to 1000 full records from a CSW endpoint (Python 2).

    When *extended* is true, the same records are re-requested in the
    ISO (gmd) output schema and extra attributes are merged onto the
    already-parsed Dublin Core records via XPath lookups.
    Returns {'datasets': <list of records>}.
    """
    print 'Retrieving data sets from %s' % csw_endpoint

    csw = CatalogueServiceWeb(csw_endpoint)
    csw.getrecords2(esn='full', maxrecords=1000)
    parsed_records = csw.records.values()
    if extended:
        #request the data a second time in an encoding with different information
        #manually parse the second response and join the information in to the existing records 
        csw.getrecords2(esn='full', maxrecords=1000, outputschema=namespaces['gmd'])
        unparsed_response = csw.response
        root = etree.XML(unparsed_response)

        # record_xpath_template / attribute_to_xpath_fragment / namespaces
        # are module-level globals defined elsewhere in the file
        for record in parsed_records:
            record_id = record.identifier
            xpath_record_fragment = record_xpath_template.format(record_id)
            for attribute, xpath_fragment in attribute_to_xpath_fragment.iteritems():
                xpath_attribute_fragment = xpath_record_fragment + xpath_fragment
                value = root.xpath(xpath_attribute_fragment, namespaces=namespaces)
                if len(value) != 0:
                    #unpack list
                    value = value[0]
                else:
                    value = ""

                setattr(record, attribute, value)

            print(vars(record))

    return {
            'datasets' : parsed_records,
    }
 def test_GetRecords_dataset(self):
     """GetRecords constrained to dc:type == 'dataset'."""
     catalogue = CatalogueServiceWeb(service)
     dataset_filter = [PropertyIsEqualTo("dc:type", "dataset")]
     catalogue.getrecords2(
         constraints=dataset_filter,
         outputschema=GMD,
         startposition=1,
         maxrecords=5)
     nrecords = len(catalogue.records)
Пример #6
0
def deleteMetadata(name, login, password, url, workspace, verbose=False):
    # Deletes every metadata record associated with the keyword
    # "<workspace>:<name>" from a GeoNetwork CSW publication endpoint.
    # Returns True only if all deletions succeeded.

    url_csw = url + "/geonetwork/srv/fre/csw-publication"
    keyword = workspace + ":" + name  # e.g. keyword = geosync-restreint:baies_metadata__baies_metadata

    # Connect to a CSW, and inspect its properties:
    from owslib.csw import CatalogueServiceWeb
    csw = CatalogueServiceWeb(url_csw,
                              skip_caps=True,
                              username=login,
                              password=password)
    # delete the metadata related to "keyword" / the geoserver layer

    # TODO do this without owslib, via the REST API
    #      e.g. https://georchestra-mshe.univ-fcomte.fr/geonetwork/srv/fre/xml.search?any=geosync-ouvert:fouilles_chailluz__mobilier_pros
    #      that search is strict (exact match): searching 'myworkspace:foo' does not find 'myworkspace:foobar'

    from owslib.fes import PropertyIsEqualTo, PropertyIsLike
    myquery = PropertyIsEqualTo(
        'csw:AnyText', keyword
    )  # TODO check the search really is strict, otherwise unwanted results may be returned (e.g. 'myworkspace:foobar' when searching 'myworkspace:foo')
    if verbose:
        print "INFO CatalogueServiceWeb(" + url_csw + ",...) ... PropertyIsEqualTo('csw:AnyText'," + keyword + ")"
    csw.getrecords2(constraints=[myquery], maxrecords=10)
    resultat = csw.results
    if verbose:
        print "INFO " + str(
            csw.results['matches']) + " fiche(s) de metadata trouvée(s)"

    if csw.results['matches'] > 1:
        print "WARNING plus d'1 fiche de metadata trouvée pour " + keyword

    result = True  # if at least 1 error occurs, return False
    for rec in csw.records:
        try:
            csw.transaction(
                ttype='delete',
                typename='MD_Metadata',
                identifier=csw.records[rec].identifier
            )  # apparently works for metadata of type gmd:MD_Metadata
            result = True and result
        except Exception as e:
            sys.stderr.write("ERROR suppression de metadata pour " + keyword +
                             " échouée : " + e.message + "\n")
            print "ERROR suppression de metadata pour " + keyword + " échouée"
            result = False
        try:
            identifier = csw.records[
                rec].identifier  # raises if the record has no identifier
            # titre=csw.records[rec].title
            print "OK suppression de metadata " + keyword + " (" + identifier + ") réussie"
        except Exception as e:
            print "OK suppression de metadata réussie"
            print "WARNING " + e.message

    return result
Пример #7
0
 def test_GetRecords(self):
     """GetRecords smoke test.

     NB: skipped — no records have been set up on the test service.
     """
     raise SkipTest()
     catalogue = CatalogueServiceWeb(service)
     catalogue.getrecords2(outputschema=GMD, startposition=1, maxrecords=5)
     nrecords = len(catalogue.records)
     assert nrecords == 5, nrecords
     for ident in catalogue.records:
         identifiers.append(ident)
         record = catalogue.records[ident]
         assert isinstance(record, MD_Metadata), (ident, record)
 def test_GetRecords(self):
     """GetRecords smoke test (skipped: no records loaded on the service)."""
     raise SkipTest()
     cat = CatalogueServiceWeb(service)
     cat.getrecords2(outputschema=GMD, startposition=1, maxrecords=5)
     nrecords = len(cat.records)
     assert nrecords == 5, nrecords
     for ident in cat.records:
         identifiers.append(ident)
         assert isinstance(cat.records[ident],
                           MD_Metadata), (ident, cat.records[ident])
Пример #9
0
def csw_ajax(request, *args, **kwargs):
    """AJAX endpoint: run a paged keyword query against a session CSW.

    Reads csw_url/user/password/keywords from the Django session, issues
    a GetRecords request paged by the GET params ``offset``/``perPage``,
    and returns the matching ISO records as JSON.  Any failure (missing
    session keys aside) yields a 500 response.
    """
    if request.method == 'GET':
        csw_url = request.session['csw_url']
        user = request.session['user']
        password = request.session['password']
        keywords = request.session['keywords']
        # '%' is the CSW wildcard: match the keywords anywhere in the text
        keywords_query = [fes.PropertyIsLike(
            'csw:AnyText', '%%%s%%' % keywords)]
        if not csw_url:
            return HttpResponseServerError()

        try:
            csw = CatalogueServiceWeb(
                csw_url,
                username=user,
                password=password)
            result = csw.identification.type
            if result == 'CSW':
                offset = int(request.GET['offset'])
                per_page = int(request.GET['perPage'])
                csw.getrecords2(
                    typenames='gmd:MD_Metadata',
                    esn='full',
                    outputschema='http://www.isotc211.org/2005/gmd',
                    constraints=keywords_query,
                    startposition=offset,
                    maxrecords=per_page)
                result = []
                for key in csw.records:
                    rec = csw.records[key]
                    res = {}
                    if isinstance(rec, MD_Metadata):
                        res['id'] = rec.identifier
                        res['title'] = rec.identification.title
                        # InaSAFE keywords travel in the ISO
                        # supplemental-information field
                        res['inasafe_keywords'] = rec.identification.\
                            supplementalinformation
                        if res['inasafe_keywords']:
                            res['inasafe_layer'] = (
                                '<inasafe_keywords/>' in
                                res['inasafe_keywords'])
                        result.append(res)
            json_result = {
                'records': result,
                'queryRecordCount': csw.results['matches'],
                'totalRecordCount': csw.results['matches']
            }
            return JsonResponse(json_result, safe=False)
        except Exception as e:
            LOGGER.exception(e)
            return HttpResponseServerError()

    return HttpResponseServerError()
Пример #10
0
    def mdsearch(self, cswurl, esn="summary", constraints=[], startrecord=0, maxrecords=10, maxharvest=20):
        """Queries a csw to retrieve md ids matching constraints.

        Pages through GetRecords results until *maxharvest* records have
        been fetched or the server reports the result set exhausted.
        Returns a dict mapping record id -> record.  (Python 2 code:
        uses dict.iteritems.)
        """
        tstart = datetime.datetime.now()
        records = {}

        logging.info("searching max %s md from %s" % (maxharvest, cswurl))
        csw = CatalogueServiceWeb(cswurl, skip_caps=True)
        first = True
        nextrecord = startrecord
        count = 0

        while True:
            if not first:
                # subsequent pages resume where the server said to
                nextrecord = csw.results["nextrecord"]

            if count + maxrecords > maxharvest:
                maxrecords = maxharvest - count  # retrieve exactly maxharvest md

            csw.getrecords2(
                esn=esn,
                constraints=constraints,
                startposition=nextrecord,
                maxrecords=maxrecords,
                outputschema=self.OUTPUTSCHEMA,
            )

            if csw.results["matches"] == 0:
                logging.info("0 md found from %s" % cswurl)
                break
            else:
                first = False
                # fetch records
                for rec_id, rec in csw.records.iteritems():
                    count += 1
                    percent = int(float(count) / min(maxharvest, csw.results["matches"]) * 100)
                    logging.debug("%s%% %s" % (percent, rec_id))
                    records[rec_id] = rec

                # get out if no records or beyond maxrecords
                if (
                    csw.results["nextrecord"] == 0
                    or csw.results["returned"] == 0
                    or csw.results["nextrecord"] > csw.results["matches"]
                    or csw.results["nextrecord"] > maxharvest
                ):
                    d = (datetime.datetime.now() - tstart).total_seconds()
                    logging.info("%s md found from %s in %d s" % (count, cswurl, d))
                    break

        return records
Пример #11
0
def test_default_csw_connections():
    """test that the default CSW connections work"""

    relpath = 'resources%sconnections-default.xml' % os.sep
    csw_connections_xml = options.base.plugin / relpath

    conns = etree.parse(csw_connections_xml)

    for conn in conns.findall('csw'):
        try:
            csw = CatalogueServiceWeb(conn.attrib.get('url'))
            info('Success: %s', csw.identification.title)
            csw.getrecords2()
        except Exception as err:
            # BUGFIX: ValueError('ERROR: %s', err) passed the format string
            # and argument as two separate exception args, so the message
            # was never interpolated; format it explicitly instead.
            raise ValueError('ERROR: %s' % err)
Пример #12
0
def test_bbox(endpoints,bbox):
    """Probe each CSW endpoint for BBOX filter support (Python 2).

    Prints the dataset count per endpoint, or a diagnostic when the
    query fails, is unsupported, or the endpoint times out.
    """
    for title,url in endpoints.iteritems():
        try:
            csw = CatalogueServiceWeb(url, timeout=40)
            if "BBOX" in csw.filters.spatial_operators:
                filter_list = [fes.BBox(bbox)]
                try:
                    csw.getrecords2(constraints=filter_list, maxrecords=1000)
                    print("%s : Datasets = %d" % (title,len(csw.records.keys())))
                except Exception:
                    print "%s : BBOX Query FAILS" % title
            else:
                print "%s - BBOX Query NOT supported" % title
        except Exception:
            # any connection problem is reported as a timeout
            print "%s - Timed out" % title
Пример #13
0
def test_default_csw_connections():
    """test that the default CSW connections work"""

    relpath = 'resources%sconnections-default.xml' % os.sep
    csw_connections_xml = options.base.plugin / relpath

    conns = etree.parse(csw_connections_xml)

    for conn in conns.findall('csw'):
        try:
            csw = CatalogueServiceWeb(conn.attrib.get('url'))
            info('Success: %s', csw.identification.title)
            csw.getrecords2()
        except Exception as err:
            # BUGFIX: the two-argument ValueError('ERROR: %s', err) never
            # formatted the message; interpolate before raising.
            raise ValueError('ERROR: %s' % err)
Пример #14
0
def test_csw_skgeodsy():
    """Smoke-test the SK Geodesy CSW endpoint's operations and results."""
    catalogue = CatalogueServiceWeb(SERVICE_URL)

    expected_ops = ['DescribeRecord', 'GetCapabilities', 'GetRecordById',
                    'GetRecords', 'Transaction']
    assert sorted(op.name for op in catalogue.operations) == expected_ops

    get_records_op = catalogue.get_operation_by_name('GetRecords')
    assert get_records_op.name == 'GetRecords'

    catalogue.getrecords2(typenames='csw:Record gmd:MD_Metadata')
    for counter in ('returned', 'nextrecord', 'matches'):
        assert catalogue.results.get(counter) > 0
Пример #15
0
def test_bbox(endpoints, bbox):
    """Probe each CSW endpoint for BBOX filter support (Python 2).

    Prints a dataset count, an unsupported notice, or a timeout message
    for every endpoint in *endpoints* (dict of title -> url).
    """
    for title, url in endpoints.iteritems():
        try:
            csw = CatalogueServiceWeb(url, timeout=40)
            if "BBOX" in csw.filters.spatial_operators:
                filter_list = [fes.BBox(bbox)]
                try:
                    csw.getrecords2(constraints=filter_list, maxrecords=1000)
                    print("%s : Datasets = %d" %
                          (title, len(csw.records.keys())))
                except Exception:
                    print "%s : BBOX Query FAILS" % title
            else:
                print "%s - BBOX Query NOT supported" % title
        except Exception:
            # any connection problem is reported as a timeout
            print "%s - Timed out" % title
Пример #16
0
def csw_query_metadata_by_id(csw_url,
                             identifier,
                             username=None,
                             password=None):
    """Return the ISO metadata record matching *identifier*, or None.

    If several records match, the last one iterated wins (as before).
    """
    csw = CatalogueServiceWeb(csw_url, username=username, password=password)
    record = None
    if csw.identification.type == 'CSW':
        id_filter = [fes.PropertyIsEqualTo('dc:identifier', identifier)]
        csw.getrecords2(typenames='gmd:MD_Metadata',
                        esn='full',
                        outputschema='http://www.isotc211.org/2005/gmd',
                        constraints=id_filter)
        for key in csw.records:
            record = csw.records[key]
    return record
Пример #17
0
    def mdsearch(self, cswurl, esn='summary', constraints=[], startrecord=0, maxrecords=10, maxharvest=20):
        """Queries a csw to retrieve md ids matching constraints.

        Pages through GetRecords until *maxharvest* records are fetched
        or the server reports the result set exhausted; returns a dict
        of record id -> record.  (Python 2 code: uses dict.iteritems.)
        """
        tstart = datetime.datetime.now()
        records = {}

        logging.info('searching max %s md from %s' % (maxharvest, cswurl))
        csw = CatalogueServiceWeb(cswurl, skip_caps=True)
        first = True
        nextrecord = startrecord
        count = 0

        while True:
            if not first:
                # resume where the previous page stopped
                nextrecord = csw.results['nextrecord']

            if count + maxrecords > maxharvest:
                maxrecords = maxharvest - count     # retrieve exactly maxharvest md

            csw.getrecords2(
                esn=esn, constraints=constraints, startposition=nextrecord,
                maxrecords=maxrecords, outputschema=self.OUTPUTSCHEMA)

            if csw.results['matches'] == 0:
                logging.info('0 md found from %s' % cswurl)
                break
            else:
                first = False
                # fetch records
                for rec_id, rec in csw.records.iteritems():
                    count += 1
                    percent = int(float(count)/min(maxharvest, csw.results['matches'])*100)
                    logging.debug('%s%% %s' % (percent, rec_id))
                    records[rec_id] = rec

                # get out if no records or beyond maxrecords
                if csw.results['nextrecord'] == 0 \
                        or csw.results['returned'] == 0 \
                        or csw.results['nextrecord'] > csw.results['matches'] \
                        or csw.results['nextrecord'] > maxharvest:
                    d = (datetime.datetime.now() - tstart).total_seconds()
                    logging.info('%s md found from %s in %d s' % (count, cswurl, d))
                    break

        return records
Пример #18
0
        def get_uuid_from_title(csw_url, title):
            '''
            Function to return OWSLib CSW record record from specified CSW URL using title as the search criterion
            Sample UUID: 221dcfd8-03d7-5083-e053-10a3070a64e3
            '''
            MAXRECORDS = 200

            uuid = None
            csw = CatalogueServiceWeb(csw_url)
            assert csw.identification.type == 'CSW', '%s is not a valid CSW service' % csw_url

            # '%' is the CSW multi-char wildcard; underscores in the title
            # are turned into wildcards as well
            search_title = title.replace('_', '%')
            # progressively broaden the search (drop up to 10 trailing
            # chars from the title) until a unique match is found
            while search_title and len(title) - len(
                    search_title) < 10 and not uuid:
                title_query = PropertyIsEqualTo('csw:Title',
                                                '%' + search_title + '%')
                csw.getrecords2(constraints=[title_query],
                                esn='summary',
                                maxrecords=MAXRECORDS)

                if not csw.records:  # No records found
                    # Broaden search by shortening title
                    search_title = search_title[0:-1]
                else:
                    uuid_list = []
                    # Strip all non-alphanumeric characters from title
                    alphanumeric_title = re.sub('\W', '', title)
                    while not uuid_list:
                        uuid_list = [
                            identifier for identifier in csw.records.keys()
                            if alphanumeric_title in re.sub(
                                '\W', '', csw.records[identifier].title)
                        ]
                        if len(uuid_list) == 1:  # Unique match found
                            uuid = uuid_list[0]
                            logger.info('UUID %s found from title characters',
                                        uuid)
                            break
                        else:
                            # Broaden search by shortening munged_title
                            alphanumeric_title = alphanumeric_title[0:-1]

            return uuid
Пример #19
0
def get_csw_results(protocol, maxresults=0):
    """Collect metadata ids for PDOK services with the given *protocol*.

    Pages through the catalogue with a CQL query.  ``maxresults=0``
    means "no limit": fetch every matching record.  Returns a list of
    ``{"mdId": ..., "protocol": ...}`` dicts.
    """
    csw = CatalogueServiceWeb(CSW_URL)
    svc_owner = "Beheer PDOK"
    query = (
        f"type='service' AND organisationName='{svc_owner}' AND protocol='{protocol}'"
    )
    md_ids = []
    start = 1
    maxrecord = maxresults if (maxresults < 100 and maxresults != 0) else 100
    while True:
        csw.getrecords2(maxrecords=maxrecord, cql=query, startposition=start)
        md_ids.extend([{"mdId": rec, "protocol": protocol} for rec in csw.records])
        # BUGFIX: with maxresults=0 ("unlimited"), the old check
        # `len(md_ids) >= maxresults` was always true and stopped after
        # the first page; only enforce the cap when one was requested.
        if maxresults and len(md_ids) >= maxresults:
            break
        if csw.results["nextrecord"] != 0:
            start = csw.results["nextrecord"]
            continue
        break
    return md_ids
Пример #20
0
    def initialize(self):
        """Page through the source CSW and register every record as an item.

        Fetches the total match count first, then walks the result set
        100 records at a time, calling self.add_item for each record.
        """
        startposition = 0
        csw = CatalogueServiceWeb(self.source.url)
        csw.getrecords2(maxrecords=1)
        matches = csw.results.get("matches")

        while startposition <= matches:
            csw.getrecords2(maxrecords=100, startposition=startposition)
            startposition = csw.results.get('nextrecord')
            for rec in csw.records:
                record = csw.records[rec]
                item = {
                    "id": record.identifier,
                    "title": record.title,
                    "description": record.abstract,
                    "url": record.references[0].get('url'),
                    "type": record.type,
                }
                self.add_item(record.identifier,
                              title=record.title,
                              date=None,
                              item=item)
            # BUGFIX: CSW servers report nextrecord == 0 (or omit it)
            # when the result set is exhausted; the old loop then
            # restarted at position 0 and never terminated.
            if not startposition:
                break
Пример #21
0
def _from_csw(url):
    """Harvest all ISO records from *url* and convert each via convert_from_iso.

    Returns a list of converted items, each annotated with the source
    URL and the raw harvested XML.
    """
    result = []

    schema = 'http://www.isotc211.org/2005/gmd'

    src = CatalogueServiceWeb(url)

    maxrecords = 5
    startposition = 0  # first page starts at 0; later pages use nextrecord

    while True:
        src.getrecords2(esn='full',
                        startposition=startposition,
                        maxrecords=maxrecords,
                        outputschema=schema)

        if src.results['nextrecord'] == 0 \
                or src.results['returned'] == 0 \
                or src.results['nextrecord'] > src.results['matches']:
            # end the loop, exhausted all records
            break

        # harvest each record
        for key, value in src.records.items():
            item = convert_from_iso(value)

            item['properties']['harvested_from'] = url
            item['properties']['harvest_json'] = value.xml.decode()

            # BUGFIX: the append was outside this loop, so only the last
            # record of each page was kept (and an empty page raised
            # NameError on an undefined `item`).
            result.append(item)

        startposition = src.results['nextrecord']

    return result
Пример #22
0
    def get_filepaths(self, collection_id: str, spatial_extent: List[float],
                      temporal_extent: List[str]) -> List[str]:
        """Retrieve a file list from the a CSW server according to the specified parameters.

        Arguments:
            collecion_id {str} -- identifier of the collection
            spatial_extent {List[float]} -- bounding box [ymin, xmin, ymax, xmax]
            temporal_extent {List[str]} -- e.g. ["2018-06-04", "2018-06-23"]

        Returns:
            list -- list of filepaths
        """
        csw = CatalogueServiceWeb(self.csw_server_uri, timeout=300)

        # collection, spatial and temporal filters, combined with AND
        # (a nested list is OWSLib's AND combination)
        filters = [
            PropertyIsLike(self.group_property, collection_id),
            BBox(spatial_extent),
            PropertyIsGreaterThan('apiso:TempExtent_begin',
                                  temporal_extent[0]),
            PropertyIsLessThan('apiso:TempExtent_end', temporal_extent[1]),
        ]

        csw.getrecords2(constraints=[filters], maxrecords=100)

        found = csw.records
        # collect each record's first reference URL, sorted
        return sorted(found[key].references[0]['url'] for key in found)
Пример #23
0
        def get_uuid_from_title(csw_url, title):
            '''
            Function to return OWSLib CSW record record from specified CSW URL using title as the search criterion
            Sample UUID: 221dcfd8-03d7-5083-e053-10a3070a64e3
            '''
            MAXRECORDS = 200

            uuid = None
            csw = CatalogueServiceWeb(csw_url)
            assert csw.identification.type == 'CSW', '%s is not a valid CSW service' % csw_url

            # '%' is the CSW wildcard; underscores become wildcards too
            search_title = title.replace('_', '%')
            # broaden the search (drop up to 10 trailing chars) until a
            # unique match is found
            while search_title and len(
                    title) - len(search_title) < 10 and not uuid:
                title_query = PropertyIsEqualTo(
                    'csw:Title', '%' + search_title + '%')
                csw.getrecords2(
                    constraints=[title_query], esn='summary', maxrecords=MAXRECORDS)

                if not csw.records:  # No records found
                    # Broaden search by shortening title
                    search_title = search_title[0:-1]
                else:
                    uuid_list = []
                    # Strip all non-alphanumeric characters from title
                    alphanumeric_title = re.sub('\W', '', title)
                    while not uuid_list:
                        uuid_list = [identifier for identifier in csw.records.keys(
                        ) if alphanumeric_title in re.sub('\W', '', csw.records[identifier].title)]
                        if len(uuid_list) == 1:  # Unique match found
                            uuid = uuid_list[0]
                            logger.info(
                                'UUID %s found from title characters', uuid)
                            break
                        else:
                            # Broaden search by shortening munged_title
                            alphanumeric_title = alphanumeric_title[0:-1]

            return uuid
def sync():
    """
    Sync a bunch of csw layers with Solr.
    """
    # minimal record holder; attributes are attached dynamically below
    class cswLayer(object):
        pass

    csw = CatalogueServiceWeb(
        'http://hh.worldmap.harvard.edu/registry/hypermap/csw')
    count = 0
    # page through the first 2000 records, 10 at a time (Python 2 code)
    for startposition in range(0, 2000, 10):
        csw.getrecords2(maxrecords=10, startposition=startposition)
        print csw.results
        for uuid in csw.records:
            record = csw.records[uuid]
            # only import EPSG:4326 records with an in-range bounding box
            if record.bbox.crs.code == 4326:
                print record.bbox.minx, record.bbox.miny, record.bbox.maxx, record.bbox.maxy
                layer = cswLayer()
                layer.bbox_x0 = float(record.bbox.minx)
                layer.bbox_y0 = float(record.bbox.miny)
                layer.bbox_x1 = float(record.bbox.maxx)
                layer.bbox_y1 = float(record.bbox.maxy)
                if layer.bbox_x0 >= -180 and layer.bbox_x1 <= 180 and layer.bbox_y0 >= -90 and layer.bbox_y1 <= 90:
                    layer.uuid = uuid
                    layer.name = uuid
                    layer.title = record.title
                    layer.abstract = record.abstract
                    layer.url = record.source
                    # randomised demo values — presumably test-data seeding;
                    # helpers are defined elsewhere in the file
                    layer.date = get_random_date()
                    layer.category = get_random_category()
                    layer.regions = add_random_regions()
                    layer.keywords = add_random_keywords()
                    layer_to_solr(layer)
                    print 'Imported record %s' % count
                    count += 1
    print 'Done.'
Пример #25
0
def harvest_csw(src_url, dest_url):
    """Copy every ISO record from the source CSW into the destination CSW.

    Pages through the source catalogue and re-inserts each record via a
    CSW transaction on the destination, staging each record through
    /tmp/a.xml.  (Python 2 code.)
    """
    stop = 0
    flag = 0
    maxrecords = 10

    src = CatalogueServiceWeb(src_url)
    dest = CatalogueServiceWeb(dest_url)

    while stop == 0:
        if flag == 0:  # first run, start from 0
            startposition = 0
        else:  # subsequent run, startposition is now paged
            startposition = src.results['nextrecord']

        src.getrecords2(esn='full', startposition=startposition, maxrecords=maxrecords)
        
        print(src.results)

        if src.results['nextrecord'] == 0 \
           or src.results['returned'] == 0 \
           or src.results['nextrecord'] > src.results['matches']:  # end the loop, exhausted all records
            stop = 1
            break

        
        # harvest each record to destination CSW
        for i in list(src.records):
            print "insert", i
            src.getrecordbyid(id=[i], outputschema='http://www.isotc211.org/2005/gmd')
            # _exml is OWSLib's parsed response document (private API)
            md = src._exml.find('{http://www.isotc211.org/2005/gmd}MD_Metadata')
            f = open('/tmp/a.xml', 'w')
            f.write(etree.tostring(md))
            f.close()
            dest.transaction(ttype='insert', typename='gmd:MD_Metadata', record=open("/tmp/a.xml").read())

        flag = 1
Пример #26
0
def show_add_layer_dialog(request, *args, **kwargs):
    """Django view: resolve a CSW record into a WCS/WFS layer description.

    Looks up the record with dc:identifier == POST['layer_id'] on the
    CSW stored in the session, picks the first reference advertising an
    OGC:WCS or OGC:WFS scheme, extracts the service id/version from that
    reference's URL query string, and renders the add-layer modal.
    Any failure returns a 500 response.  (Python 2 code: uses the
    `urlparse` module.)
    """
    if request.method == 'POST':
        csw_url = request.session['csw_url']
        user = request.session['user']
        password = request.session['password']
        layer_id = request.POST['layer_id']
        try:
            csw = CatalogueServiceWeb(csw_url,
                                      username=user,
                                      password=password)
            constraints = [fes.PropertyIsEqualTo('dc:identifier', layer_id)]
            result = csw.identification.type
            if result == 'CSW':
                csw.getrecords2(constraints=constraints)
                record = None
                # keep the first record that exposes a WCS or WFS endpoint
                for key in csw.records:
                    rec = csw.records[key]
                    for ref in rec.references:
                        if 'OGC:WCS' in ref['scheme']:
                            rec.type = 'WCS'
                            rec.endpoint = ref['url']
                            record = rec
                            break
                        if 'OGC:WFS' in ref['scheme']:
                            rec.type = 'WFS'
                            rec.endpoint = ref['url']
                            record = rec
                            break
                    if record:
                        break

                if record.type == 'WCS':
                    # get describe coverage
                    # find coverage id from references
                    coverage_id = None
                    version = None
                    for ref in record.references:
                        if 'service=WCS' in ref['url']:
                            url = ref['url']
                            parse_result = urlparse.urlparse(url)
                            query = parse_result.query
                            query_dict = urlparse.parse_qs(query)
                            coverage_id = query_dict['coverageid'][0]
                            version = query_dict['version'][0]
                            if coverage_id and version:
                                break
                    record.service_id = coverage_id
                    record.service_version = version
                elif record.type == 'WFS':
                    typename = None
                    version = None
                    for ref in record.references:
                        if 'service=WFS' in ref['url']:
                            url = ref['url']
                            parse_result = urlparse.urlparse(url)
                            query = parse_result.query
                            query_dict = urlparse.parse_qs(query)
                            typename = query_dict['typename'][0]
                            version = query_dict['version'][0]
                            if typename and version:
                                break

                    record.service_id = typename
                    record.service_version = version
                #     wcs = WebCoverageService(record.endpoint)
                #     result = wcs.getDescribeCoverage(coverage_id)
                context = {'record': record}
                return render(request,
                              'geosafe/metasearch/modal/add_layer.html',
                              context)
        # NOTE(review): bare except hides every error, including the
        # AttributeError raised when no WCS/WFS reference was found and
        # `record` is still None
        except:
            return HttpResponseServerError()
    return HttpResponseServerError()
Пример #27
0
def main():
    """Harvest recently-modified metadata records from a CSW endpoint and
    print each page of the raw GetRecords XML response to stdout.

    Connection, proxy, and paging settings may be overridden by
    config/harvester.ini; a '-f' value in the docopt-style ``arguments``
    global overrides the start date.  Python 2 only (print statements,
    urllib2, ConfigParser).
    """
    # Connection variables
    csw_url = 'csw.open.canada.ca/geonetwork/srv/csw'
    csw_user = None
    csw_passwd = None

    proxy_protocol = None
    proxy_url = None
    proxy_user = None
    proxy_passwd = None

    # Or read from a .ini file
    # NOTE(review): records_per_request and start_date are assigned only
    # inside this block -- if harvester.ini is absent (and '-f' does not
    # supply start_date) the paging loop below raises NameError.  Confirm
    # the ini file is mandatory in deployment.
    harvester_file = 'config/harvester.ini'
    if os.path.isfile(harvester_file):
        from ConfigParser import ConfigParser

        ini_config = ConfigParser()

        ini_config.read(harvester_file)

        csw_url = ini_config.get(
            'csw', 'url')
        if ini_config.has_option('csw', 'username'):
            csw_user = ini_config.get(
                'csw', 'username')
            csw_passwd = ini_config.get(
                'csw', 'password')

        proxy_protocol = ini_config.get(
            'proxy', 'protocol')
        proxy_url = ini_config.get(
            'proxy', 'url')
        if ini_config.has_option('proxy', 'username'):
            proxy_user = ini_config.get(
                'proxy', 'username')
            proxy_passwd = ini_config.get(
                'proxy', 'password')

        records_per_request = int(ini_config.get(
            'processing', 'records_per_request'))
        start_date = ini_config.get('processing', 'start_date')

    # If you're supplying a proxy
    if proxy_url:
        # ... and you're using authentication
        if proxy_user and proxy_passwd:
            password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
            password_mgr.add_password(
                None, proxy_url, proxy_user, proxy_passwd)
            proxy_auth_handler = urllib2.ProxyBasicAuthHandler(password_mgr)
        # ... or even if you're not
        else:
            proxy_auth_handler = urllib2.ProxyHandler(
                {proxy_protocol: proxy_url})

        # Install globally so owslib's urllib2 calls also go through the proxy
        opener = urllib2.build_opener(proxy_auth_handler)
        urllib2.install_opener(opener)

    # Fetch the data
    # csw = CatalogueServiceWeb(
    #   'http://*****:*****@csw_url/geonetwork/srv/csw')
    if csw_user and csw_passwd:
        csw = CatalogueServiceWeb(
            'http://'+csw_url,
            username=csw_user,
            password=csw_passwd,
            timeout=20)
    else:
        csw = CatalogueServiceWeb('http://'+csw_url, timeout=20)

    # Raw GetRecords request template; %d, %d, %s are filled with
    # (records_per_request, next_record, start_date) each page.
    request_template = """<?xml version="1.0"?>
<csw:GetRecords
    xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
    service="CSW"
    version="2.0.2"
    resultType="results"
    outputSchema="csw:IsoRecord"
    maxRecords="%d"
    startPosition="%d"
>
    <csw:Query
        typeNames="gmd:MD_Metadata">
        <csw:ElementSetName>full</csw:ElementSetName>
        <csw:Constraint
            version="1.1.0">
            <Filter
                xmlns="http://www.opengis.net/ogc"
                xmlns:gml="http://www.opengis.net/gml">
                <PropertyIsGreaterThanOrEqualTo>
                    <PropertyName>Modified</PropertyName>
                    <Literal>%s</Literal>
                </PropertyIsGreaterThanOrEqualTo>
            </Filter>
        </csw:Constraint>
    </csw:Query>
</csw:GetRecords>
"""

    # Is there a specified start date
    if arguments['-f']:
        start_date = arguments['-f']

    # Page through the result set until the server reports no next record.
    active_page = 0
    next_record = 1
    request_another = True
    while request_another:
        request_another = False

        # Filter records into latest updates
        #
        # Sorry Tom K., we'll be more modern ASAWC.
        # For now it's good ol' Kitchen Sink
        #
        # from owslib.fes import PropertyIsGreaterThanOrEqualTo
        # modified = PropertyIsGreaterThanOrEqualTo(
        #   'apiso:Modified',
        #   '2015-04-04'
        # )
        # csw.getrecords2(constraints=[modified])
        #
        # Kitchen Sink is the valid HNAP, we need HNAP for R1 to debug issues
        # This filter was supplied by EC, the CSW service technical lead
        current_request = request_template % (
            records_per_request,
            next_record,
            start_date
        )

        # (active_page*records_per_request)+1
        csw.getrecords2(format='xml', xml=current_request)
        active_page += 1

        # Identify if we need to continue this.
        records_root = ("/csw:GetRecordsResponse")

        # Read the file, should be a streamed input in the future
        root = etree.XML(csw.response)
        # Parse the root and itterate over each record
        records = fetchXMLArray(root, records_root)

        # Paging bookkeeping comes from the csw:SearchResults attributes
        timestamp = fetchXMLAttribute(
            records[0], "csw:SearchStatus",
            "timestamp")[0]
        number_of_records_matched = int(fetchXMLAttribute(
            records[0], "csw:SearchResults",
            "numberOfRecordsMatched")[0]
        )
        number_of_records_returned = int(fetchXMLAttribute(
            records[0], "csw:SearchResults",
            "numberOfRecordsReturned")[0]
        )
        next_record = int(fetchXMLAttribute(
            records[0], "csw:SearchResults",
            "nextRecord")[0]
        )

        # nextRecord == 0 (or past the match count) means we are done
        if next_record > number_of_records_matched or next_record == 0:
            pass
        else:
            request_another = True

        # When we move to Tom K's filter we can use results in an R2 unified
        # harvester
        # print csw.results
        # for rec in csw.records:
        #    print '* '+csw.records[rec].title
        # Till then we need to collect and dump the response from the CSW

        # No use minimizing the XML to try to create a XML Lines file as the
        # data has carriage returns.
        # parser = etree.XMLParser(remove_blank_text=True)
        # elem = etree.XML(csw.response, parser=parser)
        # print etree.tostring(elem)

        # Output the harvested page
        print csw.response
Пример #28
0
# ## search NODC geoportal

# In[2]:

# Find WMS-capable records on the NODC geoportal CSW.
#endpoint = 'http://data.nodc.noaa.gov/geoportal/csw'
endpoint = 'http://www.nodc.noaa.gov/geoportal/csw' 

# Wildcard match on the apiso:ServiceType queryable ("*wms*").
val = 'wms'
filter2 = fes.PropertyIsLike(propertyname='apiso:ServiceType',literal=('*%s*' % val),
                        escapeChar='\\',wildCard='*',singleChar='?')
csw = CatalogueServiceWeb(endpoint,timeout=60)


filter_list = [filter2]
csw.getrecords2(constraints=filter_list, maxrecords=1000)

# Pick a random matching record and show its title and references.
print(len(csw.records.keys()))
choice=np.random.choice(list(csw.records.keys()))
print(csw.records[choice].title)
csw.records[choice].references


# In[3]:

# Echo the raw GetRecords request XML that was sent.
csw.request


# ## search geoport pycsw 

# In[5]:
Пример #29
0
class CSWUtils(object):
    '''
    CSW (Catalogue Service for the Web) query utilities.

    Wraps an owslib CatalogueServiceWeb connection and provides helpers to
    build FES filters and page through query results.
    '''
    DEFAULT_CSW_URL = 'http://ecat.ga.gov.au/geonetwork/srv/eng/csw' # GA's externally-facing eCat
    DEFAULT_TIMEOUT = 30 # Timeout in seconds
    DEFAULT_CRS = 'EPSG:4326' # Unprojected WGS84
    DEFAULT_MAXQUERYRECORDS = 100 # Retrieve only this many datasets per CSW query
    DEFAULT_MAXTOTALRECORDS = 2000 # Maximum total number of records to retrieve

    def __init__(self, 
                 csw_url=None, 
                 timeout=None,
                 debug=False
                 ):
        '''
        Constructor for CSWUtils class
        @param csw_url: URL for CSW service. Defaults to value of CSWUtils.DEFAULT_CSW_URL
        @param timeout: Timeout in seconds. Defaults to value of CSWUtils.DEFAULT_TIMEOUT
        @param debug: If True, print each CSW request and response
        '''
        csw_url = csw_url or CSWUtils.DEFAULT_CSW_URL
        timeout = timeout or CSWUtils.DEFAULT_TIMEOUT
        self.debug = debug

        self.csw = CatalogueServiceWeb(csw_url, timeout=timeout)

    def list_from_comma_separated_string(self, comma_separated_string):
        '''
        Helper function to return list of strings from a comma-separated string
        Substitute underscore for whitespace characters within each keyword
        @param comma_separated_string: comma-separated string
        @return: list of strings
        '''
        return [re.sub(r'(\s)', '_', keyword.strip()) for keyword in comma_separated_string.split(',')]

    # A helper function for date range filtering
    def get_date_filter(self, start_datetime=None, stop_datetime=None, constraint='overlaps'):
        '''
        Helper function to return a list containing a pair of FES filters for a date range
        @param  start_datetime: datetime object for start of time period to search
        @param stop_datetime: datetime object for end of time period to search
        @param constraint: string value of either 'overlaps' or 'within' to indicate type of temporal search

        @return: list containing a pair of FES filters for a date range
        '''
        if start_datetime:
            start_date_string = start_datetime.date().isoformat()
        else:
            start_date_string = '1900-01-01' # Distant past

        # BUG FIX: previously tested start_datetime here, so a supplied
        # stop_datetime was ignored whenever start_datetime was omitted.
        if stop_datetime:
            stop_date_string = stop_datetime.date().isoformat()
        else:
            stop_date_string = '2100-01-01' # Distant future

        if constraint == 'overlaps':
            start_filter = fes.PropertyIsLessThanOrEqualTo(propertyname='ows100:TempExtent_begin', literal=stop_date_string)
            stop_filter = fes.PropertyIsGreaterThanOrEqualTo(propertyname='ows100:TempExtent_end', literal=start_date_string)
        elif constraint == 'within':
            start_filter = fes.PropertyIsGreaterThanOrEqualTo(propertyname='ows100:TempExtent_begin', literal=start_date_string)
            stop_filter = fes.PropertyIsLessThanOrEqualTo(propertyname='ows100:TempExtent_end', literal=stop_date_string)

        return [start_filter, stop_filter]

    def any_case_match_filters(self, propertyname, word_list):
        '''
        Helper function to return and/or filter from a word list for partial case insensitivity
        Resolves issue where "matchCase=False" is ignored
        @param propertyname: String denoting CSW property to search
        @param word_list: List of strings for (partially) case-insensitive "and" search

        @return: List of FES filters expressing "and" query
        '''
        filter_list = []
        for word in word_list:
            word_variant_set = set([word, word.upper(), word.lower(), word.title()])

            if len(word_variant_set) == 1: # Use single filter if no "or" required
                filter_list.append(fes.PropertyIsLike(propertyname=propertyname, literal=word_variant_set.pop(), matchCase=False))
            else:
                filter_list.append(fes.Or([fes.PropertyIsLike(propertyname=propertyname, literal=word_variant, matchCase=False)
                                           for word_variant in word_variant_set
                                           ]
                                          )
                                   )

        return filter_list

    def get_csw_records(self, 
                        fes_filters, 
                        max_query_records=None, 
                        max_total_records=None
                        ):
        '''
        Generator yielding (uuid, record XML) pairs for each CSW query result record
        @param fes_filters: List of fes filters to apply to CSW query
        @param max_query_records: Maximum number of records to return per CSW query. Defaults to value of CSWUtils.DEFAULT_MAXQUERYRECORDS
        @param max_total_records: Maximum total number of records to return. Defaults to value of CSWUtils.DEFAULT_MAXTOTALRECORDS
        @raise Exception: when max_total_records is reached before the result set is exhausted
        '''
        max_query_records = max_query_records or CSWUtils.DEFAULT_MAXQUERYRECORDS
        max_total_records = max_total_records or CSWUtils.DEFAULT_MAXTOTALRECORDS

        startposition = 1 # N.B: This is 1-based, not 0-based
        record_count = 0

        # Keep querying until all results have been retrieved
        while record_count < max_total_records:
            # apply all the filters using the "and" syntax: [[filter1, filter2]]
            self.csw.getrecords2(constraints=[fes_filters],
                                 esn='full',
                                 outputschema='http://www.isotc211.org/2005/gmd',
                                 maxrecords=max_query_records,
                                 startposition=startposition)

            if self.debug:
                print('CSW request:\n%s' % self.csw.request)
                print('CSW response:\n %s' % self.csw.response)

            query_record_count = len(self.csw.records)

            for uuid in self.csw.records.keys():
                record = self.csw.records[uuid]

                record_count += 1
                yield uuid, record.xml

                if record_count >= max_total_records:  # Don't go around again for another query - maximum retrieved
                    # BUG FIX: format placeholder was '$d', which raised
                    # "unsupported format character" instead of this message
                    raise Exception('Maximum number of records retrieved (%d)' % max_total_records)

            if query_record_count < max_query_records:  # Don't go around again for another query - should be the end
                break

            # Increment start position and repeat query
            startposition += max_query_records

    def query_csw(self,
                  keyword_list=None,
                  bounding_box=None,
                  bounding_box_crs=None,
                  anytext_list=None,
                  titleword_list=None,
                  start_datetime=None,
                  stop_datetime=None,
                  max_total_records=None
                  ):
        '''
        Function to query CSW using AND combination of provided search parameters and return generator object
            yielding (uuid, record XML) pairs
        @param keyword_list: List of strings or comma-separated string containing keyword search terms
        @param bounding_box: Bounding box to search as a list of ordinates [bbox.minx, bbox.minx, bbox.maxx, bbox.maxy]
        @param bounding_box_crs: Coordinate reference system for bounding box. Defaults to value of CSWUtils.DEFAULT_CRS
        @param anytext_list: List of strings or comma-separated string containing any text search terms
        @param titleword_list: List of strings or comma-separated string containing title search terms
        @param start_datetime: Datetime object defining start of temporal search period
        @param stop_datetime: Datetime object defining end of temporal search period
        @param max_total_records: Maximum total number of records to return. Defaults to value of CSWUtils.DEFAULT_MAXTOTALRECORDS

        @return: generator object yielding (uuid, record XML) pairs
        '''
        bounding_box_crs = bounding_box_crs or CSWUtils.DEFAULT_CRS

        # Convert strings to lists if required
        if isinstance(keyword_list, str):
            keyword_list = self.list_from_comma_separated_string(keyword_list)

        if isinstance(anytext_list, str):
            anytext_list = self.list_from_comma_separated_string(anytext_list)

        if isinstance(titleword_list, str):
            titleword_list = self.list_from_comma_separated_string(titleword_list)

        # Build filter list
        fes_filter_list = []

        # Check for unchanged, upper-case, lower-case and capitalised keywords
        if keyword_list:
            fes_filter_list += self.any_case_match_filters('Subject', keyword_list)

        if anytext_list:
            fes_filter_list += [fes.PropertyIsLike(propertyname='anyText', literal=phrase, matchCase=False) for phrase in anytext_list]

        if start_datetime or stop_datetime:
            fes_filter_list += self.get_date_filter(start_datetime, stop_datetime)

        if titleword_list:
            fes_filter_list += [fes.PropertyIsLike(propertyname='title', literal=titleword, matchCase=False) for titleword in titleword_list]

        if bounding_box:
            fes_filter_list += [fes.BBox(bounding_box, crs=bounding_box_crs)]

        assert fes_filter_list, 'No search criteria defined'

        # Use single filter if no "and" required
        if len(fes_filter_list) == 1:
            fes_filter_list = fes_filter_list[0]

        # Return generator object
        return self.get_csw_records(fes_filter_list, 
                                    max_total_records=max_total_records)
Пример #30
0
    def mdcache(self, cswurl, constraints=None, maxrecords=10, maxharvest=20):
        """Fill the local XML cache from a CSW endpoint.

        Harvests up to `maxharvest` full metadata records (in pages of
        `maxrecords`) and writes each record's XML into a per-CSW cache
        directory, alongside a JSON manifest of property domains.

        @param cswurl: URL of the CSW endpoint to harvest
        @param constraints: optional list of query constraints (default: none)
        @param maxrecords: page size for each GetRecords request
        @param maxharvest: maximum total number of records to harvest
        @return: path of the cache directory for this CSW
        """
        # None default avoids the shared-mutable-default-argument pitfall
        if constraints is None:
            constraints = []

        # cache directory for this csw: reuse the known signature, or derive
        # a new one from the MD5 of the URL
        if cswurl in self.cswlist:
            cswsig = self.cswlist[cswurl]['cswsig']
            cswpath = os.path.join(self.cachepath, cswsig)
        else:
            # NOTE(review): md5() requires bytes on Python 3; this module is
            # Python 2 style -- encode here if ported.
            cswsig = hashlib.md5(cswurl).hexdigest()
            logging.info('%s : new signature %s' % (cswurl, cswsig))
            cswpath = os.path.join(self.cachepath, cswsig)
            if not os.path.isdir(cswpath):
                os.makedirs(cswpath)
            logging.info('%s %s created' % (cswurl, cswpath))

        manifest = {
            'cswsig': cswsig,
            'cswurl': cswurl,
            'domains': {
                'organisationName': self.mdPropertyValues(cswurl, 'organisationName').values()[0],
                'Subject': self.mdPropertyValues(cswurl, 'Subject').values()[0]
            }
        }

        # context manager guarantees the manifest file handle is closed
        with open(os.path.join(cswpath, self.MANIFEST), 'w') as f:
            f.write(json.dumps(manifest))

        logging.info('loading max %s md from %s' % (maxharvest, cswurl))
        csw = CatalogueServiceWeb(cswurl, skip_caps=True)
        first = True
        nextrecord = 0
        count = 0

        while True:
            # after the first page, resume from the server-reported position
            if not first:
                nextrecord = csw.results['nextrecord']

            # clamp the final page so we never fetch past maxharvest
            if count + maxrecords > maxharvest:
                maxrecords = maxharvest - count

            csw.getrecords2(
                esn='full', constraints=constraints, startposition=nextrecord,
                maxrecords=maxrecords, outputschema=self.OUTPUTSCHEMA)

            if csw.results['matches'] == 0:
                logging.info('0 md found from %s' % cswurl)
                break
            else:
                first = False
                # fetch records and write each one's XML into the cache
                for rec_id, rec in csw.records.items():
                    count += 1
                    logging.info(str(int(float(count)/min(maxharvest, csw.results['matches'])*100))+'%')
                    with open(os.path.join(cswpath, rec_id), 'w') as f:
                        f.write(rec.xml)

                # break if no records, beyond maxrecords or matches
                if csw.results['nextrecord'] == 0 \
                        or csw.results['returned'] == 0 \
                        or csw.results['nextrecord'] > csw.results['matches'] \
                        or csw.results['nextrecord'] > maxharvest:
                    logging.info('%s md loaded from %s' % (count, cswurl))
                    break

        return cswpath
Пример #31
0
# <markdowncell>

# #### Query CSW Servers using filters

# <codecell>

titles = []
urls = []
service_types = []
servers = []
# Query each bounding-box endpoint with the prepared filters and collect
# title/URL/scheme/server for every record advertising a service endpoint.
# (Python 2 print syntax; `filters` and `bbox_endpoints` are defined in
# earlier notebook cells; this cell is truncated in this copy -- the inner
# `except:` has no body here.)
for url in bbox_endpoints:
    print "*", url
    try:
        csw = CatalogueServiceWeb(url, timeout=20)
        csw.getrecords2(constraints=[filters], maxrecords=200, esn='full')
        for record, item in csw.records.items():
            try:
                # Get title (middle-truncate long titles for display)
                if len(item.title) > 80:
                   title = "{!s}...{!s}".format(item.title[0:40], item.title[-40:])
                else:
                    title = item.title
                # NOTE(review): next() falls back to None here, which would
                # raise TypeError on tuple unpacking when no reference is
                # present -- confirm item.references is never empty.
                service_url, scheme = next(((d['url'], d['scheme']) for d in item.references), None)
                if service_url:    
                    print "    [x] {!s}".format(title)
                    titles.append(item.title)
                    urls.append(service_url)
                    service_types.append(scheme)
                    servers.append(url)
            except:
Пример #32
0
def _parse_csw(context, repos, record, identifier, pagesize=10):
    """Harvest a remote CSW endpoint into pycsw repository objects.

    Connects to the CSW at `record`, builds a service-instance metadata
    object, then pages through all of the remote catalogue's records
    (preferring ISO 19139 output, falling back to Dublin Core) and parses
    each into a repository object.  Python 2 syntax (`except Exception, err`,
    `iteritems`).

    @param context: pycsw context used by _set()
    @param repos: repository providing dataset() object factory
    @param record: URL of the remote CSW endpoint
    @param identifier: identifier to assign to the service record
    @param pagesize: number of records per GetRecords page
    @raise RuntimeError: when the remote server rejects a query

    NOTE(review): the function builds `recobjs` but has no return statement
    in this copy -- the original presumably ends with `return recobjs`
    (possibly truncated here).
    """

    from owslib.csw import CatalogueServiceWeb

    recobjs = []  # records
    serviceobj = repos.dataset()

    # if init raises error, this might not be a CSW
    md = CatalogueServiceWeb(record)

    LOGGER.debug('Setting CSW service metadata')
    # generate record of service instance
    _set(context, serviceobj, 'pycsw:Identifier', identifier)
    _set(context, serviceobj, 'pycsw:Typename', 'csw:Record')
    _set(context, serviceobj, 'pycsw:Schema', 'http://www.opengis.net/cat/csw/2.0.2')
    _set(context, serviceobj, 'pycsw:MdSource', record)
    _set(context, serviceobj, 'pycsw:InsertDate', util.get_today_and_now())
    _set(context, serviceobj, 'pycsw:XML', md.response)
    _set(context, serviceobj, 'pycsw:AnyText', util.get_anytext(md._exml))
    _set(context, serviceobj, 'pycsw:Type', 'service')
    _set(context, serviceobj, 'pycsw:Title', md.identification.title)
    _set(context, serviceobj, 'pycsw:Abstract', md.identification.abstract)
    _set(context, serviceobj, 'pycsw:Keywords', ','.join(md.identification.keywords))
    _set(context, serviceobj, 'pycsw:Creator', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:Publisher', md.provider.name)
    _set(context, serviceobj, 'pycsw:Contributor', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:OrganizationName', md.provider.contact.name)
    _set(context, serviceobj, 'pycsw:AccessConstraints', md.identification.accessconstraints)
    _set(context, serviceobj, 'pycsw:OtherConstraints', md.identification.fees)
    _set(context, serviceobj, 'pycsw:Source', record)
    _set(context, serviceobj, 'pycsw:Format', md.identification.type)

    _set(context, serviceobj, 'pycsw:ServiceType', md.identification.type)
    _set(context, serviceobj, 'pycsw:ServiceTypeVersion', md.identification.version)
    _set(context, serviceobj, 'pycsw:Operation', ','.join([d.name for d in md.operations]))
    _set(context, serviceobj, 'pycsw:CouplingType', 'tight')

    links = [
        '%s,OGC-CSW Catalogue Service for the Web,OGC:CSW,%s' % (identifier, md.url)
    ]

    _set(context, serviceobj, 'pycsw:Links', '^'.join(links))

    recobjs.append(serviceobj)

    # get all supported typenames of metadata
    # so we can harvest the entire CSW

    # try for ISO, settle for Dublin Core
    csw_typenames = 'csw:Record'
    csw_outputschema = 'http://www.opengis.net/cat/csw/2.0.2'

    grop = md.get_operation_by_name('GetRecords')
    if all(['gmd:MD_Metadata' in grop.parameters['typeNames']['values'],
            'http://www.isotc211.org/2005/gmd' in grop.parameters['outputSchema']['values']]):
        LOGGER.info('CSW supports ISO')
        csw_typenames = 'gmd:MD_Metadata'
        csw_outputschema = 'http://www.isotc211.org/2005/gmd'


    # now get all records
    # get total number of records to loop against

    try:
        md.getrecords2(typenames=csw_typenames, resulttype='hits',
                       outputschema=csw_outputschema)
        matches = md.results['matches']
    except:  # this is a CSW, but server rejects query
        raise RuntimeError(md.response)

    # NOTE(review): when matches == 0, pagesize becomes 0 and range(1, 0, 0)
    # raises ValueError (step must not be zero) -- confirm empty catalogues
    # are impossible or handled by the caller.
    if pagesize > matches:
        pagesize = matches

    LOGGER.debug('Harvesting %d CSW records' % matches)

    # loop over all catalogue records incrementally
    for r in range(1, matches, pagesize):
        try:
            md.getrecords2(typenames=csw_typenames, startposition=r,
                           maxrecords=pagesize, outputschema=csw_outputschema)
        except Exception, err:  # this is a CSW, but server rejects query
            raise RuntimeError(md.response)
        for k, v in md.records.iteritems():
            if csw_typenames == 'gmd:MD_Metadata':
                recobjs.append(_parse_iso(context, repos, etree.fromstring(v.xml)))
            else:
                recobjs.append(_parse_dc(context, repos, etree.fromstring(v.xml)))
def test_csw_geonetwork():
    """Smoke-test a GetRecords query against the GeoNetwork endpoint:
    the result summary must report records returned, matched, and a
    next-record pointer."""
    catalogue = CatalogueServiceWeb(SERVICE_URL)
    catalogue.getrecords2(typenames='csw:Record')
    summary = catalogue.results
    assert summary.get('returned') > 0
    assert summary.get('nextrecord') > 0
    assert summary.get('matches') > 0
Пример #34
0
from owslib.fes import PropertyIsLike

from owslib.csw import CatalogueServiceWeb

# Connect to the IPMA geoland2 CSW endpoint
csw = CatalogueServiceWeb('http://geoland2.ipma.pt/csw')

# '%' is the wildcard character understood by this PropertyIsLike filter
query_like = PropertyIsLike('dc:title', 'Global Land Surface Temperature %')

csw.getrecords2(constraints=[query_like])

# Number of catalogue records matching the title pattern
csw.results['matches']
Пример #35
0
class CSWConnexion(object):
    """Wrapper around a pair of CSW endpoints: a public discovery endpoint
    (GetRecords / GetRecordById) and a password-protected publication
    endpoint (transactions).
    """

    def __init__(self,
                 csw_discovery_url,
                 csw_publication_url,
                 gn_username=None,
                 gn_password=None):
        # GetRecordById query string requesting full ISO 19139 output
        self.getrecordbyidparam = '?request=GetRecordById&service=CSW&version=2.0.2&namespace=xmlns%28csw=http://www.opengis.net/cat/csw%29&resultType=results&outputSchema=http://www.isotc211.org/2005/gmd&outputFormat=application/xml&typeNames=csw:Record&elementSetName=full&constraintLanguage=CQL_TEXT&constraint_language_version=1.1.0'
        self.csw_discovery_url = csw_discovery_url
        self.csw_publication_url = csw_publication_url
        # BUG FIX: always define both connection attributes so that later
        # attribute access cannot raise AttributeError when no credentials
        # were supplied (previously csw_publication was left undefined).
        self.csw_publication = None
        self.csw_discovery = None
        try:
            if gn_username is not None and gn_password is not None:
                self.csw_publication = CatalogueServiceWeb(
                    self.csw_publication_url,
                    username=gn_username,
                    password=gn_password)
                self.csw_discovery = CatalogueServiceWeb(
                    self.csw_discovery_url,
                    username=gn_username,
                    password=gn_password)
        except ServiceException:
            self.csw_publication = None
            self.csw_discovery = None
        # The discovery endpoint is public: (re)connect without credentials,
        # as in the original implementation.
        try:
            self.csw_discovery = CatalogueServiceWeb(self.csw_discovery_url)
        except ServiceException:
            self.csw_discovery = None

    def get_record_list(self, keywords=None, typerecord=None):
        """Collect identifiers of records matching the given record types
        and keywords; with no arguments, every record is selected.

        Matching identifiers are stored in self.identifiers.
        @param keywords: list of keyword strings (dc:subject / keyword)
        @param typerecord: list of record type strings (dc:type)
        """
        from owslib.fes import PropertyIsEqualTo

        # None defaults avoid the shared mutable-default-argument pitfall
        keywords = keywords or []
        typerecord = typerecord or []

        constraints = []
        for rec_type in typerecord:  # renamed: 'type' shadowed the builtin
            print(rec_type)
            constraints.append(PropertyIsEqualTo('dc:type', rec_type))
        for keyword in keywords:
            constraints.append(PropertyIsEqualTo('dc:subject', keyword))
            constraints.append(PropertyIsEqualTo('keyword', keyword))
        self.csw_discovery.getrecords2(constraints=constraints,
                                       maxrecords=100000)
        print(self.csw_discovery.results)
        self.identifiers = []
        for rec in self.csw_discovery.records:
            self.identifiers.append(self.csw_discovery.records[rec].identifier)

    def updaterecord(self, id, input_xml):
        """Push an updated gmd:MD_Metadata record to the publication CSW.

        @param id: metadata identifier of the record to update
        @param input_xml: record XML as a string/bytes or an lxml Element
        """
        if self.csw_publication is None:
            print(
                "Could not perform operation as password-protected CSW is unreachable."
            )
            return
        if isinstance(input_xml, etree._Element):
            input_xml = etree.tostring(input_xml)
        self.csw_publication.transaction(ttype='update',
                                         typename='gmd:MD_Metadata',
                                         record=input_xml,
                                         identifier=id)
        print("updated record " + id)

    def getrecordxml(self, id):
        """
        Fetch the metadata record whose fileIdentifier equals `id`.

        @return: the first child of the GetRecordById response
                 (the gmd:MD_Metadata element)
        """
        f = furl(self.csw_discovery_url + self.getrecordbyidparam)
        f.args['id'] = id
        r = http.request('GET', f.url)
        publishedrecord = r.data
        tree = etree.fromstring(publishedrecord)
        root = tree[0]
        return root
Пример #36
0
 def test_GetRecords_dataset(self):
     """Issue a GetRecords query restricted to dc:type == 'dataset' with
     ISO output and count the returned records."""
     dataset_filter = PropertyIsEqualTo("dc:type", "dataset")
     csw = CatalogueServiceWeb(service)
     csw.getrecords2(constraints=[dataset_filter],
                     outputschema=GMD,
                     startposition=1,
                     maxrecords=5)
     nrecords = len(csw.records)
Пример #37
0
 def test_GetRecords_summary(self):
     """Issue a GetRecords query for the 'summary' element set with ISO
     output and count the returned records."""
     csw = CatalogueServiceWeb(service)
     csw.getrecords2(esn="summary",
                     outputschema=GMD,
                     startposition=1,
                     maxrecords=5)
     nrecords = len(csw.records)
Пример #38
0
# <codecell>

endpoint = 'http://www.nodc.noaa.gov/geoportal/csw'  # NODC/UAF Geoportal: granule level
csw = CatalogueServiceWeb(endpoint, timeout=60)
print csw.version

# <codecell>

# Show which ISO queryables the endpoint's GetRecords operation supports
for oper in csw.operations:
    if oper.name == 'GetRecords':
        print '\nISO Queryables:\n', oper.constraints[
            'SupportedISOQueryables']['values']

# <codecell>

# `filt` is defined in an earlier notebook cell
csw.getrecords2(constraints=filt, maxrecords=1000, esn='full')
len(csw.records.keys())

# <codecell>

# Extract OPeNDAP endpoint URLs from the matched records
dap_urls = service_urls(
    csw.records, service_string='urn:x-esri:specification:ServiceType:OPeNDAP')
len(dap_urls)

# <codecell>

# Probe each OPeNDAP URL; NOTE: this cell is truncated in this copy
# (the `try` below has no except clause here).
bad_urls = []
for url in dap_urls:
    try:
        nc = netCDF4.Dataset(url)
        #print url
Пример #39
0
def getDataSetURI(anyText, CSWURL, BBox):
    """

    Searches a given CSW server and returns metadata content for the datasets found.

    Arguments
    ---------

    - anyText - An iterable of search strings, each submitted as a csw:AnyText PropertyIsLike filter; an empty/falsy value returns all records.
    - CSWURL - A base URL for the CSW server to be searched.
    - BBox - A lat/lon bounding box in [minx,miny,maxx,maxy].
      NOTE(review): BBox is accepted but never used below, so results are
      NOT spatially filtered -- confirm whether a fes.BBox constraint
      should be added to the query.

    Returns a list whose first element is the header
    ['title', 'abstract', ['urls']], followed by one [title, abstract, urls]
    entry per record; http(s) URLs containing '/dodsC/' are rewritten to
    the 'dods' scheme.
    """
    csw = CatalogueServiceWeb(CSWURL, skip_caps=True)
    # FIXME: we should allow for "real" multiple keywords,
    # or change the API of anyText if that if that does not make sense in pygdp.
    # If the former we need `fes.And`, if the latter we need to not listfy `anyText`.
    if not anyText:
        constraints = []
    else:
        constraints = [
            fes.PropertyIsLike(propertyname='csw:AnyText', literal=literal)
            for literal in anyText
        ]

    csw.getrecords2(constraints=constraints,
                    outputschema='http://www.isotc211.org/2005/gmd',
                    esn='full',
                    maxrecords=100)
    dataset_uris = [['title', 'abstract', ['urls']]]

    for rec in csw.records:
        title = csw.records[rec].identification.title
        abstract = csw.records[rec].identification.abstract
        urls = []

        # distribution URLs (records without a distribution are skipped)
        try:
            for onlineresource in range(
                    len(csw.records[rec].distribution.online)):
                urls.append(
                    csw.records[rec].distribution.online[onlineresource].url)
        except AttributeError:
            pass

        # service operation connect points
        # NOTE(review): the inner loop indexes operations[0] regardless of
        # the loop variable 'operation' -- looks like it should be
        # operations[operation]; confirm against the original intent.
        for ident in range(len(csw.records[rec].identificationinfo)):
            try:
                for operation in range(
                        len(csw.records[rec].identificationinfo[ident].
                            operations)):
                    urls.append(csw.records[rec].identificationinfo[ident].
                                operations[0]['connectpoint'][0].url)
            except AttributeError:
                pass
        entry = [title, abstract, urls]
        dataset_uris.append(entry)

    # rewrite OPeNDAP (dodsC) URLs to the 'dods' scheme
    for i, dataset in enumerate(dataset_uris):
        dataset_uris[i][2] = [
            uri.replace("https", "dods").replace("http", "dods")
            if "/dodsC/" in uri else uri for uri in dataset[2]
        ]
    return dataset_uris
Пример #40
0
class CSWHarvester(Harvester):
    """
    Harvester backed by an OWSLib CSW client.

    OWSLib csw: https://geopython.github.io/OWSLib/#csw
    """
    def __init__(self, community, url, schema, fromdate, clean, limit, outdir,
                 verify):
        super().__init__(community, url, fromdate, clean, limit, outdir,
                         verify)
        logging.captureWarnings(True)
        self.csw = CatalogueServiceWeb(
            self.url, auth=Authentication(verify=self.verify))
        self._schema_type = schema
        # Resolved lazily by the `schema` / `constraints` properties.
        self._schema = None
        self._constraints = None

    def identifier(self, record):
        """Return the CSW identifier of *record*."""
        return record.identifier

    def matches(self):
        """Count records matching the constraints without fetching any."""
        self.csw.getrecords2(maxrecords=0,
                             constraints=self.constraints,
                             outputschema=self.schema)
        return self.csw.results['matches']

    @property
    def schema(self):
        """Output-schema namespace URI derived from the configured type."""
        if not self._schema:
            prefix = 'gmd' if self._schema_type == SchemaType.ISO19139 else 'csw'
            self._schema = Namespaces().get_namespace(prefix)
        return self._schema

    @property
    def constraints(self):
        """OGC filter constraints; restricts by modification date if set."""
        if not self._constraints:
            filters = []
            if self.fromdate:
                filters.append(
                    fes.PropertyIsGreaterThan('dct:modified', self.fromdate))
            self._constraints = filters
        return self._constraints

    def get_records(self):
        """Yield every matching record, paging through the catalogue."""
        position = 0
        while True:
            self.csw.getrecords2(constraints=self.constraints,
                                 startposition=position,
                                 esn='full',
                                 outputschema=self.schema,
                                 maxrecords=20)
            # OWSLib reports 0 as "nextrecord" on the last page.
            position = self.csw.results['nextrecord']
            for key in self.csw.records:
                yield self.csw.records[key]
            if position == 0:
                break

    def _write_record(self, fp, record, pretty_print=True):
        """Serialize *record* as (optionally pretty-printed) XML into *fp*."""
        tree = etree.fromstring(record.xml)
        fp.write(etree.tostring(tree,
                                pretty_print=pretty_print).decode('utf8'))
Пример #41
0
class CSW202Search(SearchBase):
    """Search provider for OGC CSW 2.0.2 catalogues (via OWSLib)."""

    def __init__(self, url, timeout, username, password, auth):
        super().__init__(url, timeout, username, password, auth)

        self.type = CATALOG_TYPES[0]
        self.format = 'xml'
        self.service_info_template = 'csw_service_metadata.html'
        self.record_info_template = 'record_metadata_dc.html'
        self.constraints = []

        self.conn = CatalogueServiceWeb(
            self.url,  # spellok
            timeout=self.timeout,
            username=self.username,
            password=self.password,
            auth=self.auth)

        self.request = self.conn.request
        self.response = self.conn.response

    def query_records(self, bbox=None, keywords=None, limit=10, offset=1):
        """Run a GetRecords query against the catalogue.

        :param bbox: [minx, miny, maxx, maxy] (strings) or None/empty for no
            spatial filter.  Was a mutable default ``[]``; ``None`` avoids the
            shared-default pitfall and is treated identically.
        :param keywords: free-text search phrase (single term for now)
        :param limit: maximum number of records to return
        :param offset: 1-based start position
        """
        self.constraints = []

        # only apply spatial filter if bbox is not global
        # even for a global bbox, if a spatial filter is applied, then
        # the CSW server will skip records without a bbox
        if bbox and bbox != ['-180', '-90', '180', '90']:
            minx, miny, maxx, maxy = bbox
            # EPSG::4326 uses lat/lon axis order, hence the coordinate swap
            self.constraints.append(
                BBox([miny, minx, maxy, maxx],
                     crs='urn:ogc:def:crs:EPSG::4326'))

        # keywords
        if keywords:
            # TODO: handle multiple word searches
            self.constraints.append(PropertyIsLike('csw:AnyText', keywords))

        if len(self.constraints) > 1:  # exclusive search (a && b)
            self.constraints = [self.constraints]

        self.conn.getrecords2(constraints=self.constraints,
                              maxrecords=limit,
                              startposition=offset,
                              esn='full')

        self.matches = self.conn.results['matches']
        self.returned = self.conn.results['returned']

        self.request = self.conn.request
        self.response = self.conn.response

    def records(self):
        """Return the last query's records as a list of plain dicts."""
        recs = []

        for key in self.conn.records:
            record = self.conn.records[key]
            rec = {
                'identifier': None,
                'type': None,
                'title': None,
                'bbox': None
            }

            if record.identifier:
                rec['identifier'] = record.identifier
            if record.type:
                rec['type'] = record.type
            if record.title:
                rec['title'] = record.title
            if record.bbox:
                rec['bbox'] = bbox_list_to_dict(record.bbox)

            # uris and dct:references both carry links to the resource
            rec['links'] = record.uris + record.references

            recs.append(rec)

        return recs

    def get_record(self, identifier):
        """Fetch a single record by identifier via GetRecordById."""
        self.conn.getrecordbyid([identifier])

        return self.conn.records[identifier]
Пример #42
0
    def mdcache(self, cswurl, constraints=None, maxrecords=10, maxharvest=20):
        """Fills the cache from a csw.

        Harvests up to *maxharvest* metadata records from *cswurl* in pages
        of *maxrecords*, writing each record's XML into the cache directory
        for that endpoint plus a MANIFEST file describing it.

        Returns the cache directory path.
        """
        # avoid the shared mutable-default pitfall; semantics unchanged
        if constraints is None:
            constraints = []

        # cache directory for this csw (signature = md5 of the url)
        if cswurl in self.cswlist:
            cswsig = self.cswlist[cswurl]["cswsig"]
            cswpath = os.path.join(self.cachepath, cswsig)
        else:
            cswsig = hashlib.md5(cswurl).hexdigest()
            logging.info("%s : new signature %s" % (cswurl, cswsig))
            cswpath = os.path.join(self.cachepath, cswsig)
            if not (os.path.isdir(cswpath)):
                os.makedirs(cswpath)
            logging.info("%s %s created" % (cswurl, cswpath))

        manifest = {
            "cswsig": cswsig,
            "cswurl": cswurl,
            "domains": {
                "organisationName": self.mdPropertyValues(cswurl, "organisationName").values()[0],
                "Subject": self.mdPropertyValues(cswurl, "Subject").values()[0],
            },
        }

        # context manager guarantees the manifest file is closed on error
        with open(os.path.join(cswpath, self.MANIFEST), "w") as f:
            f.write(json.dumps(manifest))

        logging.info("loading max %s md from %s" % (maxharvest, cswurl))
        csw = CatalogueServiceWeb(cswurl, skip_caps=True)
        first = True
        nextrecord = 0
        count = 0

        while True:
            if not first:
                nextrecord = csw.results["nextrecord"]

            # clamp the last page so we never fetch beyond maxharvest
            if count + maxrecords > maxharvest:
                maxrecords = maxharvest - count

            csw.getrecords2(
                esn="full",
                constraints=constraints,
                startposition=nextrecord,
                maxrecords=maxrecords,
                outputschema=self.OUTPUTSCHEMA,
            )

            if csw.results["matches"] == 0:
                logging.info("0 md found from %s" % cswurl)
                break
            else:
                first = False
                # fetch records
                for rec_id, rec in csw.records.iteritems():
                    count += 1
                    logging.info(str(int(float(count) / min(maxharvest, csw.results["matches"]) * 100)) + "%")
                    # NOTE(review): the original also called
                    # os.path.join(filename) on its own line; that was a
                    # no-op and has been dropped.
                    filename = os.path.join(cswpath, rec_id)
                    with open(filename, "w") as f:
                        f.write(rec.xml)

                # break if no records, beyond maxrecords or matches
                if (
                    csw.results["nextrecord"] == 0
                    or csw.results["returned"] == 0
                    or csw.results["nextrecord"] > csw.results["matches"]
                    or csw.results["nextrecord"] > maxharvest
                ):
                    logging.info("%s md loaded from %s" % (count, cswurl))
                    break

        return cswpath
Пример #43
0
def main():
    # Connection variables
    csw_url = 'csw.open.canada.ca/geonetwork/srv/csw'
    csw_user = None
    csw_passwd = None

    proxy_protocol = None
    proxy_url = None
    proxy_user = None
    proxy_passwd = None
    records_per_request = 10

    # Or read from a .ini file
    harvester_file = 'config/harvester.ini'
    if os.path.isfile(harvester_file):
        from ConfigParser import ConfigParser

        ini_config = ConfigParser()

        ini_config.read(harvester_file)

        csw_url = ini_config.get('csw', 'url')

        # Get configuration options
        if ini_config.has_option('csw', 'username'):
            csw_user = ini_config.get('csw', 'username')

            csw_passwd = ini_config.get('csw', 'password')

        if ini_config.has_option('proxy', 'protocol'):
            proxy_protocol = ini_config.get('proxy', 'protocol')

        if ini_config.has_option('proxy', 'url'):
            proxy_url = ini_config.get('proxy', 'url')

        if ini_config.has_option('proxy', 'username'):
            proxy_user = ini_config.get('proxy', 'username')
            proxy_passwd = ini_config.get('proxy', 'password')

        if ini_config.has_option('processing', 'records_per_request'):
            records_per_request = int(
                ini_config.get('processing', 'records_per_request'))

        if ini_config.has_option('processing', 'start_date'):
            start_date = ini_config.get('processing', 'start_date')

    # If your supplying a proxy
    if proxy_url:
        # And your using authentication
        if proxy_user and proxy_passwd:
            password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
            password_mgr.add_password(None, proxy_url, proxy_user,
                                      proxy_passwd)
            proxy_auth_handler = urllib2.ProxyBasicAuthHandler(password_mgr)
        # or even if your not
        else:
            proxy_auth_handler = urllib2.ProxyHandler(
                {proxy_protocol: proxy_url})

        opener = urllib2.build_opener(proxy_auth_handler)
        urllib2.install_opener(opener)

    # Fetch the data
    # csw = CatalogueServiceWeb(
    #   'http://*****:*****@csw_url/geonetwork/srv/csw')
    if csw_user and csw_passwd:
        csw = CatalogueServiceWeb('http://' + csw_url,
                                  username=csw_user,
                                  password=csw_passwd,
                                  timeout=20)
    else:
        csw = CatalogueServiceWeb('http://' + csw_url, timeout=20)

    request_template = """<?xml version="1.0"?>
<csw:GetRecords
    xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
    service="CSW"
    version="2.0.2"
    resultType="results"
    outputSchema="csw:IsoRecord"
    maxRecords="%d"
    startPosition="%d"
>
    <csw:Query
        typeNames="gmd:MD_Metadata">
        <csw:ElementSetName>full</csw:ElementSetName>
        <csw:Constraint
            version="1.1.0">
            <Filter
                xmlns="http://www.opengis.net/ogc"
                xmlns:gml="http://www.opengis.net/gml">
                <PropertyIsGreaterThanOrEqualTo>
                    <PropertyName>Modified</PropertyName>
                    <Literal>%s</Literal>
                </PropertyIsGreaterThanOrEqualTo>
            </Filter>
        </csw:Constraint>
    </csw:Query>
</csw:GetRecords>
"""

    # Is there a specified start date
    if arguments['-f']:
        start_date = arguments['-f']

    active_page = 0
    next_record = 1
    request_another = True

    while request_another:

        request_another = False

        # Filter records into latest updates
        #
        # Sorry Tom K., we'll be more modern ASAWC.
        # For now it's good ol' Kitchen Sink
        #
        # from owslib.fes import PropertyIsGreaterThanOrEqualTo
        # modified = PropertyIsGreaterThanOrEqualTo(
        #   'apiso:Modified',
        #   '2015-04-04'
        # )
        # csw.getrecords2(constraints=[modified])
        #
        # Kitchen Sink is the valid HNAP, we need HNAP for R1 to debug issues
        # This filter was supplied by EC, the CSW service technical lead
        current_request = request_template % (records_per_request, next_record,
                                              start_date)

        # (active_page*records_per_request)+1
        csw.getrecords2(format='xml', xml=current_request)
        active_page += 1

        # Identify if we need to continue this.
        records_root = ("/csw:GetRecordsResponse")

        # Read the file, should be a streamed input in the future
        root = etree.XML(csw.response)
        # Parse the root and itterate over each record
        records = fetchXMLArray(root, records_root)

        timestamp = fetchXMLAttribute(records[0], "csw:SearchStatus",
                                      "timestamp")[0]
        number_of_records_matched = int(
            fetchXMLAttribute(records[0], "csw:SearchResults",
                              "numberOfRecordsMatched")[0])
        number_of_records_returned = int(
            fetchXMLAttribute(records[0], "csw:SearchResults",
                              "numberOfRecordsReturned")[0])
        next_record = int(
            fetchXMLAttribute(records[0], "csw:SearchResults",
                              "nextRecord")[0])

        if next_record > number_of_records_matched or next_record == 0:
            pass
        else:
            request_another = True

        # When we move to Tom K's filter we can use results in an R2 unified
        # harvester
        # print csw.results
        # for rec in csw.records:
        #    print '* '+csw.records[rec].title
        # Till then we need to collect and dump the response from the CSW

        # No use minimizing the XML to try to create a XML Lines file as the
        # data has carriage returns.
        # parser = etree.XMLParser(remove_blank_text=True)
        # elem = etree.XML(csw.response, parser=parser)
        # print etree.tostring(elem)

        # Output the harvested page
        print csw.response
Пример #44
0
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>

# <codecell>

# Collect the set of distinct reference "scheme" values advertised by
# records matching a CF standard-name search on the NOAA catalogue.
from owslib import fes
from owslib.csw import CatalogueServiceWeb

schemes = set()
c   = CatalogueServiceWeb("https://data.noaa.gov/csw", timeout=20)
# Full-text search for the CF standard name, with '*' wildcards
fil = fes.PropertyIsLike(propertyname='apiso:AnyText', literal="*sea_surface_height_above_sea_level*", wildCard='*')
c.getrecords2(constraints=[fil], maxrecords=1000, esn='full')

# <codecell>

# Each record's dct:references holds dicts with 'scheme' and 'url' keys;
# keep only the schemes
for record, item in c.records.items():
    for d in item.references:
        schemes.add(d['scheme'])

# <codecell>

# Python 2 print statements (this snippet is a notebook export)
for scheme in schemes:
    print scheme

# <codecell>

# Bare expression: the notebook displays the resulting type
type(schemes)

# <codecell>

Пример #45
0
def create_services_from_endpoint(url, catalog, greedy_opt=True):
    """
    Generate service/services from an endpoint.
    WMS, WMTS, TMS endpoints correspond to a single service.
    ESRI, CSW endpoints corrispond to many services.
    :return: imported, message
    """

    # this variable will collect any exception message during the routine.
    # will be used in the last step to send a message if "detected" var is False.
    messages = []

    num_created = 0
    endpoint = get_sanitized_endpoint(url)
    try:
        urllib2.urlopen(endpoint, timeout=10)
    except Exception as e:
        message = traceback.format_exception(*sys.exc_info())
        LOGGER.error('Cannot open this endpoint: %s' % endpoint)
        LOGGER.error('ERROR MESSAGE: %s' % message)
        LOGGER.error(e, exc_info=True)

        return False, message

    detected = False

    # handle specific service types for some domains (WorldMap, Wrapper...)
    parsed_uri = urlparse(endpoint)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    if domain == 'http://worldmap.harvard.edu/':
        service_type = 'Hypermap:WorldMap'
        title = 'Harvard WorldMap'
        abstract = 'Harvard WorldMap'
        endpoint = domain
        detected = True
    if domain in [
        'http://maps.nypl.org/',
        'http://mapwarper.net/',
        'http://warp.worldmap.harvard.edu/',
    ]:
        service_type = 'Hypermap:WARPER'
        title = 'Warper at %s' % domain
        abstract = 'Warper at %s' % domain
        detected = True

    # test if it is CSW, WMS, TMS, WMTS or Esri
    # CSW
    try:
        csw = CatalogueServiceWeb(endpoint)
        service_type = 'OGC:CSW'
        service_links = {}
        detected = True

        typenames = 'csw:Record'
        outputschema = 'http://www.opengis.net/cat/csw/2.0.2'

        if 'csw_harvest_pagesize' in settings.REGISTRY_PYCSW['manager']:
            pagesize = int(settings.REGISTRY_PYCSW['manager']['csw_harvest_pagesize'])
        else:
            pagesize = 10

        LOGGER.debug('Harvesting CSW %s' % endpoint)
        # now get all records
        # get total number of records to loop against
        try:
            csw.getrecords2(typenames=typenames, resulttype='hits',
                            outputschema=outputschema)
            matches = csw.results['matches']
        except:  # this is a CSW, but server rejects query
            raise RuntimeError(csw.response)

        if pagesize > matches:
            pagesize = matches

        LOGGER.info('Harvesting %d CSW records' % matches)

        # loop over all catalogue records incrementally
        for r in range(1, matches+1, pagesize):
            LOGGER.info('Parsing %s from %s' % (r, matches))
            try:
                csw.getrecords2(typenames=typenames, startposition=r,
                                maxrecords=pagesize, outputschema=outputschema, esn='full')
            except Exception as err:  # this is a CSW, but server rejects query
                raise RuntimeError(csw.response)
            for k, v in csw.records.items():
                # try to parse metadata
                try:
                    LOGGER.info('Looking for service links')

                    LOGGER.debug('Looking for service links via dct:references')
                    if v.references:
                        for ref in v.references:
                            scheme = None

                            if ref['scheme'] in [st[0] for st in SERVICE_TYPES]:
                                if ref['url'] not in service_links:
                                    scheme = ref['scheme']
                                    service_links[ref['url']] = scheme
                            else:  # loose detection
                                scheme = detect_metadata_url_scheme(ref['url'])
                                if scheme is not None:
                                    if ref['url'] not in service_links:
                                        service_links[ref['url']] = scheme

                            if scheme is None:
                                continue
                            try:
                                service = create_service_from_endpoint(ref['url'], scheme, catalog=catalog)

                                if service is not None:
                                    num_created = num_created + 1
                                    LOGGER.info('Found %s services on endpoint' % num_created)
                            except Exception, e:
                                LOGGER.error('Could not create service for %s : %s' % (scheme, ref['url']))
                                LOGGER.error(e, exc_info=True)
                    LOGGER.debug('Looking for service links via the GeoNetwork-ish dc:URI')
                    if v.uris:
                        for u in v.uris:  # loose detection
                            scheme = detect_metadata_url_scheme(u['url'])
                            if scheme is not None:
                                if u['url'] not in service_links:
                                    service_links[u['url']] = scheme
                            else:
                                continue

                            try:
                                service = create_service_from_endpoint(u['url'], scheme, catalog=catalog)

                                if service is not None:
                                    num_created = num_created + 1
                                    LOGGER.info('Found %s services on endpoint' % num_created)
                            except Exception, e:
                                LOGGER.error('Could not create service for %s : %s' % (scheme, u['url']))
                                LOGGER.error(e, exc_info=True)

                except Exception as err:  # parsing failed for some reason
                    LOGGER.warning('Metadata parsing failed %s', err)
                    LOGGER.error(err, exc_info=True)
def get_datasets_from_csw(csw_endpoint):
    """Return all (up to 1000) full records from *csw_endpoint*.

    BUG FIX: the ``csw_endpoint`` argument was previously ignored in favour
    of a hard-coded URL ('http://cida.usgs.gov/gdp/csw/'); it is now used.
    """
    csw = CatalogueServiceWeb(csw_endpoint)
    csw.getrecords2(esn='full', maxrecords=1000)
    return csw.records.values()
# Interactive walkthrough of the Czech CENIA/Micka CSW endpoint.
from owslib.csw import CatalogueServiceWeb
cenia = CatalogueServiceWeb('http://geoportal.gov.cz/php/micka/csw/index.php')
print (cenia.service)

# Default GetRecords query (first page of records)
cenia.getrecords2()
print (cenia.results)

for rec in cenia.records:
    print (cenia.records[rec].title)

from owslib.fes import PropertyIsLike, BBox, And, PropertyIsEqualTo
# Records mentioning "WMS" that intersect the Prague bounding box
wms_query = PropertyIsEqualTo('csw:AnyText', 'WMS')
praha_query = BBox([14.22,49.94,14.71,50.18])
praha_and_wms = And([praha_query, wms_query])
cenia.getrecords2([praha_and_wms], esn='full')
print (cenia.results)

# Print each match with its bounding-box corners
for recid in cenia.records:
    record = cenia.records[recid]
    print (u'{}: {} {} {} {}'.format(record.title, record.bbox.minx, record.bbox.miny,
                                     record.bbox.maxx, record.bbox.maxy))

# Look up the ZM10 base-map record by a known identifier
zm_query = PropertyIsEqualTo('csw:AnyText', '%ZM10%')
cenia.getrecords2([zm_query], esn='full')
zm10 = cenia.records['CZ-CUZK-WMS-ZM10-P']
print (zm10.type)

print (u'{}\n{}'.format(zm10.title, zm10.abstract))

# First reference URL of the record (the service endpoint)
url = zm10.references[0]['url']
Пример #48
0
class TestCSW(LiveServerTestCase):
    """Integration tests for the registry CSW endpoint (2.0.2 and 3.0.0)."""
    def setUp(self):
        """Create a user and catalog, then insert 10 layers via CSW-T."""
        self.script_name = '/registry/{}/csw'.format(catalog_test_slug)
        self.url = '{}{}'.format(self.live_server_url, self.script_name)
        self.username = '******'
        self.password = '******'
        self.client = Client()
        user = User.objects.create(username=self.username)
        user.set_password(self.password)
        user.save()
        self.client.login(username=self.username, password=self.password)

        settings.REGISTRY_PYCSW['server']['url'] = self.url

        Catalog.objects.get_or_create(
            name=catalog_test_slug
        )

        # start from a clean slate
        Layer.objects.all().delete()
        Service.objects.all().delete()

        # Python 2 print statements: echo relevant settings to the test log
        print ""
        print ">>> with env:"
        print "REGISTRY_SKIP_CELERY: %s" % settings.REGISTRY_SKIP_CELERY
        print "REGISTRY_LIMIT_LAYERS: %s" % settings.REGISTRY_LIMIT_LAYERS
        print "REGISTRY_SEARCH_URL: %s" % settings.REGISTRY_SEARCH_URL
        print "REGISTRY_HARVEST_SERVICES: %s" % settings.REGISTRY_HARVEST_SERVICES
        print ""

        # Post the 10 Layers contained in this file: data/cswt_insert.xml
        path = os.path.join(settings.PROJECT_DIR, "..",
                            "data", "cswt_insert.xml")
        with open(path, 'rb') as ff:
            payload = ff.read()
        content_type = "application/xml"

        res = self.client.post(self.url, data=payload, content_type=content_type)
        self.assertEqual(res.status_code, 200)
        self.assertEqual(Layer.objects.all().count(), 10)

    def test_csw(self):
        """Exercise CSW 2.0.2 searches plus the 3.0.0 OpenSearch mode."""
        # test 2.0.2 Basic Service Profile

        self.csw = CatalogueServiceWeb(self.url, version='2.0.2', username=self.username, password=self.password)
        self.assertEqual(self.csw.version, '2.0.2')

        self.assertIn('2.0.2', self.csw.parameters['version'].values)
        self.assertIn('3.0.0', self.csw.parameters['version'].values)

        # every advertised operation endpoint must match the service URL
        for op in self.csw.operations:
            for method in op.methods:
                self.assertEqual(self.csw.url, method['url'])

        self.assertTrue('Transaction' in [o.name for o in self.csw.operations])
        self.assertTrue('Harvest' in [o.name for o in self.csw.operations])

        get_records_op = self.csw.get_operation_by_name('GetRecords')
        self.assertIn('application/json', get_records_op.parameters['outputFormat']['values'])

        # test basic search, no predicates
        self.csw.getrecords2()
        self.assertEqual(Layer.objects.all().count(), self.csw.results['matches'])

        # test csw:AnyText
        anytext = PropertyIsLike('csw:AnyText', 'Brasilia')
        self.csw.getrecords2(constraints=[anytext])
        self.assertEqual(self.csw.results['matches'], 1)

        anytext = PropertyIsLike('csw:AnyText', 'roads')
        self.csw.getrecords2(constraints=[anytext])
        self.assertEqual(self.csw.results['matches'], 4)

        # test ogc:BBOX
        bbox = BBox(['-13', '-80', '15', '-30'])
        self.csw.getrecords2(constraints=[bbox])
        self.assertEqual(self.csw.results['matches'], 2)

        # test csw:AnyText OR ogc:BBOX (flat constraint list = OR)
        self.csw.getrecords2(constraints=[anytext, bbox])
        self.assertEqual(self.csw.results['matches'], 5)

        # test csw:AnyText && ogc:BBOX (nested constraint list = AND)
        self.csw.getrecords2(constraints=[[anytext, bbox]])
        self.assertEqual(self.csw.results['matches'], 1)

        # test that ElementSetName=full stores full metadata record as inserted
        self.csw.getrecords2(esn='full')
        self.assertIn('xmlns:registry="http://gis.harvard.edu/HHypermap/registry/0.1"', self.csw.response)

        # test JSON output
        # TODO: fix owslib.csw.CatalogueServiceWeb.getrecords2 to handle non-XML request/response
        with self.assertRaises(XMLSyntaxError):
            self.csw.getrecords2(constraints=[anytext, bbox], format='application/json')

        # the raw response is still valid JSON even though OWSLib choked on it
        records_json = json.loads(self.csw.response)
        self.assertEqual(records_json['csw:GetRecordsResponse']['csw:SearchResults']['@numberOfRecordsMatched'], '5')

        # test 3.0.0 OpenSearch
        bsp = {
            'mode': 'opensearch',
            'service': 'CSW',
            'version': '3.0.0',
            'request': 'GetRecords',
            'typenames': 'csw:Record',
            'elementsetname': 'full',
            'outputformat': 'application/json'
        }

        # test basic search, no predicates
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '10')

        # test q
        bsp['q'] = 'Brasilia'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')
        bsp.pop('q')

        # test bbox
        bsp['bbox'] = '-80,-13,-30,15'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '2')
        bsp.pop('bbox')

        # test time
        bsp['time'] = '2014-09-23T12:04:31.102243+00:00/'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '10')
        bsp.pop('time')

        # test q and bbox
        bsp['q'] = 'roads'
        bsp['bbox'] = '-80,-13,-30,15'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')

        # test q and bbox and time
        bsp['time'] = '2014-09-23T12:04:31.102243+00:00/'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')

    @classmethod
    def tearDownClass(cls):
        # Workaround for https://code.djangoproject.com/ticket/22414
        # Persistent connections not closed by LiveServerTestCase, preventing dropping test databases
        # https://github.com/cjerdonek/django/commit/b07fbca02688a0f8eb159f0dde132e7498aa40cc
        def close_sessions(conn):
            """Terminate every other session on the current test database."""
            close_sessions_query = """
                SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE
                    datname = current_database() AND
                    pid <> pg_backend_pid();
            """
            with conn.cursor() as cursor:
                try:
                    cursor.execute(close_sessions_query)
                except OperationalError:
                    # We get kicked out after closing.
                    pass

        for alias in connections:
            connections[alias].close()
            close_sessions(connections[alias])

        print "Forcefully closed database connections."
# #### Query each CSW catalog for the cf_name_filters constructed above

# <codecell>

# Query each known CSW server with each CF standard-name filter and
# collect (variable, scheme, url, server, title) result rows.
from owslib.csw import CatalogueServiceWeb
from utilities import normalize_service_urn

var_results = []

for x in range(len(cf_name_filters)):
    var_name          = variables_to_query[x]
    single_var_filter = cf_name_filters[x]
    for url in known_csw_servers:
        try:
            csw = CatalogueServiceWeb(url, timeout=20)
            csw.getrecords2(constraints=[single_var_filter], maxrecords=1000, esn='full')
            for record, item in csw.records.items():
                for d in item.references:
                    result = dict(variable=var_name,
                                  scheme=normalize_service_urn(d['scheme']),
                                  url=d['url'],
                                  server=url,
                                  # NOTE(review): `record` is the dict key (a
                                  # string), so .title() merely capitalizes
                                  # it; item.title was probably intended -
                                  # confirm before changing
                                  title=record.title())
                    var_results.append(result)
        except BaseException, e:
            # Python 2 except syntax; a failure for one server is reported
            # and the loop moves on to the next one
            print "- FAILED: %s - %s" % (url, e)

# <markdowncell>

# <div class="error"><strong>Paginating CSW Records</strong> - Some servers have a maximum amount of records you can retrieve at once. See: https://github.com/ioos/system-test/issues/126</div>
Пример #50
0
    print('GetDomain not supported')

# <codecell>

# Search the catalogue for "salinity" datasets in the Gulf of Maine.
box=[-72.0, 41.0, -69.0, 43.0]   # gulf of maine

# <codecell>

val = 'salinity'
# Wildcarded full-text filter: *salinity*
filter1 = fes.PropertyIsLike(propertyname='apiso:AnyText',literal=('*%s*' % val),
                        escapeChar='\\',wildCard='*',singleChar='?')
filter_list = [ filter1 ]

# <codecell>

# `csw` is created in an earlier notebook cell (not shown in this excerpt)
csw.getrecords2(constraints=filter_list,maxrecords=1000,esn='full')
len(csw.records.keys())

# <codecell>

# Python 2 iteritems/print (this snippet is a notebook export)
for rec,item in csw.records.iteritems():
    print item.title

# <codecell>

# Inspect a random matching record's title and references
choice=np.random.choice(list(csw.records.keys()))
print csw.records[choice].title
csw.records[choice].references

# <codecell>
Пример #51
0
def create_services_from_endpoint(url, catalog, greedy_opt=True):
    """
    Generate service/services from an endpoint.

    WMS, WMTS, TMS endpoints correspond to a single service.
    ESRI, CSW endpoints correspond to many services.

    :param url: endpoint URL to probe.
    :param catalog: catalog the created service(s) are attached to.
    :param greedy_opt: for ESRI endpoints, process every folder when True;
                       otherwise only the single service named in the URL.
    :return: imported, message
    """

    # this variable will collect any exception message during the routine.
    # will be used in the last step to send a message if "detected" var is False.
    messages = []

    num_created = 0
    endpoint = get_sanitized_endpoint(url)
    try:
        urllib.request.urlopen(endpoint, timeout=10)
    except Exception as e:
        message = traceback.format_exception(*sys.exc_info())
        LOGGER.error('Cannot open this endpoint: %s' % endpoint)
        LOGGER.error('ERROR MESSAGE: %s' % message)
        LOGGER.error(e, exc_info=True)

        return False, message

    detected = False

    # handle specific service types for some domains (WorldMap, Wrapper...)
    parsed_uri = urlparse(endpoint)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    if domain == 'http://worldmap.harvard.edu/':
        service_type = 'Hypermap:WorldMap'
        title = 'Harvard WorldMap'
        abstract = 'Harvard WorldMap'
        endpoint = domain
        detected = True
    if domain in [
            'http://maps.nypl.org/',
            'http://mapwarper.net/',
            'http://warp.worldmap.harvard.edu/',
    ]:
        service_type = 'Hypermap:WARPER'
        title = 'Warper at %s' % domain
        abstract = 'Warper at %s' % domain
        detected = True

    # test if it is CSW, WMS, TMS, WMTS or Esri
    # CSW
    try:
        csw = CatalogueServiceWeb(endpoint)
        service_type = 'OGC:CSW'
        service_links = {}
        detected = True

        typenames = 'csw:Record'
        outputschema = 'http://www.opengis.net/cat/csw/2.0.2'

        if 'csw_harvest_pagesize' in settings.REGISTRY_PYCSW['manager']:
            pagesize = int(
                settings.REGISTRY_PYCSW['manager']['csw_harvest_pagesize'])
        else:
            pagesize = 10

        LOGGER.debug('Harvesting CSW %s' % endpoint)
        # now get all records
        # get total number of records to loop against
        try:
            csw.getrecords2(typenames=typenames,
                            resulttype='hits',
                            outputschema=outputschema)
            matches = csw.results['matches']
        except Exception:  # this is a CSW, but server rejects query
            # narrowed from a bare except so SystemExit/KeyboardInterrupt
            # are not swallowed
            raise RuntimeError(csw.response)

        if pagesize > matches:
            pagesize = matches

        LOGGER.info('Harvesting %d CSW records' % matches)

        # loop over all catalogue records incrementally
        for r in range(1, matches + 1, pagesize):
            LOGGER.info('Parsing %s from %s' % (r, matches))
            try:
                csw.getrecords2(typenames=typenames,
                                startposition=r,
                                maxrecords=pagesize,
                                outputschema=outputschema,
                                esn='full')
            except Exception as err:  # this is a CSW, but server rejects query
                # keep the original failure chained for diagnostics
                raise RuntimeError(csw.response) from err
            for k, v in list(csw.records.items()):
                # try to parse metadata
                try:
                    LOGGER.info('Looking for service links')

                    LOGGER.debug(
                        'Looking for service links via dct:references')
                    if v.references:
                        for ref in v.references:
                            scheme = None

                            if ref['scheme'] in [
                                    st[0] for st in SERVICE_TYPES
                            ]:
                                if ref['url'] not in service_links:
                                    scheme = ref['scheme']
                                    service_links[ref['url']] = scheme
                            else:  # loose detection
                                scheme = detect_metadata_url_scheme(ref['url'])
                                if scheme is not None:
                                    if ref['url'] not in service_links:
                                        service_links[ref['url']] = scheme

                            if scheme is None:
                                continue
                            try:
                                service = create_service_from_endpoint(
                                    ref['url'], scheme, catalog=catalog)

                                if service is not None:
                                    num_created = num_created + 1
                                    LOGGER.info(
                                        'Found %s services on endpoint' %
                                        num_created)
                            except Exception as e:
                                LOGGER.error(
                                    'Could not create service for %s : %s' %
                                    (scheme, ref['url']))
                                LOGGER.error(e, exc_info=True)
                    LOGGER.debug(
                        'Looking for service links via the GeoNetwork-ish dc:URI'
                    )
                    if v.uris:
                        for u in v.uris:  # loose detection
                            scheme = detect_metadata_url_scheme(u['url'])
                            if scheme is not None:
                                if u['url'] not in service_links:
                                    service_links[u['url']] = scheme
                            else:
                                continue

                            try:
                                service = create_service_from_endpoint(
                                    u['url'], scheme, catalog=catalog)

                                if service is not None:
                                    num_created = num_created + 1
                                    LOGGER.info(
                                        'Found %s services on endpoint' %
                                        num_created)
                            except Exception as e:
                                LOGGER.error(
                                    'Could not create service for %s : %s' %
                                    (scheme, u['url']))
                                LOGGER.error(e, exc_info=True)

                except Exception as err:  # parsing failed for some reason
                    LOGGER.warning('Metadata parsing failed %s', err)
                    LOGGER.error(err, exc_info=True)

    except XMLSyntaxError as e:
        # This is not XML, so likely not a CSW. Moving on.
        pass
    except Exception as e:
        LOGGER.error(e, exc_info=True)
        messages.append(str(e))

    # WMS
    if not detected:
        try:
            service = get_wms_version_negotiate(endpoint, timeout=10)
            service_type = 'OGC:WMS'
            # BUG FIX: a stray trailing comma made title a 1-tuple
            # ('<title>',); keep it a plain string.
            title = service.identification.title
            abstract = service.identification.abstract
            detected = True
        except XMLSyntaxError as e:
            # This is not XML, so likely not a WMS. Moving on.
            pass
        except Exception as e:
            LOGGER.error(e, exc_info=True)
            messages.append(str(e))

    # TMS
    if not detected:
        try:
            service = TileMapService(endpoint, timeout=10)
            service_type = 'OSGeo:TMS'
            title = service.identification.title  # trailing comma removed (tuple bug)
            abstract = service.identification.abstract
            detected = True
        except XMLSyntaxError as e:
            # This is not XML, so likely not a TsMS. Moving on.
            pass
        except Exception as e:
            LOGGER.error(e, exc_info=True)
            messages.append(str(e))

    # WMTS
    if not detected:
        try:
            # @tomkralidis timeout is not implemented for WebMapTileService?
            service = WebMapTileService(endpoint)
            service_type = 'OGC:WMTS'
            title = service.identification.title  # trailing comma removed (tuple bug)
            abstract = service.identification.abstract
            detected = True
        except XMLSyntaxError as e:
            # This is not XML, so likely not a WMTS. Moving on.
            pass
        except Exception as e:
            LOGGER.error(e, exc_info=True)
            messages.append(str(e))

    # if detected, let's create the service
    if detected and service_type != 'OGC:CSW':
        try:
            service = create_service_from_endpoint(endpoint,
                                                   service_type,
                                                   title,
                                                   abstract=abstract,
                                                   catalog=catalog)
            if service is not None:
                num_created = num_created + 1
        except XMLSyntaxError as e:
            # This is not XML, so likely not a OGC:CSW. Moving on.
            pass
        except Exception as e:
            LOGGER.error(e, exc_info=True)
            messages.append(str(e))

    # Esri
    # a good sample is here: https://gis.ngdc.noaa.gov/arcgis/rest/services

    # we can safely assume the following condition (at least it is true for 1170 services)
    # we need to test this as arcrest.Folder can freeze with not esri url such as this one:
    # http://hh.worldmap.harvard.edu/admin/aggregator/service/?q=%2Frest%2Fservices

    if '/rest/services' in endpoint:
        if not detected:
            try:
                esri = arcrest.Folder(endpoint)
                service_type = 'ESRI'
                detected = True

                service_to_process, folder_to_process = esri.services, esri.folders
                if not greedy_opt:
                    folder_to_process = []
                    sections = service_url_parse(url)
                    service_to_process = get_single_service(esri, sections)

                processed_services = process_esri_services(
                    service_to_process, catalog)
                num_created = num_created + len(processed_services)

                for folder in folder_to_process:
                    folder_services = process_esri_services(
                        folder.services, catalog)
                    num_created = num_created + len(folder_services)

            except Exception as e:
                LOGGER.error(e, exc_info=True)
                messages.append(str(e))

    if detected:
        return True, '%s service/s created' % num_created
    else:
        m = '|'.join(messages)
        return False, 'ERROR! Could not detect service type for ' \
                      'endpoint %s or already existing. messages=(%s)' % (endpoint, m)
# #### Query each CSW catalog for revery model_name_filter constructed above

# <codecell>

from owslib.csw import CatalogueServiceWeb

model_results = []

for x in range(len(model_name_filters)):
    model_name          = known_model_strings[x]
    single_model_filter = model_name_filters[x]
    for url in known_csw_servers:
        try:
            csw = CatalogueServiceWeb(url, timeout=20)
            csw.getrecords2(constraints=[single_model_filter], maxrecords=1000, esn='full')
            for record, item in csw.records.items():
                for d in item.references:
                    result = dict(model=model_name,
                                  scheme=d['scheme'],
                                  url=d['url'],
                                  server=url)
                    model_results.append(result)
        except BaseException as e:
            print "- FAILED: %s - %s" % (url, e.msg)

# <markdowncell>

# <div class="error"><strong>Paginating CSW Records</strong> - Some servers have a maximum amount of records you can retrieve at once. See: https://github.com/ioos/system-test/issues/126</div>

# <markdowncell>
Пример #53
0
class CswScanner:
    def __init__(self):
        self.napids = []
        self.start_pos = 0

        # Get the CSW URL, Username and Password

        ini_config = ConfigParser()
        ini_config.read('harvester.ini')
        csw_url = ini_config.get('csw', 'csw.url')
        csw_user = ini_config.get('csw', 'csw.username')
        csw_passwd = ini_config.get('csw', 'csw.password')
        if csw_user and csw_passwd:
            self.csw = CatalogueServiceWeb(csw_url,
                                           username=csw_user,
                                           password=csw_passwd,
                                           timeout=20)
        else:
            self.csw = CatalogueServiceWeb(csw_url, timeout=20)

#### The since date is currently being ignored.

    def get_all_ids(self, since=None):

        while True:
            if since is not None:
                scan_date = since.strftime('%Y-%m-%d')
                since_query = PropertyIsGreaterThanOrEqualTo(
                    'Modified', scan_date)
                self.csw.getrecords2(esn='brief',
                                     startposition=self.start_pos,
                                     typenames='gmd:MD_Metadata',
                                     constraints=[since_query])
            else:
                #self.csw.getrecords2(esn='brief', startposition=self.start_pos, typenames='gmd:MD_Metadata')
                self.csw.getrecords2(
                    xml=
                    '<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" service="CSW" version="2.0.2" resultType="results" outputSchema="csw:IsoRecord"><csw:Query typeNames="gmd:MD_Metadata"><csw:Constraint version="1.1.0"><Filter xmlns="http://www.opengis.net/ogc" xmlns:gml="http://www.opengis.net/gml"/></csw:Constraint></csw:Query></csw:GetRecords>'
                )
            if self.csw.results['returned'] == 0:
                break
            print '{0}Found {1}{2}{3} records'.format(
                Fore.GREEN, Fore.BLUE, self.csw.results['matches'], Fore.GREEN)
            print '{0}Next record: {1}{2}'.format(
                Fore.GREEN, Fore.BLUE, self.csw.results['nextrecord'])
            self.start_pos = self.csw.results['nextrecord']

            for rec in self.csw.records:
                try:
                    print u'{0}{1}{2}: {3}'.format(
                        Fore.RED, rec, Fore.CYAN,
                        self.csw.records[rec].title.decode('utf-8'))
                except UnicodeEncodeError:
                    print u'{0}Unprintable title for {1}{2}'.format(
                        Fore.GREEN, Fore.RED, rec)
                self.napids.append(rec)

    def load_naps(self):

        ns = Namespaces()
        gmd = ns.get_namespace('gmd')
        session = connect_to_database()

        for napid in self.napids:

            print '{0}Full NAP Record for {1}{2}'.format(
                Fore.GREEN, Fore.CYAN, napid)
            self.csw.getrecordbyid(id=[napid], outputschema=gmd)

            ec_rec = find_record_by_uuid(session, napid, query_class=ECRecord)

            if ec_rec is None:
                ec_rec = ECRecord(
                    uuid=self.csw.records[napid].identifier,
                    title=self.csw.records[napid].identification.title,
                    state='active',
                    nap_record=self.csw.records[napid].xml,
                    csw_scanned=datetime.now().isoformat())
            else:
                ec_rec.title = self.csw.records[napid].identification.title,
                ec_rec.state = 'active',
                ec_rec.nap_record = self.csw.records[napid].xml,
                ec_rec.csw_scanned = datetime.now().isoformat()

            add_record(session, ec_rec)

        session.close_all()
#         print '\n' + var_name.upper()
        num_recs = []
        for location, bounding_box in locations.iteritems():
#             print location
            
            bbox = fes.BBox(bounding_box)
            #use the search name to create search filter
            or_filt = fes.Or([fes.PropertyIsLike(propertyname='apiso:AnyText',
                                                 literal='*%s*' % val,
                                                 escapeChar='\\',
                                                 wildCard='*',
                                                 singleChar='?') for val in names_dict[var_name]["names"]])
            filter_list = [fes.And([ bbox, or_filt])]
            # try request using multiple filters "and" syntax: [[filter1,filter2]]
            try:
                csw.getrecords2(constraints=filter_list, maxrecords=max_records, resulttype='hits')
            except Exception as e:
                print '\t' + 'ERROR - ' + str(e)
                num_recs.append(np.NaN)
            else:
#                 print csw.results['matches']
                num_recs.append(csw.results['matches'])
            
        results[var_name] = np.array(num_recs)

    # Save the results
    prod = list(itertools.product([endpoint], locations.keys()))
    mi = pd.MultiIndex.from_tuples(prod, names=['endpoint', 'location'])
    all_data.append(pd.DataFrame(results, index=mi))
                     
    # Update progress bar
Пример #55
0
class TestCSW(LiveServerTestCase):
    """End-to-end CSW tests against a live registry instance.

    setUp inserts 10 layers via a CSW-T Insert payload (data/cswt_insert.xml);
    test_csw then exercises the CSW 2.0.2 Basic Service Profile and the
    3.0.0 OpenSearch interface against those records.
    """

    def setUp(self):
        # build the CSW endpoint URL for the test catalog on the live server
        self.script_name = '/registry/{}/csw'.format(catalog_test_slug)
        self.url = '{}{}'.format(self.live_server_url, self.script_name)
        self.username = '******'
        self.password = '******'
        self.client = Client()
        user = User.objects.create(username=self.username)
        user.set_password(self.password)
        user.save()
        self.client.login(username=self.username, password=self.password)

        # pycsw must advertise the live test server as its own URL
        settings.REGISTRY_PYCSW['server']['url'] = self.url

        Catalog.objects.get_or_create(name=catalog_test_slug)

        # start from a clean slate before the CSW-T insert below
        Layer.objects.all().delete()
        Service.objects.all().delete()

        print ""
        print ">>> with env:"
        print "REGISTRY_SKIP_CELERY: %s" % settings.REGISTRY_SKIP_CELERY
        print "REGISTRY_LIMIT_LAYERS: %s" % settings.REGISTRY_LIMIT_LAYERS
        print "REGISTRY_CHECK_PERIOD: %s" % settings.REGISTRY_CHECK_PERIOD
        print "REGISTRY_SEARCH_URL: %s" % settings.REGISTRY_SEARCH_URL
        print "REGISTRY_HARVEST_SERVICES: %s" % settings.REGISTRY_HARVEST_SERVICES
        print ""

        # Post the 10 Layers contained in this file: data/cswt_insert.xml
        path = os.path.join(settings.PROJECT_DIR, "..", "data",
                            "cswt_insert.xml")
        with open(path, 'rb') as ff:
            payload = ff.read()
        content_type = "application/xml"

        res = self.client.post(self.url,
                               data=payload,
                               content_type=content_type)
        self.assertEqual(res.status_code, 200)
        self.assertEqual(Layer.objects.all().count(), 10)

    def test_csw(self):
        # test 2.0.2 Basic Service Profile

        self.csw = CatalogueServiceWeb(self.url,
                                       version='2.0.2',
                                       username=self.username,
                                       password=self.password)
        self.assertEqual(self.csw.version, '2.0.2')

        self.assertIn('2.0.2', self.csw.parameters['version'].values)
        self.assertIn('3.0.0', self.csw.parameters['version'].values)

        # every advertised operation endpoint must be the server's own URL
        for op in self.csw.operations:
            for method in op.methods:
                self.assertEqual(self.csw.url, method['url'])

        self.assertTrue('Transaction' in [o.name for o in self.csw.operations])
        self.assertTrue('Harvest' in [o.name for o in self.csw.operations])

        get_records_op = self.csw.get_operation_by_name('GetRecords')
        self.assertIn('application/json',
                      get_records_op.parameters['outputFormat']['values'])

        # test basic search, no predicates
        self.csw.getrecords2()
        self.assertEqual(Layer.objects.all().count(),
                         self.csw.results['matches'])

        # test csw:AnyText
        anytext = PropertyIsLike('csw:AnyText', 'Brasilia')
        self.csw.getrecords2(constraints=[anytext])
        self.assertEqual(self.csw.results['matches'], 1)

        anytext = PropertyIsLike('csw:AnyText', 'roads')
        self.csw.getrecords2(constraints=[anytext])
        self.assertEqual(self.csw.results['matches'], 4)

        # test ogc:BBOX
        bbox = BBox(['-13', '-80', '15', '-30'])
        self.csw.getrecords2(constraints=[bbox])
        self.assertEqual(self.csw.results['matches'], 2)

        # test csw:AnyText OR ogc:BBOX
        self.csw.getrecords2(constraints=[anytext, bbox])
        self.assertEqual(self.csw.results['matches'], 5)

        # test csw:AnyText && ogc:BBOX
        self.csw.getrecords2(constraints=[[anytext, bbox]])
        self.assertEqual(self.csw.results['matches'], 1)

        # test that ElementSetName=full stores full metadata record as inserted
        self.csw.getrecords2(esn='full')
        self.assertIn(
            'xmlns:registry="http://gis.harvard.edu/HHypermap/registry/0.1"',
            self.csw.response)

        # test JSON output
        # TODO: fix owslib.csw.CatalogueServiceWeb.getrecords2 to handle non-XML request/response
        with self.assertRaises(XMLSyntaxError):
            self.csw.getrecords2(constraints=[anytext, bbox],
                                 format='application/json')

        # NOTE(review): although owslib raised above (it tried to parse the
        # JSON body as XML), the raw response appears to be retained on
        # self.csw.response — confirm against the owslib version in use.
        records_json = json.loads(self.csw.response)
        self.assertEqual(
            records_json['csw:GetRecordsResponse']['csw:SearchResults']
            ['@numberOfRecordsMatched'], '5')

        # test 3.0.0 OpenSearch
        bsp = {
            'mode': 'opensearch',
            'service': 'CSW',
            'version': '3.0.0',
            'request': 'GetRecords',
            'typenames': 'csw:Record',
            'elementsetname': 'full',
            'outputformat': 'application/json'
        }

        # test basic search, no predicates
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '10')

        # test q
        bsp['q'] = 'Brasilia'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')
        bsp.pop('q')

        # test bbox
        bsp['bbox'] = '-80,-13,-30,15'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '2')
        bsp.pop('bbox')

        # test time
        bsp['time'] = '2014-09-23T12:04:31.102243+00:00/'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '10')
        bsp.pop('time')

        # test q and bbox
        bsp['q'] = 'roads'
        bsp['bbox'] = '-80,-13,-30,15'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')

        # test q and bbox and time
        bsp['time'] = '2014-09-23T12:04:31.102243+00:00/'
        res = json.loads(self.client.get(self.script_name, bsp).content)
        self.assertEqual(res['atom:feed']['os:totalResults'], '1')

    @classmethod
    def tearDownClass(cls):
        # Workaround for https://code.djangoproject.com/ticket/22414
        # Persistent connections not closed by LiveServerTestCase, preventing dropping test databases
        # https://github.com/cjerdonek/django/commit/b07fbca02688a0f8eb159f0dde132e7498aa40cc
        def close_sessions(conn):
            close_sessions_query = """
                SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE
                    datname = current_database() AND
                    pid <> pg_backend_pid();
            """
            with conn.cursor() as cursor:
                try:
                    cursor.execute(close_sessions_query)
                except OperationalError:
                    # We get kicked out after closing.
                    pass

        for alias in connections:
            connections[alias].close()
            close_sessions(connections[alias])

        print "Forcefully closed database connections."
Пример #56
0
def test_csw_geoserver():
    """Smoke-test the CSW endpoint: a plain GetRecords query for
    csw:Record must match, return, and page at least one record."""
    catalogue = CatalogueServiceWeb(SERVICE_URL)
    catalogue.getrecords2(typenames='csw:Record')
    results = catalogue.results
    assert results.get('returned') > 0
    assert results.get('nextrecord') > 0
    assert results.get('matches') > 0
Пример #57
0
def create_services_from_endpoint(url):
    """
    Generate service/services from an endpoint.
    WMS, WMTS, TMS endpoints correspond to a single service.
    ESRI, CSW endpoints correspond to many services.

    Returns a (created, message) tuple: whether a service type was detected
    and a status string (or a traceback list when the endpoint is unreachable).
    """
    num_created = 0
    endpoint = get_sanitized_endpoint(url)
    try:
        urllib2.urlopen(endpoint, timeout=10)
    except Exception as e:
        print 'ERROR! Cannot open this endpoint: %s' % endpoint
        message = traceback.format_exception(*sys.exc_info())
        return False, message

    detected = False

    # handle specific service types for some domains (WorldMap, Wrapper...)
    parsed_uri = urlparse(endpoint)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    if domain == 'http://worldmap.harvard.edu/':
        service_type = 'Hypermap:WorldMap'
        title = 'Harvard WorldMap'
        abstract = 'Harvard WorldMap'
        endpoint = domain
        detected = True
    if domain in [
        'http://maps.nypl.org/',
        'http://mapwarper.net/',
        'http://warp.worldmap.harvard.edu/',
    ]:
        service_type = 'Hypermap:WARPER'
        title = 'Warper at %s' % domain
        abstract = 'Warper at %s' % domain
        detected = True

    # test if it is CSW, WMS, TMS, WMTS or Esri
    # CSW
    try:
        csw = CatalogueServiceWeb(endpoint)
        service_type = 'OGC:CSW'
        service_links = {}
        detected = True

        typenames = 'csw:Record'
        outputschema = 'http://www.opengis.net/cat/csw/2.0.2'

        if 'csw_harvest_pagesize' in settings.PYCSW['manager']:
            pagesize = int(settings.PYCSW['manager']['csw_harvest_pagesize'])
        else:
            pagesize = 10

        print 'Harvesting CSW %s' % endpoint
        # now get all records
        # get total number of records to loop against
        try:
            csw.getrecords2(typenames=typenames, resulttype='hits',
                            outputschema=outputschema)
            matches = csw.results['matches']
        except Exception:  # this is a CSW, but server rejects query
            # narrowed from a bare except so SystemExit/KeyboardInterrupt
            # are not swallowed
            raise RuntimeError(csw.response)

        if pagesize > matches:
            pagesize = matches

        print 'Harvesting %d CSW records' % matches

        # loop over all catalogue records incrementally
        for r in range(1, matches+1, pagesize):
            try:
                csw.getrecords2(typenames=typenames, startposition=r,
                                maxrecords=pagesize, outputschema=outputschema, esn='full')
            except Exception as err:  # this is a CSW, but server rejects query
                raise RuntimeError(csw.response)
            for k, v in csw.records.items():
                # try to parse metadata
                try:
                    LOGGER.info('Looking for service links')
                    if v.references:  # not empty
                        for ref in v.references:
                            if ref['scheme'] in [st[0] for st in SERVICE_TYPES]:
                                if ref['url'] not in service_links:
                                    service_links[ref['url']] = ref['scheme']
                except Exception as err:  # parsing failed for some reason
                    LOGGER.warning('Metadata parsing failed %s', err)

        LOGGER.info('Service links found: %s', service_links)
        for k, v in service_links.items():
            try:
                service = create_service_from_endpoint(k, v)
                if service is not None:
                    num_created = num_created + 1
            except Exception as err:
                raise RuntimeError('HHypermap error: %s' % err)

    except Exception as e:
        print str(e)

    # WMS
    if not detected:
        try:
            service = WebMapService(endpoint, timeout=10)
            service_type = 'OGC:WMS'
            # BUG FIX: a stray trailing comma made title a 1-tuple
            # ('<title>',); keep it a plain string.
            title = service.identification.title
            abstract = service.identification.abstract
            detected = True
        except Exception as e:
            print str(e)

    # TMS
    if not detected:
        try:
            service = TileMapService(endpoint, timeout=10)
            service_type = 'OSGeo:TMS'
            title = service.identification.title  # trailing comma removed (tuple bug)
            abstract = service.identification.abstract
            detected = True
        except Exception as e:
            print str(e)

    # WMTS
    if not detected:
        try:
            # @tomkralidis timeout is not implemented for WebMapTileService?
            service = WebMapTileService(endpoint)
            service_type = 'OGC:WMTS'
            title = service.identification.title  # trailing comma removed (tuple bug)
            abstract = service.identification.abstract
            detected = True
        except Exception as e:
            print str(e)

    # if detected, let's create the service
    if detected and service_type != 'OGC:CSW':
        try:
            service = create_service_from_endpoint(
                endpoint,
                service_type,
                title,
                abstract=abstract
            )
            if service is not None:
                num_created = num_created + 1
        except Exception as e:
            print str(e)

    # Esri
    # a good sample is here: https://gis.ngdc.noaa.gov/arcgis/rest/services

    # we can safely assume the following condition (at least it is true for 1170 services)
    # we need to test this as ArcFolder can freeze with not esri url such as this one:
    # http://hh.worldmap.harvard.edu/admin/aggregator/service/?q=%2Frest%2Fservices
    if '/rest/services' in endpoint:
        if not detected:
            try:
                esri = ArcFolder(endpoint)
                services = esri.services

                service_type = 'ESRI'
                detected = True

                # root
                root_services = process_esri_services(services)
                num_created = num_created + len(root_services)

                # folders
                for folder in esri.folders:
                    folder_services = process_esri_services(folder.services)
                    num_created = num_created + len(folder_services)

            except Exception as e:
                print str(e)

    if detected:
        return True, '%s service/s created' % num_created
    else:
        return False, 'ERROR! Could not detect service type for endpoint %s or already existing' % endpoint
Пример #58
0
class CSWRepository(HarvestRepository):
    """Harvest repository backed by an OGC Catalogue Service for the Web (CSW).

    Crawls record identifiers with paged GetRecords requests and fetches
    individual metadata records with GetRecordById.
    """

    def setRepoParams(self, repoParams):
        """Apply repo parameters and construct the CSW client from self.url.

        If the endpoint cannot be reached or parsed, self.cswrepo is left as
        None and _crawl()/_update_record() become no-ops.
        """
        self.metadataprefix = "csw"
        super(CSWRepository, self).setRepoParams(repoParams)
        try:
            self.cswrepo = CatalogueServiceWeb(self.url)
        except Exception:
            # Narrowed from a bare except: connection/parse failures just
            # disable this repo instead of also swallowing KeyboardInterrupt.
            self.cswrepo = None
        self.domain_metadata = []

    def _crawl(self):
        """Page through all CSW records and write a header row for each."""
        kwargs = {
            "repo_id": self.repository_id, "repo_url": self.url, "repo_set": self.set, "repo_name": self.name,
            "repo_type": "csw",
            "enabled": self.enabled, "repo_thumbnail": self.thumbnail, "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days, "homepage_url": self.homepage_url
        }
        self.repository_id = self.db.update_repo(**kwargs)

        if self.cswrepo is None:
            self.logger.error("Could not initiate this repo to crawl it")
            return

        item_count = 0
        while True:
            try:
                # Continue from the previous page's 'nextrecord' pointer.
                self.cswrepo.getrecords2(startposition=self.cswrepo.results['nextrecord'])
            except (AttributeError, KeyError):
                # First pass: .results does not exist yet, so issue the
                # initial, unpositioned request. (Was a bare except.)
                self.cswrepo.getrecords2()

            for rec in self.cswrepo.records:
                self.db.write_header(self.cswrepo.records[rec].identifier, self.repository_id)
                item_count = item_count + 1
                if item_count % self.update_log_after_numitems == 0:
                    # +0.1 guards against division by zero on a fast start.
                    tdelta = time.time() - self.tstart + 0.1
                    self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(
                        item_count, self.formatter.humanize(tdelta), item_count / tdelta))
            if item_count == self.cswrepo.results['matches']:
                break

        self.logger.info("Found {} items in feed".format(item_count))

    def format_csw_to_oai(self, csw_record, local_identifier):
        """Map an owslib CSW record onto the harvester's OAI-style dict."""
        record = {
            "title": csw_record.title,
            "description": csw_record.abstract,
            "tags": csw_record.subjects,
            "identifier": local_identifier,
            "creator": self.name,
            "contact": self.contact,
            "series": "",
        }
        return record

    def _rate_limited(max_per_second):
        """Decorator factory: throttle the wrapped function to at most
        max_per_second calls, sleeping off any excess; guarded by a lock."""
        import threading
        lock = threading.Lock()
        min_interval = 1.0 / float(max_per_second)

        def decorate(func):
            last_time_called = [0.0]

            @wraps(func)
            def rate_limited_function(*args, **kwargs):
                with lock:
                    # time.clock() was removed in Python 3.8;
                    # perf_counter() is its documented replacement for
                    # interval timing.
                    elapsed = time.perf_counter() - last_time_called[0]
                    left_to_wait = min_interval - elapsed
                    if left_to_wait > 0:
                        time.sleep(left_to_wait)

                ret = func(*args, **kwargs)
                last_time_called[0] = time.perf_counter()
                return ret

            return rate_limited_function

        return decorate

    @_rate_limited(5)
    def _update_record(self, record):
        """Fetch one record by id and write it; delete it if it is gone.

        Returns True on either outcome, or None when no CSW client exists.
        """
        if self.cswrepo is None:
            return

        self.cswrepo.getrecordbyid(id=[record['local_identifier']])
        if not self.cswrepo.records:
            # No result for this id: the record was deleted upstream.
            self.db.delete_record(record)
            return True

        csw_record = self.cswrepo.records[record['local_identifier']]
        oai_record = self.format_csw_to_oai(csw_record, record['local_identifier'])
        # We have to request a second schema to get valid dates, no idea if issue is Hakai-specific
        self.cswrepo.getrecordbyid(id=[record['local_identifier']], outputschema="http://www.isotc211.org/2005/gmd")
        oai_record["pub_date"] = self.cswrepo.records[record['local_identifier']].datestamp
        # Strip a trailing time-of-day component such as "T12:34:56.789Z"
        # (raw string: "\." in a plain literal is an invalid escape).
        oai_record["pub_date"] = re.sub(r"[T ][0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.?[0-9]*[Z]?$", "",
                                        oai_record["pub_date"])
        if oai_record:
            self.db.write_record(oai_record, self.repository_id, self.metadataprefix.lower(), self.domain_metadata)
        return True
Пример #59
0
    print(csw.results['values'])
except:
    print('GetDomain not supported')


# In[5]:

# Full-text CSW search: find records whose AnyText matches *val*, then
# inspect a random hit. Print forms below are valid on both Python 2 and 3
# (this cell previously mixed py3 print() calls with py2 print statements,
# so it could not run on either interpreter).
val = 'laterite'
filter1 = fes.PropertyIsLike(propertyname='apiso:AnyText', literal=('*%s*' % val),
                             escapeChar='\\', wildCard='*', singleChar='?')
filter_list = [filter1]


# In[6]:

csw.getrecords2(constraints=filter_list, maxrecords=100, esn='full')
# len(csw.records) avoids materializing the key list just to count it.
print(len(csw.records))
for rec in list(csw.records.keys()):
    print(csw.records[rec].title)


# In[7]:

# Pick one matching record at random and show its access links.
choice = np.random.choice(list(csw.records.keys()))
print(csw.records[choice].title)
csw.records[choice].references


# In[8]:
Пример #60
0
jd_stop = dt.datetime.strptime(stop_date, "%Y-%m-%d %H:%M")

# %-formatting keeps the output identical on Python 2 and 3; the original
# multi-argument print STATEMENT was Python-2-only syntax.
print("%s to %s" % (start_date, stop_date))

# <codecell>

# Build the query pieces: a date-range pair, a free-text match on *val*,
# and a bounding box in CRS84 (lon/lat).
start, stop = dateRange(start_date, stop_date)
filter1 = fes.PropertyIsLike(
    propertyname="apiso:AnyText", literal=("*%s*" % val), escapeChar="\\", wildCard="*", singleChar="?"
)
bbox = fes.BBox(box, crs="urn:ogc:def:crs:OGC:1.3:CRS84")

# <codecell>

# Count matches constrained by bbox AND text only.
filter_list = [fes.And([bbox, filter1])]
csw.getrecords2(constraints=filter_list)
csw.results["matches"]

# <codecell>

# Count matches with the time window added.
filter_list = [fes.And([bbox, filter1, start, stop])]
csw.getrecords2(constraints=filter_list)
csw.results["matches"]

# <codecell>

startposition = 0
maxrecords = 20
while True:

    print "getting records %d to %d" % (startposition, startposition + pagesize)