def _parse_results_set_info(self):
    result_elem = extract_elem(self.parser.xml, ['SearchResults'])
    self.total = extract_attrib(result_elem, ['@numberOfRecordsMatched'])
    self.subtotal = extract_attrib(result_elem, ['@numberOfRecordsReturned'])
    self.schema = extract_attrib(result_elem, ['@recordSchema'])
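# For reference, a hedged sketch of the element _parse_results_set_info reads:
# a CSW 2.0.2 GetRecords response carries its paging counts as attributes on
# csw:SearchResults (attribute names per the CSW spec; values illustrative):
#
#   <csw:SearchResults numberOfRecordsMatched="1021"
#                      numberOfRecordsReturned="10"
#                      nextRecord="11"
#                      recordSchema="http://www.opengis.net/cat/csw/2.0.2">
#       ...
#   </csw:SearchResults>
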
def _parse_children(self, dialect):
    children = []
    result_elem = extract_elem(self.parser.xml, ['SearchResults'])
    for child in result_elem.iterchildren():
        item = self._parse_child(child, dialect)
        if item:
            children.append(item)
    return children
def _handle_polygon(self, polygon_elem):
    elem = extract_elem(polygon_elem, ['polygon', 'Polygon'])
    srs_name = elem.attrib.get('srsName', 'EPSG:4326')

    geom = gml_to_geom(elem)
    if srs_name != '':
        geom = reproject(geom, srs_name, 'EPSG:4326')

    # TODO: generate the envelope?
    return {"dc:spatial": to_wkt(geom)}
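# A minimal sketch of the input/output this handler assumes (the gml:Polygon
# fragment and the return value are illustrative only; gml_to_geom, reproject
# and to_wkt are the project's own helpers):
#
#   <gml:Polygon srsName="EPSG:4326">
#     <gml:exterior>
#       <gml:LinearRing>
#         <gml:posList>-10 40 -10 50 10 50 10 40 -10 40</gml:posList>
#       </gml:LinearRing>
#     </gml:exterior>
#   </gml:Polygon>
#
#   self._handle_polygon(polygon_elem)
#   # -> {"dc:spatial": "<WKT polygon in EPSG:4326>"}
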
def _parse_child(self, child, dialect):
    identifier = extract_item(child, ['header', 'identifier'])
    timestamp = extract_item(child, ['header', 'datestamp'])

    if dialect == 'oai_dc':
        dc_elem = extract_elem(child, ['metadata', 'dc'])
        dc_parser = DcItemReader(dc_elem)
        return dict(
            chain(
                {"identifier": identifier, "timestamp": timestamp}.items(),
                dc_parser.parse_item().items()
            )
        )
def _parse_responsibleparty(self, elem):
    ''' parse any CI_ResponsibleParty '''
    individual_name = extract_item(
        elem, ['individualName', 'CharacterString'])
    organization_name = extract_item(
        elem, ['organisationName', 'CharacterString'])
    position_name = extract_item(
        elem, ['positionName', 'CharacterString'])

    e = extract_elem(elem, ['contactInfo', 'CI_Contact'])
    contact = self._parse_contact(e)

    return tidy_dict({
        "individual": individual_name,
        "organization": organization_name,
        "position": position_name,
        "contact": contact
    })
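# For context, the ISO 19139 fragment _parse_responsibleparty walks looks
# roughly like this (namespace prefixes are handled by the extract_* helpers;
# all values are illustrative):
#
#   <gmd:CI_ResponsibleParty>
#     <gmd:individualName><gco:CharacterString>Jane Doe</gco:CharacterString></gmd:individualName>
#     <gmd:organisationName><gco:CharacterString>Example Data Center</gco:CharacterString></gmd:organisationName>
#     <gmd:positionName><gco:CharacterString>Data Manager</gco:CharacterString></gmd:positionName>
#     <gmd:contactInfo>
#       <gmd:CI_Contact>...</gmd:CI_Contact>
#     </gmd:contactInfo>
#   </gmd:CI_ResponsibleParty>
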
def parse(self):
    # get the series
    self.description = {}

    md = extract_elem(self.elem, ['seriesMetadata', 'MD_Metadata'])
    if md is None:
        return

    md_parser = MxParser(md)
    md_parser.parse()
    self.description = md_parser.description
    self.description['children'] = []

    # get the children
    children = extract_elems(
        self.elem, ['composedOf', 'DS_DataSet', 'has', 'MD_Metadata'])
    for child in children:
        child_parser = MxParser(child)
        child_parser.parse()
        if child_parser.description:
            self.description['children'].append(child_parser.description)

    self.description = tidy_dict(self.description)
def _parse_extent(self, elem):
    ''' handle the spatial and/or temporal extent
    starting from the *:extent element '''
    extents = {}

    geo_elem = extract_elem(
        elem, ['extent', 'EX_Extent', 'geographicElement'])
    if geo_elem is not None:
        # we need to sort out what kind of thing it is:
        # bbox, polygon, or list of points
        bbox_elem = extract_elem(geo_elem, ['EX_GeographicBoundingBox'])
        if bbox_elem is not None:
            extents.update(self._handle_bbox(bbox_elem))

        # NOTE: this will overwrite the bbox values above
        poly_elem = extract_elem(geo_elem, ['EX_BoundingPolygon'])
        if poly_elem is not None:
            extents.update(self._handle_polygon(poly_elem))

    time_elem = extract_elem(
        elem,
        [
            'extent',
            'EX_Extent',
            'temporalElement',
            'EX_TemporalExtent',
            'extent',
            'TimePeriod'
        ]
    )
    if time_elem is not None:
        begin_position = extract_elem(time_elem, ['beginPosition'])
        end_position = extract_elem(time_elem, ['endPosition'])

        # only emit dates for concrete, parseable positions; skip missing or
        # indeterminate positions so we never call isoformat() on a raw element
        if begin_position is not None and \
                'indeterminatePosition' not in begin_position.attrib:
            extents['esip:startDate'] = self._parse_timestamp(
                begin_position.text).isoformat()
        if end_position is not None and \
                'indeterminatePosition' not in end_position.attrib:
            extents['esip:endDate'] = self._parse_timestamp(
                end_position.text).isoformat()

    return extents
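# A hedged sketch of what _parse_extent is expected to return when both a
# bounding polygon and a TimePeriod are present (any keys contributed by
# _handle_bbox are not shown; values illustrative):
#
#   {
#       "dc:spatial": "POLYGON ((...))",
#       "esip:startDate": "2010-01-01T00:00:00",
#       "esip:endDate": "2010-12-31T00:00:00"
#   }
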
def parse(self):
    '''
    from the root node, parse:
        identification (title, abstract, point of contact, keywords, extent)
        if identificationInfo contains SV_ServiceIdentification, add as child
        distribution info
    '''
    # set up the url set
    urls = set()
    urls.add(self.output['catalog_record']['urls'][0]['object_id'])

    for id_elem in extract_elems(
            self.elem,
            ['//*', 'identificationInfo', 'MD_DataIdentification']):
        dataset, keywords = self._parse_identification_info(id_elem)
        dataset['relationships'].append({
            "relate": "bcube:hasMetadataRecord",
            "object_id": self.output['catalog_record']['object_id']
        })
        dataset.update({
            "bcube:dateCreated":
                self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated":
                self.harvest_details.get('harvest_date', '')
        })
        self.output['catalog_record']['relationships'].append({
            "relate": "foaf:primaryTopic",
            "object_id": dataset['object_id']
        })

        # point of contact from the root node and this might be an issue
        # in things like the -1/-3 from ngdc so try for an idinfo blob
        poc_elem = extract_elem(id_elem, [
            'identificationInfo',
            'MD_DataIdentification',
            'pointOfContact',
            'CI_ResponsibleParty'])
        # if poc_elem is None:
        #     # and if that fails try for the root-level contact
        #     poc_elem = extract_elem(
        #         self.elem, ['contact', 'CI_ResponsibleParty'])

        # TODO: point of contact is not necessarily the publisher
        if poc_elem is not None:
            poc = self._parse_responsibleparty(poc_elem)
            location = (
                ' '.join([
                    poc['contact'].get('city', ''),
                    poc['contact'].get('country', '')
                ])
            ).strip() if poc.get('contact', {}) else ''
            # keep a handle on the publisher so the relationship below
            # points at the record we just appended
            publisher = tidy_dict({
                "object_id": generate_uuid_urn(),
                "name": poc.get('organization', ''),
                "location": location
            })
            self.output['publishers'].append(publisher)
            dataset['relationships'].append({
                "relate": "dcterms:publisher",
                "object_id": publisher['object_id']
            })

        dataset['urls'] = []
        dist_elems = extract_elems(self.elem, ['distributionInfo'])
        for dist_elem in dist_elems:
            for d in self._parse_distribution(dist_elem):
                if not d:
                    continue
                url_sha = generate_sha_urn(d)
                if url_sha not in urls:
                    urls.add(url_sha)
                    url_id = generate_uuid_urn()
                    dist = self._generate_harvest_manifest(**{
                        "bcube:hasUrlSource": "Harvested",
                        "bcube:hasConfidence": "Good",
                        "vcard:hasURL": d,
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                    dataset['urls'].append(dist)
                    dataset['relationships'].append({
                        "relate": "dcterms:references",
                        "object_id": url_id
                    })

        self.output['datasets'].append(dataset)
        self.output['keywords'] += keywords

    # TODO: removing this until we have a definition for SERVICE
    # # check for the service elements
    # service_elems = extract_elems(
    #     self.elem, ['identificationInfo', 'SV_ServiceIdentification'])
    # self.description['services'] = []
    # for service_elem in service_elems:
    #     sv = SrvParser(service_elem)
    #     self.description['services'].append(sv.parse())

    # switch the catalog record to a list for conformity. eep.
    self.output['catalog_records'] = [self.output['catalog_record']]
    del self.output['catalog_record']
    self.description = tidy_dict(self.output)
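# After parse() runs, self.description is expected to look roughly like the
# sketch below (an assumption inferred from the keys touched above, not a schema):
#
#   {
#       "catalog_records": [{"object_id": "...", "relationships": [...], "urls": [...]}],
#       "datasets": [{"object_id": "...", "relationships": [...], "urls": [...]}],
#       "publishers": [{"object_id": "...", "name": "...", "location": "..."}],
#       "keywords": [...]
#   }
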
def parse_item(self):
    output = {}
    urls = set()

    catalog_object_id = generate_uuid_urn()
    output['catalog_record'] = {
        "object_id": catalog_object_id,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        # "dc:conformsTo": extract_attrib(
        #     self.elem, ['@noNamespaceSchemaLocation']).split(),
        "rdf:type": "FGDC:CSDGM",
        "relationships": [],
        "urls": []
    }
    output['urls'] = []

    # add the harvest info
    # this is not necessary as a sha just for set inclusion
    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    output['catalog_record']['urls'].append(original_url)
    # NOTE: this is not the sha from the url
    output['catalog_record']['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
    dataset_object_id = generate_uuid_urn()
    dataset = {
        "object_id": dataset_object_id,
        "dcterms:identifier": datsetid,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "dc:description": extract_item(
            self.elem, ['idinfo', 'descript', 'abstract']),
        "dcterms:title": extract_item(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
        "urls": [],
        "relationships": []
    }

    bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
    if bbox_elem is not None:
        # that's not even valid
        west = extract_item(bbox_elem, ['westbc'])
        east = extract_item(bbox_elem, ['eastbc'])
        north = extract_item(bbox_elem, ['northbc'])
        south = extract_item(bbox_elem, ['southbc'])
        bbox = [west, south, east, north]
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)
        dataset.update({
            "dc:spatial": bbox,
            "esip:westBound": west,
            "esip:eastBound": east,
            "esip:northBound": north,
            "esip:southBound": south
        })

    time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
    if time_elem is not None:
        caldate = extract_item(time_elem, ['sngdate', 'caldate'])
        if caldate:
            # TODO: we should see if it's at least a valid date
            dataset['esip:startDate'] = self._convert_date(caldate)

        rngdate = extract_elem(time_elem, ['rngdates'])
        if rngdate is not None:
            dataset['esip:startDate'] = self._convert_date(
                extract_item(rngdate, ['begdate']))
            dataset['esip:endDate'] = self._convert_date(
                extract_item(rngdate, ['enddate']))
        # TODO: add the min/max of the list of dates

    dataset['relationships'] = [{
        "relate": "bcube:hasMetadataRecord",
        "object_id": catalog_object_id
    }]

    publisher = {
        "object_id": generate_uuid_urn(),
        "name": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
        "location": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
    }
    output['publisher'] = publisher
    dataset['relationships'].append({
        "relate": "dcterms:publisher",
        "object_id": publisher['object_id']
    })

    distrib_elems = extract_elems(
        self.elem, ['distinfo', 'stdorder', 'digform'])
    for distrib_elem in distrib_elems:
        link = extract_item(
            distrib_elem,
            ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
        # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
        if not link:
            # skip digform entries without a network resource
            continue
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            # this is a distribution link so
            # we are assuming it is to data
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": url_id
            })

    webpages = []
    onlink_elems = extract_elems(
        self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
    for onlink_elem in onlink_elems:
        link = onlink_elem.text.strip() if onlink_elem.text else ''
        if not link:
            continue
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            webpages.append({
                "object_id": generate_uuid_urn(),
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })

    output['catalog_record']['webpages'] = webpages
    for webpage in webpages:
        dataset['relationships'].append({
            "relate": "dcterms:references",
            "object_id": webpage['object_id']
        })

    # retain the keyword sets with type, thesaurus name and split
    # the terms as best we can
    keywords = []
    key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
    # guard against records that omit the keywords block entirely
    key_children = key_elem.iterchildren() if key_elem is not None else []
    for child in key_children:
        key_type = extract_element_tag(child.tag)
        key_tag = 'strat' if key_type == 'stratum' else key_type
        key_tag = 'temp' if key_tag == 'temporal' else key_tag
        thesaurus = extract_item(child, ['%skt' % key_tag])

        # TODO: split these up
        terms = extract_items(child, ['%skey' % key_tag])

        if terms:
            # if there's a parsing error (bad cdata, etc) may not have
            # TODO: add something for a set without a thesaurus name
            keywords.append(tidy_dict({
                "object_id": generate_uuid_urn(),
                "dc:partOf": thesaurus,
                "bcube:hasType": key_type,
                "bcube:hasValue": terms
            }))
    output['keywords'] = keywords
    for keyword in keywords:
        dataset['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": keyword['object_id']
        })

    output['datasets'] = [dataset]

    # add the metadata relate
    output['catalog_record']['relationships'].append({
        "relate": "foaf:primaryTopic",
        "object_id": dataset_object_id
    })

    output['catalog_records'] = [output['catalog_record']]
    del output['catalog_record']
    self.description = tidy_dict(output)