def parse_item(self):
    '''
    parse just the dc element (like oai_dc:dc) so if you're pulling
    this from an oai-pmh service, etc, make sure that it's *not*
    the full document
    '''
    # TODO: this is not correct for the overall thing
    if self.elem is None:
        return {}

    title = extract_item(self.elem, ['title'])
    creator = extract_item(self.elem, ['creator'])
    subjects = extract_items(self.elem, ['subject'])
    description = extract_item(self.elem, ['description'])
    date = extract_item(self.elem, ['date'])
    language = extract_item(self.elem, ['language'])
    publisher = extract_item(self.elem, ['publisher'])
    sources = extract_items(self.elem, ['source'])
    types = extract_items(self.elem, ['type'])

    return tidy_dict({
        'title': title,
        'creator': creator,
        'subjects': subjects,
        'abstract': description,
        'language': language,
        'date': date,
        'publisher': publisher,
        'types': types,
        'sources': sources
    })
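# The docstring above warns that parse_item expects the oai_dc:dc element
# itself, not the full OAI-PMH response. A minimal sketch of isolating that
# element with lxml before handing it to the parser; the helper name and the
# raw_response variable are hypothetical and not part of this module.
from lxml import etree

OAI_DC_NS = 'http://www.openarchives.org/OAI/2.0/oai_dc/'

def _dc_elem_from_oai_record(raw_response):
    # pull the oai_dc:dc element out of a GetRecord/ListRecords response
    tree = etree.fromstring(raw_response)
    return tree.find('.//{%s}dc' % OAI_DC_NS)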
def _parse_keywords(self, elem):
    ''' for each descriptiveKeywords block
    in an identification block
    '''
    keywords = []

    for key_elem in extract_elems(elem, ['descriptiveKeywords']):
        # TODO: split these up (if *-delimited in some way)
        terms = extract_items(
            key_elem, ['MD_Keywords', 'keyword', 'CharacterString'])
        key_type = extract_attrib(
            key_elem,
            ['MD_Keywords', 'type', 'MD_KeywordTypeCode', '@codeListValue'])
        thesaurus = extract_item(
            key_elem,
            ['MD_Keywords', 'thesaurusName', 'CI_Citation',
             'title', 'CharacterString'])

        if terms:
            keywords.append(
                tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "dc:partOf": thesaurus,
                    "bcube:hasType": key_type,
                    "bcube:hasValue": terms
                })
            )

    # TODO: add the Anchor element handling
    # ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

    # add a generic set for the iso topic category
    isotopics = extract_items(elem, ['topicCategory', 'MD_TopicCategoryCode'])
    if isotopics:
        keywords.append(
            tidy_dict({
                "object_id": generate_uuid_urn(),
                "dc:partOf": 'IsoTopicCategories',
                "bcube:hasValue": isotopics
            })
        )

    return keywords
def parse_item(self):
    identifier = extract_item(self.elem, ['Entry_ID'])
    title = extract_item(self.elem, ['Entry_Title'])
    keywords = extract_items(self.elem, ['Keyword'])
    keywords += extract_items(self.elem, ['ISO_Topic_Category'])
    abstract = extract_item(self.elem, ['Summary'])
    organization = extract_item(self.elem, ['Originating_Center'])

    # temporal extent
    start_date = extract_item(self.elem, ['Temporal_Coverage', 'Start_Date'])
    end_date = extract_item(self.elem, ['Temporal_Coverage', 'End_Date'])
    temporal = [start_date, end_date] if start_date and end_date else []

    # spatial extent
    west = extract_item(self.elem, ['Spatial_Coverage', 'Westernmost_Longitude'])
    east = extract_item(self.elem, ['Spatial_Coverage', 'Easternmost_Longitude'])
    south = extract_item(self.elem, ['Spatial_Coverage', 'Southernmost_Latitude'])
    north = extract_item(self.elem, ['Spatial_Coverage', 'Northernmost_Latitude'])
    bbox = ([west, south, east, north]
            if west and east and north and south else [])
    # only convert when all four corners were found
    if bbox:
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)

    distributions = []
    for related_url in extract_elems(self.elem, ['Related_URL']):
        url = extract_item(related_url, ['URL'])
        content_type = extract_item(related_url, ['URL_Content_Type', 'Type'])
        description = extract_item(related_url, ['Description'])
        dist = tidy_dict({
            "url": url,
            "description": description,
            "content_type": content_type
        })
        if dist:
            distributions.append(dist)

    return tidy_dict({
        "id": identifier,
        "title": title,
        "keywords": keywords,
        "abstract": abstract,
        "organization": organization,
        "bbox": bbox,
        "temporal": temporal,
        "distributions": distributions
    })
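# For reference, a standalone sketch of the bbox handling above, assuming
# bcube's bbox_to_geom/to_wkt amount to building a box geometry from a
# [west, south, east, north] list and serializing it as WKT (done here via
# shapely); the real helpers in this codebase may behave differently.
from shapely.geometry import box

def _bbox_wkt(west, south, east, north):
    # [west, south, east, north] -> 'POLYGON ((...))' WKT string
    return box(float(west), float(south), float(east), float(north)).wkt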
def return_service_descriptors(self):
    '''
    basic service information

    title
    abstract

    note: what to do about keywords (thesaurus + list + type)?
    keywords
    '''
    service_elements = {}
    for k, v in self._service_descriptors.iteritems():
        # v can be a list of possible xpaths where we want
        # to keep all returned values from any xpath within
        items = []
        xpaths = v if isinstance(v, list) else [v]
        for xp in xpaths:
            if '@' in xp[-1]:
                items += extract_attribs(self.parser.xml, xp)
            else:
                items += extract_items(self.parser.xml, xp)
        service_elements[k] = items

    endpoints = self.parse_endpoints()
    if endpoints:
        service_elements['endpoints'] = endpoints

    return service_elements
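# return_service_descriptors iterates self._service_descriptors, where each
# value is treated as a list of candidate tag paths, each path itself a list
# of tag names whose final segment may be an '@attribute'. A hypothetical
# mapping for illustration only; the actual descriptor table is defined
# elsewhere in the codebase.
_service_descriptors = {
    'title': [['Service', 'Title']],
    'abstract': [['Service', 'Abstract']],
    'version': [['Service', '@version']],
}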
def _parse_contact(self, elem):
    ''' parse any CI_Contact '''
    contact = {}

    if elem is None:
        return contact

    contact['phone'] = extract_item(
        elem, ['phone', 'CI_Telephone', 'voice', 'CharacterString'])
    contact['addresses'] = extract_items(
        elem, ['address', 'CI_Address', 'deliveryPoint', 'CharacterString'])
    contact['city'] = extract_item(
        elem, ['address', 'CI_Address', 'city', 'CharacterString'])
    contact['state'] = extract_item(
        elem, ['address', 'CI_Address', 'administrativeArea', 'CharacterString'])
    contact['postal'] = extract_item(
        elem, ['address', 'CI_Address', 'postalCode', 'CharacterString'])
    contact['country'] = extract_item(
        elem, ['address', 'CI_Address', 'country', 'CharacterString'])
    contact['email'] = extract_item(
        elem, ['address', 'CI_Address', 'electronicMailAddress', 'CharacterString'])

    return tidy_dict(contact)
def _parse_service(self):
    output = {}

    service = {
        "object_id": generate_uuid_urn(),
        "dcterms:title": ' '.join(
            extract_items(self.parser.xml, ["Identify", "repositoryName"])),
        "rdf:type": "OAI-PMH",
        "relationships": [],
        "urls": []
    }

    url_id = generate_uuid_urn()
    dist = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": url_id,
        "dc:identifier": generate_sha_urn(self.url)
    })
    service['urls'] = [dist]
    service['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": url_id
    })

    # output['version'] = extract_items(
    #     self.parser.xml, ["Identify", "protocolVersion"])
    # output['endpoints'] = [{'url': e} for e
    #     in extract_items(self.parser.xml, ["Identify", "baseURL"])]

    output['services'] = [service]

    return tidy_dict(output)
def _parse_child(self, child):
    item = {}
    item["title"] = extract_item(child, ["title"])
    item["language"] = extract_item(child, ["language"])
    item["author"] = extract_item(child, ["author"])
    # TODO: go sort out what this is:
    # http://purl.org/rss/1.0/modules/content/
    item["encoded"] = extract_item(child, ["encoded"])
    item["id"] = extract_item(child, ["guid"])
    item["creator"] = extract_item(child, ["creator"])
    item["subjects"] = extract_items(child, ["category"])
    item["published"] = extract_item(child, ["pubDate"])
    item["timestamp"] = extract_item(child, ["date"])
    item["links"] = extract_items(child, ["link"])
    item["links"] += extract_items(child, ["docs"])

    return tidy_dict(item)
def _parse_item(self, elem):
    item = {}
    item['title'] = extract_item(elem, ['title'])
    item['language'] = extract_item(elem, ['language'])
    item['author'] = extract_item(elem, ['author'])
    # TODO: go sort out what this is:
    # http://purl.org/rss/1.0/modules/content/
    item['encoded'] = extract_item(elem, ['encoded'])
    item['id'] = extract_item(elem, ['guid'])
    item['creator'] = extract_item(elem, ['creator'])
    item['subjects'] = extract_items(elem, ['category'])
    item['published'] = extract_item(elem, ['pubDate'])
    item['timestamp'] = extract_item(elem, ['date'])
    item['links'] = extract_items(elem, ['link'])
    item['links'] += extract_items(elem, ['docs'])

    return tidy_dict(item)
def parse_item(self):
    output = {}
    urls = set()

    catalog_object_id = generate_uuid_urn()
    output['catalog_record'] = {
        "object_id": catalog_object_id,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        # "dc:conformsTo": extract_attrib(
        #     self.elem, ['@noNamespaceSchemaLocation']).split(),
        "rdf:type": "FGDC:CSDGM",
        "relationships": [],
        "urls": []
    }
    output['urls'] = []

    # add the harvest info
    # this is not necessary as a sha just for set inclusion
    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    output['catalog_record']['urls'].append(original_url)
    # NOTE: this is not the sha from the url
    output['catalog_record']['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
    dataset_object_id = generate_uuid_urn()

    dataset = {
        "object_id": dataset_object_id,
        "dcterms:identifier": datsetid,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "dc:description": extract_item(
            self.elem, ['idinfo', 'descript', 'abstract']),
        "dcterms:title": extract_item(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
        "urls": [],
        "relationships": []
    }

    bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
    if bbox_elem is not None:
        # that's not even valid
        west = extract_item(bbox_elem, ['westbc'])
        east = extract_item(bbox_elem, ['eastbc'])
        north = extract_item(bbox_elem, ['northbc'])
        south = extract_item(bbox_elem, ['southbc'])
        bbox = [west, south, east, north]
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)
        dataset.update({
            "dc:spatial": bbox,
            "esip:westBound": west,
            "esip:eastBound": east,
            "esip:northBound": north,
            "esip:southBound": south
        })

    time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
    if time_elem is not None:
        caldate = extract_item(time_elem, ['sngdate', 'caldate'])
        if caldate:
            # TODO: we should see if it's at least a valid date
            dataset['esip:startDate'] = self._convert_date(caldate)

        rngdate = extract_elem(time_elem, ['rngdates'])
        if rngdate is not None:
            dataset['esip:startDate'] = self._convert_date(
                extract_item(rngdate, ['begdate']))
            dataset['esip:endDate'] = self._convert_date(
                extract_item(rngdate, ['enddate']))
        # TODO: add the min/max of the list of dates

    dataset['relationships'] = [{
        "relate": "bcube:hasMetadataRecord",
        "object_id": catalog_object_id
    }]

    publisher = {
        "object_id": generate_uuid_urn(),
        "name": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
        "location": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
    }
    output['publisher'] = publisher
    dataset['relationships'].append({
        "relate": "dcterms:publisher",
        "object_id": publisher['object_id']
    })

    distrib_elems = extract_elems(
        self.elem, ['distinfo', 'stdorder', 'digform'])
    for distrib_elem in distrib_elems:
        link = extract_item(
            distrib_elem,
            ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
        # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
        if not link:
            continue
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            # this is a distribution link so
            # we are assuming it is to data
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": url_id
            })

    webpages = []
    onlink_elems = extract_elems(
        self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
    for onlink_elem in onlink_elems:
        link = onlink_elem.text.strip() if onlink_elem.text else ''
        if not link:
            continue
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            webpages.append({
                "object_id": generate_uuid_urn(),
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })

    output['catalog_record']['webpages'] = webpages
    for webpage in webpages:
        dataset['relationships'].append({
            "relate": "dcterms:references",
            "object_id": webpage['object_id']
        })

    # retain the keyword sets with type, thesaurus name and split
    # the terms as best we can
    keywords = []
    key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
    if key_elem is not None:
        for child in key_elem.iterchildren():
            key_type = extract_element_tag(child.tag)
            key_tag = 'strat' if key_type == 'stratum' else key_type
            key_tag = 'temp' if key_tag == 'temporal' else key_tag
            thesaurus = extract_item(child, ['%skt' % key_tag])

            # TODO: split these up
            terms = extract_items(child, ['%skey' % key_tag])

            if terms:
                # if there's a parsing error (bad cdata, etc) may not have
                # TODO: add something for a set without a thesaurus name
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    })
                )

    output['keywords'] = keywords
    for keyword in keywords:
        dataset['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": keyword['object_id']
        })

    output['datasets'] = [dataset]

    # add the metadata relate
    output['catalog_record']['relationships'].append({
        "relate": "foaf:primaryTopic",
        "object_id": dataset_object_id
    })

    output['catalog_records'] = [output['catalog_record']]
    del output['catalog_record']

    self.description = tidy_dict(output)
def _parse_service(self):
    output = {}
    urls = set()

    service = {
        "object_id": generate_uuid_urn(),
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "rdf:type": 'OpenSearch1.1:Description',
        "dcterms:title": extract_item(self.parser.xml, ["ShortName"]),
        "dc:description": ' '.join(
            extract_items(self.parser.xml, ["LongName"]) +
            extract_items(self.parser.xml, ["Description"])),
        "urls": [],
        "webpages": [],
        "relationships": []
    }

    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    service['urls'].append(original_url)
    service['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    # output['source'] = extract_items(self.parser.xml, ["Attribution"])
    # output['contact'] = extract_items(self.parser.xml, ["Developer"])
    # output['rights'] = extract_items(self.parser.xml, ["SyndicationRight"])

    key_id = generate_uuid_urn()
    output['keywords'] = [{
        "object_id": key_id,
        "bcube:hasValue": extract_items(self.parser.xml, ["Tags"])
    }]
    service['relationships'].append({
        "relate": "dc:conformsTo",
        "object_id": key_id
    })

    for t in extract_elems(self.parser.xml, ['Url']):
        ep = self._parse_endpoint(t)
        url_sha = generate_sha_urn(ep['url'])
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Generated",
                "bcube:hasConfidence": "Not Sure",
                "vcard:hasURL": ep['url'],
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            service['urls'].append(dist)

            wb_id = generate_uuid_urn()
            service['webpages'].append({
                "object_id": wb_id,
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })
            service['relationships'].append({
                "relate": "dcterms:references",
                "object_id": wb_id
            })

    output['services'] = [service]

    return tidy_dict(output)