def parse_results_set_info(self):
    # so if it includes the opensearch namespace,
    # we can get things like total count and page
    # TODO: convert to numbers
    total = extract_item(self.parser.xml, ['feed', 'totalResults'])
    start_index = extract_item(self.parser.xml, ['feed', 'startIndex'])
    per_page = extract_item(self.parser.xml, ['feed', 'itemsPerPage'])
    return total, start_index, per_page

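# The extract_* helpers used throughout this section are not defined
# here. Judging by how they're called (and by generate_localname_xpath
# in the commented-out block further down), they walk an lxml tree by
# local-name so namespace prefixes don't matter. A minimal sketch with
# inferred signatures; the details are assumptions, not the project's
# actual implementation:
def generate_localname_xpath(tags):
    # '//*', '..' and '*' pass through; '@x' targets an attribute;
    # anything else matches on local-name() to stay namespace-agnostic
    return '/'.join(
        t if t in ('//*', '..', '*') or t.startswith('@')
        else "*[local-name()='%s']" % t
        for t in tags
    )

def extract_elems(elem, tags):
    return elem.xpath(generate_localname_xpath(tags))

def extract_elem(elem, tags):
    return next(iter(extract_elems(elem, tags)), None)

def extract_items(elem, tags):
    return [e.text.strip() for e in extract_elems(elem, tags) if e.text]

def extract_item(elem, tags):
    return next(iter(extract_items(elem, tags)), None)

def extract_attrib(elem, tags):
    # the last tag is '@attr'; xpath returns attribute values directly
    return next(iter(elem.xpath(generate_localname_xpath(tags))), None)
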
def _parse_child(self, child, dialect):
    identifier = extract_item(child, ['header', 'identifier'])
    timestamp = extract_item(child, ['header', 'datestamp'])

    if dialect == 'oai_dc':
        dc_elem = extract_elem(child, ['metadata', 'dc'])
        dc_parser = DcItemReader(dc_elem)
        return dict(
            chain(
                {"identifier": identifier, "timestamp": timestamp}.items(),
                dc_parser.parse_item().items()
            )
        )

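# dict(chain(a.items(), b.items())) is the py2-friendly dict merge used
# above: later items win on duplicate keys, and it needs
#     from itertools import chain
# at module scope. For example,
#     dict(chain({'a': 1}.items(), {'a': 2, 'b': 3}.items()))
# evaluates to {'a': 2, 'b': 3}.
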
def _parse_distribution(self, elem):
    ''' from the distributionInfo element '''
    for dist_elem in extract_elems(elem, ['MD_Distribution']):
        # this is going to get ugly.
        # super ugly
        # get the transferoptions block
        # get the url, the name, the description, the size
        # get the format from a parent node
        # but where the transferoptions can be in some nested
        # distributor thing or at the root of the element (NOT
        # the root of the file)
        transfer_elems = extract_elems(
            dist_elem, ['//*', 'MD_DigitalTransferOptions'])
        for transfer_elem in transfer_elems:
            # _transfer = {}
            # transfer['url'] = extract_item(
            #     transfer_elem,
            #     ['onLine', 'CI_OnlineResource', 'linkage', 'URL'])
            # transfer['objectid'] = generate_sha_urn(transfer['url'])
            # xp = generate_localname_xpath(
            #     ['..', '..', 'distributorFormat', 'MD_Format'])
            # format_elem = next(iter(transfer_elem.xpath(xp)), None)
            # if format_elem is not None:
            #     transfer['format'] = ' '.join([
            #         extract_item(format_elem,
            #                      ['name', 'CharacterString']),
            #         extract_item(
            #             format_elem, ['version', 'CharacterString'])])
            link = extract_item(
                transfer_elem,
                ['onLine', 'CI_OnlineResource', 'linkage', 'URL'])
            yield link

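# NOTE: _parse_distribution is a generator, so a caller has to drain it;
# a usage sketch (dist_info_elem is a hypothetical distributionInfo element):
#     links = list(self._parse_distribution(dist_info_elem))
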
def _parse_child(self, child):
    entry = {}
    entry["title"] = extract_item(child, ["title"])
    entry["id"] = extract_item(child, ["id"])
    entry["creator"] = extract_item(child, ["creator"])
    entry["author"] = extract_item(child, ["author", "name"])
    entry["date"] = extract_item(child, ["date"])
    entry["updated"] = extract_item(child, ["updated"])
    entry["published"] = extract_item(child, ["published"])
    entry["subjects"] = [e.attrib.get("term", "")
                         for e in extract_elems(child, ["category"])]

    entry["contents"] = []
    contents = extract_elems(child, ["content"])
    for content in contents:
        text = content.text.strip() if content.text else ""
        content_type = content.attrib.get("type", "")
        entry["contents"].append({"content": text, "type": content_type})

    entry["links"] = []
    links = extract_elems(child, ["link"])
    for link in links:
        href = link.attrib.get("href", "")
        rel = link.attrib.get("rel", "")
        entry["links"].append({"href": href, "rel": rel})

    return tidy_dict(entry)

def parse(self):
    ''' key = entry for atom and item for rss '''
    key = 'entry' if self.dialect == 'atom' else 'item'
    elems = extract_elems(self.parser.xml, ['//*', key])
    items = [self.item_class(elem).item for elem in elems]

    # TODO: add the root level parsing, ie the difference btwn atom and rss
    title = extract_item(self.parser.xml, ['title'])
    updated = extract_item(self.parser.xml, ['updated'])
    author_name = extract_item(self.parser.xml, ['author', 'name'])

    return {
        "title": title,
        "updated": updated,
        "author": author_name,
        "items": items
    }

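# A usage sketch, assuming a reader class wired up the way parse()
# implies (self.parser.xml holding the parsed document, self.dialect set
# from the root tag, self.item_class pointing at the per-entry reader);
# the constructor shown here is hypothetical:
#     reader = FeedReader(response_text)
#     feed = reader.parse()
#     for item in feed['items']:
#         print(item.get('title'))
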
def _parse_responsibleparty(self, elem):
    ''' parse any CI_ResponsibleParty '''
    individual_name = extract_item(
        elem, ['individualName', 'CharacterString'])
    organization_name = extract_item(
        elem, ['organisationName', 'CharacterString'])
    position_name = extract_item(
        elem, ['positionName', 'CharacterString'])

    e = extract_elem(elem, ['contactInfo', 'CI_Contact'])
    contact = self._parse_contact(e)

    return tidy_dict({
        "individual": individual_name,
        "organization": organization_name,
        "position": position_name,
        "contact": contact
    })

def _extract_age(self, protocol, xml):
    if protocol == "ISO":
        md_dates = extract_item(xml, ["//*", "dateStamp", "Date"])
        try:
            # this will use the current month and day if only the
            # year is provided (that isn't valid iso anyway but
            # i am not fixing it here)
            md_dates = dateparser.parse(md_dates)
        except Exception:
            return None
    elif protocol == "FGDC":
        md_date = extract_item(xml, ["//*", "metainfo", "metd"])
        try:
            md_dates = _convert_date(md_date)
        except Exception:
            md_dates = None
    else:
        return None

    return md_dates

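# _convert_date is referenced above (and as self._convert_date further
# down) but never shown. A minimal sketch of what it plausibly does,
# assuming FGDC metd/caldate values arrive as YYYYMMDD, YYYY-MM-DD, or
# bare-year strings; the format list is an assumption:
from datetime import datetime

def _convert_date(date_str):
    # try the common FGDC calendar-date layouts in order
    for fmt in ('%Y%m%d', '%Y-%m-%d', '%Y'):
        try:
            return datetime.strptime(date_str, fmt)
        except (ValueError, TypeError):
            continue
    return None
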
def _handle_operations(self):
    elems = extract_elems(
        self.elem, ['containsOperations', 'SV_OperationMetadata'])

    ops = []
    for e in elems:
        op = {}
        op['name'] = extract_item(
            e, ['operationName', 'CharacterString'])
        op['method'] = extract_attrib(
            e, ['DCP', 'DCPList', '@codeListValue'])
        op['url'] = extract_item(
            e, ['connectPoint', 'CI_OnlineResource', 'linkage', 'URL'])
        op['parameters'] = [
            self._handle_parameter(pe)
            for pe in extract_elems(e, ['parameters', 'SV_Parameter'])
        ]
        ops.append(op)
    return ops

def _parse_keywords(self, elem):
    ''' for each descriptiveKeywords block
        in an identification block '''
    keywords = []

    for key_elem in extract_elems(elem, ['descriptiveKeywords']):
        # TODO: split these up (if *-delimited in some way)
        terms = extract_items(
            key_elem, ['MD_Keywords', 'keyword', 'CharacterString'])
        key_type = extract_attrib(key_elem, [
            'MD_Keywords', 'type', 'MD_KeywordTypeCode', '@codeListValue'])
        thesaurus = extract_item(key_elem, [
            'MD_Keywords', 'thesaurusName', 'CI_Citation',
            'title', 'CharacterString'])

        if terms:
            keywords.append(
                tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "dc:partOf": thesaurus,
                    "bcube:hasType": key_type,
                    "bcube:hasValue": terms
                })
            )

    # TODO: add the Anchor element handling
    # ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

    # add a generic set for the iso topic category
    isotopics = extract_items(
        elem, ['topicCategory', 'MD_TopicCategoryCode'])
    if isotopics:
        keywords.append(
            tidy_dict({
                "object_id": generate_uuid_urn(),
                "dc:partOf": 'IsoTopicCategories',
                "bcube:hasValue": isotopics
            })
        )

    return keywords

def _parse_contact(self, elem):
    ''' parse any CI_Contact '''
    contact = {}
    if elem is None:
        return contact

    contact['phone'] = extract_item(
        elem, ['phone', 'CI_Telephone', 'voice', 'CharacterString'])
    contact['addresses'] = extract_items(
        elem, ['address', 'CI_Address', 'deliveryPoint', 'CharacterString'])
    contact['city'] = extract_item(
        elem, ['address', 'CI_Address', 'city', 'CharacterString'])
    contact['state'] = extract_item(
        elem, ['address', 'CI_Address', 'administrativeArea',
               'CharacterString'])
    contact['postal'] = extract_item(
        elem, ['address', 'CI_Address', 'postalCode', 'CharacterString'])
    contact['country'] = extract_item(
        elem, ['address', 'CI_Address', 'country', 'CharacterString'])
    contact['email'] = extract_item(
        elem, ['address', 'CI_Address', 'electronicMailAddress',
               'CharacterString'])
    return tidy_dict(contact)

def parse_item(self):
    ''' parse just the dc element (like oai_dc:dc)
        so if you're pulling this from an oai-pmh service, etc,
        make sure that it's *not* the full document '''
    # TODO: this is not correct for the overall thing
    if self.elem is None:
        return {}

    title = extract_item(self.elem, ['title'])
    creator = extract_item(self.elem, ['creator'])
    subjects = extract_items(self.elem, ['subject'])
    description = extract_item(self.elem, ['description'])
    date = extract_item(self.elem, ['date'])
    language = extract_item(self.elem, ['language'])
    publisher = extract_item(self.elem, ['publisher'])
    sources = extract_items(self.elem, ['source'])
    types = extract_items(self.elem, ['type'])

    return tidy_dict({
        'title': title,
        'creator': creator,
        'subjects': subjects,
        'abstract': description,
        'language': language,
        'date': date,
        'publisher': publisher,
        'types': types,
        'sources': sources
    })

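# tidy_dict is called at the end of nearly every parser here; a plausible
# sketch is a one-liner that strips empty values before returning (the
# real helper may be more selective):
def tidy_dict(d):
    # drop None, '', [], {} so downstream consumers only see real values
    return {k: v for k, v in d.items() if v}
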
def _parse_identification_info(self, elem):
    # ignoring the larger get-all-the-identifiers above
    # in favor of, hopefully, getting a better dataset id
    dataset_identifier = extract_item(elem, [
        'citation', 'CI_Citation', 'identifier',
        'MD_Identifier', 'code', 'CharacterString'])

    dataset = {
        "object_id": generate_uuid_urn(),
        "dc:identifier": dataset_identifier,
        "dc:description": extract_item(
            elem, ['abstract', 'CharacterString']),
        "dcterms:title": extract_item(
            elem, ['citation', 'CI_Citation', 'title', 'CharacterString']),
        "relationships": []
    }

    # TODO: i think the rights blob is not in the ontology prototypes
    # the rights information from MD_Constraints or MD_LegalConstraints
    # rights = extract_item(elem, ['resourceConstraints', '*',
    #                              'useLimitation', 'CharacterString'])

    # deal with the extent
    extents = self._parse_extent(elem)
    dataset.update(extents)

    keywords = self._parse_keywords(elem)
    for keyword in keywords:
        dataset['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": keyword['object_id']
        })

    return tidy_dict(dataset), keywords

def _handle_bbox(self, elem):
    west = extract_item(elem, ['westBoundLongitude', 'Decimal'])
    west = float(west) if west else 0
    east = extract_item(elem, ['eastBoundLongitude', 'Decimal'])
    east = float(east) if east else 0
    south = extract_item(elem, ['southBoundLatitude', 'Decimal'])
    south = float(south) if south else 0
    north = extract_item(elem, ['northBoundLatitude', 'Decimal'])
    north = float(north) if north else 0

    # NOTE: a bound of exactly 0 is falsy here, so a bbox touching the
    # equator or prime meridian falls through to the empty list
    bbox = [west, south, east, north] \
        if east and west and north and south else []

    geom = bbox_to_geom(bbox)

    return {
        "dc:spatial": to_wkt(geom),
        "esip:westBound": west,
        "esip:eastBound": east,
        "esip:southBound": south,
        "esip:northBound": north
    }

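# bbox_to_geom and to_wkt aren't defined in this section; one plausible
# implementation on top of shapely (an assumption; the real helpers may
# use OGR or hand-rolled WKT instead):
from shapely.geometry import box

def bbox_to_geom(bbox):
    # bbox arrives as [west, south, east, north];
    # shapely's box() wants (minx, miny, maxx, maxy)
    return box(bbox[0], bbox[1], bbox[2], bbox[3]) if bbox else None

def to_wkt(geom):
    return geom.wkt if geom is not None else None
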
def _parse_child(self, child):
    item = {}
    item["title"] = extract_item(child, ["title"])
    item["language"] = extract_item(child, ["language"])
    item["author"] = extract_item(child, ["author"])
    # TODO: go sort out what this is:
    #       http://purl.org/rss/1.0/modules/content/
    item["encoded"] = extract_item(child, ["encoded"])
    item["id"] = extract_item(child, ["guid"])
    item["creator"] = extract_item(child, ["creator"])
    item["subjects"] = extract_items(child, ["category"])
    item["published"] = extract_item(child, ["pubDate"])
    item["timestamp"] = extract_item(child, ["date"])
    item["links"] = extract_items(child, ["link"])
    item["links"] += extract_items(child, ["docs"])

    return tidy_dict(item)

def parse_identifiers(elem):
    # note that this elem is the root iso
    identifiers = []
    xps = [
        ['fileIdentifier', 'CharacterString'],
        ['identificationInfo', 'MD_DataIdentification',
         'citation', 'CI_Citation', 'identifier',
         'MD_Identifier', 'code', 'CharacterString'],
        ['dataSetURI', 'CharacterString']  # TODO: this can be multiple items
    ]
    for xp in xps:
        i = extract_item(elem, xp)
        if i:
            identifiers.append(i)
    return identifiers

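# Usage sketch for parse_identifiers, assuming an lxml-parsed ISO 19139
# record (the file name is hypothetical):
#     from lxml import etree
#     root = etree.parse('example-iso-record.xml').getroot()
#     identifiers = parse_identifiers(root)
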
def _handle_parameter(self, elem):
    ''' parse an SV_Parameter element '''
    param = {}
    param['name'] = extract_item(
        elem, ['name', 'aName', 'CharacterString'])
    param['inputType'] = extract_item(
        elem, ['name', 'attributeType', 'TypeName', 'aName',
               'CharacterString'])
    param['direction'] = extract_item(
        elem, ['direction', 'SV_ParameterDirection'])
    param['optional'] = extract_item(
        elem, ['optionality', 'CharacterString'])
    param['cardinality'] = extract_item(
        elem, ['repeatability', 'Boolean'])
    param['valueType'] = extract_item(
        elem, ['valueType', 'TypeName', 'aName', 'CharacterString'])
    return param

def parse_item(self, elem):
    # NOTE: parses self.elem; the elem argument is unused
    identifier = extract_item(self.elem, ['Entry_ID'])
    title = extract_item(self.elem, ['Entry_Title'])
    keywords = extract_items(self.elem, ['Keyword'])
    keywords += extract_items(self.elem, ['ISO_Topic_Category'])
    abstract = extract_item(self.elem, ['Summary'])
    organization = extract_item(self.elem, ['Originating_Center'])

    # temporal extent
    start_date = extract_item(
        self.elem, ['Temporal_Coverage', 'Start_Date'])
    end_date = extract_item(self.elem, ['Temporal_Coverage', 'End_Date'])
    temporal = [start_date, end_date] if start_date and end_date else []

    # spatial extent
    west = extract_item(
        self.elem, ['Spatial_Coverage', 'Westernmost_Longitude'])
    east = extract_item(
        self.elem, ['Spatial_Coverage', 'Easternmost_Longitude'])
    south = extract_item(
        self.elem, ['Spatial_Coverage', 'Southernmost_Latitude'])
    north = extract_item(
        self.elem, ['Spatial_Coverage', 'Northernmost_Latitude'])
    # the extracted bounds are strings; convert so bbox_to_geom
    # gets numbers (matching _handle_bbox above)
    bbox = [float(west), float(south), float(east), float(north)] \
        if west and east and north and south else []
    bbox = bbox_to_geom(bbox)
    bbox = to_wkt(bbox)

    distributions = []
    for related_url in extract_elems(self.elem, ['Related_URL']):
        url = extract_item(related_url, ['URL'])
        content_type = extract_item(
            related_url, ['URL_Content_Type', 'Type'])
        description = extract_item(related_url, ['Description'])
        dist = tidy_dict({
            "url": url,
            "description": description,
            "content_type": content_type
        })
        if dist:
            distributions.append(dist)

    return tidy_dict({
        "id": identifier,
        "title": title,
        "keywords": keywords,
        "abstract": abstract,
        "organization": organization,
        "bbox": bbox,
        "temporal": temporal,
        "distributions": distributions
    })

def parse_item(self):
    output = {}
    urls = set()

    catalog_object_id = generate_uuid_urn()
    output['catalog_record'] = {
        "object_id": catalog_object_id,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        # "dc:conformsTo": extract_attrib(
        #     self.elem, ['@noNamespaceSchemaLocation']).split(),
        "rdf:type": "FGDC:CSDGM",
        "relationships": [],
        "urls": []
    }
    output['urls'] = []

    # add the harvest info
    # this is not necessary as a sha just for set inclusion
    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    output['catalog_record']['urls'].append(original_url)
    # NOTE: this is not the sha from the url
    output['catalog_record']['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
    dataset_object_id = generate_uuid_urn()
    dataset = {
        "object_id": dataset_object_id,
        "dcterms:identifier": datsetid,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "dc:description": extract_item(
            self.elem, ['idinfo', 'descript', 'abstract']),
        "dcterms:title": extract_item(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
        "urls": [],
        "relationships": []
    }

    bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
    if bbox_elem is not None:
        # that's not even valid
        west = extract_item(bbox_elem, ['westbc'])
        east = extract_item(bbox_elem, ['eastbc'])
        north = extract_item(bbox_elem, ['northbc'])
        south = extract_item(bbox_elem, ['southbc'])
        bbox = [west, south, east, north]
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)
        dataset.update({
            "dc:spatial": bbox,
            "esip:westBound": west,
            "esip:eastBound": east,
            "esip:northBound": north,
            "esip:southBound": south
        })

    time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
    if time_elem is not None:
        caldate = extract_item(time_elem, ['sngdate', 'caldate'])
        if caldate:
            # TODO: we should see if it's at least a valid date
            dataset['esip:startDate'] = self._convert_date(caldate)

        rngdate = extract_elem(time_elem, ['rngdates'])
        if rngdate is not None:
            dataset['esip:startDate'] = self._convert_date(
                extract_item(rngdate, ['begdate']))
            dataset['esip:endDate'] = self._convert_date(
                extract_item(rngdate, ['enddate']))
        # TODO: add the min/max of the list of dates

    dataset['relationships'] = [{
        "relate": "bcube:hasMetadataRecord",
        "object_id": catalog_object_id
    }]

    publisher = {
        "object_id": generate_uuid_urn(),
        "name": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
        "location": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
    }
    output['publisher'] = publisher
    dataset['relationships'].append({
        "relate": "dcterms:publisher",
        "object_id": publisher['object_id']
    })

    distrib_elems = extract_elems(
        self.elem, ['distinfo', 'stdorder', 'digform'])
    for distrib_elem in distrib_elems:
        link = extract_item(
            distrib_elem,
            ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
        # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            # this is a distribution link so
            # we are assuming it is to data
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": url_id
            })

    webpages = []
    onlink_elems = extract_elems(
        self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
    for onlink_elem in onlink_elems:
        link = onlink_elem.text.strip() if onlink_elem.text else ''
        if not link:
            continue
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            webpages.append({
                "object_id": generate_uuid_urn(),
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })

    output['catalog_record']['webpages'] = webpages
    for webpage in webpages:
        dataset['relationships'].append({
            "relate": "dcterms:references",
            "object_id": webpage['object_id']
        })

    # retain the keyword sets with type, thesaurus name and split
    # the terms as best we can
    keywords = []
    key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
    if key_elem is not None:  # guard: some records have no keywords block
        for child in key_elem.iterchildren():
            key_type = extract_element_tag(child.tag)
            key_tag = 'strat' if key_type == 'stratum' else key_type
            key_tag = 'temp' if key_tag == 'temporal' else key_tag
            thesaurus = extract_item(child, ['%skt' % key_tag])

            # TODO: split these up
            terms = extract_items(child, ['%skey' % key_tag])

            if terms:
                # if there's a parsing error (bad cdata, etc) may not have
                # TODO: add something for a set without a thesaurus name
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    })
                )
    output['keywords'] = keywords
    for keyword in keywords:
        dataset['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": keyword['object_id']
        })

    output['datasets'] = [dataset]

    # add the metadata relate
    output['catalog_record']['relationships'].append({
        "relate": "foaf:primaryTopic",
        "object_id": dataset_object_id
    })

    output['catalog_records'] = [output['catalog_record']]
    del output['catalog_record']

    self.description = tidy_dict(output)

def _parse_service(self):
    output = {}
    urls = set()

    service = {
        "object_id": generate_uuid_urn(),
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "rdf:type": 'OpenSearch1.1:Description',
        "dcterms:title": extract_item(self.parser.xml, ["ShortName"]),
        "dc:description": ' '.join(
            extract_items(self.parser.xml, ["LongName"]) +
            extract_items(self.parser.xml, ["Description"])),
        "urls": [],
        "webpages": [],
        "relationships": []
    }

    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    service['urls'].append(original_url)
    service['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    # output['source'] = extract_items(self.parser.xml, ["Attribution"])
    # output['contact'] = extract_items(self.parser.xml, ["Developer"])
    # output['rights'] = extract_items(self.parser.xml, ["SyndicationRight"])

    key_id = generate_uuid_urn()
    output['keywords'] = [{
        "object_id": key_id,
        "bcube:hasValue": extract_items(self.parser.xml, ["Tags"])
    }]
    service['relationships'].append({
        "relate": "dc:conformsTo",
        "object_id": key_id
    })

    for t in extract_elems(self.parser.xml, ['Url']):
        ep = self._parse_endpoint(t)
        url_sha = generate_sha_urn(ep['url'])
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Generated",
                "bcube:hasConfidence": "Not Sure",
                "vcard:hasURL": ep['url'],
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            service['urls'].append(dist)

            wb_id = generate_uuid_urn()
            service['webpages'].append({
                "object_id": wb_id,
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })
            service['relationships'].append({
                "relate": "dcterms:references",
                "object_id": wb_id
            })

    output['services'] = [service]
    return tidy_dict(output)

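# _parse_endpoint isn't shown in this section. In an OpenSearch 1.1
# description document each Url element carries template and type
# attributes, so a minimal sketch could be the following; the returned
# key names beyond 'url' are assumptions:
def _parse_endpoint(self, elem):
    return {
        "url": elem.attrib.get('template', ''),
        "mimetype": elem.attrib.get('type', '')
    }
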