def read(self, xml):
    """Extract package data from given XML.

    :param xml: xml element (lxml)
    :return: oaipmh.common.Metadata object generated from xml
    """
    # Start from the generic flattened metadata map, then attach the
    # harvester-specific 'unified' dict produced by read_data().
    metadata_map = generic_xml_metadata_reader(xml).getMap()
    metadata_map['unified'] = self.read_data(xml)
    return oaipmh.common.Metadata(xml, metadata_map)
def read(self, xml):
    """Extract package data from given XML.

    :param xml: xml element (lxml)
    :return: oaipmh.common.Metadata object generated from xml
    """
    # Start from the generic flattened metadata map, then attach the
    # harvester-specific 'unified' dict produced by read_data().
    metadata_map = generic_xml_metadata_reader(xml).getMap()
    metadata_map['unified'] = self.read_data(xml)
    return oaipmh.common.Metadata(metadata_map)
def _ddi2ckan(self, original_url, original_xml, harvest_object):
    '''Extract package values from bs4 object 'ddi_xml' parsed from xml.

    Builds a CKAN package dict from the DDI codeBook (self.ddi_xml,
    a BeautifulSoup tree) with FSD-specific fallbacks.

    :param original_url: URL the metadata was harvested from; used to
        detect FSD-specific handling via _is_fsd()
    :param original_xml: raw harvested XML (not read directly here)
    :param harvest_object: harvest object; when given, its content is
        cleared at the end and its source's owner organisation is looked up
    :return: package dict ready for CKAN import
    '''
    # TODO: Use .extract() and .string.extract() function so handled elements are removed from ddi_xml.
    # Evaluated lazily by self._read_value() as attribute-path strings:
    doc_citation = "ddi_xml.codeBook.docDscr.citation"
    stdy_dscr = "ddi_xml.codeBook.stdyDscr"

    ####################################################################
    # Read mandatory metadata fields:                                  #
    ####################################################################
    # Authors & organizations
    authors = self.get_authors(self.ddi_xml.stdyDscr.citation, 'AuthEnty')
    agent = authors[:]
    agent.extend(self.get_contributors(self.ddi_xml.stdyDscr.citation))

    # Availability: FSD detection takes precedence over a found
    # access-request URL.
    availability = AVAILABILITY_DEFAULT
    if _access_request_URL_is_found():
        availability = 'direct_download'
    if _is_fsd(original_url):
        availability = AVAILABILITY_FSD

    # Keywords
    keywords = self.get_keywords(self.ddi_xml.stdyDscr.stdyInfo.subject)

    # Language
    # TODO: Where/how to extract multiple languages: 'language': u'eng, fin, swe' ?
    language = self.convert_language(
        self._read_value("ddi_xml.codeBook.get('xml:lang')"))

    # Titles: study-description titles preferred, doc-citation titles as
    # mandatory fallback.
    titles = self._read_value(stdy_dscr + ".citation.titlStmt(['titl', 'parTitl'])") or \
        self._read_value(doc_citation + ".titlStmt(['titl', 'parTitl'])", mandatory_field=True)

    # langtitle=[dict(lang=self.convert_language(a.get('xml:lang', '')), value=a.text) for a in titles]
    # [{"lang":"fin", "value":"otsikko"}, {"lang:"en", "value":"title"}]

    # convert the titles to a JSON string of type {"fin":"otsikko", "eng","title"}
    transl_json = {}
    first_title = ""
    # default to finnish, since first title has no lang value, which causes the validator to whine
    # we might want to update the DDI harvester to accept a language configuration parameter, if
    # we decide to harvest DDI resources from other sources.
    default_lang = "fi"
    for title in titles:
        transl_json[self.convert_language(title.get('xml:lang', default_lang))] = title.text
        # we want to get save the first title for use lateron
        if not first_title:
            first_title = title.text

    # NOTE: rebinds the loop variable 'title' to the JSON string.
    title = json.dumps(transl_json)

    # License
    # TODO: Extract prettier output. Should we check that element contains something?
    # Should this be in optional section if not mandatory_field?
    license_url = self._read_value(stdy_dscr + ".dataAccs.useStmt.get_text(separator=u' ')", mandatory_field=False)
    if _is_fsd(original_url):
        license_id = LICENSE_ID_FSD
    else:
        license_id = LICENSE_ID_DEFAULT

    # Contact (package_extra.key: contact_[k]_name in database, contact in WUI)
    contact_name = self._read_value(stdy_dscr + ".citation.distStmt('contact')") or \
        self._read_value(stdy_dscr + ".citation.distStmt('distrbtr')") or \
        self._read_value(doc_citation + ".prodStmt('producer')", mandatory_field=True)
    # TODO: clean out (or ask FSD to clean) mid text newlines (eg. in FSD2482)
    # contact_name is a list of elements here; take the first element's
    # text, or fall back to the producer's affiliation attribute.
    if contact_name and contact_name[0].text:
        contact_name = contact_name[0].text
    else:
        contact_name = self._read_value(stdy_dscr + ".citation.prodStmt.producer.get('affiliation')", mandatory_field=True)
    if _is_fsd(original_url):
        contact_email = CONTACT_EMAIL_FSD
        # TODO: Allow trying other email also in FSD metadata
    else:
        contact_email = self._read_value(stdy_dscr + ".citation.distStmt.contact.get('email')", mandatory_field=True)

    # Modified date
    version = self.get_attr_optional(self.ddi_xml.stdyDscr.citation, 'prodDate', 'date') or \
        self.get_attr_mandatory(self.ddi_xml.stdyDscr.citation, 'version', 'date')

    # This idNos is an FSD specific solution
    idNos = self._read_value(stdy_dscr + ".citation.titlStmt.find_all('IDNo')", mandatory_field=False)
    if not idNos:
        idNos = self._read_value(doc_citation + ".titlStmt.find_all('IDNo')", mandatory_field=True)

    pids = list()
    idNoValues = [bsIdNo.text for bsIdNo in idNos]
    agencies = [bsIdNo.get('agency') for bsIdNo in idNos]
    primary_pid = None
    # The two lists always have equal length (both built from idNos), so
    # this guard is effectively always true.
    if len(idNoValues) == len(agencies):
        for idNoVal, agency in zip(idNoValues, agencies):
            # 'Kansalliskirjasto' (National Library of Finland) IDs are
            # treated as the primary PID; other agencies become relations.
            if agency == 'Kansalli' \
                         'skirjasto':
                pids.append({'id': idNoVal,
                             'type': 'primary',
                             'provider': agency})
                primary_pid = idNoVal
            else:
                pids.append({'id': agency + idNoVal,
                             'type': 'relation',
                             'provider': agency,
                             'relation': 'generalRelation'})

    # Should we generate a version PID?
    # vpid = utils.generate_pid()
    # pids.append({'id': vpid, 'type': 'version', 'provider': 'kata'})

    # Original web page as resource
    # For FSD 'URI' leads to summary web page of data, hence format='html'
    orig_web_page = self._read_value(doc_citation + ".holdings.get('URI', '')")
    if orig_web_page:
        orig_web_page_resource = {'description': first_title,
                                  'format': u'html',
                                  'resource_type': 'documentation',
                                  'url': orig_web_page}
    else:
        orig_web_page_resource = {}

    # Owner
    owner = self._read_value(stdy_dscr + ".citation.prodStmt.producer.text") or \
        self._read_value(stdy_dscr + ".citation.rspStmt.AuthEnty.text") or \
        self._read_value(doc_citation + ".prodStmt.producer.string", mandatory_field=True)
    agent.append({'role': 'owner',
                  'name': owner})

    # Owner organisation: resolved through the harvest source's package.
    # NOTE(review): assumes harvest_source_id matches a Package.id —
    # verify against the harvest model.
    if harvest_object:
        hsid = harvest_object.harvest_source_id
        hsooid = model.Session.query(model.Package).filter(model.Package.id==hsid).one().owner_org
        owner_org = model.Session.query(model.Group).filter(model.Group.id==hsooid).one().name
    else:
        owner_org = u''

    # Distributor (Agent: distributor, the same is used as contact)
    agent.append({
        'role': 'distributor',
        'name': contact_name})

    ####################################################################
    # Read optional metadata fields:                                   #
    ####################################################################
    # Availability
    if _is_fsd(original_url):
        access_request_url = ACCESS_REQUEST_URL_FSD
    else:
        access_request_url = u''

    # Contact
    contact_phone = self._read_value(doc_citation + ".holdings.get('callno')") or \
        self._read_value(stdy_dscr + ".citation.holdings.get('callno')")

    # NOTE(review): operator precedence makes this parse as
    # `(a or b or c or CONTACT_URL_FSD) if _is_fsd(original_url) else None`,
    # so non-FSD sources always get contact_URL = None even when the
    # metadata contains a URI. Looks unintended — confirm before changing.
    contact_URL = self._read_value(
        stdy_dscr + ".dataAccs.setAvail.accsPlac.get('URI')") or \
        self._read_value(
            stdy_dscr + ".citation.distStmt.contact.get('URI')") or \
        self._read_value(
            stdy_dscr + ".citation.distStmt.distrbtr.get('URI')") or \
        CONTACT_URL_FSD if _is_fsd(original_url) else None

    # convert the descriptions to a JSON string of type {"fin":"aineiston kuvaus", "eng","dataset description"}
    descriptions = self._read_value(stdy_dscr + ".stdyInfo.abstract('p')")
    if not descriptions:
        descriptions = self._read_value(stdy_dscr + ".citation.serStmt.serInfo('p')")

    translated_notes = {}
    # Multiple paragraphs in the same language are joined with blank lines.
    for des in descriptions:
        lang = self.convert_language(des.get('xml:lang', 'fi'))
        if lang in translated_notes:
            translated_notes[lang] += '\r\n\r\n' + des.text
        else:
            translated_notes[lang] = des.text

    notes = json.dumps(translated_notes)

    # Discipline
    discipline = self.get_discipline(self.ddi_xml.stdyDscr.stdyInfo.subject)

    # Dataset lifetime events
    events = self._get_events(stdy_dscr, authors)

    # Geographic coverage
    geo_cover = self.get_geo_coverage(self.ddi_xml)

    # Temporal coverage
    temp_start, temp_end = self.get_temporal_coverage(self.ddi_xml)

    # Citation
    citation = self._read_value(stdy_dscr + ".citation.biblCit.text", mandatory_field=False)

    ####################################################################
    # Flatten rest to 'XPath/path/to/element': 'value' pairs           #
    ####################################################################
    # Round-trip the bs4 tree through lxml so the generic flattener can
    # walk docDscr and stdyDscr subtrees.
    etree_xml = etree.fromstring(str(self.ddi_xml))
    flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}docDscr'))
    xpath_dict = flattened_ddi.getMap()
    flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}stdyDscr'))
    xpath_dict.update(flattened_ddi.getMap())

    # Reuse an existing package id for this primary PID if one exists,
    # otherwise mint a new one.
    existing_package_id = get_package_id_by_pid(primary_pid, u'primary')
    package_id = existing_package_id if existing_package_id else get_unique_package_id()
    package_name = pid_to_name(package_id)

    package_dict = dict(
        access_application_URL=u'',
        access_request_URL=unicode(access_request_url),
        agent=agent,
        algorithm=u'',  # To be implemented straight in 'resources'
        availability=unicode(availability),
        contact=[{'name': contact_name,
                  'email': contact_email,
                  'URL': contact_URL,
                  'phone': contact_phone}],
        direct_download_URL=u'',  # To be implemented straight in 'resources
        discipline=discipline,
        event=events,
        geographic_coverage=geo_cover,
        groups=[],
        id=package_id,
        langdis=u'True',  # NOTE!
        language=language,
        license_URL=license_url,
        license_id=license_id,
        mimetype=u'',  # To be implemented straight in 'resources
        name=package_name,
        notes=notes or u'',
        pids=pids,
        owner_org=owner_org,
        resources=[orig_web_page_resource],
        tag_string=keywords,
        temporal_coverage_begin=temp_start,
        temporal_coverage_end=temp_end,
        title=title,
        type='dataset',
        version=version,
        version_PID='',
        citation=citation
    )
    package_dict['xpaths'] = xpath_dict
    # Above line creates:
    # package_dict = {
    #     'access_request_url': 'some_url',
    #     # ...
    #     'xpaths': {'stdyDscr/othrStdyMat.0/relPubl.34':
    #                'Uskon asia: nuorisobarometri 2006 (2006).'},
    #               {'stdyD...': 'Some value'}]
    # }
    #package_dict['extras'].update(_save_ddi_variables_to_csv(ddi_xml, somepkg))

    # Order of the old code:
    #_save_original_xml_and_link_as_resources()
    #_save_ddi_variables_to_csv()
    #_create_group_based_on_organizations()
    #_last_statements_to_rewrite()

    # JuhoL: Set harvest object to some end state and commit
    if harvest_object is not None:
        harvest_object.content = None
        # Should this be flushed?
        model.Session.flush()
        #model.repo.commit()

    return package_dict