def metadata(self):
    """Assemble the OAI-PMH (Dublin Core style) metadata payload.

    The element argument of common.Metadata is None because the map is
    built directly from this object's accessors rather than parsed
    from an XML element.
    """
    # canonical HTML landing page for this document
    html_url = self._context["url_for_html"](
        acron=self.data["journal_acron"],
        doc_id=self.data["doc_id"],
    )
    field_map = {
        "title": self._title(),
        "creator": self._creators(),
        "subject": self._subject(),
        "description": self._description(),
        "publisher": self._publisher(),
        "date": self._date(),
        "type": [fetch_pubtype_from_vocabulary(self.data.get("type"))],
        "format": ["text/html"],
        "identifier": [html_url],
        "source": [],
        "language": self._language(),
        "relation": self._relation(),
        # every record is advertised as open access
        "rights": ["info:eu-repo/semantics/openAccess"],
    }
    return common.Metadata(None, field_map)
def __call__(self, element):
    """Evaluate the configured XPath expressions against *element* and
    return the extracted values wrapped as oaipmh Metadata.

    :param element: lxml element to evaluate field XPaths against.
    :returns: common.Metadata wrapping a field-name -> value map.
    :raises Error: when a field is declared with an unknown type.
    """
    # renamed from ``map`` to avoid shadowing the builtin
    field_map = {}
    # create XPathEvaluator for this element
    xpath_evaluator = etree.XPathEvaluator(element,
                                           namespaces=self._namespaces)
    e = xpath_evaluator.evaluate
    # now extract field info according to xpath expr
    for field_name, (field_type, expr) in self._fields.items():
        if field_type == 'bytes':
            value = str(e(expr))
        elif field_type == 'bytesList':
            value = [str(item) for item in e(expr)]
        elif field_type == 'text':
            # make sure we get back unicode strings instead
            # of lxml.etree._ElementUnicodeResult objects.
            value = unicode(e(expr))
        elif field_type == 'textList':
            # make sure we get back unicode strings instead
            # of lxml.etree._ElementUnicodeResult objects.
            value = [unicode(v) for v in e(expr)]
        else:
            # parenthesised raise works on both Python 2 and 3
            # (was the Python-2-only ``raise Error, "..."`` form)
            raise Error("Unknown field type: %s" % field_type)
        field_map[field_name] = value
    return common.Metadata(element, field_map)
def add_record(result, so, resource_sets):
    # Append a (header, metadata, about) tuple for storage object *so*
    # to *result*, rendered in the requested metadata format.
    # NOTE(review): ``from_``, ``until`` and ``metadataPrefix`` come from
    # the enclosing scope — presumably a ListRecords/ListIdentifiers
    # handler; confirm against the caller.
    header = common.Header(so.identifier, so.modified, resource_sets,
                           so.deleted)
    # only emit records whose datestamp falls inside the harvest window
    if datestampInRange(header, from_, until):
        if metadataPrefix == 'metashare':
            map_class = MetashareMap()
            map_ = map_class.getMap(so.metadata)
        elif metadataPrefix == 'olac':
            map_class = OlacMap()
            map_ = map_class.getMap(so.metadata)
        elif metadataPrefix == 'cmdi':
            map_class = CmdiMap()
            resource = so.resourceinfotype_model_set.all()[0]
            identifiers = resource.identificationInfo.identifier
            # fall back to an empty DOI when no identifier is recorded
            doi = identifiers[0] if identifiers else ""
            map_ = _add_elements_to_cmdi_metadata(
                map_class.getMap(so.metadata),
                resource.pk,
                so.identifier,
                'https://doi.org/' + doi,
            )
        else:
            # Python 2 raise syntax, kept byte-identical
            raise CannotDisseminateFormatError, \
                '%s metadata format is not supported' % metadataPrefix
        metadata = common.Metadata(map_)
        result.append((header, metadata, None))
def record_for_book(self, book, headers_only=False):
    """Build an OAI-PMH record for *book*.

    :param book: a live ``Book`` or a ``Deleted`` tombstone.
    :param headers_only: when True, return only the header.
    :returns: ``header`` or ``(header, meta, about)``.
    :raises TypeError: if *book* is neither ``Book`` nor ``Deleted``
        (previously this fell through and crashed with NameError).
    """
    meta = None
    about = None
    identifier = self.slug_to_identifier(book.slug)
    if isinstance(book, Book):
        # setSpec = map(self.tag_to_setspec, book.tags.filter(category__in=self.TAG_CATEGORIES))
        header = common.Header(identifier, book.changed_at, [], False)
        if not headers_only:
            meta = common.Metadata(self.metadata(book))
    elif isinstance(book, Deleted):
        # deleted records keep a header (deleted flag set) with an
        # empty metadata map
        header = common.Header(identifier, book.deleted_at, [], True)
        if not headers_only:
            meta = common.Metadata({})
    else:
        raise TypeError("Unsupported object type: %r" % type(book))
    if headers_only:
        return header
    return header, meta, about
def __init__(self):
    """Seed the fake server with one record per month of 2005."""
    records = []
    for month in range(1, 13):
        # identifiers are the zero-based month index as a string
        ident = str(month - 1)
        stamp = datetime(2005, month, 1, 12, 30, 0)
        header = common.Header(ident, stamp, '', False)
        meta = common.Metadata({'title': ['Title %s' % ident]})
        records.append((header, meta, None))
    self._data = records
def createFakeData():
    """Generate 100 fake (header, metadata, about) tuples whose
    datestamps are spread deterministically over 2004."""
    records = []
    for i in range(100):
        # create some datestamp spread
        stamp = datetime(2004, i % 12 + 1, i % 28 + 1,
                         i % 24, i % 60, i % 60)
        records.append((
            common.Header(str(i), stamp, '', False),
            common.Metadata({'title': ['Title %s' % i]}),
            None,
        ))
    return records
def _record_for_dataset(self, dataset, set_spec):
    '''Show a tuple of a header and metadata for this dataset.

    Builds Dublin-Core-style metadata from the CKAN package dict,
    merges in the dataset extras, and list-wraps scalar values.
    '''
    package = get_action('package_show')({}, {'id': dataset.id})
    # coverage: comma-split geographic regions plus a "begin/end"
    # temporal range when either bound is present
    coverage = []
    temporal_begin = package.get('temporal_coverage_begin', '')
    temporal_end = package.get('temporal_coverage_end', '')
    geographic = package.get('geographic_coverage', '')
    if geographic:
        coverage.extend(geographic.split(','))
    if temporal_begin or temporal_end:
        coverage.append("%s/%s" % (temporal_begin, temporal_end))
    # all persistent identifiers, plus the package id and its site URL
    pids = [
        pid.get('id') for pid in package.get('pids', {})
        if pid.get('id', False)
    ]
    pids.append(package.get('id'))
    pids.append(
        config.get('ckan.site_url') +
        url_for(controller="package", action='read', id=package['name']))
    meta = {#'title': self._get_json_content(package.get('title', None) or package.get('name')),
        'identifier': pids,
        'type': ['dataset'],
        'language': [l.strip() for l in package.get('language').split(",")]
        if package.get('language', None) else None,
        'description': self._get_json_content(package.get('notes'))
        if package.get('notes', None) else None,
        'subject': [tag.get('display_name') for tag in package['tags']]
        if package.get('tags', None) else None,
        'date': [dataset.metadata_created.strftime('%Y-%m-%d')]
        if dataset.metadata_created else None,
        'rights': [package['license_title']]
        if package.get('license_title', None) else None,
        'coverage': coverage if coverage else [],
    }
    # NOTE(review): list + list of items() is Python 2 only; extras
    # take lower precedence than the computed meta values here
    iters = dataset.extras.items()
    meta = dict(iters + meta.items())
    metadata = {}
    # Fixes the bug on having a large dataset being scrambled to individual
    # letters
    for key, value in meta.items():
        if not isinstance(value, list):
            metadata[str(key)] = [value]
        else:
            metadata[str(key)] = value
    return (common.Header('', dataset.id, dataset.metadata_created,
                          set_spec, False),
            common.Metadata('', metadata), None)
def getRecord(self, metadataPrefix, identifier):
    """
    OAI-PMH verb, GetRecord.
    See http://www.openarchives.org/OAI/openarchivesprotocol.html#GetRecord

    :param metadataPrefix: one of 'metashare', 'olac' or 'cmdi'.
    :param identifier: OAI identifier of the storage object.
    :returns: (header, metadata, about) tuple.
    :raises IdDoesNotExistError: when no published object matches.
    :raises CannotDisseminateFormatError: for unsupported prefixes.
    """
    self.check_no_multivalue_arg()
    checkMetadataPrefix(metadataPrefix)
    try:
        so = StorageObject.objects.get(identifier=identifier,
                                       publication_status=PUBLISHED)
    # was a bare ``except:``, which also masked programming errors
    # and KeyboardInterrupt as "id does not exist"
    except StorageObject.DoesNotExist:
        raise IdDoesNotExistError(
            'Resource [%s] does not exist' % identifier)
    else:
        resource_sets = _get_setSpecs(so)
        header = common.Header(identifier, so.modified, resource_sets,
                               so.deleted)
        if metadataPrefix == 'metashare':
            map_class = MetashareMap()
            map_ = map_class.getMap(so.metadata)
        elif metadataPrefix == 'olac':
            map_class = OlacMap()
            map_ = map_class.getMap(so.metadata)
        elif metadataPrefix == 'cmdi':
            map_class = CmdiMap()
            resource = so.resourceinfotype_model_set.all()[0]
            identifiers = resource.identificationInfo.identifier
            # fall back to an empty DOI when none is recorded
            doi = identifiers[0] if identifiers else ""
            map_ = _add_elements_to_cmdi_metadata(
                map_class.getMap(so.metadata),
                resource.pk,
                so.identifier,
                'https://doi.org/' + doi,
            )
        else:
            # parenthesised raise works on both Python 2 and 3
            raise CannotDisseminateFormatError(
                '%s metadata format is not supported' % metadataPrefix)
        metadata = common.Metadata(map_)
        return header, metadata, None
def __call__(self, element):
    """Decode the element's text content as JSON and wrap the resulting
    payload as oaipmh Metadata."""
    payload = json.loads(element.text)
    return common.Metadata(element, payload)
def _record_for_dataset_datacite(self, dataset, set_spec):
    '''Show a tuple of a header and metadata for this dataset,
    in DataCite layout.

    :param dataset: CKAN dataset (package) object.
    :param set_spec: OAI set spec(s) for the record header.
    '''
    package = get_action('package_show')({}, {'id': dataset.id})

    # coverage: comma-split geographic regions plus a "begin/end"
    # temporal range when either bound is present
    coverage = []
    temporal_begin = package.get('temporal_coverage_begin', '')
    temporal_end = package.get('temporal_coverage_end', '')
    geographic = package.get('geographic_coverage', '')
    if geographic:
        coverage.extend(geographic.split(','))
    if temporal_begin or temporal_end:
        coverage.append("%s/%s" % (temporal_begin, temporal_end))

    # Flatten the extras table ([{'key': ..., 'value': ...}, ...]) into
    # a plain dict.  The old inner .iteritems() loop reassigned the same
    # pair once per row key (redundant) and was Python 2 only.
    extras = {}
    for item in package['extras']:
        extras[item['key']] = item['value']

    identifiers = self._set_id(package, extras)

    subj = [tag.get('display_name') for tag in package['tags']
            ] if package.get('tags', None) else None
    if 'Discipline' in extras:
        # BUG FIX: previously crashed with AttributeError when the
        # dataset had no tags (subj is None) but had a Discipline extra
        subj = subj or []
        subj.append(extras['Discipline'])

    meta = {
        'identifier': identifiers[0],
        'identifierType': identifiers[1],
        'alternateIdentifier': identifiers[2],
        'alternateIdentifierType': identifiers[3],
        'creator': package['author'].split(";")
        if 'author' in package else None,
        'publisher': extras.get('Publisher'),
        'publicationYear': extras.get('PublicationYear'),
        'publicationTimestamp': extras.get('PublicationTimestamp'),
        'resourceType': extras.get('ResourceType'),
        'language': extras.get('Language'),
        'titles': package.get('title', None) or package.get('name'),
        'contributor': package['author'].split(";")
        if 'author' in package else None,
        'descriptions': self._get_json_content(package.get('notes'))
        if package.get('notes', None) else None,
        'subjects': subj,
        # open-access status is carried separately in 'openAccess',
        # so strip the marker out of the rights statement
        'rights': extras['Rights'].replace(
            'info:eu-repo/semantics/openAccess', '')
        if 'Rights' in extras else None,
        'openAccess': extras.get('OpenAccess'),
        'coverage': coverage if coverage else None,
    }

    metadata = {}
    # Fixes the bug on having a large dataset being scrambled to
    # individual letters (list-wrap scalar values)
    for key, value in meta.items():
        if value and not isinstance(value, list):
            metadata[str(key)] = [value]
        else:
            metadata[str(key)] = value

    return (common.Header('', dataset.id, dataset.metadata_created,
                          set_spec, False),
            common.Metadata('', metadata), None)
def nrd_metadata_reader(xml):
    '''Read metadata in NRD schema

    This function takes NRD metadata as an lxml.etree.Element object,
    and returns the same metadata as a dictionary, with central TTA
    elements picked to format-independent keys.

    :param xml: RDF metadata as XML-encoded NRD
    :type xml: lxml.etree.Element instance
    :returns: a metadata dictionary
    :rtype: a hash from string to any value
    '''
    # start from the generic RDF mapping and overlay NRD-specific fields
    result = rdf_reader(xml).getMap()

    def document_attrs(source, dest, result):
        '''Callback for copying document attributes'''
        copy_element(source + '/dct:title', dest + '/title', result)
        copy_element(source + '/dct:identifier', dest, result)
        copy_element(source + '/dct:creator', dest + '/creator.0/name',
                     result)
        copy_element(source + '/nrd:creator', dest + '/creator', result,
                     person_attrs)
        copy_element(source + '/dct:description', dest + '/description',
                     result)

    def funding_attrs(source, dest, result):
        '''Callback for copying project attributes'''
        copy_element(source + '/rev:arpfo:funds.0/arpfo:grantNumber',
                     dest + '/fundingNumber', result)
        copy_element(source + '/rev:arpfo:funds.0/rev:arpfo:provides',
                     dest + '/funder', result, person_attrs)

    def file_attrs(source, dest, result):
        '''Callback for copying manifestation attributes'''
        copy_element(source + '/dcat:mediaType', dest + '/mimetype',
                     result)
        copy_element(source + '/fp:checksum.0/fp:checksumValue.0',
                     dest + '/checksum.0', result)
        copy_element(source + '/fp:checksum.0/fp:generator.0',
                     dest + '/checksum.0/algorithm', result)
        copy_element(source + '/dcat:byteSize', dest + '/size', result)

    # (source path, destination key, optional per-element copy callback)
    mapping = [
        (u'dataset', u'versionidentifier', None),
        (u'dataset/nrd:continuityIdentifier', u'continuityidentifier',
         None),
        (u'dataset/rev:foaf:primaryTopic.0/nrd:metadataIdentifier',
         u'metadata/identifier', None),
        (u'dataset/rev:foaf:primaryTopic.0/nrd:metadataModified',
         u'metadata/modified', None),
        (u'dataset/dct:title', u'title', None),
        (u'dataset/nrd:modified', u'modified', None),
        (u'dataset/nrd:rights', u'rights', None),
        (u'dataset/nrd:language', u'language', None),
        (u'dataset/nrd:owner', u'owner', person_attrs),
        (u'dataset/nrd:creator', u'creator', person_attrs),
        (u'dataset/nrd:distributor', u'distributor', person_attrs),
        (u'dataset/nrd:contributor', u'contributor', person_attrs),
        (u'dataset/nrd:subject', u'subject', None),  # fetch tags?
        (u'dataset/nrd:producerProject', u'project', funding_attrs),
        (u'dataset/dct:isPartOf', u'collection', document_attrs),
        (u'dataset/dct:requires', u'requires', None),
        (u'dataset/nrd:discipline', u'discipline', None),
        (u'dataset/nrd:temporal', u'temporalcoverage', None),
        (u'dataset/nrd:spatial', u'spatialcoverage', None),  # names?
        (u'dataset/nrd:manifestation', u'resource', file_attrs),
        (u'dataset/nrd:observationMatrix', u'variables', None),  # TODO
        (u'dataset/nrd:usedByPublication', u'publication',
         document_attrs),
        (u'dataset/dct:description', u'description', None),
    ]
    for source, dest, callback in mapping:
        copy_element(source, dest, result, callback)
    try:
        # rights is embedded XML; classify it by its RIGHTSCATEGORY
        # attribute and pull the license / access URL accordingly
        rights = lxml.etree.XML(result[u'rights'])
        rightsclass = rights.attrib['RIGHTSCATEGORY'].lower()
        result[u'rightsclass'] = rightsclass
        if rightsclass == 'licensed':
            result[u'license'] = rights[0].text
        if rightsclass == 'contractual':
            result[u'accessURL'] = rights[0].text
    except:
        # NOTE(review): bare except deliberately leaves the rights
        # fields unset when the rights XML is missing or malformed
        pass
    return oc.Metadata(result)
def __call__(self, metashare_elem ):
    """Serialize the first child of *metashare_elem* as pretty-printed
    XML and expose it under the ``raw_xml`` key."""
    raw = etree.tostring(metashare_elem[0], pretty_print=True)
    return common.Metadata({"raw_xml": raw})
def _record_for_dataset_b2f(self, dataset, set_spec):
    '''Show a tuple of a header and metadata for this dataset,
    in B2FIND (b2f) layout.

    :param dataset: CKAN dataset (package) object.
    :param set_spec: OAI set spec(s) for the record header.
    '''
    package = get_action('package_show')({}, {'id': dataset.id})

    # Flatten the extras table ([{'key': ..., 'value': ...}, ...]) into
    # a plain dict.  The old inner .iteritems() loop reassigned the same
    # pair once per row key (redundant) and was Python 2 only.
    extras = {}
    for item in package['extras']:
        extras[item['key']] = item['value']

    identifiers = self._set_id(package, extras)

    keywords = [tag.get('display_name') for tag in package['tags']
                ] if package.get('tags', None) else None

    # extras.get(k) is equivalent to the old
    # ``extras[k] if k in extras else None`` form
    meta = {
        'community': package.get('group', None),
        'DOI': extras.get('DOI'),
        'PID': extras.get('PID'),
        'version': extras.get('Version'),
        'source': package.get('url', None),
        'relatedIdentifier': extras.get('RelatedIdentifier'),
        'creator': package['author'].split(";")
        if 'author' in package else None,
        'publisher': extras.get('Publisher'),
        'contact': extras.get('Contact'),
        'publicationYear': extras.get('PublicationYear'),
        'metadataAccess': extras.get('MetaDataAccess'),
        'resourceType': extras.get('ResourceType'),
        'language': extras.get('Language'),
        'titles': package.get('title', None) or package.get('name'),
        'contributor': extras.get('Contributor'),
        'descriptions': self._get_json_content(package.get('notes'))
        if package.get('notes', None) else None,
        'keywords': keywords,
        'disciplines': extras.get('Discipline'),
        # open-access status is carried separately in 'openAccess',
        # so strip the marker out of the rights statement
        'rights': extras['Rights'].replace(
            'info:eu-repo/semantics/openAccess', '')
        if 'Rights' in extras else None,
        'openAccess': extras.get('OpenAccess'),
        'size': extras.get('Size'),
        'format': extras.get('Format'),
        'spatialCoverage': extras.get('SpatialCoverage'),
        'temporalCoverage': extras.get('TemporalCoverage'),
        'fundingReference': extras.get('FundingReference'),
    }

    metadata = {}
    # Fixes the bug on having a large dataset being scrambled to
    # individual letters (list-wrap scalar values)
    for key, value in meta.items():
        if value and not isinstance(value, list):
            metadata[str(key)] = [value]
        else:
            metadata[str(key)] = value

    return (common.Header('', dataset.id, dataset.metadata_created,
                          set_spec, False),
            common.Metadata('', metadata), None)
def _record_for_dataset_datacite(self, dataset, set_spec):
    '''Show a tuple of a header and metadata for this dataset, in
    DataCite layout.  Extras values are ';'-separated lists.

    :param dataset: CKAN dataset (package) object.
    :param set_spec: OAI set spec(s) for the record header.
    '''
    package = get_action('package_show')({}, {'id': dataset.id})

    # Flatten extras into key -> list-of-values (';'-separated).
    # The old inner .iteritems() loop reassigned the same pair once per
    # row key (redundant) and was Python 2 only.
    extras = {}
    for item in package['extras']:
        extras[item['key']] = item['value'].split(";")

    temporal_begin = extras.get('TemporalCoverage:BeginDate')
    temporal_end = extras.get('TemporalCoverage:EndDate')
    dates = []
    if temporal_begin or temporal_end:
        begin = temporal_begin[0] if temporal_begin else ''
        end = temporal_end[0] if temporal_end else ''
        dates.append("%s/%s" % (begin, end))

    # identifiers = self._set_id(package, extras)
    subj = [tag.get('display_name') for tag in package['tags']
            ] if package.get('tags', None) else None
    if 'Discipline' in extras:
        # BUG FIX: Discipline extras used to be silently dropped when
        # the dataset had no tags (subj was None); start from an empty
        # list instead, consistent with the other DataCite mapper
        subj = subj or []
        subj.extend(extras['Discipline'])

    author = package.get('author')
    if author:
        authors = author.split(";")
    else:
        authors = None

    # extras.get(k) is equivalent to the old
    # ``extras[k] if k in extras else None`` form
    meta = {
        'DOI': extras.get('DOI'),
        'PID': extras.get('PID'),
        'version': extras.get('Version'),
        'source': package.get('url', None),
        'relatedIdentifier': extras.get('RelatedIdentifier'),
        'creator': authors if authors else None,
        'publisher': extras.get('Publisher'),
        'publicationYear': extras.get('PublicationYear'),
        'publicationTimestamp': extras.get('PublicationTimestamp'),
        'resourceType': extras.get('ResourceType'),
        'language': extras.get('Language'),
        'titles': package.get('title', None) or package.get('name'),
        'contributor': extras.get('Contributor'),
        'descriptions': self._get_json_content(package.get('notes'))
        if package.get('notes', None) else None,
        'subjects': subj,
        'rights': extras.get('Rights'),
        'openAccess': extras.get('OpenAccess'),
        'size': extras.get('Size'),
        'format': extras.get('Format'),
        'fundingReference': extras.get('FundingReference'),
        'dates': dates if dates else None,
        'geoLocation': extras.get('SpatialCoverage'),
    }

    metadata = {}
    # Fixes the bug on having a large dataset being scrambled to
    # individual letters (list-wrap scalar values)
    for key, value in meta.items():
        if value and not isinstance(value, list):
            metadata[str(key)] = [value]
        else:
            metadata[str(key)] = value

    return (common.Header('', dataset.id, dataset.metadata_created,
                          set_spec, False),
            common.Metadata('', metadata), None)
def __call__(self, metadata_element, nsprefix="nlmaa:"):
    """Parse an NLM journal-article element into a flat metadata map
    and wrap it as oaipmh common.Metadata.

    Each section (journal meta, article meta, back matter) is extracted
    inside its own try/except so one malformed section does not abort
    the rest; failures are logged and that section is skipped.

    :param metadata_element: lxml element holding the article, or a
        wrapper whose child is the article.
    :param nsprefix: namespace prefix applied to every tag lookup.
    """
    map = {}
    #logging.debug("Parsing " + etree.tostring(metadata_element))
    article = self._find_element(metadata_element,
                                 "{0}article".format(nsprefix))
    #In the case of the bulk importer, the root element is Article
    if article is None:
        article = metadata_element
    # front
    front = self._find_element(article, "{0}front".format(nsprefix))
    # back
    back = self._find_element(article, "{0}back".format(nsprefix))
    # journal meta
    journal_meta = self._find_element(front,
                                      "{0}journal-meta".format(nsprefix))
    # article metadata
    article_meta = self._find_element(front,
                                      "{0}article-meta".format(nsprefix))
    if journal_meta is not None:
        try:
            map["journal"] = {}
            # try the grouped journal title first, fall back to the
            # bare <journal-title> form
            (self._set_map_with_element_text(
                map["journal"], "name", journal_meta,
                "{0}journal-title-group/{0}journal-title".format(nsprefix))
             or self._set_map_with_element_text(
                 map["journal"], "name", journal_meta,
                 "{0}journal-title".format(nsprefix)))
            issns = journal_meta.findall("{0}issn".format(nsprefix),
                                         self._namespaces)
            if issns:
                map["journal"]["identifier"] = []
                for issn in issns:
                    map["journal"]["identifier"].append({
                        "type": issn.get('pub-type'),
                        "id": issn.text,
                        "canonical": issn.get('pub-type') + ':' + issn.text
                    })
            self._set_map_with_element_text(
                map["journal"], "publisher", journal_meta,
                "{0}publisher/{0}publisher-name".format(nsprefix))
        except:
            logging.error("Could not extract journal metadata")
    else:
        logging.info("No journal metadata found for ")
    if article_meta is not None:
        try:
            #identifiers
            article_ids = article_meta.findall(
                "{0}article-id".format(nsprefix), self._namespaces)
            if article_ids:
                map["identifier"] = []
                for article_id in article_ids:
                    map["identifier"].append({
                        "type": article_id.get('pub-id-type'),
                        "id": article_id.text,
                        "canonical": article_id.get('pub-id-type') + ':' +
                        article_id.text
                    })
                    # NOTE(review): leftover one-off debug trap for a
                    # specific PMID; consider removing
                    if article_id.get(
                            'pub-id-type'
                    ) == 'pmid' and article_id.text == '17242517':
                        print "FOUND THE record with missing citations"
                        logging.critical(
                            "FOUND THE record with missing citations")
                        logging.critical(etree.tostring(metadata_element))
        except:
            logging.error(
                "Could not extract identifiers from article metadata")
        try:
            #title
            self._set_map_with_element_text(
                map, "title", article_meta,
                "{0}title-group/{0}article-title".format(nsprefix))
        except:
            logging.error("Could not extract title from article metadata")
        try:
            #pagination
            self._set_map_with_element_text(map, "volume", article_meta,
                                            "{0}volume".format(nsprefix))
            self._set_map_with_element_text(map, "issue", article_meta,
                                            "{0}issue".format(nsprefix))
            self._set_map_with_element_text(map, "firstpage", article_meta,
                                            "{0}fpage".format(nsprefix))
            self._set_map_with_element_text(map, "lastpage", article_meta,
                                            "{0}lpage".format(nsprefix))
            # collapse first/last page into a "pages" range; a single
            # page when they match
            if "firstpage" in map:
                if "lastpage" in map and (map["firstpage"] !=
                                          map["lastpage"]):
                    map["pages"] = map["firstpage"] + "-" + map["lastpage"]
                else:
                    map["pages"] = map["firstpage"]
        except:
            logging.error(
                "Could not extract pagination from article metadata")
        try:
            #publication date
            # why only use the pmc-release date? need to check with Mark
            pub_date = article_meta.find(
                "{0}pub-date[@pub-type='pmc-release']".format(nsprefix),
                self._namespaces)
            if pub_date is not None:
                self._set_map_with_element_text(map, "year", pub_date,
                                                "{0}year".format(nsprefix))
                self._set_map_with_element_text(
                    map, "month", pub_date, "{0}month".format(nsprefix))
                self._set_map_with_element_text(map, "day", pub_date,
                                                "{0}day".format(nsprefix))
            else:
                logging.info("No publication data for ")
        except:
            logging.error(
                "Could not extract publication date from article metadata")
        try:
            #copyright
            self._set_map_with_element_text(
                map, "copyright", article_meta,
                "{0}permissions/{0}copyright-statement".format(nsprefix))
        except:
            logging.error(
                "Could not extract copyright info from article metadata")
        try:
            #abstract
            self._set_map_with_element_xml(map, "abstract", article_meta,
                                           "{0}abstract".format(nsprefix))
        except:
            logging.error(
                "Could not extract abstract from article metadata")
        try:
            #keywords
            keywords = article_meta.findall(
                "{0}kwd_group/{0}kwd".format(nsprefix), self._namespaces)
            if keywords:
                map["keyword"] = []
                for keyword in keywords:
                    map["keyword"].append(keyword.text)
            else:
                logging.info("No keywords for ")
        except:
            logging.error(
                "Could not extract keywords from article metadata")
        try:
            #contributors
            contribs = article_meta.findall(
                "{0}contrib-group/{0}contrib".format(nsprefix),
                self._namespaces)
            if contribs:
                map["author"] = []
                map["editor"] = []
                for contrib in contribs:
                    entity = {}
                    if contrib.get('corresp') == 'yes':
                        entity["corresponding"] = 'yes'
                    self._set_map_with_element_text(
                        entity, "lastname", contrib,
                        "{0}name/{0}surname".format(nsprefix))
                    self._set_map_with_element_text(
                        entity, "forenames", contrib,
                        "{0}name/{0}given-names".format(nsprefix)
                    )  #MW: Changed firstname to forenames. Discuss with Mark.
                    if "lastname" in entity and entity[
                            "lastname"] is not None and "forenames" in entity and entity[
                                "forenames"] is not None:
                        entity["name"] = entity[
                            "lastname"] + ", " + entity["forenames"]
                    # email may live under <address> or directly on the
                    # contributor
                    email = contrib.find(
                        "{0}address/{0}email".format(nsprefix),
                        self._namespaces)
                    if email is None:
                        email = contrib.find("{0}email".format(nsprefix),
                                             self._namespaces)
                    if email is not None:
                        entity["identifier"] = {
                            "type": "email",
                            "id": email.text
                        }
                    # resolve affiliation cross-references against the
                    # article-level <aff> elements
                    xrefs = contrib.findall("{0}xref".format(nsprefix),
                                            self._namespaces)
                    affs = article_meta.findall(
                        "{0}aff".format(nsprefix), self._namespaces
                    )  #NOT ContribGroup - check with Mark
                    for xref in xrefs:
                        if xref.get('ref-type') == "aff":
                            rid = xref.get("rid")
                            for aff in affs:
                                if aff.get("id") == rid:
                                    if "affiliation" not in entity:
                                        entity["affiliation"] = []
                                    for text in aff.itertext():
                                        entity["affiliation"].append(text)
                    if contrib.get("contrib-type") == "author":
                        map["author"].append(entity)
                    if contrib.get("contrib-type") == "editor":
                        map["editor"].append(entity)
            else:
                logging.info("No contributors found for ")
        except:
            logging.error(
                "Could not extract contributors from article metadata")
    else:
        logging.info("No article metadata found for ")
    if back is not None:
        acknowledgements = back.findall(
            "{0}ack/{0}sec/{0}p".format(nsprefix), self._namespaces)
        if acknowledgements:
            map["acknowledgement"] = []
            for acknowledgement in acknowledgements:
                map["acknowledgement"].append(acknowledgement.text)
        else:
            logging.info("No acknowledgements found for ")
        conflicts = back.findall("{0}fn-group/{0}fn/{0}p".format(nsprefix),
                                 self._namespaces)
        if conflicts:
            map["conflict"] = []
            for conflict in conflicts:
                map["conflict"].append(conflict.text)
        else:
            logging.info("No conflicts found for ")
        refs = back.findall("{0}ref-list/{0}ref".format(nsprefix),
                            self._namespaces)
        if refs:
            map["citation"] = []
            for ref in refs:
                entity = {}
                self._set_map_with_element_text(
                    entity, "label", ref, "{0}label".format(nsprefix))
                #Three different ways to cite articles. Check with Mark.
                citation = ref.find("{0}mixed-citation".format(nsprefix),
                                    self._namespaces)
                if citation is None:
                    citation = ref.find(
                        "{0}element-citation".format(nsprefix),
                        self._namespaces)
                if citation is None:
                    citation = ref.find("{0}citation".format(nsprefix),
                                        self._namespaces)
                if citation is not None:
                    self._set_map_with_element_text(
                        entity, "title", citation,
                        "{0}article-title".format(nsprefix))
                    pub_ids = citation.findall(
                        "{0}pub-id".format(nsprefix), self._namespaces)
                    if pub_ids:
                        entity["identifier"] = []
                        for pub_id in pub_ids:
                            entity["identifier"].append({
                                "type": pub_id.get('pub-id-type'),
                                "id": pub_id.text,
                                'canonical': pub_id.get('pub-id-type') +
                                ':' + pub_id.text
                            })
                # TODO: should this append happen even if the entity is empty? or bring into the above IF
                map["citation"].append(entity)
                # add code here to create a record for this citation if it does not already exist
        else:
            logging.info("No refs found for ")
    else:
        logging.info("No back metadata for ")
    #logging.debug("MAP: ")
    #logging.debug(map)
    return common.Metadata(map)
def __call__(self, metadata_element):
    """Convert an arXiv OAI metadata element into an oaipmh Metadata
    map with identifier, journal and author entries."""
    result = {'identifier': [], 'journal': {}, 'author': []}
    arxiv_elem = self._find_element(metadata_element, "arXiv:arXiv")

    # identifiers: the native arXiv id first, then (optionally) a DOI
    arxiv_id = self._find_element_text(arxiv_elem, "arXiv:id")
    doi = self._find_element_text(arxiv_elem, "arXiv:doi")
    for id_type, id_value in (('arXiv', arxiv_id), ('doi', doi)):
        if id_value is not None:
            result["identifier"].append({
                'type': id_type,
                'id': id_value,
                'canonical': id_type + ':' + id_value
            })

    #title
    self._set_map_with_element_text(result, "title", arxiv_elem,
                                    "arXiv:title")
    #copyright ?
    #license
    self._set_map_with_element_text(result, "license", arxiv_elem,
                                    "arXiv:license")
    #abstract
    self._set_map_with_element_text(result, "abstract", arxiv_elem,
                                    "arXiv:abstract")

    #journal
    journal = result['journal']
    self._set_map_with_element_text(journal, "reference", arxiv_elem,
                                    "arXiv:journal-ref")
    self._set_map_with_element_text(journal, "comments", arxiv_elem,
                                    "arXiv:comments")
    self._set_map_with_element_text(journal, "categories", arxiv_elem,
                                    "arXiv:categories")

    #authors
    for author in self._find_elements(arxiv_elem,
                                      "arXiv:authors/arXiv:author"):
        person = {}
        self._set_map_with_element_text(person, "lastname", author,
                                        "arXiv:keyname")
        # "forenames" (not "firstname") mirrors the arXiv schema field
        self._set_map_with_element_text(person, "forenames", author,
                                        "arXiv:forenames")
        self._set_map_with_element_text(person, "suffix", author,
                                        "arXiv:suffix")
        if person.get("lastname") is not None and \
                person.get("forenames") is not None:
            person["name"] = person["lastname"] + ", " + person["forenames"]
        affiliations = self._find_elements(author, "arXiv:affiliation")
        if affiliations:
            person["affiliation"] = [aff.text for aff in affiliations]
        result["author"].append(person)
    return common.Metadata(result)
def build_record(self, metadata):
    """Construct a OAI-PMH payload for a record.

    The XML element slot is left as None because the map is supplied
    directly rather than parsed from XML.
    """
    payload = common.Metadata(None, metadata)
    return payload
def read(self):
    """
    Parse metadata and return metadata (oaipmh.common.Metadata)
    with unified dictionary stored under the 'unified' key.
    """
    # call _read() first to preserve the original evaluation order
    unified = self._read()
    metadata_map = xml_reader(self.xml).getMap()
    metadata_map['unified'] = unified
    return oc.Metadata(self.xml, metadata_map)