def map_creator(self):
    """<mods:name><mods:namePart> when <mods:role><mods:roleTerm> equals
    Creator

    Walks every entry found at ``self.root_key + "name"``. For entries
    whose role terms include "creator" (compared case-insensitively) the
    text of each namePart is collected. The collected dict is run through
    ``self.clean_dict`` and handed to ``self.update_source_resource``.
    """
    prop = self.root_key + "name"
    _dict = {"creator": []}

    for s in iterify(getprop(self.provider_data, prop, True)):
        name = s.get("namePart")
        if not name:
            continue

        try:
            # Get all the roleTerm values for a given mods:name entity
            roleTypes = [textnode(r.get("roleTerm"))
                         for r in iterify(s.get("role"))]
        except Exception:
            # Best effort: a name whose roles cannot be read is skipped.
            continue

        # Lower-case through the instance method so byte strings and
        # unicode strings are both accepted (unicode.lower() rejects
        # plain str); role terms that are not strings are ignored.
        roles = [r.lower() for r in roleTypes if isinstance(r, basestring)]

        # Only names that carry a 'Creator' roleTerm are mapped here.
        if "creator" not in roles:
            continue

        if isinstance(name, list):
            for n in name:
                clean_name = textnode(n)
                if isinstance(clean_name, basestring):
                    _dict["creator"].append(clean_name)
        else:
            _dict["creator"].append(textnode(name))

    self.update_source_resource(self.clean_dict(_dict))
def first_date(els):
    """Return first date string from originInfo elements

    Each element is searched in this order of preference: a dateCreated
    dict with keyDate == 'yes', a dateIssued dict with point == 'start',
    the first dateOther, the first dateIssued, then the first sortDate.
    If textnode() raises NoTextNodeError for an element, that element is
    abandoned and the next one is tried. Returns None when no element
    yields a date.
    """
    for el in els:
        # Allow for each of the following elements to be an array of
        # dicts or strings, or one on its own.
        date_created = iterify(el.get('dateCreated', []))
        date_issued = iterify(el.get('dateIssued', []))
        date_other = iterify(el.get('dateOther', []))
        sort_date = iterify(el.get('sortDate', []))
        try:
            # isinstance() rather than type() == dict so that dict
            # subclasses are recognised as attribute-bearing elements.
            for d in date_created:
                if isinstance(d, dict) and d.get('keyDate') == 'yes':
                    return textnode(d)
            for d in date_issued:
                if isinstance(d, dict) and d.get('point') == 'start':
                    return textnode(d)
            # Nothing yet?  Take first dateOther, ignoring attributes:
            if date_other:
                return textnode(date_other[0])
            # OK, then take the first dateIssued we can get, ignoring
            # attribute ...
            if date_issued:
                return textnode(date_issued[0])
            # Still nothing?  Try sortDate:
            if sort_date:
                return textnode(sort_date[0])
        except NoTextNodeError:
            # Weird, but date is not required.
            pass
    return None
def map_date(self):
    """Map originInfo dateCreated / dateIssued values to "date".

    dateCreated values take precedence. dateIssued values without a
    "point" are collected directly; "start" and "end" points are only
    combined into a "begin-end" range when no other date was found.
    The chosen list is passed to ``self.update_source_resource``.
    """
    originInfoPath = self.root_key + "originInfo"
    dateCreated = []
    dateIssued = []
    date_begin, date_end = None, None

    if exists(self.provider_data, originInfoPath):
        for date in iterify(getprop(self.provider_data, originInfoPath)):
            if "dateCreated" in date:
                dateCreated.append(textnode(date["dateCreated"]))

            if "dateIssued" in date:
                t = date["dateIssued"]
                try:
                    if "point" not in t:
                        dateIssued.append(textnode(t))
                    elif t["point"] == "start":
                        date_begin = textnode(t)
                    elif t["point"] == "end":
                        date_end = textnode(t)
                except Exception as e:
                    # Pass the values as logging arguments. The previous
                    # "(provider_data % e.message)" applied % to the
                    # record itself and raised inside this handler.
                    logger.error("Exception when trying to map date "
                                 "values. for record %s \n\n%s",
                                 self.provider_data, e)

    # If there are no dateIssued or dateCreated properties then construct
    # a date range from begin and end points (if they exist).
    if date_begin and date_end and not dateCreated and not dateIssued:
        dateIssued.append(date_begin + "-" + date_end)

    if dateCreated:
        self.update_source_resource({"date": dateCreated})
    elif dateIssued:
        self.update_source_resource({"date": dateIssued})
def map_creator(self):
    """<mods:name><mods:namePart> when <mods:role><mods:roleTerm> equals
    Creator

    Every entry at ``self.root_key + "name"`` is inspected; the namePart
    text of entries whose role terms include "creator" (case-insensitive)
    is collected. The result goes through ``self.clean_dict`` and is then
    passed to ``self.update_source_resource``.
    """
    prop = self.root_key + "name"
    _dict = {"creator": []}

    for s in iterify(getprop(self.provider_data, prop, True)):
        name = s.get("namePart")
        if not name:
            continue

        try:
            # Get all the roleTerm values for a given mods:name entity
            roleTypes = [
                textnode(r.get("roleTerm")) for r in iterify(s.get("role"))
            ]
        except Exception:
            # Best effort: skip a name whose roles cannot be read.
            continue

        # str.lower()/unicode.lower() via the instance works for both
        # string types, whereas unicode.lower(x) raises TypeError for a
        # plain str. Non-string role terms are ignored.
        roles = [r.lower() for r in roleTypes if isinstance(r, basestring)]

        # Only a 'Creator' roleTerm causes the name to be mapped.
        if "creator" not in roles:
            continue

        if isinstance(name, list):
            for n in name:
                clean_name = textnode(n)
                if isinstance(clean_name, basestring):
                    _dict["creator"].append(clean_name)
        else:
            _dict["creator"].append(textnode(name))

    self.update_source_resource(self.clean_dict(_dict))
def map_spatial_and_subject_and_temporal(self):
    """Collect spatial, temporal and subject values from MODS subjects.

    For every entry under "/metadata/mods/subject":
      * cartographics/coordinates and each geographic value are gathered
        as {"name": ...} dicts (geographic values go through textnode());
      * each temporal value is gathered through textnode();
      * topic, genre, occupation and /titleInfo/title values are gathered
        exactly as getprop() returns them (textnode() is not applied).
    Each non-empty collection is passed to self.update_source_resource
    under "spatial", "temporal" and "subject", in that order.
    """
    path = "/metadata/mods/subject"
    subject_props = ['topic', 'genre', 'occupation', "/titleInfo/title"]
    collected = {"spatial": [], "temporal": [], "subject": []}

    if exists(self.provider_data, path):
        for entry in iterify(getprop(self.provider_data, path)):
            has_coordinates = ("cartographics" in entry and
                               "coordinates" in entry["cartographics"])
            if has_coordinates:
                collected["spatial"].append(
                    {"name": entry["cartographics"]["coordinates"]})

            if "geographic" in entry:
                collected["spatial"].extend(
                    [{"name": textnode(place)}
                     for place in iterify(getprop(entry, "geographic"))])

            if "temporal" in entry:
                collected["temporal"].extend(
                    [textnode(period)
                     for period in iterify(getprop(entry, "temporal"))])

            for s_path in subject_props:
                collected["subject"].extend(
                    iterify(getprop(entry, s_path, True)))

    for field in ("spatial", "temporal", "subject"):
        if collected[field]:
            self.update_source_resource({field: collected[field]})
def map_spatial_and_subject_and_temporal(self):
    """Gather spatial, temporal and subject data from MODS subject entries.

    Reads "/metadata/mods/subject". Coordinates found under cartographics
    and the text of each geographic value become {"name": ...} spatial
    entries; the text of each temporal value becomes a temporal entry;
    topic, genre, occupation and /titleInfo/title values are appended to
    the subjects unchanged (no textnode() call). Non-empty results are
    handed to self.update_source_resource as "spatial", "temporal" and
    "subject", in that order.
    """
    path = "/metadata/mods/subject"
    subject_props = ['topic', 'genre', 'occupation', "/titleInfo/title"]
    places, periods, topics = [], [], []

    if exists(self.provider_data, path):
        for entry in iterify(getprop(self.provider_data, path)):
            if ("cartographics" in entry and
                    "coordinates" in entry["cartographics"]):
                places.append(
                    {"name": entry["cartographics"]["coordinates"]})

            if "geographic" in entry:
                places += [{"name": textnode(geo)}
                           for geo in iterify(getprop(entry, "geographic"))]

            if "temporal" in entry:
                periods += [textnode(when)
                            for when in iterify(getprop(entry, "temporal"))]

            for s_path in subject_props:
                for value in iterify(getprop(entry, s_path, True)):
                    topics.append(value)

    for field, values in (("spatial", places),
                          ("temporal", periods),
                          ("subject", topics)):
        if values:
            self.update_source_resource({field: values})
def idstrings(els):
    """Yield the stripped text of each identifier element.

    Dict elements whose 'type' is 'oclc' are yielded with an 'OCLC:'
    prefix. Elements for which textnode() raises NoTextNodeError are
    skipped.
    """
    for el in els:
        try:
            value = textnode(el).strip()
        except NoTextNodeError:
            continue
        # isinstance() so that dict subclasses are also recognised.
        if isinstance(el, dict) and el.get('type') == 'oclc':
            value = 'OCLC:' + value
        yield value
def subject_strings(els):
    """Yield all subject strings from given list of elements

    The text of an element's 'topic' is used when that key is present,
    otherwise the text of its 'theme'. The text is split on ';' and each
    piece is yielded stripped. Elements for which textnode() raises
    NoTextNodeError are skipped.
    """
    for element in els:
        try:
            if 'topic' in element:
                text = textnode(element['topic'])
            elif 'theme' in element:
                text = textnode(element['theme'])
            else:
                continue
            if not text:
                continue
            for piece in text.split(';'):
                yield piece.strip()
        except NoTextNodeError:
            continue
def _genre_strings(els):
    """Yield genre strings from list of elements

    Each string is the stripped result of textnode(); elements for which
    textnode() raises NoTextNodeError are skipped.
    """
    for element in els:
        try:
            genre = textnode(element).strip()
            yield genre
        except NoTextNodeError:
            continue
def map_data_provider(self, prop="source"):
    """Map recordInfo/recordContentSource values to "dataProvider".

    The text of every value at
    "/metadata/mods/recordInfo/recordContentSource" is collected and, when
    at least one was found, stored in ``self.mapped_data`` under
    "dataProvider".

    ``prop`` is not used by this implementation; it is kept so existing
    callers that pass it keep working.
    """
    path = "/metadata/mods/recordInfo/recordContentSource"
    data_provider = []

    if exists(self.provider_data, path):
        data_provider = [textnode(dp)
                         for dp in iterify(getprop(self.provider_data, path))]

    # Only write the field when something was found, matching the other
    # mappers, so an empty list is never stored.
    if data_provider:
        self.mapped_data.update({"dataProvider": data_provider})
def map_data_provider(self):
    """Map dataProvider

    In feed XML:  //record/metadata/mods/note[@type='ownership']
    In body of mapper request JSON:  .note

    The text of the first dict note whose 'type' is 'ownership' is stored
    in ``self.mapped_data`` under 'dataProvider'. Nothing is stored when
    there is no such note or when textnode() raises NoTextNodeError.
    """
    prop = self.root_key + 'note'
    note = None
    if exists(self.provider_data, prop):
        note = iterify(getprop(self.provider_data, prop))
    if not note:
        return

    # .get() so that a note without a 'type' attribute is simply skipped
    # instead of aborting the whole search with a KeyError.
    ownership = [
        n for n in note
        if isinstance(n, dict) and n.get('type') == 'ownership'
    ]
    if not ownership:
        return

    provider = ownership[0]
    try:
        # Use the selected ownership note. The old code passed the
        # comprehension's loop variable, which (in Python 2) is the
        # *last* note examined, not necessarily the ownership one.
        self.mapped_data.update({'dataProvider': textnode(provider)})
    except (KeyError, NoTextNodeError):
        # The ownership note was an XML element lacking a text node.
        pass
def map_identifier(self):
    """Map "/metadata/mods/identifier" values to "identifier".

    The text of every value is collected and, if any were found, passed to
    self.update_source_resource.
    """
    path = "/metadata/mods/identifier"
    if not exists(self.provider_data, path):
        return

    identifiers = [textnode(node)
                   for node in iterify(getprop(self.provider_data, path))]
    if identifiers:
        self.update_source_resource({"identifier": identifiers})
def fmt_from_physdesc(phys_descs):
    """Yield format strings from a list of physicalDescriptions

    Only descriptions containing a 'note' contribute; a note for which
    textnode() raises NoTextNodeError is skipped.
    """
    for description in phys_descs:
        if 'note' not in description:
            continue
        try:
            yield textnode(description['note'])
        except NoTextNodeError:
            continue
def pub_strings(els):
    """Yield the non-empty publisher string of each element.

    The text of el['publisher'] is stripped of spaces, commas and
    newlines at both ends. Elements without a 'publisher' key, or for
    which textnode() raises NoTextNodeError, are skipped.
    """
    for element in els:
        try:
            publisher = textnode(element['publisher']).strip(' ,\n')
            if not publisher:
                continue
            yield publisher
        except (KeyError, NoTextNodeError):
            continue
def map_object(self):
    """Map the "preview" location URL to "object".

    Every url under "/metadata/mods/location" whose "access" attribute
    equals "preview" (case-insensitive) is written to self.mapped_data
    under "object"; when several match, the last one written remains.
    """
    path = "/metadata/mods/location"
    if not exists(self.provider_data, path):
        return

    for location in iterify(getprop(self.provider_data, path)):
        if not exists(location, "url"):
            continue
        for link in iterify(getprop(location, "url")):
            if not exists(link, "access"):
                continue
            if link["access"].lower() != "preview":
                continue
            self.mapped_data.update({"object": textnode(link)})
def map_identifier(self):
    """//dc:identifier -> .sourceResource.identifier

    The text of every "dc:identifier" value is collected, empty results
    are dropped, and any remaining values are passed to
    self.update_source_resource under "identifier".
    """
    key = "dc:identifier"
    if not exists(self.provider_data, key):
        return

    # Read the same key whose existence was just checked. Previously the
    # check used "dc:identifier" while the read used "identifier".
    values = [textnode(identifier)
              for identifier in iterify(self.provider_data.get(key))]
    identifiers = [value for value in values if value]
    if identifiers:
        self.update_source_resource({"identifier": identifiers})
def map_language(self):
    """Map "language" values to "language".

    The text of each value is collected, falsy results are filtered out,
    and any remaining values are passed to self.update_source_resource.
    """
    if not exists(self.provider_data, "language"):
        return

    lang = filter(None, [
        textnode(entry)
        for entry in iterify(getprop(self.provider_data, "language"))
    ])
    if lang:
        self.update_source_resource({"language": lang})
def map_publisher(self):
    """Map "/metadata/mods/originInfo/publisher" values to "publisher".

    The text of every value is collected and, if any were found, passed to
    self.update_source_resource.
    """
    path = "/metadata/mods/originInfo/publisher"
    if not exists(self.provider_data, path):
        return

    publishers = [textnode(node)
                  for node in iterify(getprop(self.provider_data, path))]
    if publishers:
        self.update_source_resource({"publisher": publishers})
def map_format(self):
    """Map "/metadata/mods/physicalDescription/form" values to "format".

    The text of every value is collected and, if any were found, passed to
    self.update_source_resource.
    """
    path = "/metadata/mods/physicalDescription/form"
    if not exists(self.provider_data, path):
        return

    forms = [textnode(node)
             for node in iterify(getprop(self.provider_data, path))]
    if forms:
        self.update_source_resource({"format": forms})
def map_format(self):
    """<mods:physicalDescription><form>

    Collects the text of every value at
    ``self.root_key + "physicalDescription/form"`` and, if any were
    collected, passes them to self.update_source_resource as "format".
    """
    form_path = self.root_key + "physicalDescription/form"
    forms = [
        textnode(form)
        for form in iterify(getprop(self.provider_data, form_path, True))
    ]
    if forms:
        self.update_source_resource({"format": forms})
def map_extent(self):
    """Map "/metadata/mods/physicalDescription/extent" values to "extent".

    The text of every value is collected and, if any were found, passed to
    self.update_source_resource.
    """
    path = "/metadata/mods/physicalDescription/extent"
    if not exists(self.provider_data, path):
        return

    sizes = [textnode(node)
             for node in iterify(getprop(self.provider_data, path))]
    if sizes:
        self.update_source_resource({"extent": sizes})
def map_description(self):
    """Map "/metadata/mods/abstract" values to "description".

    The text of every value is collected and, if any were collected,
    passed to self.update_source_resource.
    """
    path = "/metadata/mods/abstract"
    abstracts = [
        textnode(abstract)
        for abstract in iterify(getprop(self.provider_data, path, True))
    ]
    if abstracts:
        self.update_source_resource({"description": abstracts})
def map_rights(self):
    """Map accessCondition values to rights.

    Only dict values at ``self.root_key + "accessCondition"`` are
    considered. The text of those typed "local rights statements" is
    collected and passed to self.update_source_resource as "rights"; the
    text of the last one typed "use and reproduction", when non-empty, is
    written to self.mapped_data under "rights".
    """
    prop = self.root_key + "accessCondition"
    if not exists(self.provider_data, prop):
        return

    statements = []
    uri = ""
    for condition in iterify(getprop(self.provider_data, prop)):
        if not isinstance(condition, dict):
            continue
        kind = condition.get("type")
        if kind == "local rights statements":
            statements.append(textnode(condition))
        elif kind == "use and reproduction":
            uri = textnode(condition)

    if statements:
        self.update_source_resource({"rights": statements})
    if uri:
        self.mapped_data.update({"rights": uri})
def map_object(self):
    """Store the text of the "preview" URL as "object".

    Looks at each url of each "/metadata/mods/location" entry; a url
    whose "access" value lower-cases to "preview" has its text written to
    self.mapped_data under "object". A later match overwrites an earlier
    one.
    """
    path = "/metadata/mods/location"
    if not exists(self.provider_data, path):
        return

    for place in iterify(getprop(self.provider_data, path)):
        if not exists(place, "url"):
            continue
        for candidate in iterify(getprop(place, "url")):
            wanted = (exists(candidate, "access") and
                      candidate["access"].lower() == "preview")
            if wanted:
                self.mapped_data.update({"object": textnode(candidate)})
def map_type(self):
    """<mods:typeofresource>

    Collects the text of every value at
    ``self.root_key + "typeOfResource"`` and, if any were collected,
    passes them to self.update_source_resource as "type".
    """
    type_path = self.root_key + "typeOfResource"
    kinds = [
        textnode(kind)
        for kind in iterify(getprop(self.provider_data, type_path, True))
    ]
    if kinds:
        self.update_source_resource({"type": kinds})
def map_is_shown_at(self):
    """Store the primary "object in context" URL as "isShownAt".

    Looks at each url of each "/metadata/mods/location" entry. A url that
    has both "usage" and "access", whose usage lower-cases to something
    starting with "primary" and whose access lower-cases to
    "object in context", has its text written to self.mapped_data under
    "isShownAt". A later match overwrites an earlier one.
    """
    path = "/metadata/mods/location"
    if not exists(self.provider_data, path):
        return

    for location in iterify(getprop(self.provider_data, path)):
        if not exists(location, "url"):
            continue
        for link in iterify(getprop(location, "url")):
            # Checks are kept in the original order so that later ones
            # are only evaluated when the earlier ones pass.
            if not exists(link, "usage"):
                continue
            if not exists(link, "access"):
                continue
            if not link["usage"].lower().startswith("primary"):
                continue
            if link["access"].lower() != "object in context":
                continue
            self.mapped_data.update({"isShownAt": textnode(link)})
def _date_values(element):
    """Pick out the child element of the given XML element that is our
    date field, and return its date values as a sorted list.

    The date field is the first key of ``element`` (in iteration order)
    that is 'dateOther' or 'dateIssued'; None is returned when neither is
    present. Each entry produced by _date_elements() is converted with
    DateString(textnode(...)).
    """
    date_fields = ['dateOther', 'dateIssued']
    field = next((key for key in element if key in date_fields), None)
    if field is None:
        return None

    raw_dates = iterify(element.get(field))
    values = [DateString(textnode(entry))
              for entry in _date_elements(raw_dates, field)]
    values.sort()
    return values
def map_title(self):
    """Map titleInfo/title values to "title".

    For each "/metadata/mods/titleInfo" entry that has a "title" but no
    "title/type", the text of the title is collected. If any were found
    they are passed to self.update_source_resource.
    """
    path = "/metadata/mods/titleInfo"
    if not exists(self.provider_data, path):
        return

    titles = [
        textnode(getprop(info, "title"))
        for info in iterify(getprop(self.provider_data, path))
        if exists(info, "title") and not exists(info, "title/type")
    ]
    if titles:
        self.update_source_resource({"title": titles})
def map_extent(self):
    """<mods:physicalDescription><extent>

    Collects the text of every value at
    ``self.root_key + "physicalDescription/extent"`` and, if any were
    collected, passes them to self.update_source_resource as "extent".
    """
    extent_path = self.root_key + "physicalDescription/extent"
    sizes = [
        textnode(size)
        for size in iterify(getprop(self.provider_data, extent_path, True))
    ]
    if sizes:
        self.update_source_resource({"extent": sizes})
def map_date(self):
    """<mods:originInfo><mods:dateCreated>

    Collects the text of every dateCreated value of every entry at
    ``self.root_key + "originInfo"`` and, if any were collected, passes
    them to self.update_source_resource as "date".
    """
    origin_path = self.root_key + "originInfo"
    origins = iterify(getprop(self.provider_data, origin_path, True))
    created = [
        textnode(stamp)
        for origin in origins
        for stamp in iterify(getprop(origin, "dateCreated", True))
    ]
    if created:
        self.update_source_resource({"date": created})
def first_media_type(phys_descs):
    """First internetMediaType string from physicalDescription list

    Descriptions with no (or a falsy) 'internetMediaType', and those for
    which textnode() raises NoTextNodeError, are passed over. Returns
    None when nothing is found.
    """
    for description in phys_descs:
        media_type = description.get('internetMediaType')
        if not media_type:
            continue
        try:
            return textnode(media_type)
        except NoTextNodeError:
            continue
    return None
def map_spatial(self):
    """<mods:subject><mods:geographic>

    Collects the text of every geographic value of every entry at
    ``self.root_key + "subject"`` and, if any were collected, passes them
    to self.update_source_resource as "spatial".
    """
    subject_path = self.root_key + "subject"
    subjects = iterify(getprop(self.provider_data, subject_path, True))
    places = [
        textnode(place)
        for subject in subjects
        for place in iterify(getprop(subject, "geographic", True))
    ]
    if places:
        self.update_source_resource({"spatial": places})
def _date_values(element):
    """Pick out the child element of the given XML element that is our
    date field, and return its date values as a sorted list.

    The first key of ``element`` (in iteration order) that is 'dateOther'
    or 'dateIssued' names the date field; without one, None is returned.
    Entries come from _date_elements() and are each wrapped as
    DateString(textnode(...)) before sorting.
    """
    chosen = None
    for key in element:
        if key in ['dateOther', 'dateIssued']:
            chosen = key
            break
    if chosen is None:
        return None

    candidates = _date_elements(iterify(element.get(chosen)), chosen)
    dates = [DateString(textnode(candidate)) for candidate in candidates]
    dates.sort()
    return dates
def map_date(self):
    """<mods:originInfo><mods:dateCreated>

    Gathers the text of each dateCreated value found in each entry at
    ``self.root_key + "originInfo"``; a non-empty result is handed to
    self.update_source_resource as "date".
    """
    origin_path = self.root_key + "originInfo"
    created = []
    for origin in iterify(getprop(self.provider_data, origin_path, True)):
        created.extend(
            [textnode(stamp)
             for stamp in iterify(getprop(origin, "dateCreated", True))])
    if created:
        self.update_source_resource({"date": created})
def map_creator_and_contributor(self):
    """Map mods:name entries to "creator" and "contributor".

    For every entry at ``self.root_key + "name"`` that has a namePart:
      * no role terms, or a role term equal to "creator"
        (case-insensitive), maps the name to "creator";
      * otherwise a role term equal to "contributor" maps it to
        "contributor";
      * any other entry is ignored.
    An entry whose role terms cannot be read is logged and skipped. The
    result goes through ``self.clean_dict`` and is then passed to
    ``self.update_source_resource``.
    """
    prop = self.root_key + "name"
    _dict = {"creator": [], "contributor": []}

    if exists(self.provider_data, prop):
        for s in iterify(getprop(self.provider_data, prop)):
            name = s.get("namePart")
            if not name:
                continue

            try:
                # Get all the roleTerm values for a given mods:name
                # entity
                rt = [textnode(r.get("roleTerm"))
                      for r in iterify(s.get("role"))]
            except Exception:
                logger.error("Error getting name/role/roleTerm for "
                             "record %s", self.provider_data["_id"])
                continue

            # Lower-case once, through the instance method, so byte and
            # unicode strings both work (unicode.lower() rejects plain
            # str); role terms that are not strings are ignored.
            roles = [r.lower() for r in rt if isinstance(r, basestring)]

            # If mods:roleTerm is empty or if it contains 'Creator'
            # then map the namePart value to creator. If roleTerm
            # contains 'Contributor' map to contributor
            if not rt or "creator" in roles:
                target = _dict["creator"]
            elif "contributor" in roles:
                target = _dict["contributor"]
            else:
                continue

            if isinstance(name, list):
                for n in name:
                    clean_name = textnode(n)
                    if isinstance(clean_name, basestring):
                        target.append(clean_name)
            else:
                target.append(textnode(name))

    self.update_source_resource(self.clean_dict(_dict))
def map_creator_and_contributor(self):
    """Map mods:name entries to "creator" and "contributor".

    Each entry at ``self.root_key + "name"`` with a namePart is assigned
    by its role terms (compared case-insensitively): "creator" when there
    are no role terms or one of them is "creator", else "contributor"
    when one of them is "contributor"; other entries are ignored. Entries
    whose role terms cannot be read are logged and skipped. The result is
    cleaned with ``self.clean_dict`` and passed to
    ``self.update_source_resource``.
    """
    prop = self.root_key + "name"
    _dict = {"creator": [], "contributor": []}

    if exists(self.provider_data, prop):
        for s in iterify(getprop(self.provider_data, prop)):
            name = s.get("namePart")
            if not name:
                continue

            try:
                # Get all the roleTerm values for a given mods:name
                # entity
                rt = [
                    textnode(r.get("roleTerm"))
                    for r in iterify(s.get("role"))
                ]
            except Exception:
                logger.error("Error getting name/role/roleTerm for "
                             "record %s", self.provider_data["_id"])
                continue

            # One lower-casing pass that accepts both str and unicode
            # (unicode.lower(x) raises TypeError for a plain str) and
            # ignores role terms that are not strings.
            roles = [r.lower() for r in rt if isinstance(r, basestring)]

            # If mods:roleTerm is empty or if it contains 'Creator'
            # then map the namePart value to creator. If roleTerm
            # contains 'Contributor' map to contributor
            if not rt or "creator" in roles:
                field = "creator"
            elif "contributor" in roles:
                field = "contributor"
            else:
                continue

            if isinstance(name, list):
                for n in name:
                    clean_name = textnode(n)
                    if isinstance(clean_name, basestring):
                        _dict[field].append(clean_name)
            else:
                _dict[field].append(textnode(name))

    self.update_source_resource(self.clean_dict(_dict))
def map_rights(self):
    """Map "local rights statement" accessCondition values to "rights".

    For each "/metadata/mods/accessCondition" value whose "type" equals
    "local rights statement", the text is collected. If any were found
    they are passed to self.update_source_resource.
    """
    path = "/metadata/mods/accessCondition"
    statements = []

    if exists(self.provider_data, path):
        for condition in iterify(getprop(self.provider_data, path)):
            kind = getprop(condition, "type", True)
            if not kind:
                continue
            if kind == "local rights statement":
                statements.append(textnode(condition))

    if statements:
        self.update_source_resource({"rights": statements})