class MetadataProcessor(object):
    """Sort the metadata of an EPUB 3 OPF into editable and untouched parts.

    After ``extract_recognized_metadata()`` has run:

    * ``self.rec`` holds ``(name, content, attributes)`` tuples for the
      recognized ``dc:*`` elements and recognized primary ``meta`` properties,
      with any ``refines`` metas that target them folded in as attributes.
    * ``self.other`` holds every entry that was not recognized or could not
      be folded; these are only ever serialized back via ``buildxml``.
    * ``self.idlst`` holds the ids reported by the parser minus the ids that
      belonged to recognized entries and folded refines.
    """

    def __init__(self, opfdata):
        """Remember the raw OPF data; nothing is parsed until extraction."""
        self.opfdata = opfdata
        self.rec = []
        self.refines = []
        self.other = []
        self.op = None
        self.md = None
        self.pkg = None
        self.id2rec = {}
        self.idlst = []
        self.metadata_attr = {}

    def _release_id(self, ident):
        """Remove ident from self.idlst, tolerating ids that are not listed."""
        # a duplicated id in the OPF would otherwise make list.remove() raise
        # ValueError on its second occurrence
        if ident in self.idlst:
            self.idlst.remove(ident)

    def _add_recognized(self, mentry):
        """Append mentry to self.rec and index it by its id, if it has one."""
        ident = mentry[2].get("id", None)
        if ident is not None:
            # position is taken from the list itself so the map can never
            # drift away from the real index
            self.id2rec[ident] = len(self.rec)
            self._release_id(ident)
        self.rec.append(mentry)

    def extract_recognized_metadata(self):
        """Parse self.opfdata and fill rec, refines, other, id2rec and idlst.

        Calling this again re-parses from scratch instead of appending to the
        results of the previous call.
        """
        self.op = OPFMetadataParser(self.opfdata)
        self.md = self.op.get_metadata()
        self.idlst = self.op.get_idlst()
        self.metadata_attr = self.op.get_metadata_attr()
        self.pkg = self.op.get_package()

        # start clean so a repeated call cannot duplicate entries
        self.rec = []
        self.refines = []
        self.other = []
        self.id2rec = {}

        # first sort out recognized dc and primary meta from refines and
        # other metadata, building the id2rec map along the way
        (_ver, uid, _attr) = self.pkg
        for mentry in self.md:
            (mname, mcontent, mattr) = mentry
            if mname == "dc:identifier" and mattr.get("id", "") == uid:
                # do not allow the gui to play with the unique-identifier to
                # prevent font obfuscation issues later
                self.other.append(mentry)
            elif mname in _recognized_dc:
                self._add_recognized(mentry)
            elif mname == "meta" and "refines" in mattr:
                self.refines.append(mentry)
            elif mname == "meta" and "property" in mattr and mattr["property"] in _recognized_meta:
                # primary meta tag: the property value becomes the entry name
                prop_name = mattr["property"]
                del mattr["property"]
                self._add_recognized((prop_name, mcontent, mattr))
            else:
                self.other.append(mentry)

        # finally convert refines on recognized metadata into extra attributes
        # on their target; everything else goes to "other" so that it is not
        # touched in any way
        for mentry in self.refines:
            (_rname, rcontent, rattr) = mentry
            target = rattr["refines"]
            prop = rattr.get("property", None)
            if prop is None or not target.startswith("#"):
                # malformed refine, or one that does not point at an id
                self.other.append(mentry)
                continue
            target = target[1:]
            if target not in self.id2rec:
                # refers to something that is not recognized metadata
                self.other.append(mentry)
                continue
            pos = self.id2rec[target]
            (dname, dcontent, dattr) = self.rec[pos]
            dattr[prop] = rcontent
            scheme = rattr.get("scheme", None)
            if scheme is not None:
                dattr["scheme"] = scheme
            if prop == "alternate-script":
                # a missing xml:lang is recorded as an empty language
                dattr["altlang"] = rattr.get("xml:lang", "")
            self.rec[pos] = (dname, dcontent, dattr)
            rid = rattr.get("id", None)
            if rid is not None:
                self._release_id(rid)

        if _DEBUG:
            print("recongized", self.rec)
            print("other", self.other)
            print("idlst", self.idlst)

    def get_recognized_metadata(self):
        """Return the recognized metadata, refines included, as one string.

        Each entry contributes ``name + _US + content + _RS`` followed by one
        ``_IN + key + _US + value + _RS`` record per attribute, attributes in
        sorted key order. Contents and values go through ``xmldecode``.
        """
        data = []
        for (dname, dcontent, dattr) in self.rec:
            data.append(dname + _US + xmldecode(dcontent) + _RS)
            for key in sorted(dattr):
                data.append(_IN + key + _US + xmldecode(dattr[key]) + _RS)
        return "".join(data)

    def get_other_meta_xml(self):
        """Return the untouched entries serialized with ``buildxml``."""
        return "".join(' ' + buildxml(mentry) for mentry in self.other)

    def get_id_list(self):
        """Return the ids left over after extraction."""
        return self.idlst

    def get_metadata_tag(self):
        """Return the opening ``<metadata ...>`` tag followed by a newline.

        Attribute values are written exactly as stored, with no escaping.
        """
        res = ['<metadata']
        if self.metadata_attr is not None:
            for key, val in self.metadata_attr.items():
                res.append(' ' + key + '="' + val + '"')
        res.append('>\n')
        return "".join(res)
class MetadataProcessor(object):
    """Sort the metadata of an OPF 2 package into editable and untouched parts.

    After ``extract_recognized_metadata()`` has run:

    * ``self.rec`` holds ``(name, content, attributes)`` tuples for the
      recognized ``dc:*`` elements and for ``<meta name=... content=...>``
      tags whose name is not in ``_skip_meta``.
    * ``self.other`` holds every other entry; these are only ever serialized
      back via ``buildxml``.
    * ``self.idlst`` holds the ids reported by the parser minus the ids that
      belonged to recognized entries.
    """

    def __init__(self, opfdata):
        """Remember the raw OPF data; nothing is parsed until extraction."""
        self.opfdata = opfdata
        self.rec = []
        self.pkg = None
        self.other = []
        self.op = None
        self.md = None
        self.id2rec = {}
        self.idlst = []
        self.metadata_attr = None

    def _release_id(self, ident):
        """Remove ident from self.idlst, tolerating ids that are not listed."""
        # a duplicated id in the OPF would otherwise make list.remove() raise
        # ValueError on its second occurrence
        if ident in self.idlst:
            self.idlst.remove(ident)

    def _add_recognized(self, mentry):
        """Append mentry to self.rec and index it by its id, if it has one."""
        ident = mentry[2].get("id", None)
        if ident is not None:
            # position is taken from the list itself so the map can never
            # drift away from the real index
            self.id2rec[ident] = len(self.rec)
            self._release_id(ident)
        self.rec.append(mentry)

    def extract_recognized_metadata(self):
        """Parse self.opfdata and fill rec, other, id2rec, idlst, metadata_attr.

        Calling this again re-parses from scratch instead of appending to the
        results of the previous call.
        """
        self.op = OPFMetadataParser(self.opfdata)
        self.md = self.op.get_metadata()
        self.idlst = self.op.get_idlst()
        # work on a copy so the parser's own attribute dict is left alone;
        # the None test has to happen before .copy() is attempted
        parsed_attr = self.op.get_metadata_attr()
        self.metadata_attr = {} if parsed_attr is None else parsed_attr.copy()
        self.pkg = self.op.get_package()

        # add the opf attribute namespace to the metadata tag for OPF 2 and
        # make sure the dc namespace is there as well, keeping any
        # declaration the document already carries
        self.metadata_attr.setdefault("xmlns:opf", "http://www.idpf.org/2007/opf")
        self.metadata_attr.setdefault("xmlns:dc", "http://purl.org/dc/elements/1.1/")

        # start clean so a repeated call cannot duplicate entries
        self.rec = []
        self.other = []
        self.id2rec = {}

        # sort out recognized dc and named meta tags from other metadata,
        # building the id2rec map along the way
        (_ver, uid, _attr) = self.pkg
        for mentry in self.md:
            (mname, mcontent, mattr) = mentry
            if mname == "dc:identifier" and mattr.get("id", "") == uid:
                # the unique identifier is special-cased and never offered
                # for editing
                self.other.append(mentry)
            elif mname in _recognized_dc:
                self._add_recognized(mentry)
            elif mname == "meta" and "name" in mattr and mattr["name"] not in _skip_meta:
                # normal meta tag: its name attribute becomes the entry name
                # and its content attribute the entry content; a meta without
                # content gets an empty string instead of raising KeyError
                meta_name = mattr.pop("name")
                meta_content = mattr.pop("content", "")
                self._add_recognized((meta_name, meta_content, mattr))
            else:
                self.other.append(mentry)

        if _DEBUG:
            print("recongized", self.rec)
            print("other", self.other)
            print("idlst", self.idlst)

    def get_recognized_metadata(self):
        """Return the recognized metadata as one string.

        Each entry contributes ``name + _US + content + _RS`` followed by one
        ``_IN + key + _US + value + _RS`` record per attribute, attributes in
        sorted key order. Contents and values go through ``xmldecode``.
        """
        data = []
        for (dname, dcontent, dattr) in self.rec:
            data.append(dname + _US + xmldecode(dcontent) + _RS)
            for key in sorted(dattr):
                data.append(_IN + key + _US + xmldecode(dattr[key]) + _RS)
        return "".join(data)

    def get_other_meta_xml(self):
        """Return the untouched entries serialized with ``buildxml``."""
        return "".join(' ' + buildxml(mentry) for mentry in self.other)

    def get_id_list(self):
        """Return the ids left over after extraction."""
        return self.idlst

    def get_metadata_tag(self):
        """Return the opening ``<metadata ...>`` tag followed by a newline.

        Attribute values are written exactly as stored, with no escaping.
        """
        res = ['<metadata']
        if self.metadata_attr is not None:
            for key, val in self.metadata_attr.items():
                res.append(' ' + key + '="' + val + '"')
        res.append('>\n')
        return "".join(res)
class MetadataProcessor(object):
    """Sort the metadata of an EPUB 3 OPF into editable and untouched parts.

    After ``extract_recognized_metadata()`` has run:

    * ``self.rec`` holds ``(name, content, attributes)`` tuples for the
      recognized ``dc:*`` elements and recognized primary ``meta`` properties,
      with any ``refines`` metas that target them folded in as attributes.
    * ``self.other`` holds every entry that was not recognized or could not
      be folded; these are only ever serialized back via ``buildxml``.
    * ``self.idlst`` holds the ids reported by the parser minus the ids that
      belonged to recognized entries and folded refines.
    """

    def __init__(self, opfdata):
        """Remember the raw OPF data; nothing is parsed until extraction."""
        self.opfdata = opfdata
        self.rec = []
        self.refines = []
        self.other = []
        self.op = None
        self.md = None
        self.pkg = None
        self.id2rec = {}
        self.idlst = []
        self.metadata_attr = {}

    def _release_id(self, ident):
        """Remove ident from self.idlst, tolerating ids that are not listed."""
        # a duplicated id in the OPF would otherwise make list.remove() raise
        # ValueError on its second occurrence
        if ident in self.idlst:
            self.idlst.remove(ident)

    def _add_recognized(self, mentry):
        """Append mentry to self.rec and index it by its id, if it has one."""
        ident = mentry[2].get("id", None)
        if ident is not None:
            # position is taken from the list itself so the map can never
            # drift away from the real index
            self.id2rec[ident] = len(self.rec)
            self._release_id(ident)
        self.rec.append(mentry)

    def _fold_refine(self, mentry):
        """Merge one refines entry into its target; return False if it cannot be merged."""
        (_rname, rcontent, rattr) = mentry
        prop = rattr.get("property", None)
        target = rattr["refines"]
        if prop is None or not target.startswith("#"):
            # malformed refine, or one that does not point at an id
            return False
        pos = self.id2rec.get(target[1:], None)
        if pos is None:
            # refers to something that is not recognized metadata
            return False
        (dname, dcontent, dattr) = self.rec[pos]
        dattr[prop] = rcontent
        scheme = rattr.get("scheme", None)
        if scheme is not None:
            dattr["scheme"] = scheme
        if prop == "alternate-script":
            # a missing xml:lang is recorded as an empty language
            dattr["altlang"] = rattr.get("xml:lang", "")
        self.rec[pos] = (dname, dcontent, dattr)
        rid = rattr.get("id", None)
        if rid is not None:
            self._release_id(rid)
        return True

    def extract_recognized_metadata(self):
        """Parse self.opfdata and fill rec, refines, other, id2rec and idlst.

        Calling this again re-parses from scratch instead of appending to the
        results of the previous call.
        """
        self.op = OPFMetadataParser(self.opfdata)
        self.md = self.op.get_metadata()
        self.idlst = self.op.get_idlst()
        self.metadata_attr = self.op.get_metadata_attr()
        self.pkg = self.op.get_package()

        # start clean so a repeated call cannot duplicate entries
        self.rec = []
        self.refines = []
        self.other = []
        self.id2rec = {}

        # first sort out recognized dc and primary meta from refines and
        # other metadata, building the id2rec map along the way
        (_ver, uid, _attr) = self.pkg
        for mentry in self.md:
            (mname, mcontent, mattr) = mentry
            if mname == "dc:identifier" and mattr.get("id", "") == uid:
                # do not allow the gui to play with the unique-identifier to
                # prevent font obfuscation issues later
                self.other.append(mentry)
            elif mname in _recognized_dc:
                self._add_recognized(mentry)
            elif mname == "meta" and "refines" in mattr:
                self.refines.append(mentry)
            elif mname == "meta" and "property" in mattr and mattr["property"] in _recognized_meta:
                # primary meta tag: the property value becomes the entry name
                prop_name = mattr["property"]
                del mattr["property"]
                self._add_recognized((prop_name, mcontent, mattr))
            else:
                self.other.append(mentry)

        # finally convert refines on recognized metadata into extra attributes
        # on their target; anything that cannot be folded goes to "other" so
        # that it is not touched in any way
        for mentry in self.refines:
            if not self._fold_refine(mentry):
                self.other.append(mentry)

        if _DEBUG:
            print("recongized", self.rec)
            print("other", self.other)
            print("idlst", self.idlst)

    def get_recognized_metadata(self):
        """Return the recognized metadata, refines included, as one string.

        Each entry contributes ``name + _US + content + _RS`` followed by one
        ``_IN + key + _US + value + _RS`` record per attribute, attributes in
        sorted key order. Contents and values go through ``xmldecode``.
        """
        data = []
        for (dname, dcontent, dattr) in self.rec:
            data.append(dname + _US + xmldecode(dcontent) + _RS)
            for key in sorted(dattr):
                data.append(_IN + key + _US + xmldecode(dattr[key]) + _RS)
        return "".join(data)

    def get_other_meta_xml(self):
        """Return the untouched entries serialized with ``buildxml``."""
        return "".join(' ' + buildxml(mentry) for mentry in self.other)

    def get_id_list(self):
        """Return the ids left over after extraction."""
        return self.idlst

    def get_metadata_tag(self):
        """Return the opening ``<metadata ...>`` tag followed by a newline.

        Attribute values are written exactly as stored, with no escaping.
        """
        res = ['<metadata']
        if self.metadata_attr is not None:
            for key, val in self.metadata_attr.items():
                res.append(' ' + key + '="' + val + '"')
        res.append('>\n')
        return "".join(res)