def substitute_hyphens(doc):
    """Run insert_tags over doc for hyphens placed between two characters.

    The pattern matches a single ``-`` whose immediate neighbours on both
    sides are neither a hyphen nor whitespace, so runs of hyphens and
    hyphens next to spaces are left alone.  Matches are handed to
    insert_tags together with the tag name ``dywiz``; the URL identifier,
    the license field and ``meta`` elements are passed as exclusions.
    """
    insert_tags(
        doc,
        # Raw string: "\s" in a plain literal is an invalid escape sequence
        # and triggers a warning on current Python versions.
        re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
        "dywiz",
        exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"])
class BookInfo(WorkInfo):
    """Field declarations for book metadata, declared on top of WorkInfo.

    NOTE(review): how this tuple is combined with WorkInfo.FIELDS is up to
    the DCInfo metaclass, which is not visible here -- confirm there.
    """

    FIELDS = (
        # Classification of the book.
        Field(DCNS('audience'), 'audiences',
              salias='audience', multiple=True, required=False),
        Field(DCNS('subject.period'), 'epochs',
              salias='epoch', multiple=True, required=False),
        Field(DCNS('subject.type'), 'kinds',
              salias='kind', multiple=True, required=False),
        Field(DCNS('subject.genre'), 'genres',
              salias='genre', multiple=True, required=False),

        Field(DCNS('contributor.translator'), 'translators', as_person,
              salias='translator', multiple=True, default=[]),

        # Relations to other works.
        Field(DCNS('relation.hasPart'), 'parts', WLURI,
              strict=as_wluri_strict, multiple=True, required=False),
        Field(DCNS('relation.isVariantOf'), 'variant_of', WLURI,
              strict=as_wluri_strict, required=False),

        # Cover image.
        Field(DCNS('relation.coverImage.url'), 'cover_url',
              required=False),
        Field(DCNS('relation.coverImage.attribution'), 'cover_by',
              required=False),
        Field(DCNS('relation.coverImage.source'), 'cover_source',
              required=False),

        # WLCover-specific.
        Field(WLNS('coverBarColor'), 'cover_bar_color', required=False),
        Field(WLNS('coverBoxPosition'), 'cover_box_position',
              required=False),
    ) + tuple(
        # Per-format identifiers: 'pdf-id' -> isbn_pdf, 'epub-id' -> isbn_epub,
        # 'mobi-id' -> isbn_mobi, 'txt-id' -> isbn_txt, 'html-id' -> isbn_html.
        Field('%s-id' % fmt, 'isbn_%s' % fmt, required=False)
        for fmt in ('pdf', 'epub', 'mobi', 'txt', 'html')
    )
def mark_subauthors(doc):
    """Append a ``use_subauthor`` element to the RDF of selected sub-works.

    For every ``/utwor/utwor`` element the ``creator_parsed`` texts are
    joined with ", ".  When that string differs both from the one computed
    for the document's own RDF section and from the one of the preceding
    sub-work, an empty ``use_subauthor`` element is appended to the
    sub-work's RDF element.
    """
    rdf_tag = RDFNS('RDF')
    creator_tag = DCNS('creator_parsed')

    def joined_names(elements):
        # Comma-separated text of the given creator elements.
        return ', '.join(node.text for node in elements)

    root_author = joined_names(
        doc.findall('./' + rdf_tag + '//' + creator_tag))

    # If the author differs from the author of the whole work and from the
    # previous author, we insert some kind of marker into the RDF?
    previous_author = None
    for subwork in doc.xpath('/utwor/utwor'):
        current_author = joined_names(subwork.findall('.//' + creator_tag))
        differs = (current_author != previous_author
                   and current_author != root_author)
        if differs:
            marker = etree.Element('use_subauthor')
            subwork.find('.//' + rdf_tag).append(marker)
        previous_author = current_author
def transform(wldoc, fieldfile):
    """Return wldoc rendered with as_html, passing a 'gallery' option.

    The gallery value is gallery_url() of the last path segment of the
    document's identifier.url element, or an empty string when the
    document has no such element.  It is passed wrapped in single quotes.
    The fieldfile argument is not used by this function.
    """
    # ugly, but we can't use wldoc.book_info here
    from librarian import DCNS

    root = wldoc.edoc.getroot()
    url_elem = root.find('.//' + DCNS('identifier.url'))

    gallery = ''
    if url_elem is not None:
        slug = url_elem.text.rsplit('/', 1)[1]
        gallery = gallery_url(slug=slug)

    options = {'gallery': "'%s'" % gallery}
    return wldoc.as_html(options=options)
class WorkInfo(six.with_metaclass(DCInfo, object)):
    """Metadata of a work read from its rdf:RDF / rdf:Description section.

    Every entry of FIELDS is validated in __init__ and stored on the
    instance under ``prop_<field.name>``.  Attribute access by field name,
    or by a field's singular alias (``salias``), is routed to that storage
    by __getattribute__ and __setattr__.
    """

    FIELDS = (
        Field(DCNS('creator'), 'authors', as_person, salias='author',
              multiple=True),
        Field(DCNS('title'), 'title'),
        Field(DCNS('type'), 'type', required=False, multiple=True),

        Field(DCNS('contributor.editor'), 'editors',
              as_person, salias='editor', multiple=True, required=False),
        Field(DCNS('contributor.technical_editor'), 'technical_editors',
              as_person, salias='technical_editor', multiple=True,
              required=False),
        Field(DCNS('contributor.funding'), 'funders', salias='funder',
              multiple=True, required=False),
        Field(DCNS('contributor.thanks'), 'thanks', required=False),

        Field(DCNS('date'), 'created_at'),
        Field(DCNS('date.pd'), 'released_to_public_domain_at', as_date,
              required=False),
        Field(DCNS('publisher'), 'publisher', multiple=True),

        Field(DCNS('language'), 'language'),
        Field(DCNS('description'), 'description', required=False),

        Field(DCNS('source'), 'source_name', required=False),
        Field(DCNS('source.URL'), 'source_url', required=False),
        Field(DCNS('identifier.url'), 'url', WLURI, strict=as_wluri_strict),
        Field(DCNS('rights.license'), 'license', required=False),
        Field(DCNS('rights'), 'license_description'),

        Field(PLMETNS('digitisationSponsor'), 'sponsors', multiple=True,
              required=False),
        Field(WLNS('digitisationSponsorNote'), 'sponsor_note',
              required=False),
        Field(WLNS('developmentStage'), 'stage', required=False),
    )

    @classmethod
    def from_bytes(cls, xml, *args, **kwargs):
        """Build an instance from XML given as a byte string.

        Extra arguments are forwarded unchanged to from_file.
        """
        return cls.from_file(six.BytesIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from the rdf:RDF section of an XML file.

        The file is parsed incrementally: parsing stops looking once the
        start of rdf:RDF has been seen and then continues only up to the
        matching end event, so the rest of the document is not read.

        Raises NoDublinCore when no rdf:RDF element starts in the file and
        ParseError when the parser reports XMLSyntaxError or ExpatError.
        Extra arguments are forwarded unchanged to from_element.
        """
        desc_tag = None
        try:
            events = etree.iterparse(xmlfile, ['start', 'end'])
            for (event, element) in events:
                if element.tag == RDFNS('RDF') and event == 'start':
                    desc_tag = element
                    break

            if desc_tag is None:
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags.")

            # continue 'till the end of RDF section
            for (event, element) in events:
                if element.tag == RDFNS('RDF') and event == 'end':
                    break

            # if there is no end, Expat should yell at us with an ExpatError

            # extract data from the element and make the info
            return cls.from_element(desc_tag, *args, **kwargs)
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e)

    @classmethod
    def from_element(cls, rdf_tag, *args, **kwargs):
        """Build an instance from an already parsed rdf:RDF element.

        Collects the text of every child of rdf:Description into a dict
        mapping the child's tag to a list of values.  Non-empty texts are
        wrapped in TextPlus and given a ``lang`` taken from the child's own
        xml:lang attribute or, failing that, from the nearest enclosing
        element that declares one.  ``meta`` children whose ``id`` ends in
        ``-id`` are additionally stored under that id.

        Raises NoDublinCore when rdf_tag has no rdf:Description descendant.
        """
        # the tree is already parsed, so we don't need to worry about Expat
        # errors
        field_dict = {}
        desc = rdf_tag.find(".//" + RDFNS('Description'))

        if desc is None:
            raise NoDublinCore("No DublinCore section found.")

        # Walk up from the description until an xml:lang attribute is found.
        lang = None
        p = desc
        while p is not None and lang is None:
            lang = p.attrib.get(XMLNS('lang'))
            p = p.getparent()

        # Iterating the element directly replaces the deprecated
        # getchildren() and visits the same children.
        for child in desc:
            fv = field_dict.get(child.tag, [])
            if child.text is not None:
                text = child.text
                if not isinstance(text, six.text_type):
                    text = text.decode('utf-8')
                val = TextPlus(text)
                val.lang = child.attrib.get(XMLNS('lang'), lang)
                if child.tag == 'meta':
                    meta_id = child.attrib.get('id')
                    if meta_id and meta_id.endswith('-id'):
                        field_dict[meta_id] = [
                            val.replace('ISBN-', 'ISBN ')]
            else:
                val = child.text
            fv.append(val)
            field_dict[child.tag] = fv

        return cls(desc.attrib, field_dict, *args, **kwargs)

    def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False):
        """Validate dc_fields against FIELDS and store the results.

        rdf_attrs should be a dictionary-like object with any attributes
        of the RDF:Description.
        dc_fields - dictionary mapping DC fields (with namespace) to
        list of text values for the given field.
        fallbacks and strict are passed through to each field's validate().
        """
        self.about = rdf_attrs.get(RDFNS('about'))
        self.fmap = {}

        for field in self.FIELDS:
            value = field.validate(dc_fields, fallbacks=fallbacks,
                                   strict=strict)
            setattr(self, 'prop_' + field.name, value)
            # Both the field name and its singular alias resolve to the
            # same Field in the lookup map.
            self.fmap[field.name] = field
            if field.salias:
                self.fmap[field.salias] = field

    def __getattribute__(self, name):
        """Resolve field names and singular aliases to stored values.

        A field name returns the stored value as is.  A singular alias
        returns the first item of the stored list, or None when the list
        is empty or missing.  Any other name falls back to the default
        attribute lookup.

        Raises TypeError when a singular alias points at a field that is
        not declared as multiple.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            value = object.__getattribute__(self, 'prop_' + field.name)
            if field.name == name:
                return value
            else:  # singular alias
                if not field.multiple:
                    # Raising a bare string is itself a TypeError that
                    # hides the message; raise the same type explicitly.
                    raise TypeError("OUCH!! for field %s" % name)

                return value[0] if value else None
        except (KeyError, AttributeError):
            return object.__getattribute__(self, name)

    def __setattr__(self, name, newvalue):
        """Store field values under ``prop_<name>``; wrap alias values.

        Assigning through a field name stores newvalue as is.  Assigning
        through a singular alias stores a one-element list.  Any other name
        is set as a plain attribute.

        Raises TypeError when a singular alias points at a field that is
        not declared as multiple.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            if field.name == name:
                object.__setattr__(self, 'prop_' + field.name, newvalue)
            else:  # singular alias
                if not field.multiple:
                    # See __getattribute__: a bare string cannot be raised.
                    raise TypeError("OUCH! while setting field %s" % name)

                object.__setattr__(self, 'prop_' + field.name, [newvalue])
        except (KeyError, AttributeError):
            return object.__setattr__(self, name, newvalue)

    def update(self, field_dict):
        """Assign values from field_dict, keyed by field name.

        Only names declared in FIELDS are considered; fields absent from
        field_dict are left untouched.  Values are stored as given: no
        validation runs here and required fields are not checked.
        """
        for field in self.FIELDS:
            if field.name in field_dict:
                setattr(self, field.name, field_dict[field.name])

    def to_etree(self, parent=None):
        """XML representation of this object.

        Returns an rdf:RDF element holding one rdf:Description with a child
        per field value.  When parent is given the root is created with
        parent.makeelement(); it is not appended to parent here.  Fields
        whose value is None or an empty list are skipped.
        """
        if parent is None:
            root = etree.Element(RDFNS('RDF'))
        else:
            root = parent.makeelement(RDFNS('RDF'))

        description = etree.SubElement(root, RDFNS('Description'))

        if self.about:
            description.set(RDFNS('about'), self.about)

        for field in self.FIELDS:
            v = getattr(self, field.name, None)
            if v is None:
                continue
            if field.multiple:
                # A None item still produces an element, just without text.
                for x in v:
                    e = etree.Element(field.uri)
                    if x is not None:
                        e.text = six.text_type(x)
                    description.append(e)
            else:
                e = etree.Element(field.uri)
                e.text = six.text_type(v)
                description.append(e)

        return root

    def serialize(self):
        """Return a nested dict with each field's URI and text value.

        The result has an 'about' entry and a 'fields' entry mapping field
        names to ``{'uri': ..., 'value': ...}``.  Fields whose value is
        None or an empty list are omitted; None items are dropped from
        lists.
        """
        rdf = {'about': {'uri': RDFNS('about'), 'value': self.about}}

        dc = {}
        for field in self.FIELDS:
            v = getattr(self, field.name, None)
            if v is None:
                continue
            if field.multiple:
                if len(v) == 0:
                    continue
                v = [six.text_type(x) for x in v if x is not None]
            else:
                v = six.text_type(v)

            dc[field.name] = {'uri': field.uri, 'value': v}
        rdf['fields'] = dc
        return rdf

    def to_dict(self):
        """Return a flat dict of text values keyed by field name and alias.

        Besides 'about', every field with a value is included under its
        name; fields with a singular alias also get the alias key holding
        the text of the first item, when there is one.
        """
        result = {'about': self.about}
        for field in self.FIELDS:
            v = getattr(self, field.name, None)

            if v is not None:
                if field.multiple:
                    if len(v) == 0:
                        continue
                    v = [six.text_type(x) for x in v if x is not None]
                else:
                    v = six.text_type(v)
                result[field.name] = v

            if field.salias:
                v = getattr(self, field.salias)
                if v is not None:
                    result[field.salias] = six.text_type(v)

        return result
class WorkInfo(object):
    """Metadata field declarations and XML loading for a work.

    NOTE(review): ``__metaclass__`` only takes effect on Python 2; on
    Python 3 it is a plain class attribute and DCInfo is not applied.
    """
    __metaclass__ = DCInfo

    FIELDS = (
        Field(DCNS('creator'), 'authors', as_person, salias='author',
              multiple=True),
        Field(DCNS('title'), 'title'),
        Field(DCNS('type'), 'type', required=False, multiple=True),

        Field(DCNS('contributor.editor'), 'editors',
              as_person, salias='editor', multiple=True, default=[]),
        Field(DCNS('contributor.technical_editor'), 'technical_editors',
              as_person, salias='technical_editor', multiple=True,
              default=[]),
        Field(DCNS('contributor.funding'), 'funders', salias='funder',
              multiple=True, default=[]),
        Field(DCNS('contributor.thanks'), 'thanks', required=False),

        Field(DCNS('date'), 'created_at'),
        Field(DCNS('date.pd'), 'released_to_public_domain_at', as_date,
              required=False),
        Field(DCNS('publisher'), 'publisher', multiple=True),

        Field(DCNS('language'), 'language'),
        Field(DCNS('description'), 'description', required=False),

        Field(DCNS('source'), 'source_name', required=False),
        Field(DCNS('source.URL'), 'source_url', required=False),
        Field(DCNS('identifier.url'), 'url', WLURI, strict=as_wluri_strict),
        Field(DCNS('rights.license'), 'license', required=False),
        Field(DCNS('rights'), 'license_description'),

        Field(PLMETNS('digitisationSponsor'), 'sponsors', multiple=True,
              default=[]),
        Field(WLNS('digitisationSponsorNote'), 'sponsor_note',
              required=False),
        Field(WLNS('developmentStage'), 'stage', required=False),
    )

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build an instance from XML given as a string.

        Extra arguments are forwarded unchanged to from_file.
        """
        # NOTE(review): the StringIO module exists on Python 2 only.
        from StringIO import StringIO
        return cls.from_file(StringIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from the rdf:RDF section of an XML file.

        The file is parsed incrementally: parsing stops looking once the
        start of rdf:RDF has been seen and then continues only up to the
        matching end event.  The element is then handed to from_element,
        which is not defined in the visible part of this class.

        Raises NoDublinCore when no rdf:RDF element starts in the file and
        ParseError when the parser reports XMLSyntaxError or ExpatError.
        """
        desc_tag = None
        try:
            events = etree.iterparse(xmlfile, ['start', 'end'])
            for (event, element) in events:
                if element.tag == RDFNS('RDF') and event == 'start':
                    desc_tag = element
                    break

            if desc_tag is None:
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags.")

            # continue 'till the end of RDF section
            for (event, element) in events:
                if element.tag == RDFNS('RDF') and event == 'end':
                    break

            # if there is no end, Expat should yell at us with an ExpatError

            # extract data from the element and make the info
            return cls.from_element(desc_tag, *args, **kwargs)
        # "except X as e" is valid from Python 2.6 on; the older
        # "except X, e" form does not compile on Python 3.
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e)
def fix_hanging(doc):
    """Run insert_tags over doc for whitespace after one-character words.

    The pattern matches the run of whitespace that follows a single word
    character which is itself preceded by whitespace.  Matches are handed
    to insert_tags together with the tag name ``nbsp``; the URL identifier
    and the license field are passed as exclusions.
    """
    insert_tags(
        doc,
        # Raw string: "\s" and "\w" in a plain literal are invalid escape
        # sequences and trigger a warning on current Python versions.
        re.compile(r"(?<=\s\w)\s+"),
        "nbsp",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")])