Exemplo n.º 1
0
def substitute_hyphens(doc):
    """Run ``insert_tags`` over ``doc`` for hyphens placed inside a word.

    The pattern matches a single ``-`` that has a character which is neither
    a hyphen nor whitespace directly before and directly after it, so runs
    such as ``--`` and free-standing dashes surrounded by spaces are left
    alone.  ``"dywiz"`` is passed as the tag name.

    Elements named by ``DCNS("identifier.url")``, ``DCNS("rights.license")``
    and ``"meta"`` are passed in the ``exclude`` list.
    NOTE(review): presumably ``insert_tags`` skips those elements and puts a
    ``dywiz`` element at every match -- confirm against its definition.
    """
    insert_tags(
        doc,
        # Raw string: ``\s`` must reach the regex engine untouched instead of
        # being treated as an (invalid) string escape sequence.
        re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
        "dywiz",
        exclude=[DCNS("identifier.url"),
                 DCNS("rights.license"), "meta"])
Exemplo n.º 2
0
class BookInfo(WorkInfo):
    """Field declarations for a book, layered on top of ``WorkInfo``.

    Every entry of ``FIELDS`` is a ``Field`` built from a tag (either a
    namespaced name produced by ``DCNS``/``WLNS`` or a plain string), the
    attribute name it is exposed under, and optional settings.
    NOTE(review): how this tuple is combined with ``WorkInfo.FIELDS`` is
    decided outside this class -- confirm in the metaclass.
    """

    FIELDS = (
        # Multi-valued subject descriptors, each with a singular alias.
        Field(
            DCNS('audience'), 'audiences',
            salias='audience', multiple=True, required=False,
        ),
        Field(
            DCNS('subject.period'), 'epochs',
            salias='epoch', multiple=True, required=False,
        ),
        Field(
            DCNS('subject.type'), 'kinds',
            salias='kind', multiple=True, required=False,
        ),
        Field(
            DCNS('subject.genre'), 'genres',
            salias='genre', multiple=True, required=False,
        ),

        # Translators go through ``as_person`` and default to an empty list.
        Field(
            DCNS('contributor.translator'), 'translators', as_person,
            salias='translator', multiple=True, default=[],
        ),

        # Relations to other works, handled as ``WLURI`` values.
        Field(
            DCNS('relation.hasPart'), 'parts', WLURI,
            strict=as_wluri_strict, multiple=True, required=False,
        ),
        Field(
            DCNS('relation.isVariantOf'), 'variant_of', WLURI,
            strict=as_wluri_strict, required=False,
        ),

        # Cover image description.
        Field(
            DCNS('relation.coverImage.url'), 'cover_url',
            required=False,
        ),
        Field(
            DCNS('relation.coverImage.attribution'), 'cover_by',
            required=False,
        ),
        Field(
            DCNS('relation.coverImage.source'), 'cover_source',
            required=False,
        ),

        # WLCover-specific.
        Field(
            WLNS('coverBarColor'), 'cover_bar_color',
            required=False,
        ),
        Field(
            WLNS('coverBoxPosition'), 'cover_box_position',
            required=False,
        ),

        # Per-format identifiers, keyed by plain (non-namespaced) names.
        Field('pdf-id', 'isbn_pdf', required=False),
        Field('epub-id', 'isbn_epub', required=False),
        Field('mobi-id', 'isbn_mobi', required=False),
        Field('txt-id', 'isbn_txt', required=False),
        Field('html-id', 'isbn_html', required=False),
    )
Exemplo n.º 3
0
def mark_subauthors(doc):
    """Flag sub-works whose author differs from the surrounding context.

    The author string of a node is the ``', '``-joined text of the
    ``creator_parsed`` elements found in it.  For every ``/utwor/utwor``
    element, in document order, an empty ``use_subauthor`` element is
    appended to the first RDF element found inside that sub-work when its
    author string differs both from the root document's author string and
    from the author string of the previously visited sub-work.

    NOTE(review): a ``creator_parsed`` element without text makes the join
    raise ``TypeError``, and a flagged sub-work without an RDF element makes
    the ``append`` raise ``AttributeError`` -- confirm inputs guarantee both.
    """
    creators_path = './/' + DCNS('creator_parsed')
    rdf_path = './/' + RDFNS('RDF')

    def authors_of(node, path):
        # Comma-separated text of every element matched by ``path``.
        return ', '.join(found.text for found in node.findall(path))

    # The root author is read only from the document's own top-level RDF.
    root_author = authors_of(
        doc, './' + RDFNS('RDF') + '//' + DCNS('creator_parsed'))

    previous_author = None
    # If the author is different from both the author of the whole work and
    # the previous author, a marker is put into that sub-work's RDF.
    for child_work in doc.xpath('/utwor/utwor'):
        current_author = authors_of(child_work, creators_path)
        differs_from_previous = current_author != previous_author
        differs_from_root = current_author != root_author
        if differs_from_previous and differs_from_root:
            marker = etree.Element('use_subauthor')
            child_work.find(rdf_path).append(marker)
        previous_author = current_author
Exemplo n.º 4
0
 def transform(wldoc, fieldfile):
     """Render ``wldoc`` as HTML, passing along a quoted gallery URL option.

     The gallery URL is built by ``gallery_url`` from the slug, taken as the
     part of the document's ``identifier.url`` text after its last ``/``.
     When the document has no ``identifier.url`` element the gallery is an
     empty string.  Either way the value is wrapped in single quotes and
     handed to ``wldoc.as_html`` under the ``'gallery'`` option.

     ``fieldfile`` is accepted but not used by this function.
     """
     # Not pretty, but wldoc.book_info cannot be used here, so the tag name
     # is built directly with DCNS.
     from librarian import DCNS

     root = wldoc.edoc.getroot()
     url_elem = root.find('.//' + DCNS('identifier.url'))
     if url_elem is not None:
         slug = url_elem.text.rsplit('/', 1)[1]
         gallery = gallery_url(slug=slug)
     else:
         gallery = ''

     options = {'gallery': "'%s'" % gallery}
     return wldoc.as_html(options=options)
Exemplo n.º 5
0
class WorkInfo(six.with_metaclass(DCInfo, object)):
    """Metadata of a work, read from an rdf:RDF / rdf:Description section.

    ``FIELDS`` declares the supported fields.  ``__init__`` validates each
    of them and stores the result on the instance as ``prop_<field.name>``.
    ``__getattribute__`` and ``__setattr__`` expose that value under the
    field's name and, for fields declared with ``salias``, under the
    singular alias as well (first item of the list on read, a one-item list
    on write).
    """

    FIELDS = (
        Field(DCNS('creator'),
              'authors',
              as_person,
              salias='author',
              multiple=True),
        Field(DCNS('title'), 'title'),
        Field(DCNS('type'), 'type', required=False, multiple=True),
        Field(DCNS('contributor.editor'),
              'editors',
              as_person,
              salias='editor',
              multiple=True,
              required=False),
        Field(DCNS('contributor.technical_editor'),
              'technical_editors',
              as_person,
              salias='technical_editor',
              multiple=True,
              required=False),
        Field(DCNS('contributor.funding'),
              'funders',
              salias='funder',
              multiple=True,
              required=False),
        Field(DCNS('contributor.thanks'), 'thanks', required=False),
        Field(DCNS('date'), 'created_at'),
        Field(DCNS('date.pd'),
              'released_to_public_domain_at',
              as_date,
              required=False),
        Field(DCNS('publisher'), 'publisher', multiple=True),
        Field(DCNS('language'), 'language'),
        Field(DCNS('description'), 'description', required=False),
        Field(DCNS('source'), 'source_name', required=False),
        Field(DCNS('source.URL'), 'source_url', required=False),
        Field(DCNS('identifier.url'), 'url', WLURI, strict=as_wluri_strict),
        Field(DCNS('rights.license'), 'license', required=False),
        Field(DCNS('rights'), 'license_description'),
        Field(PLMETNS('digitisationSponsor'),
              'sponsors',
              multiple=True,
              required=False),
        Field(WLNS('digitisationSponsorNote'), 'sponsor_note', required=False),
        Field(WLNS('developmentStage'), 'stage', required=False),
    )

    @classmethod
    def from_bytes(cls, xml, *args, **kwargs):
        """Build an instance from XML given as a bytes object.

        The bytes are wrapped in ``six.BytesIO`` and handed to
        ``from_file`` together with any extra arguments.
        """
        return cls.from_file(six.BytesIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from the first rdf:RDF section of ``xmlfile``.

        The file is parsed incrementally: parsing stops as soon as the RDF
        element that was opened first has been closed, and that element is
        passed to ``from_element`` with any extra arguments.

        Raises:
            NoDublinCore: when no RDF element starts anywhere in the input.
            ParseError: wrapping an ``XMLSyntaxError`` or ``ExpatError``
                raised while parsing.
        """
        rdf_tag = None
        try:
            events = etree.iterparse(xmlfile, ['start', 'end'])
            for event, element in events:
                if element.tag == RDFNS('RDF') and event == 'start':
                    rdf_tag = element
                    break

            if rdf_tag is None:
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags.")

            # Keep consuming events until the RDF section is closed, so its
            # children are fully parsed before they are read.  If the end
            # never comes, the parser is expected to raise a syntax error.
            for event, element in events:
                if element.tag == RDFNS('RDF') and event == 'end':
                    break

            # Extract data from the element and make the info.
            return cls.from_element(rdf_tag, *args, **kwargs)
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e)

    @classmethod
    def from_element(cls, rdf_tag, *args, **kwargs):
        """Build an instance from an already parsed RDF element.

        The first ``Description`` descendant of ``rdf_tag`` supplies both
        the attributes and the child elements.  Children are grouped by tag
        into lists, in document order; a child with text contributes a
        ``TextPlus`` carrying a ``lang`` attribute, a child without text
        contributes ``None``.

        Raises:
            NoDublinCore: when ``rdf_tag`` has no ``Description`` descendant.
        """
        # The tree is already parsed, so parser errors are not expected here.
        field_dict = {}
        desc = rdf_tag.find(".//" + RDFNS('Description'))

        if desc is None:
            raise NoDublinCore("No DublinCore section found.")

        # Default language: the nearest ``XMLNS('lang')`` attribute found on
        # the description or one of its ancestors (None when there is none).
        lang = None
        node = desc
        while node is not None and lang is None:
            lang = node.attrib.get(XMLNS('lang'))
            node = node.getparent()

        # Iterating the element directly yields its children; this replaces
        # the deprecated ``getchildren()`` call.
        for child in desc:
            if child.text is not None:
                text = child.text
                if not isinstance(text, six.text_type):
                    text = text.decode('utf-8')
                val = TextPlus(text)
                # A language set on the child wins over the inherited one.
                val.lang = child.attrib.get(XMLNS('lang'), lang)
                if child.tag == 'meta':
                    # ``meta`` elements whose id ends in ``-id`` are also
                    # stored under that id (replacing any earlier value),
                    # with 'ISBN-' rewritten to 'ISBN '.
                    meta_id = child.attrib.get('id')
                    if meta_id and meta_id.endswith('-id'):
                        field_dict[meta_id] = [val.replace('ISBN-', 'ISBN ')]
            else:
                val = None
            field_dict.setdefault(child.tag, []).append(val)

        return cls(desc.attrib, field_dict, *args, **kwargs)

    def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False):
        """Validate ``dc_fields`` against ``FIELDS`` and store the results.

        rdf_attrs should be a dictionary-like object with any attributes of
        the RDF:Description; its ``RDFNS('about')`` entry becomes
        ``self.about``.
        dc_fields is a dictionary mapping DC fields (with namespace) to a
        list of text values for the given field.
        fallbacks and strict are passed unchanged to every field's
        ``validate`` call.
        """

        self.about = rdf_attrs.get(RDFNS('about'))
        # Maps both field names and singular aliases to their Field.
        self.fmap = {}

        for field in self.FIELDS:
            value = field.validate(dc_fields,
                                   fallbacks=fallbacks,
                                   strict=strict)
            setattr(self, 'prop_' + field.name, value)
            self.fmap[field.name] = field
            if field.salias:
                self.fmap[field.salias] = field

    def __getattribute__(self, name):
        """Resolve field names and singular aliases to stored field values.

        A field name returns the stored value as is.  A singular alias
        returns the first item of the stored value, or ``None`` when the
        value is empty.  Any other name falls back to ordinary attribute
        lookup.

        Raises:
            TypeError: when ``name`` is a singular alias of a field that is
                not declared as multiple.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            value = object.__getattribute__(self, 'prop_' + field.name)
            if field.name == name:
                return value
            else:  # singular alias
                if not field.multiple:
                    # Raising a bare string is itself a TypeError; raise
                    # the same type explicitly, with a usable message.
                    raise TypeError(
                        "singular alias %r is defined for non-multiple "
                        "field %r" % (name, field.name))

                return value[0] if value else None
        except (KeyError, AttributeError):
            # Not a field (or ``fmap`` is not set up yet): plain lookup.
            return object.__getattribute__(self, name)

    def __setattr__(self, name, newvalue):
        """Store field values under ``prop_<field.name>``.

        A field name stores ``newvalue`` as is.  A singular alias stores
        ``[newvalue]``.  Any other name is set as an ordinary attribute.

        Raises:
            TypeError: when ``name`` is a singular alias of a field that is
                not declared as multiple.
        """
        try:
            field = object.__getattribute__(self, 'fmap')[name]
            if field.name == name:
                object.__setattr__(self, 'prop_' + field.name, newvalue)
            else:  # singular alias
                if not field.multiple:
                    # Raising a bare string is itself a TypeError; raise
                    # the same type explicitly, with a usable message.
                    raise TypeError(
                        "singular alias %r is defined for non-multiple "
                        "field %r" % (name, field.name))

                object.__setattr__(self, 'prop_' + field.name, [newvalue])
        except (KeyError, AttributeError):
            # Not a field (or ``fmap`` is not set up yet): plain assignment.
            return object.__setattr__(self, name, newvalue)

    def update(self, field_dict):
        """Set every declared field whose name is a key of ``field_dict``.

        Values are assigned as given: nothing is validated here, and
        required fields missing from ``field_dict`` are not reported.  Keys
        that are not field names (including singular aliases) are ignored.
        """
        for field in self.FIELDS:
            if field.name in field_dict:
                setattr(self, field.name, field_dict[field.name])

    def to_etree(self, parent=None):
        """Return an RDF element holding an XML representation of this object.

        The element is created standalone when ``parent`` is None, otherwise
        through ``parent.makeelement``; it is not appended to ``parent``
        here.  It gets one ``Description`` child, carrying the ``about``
        attribute when set, and one sub-element per field value, named by
        ``field.uri``.  Fields whose value is ``None`` are skipped; a
        ``None`` item of a multiple field yields an element without text.
        """
        if parent is None:
            root = etree.Element(RDFNS('RDF'))
        else:
            root = parent.makeelement(RDFNS('RDF'))

        description = etree.SubElement(root, RDFNS('Description'))

        if self.about:
            description.set(RDFNS('about'), self.about)

        for field in self.FIELDS:
            v = getattr(self, field.name, None)
            if v is None:
                continue
            if field.multiple:
                # An empty value simply produces no elements.
                for x in v:
                    e = etree.Element(field.uri)
                    if x is not None:
                        e.text = six.text_type(x)
                    description.append(e)
            else:
                e = etree.Element(field.uri)
                e.text = six.text_type(v)
                description.append(e)

        return root

    def serialize(self):
        """Return the metadata as a nested dict of URIs and text values.

        The result has an ``'about'`` entry (``uri`` and ``value``) and a
        ``'fields'`` entry mapping field names to ``{'uri', 'value'}``
        dicts.  Fields that are ``None`` or empty multiple fields are left
        out; ``None`` items inside multiple fields are dropped.
        """
        rdf = {'about': {'uri': RDFNS('about'), 'value': self.about}}

        dc = {}
        for field in self.FIELDS:
            v = getattr(self, field.name, None)
            if v is not None:
                if field.multiple:
                    if len(v) == 0:
                        continue
                    v = [six.text_type(x) for x in v if x is not None]
                else:
                    v = six.text_type(v)

                dc[field.name] = {'uri': field.uri, 'value': v}
        rdf['fields'] = dc
        return rdf

    def to_dict(self):
        """Return the metadata as a flat dict of text values.

        Contains ``'about'``, every field that is neither ``None`` nor an
        empty multiple field (``None`` items dropped), and, for fields with
        a singular alias, the alias value when it is not ``None``.
        """
        result = {'about': self.about}
        for field in self.FIELDS:
            v = getattr(self, field.name, None)

            if v is not None:
                if field.multiple:
                    if len(v) == 0:
                        continue
                    v = [six.text_type(x) for x in v if x is not None]
                else:
                    v = six.text_type(v)
                result[field.name] = v

            if field.salias:
                v = getattr(self, field.salias)
                if v is not None:
                    result[field.salias] = six.text_type(v)

        return result
Exemplo n.º 6
0
class WorkInfo(object):
    """Metadata of a work, read from an rdf:RDF / rdf:Description section.

    ``FIELDS`` declares the supported fields; the constructors below locate
    the RDF section in XML input and delegate to ``from_element``.
    """

    # NOTE: Python 2 style metaclass declaration; Python 3 ignores this
    # attribute.
    __metaclass__ = DCInfo

    FIELDS = (
        Field(DCNS('creator'),
              'authors',
              as_person,
              salias='author',
              multiple=True),
        Field(DCNS('title'), 'title'),
        Field(DCNS('type'), 'type', required=False, multiple=True),
        Field(DCNS('contributor.editor'),
              'editors',
              as_person,
              salias='editor',
              multiple=True,
              default=[]),
        Field(DCNS('contributor.technical_editor'),
              'technical_editors',
              as_person,
              salias='technical_editor',
              multiple=True,
              default=[]),
        Field(DCNS('contributor.funding'),
              'funders',
              salias='funder',
              multiple=True,
              default=[]),
        Field(DCNS('contributor.thanks'), 'thanks', required=False),
        Field(DCNS('date'), 'created_at'),
        Field(DCNS('date.pd'),
              'released_to_public_domain_at',
              as_date,
              required=False),
        Field(DCNS('publisher'), 'publisher', multiple=True),
        Field(DCNS('language'), 'language'),
        Field(DCNS('description'), 'description', required=False),
        Field(DCNS('source'), 'source_name', required=False),
        Field(DCNS('source.URL'), 'source_url', required=False),
        Field(DCNS('identifier.url'), 'url', WLURI, strict=as_wluri_strict),
        Field(DCNS('rights.license'), 'license', required=False),
        Field(DCNS('rights'), 'license_description'),
        Field(PLMETNS('digitisationSponsor'),
              'sponsors',
              multiple=True,
              default=[]),
        Field(WLNS('digitisationSponsorNote'), 'sponsor_note', required=False),
        Field(WLNS('developmentStage'), 'stage', required=False),
    )

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build an instance from XML given as a string.

        The string is wrapped in a ``StringIO`` and handed to ``from_file``
        together with any extra arguments.
        """
        # NOTE: the ``StringIO`` module exists only on Python 2.
        from StringIO import StringIO
        return cls.from_file(StringIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from the first rdf:RDF section of ``xmlfile``.

        The file is parsed incrementally: parsing stops as soon as the RDF
        element that was opened first has been closed, and that element is
        passed to ``from_element`` with any extra arguments.

        Raises:
            NoDublinCore: when no RDF element starts anywhere in the input.
            ParseError: wrapping an ``XMLSyntaxError`` or ``ExpatError``
                raised while parsing.
        """
        rdf_tag = None
        try:
            events = etree.iterparse(xmlfile, ['start', 'end'])
            for event, element in events:
                if element.tag == RDFNS('RDF') and event == 'start':
                    rdf_tag = element
                    break

            if rdf_tag is None:
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags.")

            # Keep consuming events until the RDF section is closed, so its
            # children are fully parsed before they are read.  If the end
            # never comes, the parser is expected to raise a syntax error.
            for event, element in events:
                if element.tag == RDFNS('RDF') and event == 'end':
                    break

            # Extract data from the element and make the info.
            return cls.from_element(rdf_tag, *args, **kwargs)
        # ``as`` form: accepted by Python 2.6+ and required by Python 3.
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e)
Exemplo n.º 7
0
def fix_hanging(doc):
    """Run ``insert_tags`` over ``doc`` for whitespace after one-letter words.

    The pattern matches a run of whitespace that directly follows a single
    word character which is itself preceded by whitespace.  Because of that
    look-behind, a one-letter word at the very start of a text is not
    matched.  ``"nbsp"`` is passed as the tag name.

    Elements named by ``DCNS("identifier.url")`` and
    ``DCNS("rights.license")`` are passed in the ``exclude`` list.
    NOTE(review): presumably ``insert_tags`` skips those elements and puts
    an ``nbsp`` element at every match -- confirm against its definition.
    """
    insert_tags(doc,
                # Raw string: ``\s`` and ``\w`` must reach the regex engine
                # untouched instead of being treated as (invalid) string
                # escape sequences.
                re.compile(r"(?<=\s\w)\s+"),
                "nbsp",
                exclude=[DCNS("identifier.url"),
                         DCNS("rights.license")])