Example #1
 def anchorlink(self, on, name='', **kw):
     self._elem(u'link', on)
     if on:
         id = kw.get('id', None)
         if id:
             self._curr.xml_attributes[None, u'id'] = U(id)
         self._curr.xml_attributes[None, u'anchor'] = U(name)
     return ''
Example #2
 def image(self, src=None, **kw):
     e = tree.element(None, u'img')
     self._curr.xml_append(e)
     valid_attrs = ('src', 'width', 'height', 'alt', 'title')
     kw.update({'src': src})
     for key, value in kw.items():
         if key in valid_attrs:
             self._curr.xml_attributes[None, U(key)] = U(value)
     return ''
Example #3
    def fields(sect):
        '''
        Each section represents a resource and contains a list of its properties.
        This generator parses the list and yields key/value pairs representing the properties.
        Some properties have attributes, expressed in Markdown as a nested list; if present, these
        attributes are yielded as well, otherwise None is yielded.
        '''
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header. This accommodates multiple lists in a section
        sect_body_items = results_until(
            sect.xml_select(u'following-sibling::*'),
            u'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select(u'following-sibling::ul') for li in ul.xml_select(u'./li') ]
        field_list = [
            li for elem in sect_body_items for li in elem.xml_select(u'li')
        ]

        def parse_pair(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(
                        _(u'Syntax error in relationship expression: {0}'.format(pair)))
                prop = matched.group(1).strip()
                val = matched.group(2).strip()
                #prop, val = [ part.strip() for part in U(li.xml_select(u'string(.)')).split(u':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val
            return None, None

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property
            if li.xml_select(u'ul'):
                main = ''.join([
                    U(node) for node in results_until(li.xml_select(u'node()'),
                                                      u'self::ul')
                ])
                #main = li.xml_select(u'string(ul/preceding-sibling::node())')
                prop, val = parse_pair(main)
                subfield_list = [sli for sli in li.xml_select(u'ul/li')]
                subfield_dict = dict(
                    [parse_pair(U(pair)) for pair in subfield_list])
                if None in subfield_dict: del subfield_dict[None]
                yield prop, val, subfield_dict
            #Just a regular, unadorned property
            else:
                prop, val = parse_pair(U(li))
                if prop: yield prop, val, None
Example #4
File: test.py Project: slitayem/fuxi
def castToTerm(node):
    if node.xml_local == 'bnode':
        return BNode(u'')
    elif node.xml_local == 'uri':
        return URIRef(U(node))
    elif node.xml_local == 'literal':
        if node.xml_select('string(@datatype)'):
            dT = URIRef(U(node.xml_select('string(@datatype)')))
            return Literal(U(node), datatype=dT)
        else:
            return Literal(U(node))
    else:
        raise NotImplementedError()
Example #5
File: moin.py Project: mredar/akara
 def factory(rest_uri, moin_link=None, opener=None):
     opener = opener or urllib2.build_opener()
     logger.debug('rest_uri: ' + rest_uri)
     req = urllib2.Request(rest_uri, headers={'Accept': DOCBOOK_IMT})
     resp = opener.open(req)
     doc = bindery.parse(resp, standalone=True, model=MOIN_DOCBOOK_MODEL)
     original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
     #self.original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
     #amara.xml_print(self.content_cache)
     metadata, first_id = metadata_dict(generate_metadata(doc))
     metadata = metadata[first_id]
     akara_type = U(metadata[u'ak-type'])
     logger.debug('Type: ' + akara_type)
     try:
         #Older Moin CMS resource types are implemented by registration to the global node.NODES
         cls = node.NODES[akara_type]
     except KeyError:
         #Newer Moin CMS resource types are implemented by discovery of a URL,
         #to which a POST request executes the desired action
         return node.ENDPOINTS and (rest_uri, akara_type,
                                    node.ENDPOINTS[akara_type], doc,
                                    metadata, original_wiki_base)
     else:
         instance = cls(rest_uri,
                        moin_link,
                        opener,
                        cache=(doc, metadata, original_wiki_base))
         return instance
Example #6
 def heading(self, on, depth, id=None, **kw):
     # remember depth of first heading, and adapt current depth accordingly
     if not self._base_depth:
         self._base_depth = depth
     depth = max(depth + (2 - self._base_depth), 2)
     name = u's%i' % depth
     if on:
         found = None
         parent_depth = depth - 1
         while not found:
             found = self._curr.xml_select(u'ancestor-or-self::' + u's%i' %
                                           (parent_depth))
             parent_depth -= 1
             if found:
                 break
         #print name, found
         self._curr = found[0]
         e = tree.element(None, name)
         id = U(id) if id else u''
         e.xml_attributes[None, u'title'] = id
         e.xml_attributes[None, u'id'] = id
         self._curr.xml_append(e)
         self._curr = e
         e = tree.element(None, u'title')
         self._curr.xml_append(e)
         self._curr = e
     else:
         parent = self._curr.xml_parent
         if self._curr.xml_local == u'title':
             parent.xml_remove(self._curr)
         self._curr = parent
     return ''
Example #7
 def pagelink(self, on, pagename='', page=None, **kw):
     FormatterBase.pagelink(self, on, pagename, page, **kw)
     if page is None:
         page = Page(self.request, pagename, formatter=self)
     link_text = page.link_to(self.request, on=on, **kw)
     self._curr.xml_append(tree.text(U(link_text)))
     return ''
Example #8
File: moin.py Project: mredar/akara
 def definition_list(self, list_path, contextnode=None, patterns=None):
     '''
     Helper to construct a dictionary from an indicated definition list on the page
     '''
     #FIXME: rethink this "caching" business
     #Use defaultdict instead, for performance
     patterns = patterns or {None: lambda x: U(x) if x else None}
     doc, metadata, original_wiki_base = self.cache
     contextnode = contextnode or doc.article
     top = contextnode.xml_select(list_path)
     if not top:
         return None
     #Go over the glossentries, and map from term to def, applying the matching
     #Unit transform function from the patterns dict
     result = dict(
         (U(i.glossterm),
          patterns.get(U(i.glossterm), patterns[None])(i.glossdef))
         for i in top[0].glossentry)
     #logger.debug("definition_list: " + repr(result))
     return result
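
The patterns argument lets a caller override how individual glossary definitions are converted. A minimal usage sketch follows, assuming a CMS node object (here called cms_node, a hypothetical name) whose cache is already populated; the list path and term names are illustrative only.

#Usage sketch (hypothetical cms_node, list path and term names; not from the source project)
patterns = {
    u'akara:type': lambda gdef: U(gdef).strip() if gdef else None,  #custom handler for one term
    None: lambda gdef: U(gdef) if gdef else None,                   #fallback for all other terms
}
fields = cms_node.definition_list(u'glosslist[1]', patterns=patterns)
if fields is not None:
    title = fields.get(u'title')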
Example #9
    def list_records(self, set="", resumption_token="", metadataPrefix=""):
        '''
        List records. Use either the resumption token or set id.
        '''
        if resumption_token:
            params = {'verb' : 'ListRecords', 'resumptionToken': resumption_token}
        else:
            params = {'verb' : 'ListRecords', 'metadataPrefix': metadataPrefix, 'set': set}
        qstr = urllib.urlencode(params)
        url = self.root + '?' + qstr
        self.logger.debug('OAI request URL: {0}'.format(url))
        start_t = time.time()
        resp, content = self.h.request(url)
        retrieved_t = time.time()
        self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t))

        if metadataPrefix == "mods" or metadataPrefix == "marc":
            xml_content = XML_PARSE(content)
            records = []
            for record in xml_content["OAI-PMH"]["ListRecords"]["record"]:
                id = record["header"]["identifier"]
                if "null" not in id:
                    records.append((id, record))
            if "resumptionToken" in xml_content["OAI-PMH"]["ListRecords"]:
                resumption_token = xml_content["OAI-PMH"]["ListRecords"]["resumptionToken"]
            else:
                resumption_token = ''
        else:
            doc = bindery.parse(url, model=LISTRECORDS_MODELS[metadataPrefix])
            records, first_id = metadata_dict(generate_metadata(doc),
                                            nesteddict=False)
          
            for id_, props in records:
                for k, v in props.iteritems():
                    props[k] = [ U(item) for item in v ]
            if (doc.OAI_PMH.ListRecords is not None) and (doc.OAI_PMH.ListRecords.resumptionToken is not None):
                resumption_token = U(doc.OAI_PMH.ListRecords.resumptionToken)
            else:
                resumption_token = ''

        return {'records' : records, 'resumption_token' : resumption_token}
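
Because ListRecords responses are paged, a caller typically loops on the returned resumption token until it comes back empty. A minimal harvesting sketch, assuming an already-constructed client object (oai_client is a hypothetical name) exposing the method above; the set and prefix values are placeholders.

#Harvesting sketch (hypothetical oai_client; set and prefix values are placeholders)
harvested = []
result = oai_client.list_records(set='example_set', metadataPrefix='oai_dc')
while True:
    harvested.extend(result['records'])
    token = result['resumption_token']
    if not token:
        break
    #metadataPrefix is passed again only so the parsing branch above knows which model to use
    result = oai_client.list_records(resumption_token=token, metadataPrefix='oai_dc')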
Example #10
 def code_area(self,
               on,
               code_id,
               code_type='code',
               show=0,
               start=-1,
               step=-1,
               msg=None):
     self._elem(u'codearea', on)
     if on:
         self._curr.xml_attributes[None, u'id'] = U(code_id)
     return ''
Example #11
File: test.py Project: slitayem/fuxi
def parseResults(sparqlRT):
    from amara import bindery
    actualRT = []
    doc = bindery.parse(
        sparqlRT,
        prefixes={u'sparql': u'http://www.w3.org/2005/sparql-results#'})
    askAnswer = doc.xml_select('string(/sparql:sparql/sparql:boolean)')
    if askAnswer:
        askAnswer = U(askAnswer)
        actualRT = askAnswer == u'true'
    else:
        for result in doc.xml_select(
                '/sparql:sparql/sparql:results/sparql:result'):
            currBind = {}
            for binding in result.binding:
                varVal = U(binding.name)
                var = Variable(varVal)
                term = castToTerm(binding.xml_select('*')[0])
                currBind[var] = term
            if currBind:
                actualRT.append(currBind)
    return actualRT
Example #12
File: xml.py Project: erimille/versa
 def process(resource, context):
     subj = interpret(resource.id)
     for rel in resource.xml_select('*'):
         if rel.xml_name == u'rel':
             #Rel id is in an attribute
             pass
         else:
             #Look up rel id from abbrs
             relid = abbrs[rel.xml_local]
         val = U(rel)
         attrs = {}
         if context:
             attrs[u'@context'] = context
         for ans, aname in rel.xml_attributes:
             aval = rel.xml_attributes[ans, aname]
             if aname == u'value':
                 val = interpret(rel.value)
             else:
                 attrs[abbrs.get(U(aname), U(aname))] = interpret(U(aval))
         print(subj, relid, val, attrs)
         #model.add(subj, relid, val, attrs)
     return
Example #13
def normalize_generated_ids(meta_list):
    pat = re.compile(r'r(\d+)e')

    # Takes an ID such as 'r1234e0e4' and returns 'r*e0e4'.
    def normalize_id(id):
        m = pat.match(id)
        if m:
            id = 'r*e' + id[m.end():]
        return id

    for i, (s, p, o) in enumerate(meta_list):
        s = normalize_id(s)
        o = normalize_id(U(o))
        meta_list[i] = (s, p, o)
    return meta_list
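
A small illustration of the intended effect, assuming U() passes plain strings through as text; the input triples are made up.

#Illustration only; the triple values are invented
triples = [('r1234e0e4', 'dc:title', 'r1234e0e7')]
normalize_generated_ids(triples)
#roughly: [('r*e0e4', 'dc:title', u'r*e0e7')]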
Example #14
    def get_record(self, id):
        params = {'verb': 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': id}
        qstr = urllib.urlencode(params)
        url = self.root + '?' + qstr
        self.logger.debug('OAI request URL: {0}'.format(url))
        start_t = time.time()
        resp, content = self.h.request(url)
        retrieved_t = time.time()
        self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t))
        doc = bindery.parse(url, model=OAI_GETRECORD_MODEL)

        record, rid = metadata_dict(generate_metadata(doc), nesteddict=False)
        for id_, props in (record if isinstance(record, list) else [record]):
            for k, v in props.iteritems():
                props[k] = [ U(item) for item in v ]

        return {'record' : record}
Example #15
 def lang(self, on, lang_name):
     self._elem(u'div', on)
     if on:
         self._curr.xml_attributes[None, u'lang'] = U(lang_name)
     return ''
Example #16
 def endDocument(self):
     #Yuck! But Moin seems to insist on Unicode object result (see MoinMoin.parser.text_moin_wiki.Parser.scan)
     #print "endDocument", repr(self._doc.xml_encode(encoding=config.charset).decode(config.charset))
     return U(self._doc.xml_encode(encoding=config.charset))
Example #17
 def startDocument(self, pagename):
     self._curr = tree.element(None, u's1')
     self._curr.xml_attributes[None, u'title'] = U(pagename)
     self._doc.xml_append(self._curr)
     return ''
Example #18
 def code_token(self, on, tok_type):
     self._elem(u'codetoken', on)
     if on:
         self._curr.xml_attributes[None, u'type'] = U(tok_type)
     return ''
Example #19
    def render(self):
        '''
        The typical approach is along the lines of "Style-free XSLT Style Sheets"
        * http://www.xml.com/pub/a/2000/07/26/xslt/xsltstyle.html
        * http://www.cocooncenter.org/articles/stylefree.html
        But using div/@id rather than custom elements
        '''
        doc, metadata, original_wiki_base = self.cache
        self.content = content_handlers(original_wiki_base)
        #metadata = doc.article.xml_model.generate_metadata(doc)
        #import pprint
        #pprint.pprint(resources)
        '''
         akara:type:: [[http://purl.org/xml3k/akara/xmlmodel/cms/folder|folder]]
         title:: A page
         template:: http://wiki.example.com/Site;attachment=foo.xslt ##Just XSLT for now.  Plan to support other templating systems soon
         link:: [[http://example.org|]] rel=...
         meta:: dc:Creator value=Uche Ogbuji
         script:: `...` ##preferably they'd only use linked scripts: [[myscript...]]
        '''

        page_id = doc.article.xml_nodeid
        header = doc.article.glosslist[0]
        #node_type = first_item(header.xml_select(u'glossentry[glossterm = "akara:type"]/glossdef'))
        template = unicode(
            first_item(
                header.xml_select(
                    u'glossentry[glossterm = "template"]/glossdef'))).strip()
        template = os.path.join(self.outputdir, template)
        title = first_item(
            header.xml_select(u'glossentry[glossterm = "title"]/glossdef'))
        #title = resources[articleid]['title']
        #sections = dict([ (unicode(s.title), s) for s in page.article.section ])
        #print sections
        # if unicode(g.glossterm) == u'page:header' ]
        #authors = [ a
        #    for a in page.article.section.glosslist.glossentry
        #    if unicode(a.glossterm) == u'entry:authors'
        #]
        #title = article.xml_select(u'section[@title = ]')

        #revdate = dateparse(unicode(page.article.articleinfo.revhistory.revision.date))
        #if revdate.tzinfo == None: revdate = revdate.replace(tzinfo=UTC)

        #Create output file
        print >> sys.stderr, 'Writing to ', self.output
        buf = StringIO()
        w = structwriter(indent=u"yes", stream=buf)
        w.feed(
            ROOT(
                E(
                    (XHTML_NAMESPACE, u'html'),
                    {(XML_NAMESPACE, u'xml:lang'): u'en'},
                    E(
                        u'head',
                        E(u'title', title),
                        E(
                            u'meta', {
                                u'content': U(metadata[u'ak-updated']),
                                u'name': u'updated'
                            }),
                        #E(u'link', {u'href': unicode(uri), u'rel': u'alternate', u'title': u"Permalink"}),
                    ),
                    E(u'body',
                      (self.content.dispatch(s) for s in doc.article.section)),
                ), ))
        with open(self.output, 'w') as output:
            #text = f.read().rstrip()
            #print buf.getvalue()
            transform(buf.getvalue(), template, output=output)
        return
Example #20
import re
from itertools import groupby

import amara
from amara.lib import U
from amara.tree import element, text

SOURCE = '''<catalog>
  <book>
     <title>Spam for Supper</title>
     <authors>By A.X. Ham and Franco Bacon</authors>
  </book>
</catalog>'''

EXTRACT_AUTHORS_PAT = r'(\s*By\s*)|(\s*,\s*)|(\s*and\s*)'
EXTRACT_AUTHORS_PAT_GROUPS = 4

doc = amara.parse(SOURCE)
for author_node in doc.xml_select(u'/catalog/book/authors'):
    authors = re.split(EXTRACT_AUTHORS_PAT, U(author_node))
    for n in author_node.xml_children: author_node.xml_remove(n)
    #Collect the regex match into the regex-defined groups
    for i, subenum in groupby(enumerate(authors), lambda i: i[0]//EXTRACT_AUTHORS_PAT_GROUPS):
        matchgroup = [ group for i, group in subenum ]
        if matchgroup[0]:
            link = element(None, u'a')
            link.xml_attributes[None, u'href'] = 'http://example.org'
            link.xml_append(text(matchgroup[0]))
            author_node.xml_append(link)
        for match in matchgroup[1:]:
            if match:
                author_node.xml_append(text(match))

doc.xml_write()
print
Example #21
 def attachment_link(self, on, url=None, **kw):
     self._elem(u'attachment', on)
     if on:
         self._curr.xml_attributes[None, u'href'] = U(url)
     return ''
Example #22
 def anchordef(self, id):
     e = tree.element(None, u'anchor')
     self._curr.xml_append(e)
     self._curr.xml_attributes[None, u'id'] = U(id)
     return ''
Example #23
    def receive_items():
        '''
        Receives each record and processes it by creating an item
        dict which is then forwarded to the sink
        '''
        ix = 1
        while True:
            rec = yield
            recid = u'_' + str(ix)

            leader = U(rec.xml_select(u'ma:leader', prefixes=PREFIXES))
            work_item = {
                u'id': u'work' + recid,
                u'label': recid,
                #u'label': u'{0}, {1}'.format(row['TPNAML'], row['TPNAMF']),
                u'type': u'WorkRecord',
            }
            print >> sys.stderr, 'Begin processing Work: ', work_item[u'id']

            #Instance starts with same as work, with leader added
            instance_item = {
                u'leader': leader,
            }
            instance_item.update(work_item)
            instance_item[u'id'] = u'instance' + recid
            instance_item[u'type'] = u'InstanceRecord'
            work_item[u'instance'] = u'instance' + recid

            for cf in rec.xml_select(u'ma:controlfield', prefixes=PREFIXES):
                key = u'cftag_' + U(cf.xml_select(u'@tag'))
                val = U(cf)
                if list(cf.xml_select(u'ma:subfield', prefixes=PREFIXES)):
                    for sf in cf.xml_select(u'ma:subfield', prefixes=PREFIXES):
                        code = U(sf.xml_select(u'@code'))
                        sfval = U(sf)
                        #For now assume all leader fields are instance level
                        instance_item[key + code] = sfval
                else:
                    #For now assume all leader fields are instance level
                    instance_item[key] = val

            for df in rec.xml_select(u'ma:datafield', prefixes=PREFIXES):
                code = U(df.xml_select(u'@tag'))
                key = u'dftag_' + code
                val = U(df)
                #Make sure 'handled' is defined even when this datafield has no subfields
                handled = False
                if list(df.xml_select(u'ma:subfield', prefixes=PREFIXES)):
                    subfields = dict(
                        ((U(sf.xml_select(u'@code')), U(sf))
                         for sf in df.xml_select(u'ma:subfield',
                                                 prefixes=PREFIXES)))
                    lookup = code
                    #See if any of the field codes represents a reference to an object which can be materialized
                    handled = False
                    if code in MATERIALIZE:
                        (subst, extra_props) = MATERIALIZE[code]
                        props = {u'marccode': code}
                        props.update(extra_props)
                        #props.update(other_properties)
                        props.update(subfields)
                        #work_item[FIELD_RENAMINGS.get(code, code)] = subid
                        # subid = subobjs.add(props)
                        if ix < len(recs):
                            subid = subobjs.add(props)
                            objects_sink.write(",\n")
                        else:
                            subid = subobjs.add(props, last=True)

                        if code in INSTANCE_FIELDS:
                            instance_item.setdefault(subst, []).append(subid)
                        elif code in WORK_FIELDS:
                            work_item.setdefault(subst, []).append(subid)

                        handled = True

                    if code in MATERIALIZE_VIA_ANNOTATION:
                        (subst, extra_object_props, extra_annotation_props
                         ) = MATERIALIZE_VIA_ANNOTATION[code]
                        object_props = {u'marccode': code}
                        object_props.update(extra_object_props)
                        #props.update(other_properties)

                        #Separate annotation subfields from object subfields
                        object_subfields = subfields.copy()
                        annotation_subfields = {}
                        for k, v in object_subfields.items():
                            if code + k in ANNOTATIONS_FIELDS:
                                annotation_subfields[k] = v
                                del object_subfields[k]

                        object_props.update(object_subfields)
                        # objectid = subobjs.add(object_props)
                        # if ix < len(recs):
                        #    objects_sink.write(",\n")
                        if ix < len(recs):
                            objectid = subobjs.add(object_props)
                            objects_sink.write(",\n")
                        else:
                            objectid = subobjs.add(object_props, last=True)

                        annid = u'annotation' + recid
                        annotation_item = {
                            u'id': annid,
                            u'label': recid,
                            subst: objectid,
                            u'type': u'Annotation',
                            u'on_work': work_item[u'id'],
                            u'on_instance': instance_item[u'id'],
                        }
                        annotation_item.update(extra_annotation_props)
                        annotation_item.update(annotation_subfields)

                        emitter(annotation_item, annotations_sink)
                        if ix < len(recs):
                            annotations_sink.write(",\n")
                        # annotations_sink.write(annotation_item)
                        print >> sys.stderr, 'Processing annotation: ', annotation_item[
                            u'id'], "\n"

                        if code in INSTANCE_FIELDS:
                            instance_item.setdefault('annotation',
                                                     []).append(annid)
                        elif code in WORK_FIELDS:
                            work_item.setdefault('annotation',
                                                 []).append(annid)

                        #The actual subfields go to the annotations sink
                        #annotations_props = {u'annotates': instance_item[u'id']}
                        #annotations_props.update(props)
                        #subid = subobjs.add(annotations_props, annotations_sink)
                        #The reference is from the instance ID
                        #instance_item.setdefault(subst, []).append(subid)

                        handled = True

                        #work_item.setdefault(FIELD_RENAMINGS.get(code, code), []).append(subid)

                    #See if any of the field+subfield codes represents a reference to an object which can be materialized
                    if not handled:
                        for k, v in subfields.items():
                            lookup = code + k
                            if lookup in MATERIALIZE:
                                (subst, extra_props) = MATERIALIZE[lookup]
                                props = {u'marccode': code, k: v}
                                props.update(extra_props)
                                #print >> sys.stderr, lookup, k, props,
                                if ix < len(recs):
                                    subid = subobjs.add(props)
                                    objects_sink.write(",\n")
                                else:
                                    subid = subobjs.add(props, last=True)

                                if lookup in INSTANCE_FIELDS or code in INSTANCE_FIELDS:
                                    instance_item.setdefault(subst,
                                                             []).append(subid)
                                elif lookup in WORK_FIELDS or code in WORK_FIELDS:
                                    work_item.setdefault(subst,
                                                         []).append(subid)
                                handled = True

                            else:
                                field_name = u'dftag_' + lookup
                                if lookup in FIELD_RENAMINGS:
                                    field_name = FIELD_RENAMINGS[lookup]
                                #Handle the simple substitution of a label name for a MARC code
                                if lookup in INSTANCE_FIELDS or code in INSTANCE_FIELDS:
                                    instance_item.setdefault(field_name,
                                                             []).append(v)
                                elif lookup in WORK_FIELDS or code in WORK_FIELDS:
                                    work_item.setdefault(field_name,
                                                         []).append(v)

                #print >> sys.stderr, lookup, key
                elif not handled:
                    if code in INSTANCE_FIELDS:
                        instance_item[key] = val
                    elif code in WORK_FIELDS:
                        work_item[key] = val
                else:
                    if code in INSTANCE_FIELDS:
                        instance_item[key] = val
                    elif code in WORK_FIELDS:
                        work_item[key] = val

            #link = work_item.get(u'cftag_008')

            #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
            new_instances = []

            isbns = instance_item.get('isbn', [])

            def isbn_list(isbns):
                isbn_tags = {}
                for isbn in isbns:
                    parts = isbn.split(None, 1)
                    #Remove any cruft from ISBNs. Leave just the digits
                    cleaned_isbn = NON_ISBN_CHARS.subn(u'', parts[0])[0]
                    if len(parts) == 1:
                        #FIXME: More generally strip non-digit chars from ISBNs
                        isbn_tags[cleaned_isbn] = None
                    else:
                        isbn_tags[cleaned_isbn] = parts[1]
                c14ned = canonicalize_isbns(isbn_tags.keys())
                for c14nisbn, variants in invert_dict(c14ned).items():
                    #We'll use the heuristic that the longest ISBN number is the best
                    variants.sort(key=len,
                                  reverse=True)  # sort by descending length
                    yield variants[0], isbn_tags[variants[0]]
                return  # list(isbnset)

            base_instance_id = instance_item[u'id']
            instance_ids = []
            subscript = ord(u'a')
            for subix, (inum, itype) in enumerate(isbn_list(isbns)):
                #print >> sys.stderr, subix, inum, itype
                subitem = instance_item.copy()
                subitem[u'isbn'] = inum
                subitem[u'id'] = base_instance_id + (unichr(subscript + subix)
                                                     if subix else u'')
                if itype: subitem[u'isbnType'] = itype
                instance_ids.append(subitem[u'id'])
                new_instances.append(subitem)
                isbnnu_url = ISBNNU_PAT.format(inum)
                subitem[u'isbnnu'] = isbnnu_url
                #U(doc.xml_select(u'/rss/channel/item/link'))
                subitem[u'openlibcover'] = OPENLIBRARY_COVER_PAT.format(inum)
                #time.sleep(2) #Be polite!

                #instance_item[u'isbn'] = isbns[0]

            if not new_instances:
                #Make sure it's created as an instance even if it has no ISBN
                new_instances.append(instance_item)
                instance_ids.append(base_instance_id)

            work_item[u'instance'] = instance_ids

            special_properties = {}
            for k, v in process_leader(leader):
                special_properties.setdefault(k, set()).add(v)

            for k, v in process_008(instance_item[u'cftag_008']):
                special_properties.setdefault(k, set()).add(v)

            #We get some repeated values out of leader & 008 processing, and we want to
            #Remove dupes so we did so by working with sets then converting to lists
            for k, v in special_properties.items():
                special_properties[k] = list(v)

            instance_item.update(special_properties)

            #reduce lists of just one item
            for k, v in work_item.items():
                if type(v) is list and len(v) == 1:
                    work_item[k] = v[0]

            # work_sink.write(work_item)
            emitter(work_item, work_sink)
            if ix < len(recs):
                work_sink.write(",\n")

            def send_instance(instance):
                print >> sys.stderr, 'Processing instance: ', instance[u'id']
                emitter(instance, instance_sink)

            i = 0
            for ninst in new_instances:
                i += 1
                send_instance(ninst)
                if i < len(new_instances):
                    instance_sink.write(",\n")

            if ix < len(recs):
                instance_sink.write(",\n")

            print >> sys.stderr, 'Finished processing Work: ', work_item[
                u'id'], "\n"
            ix += 1

        return
Example #24
 def icon(self, type_):
     self._elem(u'icon', on)
     self._curr.xml_attributes[None, u'type'] = U(type_)
     self._elem(u'icon', off)
     return ''
Example #25
 def smiley(self, text):
     self._curr.xml_append(tree.text(U(text)))
     return ''
Example #26
 def attachment_drawing(self, url, text, **kw):
     self._elem(u'attachmentimage', on)
     self._curr.xml_attributes[None, u'href'] = U(url)
     self._curr.xml_append(tree.text(U(text)))
     self._elem(u'attachmentimage', off)
     return ''
Example #27
 def interwikilink(self, on, interwiki='', pagename='', **kw):
     self._elem(u'interwiki', on)
     if on:
         self._curr.xml_attributes[None, u'wiki'] = U(interwiki)
         self._curr.xml_attributes[None, u'pagename'] = U(pagename)
     return ''
Example #28
def from_markdown(md, output, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    output -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    No return value
    """
    #Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    #This mapping takes syntactical elements such as the various header levels in Markdown and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'):
        syntaxtypemap[u'h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'):
        syntaxtypemap[u'h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'):
        syntaxtypemap[u'h3'] = config.get('autotype-h3')
    interp = config.get('interpretations', {})
    #Map the interpretation IRIs to functions to do the data prep
    for prop, interp_key in interp.iteritems():
        if interp_key in PREP_METHODS:
            interp[prop] = PREP_METHODS[interp_key]
        else:
            #just use the identity, i.e. no-op
            interp[prop] = lambda x, **kwargs: x

    #Parse the Markdown
    h = markdown.markdown(md.decode(encoding))

    doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    #Each section contains one resource description, but the special one named @docheader contains info to help interpret the rest
    top_section_fields = results_until(
        doc.xml_select(u'//h1[1]/following-sibling::h2'), u'self::h1')

    docheader = doc.xml_select(u'//h1[.="@docheader"]')[0]
    sections = doc.xml_select(u'//h1|h2|h3[not(.="@docheader")]')

    def fields(sect):
        '''
        Each section represents a resource and contains a list of its properties.
        This generator parses the list and yields key/value pairs representing the properties.
        Some properties have attributes, expressed in Markdown as a nested list; if present, these
        attributes are yielded as well, otherwise None is yielded.
        '''
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header. This accommodates multiple lists in a section
        sect_body_items = results_until(
            sect.xml_select(u'following-sibling::*'),
            u'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select(u'following-sibling::ul') for li in ul.xml_select(u'./li') ]
        field_list = [
            li for elem in sect_body_items for li in elem.xml_select(u'li')
        ]

        def parse_pair(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(
                        _(u'Syntax error in relationship expression: {0}'.format(pair)))
                prop = matched.group(1).strip()
                val = matched.group(2).strip()
                #prop, val = [ part.strip() for part in U(li.xml_select(u'string(.)')).split(u':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val
            return None, None

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property
            if li.xml_select(u'ul'):
                main = ''.join([
                    U(node) for node in results_until(li.xml_select(u'node()'),
                                                      u'self::ul')
                ])
                #main = li.xml_select(u'string(ul/preceding-sibling::node())')
                prop, val = parse_pair(main)
                subfield_list = [sli for sli in li.xml_select(u'ul/li')]
                subfield_dict = dict(
                    [parse_pair(U(pair)) for pair in subfield_list])
                if None in subfield_dict: del subfield_dict[None]
                yield prop, val, subfield_dict
            #Just a regular, unadorned property
            else:
                prop, val = parse_pair(U(li))
                if prop: yield prop, val, None

    #Gather the document-level metadata
    base = propbase = rbase = None
    for prop, val, subfield_dict in fields(docheader):
        if prop == '@base':
            base = val
        if prop == '@property-base':
            propbase = val
        if prop == '@resource-base':
            rbase = val
    if not propbase: propbase = base
    if not rbase: rbase = base

    #Go through the resources expressed in remaining sections
    for sect in sections:
        #The header can take one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
        #The 3rd form is for an anonymous resource with specified type and the 4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(U(sect))
        if not matched:
            raise ValueError(
                _(u'Syntax error in resource header: {0}'.format(U(sect))))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid:
            rid = I(iri.absolutize(output.generate_resource(), base))
        if rtype:
            rtype = I(iri.absolutize(rtype, base))
        #Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_local)
        if rtype:
            output.add(rid, RDFTYPE, rtype)
        #Add the property
        for prop, val, subfield_dict in fields(sect):
            attrs = subfield_dict or {}
            fullprop = I(iri.absolutize(prop, propbase))
            resinfo = AB_RESOURCE_PAT.match(val)
            if resinfo:
                val = resinfo.group(1)
                valtype = resinfo.group(3)
                if not val: val = output.generate_resource()
                if valtype: attrs[RDFTYPE] = valtype
            if fullprop in interp:
                val = interp[fullprop](val,
                                       rid=rid,
                                       fullprop=fullprop,
                                       base=base,
                                       model=output)
                if val is not None: output.add(rid, fullprop, val)
            else:
                output.add(rid, fullprop, val, attrs)

    return base
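
A sketch of the call shape only, not a verified run: the model object below is a stand-in for whatever Versa model the caller supplies (anything with the add()/generate_resource() interface used above), and the Markdown text follows the @docheader-plus-one-section-per-resource convention the function expects. VERSA_MD and model are illustrative names, not part of the source.

#Call-shape sketch only; VERSA_MD and model are illustrative stand-ins
VERSA_MD = '''# @docheader

* @base: http://example.org/

# book1 [Book]

* title: Spam for Supper
* author: A.X. Ham
'''

base = from_markdown(VERSA_MD, model)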
Example #29
 def attachment_image(self, url, **kw):
     self._elem(u'attachmentimage', on)
     if on:
         self._curr.xml_attributes[None, u'href'] = U(url)
     return ''
Example #30
 def url(self, on, url='', css=None, **kw):
     self._elem(u'jump', on)
     self._curr.xml_attributes[None, u'url'] = U(url)
     if css:
         self._curr.xml_attributes[None, u'class'] = U(css)
     return ''