Exemplo n.º 1
0
    def _listIdentifiers(self):
        """Fetch one ListIdentifiers response and harvest identifiers from it.

        The request URL is built from ``self.server`` and the url-encoded
        ``self.params`` and fetched through ``self._fetchStream``.  The
        response body is parsed with ``BSParser`` and walked as a DOM:

        * every ``header/identifier`` value is appended to ``self.ids``;
        * a non-empty ``resumptionToken`` value is stored in ``self.token``;
        * the token's ``completeListSize`` attribute, when it can be read,
          is stored in ``self.total``.
        """
        s = "%sverb=ListIdentifiers&" % (self.server)
        s += urllib.urlencode(self.params)
        resp = self._fetchStream(s)
        data = resp.read()

        # Now use existing infrastructure to parse
        doc = StringDocument(data, self.id, mimeType='text/xml')
        rec = BSParser.process_document(None, doc)
        # NOTE(review): ``session`` is not a parameter of this method;
        # presumably it is a module-level name -- confirm.
        dom = rec.get_dom(session)
        # Skip over non-element nodes to reach the first element child.
        for top in dom.childNodes:
            if top.nodeType == elementType:
                break
        for c in top.childNodes:
            if c.nodeType != elementType or c.localName != 'ListIdentifiers':
                continue
            for c2 in c.childNodes:
                if c2.nodeType != elementType:
                    continue
                if c2.localName == 'header':
                    for c3 in c2.childNodes:
                        if (c3.nodeType == elementType
                                and c3.localName == 'identifier'):
                            self.ids.append(getFirstData(c3))
                elif c2.localName == 'resumptionToken':
                    t = getFirstData(c2)
                    if t:
                        self.token = t
                    # Best effort: the size attribute is optional, so a
                    # failure to read it is ignored.  Only ordinary errors
                    # are swallowed; KeyboardInterrupt/SystemExit propagate.
                    # NOTE(review): ``getAttr`` is not the standard DOM
                    # accessor name (``getAttribute``) -- confirm the DOM
                    # implementation provides it, otherwise ``self.total``
                    # is never set here.
                    try:
                        self.total = c2.getAttr('completeListSize')
                    except Exception:
                        pass
Exemplo n.º 2
0
    def find_documents(self, session, cache=0):
        """Generate documents by paging through an SRU search response.

        Each pass sets ``self.args['startRecord']``, fetches
        ``self.server`` plus the url-encoded arguments, strips the XML
        declaration with ``self.xmlver``, wraps the payload in a SOAP
        envelope and parses it as an SRW ``SearchRetrieveResponse``.
        ``self.total`` is updated from ``numberOfRecords`` on every page.

        This is a generator function, so nothing happens until it is
        iterated.

        :param session: not used by this method.
        :param cache: ``0`` yields each document as it is built; ``2``
            collects the documents and stores them in ``self.documents``
            once paging is finished.
        :raises NotImplementedError: for any other ``cache`` value, when
            the first record is processed.
        """
        # Construct SRU url, fetch, parse.
        start = 1
        docs = []
        while True:
            self.args['startRecord'] = start
            params = urllib.urlencode(self.args)
            req = urllib2.Request(url="%s%s" % (self.server, params))
            f = urllib2.urlopen(req)
            try:
                data = f.read()
            finally:
                # Release the connection even if reading fails.
                f.close()
            # subst out xmldecl
            data = self.xmlver.sub("", data)
            soapy = '<SOAP:Envelope xmlns:SOAP="http://schemas.xmlsoap.org/soap/envelope/"><SOAP:Body>%s</SOAP:Body></SOAP:Envelope>' % data
            ps = ZSI.ParsedSoap(soapy, readerclass=reader)
            resp = ps.Parse(SRW.types.SearchRetrieveResponse)

            self.total = resp.numberOfRecords
            records = resp.records
            for d in records:
                doc = StringDocument(d.recordData, mimeType='text/xml')
                if cache == 0:
                    yield doc
                elif cache == 2:
                    docs.append(doc)
                else:
                    raise NotImplementedError
            start += len(records)
            # An empty page would never advance ``start``; treat it as the
            # end of the result set instead of re-requesting it forever.
            if start > self.total or not records:
                if cache == 0:
                    # Plain return ends the generator; raising
                    # StopIteration here is an error under PEP 479.
                    return
                break
        self.documents = docs
Exemplo n.º 3
0
 def find_documents(self, session, cache=0):
     """Generate documents by paging through an SRW searchRetrieve result.

     Each pass sets ``self.stream.startRecord`` and issues a
     ``searchRetrieveRequest`` RPC through ``self.binding``.  Every
     returned record becomes a ``StringDocument`` carrying the record's
     ``recordSchema``.

     This is a generator function, so nothing happens until it is
     iterated.

     :param session: not used by this method.
     :param cache: ``0`` yields each document as it is built; ``2``
         collects the documents and stores them in ``self.documents``
         once paging is finished.
     :raises NotImplementedError: for any other ``cache`` value, when
         the first record is processed.
     """
     docs = []
     curr = 1
     while True:
         self.stream.startRecord = curr
         resp = self.binding.RPC(
             self.binding.url,
             "searchRetrieveRequest",
             self.stream,
             requestclass=SRW.types.SearchRetrieveRequest,
             replytype=SRW.types.SearchRetrieveResponse.typecode,
             readerclass=reader)
         total = resp.numberOfRecords
         records = resp.records
         curr += len(records)
         for d in records:
             doc = StringDocument(d.recordData, mimeType='text/xml')
             doc.recordSchema = d.recordSchema
             if cache == 0:
                 yield doc
             elif cache == 2:
                 docs.append(doc)
             else:
                 raise NotImplementedError
         # An empty page would never advance ``curr``; treat it as the
         # end of the result set instead of re-requesting it forever.
         if curr > total or not records:
             if cache == 0:
                 # Plain return ends the generator; raising
                 # StopIteration here is an error under PEP 479.
                 return
             break
     self.documents = docs
Exemplo n.º 4
0
    def create_document(self, session, doc=None):
        """Store a document under a newly generated id and return it.

        If a handler is registered for ``info:srw/operation/1/create`` in
        ``self.permissionHandlers``, the session must have a user and that
        user must pass the handler's ``hasPermission`` check.

        :param session: session whose ``user`` is checked for permission.
        :param doc: document to store; when ``None`` an empty
            ``StringDocument`` placeholder is created instead.
        :returns: the stored document, with ``documentStore`` set to
            ``self.id``.
        :raises PermissionException: no user, or permission refused.
        :raises ObjectAlreadyExistsException: re-raised from
            ``store_document`` after backing out the id counter.
        """
        p = self.permissionHandlers.get('info:srw/operation/1/create', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to create an object in %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to create an object in %s" % self.id)

        newId = self.generate_id(session)
        if doc is None:
            # Create a placeholder
            # NOTE(review): the placeholder is not given the generated id
            # (only a caller-supplied doc is) -- confirm this is intended.
            doc = StringDocument("")
        else:
            doc.id = newId
        doc.documentStore = self.id

        try:
            self.store_document(session, doc)
        except ObjectAlreadyExistsException:
            # Back out id change
            if isinstance(newId, long):
                self.currentId -= 1
            raise
        return doc
Exemplo n.º 5
0
 def process_record(self, session, rec):
     """Rebuild an article record as XML text with tagged sentences.

     The ``/article`` id and date attributes and the serialized
     ``/article/head`` element are copied over.  Inside ``/article/body``
     each ``p`` element and each of its children is re-emitted with a
     running ``eid`` attribute; child text, and the text of ``headline``
     and ``lead`` elements, is written as ``<txt>`` (escaped original)
     plus ``<toks>`` (joined output of ``self.geniafy``).  Any other body
     child is dropped.

     Returns a ``StringDocument`` holding the concatenated output.
     """
     def open_tag(template, number, elem):
         # Opening tag: the eid first, then the element's own attributes,
         # all joined with single spaces (including before the '>').
         parts = [template % number]
         for (name, val) in elem.items():
             parts.append("%s=\"%s\"" % (name, val))
         parts.append(">")
         return ' '.join(parts)

     def tagged(text):
         # Original text next to its tagger output, as UTF-8 bytes.
         toks = self.geniafy(text)
         ttxt = ''.join(toks)
         val = '<txt>%s</txt><toks>%s</toks>' % (escape(text), ttxt)
         return val.encode('utf8')

     out = ['<article id="%s" date="%s">\n' %
            (rec.process_xpath(session, '/article/@id')[0],
             rec.process_xpath(session, '/article/@date')[0])]
     head = rec.process_xpath(session, '/article/head')[0]
     out.append(etree.tounicode(head).encode('utf-8'))
     out.append("\n<body>\n")
     body = rec.process_xpath(session, '/article/body')[0]

     # eid is shared by paragraphs and sentences and only ever increases.
     eid = 0
     for child in body:
         tag = child.tag
         if tag == "p":
             out.append(open_tag('<p eid="%s"', eid, child))
             eid += 1
             for sent in child:
                 # every child of a paragraph is treated as a sentence
                 out.append(open_tag('<s eid="%s"', eid, sent))
                 eid += 1
                 text = sent.text
                 if text:
                     out.append(tagged(text))
                 out.append("</s>")
             out.append("</p>\n")
         elif tag in ["headline", "lead"]:
             # tag headline and lead too
             out.append('<%s>' % tag)
             text = child.text
             if text:
                 out.append(tagged(text))
             out.append('</%s>' % tag)
         # anything else (the original author notes <br/> tags) is skipped
     out.append("\n</body>\n</article>\n")
     return StringDocument(''.join(out))
Exemplo n.º 6
0
 def _process_data(self, session, id, data, preParser=None):
     """Wrap raw *data* in a document and run at most one output processor.

     The first available of these is applied, in order: the *preParser*
     argument, ``self.outPreParser``, ``self.outWorkflow``.  With none of
     them the plain ``StringDocument`` is used.  The result is stamped
     with *id* and with this object's id as its ``documentStore``.
     """
     # Split from fetch record for Iterators
     doc = StringDocument(data)
     if preParser is not None:
         doc = preParser.process_document(session, doc)
     elif self.outPreParser is not None:
         doc = self.outPreParser.process_document(session, doc)
     elif self.outWorkflow is not None:
         doc = self.outWorkflow.process(session, doc)
     # Ensure basic required info
     doc.id = id
     doc.documentStore = self.id
     return doc
Exemplo n.º 7
0
 def test_unicode_content(self):
     "Check Document with Unicode content returns unaltered."
     expected = self.testUc
     if not expected:
         self.skipTest("No test Unicode available")
     # Push the Unicode sample through the object under test and compare
     # the raw content of the result with the input.
     processed = self.testObj.process_document(self.session,
                                               StringDocument(expected))
     self.assertEqual(processed.get_raw(self.session), expected)
Exemplo n.º 8
0
 def process_record(self, session, rec):
     """Render the base transformer's output as one ``key:value`` text line.

     The raw content of the document produced by
     ``BaseSVMTransformer.process_record`` is unpacked as a pair; the
     second member's items are sorted and written as space-separated
     ``key:value`` pairs after the first member, ending in a newline.
     """
     baseDoc = BaseSVMTransformer.process_record(self, session, rec)
     (label, vector) = baseDoc.get_raw(session)
     pairs = sorted(vector.items())
     rendered = ' '.join(["%s:%s" % tuple(pair) for pair in pairs])
     return StringDocument("%s %s\n" % (label, rendered))
Exemplo n.º 9
0
 def process_document(self, session, doc):
     """Return a new document holding *doc*'s raw content after b64decode.

     The new document records this object's id and inherits *doc*'s
     process history, parent and filename.
     """
     decoded = b64decode(doc.get_raw(session))
     return StringDocument(
         decoded,
         self.id,
         doc.processHistory,
         parent=doc.parent,
         filename=doc.filename)
Exemplo n.º 10
0
 def process_document(self, session, doc):
     """Return a new document holding *doc*'s bz2-decompressed content.

     The new document records this object's id and inherits *doc*'s
     process history, parent and filename.
     """
     inflated = bz2.decompress(doc.get_raw(session))
     return StringDocument(
         inflated,
         self.id,
         doc.processHistory,
         parent=doc.parent,
         filename=doc.filename)
Exemplo n.º 11
0
 def process_document(self, session, doc):
     """Write the sorted keys of each vector as one line of text.

     The first two members of *doc*'s raw content are unpacked; only the
     second is used.  Each of its entries contributes a line holding its
     keys, sorted and space-separated.  Entries without keys are skipped.
     """
     (_labels, vectors) = doc.get_raw(session)[:2]
     lines = []
     for vector in vectors:
         keys = sorted(vector.keys())
         if not keys:
             continue
         lines.append(' '.join(map(str, keys)))
     return StringDocument('\n'.join(lines))
Exemplo n.º 12
0
 def process_document(self, session, doc):
     """Return a new document holding the unpickled content of *doc*.

     The new document is given the mime type ``text/pickle``, records
     this object's id and inherits *doc*'s process history, parent and
     filename.
     """
     # NOTE(review): pickle.loads can execute arbitrary code -- confirm
     # that documents reaching this method come from a trusted source.
     unpickled = pickle.loads(doc.get_raw(session))
     return StringDocument(
         unpickled,
         self.id,
         doc.processHistory,
         mimeType='text/pickle',
         parent=doc.parent,
         filename=doc.filename)
Exemplo n.º 13
0
 def accumulate(self,
                session,
                stream,
                format,
                tagName=None,
                codec=None,
                factory=None):
     """Append the XML of *stream* to ``self.data``.

     The XML is wrapped in a ``StringDocument`` and read back through
     ``get_raw``.  ``format``, ``tagName``, ``codec`` and ``factory`` are
     accepted but not used here.
     """
     xml = stream.get_xml(session)
     wrapped = StringDocument(xml)  # get rec into doc
     self.data.append(wrapped.get_raw(session))
Exemplo n.º 14
0
 def process_document(self, session, doc):
     """Convert *doc*'s raw content with ``publish_string`` (xml writer).

     The new document uses ``self.outMimeType``, records this object's id
     and inherits *doc*'s process history, parent and filename.
     """
     source = doc.get_raw(session)
     published = publish_string(source, writer_name="xml")
     return StringDocument(
         published,
         self.id,
         doc.processHistory,
         mimeType=self.outMimeType,
         parent=doc.parent,
         filename=doc.filename)
Exemplo n.º 15
0
 def process_document(self, session, doc):
     """Run *doc*'s raw content through ``self.normalizer``.

     The new document keeps *doc*'s mime type, parent and filename,
     inherits its process history and records this object's id.
     """
     normalized = self.normalizer.process_string(session,
                                                 doc.get_raw(session))
     return StringDocument(
         normalized,
         self.id,
         doc.processHistory,
         mimeType=doc.mimeType,
         parent=doc.parent,
         filename=doc.filename)
Exemplo n.º 16
0
 def process_document(self, session, doc):
     """Escape *doc*'s raw content and wrap it in a ``<data>`` element.

     The new document has mime type ``text/xml``, records this object's
     id and inherits *doc*'s process history, parent and filename.
     """
     escaped = escape(doc.get_raw(session))
     wrapped = ''.join(("<data>", escaped, "</data>"))
     return StringDocument(
         wrapped,
         self.id,
         doc.processHistory,
         mimeType='text/xml',
         parent=doc.parent,
         filename=doc.filename)
Exemplo n.º 17
0
 def process_document(self, session, doc):
     """Build a ``MARC`` object from *doc*'s raw content and emit its SGML.

     The new document has mime type ``text/sgml``, records this object's
     id and inherits *doc*'s process history, parent and filename.
     """
     marc = MARC(doc.get_raw(session))
     sgml = marc.toSGML()
     return StringDocument(
         sgml,
         self.id,
         doc.processHistory,
         mimeType='text/sgml',
         parent=doc.parent,
         filename=doc.filename)
Exemplo n.º 18
0
 def process_document(self, session, doc):
     """Extract text from *doc* by running an external java tool on it.

     The raw content is written to a temporary ``.pdf`` file, the
     ``tool.doc.ExtractText`` command is run on that file with
     ``-output xml``, and whatever the command writes to standard output
     becomes the content of the returned ``StringDocument``.

     The temporary file is always removed, including when writing or
     running the command fails.
     """
     # Local import so this method does not depend on a file-level import.
     import subprocess

     (fd, fn) = tempfile.mkstemp('.pdf')
     try:
         # Reuse the descriptor mkstemp opened (so it gets closed rather
         # than leaked) and write in binary mode.
         fh = os.fdopen(fd, 'wb')
         try:
             fh.write(doc.get_raw(session))
         finally:
             fh.close()
         cmd = "java -Djava.awt.headless=true -cp /users/azaroth/cheshire3/code/mvd/Multivalent20050929.jar tool.doc.ExtractText -output xml %s" % fn
         # The command is still run through the shell, as os.popen3 did;
         # the only interpolated value is the locally generated temp name.
         proc = subprocess.Popen(cmd,
                                 shell=True,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
         # communicate() drains stdout and stderr together and closes all
         # three pipes, so a chatty stderr cannot block the child.
         (data, errData) = proc.communicate()
     finally:
         os.remove(fn)
     return StringDocument(data)
Exemplo n.º 19
0
 def __call__(self, element, rec):
     """Append the XML of *rec*, optionally transformed, to *element*.

     When ``self.txr`` is set, the record is first passed through its
     ``process_record``; otherwise the record's own XML is used.  The
     resulting document is parsed with ``lxmlParser`` and its DOM is
     appended to *element*.  Returns whatever ``element.append`` returns.
     """
     # NOTE(review): ``session`` is not a parameter of this method;
     # presumably it is a module-level name -- confirm.
     if not self.txr:
         # make no assumptions about class of record
         doc = StringDocument(rec.get_xml(session))
     else:
         # use transformer object
         doc = self.txr.process_record(session, rec)
     dom = lxmlParser.process_document(session, doc).get_dom(session)
     return element.append(dom)
Exemplo n.º 20
0
def unpack_record(self, session, req):
    """Turn the record carried by a request into a ``StringDocument``.

    :param session: not used by this function.
    :param req: request object; its ``record`` attribute, when truthy,
        supplies ``recordPacking``, ``recordData`` and ``recordSchema``.
    :returns: ``None`` when the request has no record, otherwise a
        ``StringDocument`` whose ``_schema`` is the record's schema.
        For ``string`` packing any XML declaration is removed first.
    :raises NotImplementedError: for ``url`` packing.
    :raises Diagnostic1: for any other unrecognized packing.
    """
    record = req.record
    if not record:
        return None

    packing = record.recordPacking
    if packing == "string":
        # Raw string keeps the backslashes for the regex engine; only
        # compiled on the one path that needs it.
        declre = re.compile(r'<\?xml(.*?)\?>')
        doc = StringDocument(declre.sub('', record.recordData))
    elif packing == "url":
        raise NotImplementedError
    elif packing == "xml":
        # Should be a DOM node, not string repr?
        doc = StringDocument(record.recordData)
    else:
        raise Diagnostic1()
    doc._schema = record.recordSchema
    return doc
Exemplo n.º 21
0
 def process_document(self, session, doc):
     """Replace named entities and lone ampersands in *doc*'s content.

     Every ``&name;`` reference whose name is a key of ``self.entities``
     is replaced by the mapped value, then ``self.amp_re`` matches are
     rewritten by ``self._loneAmpersand``.  The new document keeps
     *doc*'s mime type, parent and filename, inherits its process
     history and records this object's id.
     """
     text = doc.get_raw(session)
     for (name, replacement) in self.entities.items():
         text = text.replace("&%s;" % (name), replacement)
     text = self.amp_re.sub(self._loneAmpersand, text)
     return StringDocument(
         text,
         self.id,
         doc.processHistory,
         mimeType=doc.mimeType,
         parent=doc.parent,
         filename=doc.filename)
Exemplo n.º 22
0
 def process_document(self, session, doc):
     """Decode *doc*'s raw content using ``self.codec``.

     The new document keeps *doc*'s mime type, parent and filename,
     inherits its process history and records this object's id.

     :raises UnicodeDecodeError: when the content cannot be decoded;
         the error propagates unchanged with its original traceback.
     """
     data = doc.get_raw(session).decode(self.codec)
     return StringDocument(data,
                           self.id,
                           doc.processHistory,
                           mimeType=doc.mimeType,
                           parent=doc.parent,
                           filename=doc.filename)
Exemplo n.º 23
0
 def process_record(self, session, rec):
     """Build one ``z3950.TaggedElement`` per entry of ``self.maps``.

     Each map entry is an ``(xpath, tagPath)`` pair: the data resolved
     for the xpath becomes the element's string content and the tag path
     its numeric tag value, with tag type 2.  The list of elements is
     returned as the content of a ``StringDocument``.
     """
     elems = []
     for (xpath, tagPath) in self.maps:
         node = z3950.TaggedElement()
         resolved = self._resolveData(session, rec, xpath)
         node.content = ('string', str(resolved))
         node.tagType = 2
         node.tagValue = ('numeric', int(tagPath))
         elems.append(node)
     return StringDocument(elems,
                           self.id,
                           rec.processHistory,
                           parent=rec.parent)
Exemplo n.º 24
0
    def process_record(self, session, rec):
        """Apply ``self.txr`` to the record's DOM and return the text result.

        ``self.params``, when non-empty, is passed to the transform as
        keyword arguments.  The stringified result is returned in a
        ``StringDocument``.
        """
        dom = rec.get_dom(session)
        if session.environment == 'apache':
            # Under apache the transform is rebuilt from the parsed
            # stylesheet on every call.
            self.txr = etree.XSLT(self.parsedXslt)

        kwargs = self.params if self.params else {}
        transformed = self.txr(dom, **kwargs)
        return StringDocument(str(transformed))
Exemplo n.º 25
0
    def process_document(self, session, doc):
        """Replace typographic byte sequences and handle ``asciiRe`` matches.

        A fixed list of sequences is replaced by plain quotes, dashes,
        commas and spaces, ``self.nonxmlRe`` matches become spaces, and
        ``self.asciiRe`` matches are either removed (``self.strip`` true)
        or turned into numeric character references.  The new document
        keeps *doc*'s mime type, parent and filename, inherits its
        process history and records this object's id.
        """
        data = doc.get_raw(session)
        # Two parallel tables so the patterns always have the same string
        # type as the data; the original author notes that otherwise:
        # UnicodeDecodeError: 'ascii' codec can't decode byte ...
        # NOTE(review): the unicode table lists the same escapes as the
        # byte table, i.e. separate code points rather than the decoded
        # punctuation characters -- confirm that is intended.
        if type(data) is unicode:
            swaps = (
                (u"\xe2\x80\x9c", u'&quot;'),
                (u"\xe2\x80\x9d", u'&quot;'),
                (u"\xe2\x80\x9e", u'&quot;'),
                (u"\xe2\x80\x93", u'-'),
                (u"\xe2\x80\x98", u"'"),
                (u"\xe2\x80\x99", u"'"),
                (u"\xe2\x80\x9a", u","),
                (u"\x99", u"'"),
                (u'\xa0', u' '),
            )
        else:
            swaps = (
                ("\xe2\x80\x9c", '&quot;'),
                ("\xe2\x80\x9d", '&quot;'),
                ("\xe2\x80\x9e", '&quot;'),
                ("\xe2\x80\x93", '-'),
                ("\xe2\x80\x98", "'"),
                ("\xe2\x80\x99", "'"),
                ("\xe2\x80\x9a", ","),
                ("\x99", "'"),
                ('\xa0', ' '),
            )
        for (old, new) in swaps:
            data = data.replace(old, new)

        data = self.nonxmlRe.sub(' ', data)

        if self.strip:
            # Drop the matched characters altogether.
            replacement = ''
        else:
            def replacement(match):
                # Numeric character reference for the matched character.
                return "&#%s;" % ord(match.group(1))

        return StringDocument(self.asciiRe.sub(replacement, data),
                              self.id,
                              doc.processHistory,
                              mimeType=doc.mimeType,
                              parent=doc.parent,
                              filename=doc.filename)
Exemplo n.º 26
0
 def process_document(self, session, doc):
     """Clean up *doc*'s raw content with ``tidy.parseString``.

     Tidy is called with XHTML output switched on and the XML
     declaration, tidy mark and indentation switched off.  The new
     document keeps *doc*'s mime type, parent and filename, inherits its
     process history and records this object's id.
     """
     tidied = tidy.parseString(
         doc.get_raw(session),
         output_xhtml=1,
         add_xml_decl=0,
         tidy_mark=0,
         indent=0)
     return StringDocument(
         str(tidied),
         self.id,
         doc.processHistory,
         mimeType=doc.mimeType,
         parent=doc.parent,
         filename=doc.filename)
Exemplo n.º 27
0
 def process_document(self, session, doc):
     """Map every item of *doc*'s nested content through ``self.model``.

     The model is loaded first.  The raw content is iterated as a
     sequence of sequences and each inner item is replaced by
     ``self.model.get(item)``.  The nested list of results is returned
     as the content of a ``StringDocument``.
     """
     self.load_model(session)
     rows = doc.get_raw(session)
     # data should be list of list of ints to map
     lookup = self.model.get
     mapped = [[lookup(item) for item in row] for row in rows]
     return StringDocument(mapped)
Exemplo n.º 28
0
 def process_document(self, session, doc):
     """Send *doc*'s raw content with ``self._send_request``.

     The reply becomes the content of the new document.  If the request
     fails with an ordinary error, the content is ``<error/>`` instead;
     KeyboardInterrupt and SystemExit are not intercepted.  The new
     document has mime type ``text/xml``, records this object's id and
     inherits *doc*'s process history, parent and filename.
     """
     data = doc.get_raw(session)
     try:
         xml = self._send_request(session, data)
     except Exception:
         # Best effort: a failed request yields an error marker rather
         # than aborting the caller.
         xml = "<error/>"
     return StringDocument(xml,
                           self.id,
                           doc.processHistory,
                           mimeType='text/xml',
                           parent=doc.parent,
                           filename=doc.filename)
Exemplo n.º 29
0
 def process_document(self, session, doc):
     """Feed *doc*'s content line by line through ``self.pipe``.

     Each input line is written to the pipe's stdin followed by a
     newline and flushed, then one line is read back from its stdout.
     The replies are joined with newlines into a ``StringDocument``.
     """
     # Must be raw text after passed through tagger
     text = doc.get_raw(session)
     feed = self.pipe.stdin
     answers = self.pipe.stdout
     replies = []
     for line in text.split('\n'):
         feed.write(line)
         feed.write("\n")
         feed.flush()
         # one reply line is expected for every line sent
         replies.append(answers.readline())
     return StringDocument('\n'.join(replies))
Exemplo n.º 30
0
 def process_document(self, session, doc):
     """Return a new document holding *doc*'s gunzipped content.

     The raw content is read through ``gzip.GzipFile`` from an in-memory
     buffer.  The new document records this object's id and inherits
     *doc*'s process history, parent and filename.
     """
     compressed = StringIO.StringIO(doc.get_raw(session))
     unzipper = gzip.GzipFile(mode='rb', fileobj=compressed)
     data = unzipper.read()
     # Close the reader before the buffer it wraps.
     unzipper.close()
     compressed.close()
     return StringDocument(
         data,
         self.id,
         doc.processHistory,
         parent=doc.parent,
         filename=doc.filename)