def _listIdentifiers(self):
    """Fetch one OAI-PMH ListIdentifiers response and accumulate results.

    Builds the request URL from self.server and self.params, fetches it,
    parses the XML via the project's BSParser, then walks the DOM to
    collect <identifier> values into self.ids.  If a <resumptionToken>
    is present, stores it in self.token (and the completeListSize, when
    available, in self.total) so the caller can continue harvesting.
    """
    s = "%sverb=ListIdentifiers&" % (self.server)
    s += urllib.urlencode(self.params)
    resp = self._fetchStream(s)
    data = resp.read()
    # self.lastResponse = resp
    # Now use existing infrastructure to parse
    doc = StringDocument(data, self.id, mimeType='text/xml')
    rec = BSParser.process_document(None, doc)
    # NOTE(review): `session` is not a parameter here -- presumably a
    # module-level name; confirm it is bound before this is called.
    dom = rec.get_dom(session)
    # Find the first element child (the OAI-PMH root)
    for top in dom.childNodes:
        if (top.nodeType == elementType):
            break
    for c in top.childNodes:
        if (c.nodeType == elementType and c.localName == 'ListIdentifiers'):
            for c2 in c.childNodes:
                if (c2.nodeType == elementType and c2.localName == 'header'):
                    for c3 in c2.childNodes:
                        if (c3.nodeType == elementType and
                                c3.localName == 'identifier'):
                            self.ids.append(getFirstData(c3))
                elif (c2.nodeType == elementType and
                        c2.localName == 'resumptionToken'):
                    t = getFirstData(c2)
                    if (t):
                        self.token = t
                        try:
                            # NOTE(review): standard DOM spells this
                            # getAttribute(); `getAttr` may be a project
                            # extension -- verify, the bare except would
                            # silently hide an AttributeError here.
                            self.total = c2.getAttr('completeListSize')
                        except:
                            pass
def find_documents(self, session, cache=0):
    """Generator over SRU search results as StringDocuments.

    cache == 0: yield each Document as it is parsed.
    cache == 2: collect Documents into self.documents instead.
    Any other cache value raises NotImplementedError.

    Pages through the remote SRU server using startRecord until
    numberOfRecords is exhausted.
    """
    # Construct SRU url, fetch, parse.
    start = 1
    docs = []
    while True:
        self.args['startRecord'] = start
        params = urllib.urlencode(self.args)
        req = urllib2.Request(url="%s%s" % (self.server, params))
        f = urllib2.urlopen(req)
        data = f.read()
        f.close()
        # subst out xmldecl -- ZSI needs the payload without a second
        # XML declaration once wrapped in the SOAP envelope below
        data = self.xmlver.sub("", data)
        soapy = '<SOAP:Envelope xmlns:SOAP="http://schemas.xmlsoap.org/soap/envelope/"><SOAP:Body>%s</SOAP:Body></SOAP:Envelope>' % data
        ps = ZSI.ParsedSoap(soapy, readerclass=reader)
        resp = ps.Parse(SRW.types.SearchRetrieveResponse)
        self.total = resp.numberOfRecords
        for d in resp.records:
            doc = StringDocument(d.recordData, mimeType='text/xml')
            if cache == 0:
                yield doc
            elif cache == 2:
                docs.append(doc)
            else:
                raise NotImplementedError
        start += len(resp.records)
        if start > self.total:
            if cache == 0:
                # NOTE(review): explicit `raise StopIteration` inside a
                # generator is the Python 2 idiom; under Python 3.7+
                # (PEP 479) it becomes a RuntimeError -- use `return`
                # when porting.
                raise StopIteration
            else:
                break
    self.documents = docs
def find_documents(self, session, cache=0):
    """Generator over SRW (SOAP binding) search results as StringDocuments.

    cache == 0: yield each Document as it is parsed.
    cache == 2: collect Documents into self.documents instead.
    Any other cache value raises NotImplementedError.

    Pages through the remote server by advancing startRecord on the
    pre-built request object in self.stream.
    """
    docs = []
    curr = 1
    while True:
        self.stream.startRecord = curr
        resp = self.binding.RPC(
            self.binding.url,
            "searchRetrieveRequest",
            self.stream,
            requestclass=SRW.types.SearchRetrieveRequest,
            replytype=SRW.types.SearchRetrieveResponse.typecode,
            readerclass=reader)
        total = resp.numberOfRecords
        curr += len(resp.records)
        for d in resp.records:
            doc = StringDocument(d.recordData, mimeType='text/xml')
            # Preserve the schema each record was returned in
            doc.recordSchema = d.recordSchema
            if cache == 0:
                yield doc
            elif cache == 2:
                docs.append(doc)
            else:
                raise NotImplementedError
        if curr > total:
            if cache == 0:
                # NOTE(review): py2 idiom; PEP 479 makes this a
                # RuntimeError in 3.7+ -- use `return` when porting.
                raise StopIteration
            else:
                break
    self.documents = docs
def create_document(self, session, doc=None):
    """Create (store) a document in this store and return it.

    If ``doc`` is None, a placeholder StringDocument is stored under a
    freshly generated id; otherwise the supplied document is stored
    under that id.

    Raises PermissionException when a create-permission handler is
    configured and the session user is missing or unauthorized.
    Re-raises ObjectAlreadyExistsException from store_document after
    backing out the id counter increment.
    """
    p = self.permissionHandlers.get('info:srw/operation/1/create', None)
    if p:
        if not session.user:
            raise PermissionException("Authenticated user required to create an object in %s" % self.id)
        okay = p.hasPermission(session, session.user)
        if not okay:
            raise PermissionException("Permission required to create an object in %s" % self.id)
    id = self.generate_id(session)
    if doc is None:
        # Create a placeholder
        # NOTE(review): the placeholder never receives doc.id = id even
        # though an id was just generated -- confirm store_document
        # assigns it, otherwise this looks like a latent bug.
        doc = StringDocument("")
    else:
        doc.id = id
    doc.documentStore = self.id
    try:
        self.store_document(session, doc)
    except ObjectAlreadyExistsException:
        # Back out id change so numeric ids stay contiguous
        if isinstance(id, long):
            self.currentId -= 1
        raise
    # (removed dead `except: raise` -- a bare catch-and-re-raise is a no-op)
    return doc
def process_record(self, session, rec):
    """Serialize an <article> record, tokenizing sentence text via genia.

    Rebuilds the article XML: head is copied verbatim; within the body,
    each <p> and <s> is re-emitted with a sequential eid attribute, and
    any text is wrapped as <txt> (escaped original) plus <toks>
    (genia-tokenized form).  Returns the result as a StringDocument.
    """
    doc = []
    doc.append('<article id="%s" date="%s">\n' % (
        rec.process_xpath(session, '/article/@id')[0],
        rec.process_xpath(session, '/article/@date')[0]))
    head = rec.process_xpath(session, '/article/head')[0]
    headstr = etree.tounicode(head)
    doc.append(headstr.encode('utf-8'))
    doc.append("\n<body>\n")
    body = rec.process_xpath(session, '/article/body')[0]
    # walk tree looking for <s> tags, and duplicate out any non s tag
    eid = 0
    for sub in body:
        if sub.tag == "p":
            # Re-open the paragraph with a fresh eid plus original attributes
            bits = ['<p eid="%s"' % eid]
            eid += 1
            for (name, val) in sub.items():
                bits.append("%s=\"%s\"" % (name, val))
            bits.append(">")
            doc.append(' '.join(bits))
            for s in sub:
                # sentences
                bits = ['<s eid="%s"' % eid]
                eid += 1
                for (name, val) in s.items():
                    bits.append("%s=\"%s\"" % (name, val))
                bits.append(">")
                doc.append(' '.join(bits))
                t = s.text
                if t:
                    try:
                        toks = self.geniafy(t)
                        ttxt = ''.join(toks)
                        val = '<txt>%s</txt><toks>%s</toks>' % (escape(t), ttxt)
                        doc.append(val.encode('utf8'))
                    except:
                        # Propagate tokenizer failures unchanged
                        raise
                doc.append("</s>")
            doc.append("</p>\n")
        elif sub.tag in ["headline", "lead"]:
            # tag headline and lead too
            doc.append('<%s>' % sub.tag)
            t = sub.text
            if t:
                try:
                    toks = self.geniafy(t)
                    ttxt = ''.join(toks)
                    val = '<txt>%s</txt><toks>%s</toks>' % (escape(t), ttxt)
                    doc.append(val.encode('utf8'))
                except:
                    raise
            doc.append('</%s>' % sub.tag)
        else:
            # just useless <br/> tags
            pass
    doc.append("\n</body>\n</article>\n")
    return StringDocument(''.join(doc))
def _process_data(self, session, id, data, preParser=None):
    """Wrap raw data in a StringDocument and run the first configured
    post-fetch step over it.

    Split from fetch record for Iterators.  Precedence of processing:
    explicit preParser argument, then self.outPreParser, then
    self.outWorkflow; otherwise the raw document is returned.
    """
    doc = StringDocument(data)
    if preParser is not None:
        doc = preParser.process_document(session, doc)
    elif self.outPreParser is not None:
        doc = self.outPreParser.process_document(session, doc)
    elif self.outWorkflow is not None:
        doc = self.outWorkflow.process(session, doc)
    # Ensure basic required info
    doc.id = id
    doc.documentStore = self.id
    return doc
def test_unicode_content(self):
    "Check Document with Unicode content returns unaltered."
    if not self.testUc:
        self.skipTest("No test Unicode available")
    inputDoc = StringDocument(self.testUc)
    resultDoc = self.testObj.process_document(self.session, inputDoc)
    # Round-trip must be the identity on Unicode content
    self.assertEqual(resultDoc.get_raw(self.session), self.testUc)
def process_record(self, session, rec):
    """Render the base SVM (label, vector) pair as one svmlight-style line.

    Output format: "<label> <feat>:<val> <feat>:<val> ...\n" with the
    features in sorted key order.
    """
    doc = BaseSVMTransformer.process_record(self, session, rec)
    (label, vector) = doc.get_raw(session)
    pairs = sorted(vector.items())
    rendered = ' '.join(["%s:%s" % pair for pair in pairs])
    return StringDocument("%s %s\n" % (label, rendered))
def process_document(self, session, doc):
    """Return a new StringDocument containing the base64-decoded content.

    Provenance (processHistory, parent, filename) is carried over from
    the input document.
    """
    encoded = doc.get_raw(session)
    decoded = b64decode(encoded)
    return StringDocument(decoded, self.id, doc.processHistory,
                          parent=doc.parent, filename=doc.filename)
def process_document(self, session, doc):
    """Return a new StringDocument containing the bzip2-decompressed content.

    Provenance (processHistory, parent, filename) is carried over from
    the input document.
    """
    compressed = doc.get_raw(session)
    plain = bz2.decompress(compressed)
    return StringDocument(plain, self.id, doc.processHistory,
                          parent=doc.parent, filename=doc.filename)
def process_document(self, session, doc):
    """Render each vector's feature keys as one sorted, space-separated line.

    Expects get_raw to return at least (labels, vectors); labels are
    read but unused.  NOTE(review): vectors with no keys are skipped
    entirely rather than emitting an empty line -- confirm against
    downstream consumers.
    """
    (labels, vectors) = doc.get_raw(session)[:2]
    txt = []
    for v in vectors:
        k = v.keys()
        if k:
            k.sort()
            txt.append(' '.join(map(str, k)))
    return StringDocument('\n'.join(txt))
def process_document(self, session, doc):
    """Unpickle the document's content and re-wrap it as a StringDocument.

    Provenance (processHistory, parent, filename) is carried over; the
    mimeType is set to 'text/pickle'.

    SECURITY NOTE(review): pickle.loads executes arbitrary code when
    given attacker-controlled data -- only use this on documents from a
    trusted source.
    """
    data = doc.get_raw(session)
    string = pickle.loads(data)
    return StringDocument(string, self.id, doc.processHistory,
                          mimeType='text/pickle', parent=doc.parent,
                          filename=doc.filename)
def accumulate(self, session, stream, format, tagName=None, codec=None,
               factory=None):
    """Append the stream's serialized XML to self.data.

    The record is round-tripped through a StringDocument (get rec into
    doc) so that what is accumulated is its raw serialized form.
    tagName, codec and factory are accepted for interface compatibility
    but unused here.
    """
    wrapper = StringDocument(stream.get_xml(session))
    self.data.append(wrapper.get_raw(session))
def process_document(self, session, doc):
    """Convert reStructuredText content to XML via docutils.

    Uses docutils' publish_string with the "xml" writer; provenance is
    carried over and the mimeType set to self.outMimeType.
    """
    source = doc.get_raw(session)
    rendered = publish_string(source, writer_name="xml")
    return StringDocument(rendered, self.id, doc.processHistory,
                          mimeType=self.outMimeType, parent=doc.parent,
                          filename=doc.filename)
def process_document(self, session, doc):
    """Run the configured normalizer over the content as a single string.

    Provenance and mimeType are carried over from the input document.
    """
    original = doc.get_raw(session)
    normalized = self.normalizer.process_string(session, original)
    return StringDocument(normalized, self.id, doc.processHistory,
                          mimeType=doc.mimeType, parent=doc.parent,
                          filename=doc.filename)
def process_document(self, session, doc):
    """XML-escape the content and wrap it in a <data> element.

    The result is flagged as 'text/xml'; provenance is carried over.
    """
    escaped = escape(doc.get_raw(session))
    wrapped = "<data>%s</data>" % escaped
    return StringDocument(wrapped, self.id, doc.processHistory,
                          mimeType='text/xml', parent=doc.parent,
                          filename=doc.filename)
def process_document(self, session, doc):
    """Parse MARC content and re-emit it as SGML.

    Provenance is carried over; mimeType becomes 'text/sgml'.
    """
    raw = doc.get_raw(session)
    record = MARC(raw)
    sgml = record.toSGML()
    return StringDocument(sgml, self.id, doc.processHistory,
                          mimeType='text/sgml', parent=doc.parent,
                          filename=doc.filename)
def process_document(self, session, doc):
    """Extract text from a PDF by shelling out to the Multivalent tool.

    Writes the PDF bytes to a temp file, runs the external java
    extractor, and returns its XML output as a StringDocument.

    Fixes: the original opened the temp path with file() while leaving
    the mkstemp file descriptor open (fd leak), never closed the popen3
    pipes, and leaked the temp file if anything raised.
    """
    (fd, fn) = tempfile.mkstemp('.pdf')
    try:
        fh = os.fdopen(fd, 'w')  # reuse the mkstemp fd instead of leaking it
        try:
            fh.write(doc.get_raw(session))
        finally:
            fh.close()
        # NOTE(review): hard-coded user-specific jar path -- should come
        # from configuration.
        cmd = "java -Djava.awt.headless=true -cp /users/azaroth/cheshire3/code/mvd/Multivalent20050929.jar tool.doc.ExtractText -output xml %s" % fn
        (i, o, err) = os.popen3(cmd)
        data = o.read()
        i.close()
        o.close()
        err.close()
    finally:
        # Remove the temp file even if extraction failed
        os.remove(fn)
    return StringDocument(data)
def __call__(self, element, rec):
    """Apply any necessary transformation to a record, and append the
    resulting XML to the elementTree.

    Returns the value of element.append (None for lxml elements).
    """
    # NOTE(review): `session` and `lxmlParser` are not parameters or
    # attributes here -- presumably module-level names; confirm they are
    # bound before this callable is invoked.
    if self.txr:
        # use transformer object
        doc = self.txr.process_record(session, rec)
    else:
        # make no assumptions about class of record
        doc = StringDocument(rec.get_xml(session))
    lxmlRec = lxmlParser.process_document(session, doc)
    dom = lxmlRec.get_dom(session)
    return element.append(dom)
def unpack_record(self, session, req):
    """Extract the record carried by an update request into a Document.

    Handles recordPacking values:
      * "string": strip any XML declaration, wrap in a StringDocument.
      * "xml": wrap recordData directly (see inline question).
      * "url": not implemented.
      * anything else: raises a Diagnostic1.

    The record's schema is attached as doc._schema.  Returns None when
    the request carries no record.
    """
    # Raw string: the original '<\?xml...' relied on unknown escapes
    # passing through, which is a DeprecationWarning/SyntaxWarning on
    # modern Pythons.
    declre = re.compile(r'<\?xml(.*?)\?>')
    if req.record:
        packing = req.record.recordPacking
        if packing == "string":
            data = req.record.recordData
            data = declre.sub('', data)
            doc = StringDocument(data)
        elif packing == "url":
            raise NotImplementedError
        elif packing == "xml":
            # Should be a DOM node, not string repr?
            doc = StringDocument(req.record.recordData)
        else:
            diag = Diagnostic1()
            raise diag
        doc._schema = req.record.recordSchema
    else:
        doc = None
    return doc
def process_document(self, session, doc):
    """Expand configured named entities, then fix stray ampersands.

    Each "&name;" occurrence found in self.entities is replaced by its
    expansion; remaining lone ampersands are handled by self.amp_re and
    self._loneAmpersand.  Provenance and mimeType carry over.
    """
    text = doc.get_raw(session)
    for (name, expansion) in self.entities.items():
        text = text.replace("&%s;" % (name), expansion)
    text = self.amp_re.sub(self._loneAmpersand, text)
    return StringDocument(text, self.id, doc.processHistory,
                          mimeType=doc.mimeType, parent=doc.parent,
                          filename=doc.filename)
def process_document(self, session, doc):
    """Decode the document's raw bytes using the configured codec.

    Raises UnicodeDecodeError if the content is not valid in
    self.codec.  (The original wrapped the decode in a try/except that
    immediately re-raised the same exception -- dead code, removed.)
    Provenance and mimeType carry over.
    """
    data = doc.get_raw(session).decode(self.codec)
    return StringDocument(data, self.id, doc.processHistory,
                          mimeType=doc.mimeType, parent=doc.parent,
                          filename=doc.filename)
def process_record(self, session, rec):
    """Build Z39.50 GRS-1 TaggedElements from the configured xpath maps.

    For each (xpath, tagPath) pair in self.maps, resolves the record
    data and emits a numeric-tagged string element (tagType 2).  The
    element list is returned wrapped in a StringDocument.
    """
    elements = []
    for (xpath, tagPath) in self.maps:
        tagged = z3950.TaggedElement()
        value = self._resolveData(session, rec, xpath)
        tagged.content = ('string', str(value))
        tagged.tagType = 2
        tagged.tagValue = ('numeric', int(tagPath))
        elements.append(tagged)
    return StringDocument(elements, self.id, rec.processHistory,
                          parent=rec.parent)
def process_record(self, session, rec):
    """Transform the record's DOM with the configured XSLT.

    Returns the transformation result serialized into a StringDocument.
    """
    # return StringDocument
    dom = rec.get_dom(session)
    if (session.environment == 'apache'):
        # Rebuild the compiled stylesheet per request under apache --
        # presumably because the compiled XSLT object cannot be shared
        # across processes/threads there (TODO confirm); outside apache
        # self.txr must already be set elsewhere.
        self.txr = etree.XSLT(self.parsedXslt)
    if self.params:
        result = self.txr(dom, **self.params)
    else:
        result = self.txr(dom)
    return StringDocument(str(result))
def process_document(self, session, doc): data = doc.get_raw(session) # This is bizarre, but otherwise: # UnicodeDecodeError: 'ascii' codec can't decode byte ... if type(data) == unicode: data = data.replace(u"\xe2\x80\x9c", u'"') data = data.replace(u"\xe2\x80\x9d", u'"') data = data.replace(u"\xe2\x80\x9e", u'"') data = data.replace(u"\xe2\x80\x93", u'-') data = data.replace(u"\xe2\x80\x98", u"'") data = data.replace(u"\xe2\x80\x99", u"'") data = data.replace(u"\xe2\x80\x9a", u",") data = data.replace(u"\x99", u"'") data = data.replace(u'\xa0', u' ') else: data = data.replace("\xe2\x80\x9c", '"') data = data.replace("\xe2\x80\x9d", '"') data = data.replace("\xe2\x80\x9e", '"') data = data.replace("\xe2\x80\x93", '-') data = data.replace("\xe2\x80\x98", "'") data = data.replace("\xe2\x80\x99", "'") data = data.replace("\xe2\x80\x9a", ",") data = data.replace("\x99", "'") data = data.replace('\xa0', ' ') data = self.nonxmlRe.sub(' ', data) if self.strip: return StringDocument(self.asciiRe.sub('', data), self.id, doc.processHistory, mimeType=doc.mimeType, parent=doc.parent, filename=doc.filename) else: fn = lambda x: "&#%s;" % ord(x.group(1)) return StringDocument(self.asciiRe.sub(fn, data), self.id, doc.processHistory, mimeType=doc.mimeType, parent=doc.parent, filename=doc.filename)
def process_document(self, session, doc):
    """Clean the content into XHTML with libtidy.

    XML declaration and tidy's generator meta tag are suppressed; no
    re-indentation is applied.  Provenance and mimeType carry over.
    """
    cleaned = tidy.parseString(doc.get_raw(session),
                               output_xhtml=1,
                               add_xml_decl=0,
                               tidy_mark=0,
                               indent=0)
    return StringDocument(str(cleaned), self.id, doc.processHistory,
                          mimeType=doc.mimeType, parent=doc.parent,
                          filename=doc.filename)
def process_document(self, session, doc):
    """Remap nested int sequences through the loaded model's lookup.

    data should be list of list of ints to map; each value is replaced
    by self.model.get(value) (None when absent, matching dict.get).
    """
    self.load_model(session)
    rows = doc.get_raw(session)
    lookup = self.model.get
    remapped = [[lookup(value) for value in row] for row in rows]
    return StringDocument(remapped)
def process_document(self, session, doc):
    """Send the content to the remote service; fall back to <error/>.

    This is deliberately best-effort: any failure from _send_request
    yields an "<error/>" document rather than propagating.  Narrowed the
    original bare `except:` to `except Exception` so KeyboardInterrupt
    and SystemExit still propagate.
    """
    data = doc.get_raw(session)
    try:
        xml = self._send_request(session, data)
    except Exception:
        xml = "<error/>"
    return StringDocument(xml, self.id, doc.processHistory,
                          mimeType='text/xml', parent=doc.parent,
                          filename=doc.filename)
def process_document(self, session, doc):
    # Must be raw text after passed through tagger
    """Feed the content line-by-line to the external tagger subprocess.

    Each input line is written to self.pipe.stdin (flushed) and exactly
    one output line is read back -- NOTE(review): this deadlocks if the
    tagger does not emit precisely one line per input line; confirm the
    tool's contract.  Also note readline() keeps its trailing newline,
    so the '\n'.join below effectively double-spaces the output --
    presumably intentional, verify downstream.
    """
    txt = doc.get_raw(session)
    lines = txt.split('\n')
    all = []
    for l in lines:
        self.pipe.stdin.write(l)
        self.pipe.stdin.write("\n")
        self.pipe.stdin.flush()
        tagd = self.pipe.stdout.readline()
        all.append(tagd)
    return StringDocument('\n'.join(all))
def process_document(self, session, doc):
    """Decompress gzip-compressed document content.

    Reads the raw bytes through a GzipFile over an in-memory buffer and
    returns the plain data as a StringDocument, carrying provenance
    over.  The original closed the buffer and GzipFile only on the
    happy path; try/finally now guarantees cleanup if read() raises
    (e.g. on truncated/corrupt input).
    """
    buff = StringIO.StringIO(doc.get_raw(session))
    try:
        zfile = gzip.GzipFile(mode='rb', fileobj=buff)
        try:
            data = zfile.read()
        finally:
            zfile.close()
    finally:
        buff.close()
    return StringDocument(data, self.id, doc.processHistory,
                          parent=doc.parent, filename=doc.filename)