Exemplo n.º 1
0
 def find_documents(self, session, cache=0):
     """Page through an SRW searchRetrieve result set via the ZSI binding.

     cache=0: yield each record as a StringDocument.
     cache=2: collect the Documents into self.documents.
     Any other cache value raises NotImplementedError.
     """
     docs = []
     curr = 1
     while True:
         self.stream.startRecord = curr
         resp = self.binding.RPC(
             self.binding.url,
             "searchRetrieveRequest",
             self.stream,
             requestclass=SRW.types.SearchRetrieveRequest,
             replytype=SRW.types.SearchRetrieveResponse.typecode,
             readerclass=reader)
         total = resp.numberOfRecords
         curr += len(resp.records)
         for d in resp.records:
             doc = StringDocument(d.recordData, mimeType='text/xml')
             doc.recordSchema = d.recordSchema
             if cache == 0:
                 yield doc
             elif cache == 2:
                 docs.append(doc)
             else:
                 raise NotImplementedError
         if curr > total:
             if cache == 0:
                 # 'return' ends the generator cleanly; raising
                 # StopIteration here would become a RuntimeError
                 # under PEP 479 (Python 3.7+).
                 return
             else:
                 break
     self.documents = docs
Exemplo n.º 2
0
 def _getRecord(self):
     """Yield a Document per cached OAI identifier via GetRecord requests.

     For each id in self.idcache, fetches the OAI-PMH GetRecord
     response, parses it, and yields the first element found under
     GetRecord/record/metadata as a StringDocument.
     """
     for oaiid in self.idcache:
         s = "%sverb=GetRecord&%s" % (
             self.server,
             urllib.urlencode({
                 'metadataPrefix': self.metadataPrefix,
                 'identifier': oaiid
             }))
         resp = self._fetchStream(s)
         data = resp.read()
         doc = StringDocument(data, self.id, mimeType='text/xml')
         rec = BSParser.process_document(None, doc)
         # NOTE(review): 'session' is not defined in this scope --
         # presumably a module-level global; confirm.
         dom = rec.get_dom(session)
         # Locate the first element child (the OAI-PMH root element).
         for top in dom.childNodes:
             if top.nodeType == elementType:
                 break
         # Descend GetRecord -> record -> metadata -> first element.
         for c in top.childNodes:
             if (c.nodeType == elementType and c.localName == 'GetRecord'):
                 for c2 in c.childNodes:
                     if (c2.nodeType == elementType
                             and c2.localName == 'record'):
                         for c3 in c2.childNodes:
                             if (c3.nodeType == elementType
                                     and c3.localName == 'metadata'):
                                 for c4 in c3.childNodes:
                                     if (c4.nodeType == elementType):
                                         data = c4.toxml()
                                         yield StringDocument(
                                             data,
                                             self.id,
                                             mimeType='text/xml')
                                         break
                         break
                 break
     # Plain 'return' ends the generator; 'raise StopIteration' would
     # escape as RuntimeError under PEP 479 (Python 3.7+).
     return
Exemplo n.º 3
0
 def save_collocates(self, collocates, id):
     """Pickle collocates, store them under id, and return id."""
     pickled = Pickle.dumps(collocates)
     document = StringDocument(pickled)
     document.id = id
     store = self.collStore
     store.store_document(self.session, document)
     store.commit_storing(self.session)
     return id
Exemplo n.º 4
0
    def process_document(self, session, doc):
        """Mine frequent itemsets from doc with the Java AprioriTFP tool.

        Writes doc's raw data to a temp file, runs AprioriTFPapp from
        self.filePath, and returns a StringDocument of
        (frequency, itemset-string) tuples sorted most-frequent first,
        or of the raw tool output when no itemsets were found.
        """
        # write out our temp file
        # NOTE(review): the raw fd 'qq' from mkstemp is never closed and
        # the temp file is never deleted -- apparent resource leak.
        (qq, infn) = tempfile.mkstemp(".tfp")
        fh = file(infn, 'w')  # Python 2 built-in file()
        fh.write(doc.get_raw(session))
        fh.close()

        # go to TFP directory and run
        # (the Java tool must run from its own directory, hence the
        # chdir dance; 'commands' is Python-2-only)
        o = os.getcwd()
        os.chdir(self.filePath)
        results = commands.getoutput(
            "%s -Xms%sm -Xmx%sm AprioriTFPapp -F../%s -S%s -C%s" %
            (self.java, self.memory, self.memory, infn, self.support,
             self.confidence))
        os.chdir(o)

        # process results
        resultLines = results.split('\n')
        matches = []
        for l in resultLines:
            m = freqRe.search(l)
            if m:
                # NOTE: 'set' shadows the builtin here; left unchanged.
                (set, freq) = m.groups()
                matches.append((int(freq), set))

        if not matches:
            # no FIS for some reason, return results??
            return StringDocument(results)
        matches.sort(reverse=True)
        return StringDocument(matches)
Exemplo n.º 5
0
    def process_document(self, session, doc):
        """Run the 'apriori' command-line miner over doc's raw data.

        Writes doc to a temp file, invokes apriori (with a confidence
        threshold when self.confidence > 0, which also produces rules),
        parses frequent itemsets and rules from the output file, and
        returns a StringDocument of [matches, rules] -- or of
        [raw-output, []] when no itemsets were found.
        """
        # write out our temp file
        # NOTE(review): raw fds returned by mkstemp are never closed.
        (qq, infn) = tempfile.mkstemp(".arm")
        fh = file(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()

        if self.absSupport:
            # Convert an absolute support count into a percentage of
            # the number of input rows.
            t = len(doc.get_raw(session).split('\n'))
            self.support = (float(self.absSupport) / float(t)) * 100

        (qq, outfn) = tempfile.mkstemp(".txt")
        # go to directory and run
        o = os.getcwd()
        #os.chdir(self.filePath)

        if self.confidence > 0:
            cmd = "apriori %s %s %f %s" % (infn, outfn, self.support / 100,
                                           self.confidence / 100)
        else:
            cmd = "apriori %s %s %s" % (infn, outfn, self.support / 100)
        results = commands.getoutput(cmd)

        #os.chdir(o)

        inh = file(outfn)
        fis = self.fisre
        rule = self.rulere
        singleItems = self.singleItems
        matches = []
        rules = []
        for line in inh:
            # matching line looks like N N N (N)
            # rules look like N N ==> N (f, N)
            m = fis.match(line)
            if m:
                # NOTE: 'set' shadows the builtin; left unchanged.
                (set, freq) = m.groups()
                if singleItems or set.find(' ') > -1:
                    matches.append((int(freq), set))
            elif self.confidence > 0:
                m = rule.match(line)
                if m:
                    (ante, conc, conf, supp) = m.groups()
                    al = map(int, ante.split(' '))
                    cl = map(int, conc.split(' '))
                    rules.append((float(conf), int(supp), al, cl))
        inh.close()
        # delete temp files!
        os.remove(outfn)
        os.remove(infn)

        if not matches:
            # no FIS for some reason, return results??
            return StringDocument([results, []])

        matches.sort(reverse=True)
        rules.sort(reverse=True)
        # NOTE(review): the chdir calls above are commented out, so this
        # restores the directory we never left -- effectively a no-op.
        os.chdir(o)
        doc = StringDocument([matches, rules])
        return doc
Exemplo n.º 6
0
 def fetch_document(self, session, id):
     """Fetch the Document stored under id, enforcing retrieve permission.

     Raises PermissionException when access is denied,
     ObjectDeletedException for deleted objects, and
     ObjectDoesNotExistException when nothing is stored under id.
     """
     handler = self.permissionHandlers.get(
         'info:srw/operation/2/retrieve', None)
     if handler:
         if not session.user:
             raise PermissionException(
                 "Authenticated user required to retrieve an object "
                 "from %s" % self.id)
         if not handler.hasPermission(session, session.user):
             raise PermissionException(
                 "Permission required to retrieve an object from "
                 "%s" % self.id)
     data = self.fetch_data(session, id)
     if data:
         doc = StringDocument(data)
         # Run any configured outbound preparser or workflow.
         if self.outPreParser is not None:
             doc = self.outPreParser.process_document(session, doc)
         elif self.outWorkflow is not None:
             doc = self.outWorkflow.process(session, doc)
         doc.id = id
         doc.documentStore = self.id
         doc.parent = ('document', self.id, id)
         return doc
     elif isinstance(data, DeletedObject):
         raise ObjectDeletedException(data)
     else:
         raise ObjectDoesNotExistException(id)
Exemplo n.º 7
0
 def create_document(self, session, doc=None):
     """Store a new document under a freshly generated id and return it.

     Raises PermissionException when session.user lacks create rights.
     Re-raises ObjectAlreadyExistsException after rolling back a
     numeric id allocation.
     """
     p = self.permissionHandlers.get('info:srw/operation/1/create', None)
     if p:
         if not session.user:
             msg = ("Authenticated user required to create an object in "
                    "%s" % self.id)
             raise PermissionException(msg)
         okay = p.hasPermission(session, session.user)
         if not okay:
             msg = "Permission required to create an object in %s" % self.id
             raise PermissionException(msg)
     id = self.generate_id(session)
     if (doc is None):
         # Create a placeholder
         # NOTE(review): the placeholder never receives the generated
         # id (only the else branch sets doc.id) -- confirm intended.
         doc = StringDocument("")
     else:
         doc.id = id
     doc.documentStore = self.id
     try:
         self.store_document(session, doc)
     except ObjectAlreadyExistsException:
         # Back out id change
         if type(id) == long:
             self.currentId -= 1
         raise
     # (Removed a redundant bare 'except: raise' clause -- it re-raised
     # every exception unchanged and so had no effect.)
     return doc
Exemplo n.º 8
0
 def process_record(self, session, record):
     """Return a Document containing the XML of this component's parent.

     Reads the parent pointer ("recStoreId/recordId") from the
     component record, resolves the owning RecordStore through the
     current Database, and wraps the parent record's XML in a
     StringDocument whose id is the parent record's id.
     """
     # Get RecordStore and identifier of parent record
     try:
         parentId = record.process_xpath(session, '/c3component/@parent')[0]
     except IndexError:
         # Fall back to the namespaced form of the component schema.
         parentId = record.process_xpath(
             session,
             '/c3:component/@c3:parent',
             maps={'c3': "http://www.cheshire3.org/schemas/component/"})[0]
     recStoreId, parentId = parentId.split('/', 1)
     # Get RecordStore object
     # Resolution order: own parent Database, then the parent Server's
     # current database, then the session's server + database.
     if isinstance(self.parent, Database):
         db = self.parent
     elif isinstance(self.parent, Server) and session.database:
         db = self.parent.get_object(session, session.database)
     elif (session.server and isinstance(session.server, Server)
           and session.database):
         db = session.server.get_object(session, session.database)
     elif not session.server:
         raise ValueError("No session.server")
     else:
         raise ValueError("No session.database")
     recStore = db.get_object(session, recStoreId)
     # Fetch parent record
     parentRec = recStore.fetch_record(session, parentId)
     # Return a new Document with parent data and identifier
     data = parentRec.get_xml(session)
     doc = StringDocument(data, self.id, byteCount=len(data), byteOffset=0)
     doc.id = parentId
     return doc
Exemplo n.º 9
0
 def find_documents(self, session, cache=0):
     """Page through an SRW searchRetrieve result set.

     cache=0 yields each record as a StringDocument; cache=2 collects
     them into self.documents; any other value raises
     NotImplementedError.
     """
     docs = []
     curr = 1
     while True:
         self.stream.startRecord = curr
         resp = self.binding.RPC(self.binding.url,
                                 "searchRetrieveRequest",
                                 self.stream,
                                 requestclass=SRW.types.SearchRetrieveRequest,
                                 replytype=SRW.types.SearchRetrieveResponse.typecode,
                                 readerclass=reader)
         total = resp.numberOfRecords
         curr += len(resp.records)
         for d in resp.records:
             doc = StringDocument(d.recordData, mimeType='text/xml')
             doc.recordSchema = d.recordSchema
             if cache == 0:
                 yield doc
             elif cache == 2:
                 docs.append(doc)
             else:
                 raise NotImplementedError
         if curr > total:
             if cache == 0:
                 # 'return' ends the generator cleanly; raising
                 # StopIteration here would become a RuntimeError
                 # under PEP 479 (Python 3.7+).
                 return
             else:
                 break
     self.documents = docs
Exemplo n.º 10
0
 def create_document(self, session, doc=None):
     """Store a new document under a freshly generated id and return it.

     Raises PermissionException when session.user lacks create rights.
     Re-raises ObjectAlreadyExistsException after rolling back a
     numeric id allocation.
     """
     p = self.permissionHandlers.get('info:srw/operation/1/create', None)
     if p:
         if not session.user:
             msg = ("Authenticated user required to create an object in "
                    "%s" % self.id)
             raise PermissionException(msg)
         okay = p.hasPermission(session, session.user)
         if not okay:
             msg = "Permission required to create an object in %s" % self.id
             raise PermissionException(msg)
     id = self.generate_id(session)
     if (doc is None):
         # Create a placeholder
         # NOTE(review): the placeholder never receives the generated
         # id (only the else branch sets doc.id) -- confirm intended.
         doc = StringDocument("")
     else:
         doc.id = id
     doc.documentStore = self.id
     try:
         self.store_document(session, doc)
     except ObjectAlreadyExistsException:
         # Back out id change
         if type(id) == long:
             self.currentId -= 1
         raise
     # (Removed a redundant bare 'except: raise' clause -- it re-raised
     # every exception unchanged and so had no effect.)
     return doc
Exemplo n.º 11
0
 def find_documents(self, session, cache=0):
     """Split a MARC byte stream on the 0x1D record terminator.

     cache=0: yield each record as a StringDocument.
     cache=1: record (byte offset, length) pairs in self.locations.
     cache=2: collect whole Documents in self.documents.
     Also sets self.length to the number of records found.
     """
     docs = []
     locs = []
     data = self.stream.read(1536)
     myTell = 0
     while data:
         # 0x1D is the ISO 2709 / MARC record terminator.
         rt = data.find("\x1D")
         while (rt > -1):
             txt = data[:rt + 1]
             tlen = len(txt)
             if cache == 0:
                 yield StringDocument(txt, mimeType="application/marc")
             elif cache == 1:
                 locs.append((myTell, tlen))
             elif cache == 2:
                 docs.append(
                     StringDocument(txt, mimeType="application/marc"))
             data = data[rt + 1:]
             myTell += tlen
             rt = data.find("\x1D")
         dlen = len(data)
         data += self.stream.read(1536)
         if (len(data) == dlen):
             # Junk at end of file
             # (no new bytes arrived and no terminator remains: stop)
             data = ""
     self.stream.close()
     self.locations = locs
     self.documents = docs
     self.length = max(len(locs), len(docs))
Exemplo n.º 12
0
 def fetch_document(self, session, id):
     """Fetch the Document stored under id, enforcing retrieve permission.

     Raises PermissionException when access is denied,
     ObjectDeletedException for deleted objects, and
     ObjectDoesNotExistException when nothing is stored under id.
     """
     handler = self.permissionHandlers.get(
         'info:srw/operation/2/retrieve', None)
     if handler:
         if not session.user:
             raise PermissionException(
                 "Authenticated user required to retrieve an object "
                 "from %s" % self.id)
         if not handler.hasPermission(session, session.user):
             raise PermissionException(
                 "Permission required to retrieve an object from "
                 "%s" % self.id)
     data = self.fetch_data(session, id)
     if data:
         doc = StringDocument(data)
         # Run any configured outbound preparser or workflow.
         if self.outPreParser is not None:
             doc = self.outPreParser.process_document(session, doc)
         elif self.outWorkflow is not None:
             doc = self.outWorkflow.process(session, doc)
         doc.id = id
         doc.documentStore = self.id
         doc.parent = ('document', self.id, id)
         return doc
     elif isinstance(data, DeletedObject):
         raise ObjectDeletedException(data)
     else:
         raise ObjectDoesNotExistException(id)
def directoryDocumentStoreIter(store):
    """Yield a StringDocument for every item in a directory-backed store."""
    session = Session()
    for identifier, raw in directoryStoreIter(store):
        document = StringDocument(raw)
        document.id = identifier
        internal = store._normalizeIdentifier(session, identifier)
        document.filename = store._getFilePath(session, internal)
        yield document
Exemplo n.º 14
0
    def process_record(self, session, record):
        u"""Apply Workflow to the Record, return the resulting Document."""
        result = self.workflow.process(session, record)
        # Normalize the workflow's output into a Document.
        if isinstance(result, Record):
            result = StringDocument(result.get_xml(session))
        elif isinstance(result, basestring):
            result = StringDocument(result)
        return result
 def process_record(self, session, record):
     u"""Apply Workflow to the Record, return the resulting Document."""
     processed = self.workflow.process(session, record)
     # Wrap string or Record output in a Document; pass anything
     # else (already a Document) straight through.
     if isinstance(processed, basestring):
         return StringDocument(processed)
     if isinstance(processed, Record):
         return StringDocument(processed.get_xml(session))
     return processed
Exemplo n.º 16
0
 def find_documents(self, session, cache=0):
     """Produce the whole stream as one Document: yield it (cache=0)
     or store it as self.documents (cache=2)."""
     if cache == 0:
         yield StringDocument(self.stream.read(),
                              filename=self.streamLocation)
     elif cache == 2:
         document = StringDocument(self.stream.read(),
                                   filename=self.streamLocation)
         self.documents = [document]
Exemplo n.º 17
0
 def accumulate(self, session, stream, format,
                tagName=None, codec=None, factory=None):
     """Append the XML serialization of stream (a Record) to self.data."""
     # Round-trip through a Document to obtain the record's raw XML.
     wrapped = StringDocument(stream.get_xml(session))
     self.data.append(wrapped.get_raw(session))
Exemplo n.º 18
0
 def _parse_upload(self, data, interface='admin'):
     """Parse uploaded data into a record, with HTML error reporting.

     On a parse failure, re-normalizes whitespace so reported line
     numbers are accurate and retries; if parsing still fails, returns
     an HTML fragment highlighting the reported error position instead
     of a record.
     NOTE(review): 'session', 'ppFlow', 'docParser' and 'html_encode'
     are not defined in this scope -- presumably module globals.
     """
     if (type(data) == unicode):
         # Best-effort encode to bytes for the parser.
         try: data = data.encode('utf-8')
         except:
             try: data = data.encode('utf-16')
             except: pass # hope for the best!
     doc = StringDocument(data)
     del data
     doc = ppFlow.process(session, doc)
     try:
         rec = docParser.process_document(session, doc)
     except:
         newlineRe = re.compile('(\s\s+)')
         doc.text = newlineRe.sub('\n\g<1>', doc.get_raw(session))
         # repeat parse with correct line numbers
         try:
             rec = docParser.process_document(session, doc)
         except:
             self.htmlTitle.append('Error')
             e = sys.exc_info()
             self.logger.log('*** %s: %s' % (repr(e[0]), e[1]))
             # try and highlight error in specified place
             lines = doc.get_raw(session).split('\n')
             # Parser errors carry either ':line:col:' or
             # 'line N, column N' -- try both patterns.
             positionRe = re.compile(':(\d+):(\d+):')
             mo = positionRe.search(str(e[1]))
             if (mo is None):
                 positionRe = re.compile('line (\d+), column (\d+)')
                 mo = positionRe.search(str(e[1]))
             # NOTE(review): if neither pattern matched, mo is still
             # None and the next line raises AttributeError.
             line, posn = lines[int(mo.group(1))-1], int(mo.group(2))
             try:
                 startspace = newlineRe.match(line).group(0)
             except:
                 if interface=='admin':
                     link = '<a href="files.html">Back to file page</a>'
                 else :
                     link = '<a href="edit.html">Back to edit/create menu</a>'
                 return '''<div id="single"><p class="error">An error occured while parsing your file. 
     Please check the file is a valid ead file and try again.</p><p>%s</p></div>''' % link
             else:
                 if interface=='admin':
                     link = '<a href="files.html">Back to file page</a>'
                 else :
                     link = '<a href="edit.html">Back to edit/create menu</a>'
                 return '''\
         <div id="single"><p class="error">An error occured while parsing your file. 
         Please check the file at the suggested location and try again.</p>
         <code>%s: %s</code>
         <pre>
         %s
         <span class="error">%s</span>
         </pre>
         <p>%s</p></div>
                 ''' % (html_encode(repr(e[0])), e[1], html_encode(line[:posn+20]) + '...',  startspace + str('-'*(posn-len(startspace))) +'^', link)

     del doc
     return rec
Exemplo n.º 19
0
 def find_documents(self, session, cache=0):
     """Produce one Document per term in self.stream.

     cache=0 yields one StringDocument per item; cache=2 collects
     them into self.documents instead.
     """
     # step through terms
     if cache == 0:
         for k in self.stream:
             yield StringDocument(k)
         # 'return' rather than 'raise StopIteration': PEP 479 turns a
         # StopIteration raised inside a generator into RuntimeError.
         return
     elif cache == 2:
         documents = []
         for k in self.stream:
             documents.append(StringDocument(k))
         self.documents = documents
Exemplo n.º 20
0
 def find_documents(self, session, cache=0):
     """Search self.stream with self.query; yield or cache XML Documents."""
     hits = self.stream.search(self.query)
     collected = []
     for hit in hits:
         xml = self._toXml(hit)
         if cache == 0:
             yield StringDocument(xml)
         elif cache == 2:
             collected.append(StringDocument(xml))
         else:
             raise NotImplementedError
     self.documents = collected
Exemplo n.º 21
0
 def _processFile(self, session, item):
     """Classify item by mimetype and wrap it as a stream or a Document.

     Returns ('stream', DocumentStream) for container formats
     (XML/SGML, tar, zip, MARC), ('document', StringDocument) for
     plain files, or None when the name fails self.filterRe.
     """
     name = self._fetchName(item)
     if self.filterRe:
         m = self.filterRe.search(name)
         if not m:
             return None
     # guess_type returns (type, encoding); encoding is e.g. 'gzip'.
     mimetype = mimetypes.guess_type(name, 0)
     if (mimetype[0] in [
             'text/sgml', 'text/xml', 'application/sgml', 'application/xml'
     ]):
         if mimetype[1] == 'gzip':
             raise NotImplementedError(
                 'XML files compressed using gzip are not yet supported. You could try using zip.'
             )
         trip = ('stream', XmlDocumentStream, 'xml')
     elif (mimetype[0] == 'application/x-tar'):
         if mimetype[1] == 'gzip':
             trip = ('stream', TarDocumentStream, 'tar.gz')
         elif mimetype[1] == 'bzip2':
             trip = ('stream', TarDocumentStream, 'tar.bz2')
         else:
             trip = ('stream', TarDocumentStream, 'tar')
     elif (mimetype[0] == 'application/zip'):
         trip = ('stream', ZipDocumentStream, 'zip')
     elif (mimetype[0] == 'application/marc'):
         trip = ('stream', MarcDocumentStream, 'marc')
     else:
         # Unknown type: treat as XML when extracting by tag name,
         # otherwise hand back the raw bytes as a single document.
         if self.tagName is not None:
             trip = ('stream', XmlDocumentStream, 'xml')
         else:
             trip = ('document', None, mimetype[0])
     s = self._fetchStream(item)
     if trip[0] == 'stream':
         cls = trip[1]
         nstream = cls(session,
                       s,
                       format=trip[2],
                       tagName=self.tagName,
                       codec=self.codec,
                       factory=self.factory)
         # copy streamLocation in to copy to document
         nstream.streamLocation = item
         return ('stream', nstream)
     elif trip[0] == 'document':
         data = s.read()
         s.close()
         doc = StringDocument(data, mimeType=trip[2], filename=name)
         if mimetype[1]:
             doc.compression = mimetype[1]
         return ('document', doc)
Exemplo n.º 22
0
 def find_documents(self, session, cache=0):
     """Produce a Document per feed entry (or per linked item).

     When the factory's 'linkedItem' setting is non-zero, the entry's
     link target is fetched instead of the entry's own XML.
     """
     collected = []
     fetchLinked = self.factory.get_setting(session, 'linkedItem', 0)
     for entry in self.stream.entries:
         if fetchLinked == 0:
             data = self._toXml(entry)
         else:
             data = self._fetchStream(entry.link).read()
         if cache == 0:
             yield StringDocument(data)
         elif cache == 2:
             collected.append(StringDocument(data))
         else:
             raise NotImplementedError
     self.documents = collected
Exemplo n.º 23
0
    def process_document(self, session, doc):
        """Renumber sparse vector attributes into a dense contiguous range.

        doc's raw data is (labels, vectors) where each vector is a dict
        of {attribute_id: value}.  Maps the attribute ids actually used
        onto consecutive ids starting at self.offset, pickles that
        mapping to disk for later lookup, and returns a StringDocument
        of (labels, remapped_vectors, n_attributes).
        """
        (labels, vectors) = doc.get_raw(session)

        # find max attr
        # NOTE: 'all' shadows the builtin within this method.
        all = {}
        for v in vectors:
            all.update(v)
        keys = all.keys()
        keys.sort()
        # NOTE(review): maxattr is computed but never used below.
        maxattr = keys[-1]
        nattrs = len(keys)

        # remap vectors to reduced space
        renumbers = range(self.offset, nattrs + self.offset)
        renumberhash = dict(zip(keys, renumbers))
        newvectors = []
        for vec in vectors:
            new = {}
            for (k, v) in vec.items():
                new[renumberhash[k]] = v
            newvectors.append(new)

        # pickle renumberhash
        # Persisted so later stages can translate ids back/forward.
        pick = cPickle.dumps(renumberhash)
        filename = self.get_path(session, 'modelPath', None)
        if not filename:
            dfp = self.get_path(session, 'defaultPath')
            filename = os.path.join(dfp, self.id + "_ATTRHASH.pickle")
        f = file(filename, 'w')
        f.write(pick)
        f.close()

        return StringDocument((labels, newvectors, nattrs))
Exemplo n.º 24
0
 def setUp(self):
     """Prepare fixtures: a unicode input Document and its processed output."""
     PreParserTestCase.setUp(self)
     self.testUc = self._get_testUnicode()
     if not self.testUc:
         # Nothing to process for this test class.
         return
     self.inDoc = StringDocument(self.testUc)
     self.outDoc = self.testObj.process_document(self.session, self.inDoc)
Exemplo n.º 25
0
    def find_documents(self, session, cache=0):
        """Page through an SRU endpoint, yielding/caching record Documents.

        Builds searchRetrieve URLs from self.server and self.args,
        wraps each response in a SOAP envelope so ZSI can parse it,
        and produces a StringDocument per returned record.
        """
        # Construct SRU url, fetch, parse.
        start = 1
        docs = []
        while True:
            self.args['startRecord'] = start
            params = urllib.urlencode(self.args)
            req = urllib2.Request(url="%s%s" % (self.server, params))
            f = urllib2.urlopen(req)
            data = f.read()
            f.close()
            # subst out xmldecl
            data = self.xmlver.sub("", data)
            soapy = '<SOAP:Envelope xmlns:SOAP="http://schemas.xmlsoap.org/soap/envelope/"><SOAP:Body>%s</SOAP:Body></SOAP:Envelope>' % data
            ps = ZSI.ParsedSoap(soapy, readerclass=reader)
            resp = ps.Parse(SRW.types.SearchRetrieveResponse)

            self.total = resp.numberOfRecords
            for d in resp.records:
                doc = StringDocument(d.recordData, mimeType='text/xml')
                if cache == 0:
                    yield doc
                elif cache == 2:
                    docs.append(doc)
                else:
                    raise NotImplementedError
            start += len(resp.records)
            if start > self.total:
                if cache == 0:
                    # 'return' ends the generator cleanly; raising
                    # StopIteration here would become a RuntimeError
                    # under PEP 479 (Python 3.7+).
                    return
                else:
                    break
        self.documents = docs
Exemplo n.º 26
0
    def process_document(self, session, doc):
        """Normalize near-SGML text into something an XML parser accepts.

        Flattens newlines, removes control-character references and the
        DOCTYPE, expands known entities, repairs ampersands via the
        class's regex helpers, lowercases element names, self-closes
        known empty tags, and strips processing instructions.
        """
        txt = doc.get_raw(session)
        txt = txt.replace('\n', ' ')
        txt = txt.replace('\r', ' ')
        # Character references for control chars 9-13 become spaces.
        for x in range(9, 14):
            txt = txt.replace('&#%d;' % (x), ' ')
        txt = self.doctype_re.sub('', txt)
        for e in self.entities.keys():
            txt = txt.replace("&%s;" % (e), self.entities[e])
        # Presumably escapes ampersands not part of an entity -- the
        # exact behavior lives in self.amp_re/_loneAmpersand.
        txt = self.amp_re.sub(self._loneAmpersand, txt)
        txt = txt.replace('&<', '&amp;<')
        txt = self.attr_re.sub(self._attributeFix, txt)
        txt = self.elem_re.sub(self._lowerElement, txt)
        for t in self.emptyTags:
            # Rewrite e.g. '<br>' forms of known empty tags.
            empty_re = re.compile('<(%s( [^>/]+)?)[\s/]*>' % t)
            txt = empty_re.sub(self._emptyElement, txt)
        # strip processing instructions.
        txt = self.pi_re.sub('', txt)

        return StringDocument(txt,
                              self.id,
                              doc.processHistory,
                              mimeType=doc.mimeType,
                              parent=doc.parent,
                              filename=doc.filename)
Exemplo n.º 27
0
 def process_document(self, session, doc):
     """Strip an HTML document down to its title and body text.

     Scripts, stylesheets and comments are removed, tags are replaced
     by spaces, angle brackets and non-breaking spaces are escaped or
     dropped, and whitespace is collapsed.  The result is wrapped in a
     minimal <html><head>...<body>... shell.
     """
     raw = self.script.sub('', doc.get_raw(session))
     raw = self.style.sub('', raw)
     raw = self.comment.sub('', raw)
     titleMatch = self.title.search(raw)
     title = titleMatch.group(0) if titleMatch else ""
     bodyMatch = self.body.search(raw)
     body = bodyMatch.group(0) if bodyMatch else raw
     # Replace tags with spaces, then neutralize leftover brackets.
     text = self.tagstrip.sub(' ', body)
     text = text.replace('<', '&lt;').replace('>', '&gt;')
     text = text.replace("&nbsp;", ' ').replace("&nbsp", ' ')
     # Collapse all runs of whitespace to single spaces.
     text = ' '.join(text.split())
     data = "<html><head>%s</head><body>%s</body></html>" % (title, text)
     return StringDocument(data,
                           self.id,
                           doc.processHistory,
                           mimeType=doc.mimeType,
                           parent=doc.parent,
                           filename=doc.filename)
Exemplo n.º 28
0
    def process_document(self, session, doc):
        """Normalize character entities in doc's raw text.

        Replaces representable entities with literal characters, fixes
        common misspellings, maps known entities to numeric character
        references, adds the missing '#' to bare numeric entities,
        rewrites fraction-style entities, and strips whatever invalid
        entities remain.
        """
        txt = doc.get_raw(session)
        # Replace entities that can be represented with simple chars
        for (fromEnt, toEnt) in self.inane.iteritems():
            txt = txt.replace("&%s;" % fromEnt, toEnt)
        # Fix some common mistakes
        for (fromEnt, toEnt) in self.preEntities.iteritems():
            txt = txt.replace("&%s;" % fromEnt, "&%s;" % toEnt)
        # Fix straight forward entites
        # NOTE: each entity maps to code point 160 + its position, so
        # the ordering of self.entities is significant.
        for (s, enty) in enumerate(self.entities):
            txt = txt.replace("&%s;" % enty, "&#%s;" % (160 + s))
        # Fix additional random entities
        for (fent, totxt) in self.otherEntities.iteritems():
            txt = txt.replace("&%s;" % fent, "&%s;" % totxt)
        # Add missing # in &123;

        def hashed(mo):
            return '&#%s;' % mo.group(1)

        txt = self.numericalEntRe.sub(hashed, txt)

        # Fix made up fraction entities. (?)

        def fraction(mo):
            # &#8260; is the fraction slash character.
            return '%s&#8260;%s' % (mo.group(1), mo.group(2))

        txt = self.fractionRe.sub(fraction, txt)
        # Kill remaining invalid character entities
        txt = self.invalidRe.sub('', txt)
        return StringDocument(txt,
                              self.id,
                              doc.processHistory,
                              mimeType=doc.mimeType,
                              parent=doc.parent,
                              filename=doc.filename)
Exemplo n.º 29
0
 def process_record(self, session, rec):
     """Re-serialize an article Record, tokenizing its sentence text.

     Copies the <head> through, then rewrites <body> paragraphs and
     sentences (plus headline/lead) with sequential eid attributes,
     embedding both the escaped original text and its self.geniafy()
     tokenization as <txt>/<toks> pairs.
     """
     doc = []
     # Fixed: the original passed two arguments to list.append() and
     # applied '%' to an empty string (both TypeErrors); format the
     # opening <article> tag and append it as one string.
     doc.append(
         '<article id="%s" date="%s">\n' % (
             rec.process_xpath(session, '/article/@id')[0],
             rec.process_xpath(session, '/article/@date')[0]))
     head = rec.process_xpath(session, '/article/head')[0]
     headstr = etree.tounicode(head)
     doc.append(headstr.encode('utf-8'))
     doc.append("\n<body>\n")
     body = rec.process_xpath(session, '/article/body')[0]
     # walk tree looking for <s> tags, and duplicate out any non s tag
     eid = 0
     for sub in body:
         if sub.tag == "p":
             bits = ['<p eid="%s"' % eid]
             eid += 1
             for (name, val) in sub.items():
                 bits.append("%s=\"%s\"" % (name, val))
             bits.append(">")
             doc.append(' '.join(bits))
             for s in sub:
                 # sentences
                 bits = ['<s eid="%s"' % eid]
                 eid += 1
                 for (name, val) in s.items():
                     bits.append("%s=\"%s\"" % (name, val))
                 bits.append(">")
                 doc.append(' '.join(bits))
                 t = s.text
                 if t:
                     try:
                         toks = self.geniafy(t)
                         ttxt = ''.join(toks)
                         val = '<txt>%s</txt><toks>%s</toks>' % (escape(t),
                                                                 ttxt)
                         doc.append(val.encode('utf8'))
                     except:
                         raise
                 doc.append("</s>")
             doc.append("</p>\n")
         elif sub.tag in ["headline", "lead"]:
             # tag headline and lead too
             doc.append('<%s>' % sub.tag)
             t = sub.text
             if t:
                 try:
                     toks = self.geniafy(t)
                     ttxt = ''.join(toks)
                     val = '<txt>%s</txt><toks>%s</toks>' % (escape(t),
                                                             ttxt)
                     doc.append(val.encode('utf8'))
                 except:
                     raise
             doc.append('</%s>' % sub.tag)
         else:
             # just useless <br/> tags
             pass
     doc.append("\n</body>\n</article>\n")
     return StringDocument(''.join(doc))
Exemplo n.º 30
0
    def _listIdentifiers(self):
        """Harvest one page of OAI-PMH ListIdentifiers results.

        Appends each <identifier> value to self.ids, records any
        <resumptionToken> in self.token, and stores completeListSize in
        self.total when the token carries it.
        NOTE(review): 'session' is not defined in this scope --
        presumably a module-level global; confirm.
        """
        s = "%sverb=ListIdentifiers&" % (self.server)
        s += urllib.urlencode(self.params)
        resp = self._fetchStream(s)
        data = resp.read()

        # self.lastResponse = resp
        # Now use existing infrastructure to parse
        doc = StringDocument(data, self.id, mimeType='text/xml')
        rec = BSParser.process_document(None, doc)
        dom = rec.get_dom(session)
        # First element child is the OAI-PMH root element.
        for top in dom.childNodes:
            if (top.nodeType == elementType):
                break
        for c in top.childNodes:
            if (c.nodeType == elementType
                    and c.localName == 'ListIdentifiers'):
                for c2 in c.childNodes:
                    if (c2.nodeType == elementType
                            and c2.localName == 'header'):
                        for c3 in c2.childNodes:
                            if (c3.nodeType == elementType
                                    and c3.localName == 'identifier'):
                                self.ids.append(getFirstData(c3))
                    elif (c2.nodeType == elementType
                          and c2.localName == 'resumptionToken'):
                        t = getFirstData(c2)
                        if (t):
                            self.token = t
                        try:
                            # Fixed: DOM elements expose getAttribute(),
                            # not getAttr(); the old call always raised
                            # AttributeError, so self.total stayed unset.
                            self.total = c2.getAttribute(
                                'completeListSize')
                        except Exception:
                            # completeListSize is optional; best effort.
                            pass
Exemplo n.º 31
0
 def test_process_document_returnProcessHistory(self):
     "Check that returned Record has parser in history."
     # Every parsed Record should carry exactly one history entry:
     # the identifier of the parser that produced it.
     for data in self._get_data():
         doc = StringDocument(data)
         record = self.testObj.process_document(self.session, doc)
         history = record.processHistory
         self.assertEqual(len(history), 1)
         self.assertEqual(history[0], self.testObj.id)
Exemplo n.º 32
0
 def process_document(self, session, doc):
     """Return a new Document holding the base64 encoding of doc's data."""
     encoded = binascii.b2a_base64(doc.get_raw(session))
     return StringDocument(encoded,
                           self.id,
                           doc.processHistory,
                           parent=doc.parent,
                           filename=doc.filename)
Exemplo n.º 33
0
 def process_record(self, session, rec):
     """Build a tagged XML document from a Record.

     Elements matched by self.copyElems are copied through verbatim;
     the text of elements matched by self.tagElems is run through
     self.tag() and re-wrapped in the original element (or in <text>
     when the match starts with a text node).
     """
     doc = []
     for c in self.copyElems:
         res = rec.process_xpath(session, c[0], c[1])
         for match in res:
             txt = rec.get_xml(session, match)
             doc.append(txt)
     for t in self.tagElems:
         res = rec.process_xpath(session, t[0], t[1])
         for match in res:
             # Process all text nodes together
             # (SAX-style events: '3'-prefixed entries are text)
             totag = []
             for event in match:
                 if event[0] == '3':
                     totag.append(event[1:])
             tagtxt = ''.join(totag)
             tagged = self.tag(session, tagtxt)
             tagged = ''.join(tagged)
             if match[0][0] != '3':
                 (name, attrhash) = rec._convert_elem(match[0])
                 attrs = []
                 for a in attrhash:
                     # Fixed: 'attribs' was an undefined name (NameError
                     # whenever the element had attributes); the values
                     # live in attrhash.
                     attrs.append('%s="%s"' % (a, attrhash[a]))
                 attribtxt = ' '.join(attrs)
                 if (attribtxt):
                     attribtxt = " " + attribtxt
                 txt = "<%s%s>%s</%s>" % (name, attribtxt, tagged, name)
             else:
                 txt = "<text>%s</text>" % (tagged)
             doc.append(txt)
     doctxt = "<record>%s</record>" % '\n'.join(doc)
     strdoc = StringDocument(doctxt, self.id, rec.processHistory,
                             'text/xml')
     return strdoc
Exemplo n.º 34
0
 def find_documents(self, session, cache=0):
     """Yield (or cache) a single Document read from the iRODS stream.

     cache == 0: yield the Document; cache == 2: store it on
     self.documents.  Other cache levels are not implemented, matching
     the other DocumentFactory implementations in this codebase.
     """
     # Read in the single file before closing the connection
     doc = StringDocument(self.stream.read(), filename=self.stream.getName())
     # Attach any iRODS user metadata
     umd = self.stream.getUserMetadata()
     self.stream.close()
     self.cxn.disconnect()
     md = {}
     for x in umd:
         md[x[0]] = icatValToPy(x[1], x[2])
     if md:
         doc.metadata['iRODS'] = md
     if cache == 0:
         yield doc
     elif cache == 2:
         self.documents = [doc]
     else:
         # BUGFIX: previously an unsupported cache level silently did
         # nothing; fail loudly like the sibling factories do
         raise NotImplementedError
Exemplo n.º 35
0
 def process_record(self, session, rec):
     """Serialize a GraphRecord's graph and return it as a Document."""
     if not isinstance(rec, GraphRecord):
         raise NotImplementedError("Can only transform GraphRecords")
     # Serialization format is configurable; defaults to XML
     serialization = self.get_setting(session, 'format', 'xml')
     return StringDocument(rec.graph.serialize(format=serialization))
Exemplo n.º 36
0
 def process_document(self, session, doc):
     """Append doc as a new file entry in its METS wrapper and return it.

     The returned Document carries the re-serialized METS, with the
     original Document's history, parent and filename preserved.
     """
     global METS_NAMESPACES
     mets = self._get_metsWrapper(doc)
     objid = mets.get("OBJID")
     # New files are appended to the first fileGrp inside the fileSec
     fileGrp = mets.xpath('/mets:mets/mets:fileSec/mets:fileGrp[1]',
                          namespaces=METS_NAMESPACES)[0]
     label = mets.attrib.get("LABEL", "file0001")
     fileElem = self._get_metsFile('/'.join([objid, label]),
                                   doc.get_raw(session),
                                   doc.byteCount,
                                   doc.mimeType)
     fileGrp.append(fileElem)
     # Record the modification in the METS header
     mets.attrib['LASTMODDATE'] = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
     # Serialize and wrap back up as a Document
     data = etree.tostring(mets, pretty_print=True)
     return StringDocument(data,
                           self.id,
                           doc.processHistory,
                           self.outMimeType,
                           parent=doc.parent,
                           filename=doc.filename,
                           byteCount=len(data),
                           byteOffset=0)
Exemplo n.º 37
0
 def process_document(self, session, doc):
     """Tag the document's text and wrap the result in a <text> element."""
     raw = doc.get_raw(session)
     taggedLines = self.tag(session, raw, xml=1)
     wrapped = "<text>%s</text>" % '\n'.join(taggedLines)
     return StringDocument(wrapped, self.id, doc.processHistory, 'text/xml',
                           doc.parent)
Exemplo n.º 38
0
 def process_record(self, session, rec):
     """SAXify rec through this transformer, returning the result Document.

     Enforces the transform permission when a handler is configured.
     Records that cannot saxify themselves are round-tripped through
     the server's SaxParser first.
     """
     handler = self.permissionHandlers.get('info:srw/operation/2/transform',
                                           None)
     if handler is not None:
         if not session.user:
             raise PermissionException(
                 "Authenticated user required to transform using %s"
                 % self.id)
         if not handler.hasPermission(session, session.user):
             raise PermissionException(
                 "Permission required to transform using %s" % self.id)
     self.initState()
     try:
         rec.saxify(session, self)
     except AttributeError:
         # No native saxify support: re-parse the XML serialization
         saxp = session.server.get_object(session, 'SaxParser')
         saxRec = saxp.process_document(
             session, StringDocument(rec.get_xml(session)))
         saxRec.saxify(session, self)
     return StringDocument(self.top, self.id, rec.processHistory,
                           parent=rec.parent)
Exemplo n.º 39
0
def unpack_record(self, session, req):
    """Extract the record carried by an SRW request into a Document.

    Handles 'string' and 'xml' record packings; 'url' is not yet
    implemented and any other packing raises a Diagnostic1.  Returns
    None when the request carries no record.
    """
    # BUGFIX: raw string — '\?' is an invalid escape in a plain string
    # literal and warns on modern Pythons
    declre = re.compile(r'<\?xml(.*?)\?>')
    if not req.record:
        return None
    packing = req.record.recordPacking
    if packing == "string":
        # Strip any XML declaration before wrapping the data
        data = declre.sub('', req.record.recordData)
        doc = StringDocument(data)
    elif packing == "url":
        raise NotImplementedError
    elif packing == "xml":
        # Should be a DOM node, not string repr?
        doc = StringDocument(req.record.recordData)
    else:
        raise Diagnostic1()
    doc._schema = req.record.recordSchema
    return doc
Exemplo n.º 40
0
 def _processFile(self, session, item):
     """Classify item by guessed mimetype and wrap it appropriately.

     Returns ('stream', DocumentStream) for container/XML types,
     ('document', StringDocument) for everything else, or None when
     the filename filter rejects the item.
     """
     name = self._fetchName(item)
     # Optional filename filter: reject non-matching items early
     if self.filterRe and not self.filterRe.search(name):
         return None
     mimetype = mimetypes.guess_type(name, 0)
     mt, enc = mimetype
     if mt in ('text/sgml', 'text/xml', 'application/sgml',
               'application/xml'):
         if enc == 'gzip':
             raise NotImplementedError('XML files compressed using gzip are not yet supported. You could try using zip.')
         trip = ('stream', XmlDocumentStream, 'xml')
     elif mt == 'application/x-tar':
         # Pick the tar flavour from the encoding
         if enc == 'gzip':
             trip = ('stream', TarDocumentStream, 'tar.gz')
         elif enc == 'bzip2':
             trip = ('stream', TarDocumentStream, 'tar.bz2')
         else:
             trip = ('stream', TarDocumentStream, 'tar')
     elif mt == 'application/zip':
         trip = ('stream', ZipDocumentStream, 'zip')
     elif mt == 'application/marc':
         trip = ('stream', MarcDocumentStream, 'marc')
     elif self.tagName is not None:
         # Unknown type but a tag name is configured: treat as XML stream
         trip = ('stream', XmlDocumentStream, 'xml')
     else:
         trip = ('document', None, mt)
     s = self._fetchStream(item)
     if trip[0] == 'stream':
         cls = trip[1]
         nstream = cls(session, s, format=trip[2], tagName=self.tagName,
                       codec=self.codec, factory=self.factory)
         # copy streamLocation in to copy to document
         nstream.streamLocation = item
         return ('stream', nstream)
     elif trip[0] == 'document':
         data = s.read()
         s.close()
         doc = StringDocument(data, mimeType=trip[2], filename=name)
         if enc:
             doc.compression = enc
         return ('document', doc)
Exemplo n.º 41
0
    def save_concordance(self, clines, id, wordWindow):
        """Pickle and store concordance lines, splitting into maxSize chunks.

        Each stored chunk is prefixed with [total line count, wordWindow]
        and stored under the identifier '<id>_<chunk number>'.
        Returns id.
        """
        global maxSize
        if len(clines) > maxSize:
            chunkNum = 1
            for start in range(0, len(clines), maxSize):
                # Renamed locals: previously shadowed builtins
                # 'slice' and 'string'
                chunk = clines[start:start + maxSize]
                chunk.insert(0, [len(clines), wordWindow])
                data = Pickle.dumps(chunk)
                doc = StringDocument(data)
                doc.id = '%s_%d' % (id, chunkNum)
                chunkNum += 1
                self.concStore.store_document(self.session, doc)
        else:
            clines.insert(0, [len(clines), wordWindow])
            data = Pickle.dumps(clines)
            doc = StringDocument(data)
            doc.id = '%s_1' % id
            self.concStore.store_document(self.session, doc)
        self.concStore.commit_storing(self.session)
        return id
 def process_record(self, session, record):
     """Return a Document containing the XML of record's parent record.

     The parent is located via the record's /c3component/@parent (or its
     namespaced equivalent) attribute, of the form 'recStoreId/recordId'.
     """
     # Identifier of the parent: try the un-namespaced XPath first
     try:
         parentId = record.process_xpath(session, '/c3component/@parent')[0]
     except IndexError:
         parentId = record.process_xpath(
             session,
             '/c3:component/@c3:parent',
             maps={'c3': "http://www.cheshire3.org/schemas/component/"}
         )[0]
     recStoreId, parentId = parentId.split('/', 1)
     # Resolve the Database through which to find the RecordStore
     if isinstance(self.parent, Database):
         db = self.parent
     elif isinstance(self.parent, Server) and session.database:
         db = self.parent.get_object(session, session.database)
     elif (
             session.server and
             isinstance(session.server, Server) and
             session.database
     ):
         db = session.server.get_object(session, session.database)
     elif not session.server:
         raise ValueError("No session.server")
     else:
         raise ValueError("No session.database")
     recStore = db.get_object(session, recStoreId)
     # Fetch the parent and wrap its serialization in a fresh Document
     parentRec = recStore.fetch_record(session, parentId)
     data = parentRec.get_xml(session)
     parentDoc = StringDocument(data, self.id,
                                byteCount=len(data), byteOffset=0)
     parentDoc.id = parentId
     return parentDoc
Exemplo n.º 43
0
def mercurialDocumentStoreIter(store):
    """Generate Documents for each item in a Mercurial-backed store.

    Every yielded Document carries its identifier, on-disk filename,
    owning store id, byte count and last-modified time.
    """
    session = Session()
    for identifier, data in directoryStoreIter(store):
        document = StringDocument(data)
        document.id = identifier
        internalId = store._normalizeIdentifier(session, identifier)
        document.filename = store._getFilePath(session, internalId)
        document.documentStore = store.id
        # File-system metadata is useful downstream
        info = os.stat(document.filename)
        document.byteCount = info.st_size
        document.metadata['lastModified'] = info.st_mtime
        yield document
Exemplo n.º 44
0
 def _process_data(self, session, id, data, preParser=None):
     """Wrap raw data in a Document, applying any out-bound processing.

     Precedence: explicit preParser argument, then self.outPreParser,
     then self.outWorkflow; otherwise the data is wrapped as-is.
     Split out from fetch_document so store iterators can reuse it.
     """
     # DRY: always start from a plain StringDocument, then post-process
     doc = StringDocument(data)
     if preParser is not None:
         doc = preParser.process_document(session, doc)
     elif self.outPreParser is not None:
         doc = self.outPreParser.process_document(session, doc)
     elif self.outWorkflow is not None:
         doc = self.outWorkflow.process(session, doc)
     # Ensure basic required info
     doc.id = id
     doc.documentStore = self.id
     return doc
Exemplo n.º 45
0
 def find_documents(self, session, cache=0):
     """Yield a single Document wrapping the classes and vectors."""
     payload = [self.classes, self.vectors]
     doc = StringDocument(payload)
     # Carry the attribute count through for downstream consumers
     doc.totalAttributes = self.totalAttributes
     yield doc
Exemplo n.º 46
0
 def next(self):
     """Return the next store item wrapped as a Document with its id set."""
     identifier, data = BdbIter.next(self)
     doc = StringDocument(data)
     doc.id = identifier
     return doc
Exemplo n.º 47
0
 def accumulate(self, session, stream, format, tagName=None, codec=None,
                factory=None):
     """Append the XML serialization of stream (a record) to self.data."""
     # Round-trip through a Document for consistency with other inputs
     doc = StringDocument(stream.get_xml(session))
     self.data.append(doc.get_raw(session))
Exemplo n.º 48
0
    def process_document(self, session, doc):
        """Convert a Document of frequent-set matches into rule objects.

        doc.get_raw() must return (matches, armrules).  Builds a
        FrequentSet per match with per-term document frequencies and
        (when self.calcRankings) ranking statistics, un-renumbers any
        association rules found by ARM, and returns a StringDocument
        whose .text is [rules, nrules] with lookup tables attached as
        attributes.
        """
        # take in Doc with match list, return doc with rule object list
        (matches, armrules) = doc.get_raw(session)

        out = StringDocument([])

        # Initial setup
        termHash = {}        # termid -> term
        termFreqHash = {}    # termid -> document frequency
        termRuleFreq = {}    # termid -> number of rules it appears in
        rules = []
        ruleLengths = {}

        # BUGFIX: totalDocs was left unbound (NameError) when neither
        # self.recordStore nor the database's recordStore path existed;
        # default to 0 so the div-zero guard below applies.
        totalDocs = 0
        if self.recordStore:
            totalDocs = self.recordStore.get_dbSize(session)
        else:
            # get default from session's database
            db = session.server.get_object(session, session.database)
            recStore = db.get_path(session, 'recordStore', None)
            if recStore:
                totalDocs = recStore.get_dbSize(session)
        if totalDocs == 0:
            # avoid e_divzero
            totalDocs = 1
        totalDocs = float(totalDocs)

        # step through rules and turn into objects, do math, do global stats
        for m in matches:
            r = FrequentSet(session, m, out, self.unrenumber)

            freqs = []
            for t in r.termids:
                try:
                    termFreq = termFreqHash[t]
                    termRuleFreq[t] += 1
                except KeyError:
                    # First time this term is seen: fetch and cache it
                    termRuleFreq[t] = 1
                    term = self.index.fetch_termById(session, t)
                    termHash[t] = term
                    termFreq = self.index.fetch_term(session, term,
                                                     summary=True)[1]
                    termFreqHash[t] = termFreq
                freqs.append(termFreq)
            r.freqs = freqs

            if self.calcRankings:
                if self.calcRuleLengths:
                    try:
                        ruleLengths[(len(r.termids))] += 1
                    except KeyError:
                        ruleLengths[(len(r.termids))] = 1

                # some basic stats needed
                avgs = []
                entropy = []
                gini = []
                ftd = float(totalDocs)
                for t in freqs:
                    bit = float(t) / ftd
                    avgs.append(bit)
                    entropy.append((0 - bit) * math.log(bit, 2))
                    gini.append(bit ** 2)

                r.pctg = reduce(operator.mul, avgs)
                r.avg = r.pctg * float(totalDocs)
                r.opctg = (float(r.freq) / ftd)
                r.entropy = reduce(operator.add, entropy)
                r.gini = 1.0 - reduce(operator.add, gini)

                # This is log-likelihood.  Better than just support
                ei = float(totalDocs * (r.avg + r.freq)) / (totalDocs * 2.0)
                g2 = 2 * ((r.avg * math.log(r.avg / ei, 10)) +
                          (r.freq * math.log(r.freq / ei, 10)))
                if r.freq < r.avg:
                    g2 = 0 - g2
                r.ll = g2
                # Dunno what this is but it works quite well (for some things)
                r.surprise = (totalDocs / r.avg) * r.freq
                # r.surprise2 = (1.0/r.pctg) * r.freq
            rules.append(r)

        if self.sortBy:
            rules.sort(key=self.sortFuncs[self.sortBy], reverse=True)

        nrules = []
        if armrules:
            # unrenumber arm found rules
            # conf, supp, [antes], [concs]
            for r in armrules:
                d = StringDocument([r[2], r[3]])
                if self.unrenumber:
                    d = self.unrenumber.process_document(session, d)
                antes = []
                concs = []
                renmbrd = d.get_raw(session)
                for a in renmbrd[0]:
                    antes.append(termHash[a])
                for c in renmbrd[1]:
                    concs.append(termHash[c])
                nrules.append([r[0], r[1], antes, concs])

        out.text = [rules, nrules]
        out.termHash = termHash
        out.termRuleFreq = termRuleFreq
        out.ruleLengths = ruleLengths
        # XXX this is even nastier, but useful
        out.sortFuncs = self.sortFuncs

        return out