Exemplo n.º 1
0
 def _parse_upload(self, data, interface='admin'):
     """Parse uploaded EAD data into a record.

     data      -- the raw upload, either a unicode object or a byte string
     interface -- 'admin' or other; only affects the "back" link in the
                  HTML error fragment

     Returns the parsed record on success.  On parse failure the document
     is re-flowed (long whitespace runs broken onto new lines) and parsed
     again so reported line numbers are usable; if that also fails, an
     HTML error fragment (a string) is returned instead of a record.
     """
     # Normalise unicode to a byte string before wrapping it in a document.
     if (type(data) == unicode):
         try:
             data = data.encode('utf-8')
         except Exception:
             try:
                 data = data.encode('utf-16')
             except Exception:
                 pass  # hope for the best!
     doc = StringDocument(data)
     del data
     doc = ppFlow.process(session, doc)
     try:
         rec = docParser.process_document(session, doc)
     except Exception:
         # Break runs of whitespace onto new lines, then re-parse so that
         # any parser error is reported with usable line numbers.
         newlineRe = re.compile(r'(\s\s+)')
         doc.text = newlineRe.sub('\n\g<1>', doc.get_raw(session))
         try:
             rec = docParser.process_document(session, doc)
         except Exception:
             self.htmlTitle.append('Error')
             e = sys.exc_info()
             self.logger.log('*** %s: %s' % (repr(e[0]), e[1]))
             # Both error branches need the same "back" link.
             if interface == 'admin':
                 link = '<a href="files.html">Back to file page</a>'
             else:
                 link = '<a href="edit.html">Back to edit/create menu</a>'
             # Try to locate and highlight the error in the source.
             lines = doc.get_raw(session).split('\n')
             positionRe = re.compile(r':(\d+):(\d+):')
             mo = positionRe.search(str(e[1]))
             if (mo is None):
                 # Alternative error-message format used by some parsers.
                 positionRe = re.compile(r'line (\d+), column (\d+)')
                 mo = positionRe.search(str(e[1]))
             if mo is None:
                 # FIX: previously an error message matching neither pattern
                 # raised AttributeError on mo.group() here, replacing the
                 # friendly error page with a raw traceback.  Fall back to
                 # the generic message instead.
                 return '''<div id="single"><p class="error">An error occured while parsing your file. 
     Please check the file is a valid ead file and try again.</p><p>%s</p></div>''' % link
             line, posn = lines[int(mo.group(1)) - 1], int(mo.group(2))
             try:
                 startspace = newlineRe.match(line).group(0)
             except Exception:
                 # Line has no measurable leading whitespace; we cannot draw
                 # the position marker, so return the generic message.
                 return '''<div id="single"><p class="error">An error occured while parsing your file. 
     Please check the file is a valid ead file and try again.</p><p>%s</p></div>''' % link
             else:
                 return '''\
         <div id="single"><p class="error">An error occured while parsing your file. 
         Please check the file at the suggested location and try again.</p>
         <code>%s: %s</code>
         <pre>
         %s
         <span class="error">%s</span>
         </pre>
         <p>%s</p></div>
                 ''' % (html_encode(repr(e[0])), e[1], html_encode(line[:posn + 20]) + '...',  startspace + str('-' * (posn - len(startspace))) + '^', link)
     del doc
     return rec
Exemplo n.º 2
0
 def accumulate(self, session, stream, format,
                tagName=None, codec=None, factory=None):
     """Append the raw XML of *stream*'s record to this accumulator.

     The record's XML is wrapped in a StringDocument and its raw text
     is added to ``self.data``.  The remaining parameters are accepted
     for interface compatibility and are not used here.
     """
     wrapped = StringDocument(stream.get_xml(session))  # record -> document
     self.data.append(wrapped.get_raw(session))
Exemplo n.º 3
0
    def process_document(self, session, doc):
        """Convert a document of frequent-set matches into rule objects.

        ``doc.get_raw(session)`` must yield a ``(matches, armrules)``
        pair.  Each match becomes a FrequentSet with frequency/ranking
        statistics attached; optional association-rule results
        (``armrules``: conf, supp, [antes], [concs]) are mapped back to
        their term objects.  Returns a StringDocument whose text is
        ``[rules, nrules]``, with the per-term caches and sort functions
        attached as extra attributes for downstream consumers.
        """
        (matches, armrules) = doc.get_raw(session)

        out = StringDocument([])

        # Per-term caches so each term is fetched from the index only once.
        termHash = {}       # termid -> term object
        termFreqHash = {}   # termid -> document frequency of the term
        termRuleFreq = {}   # termid -> number of rules the term occurs in
        rules = []
        ruleLengths = {}    # rule length -> count (if calcRuleLengths)

        # Total document count, needed for the frequency calculations.
        # FIX: totalDocs was previously left unbound (NameError below) when
        # there was no explicit recordStore and the database had none either.
        totalDocs = 0
        if self.recordStore:
            totalDocs = self.recordStore.get_dbSize(session)
        else:
            # Fall back to the recordStore of the session's database.
            db = session.server.get_object(session, session.database)
            recStore = db.get_path(session, 'recordStore', None)
            if recStore:
                totalDocs = recStore.get_dbSize(session)
        if totalDocs == 0:
            # Avoid division by zero below.
            totalDocs = 1
        totalDocs = float(totalDocs)

        # Turn each match into a rule object; compute global statistics.
        for m in matches:
            r = FrequentSet(session, m, out, self.unrenumber)

            freqs = []
            for t in r.termids:
                try:
                    termFreq = termFreqHash[t]
                    termRuleFreq[t] += 1
                except KeyError:
                    # First occurrence of this term: fetch and cache it.
                    termRuleFreq[t] = 1
                    term = self.index.fetch_termById(session, t)
                    termHash[t] = term
                    termFreq = self.index.fetch_term(session, term, summary=True)[1]
                    termFreqHash[t] = termFreq
                freqs.append(termFreq)
            r.freqs = freqs

            if self.calcRankings:
                if self.calcRuleLengths:
                    n = len(r.termids)
                    ruleLengths[n] = ruleLengths.get(n, 0) + 1

                # Basic per-term stats feeding the ranking measures.
                avgs = []
                entropy = []
                gini = []
                for t in freqs:
                    bit = float(t) / totalDocs
                    avgs.append(bit)
                    entropy.append((0 - bit) * math.log(bit, 2))
                    gini.append(bit ** 2)

                r.pctg = reduce(operator.mul, avgs)
                r.avg = r.pctg * totalDocs
                r.opctg = (float(r.freq) / totalDocs)
                r.entropy = reduce(operator.add, entropy)
                r.gini = 1.0 - reduce(operator.add, gini)

                # Log-likelihood: a better ranking signal than raw support.
                ei = float(totalDocs * (r.avg + r.freq)) / (totalDocs * 2.0)
                g2 = 2 * ((r.avg * math.log(r.avg / ei, 10)) +
                          (r.freq * math.log(r.freq / ei, 10)))
                if r.freq < r.avg:
                    g2 = 0 - g2
                r.ll = g2
                # Empirical "surprise" measure; works well in practice.
                r.surprise = (totalDocs / r.avg) * r.freq
                # r.surprise2 = (1.0/r.pctg) * r.freq
            rules.append(r)

        if self.sortBy:
            rules.sort(key=self.sortFuncs[self.sortBy], reverse=True)

        nrules = []
        if armrules:
            # Un-renumber ARM-found rules: (conf, supp, [antes], [concs]).
            for r in armrules:
                d = StringDocument([r[2], r[3]])
                if self.unrenumber:
                    d = self.unrenumber.process_document(session, d)
                renmbrd = d.get_raw(session)
                antes = [termHash[a] for a in renmbrd[0]]
                concs = [termHash[c] for c in renmbrd[1]]
                nrules.append([r[0], r[1], antes, concs])

        out.text = [rules, nrules]
        out.termHash = termHash
        out.termRuleFreq = termRuleFreq
        out.ruleLengths = ruleLengths
        # XXX this is even nastier, but useful
        out.sortFuncs = self.sortFuncs

        return out
Exemplo n.º 4
0
 def accumulate(self, session, stream, format, tagName=None, codec=None, factory=None):
     """Collect the raw XML of *stream*'s record into ``self.data``.

     tagName, codec and factory are accepted only for interface
     compatibility; this accumulator does not use them.
     """
     # Wrap the record's XML so it can be re-read like any other document.
     xml_doc = StringDocument(stream.get_xml(session))
     self.data.append(xml_doc.get_raw(session))
Exemplo n.º 5
0
    def process_document(self, session, doc):
        """Turn a document of frequent-set matches into rule objects.

        ``doc.get_raw(session)`` must yield a ``(matches, armrules)``
        pair.  Each match is converted into a FrequentSet carrying
        frequency/ranking statistics; optional association-rule results
        (``armrules``: conf, supp, [antes], [concs]) are translated back
        to term objects.  Returns a StringDocument whose text is
        ``[rules, nrules]``, with per-term caches and the sort-function
        table attached as extra attributes.
        """
        (matches, armrules) = doc.get_raw(session)

        out = StringDocument([])

        # Per-term caches: each term is fetched from the index only once.
        termHash = {}       # termid -> term object
        termFreqHash = {}   # termid -> document frequency of the term
        termRuleFreq = {}   # termid -> number of rules containing the term
        rules = []
        ruleLengths = {}    # rule length -> count (if calcRuleLengths)

        # Total document count, needed for the frequency calculations.
        # FIX: totalDocs was previously left unbound (NameError below) when
        # there was no explicit recordStore and the database had none either.
        totalDocs = 0
        if self.recordStore:
            totalDocs = self.recordStore.get_dbSize(session)
        else:
            # Fall back to the recordStore of the session's database.
            db = session.server.get_object(session, session.database)
            recStore = db.get_path(session, 'recordStore', None)
            if recStore:
                totalDocs = recStore.get_dbSize(session)
        if totalDocs == 0:
            # Avoid division by zero below.
            totalDocs = 1
        totalDocs = float(totalDocs)

        # Convert each match to a rule object and do the global statistics.
        for m in matches:
            r = FrequentSet(session, m, out, self.unrenumber)

            freqs = []
            for t in r.termids:
                try:
                    termFreq = termFreqHash[t]
                    termRuleFreq[t] += 1
                except KeyError:
                    # First occurrence of this term: fetch and cache it.
                    termRuleFreq[t] = 1
                    term = self.index.fetch_termById(session, t)
                    termHash[t] = term
                    termFreq = self.index.fetch_term(session,
                                                     term,
                                                     summary=True)[1]
                    termFreqHash[t] = termFreq
                freqs.append(termFreq)
            r.freqs = freqs

            if self.calcRankings:
                if self.calcRuleLengths:
                    n = len(r.termids)
                    ruleLengths[n] = ruleLengths.get(n, 0) + 1

                # Basic per-term stats feeding the ranking measures.
                avgs = []
                entropy = []
                gini = []
                for t in freqs:
                    bit = float(t) / totalDocs
                    avgs.append(bit)
                    entropy.append((0 - bit) * math.log(bit, 2))
                    gini.append(bit ** 2)

                r.pctg = reduce(operator.mul, avgs)
                r.avg = r.pctg * totalDocs
                r.opctg = (float(r.freq) / totalDocs)
                r.entropy = reduce(operator.add, entropy)
                r.gini = 1.0 - reduce(operator.add, gini)

                # Log-likelihood: a better ranking signal than raw support.
                ei = float(totalDocs * (r.avg + r.freq)) / (totalDocs * 2.0)
                g2 = 2 * ((r.avg * math.log(r.avg / ei, 10)) +
                          (r.freq * math.log(r.freq / ei, 10)))
                if r.freq < r.avg:
                    g2 = 0 - g2
                r.ll = g2
                # Empirical "surprise" measure; works well in practice.
                r.surprise = (totalDocs / r.avg) * r.freq
                # r.surprise2 = (1.0/r.pctg) * r.freq
            rules.append(r)

        if self.sortBy:
            rules.sort(key=self.sortFuncs[self.sortBy], reverse=True)

        nrules = []
        if armrules:
            # Un-renumber ARM-found rules: (conf, supp, [antes], [concs]).
            for r in armrules:
                d = StringDocument([r[2], r[3]])
                if self.unrenumber:
                    d = self.unrenumber.process_document(session, d)
                renmbrd = d.get_raw(session)
                antes = [termHash[a] for a in renmbrd[0]]
                concs = [termHash[c] for c in renmbrd[1]]
                nrules.append([r[0], r[1], antes, concs])

        out.text = [rules, nrules]
        out.termHash = termHash
        out.termRuleFreq = termRuleFreq
        out.ruleLengths = ruleLengths
        # XXX this is even nastier, but useful
        out.sortFuncs = self.sortFuncs

        return out