def _parse_upload(self, data, interface='admin'):
    # Parse an uploaded EAD file into a record. On failure, return an HTML
    # error report, highlighting the offending location where possible.
    if type(data) == unicode:
        try:
            data = data.encode('utf-8')
        except:
            try:
                data = data.encode('utf-16')
            except:
                pass  # hope for the best!
    doc = StringDocument(data)
    del data
    doc = ppFlow.process(session, doc)
    try:
        rec = docParser.process_document(session, doc)
    except:
        # Insert line breaks before runs of whitespace so that a second
        # parse attempt reports meaningful line numbers.
        newlineRe = re.compile(r'(\s\s+)')
        doc.text = newlineRe.sub('\n\\g<1>', doc.get_raw(session))
        # repeat parse with correct line numbers
        try:
            rec = docParser.process_document(session, doc)
        except:
            self.htmlTitle.append('Error')
            e = sys.exc_info()
            self.logger.log('*** %s: %s' % (repr(e[0]), e[1]))
            if interface == 'admin':
                link = '<a href="files.html">Back to file page</a>'
            else:
                link = '<a href="edit.html">Back to edit/create menu</a>'
            # try and highlight error in specified place
            lines = doc.get_raw(session).split('\n')
            positionRe = re.compile(r':(\d+):(\d+):')
            mo = positionRe.search(str(e[1]))
            if mo is None:
                positionRe = re.compile(r'line (\d+), column (\d+)')
                mo = positionRe.search(str(e[1]))
            if mo is None:
                # no position information in the error message
                return ('<div id="single"><p class="error">An error '
                        'occurred while parsing your file. Please check '
                        'that the file is a valid EAD file and try again.'
                        '</p><p>%s</p></div>' % link)
            line = lines[int(mo.group(1)) - 1]
            posn = int(mo.group(2))
            try:
                startspace = newlineRe.match(line).group(0)
            except:
                # line has no leading whitespace; fall back to plain message
                return ('<div id="single"><p class="error">An error '
                        'occurred while parsing your file. Please check '
                        'that the file is a valid EAD file and try again.'
                        '</p><p>%s</p></div>' % link)
            else:
                return '''\
<div id="single"><p class="error">An error occurred while parsing your file. \
Please check the file at the suggested location and try again.</p>
<code>%s: %s</code>
<pre>
%s
<span class="error">%s</span>
</pre>
<p>%s</p></div>
''' % (html_encode(repr(e[0])),
       e[1],
       html_encode(line[:posn + 20]) + '...',
       startspace + '-' * (posn - len(startspace)) + '^',
       link)
    del doc
    return rec
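# A minimal standalone sketch (hypothetical helper, not part of the class
# above) of the error-highlighting technique used in _parse_upload: pull a
# line/column position out of the parser's error message and point at it.
# It assumes lxml-style ":12:34:" messages, falling back to the
# "line 12, column 34" form used by expat/xml.sax.
import re

def locate_parse_error(message, source):
    """Return (line_text, caret_line) for a parse error, or None."""
    mo = re.search(r':(\d+):(\d+):', message)
    if mo is None:
        mo = re.search(r'line (\d+), column (\d+)', message)
    if mo is None:
        return None
    lineno, col = int(mo.group(1)), int(mo.group(2))
    lines = source.split('\n')
    if not 1 <= lineno <= len(lines):
        return None
    return (lines[lineno - 1], ' ' * col + '^')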
def accumulate(self, session, stream, format, tagName=None, codec=None,
               factory=None):
    doc = StringDocument(stream.get_xml(session))  # get rec into doc
    self.data.append(doc.get_raw(session))
def process_document(self, session, doc):
    # take in Doc with match list, return doc with rule object list
    (matches, armrules) = doc.get_raw(session)
    out = StringDocument([])

    # Initial setup
    termHash = {}
    termFreqHash = {}
    termRuleFreq = {}
    rules = []
    ruleLengths = {}

    totalDocs = 0
    if self.recordStore:
        totalDocs = self.recordStore.get_dbSize(session)
    else:
        # get default from session's database
        db = session.server.get_object(session, session.database)
        recStore = db.get_path(session, 'recordStore', None)
        if recStore:
            totalDocs = recStore.get_dbSize(session)
    if totalDocs == 0:
        # avoid e_divzero
        totalDocs = 1
    totalDocs = float(totalDocs)

    # step through rules and turn into objects, do math, do global stats
    for m in matches:
        r = FrequentSet(session, m, out, self.unrenumber)
        freqs = []
        for t in r.termids:
            try:
                termFreq = termFreqHash[t]
                termRuleFreq[t] += 1
            except KeyError:
                # first time this term is seen: resolve and cache it
                termRuleFreq[t] = 1
                term = self.index.fetch_termById(session, t)
                termHash[t] = term
                termFreq = self.index.fetch_term(session, term,
                                                 summary=True)[1]
                termFreqHash[t] = termFreq
            freqs.append(termFreq)
        r.freqs = freqs

        if self.calcRankings:
            if self.calcRuleLengths:
                try:
                    ruleLengths[len(r.termids)] += 1
                except KeyError:
                    ruleLengths[len(r.termids)] = 1
            # some basic stats needed
            avgs = []
            entropy = []
            gini = []
            ftd = float(totalDocs)
            for t in freqs:
                bit = float(t) / ftd
                avgs.append(bit)
                entropy.append((0 - bit) * math.log(bit, 2))
                gini.append(bit ** 2)
            r.pctg = reduce(operator.mul, avgs)
            r.avg = r.pctg * float(totalDocs)
            r.opctg = float(r.freq) / ftd
            r.entropy = reduce(operator.add, entropy)
            r.gini = 1.0 - reduce(operator.add, gini)

            # This is log-likelihood. Better than just support
            ei = float(totalDocs * (r.avg + r.freq)) / (totalDocs * 2.0)
            g2 = 2 * ((r.avg * math.log(r.avg / ei, 10)) +
                      (r.freq * math.log(r.freq / ei, 10)))
            if r.freq < r.avg:
                g2 = 0 - g2
            r.ll = g2
            # Dunno what this is but it works quite well (for some things)
            r.surprise = (totalDocs / r.avg) * r.freq
            # r.surprise2 = (1.0 / r.pctg) * r.freq
        rules.append(r)

    if self.sortBy:
        rules.sort(key=self.sortFuncs[self.sortBy], reverse=True)

    nrules = []
    if armrules:
        # unrenumber arm found rules
        # conf, supp, [antes], [concs]
        for r in armrules:
            d = StringDocument([r[2], r[3]])
            if self.unrenumber:
                d = self.unrenumber.process_document(session, d)
            antes = []
            concs = []
            renmbrd = d.get_raw(session)
            for a in renmbrd[0]:
                antes.append(termHash[a])
            for c in renmbrd[1]:
                concs.append(termHash[c])
            nrules.append([r[0], r[1], antes, concs])

    out.text = [rules, nrules]
    out.termHash = termHash
    out.termRuleFreq = termRuleFreq
    out.ruleLengths = ruleLengths
    # XXX this is even nastier, but useful
    out.sortFuncs = self.sortFuncs
    return out
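# A worked sketch of the log-likelihood (G2) ranking computed above, pulled
# out as a standalone function with illustrative numbers. Not part of the
# transformer; the function name and example values are assumptions.
import math

def log_likelihood(expected, observed):
    # ei in process_document simplifies to the mean of the two counts:
    # float(totalDocs * (avg + freq)) / (totalDocs * 2.0) == (avg + freq) / 2
    ei = (expected + observed) / 2.0
    g2 = 2 * ((expected * math.log(expected / ei, 10)) +
              (observed * math.log(observed / ei, 10)))
    # sets seen less often than chance predicts score negative
    if observed < expected:
        g2 = -g2
    return g2

# e.g. a set expected in 2.5 records but observed in 9 scores positive:
# log_likelihood(2.5, 9) ~= 1.69, while log_likelihood(9, 2.5) ~= -1.69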