Пример #1
0
    def __init__(self, filename, target, inputs=None, threshold=1.0e-9):
        """Build the model from a table and measure how well it reproduces it.

        filename  -- either a UniTable instance, which is used as-is, or an
                     argument accepted by ``UniTable().fromfile``.
        target    -- forwarded to ``self.model_build``.
        inputs    -- forwarded to ``self.model_build`` (default None).
        threshold -- forwarded to ``CountsBase.__init__``.

        After construction ``self.verify_result`` is a UniTable with the
        columns 'orig', 'pred' and 'agree', and ``self.accuracy`` is the
        number of agreeing rows divided by ``len(data)``.
        """
        CountsBase.__init__(self, threshold=threshold)

        # accept an already loaded table as well as something to load from
        if not isinstance(filename, UniTable):
            data = UniTable().fromfile(filename)
        else:
            data = filename
        self.model_build(data, target, inputs)

        # compare the model's predictions with the column named by
        # self.target (presumably set while building the model -- confirm)
        check = UniTable()
        self.verify_result = check
        check['orig'] = data[self.target]
        check['pred'] = self.model_predict(data)
        check['agree'] = check['orig'] == check['pred']
        matches = float(check['agree'].sum())
        self.accuracy = matches / len(data)
Пример #2
0
 def __str__(self):
     """Return the string form of a UniTable describing this object.

     The table has the columns 'key', 'name', 'expr' and 'rule', filled
     from self.keys(), self.names(), self.values() and self.rules().
     """
     table = UniTable()
     # (column label, name of the method supplying that column), in order
     layout = (
         ('key', 'keys'),
         ('name', 'names'),
         ('expr', 'values'),
         ('rule', 'rules'),
     )
     for label, method_name in layout:
         table[label] = getattr(self, method_name)()
     return str(table)
Пример #3
0
 def __str__(self):
   """Return the string form of the matrix laid out as a UniTable with totals.

   The '(#)' column holds the entries of self.rows followed by '_totals_'.
   Each entry of self.cols becomes a column holding that column of
   self.matrix followed by its column sum.  The final '_totals_' column
   holds the row sums followed by self.sum().
   """
   table = UniTable()
   table['(#)'] = list(self.rows) + ['_totals_']
   column_totals = self.col_sums()
   for idx,name in enumerate(self.cols):
     cells = list(self.matrix[:,idx])
     cells.append(column_totals[idx])
     table[name] = cells
   last_column = list(self.row_sums())
   last_column.append(self.sum())
   table['_totals_'] = last_column
   return str(table)
Пример #4
0
 def handle_select(self, opt, tbl):
     """Return a new UniTable holding the selected expressions of tbl.

     opt.select is unpacked into a FieldExprList; its rules are applied to
     tbl through ``EvalTable(rules).update(tbl)``.  Every key of the
     expression list is then copied from the updated table into the result
     under the corresponding name.
     """
     selection = FieldExprList(*opt.select)
     evaluated = EvalTable(selection.rules()).update(tbl)
     result = UniTable()
     pairs = zip(selection.keys(), selection.names())
     for source_key, column_name in pairs:
         result[column_name] = evaluated[source_key]
     return result
Пример #5
0
 def export(self):
     """Return the matrix as a UniTable with an extra totals row and column.

     The '(#)' column holds the entries of self.rownames followed by
     '_totals_'.  Each entry of self.colnames becomes a column holding
     that column of self.matrix followed by its column sum.  The final
     '_totals_' column holds the row sums followed by self.sum().
     """
     table = UniTable()
     table['(#)'] = list(self.rownames) + ['_totals_']
     column_totals = self.col_sums()
     for position, name in enumerate(self.colnames):
         cells = list(self.matrix[:, position])
         cells.append(column_totals[position])
         table[name] = cells
     totals_column = list(self.row_sums())
     totals_column.append(self.sum())
     table['_totals_'] = totals_column
     return table
Пример #6
0
def top_ten(filenames):

    # track values for each field
    seen_fields = {}
    total_recs = 0

    # read each file in turn
    for filename in filenames:
        tbl = UniTable()
        tbl.fromfile(filename)

        keys = tbl.keys()[:]
        if '_count_' in keys:
            total_recs += tbl['_count_'].sum()
            keys.remove('_count_')
        else:
            total_recs += len(tbl)
            tbl['_count_'] = 1

        # read each column in turn
        for key in keys:
            seen_values = seen_fields.setdefault(key, {})

            # iterate over counts and values
            for cnt, value in izip(tbl['_count_'], tbl[key]):
                try:
                    seen_values[value] += cnt
                except KeyError:
                    seen_values[value] = cnt

    # report results
    for key, seen_values in seen_fields.items():

        # find top ten
        top_cnts = sorted(seen_values.values())
        cutoff = top_cnts[-10:][0]
        tmp = sorted([cnt, value] for (value, cnt) in seen_values.items()
                     if cnt >= cutoff)
        top = reversed(tmp[-10:])

        # report
        print 'Field:', key
        for (cnt, value) in top:
            percent = 100.0 * cnt / float(total_recs)
            print '\t(%8.5f%%) %r' % (percent, value)
Пример #7
0
def top_ten(filenames):

  # track values for each field
  seen_fields = {}
  total_recs = 0

  # read each file in turn
  for filename in filenames:
    tbl = UniTable()
    tbl.fromfile(filename)

    keys = tbl.keys()[:]
    if '_count_' in keys:
      total_recs += tbl['_count_'].sum()
      keys.remove('_count_')
    else:
      total_recs += len(tbl)
      tbl['_count_'] = 1

    # read each column in turn
    for key in keys:
      seen_values = seen_fields.setdefault(key,{})

      # iterate over counts and values
      for cnt,value in izip(tbl['_count_'],tbl[key]):
        try:
          seen_values[value] += cnt
        except KeyError:
          seen_values[value] = cnt

  # report results
  for key,seen_values in seen_fields.items():

    # find top ten
    top_cnts = sorted(seen_values.values())
    cutoff = top_cnts[-10:][0]
    tmp = sorted([cnt,value] for (value,cnt) in seen_values.items() if cnt >= cutoff)
    top = reversed(tmp[-10:])

    # report
    print 'Field:', key
    for (cnt,value) in top:
      percent = 100.0*cnt/float(total_recs)
      print '\t(%8.5f%%) %r' % (percent,value)
Пример #8
0
 def __init__(self,filename=None,keys=None):
   """Set up the value-to-index mapping, optionally preloading it from a file.

   filename -- optional path.  When it is given and exists, it is read with
               UniTable and every entry of its 'data' column is mapped to
               the matching entry of its 'index' column in self.data.
   keys     -- optional list stored as self.keys.  When omitted, each
               instance gets its own new empty list.
   """
   # A literal [] default is created once at definition time and would be
   # shared (and mutated) by every instance constructed without keys.
   if keys is None:
     keys = []
   self.keys = keys
   self.data = data = {'':0}  # try to pre-assign empty string value
   self.filename = filename
   if filename and os.path.exists(filename):
     from augustus.unitable import UniTable
     tbl = UniTable().fromfile(filename)
     for i,value in it.izip(tbl['index'],tbl['data']):
       data[value] = i
Пример #9
0
 def __call__(self,data):
   """Score data and return the last entry of the resulting 'score' column.

   A new UniTable is stored as self._state and filled column by column:
   'data', the outputs of self.nullmodel and self.altmodel on that column,
   their ratio ('odds'), its logarithm ('log_odds'), the values produced by
   gen_cusum(log_odds, self.reset_value) ('cusum'), and whether each of
   those values exceeds self.threshold ('score').
   """
   table = UniTable()
   self._state = table
   table['data'] = data
   # every input is read back from the table rather than reused from a
   # local, so each step sees the column exactly as the table stores it
   table['nullmodel'] = self.nullmodel(table['data'])
   table['altmodel'] = self.altmodel(table['data'])
   ratio = table['altmodel']/table['nullmodel']
   table['odds'] = ratio
   table['log_odds'] = na.log(table['odds'])
   running = gen_cusum(table['log_odds'],self.reset_value)
   table['cusum'] = list(running)
   table['score'] = table['cusum'] >self.threshold
   flags = table['score']
   return flags[-1]
Пример #10
0
 def _make_tbl(self, cfunc, ccfunc):
     """Return the string form of a UniTable with one column per target value.

     The '__fld__' and '__val__' columns hold an empty string followed by
     the first and second elements of every item from self.iter_ikv().
     For each value from self.all_tval() a column named str(tval) is added,
     holding cfunc(tval) followed by ccfunc(tval, ikey, ival) for every
     (ikey, ival) item.
     """
     table = UniTable()
     pairs = list(self.iter_ikv())

     # the leading '' lines up with the cfunc(tval) cell of each column
     fields = ['']
     values = ['']
     for item in pairs:
         fields.append(item[0])
         values.append(item[1])
     table['__fld__'] = fields
     table['__val__'] = values

     for tval in self.all_tval():
         column = [cfunc(tval)]
         column.extend([ccfunc(tval, ikey, ival) for (ikey, ival) in pairs])
         table[str(tval)] = column
     return str(table)
Пример #11
0
 def flush(self):
   """Write self.data to self.filename through UniTable.to_csv_file.

   Nothing is written when self.filename is not set or when self.data
   holds at most one entry.  Otherwise the mapping is stored as the two
   columns 'index' (the mapped numbers) and 'data' (the keys), sorted on
   'index'.
   """
   if not self.filename or len(self.data) <= 1:
     return
   from augustus.unitable import UniTable
   table = UniTable(keys=['index','data'])
   indices = []
   values = []
   for value,index in self.data.items():
     indices.append(index)
     values.append(value)
   table['index'] = indices
   table['data'] = values
   table.sort_on('index')
   table.to_csv_file(self.filename)
Пример #12
0
 def handle_arg(self, opt, arg):
     tbl = UniTable().fromfile(arg)
     if opt.select:
         tbl = self.handle_select(opt, tbl)
     tbl = self.handle_counttable(opt, arg, tbl)
     print tbl.export().to_csv_str()
Пример #13
0
 def handle_arg(self,opt,arg):
   tbl = UniTable().fromfile(arg)
   if opt.select:
     tbl = self.handle_select(opt,tbl)
   tbl = self.handle_counttable(opt,arg,tbl)
   print tbl.export().to_csv_str()
Пример #14
0
 def __init__(self,nullmodel,altmodel,threshold,reset_value=0.0):
   """Store the two models, the threshold and the reset value.

   nullmodel, altmodel -- kept as attributes of the same names.
   threshold           -- kept as self.threshold.
   reset_value         -- kept as self.reset_value (default 0.0).

   self._state starts out as an empty UniTable.
   """
   self.nullmodel, self.altmodel = nullmodel, altmodel
   self.threshold, self.reset_value = threshold, reset_value
   self._state = UniTable()