示例#1
0
文件: gini.py 项目: soedjais/augustus
 def groupby3(arg):
     """Return ``1 - sum(p**2)`` over the value frequencies of *arg*.

     Run lengths of equal values in the sorted input are counted with
     ``groupby``; the fractions are squared as one vector operation.
     """
     values = as_num_array(arg)
     total = float(len(values))
     run_sizes = [len(list(run)) for _, run in groupby(sort(values))]
     fractions = as_num_array(run_sizes) / total
     # square in place, then subtract the total from one
     fractions *= fractions
     return 1.0 - fractions.sum()
示例#2
0
 def isort(dep,indep,cutpoints=None,**kwargs):
   """Evaluate ``gini2_counts`` for every cutpoint after sorting by *indep*.

   dep       -- dependent values; converted with ``as_num_array``
   indep     -- independent values; converted with ``as_num_array``
   cutpoints -- cut values; ``midpoints_integer(indep)`` when None
   kwargs    -- accepted but not used in this body

   Returns a list with one ``gini2_counts`` result per cutpoint, or an
   empty list when there are no cutpoints.  Two asserts check that the
   number of slice boundaries and results matches the number of cutpoints.
   """
   dep = as_num_array(dep)
   indep = as_num_array(indep)
   if cutpoints is None:
     cutpoints = midpoints_integer(indep)
   if not len(cutpoints):
     return []
   # sort both vectors by *indep*
   idx = argsort(indep)
   dep = take(dep,idx)
   indep = take(indep,idx)
   # cutidx holds slice boundaries into the sorted vectors; consecutive
   # pairs (cutidx[i], cutidx[i+1]) are consumed by the counting loop below
   cutidx = [0,0]
   for ival,isub in it.groupby(indep):
     # ilen is the number of rows sharing this indep value
     ilen = len(list(isub))
     # cutpoints[len(cutidx)-2] is the cutpoint whose slice is being filled
     if ival < cutpoints[len(cutidx)-2]:
       # still below the current cutpoint: extend the current slice
       cutidx[-1] += ilen
     else:
       # stop once there is already one boundary pair per cutpoint
       if len(cutidx) > len(cutpoints):
         break
       # open the next slice, starting where the previous one ended
       cutidx.append(cutidx[-1]+ilen)
   assert len(cutidx)-1 == len(cutpoints), '%s != %s' % (len(cutidx)-1,len(cutpoints))
   out = []
   # cnt2 starts with the counts of all of dep, cnt1 with zero for each key
   cnt2 = dict(histo_tuple(dep))
   cnt1 = dict.fromkeys(cnt2.keys(),0)
   for i1,i2 in izip(cutidx[:-1],cutidx[1:]):
     # update the counts from the last cut: move the rows of this slice
     # from cnt2 over to cnt1
     for d,cnt in histo_tuple(dep[i1:i2]):
       cnt1[d] += cnt
       cnt2[d] -= cnt
     # calculate results based on counts (zero counts are dropped)
     a1 = as_num_array([val for val in cnt1.itervalues() if val != 0])
     a2 = as_num_array([val for val in cnt2.itervalues() if val != 0])
     out.append(gini2_counts(a1,a2))
   assert len(out) == len(cutpoints), '%s != %s' % (len(out),len(cutpoints))
   return out
示例#3
0
文件: gini.py 项目: soedjais/augustus
 def vector1(arg1, arg2):
     """Return the sum of ``gini_counts`` of each vector weighted by its share.

     Each vector's weight is its own sum divided by the sum of both vectors.
     """
     left = as_num_array(arg1)
     right = as_num_array(arg2)
     left_total = left.sum()
     right_total = right.sum()
     grand_total = float(left_total + right_total)
     left_term = gini_counts(left) * left_total / grand_total
     right_term = gini_counts(right) * right_total / grand_total
     return left_term + right_term
示例#4
0
 def missing1(arg,delta=1,first=False):
   """For each value in *arg*, look up the run index recorded for value+delta.

   arg   -- indexable sequence of values; runs of consecutive equal values
            are grouped (assumes arg is sorted ascending -- TODO confirm)
   delta -- offset added to each value before the lookup
   first -- when true the recorded index is the start of a run, otherwise
            the index of its last element

   Returns a list with one index per element of *arg*.
   """
   # build answer lookup mapping each arg value to the first (or last)
   # index of its run
   run_lens = [(k,len(list(g))) for k,g in groupby(arg)]
   keys = as_num_array([k for k,l in run_lens])
   lens = as_num_array([l for k,l in run_lens])
   # ends[i] is one past the last index of run i; starts[i] is its first index
   ends = cumsum(lens)
   starts = ends - lens
   if first:
     answer = dict(izip(keys,starts))
   else:
     answer = dict(izip(keys,ends-1))
   # identify missing keys: values of key+delta that have no run of their own
   need = keys + delta
   needset = set(need)
   haveset = set(answer)
   fillset = needset.difference(haveset)
   fill = as_num_array(sorted(fillset))
   # first and last element of arg, used as fallback and starting key below
   minkey,maxkey = arg[0],arg[-1]
   # walk existing keys and missing keys together, both in reverse order
   have_iter = iter(keys[-1::-1])
   fill_iter = iter(fill[-1::-1])
   thiskey = maxkey
   thisval = answer[thiskey]
   for fillkey in fill_iter:
     if thiskey >= fillkey:
       try:
         # advance to the next existing key that is below fillkey
         thiskey = dropwhile(lambda x:x>=fillkey,have_iter).next()
       except StopIteration:
         # no existing key below fillkey: fall back to the first element
         thiskey = minkey
       thisval = answer[thiskey]
     # the missing key takes over the index of the key found above
     answer[fillkey] = thisval
   out = [answer[val+delta] for val in arg]
   return out
示例#5
0
 def vector1(arg1,arg2):
   """Return the sum of ``gini_counts`` of each vector weighted by its share.

   Each vector's weight is its own sum divided by the sum of both vectors.
   """
   left = as_num_array(arg1)
   right = as_num_array(arg2)
   left_total = left.sum()
   right_total = right.sum()
   grand_total = float(left_total+right_total)
   left_term = gini_counts(left)*left_total/grand_total
   right_term = gini_counts(right)*right_total/grand_total
   return left_term + right_term
示例#6
0
 def deltacnt1(dep,indep,cutpoints=None,**kwargs):
   """Evaluate ``gini2_counts`` per cutpoint using incrementally updated counts.

   Two dictionaries of per-value counts are kept.  For every cut, only the
   rows that were not below any earlier cut are moved from the second
   dictionary to the first.  Returns one result per cutpoint.
   """
   dep = as_num_array(dep)
   indep = as_num_array(indep)
   if cutpoints is None:
     cutpoints = midpoints_integer(indep)
   # the second side starts with every row, the first side with none
   right = dict(histo_tuple(dep))
   left = dict.fromkeys(right.keys(),0)
   # rows already moved; starts as the elementwise result of indep != indep
   seen = (indep != indep)
   results = []
   for cut in cutpoints:
     below = indep < cut
     # rows below this cut that no earlier cut has claimed
     fresh = below & ~seen
     seen |= below
     for label in dep[nonzero(fresh)[0]]:
       left[label] += 1
       right[label] -= 1
     # only nonzero counts are handed to gini2_counts
     left_counts = as_num_array([c for c in left.itervalues() if c != 0])
     right_counts = as_num_array([c for c in right.itervalues() if c != 0])
     results.append(gini2_counts(left_counts,right_counts))
   return results
示例#7
0
 def deltacnt1(dep, indep, cutpoints=None, **kwargs):
     """Evaluate ``gini2_counts`` per cutpoint using incrementally updated counts.

     Two dictionaries of per-value counts are kept.  For every cut, only the
     rows that were not below any earlier cut are moved from the second
     dictionary to the first.  Returns one result per cutpoint.
     """
     dep = as_num_array(dep)
     indep = as_num_array(indep)
     if cutpoints is None:
         cutpoints = midpoints_integer(indep)
     # the second side starts with every row, the first side with none
     right = dict(histo_tuple(dep))
     left = dict.fromkeys(right.keys(), 0)
     # rows already moved; starts as the elementwise result of indep != indep
     seen = (indep != indep)
     results = []
     for cut in cutpoints:
         below = indep < cut
         # rows below this cut that no earlier cut has claimed
         fresh = below & ~seen
         seen |= below
         for label in dep[nonzero(fresh)[0]]:
             left[label] += 1
             right[label] -= 1
         # only nonzero counts are handed to gini2_counts
         left_counts = as_num_array([c for c in left.itervalues() if c != 0])
         right_counts = as_num_array([c for c in right.itervalues() if c != 0])
         results.append(gini2_counts(left_counts, right_counts))
     return results
示例#8
0
 def groupby3(arg):
   """Return ``1 - sum(p**2)`` over the value frequencies of *arg*.

   Run lengths of equal values in the sorted input are counted with
   ``groupby``; the fractions are squared as one vector operation.
   """
   values = as_num_array(arg)
   total = float(len(values))
   run_sizes = [len(list(run)) for _,run in groupby(sort(values))]
   fractions = as_num_array(run_sizes)/total
   # square in place, then subtract the total from one
   fractions *= fractions
   return 1.0 - fractions.sum()
示例#9
0
 def naive2(arg,sel=None,step=1):
   """Return max minus min of each window ``arg[j+1:k+1]``.

   The window bounds j and k are index pairs taken *step* apart from the
   result of ``arg_sel_step_to_idx``.
   """
   arg = as_num_array(arg)
   idx = arg_sel_step_to_idx(arg,sel,step)
   windows = [arg[lo+1:hi+1] for lo,hi in izip(idx[:-step],idx[step:])]
   lows = [w.min() for w in windows]
   highs = [w.max() for w in windows]
   return as_num_array(highs) - lows
示例#10
0
 def naive2(arg, sel=None, step=1):
     """Return max minus min of each window ``arg[j+1:k+1]``.

     The window bounds j and k are index pairs taken *step* apart from the
     result of ``arg_sel_step_to_idx``.
     """
     arg = as_num_array(arg)
     idx = arg_sel_step_to_idx(arg, sel, step)
     windows = [arg[lo + 1:hi + 1] for lo, hi in izip(idx[:-step], idx[step:])]
     lows = [w.min() for w in windows]
     highs = [w.max() for w in windows]
     return as_num_array(highs) - lows
示例#11
0
 def smart(dep,indep,cutpoints=None,**kwargs):
   """Dispatch to ``naive`` for fewer than 100 rows, otherwise to ``isort``.

   Both implementations are looked up on ``PartitionIntegerGini`` and are
   called with the converted vectors and the resolved cutpoints.
   """
   # not needed unless high penalty for small datasets
   dep = as_num_array(dep)
   indep = as_num_array(indep)
   if cutpoints is None:
     cutpoints = midpoints_integer(indep)
   if len(dep) < 100:
     impl = PartitionIntegerGini.naive
   else:
     impl = PartitionIntegerGini.isort
   return impl(dep,indep,cutpoints=cutpoints,**kwargs)
示例#12
0
 def fast(arg1, arg2, reset_value=0.0, out=None):
     """Run ``CusumReset().iterfunc`` over ``log(arg2 / arg1)``.

     arg1, arg2  -- inputs converted with ``as_num_array``
     reset_value -- forwarded to the cusum function
     out         -- optional output array; allocated with ``arg1.new()``
                    when None

     Returns *out*.
     """
     arg1 = as_num_array(arg1)
     arg2 = as_num_array(arg2)
     # Test identity with None: truth-testing an array is ambiguous (or an
     # error) and would discard a caller's empty or all-zero buffer.
     if out is None:
         out = arg1.new()
     cusum_func = CusumReset().iterfunc
     log_odds = log(arg2 / arg1)
     cusum_func(log_odds, reset_value=reset_value, out=out)
     return out
示例#13
0
 def fast(arg1, arg2, reset_value=0.0, out=None):
     """Run ``CusumReset().iterfunc`` over ``log(arg2 / arg1)``.

     arg1, arg2  -- inputs converted with ``as_num_array``
     reset_value -- forwarded to the cusum function
     out         -- optional output array; allocated with ``arg1.new()``
                    when None

     Returns *out*.
     """
     arg1 = as_num_array(arg1)
     arg2 = as_num_array(arg2)
     # Test identity with None: truth-testing an array is ambiguous (or an
     # error) and would discard a caller's empty or all-zero buffer.
     if out is None:
         out = arg1.new()
     cusum_func = CusumReset().iterfunc
     log_odds = log(arg2 / arg1)
     cusum_func(log_odds, reset_value=reset_value, out=out)
     return out
示例#14
0
 def naive1(arg,sel=None,step=1):
   """Return max minus min of each non-empty window ``arg[j+1:k+1]``.

   Index pairs with j == k and windows of length zero are skipped.
   """
   arg = as_num_array(arg)
   idx = arg_sel_step_to_idx(arg,sel,step)
   starts = idx[:-step]
   stops = idx[step:]
   windows = []
   for lo,hi in izip(starts,stops):
     if lo == hi:
       continue
     window = arg[lo+1:hi+1]
     if len(window):
       windows.append(window)
   lows = [w.min() for w in windows]
   highs = [w.max() for w in windows]
   return as_num_array(highs) - lows
示例#15
0
 def naive1(arg, sel=None, step=1):
     """Return max minus min of each non-empty window ``arg[j+1:k+1]``.

     Index pairs with j == k and windows of length zero are skipped.
     """
     arg = as_num_array(arg)
     idx = arg_sel_step_to_idx(arg, sel, step)
     starts = idx[:-step]
     stops = idx[step:]
     windows = []
     for lo, hi in izip(starts, stops):
         if lo == hi:
             continue
         window = arg[lo + 1:hi + 1]
         if len(window):
             windows.append(window)
     lows = [w.min() for w in windows]
     highs = [w.max() for w in windows]
     return as_num_array(highs) - lows
示例#16
0
 def naive2(arg,sel=None,step=1,func=sum):
   """Apply *func* to each window ``arg[j+1:k+1]`` and return the results.

   The bounds j and k are paired from the index vector and the same
   vector shifted by *step*.
   """
   arg = as_num_array(arg)
   idx = arg_sel_step_to_idx(arg,sel,step)
   starts = idx[:-step]
   stops = idx[step:]
   return as_num_array([func(arg[lo+1:hi+1]) for lo,hi in izip(starts,stops)])
示例#17
0
 def naive1(arg,sel=None,step=1,func=sum):
   """Apply *func* to each window ``arg[j+1:k+1]`` and return the results.

   Windows are addressed by position: j is ``idx[i]`` and k is
   ``idx[i+step]`` for every valid i.
   """
   arg = as_num_array(arg)
   idx = arg_sel_step_to_idx(arg,sel,step)
   npairs = len(idx)-step
   results = [func(arg[idx[i]+1:idx[i+step]+1]) for i in xrange(npairs)]
   return as_num_array(results)
示例#18
0
 def naive1(arg, sel=None, step=1, func=sum):
     """Apply *func* to each window ``arg[j+1:k+1]`` and return the results.

     Windows are addressed by position: j is ``idx[i]`` and k is
     ``idx[i + step]`` for every valid i.
     """
     arg = as_num_array(arg)
     idx = arg_sel_step_to_idx(arg, sel, step)
     npairs = len(idx) - step
     results = [func(arg[idx[i] + 1:idx[i + step] + 1]) for i in xrange(npairs)]
     return as_num_array(results)
示例#19
0
 def naive2(arg, sel=None, step=1, func=sum):
     """Apply *func* to each window ``arg[j+1:k+1]`` and return the results.

     The bounds j and k are paired from the index vector and the same
     vector shifted by *step*.
     """
     arg = as_num_array(arg)
     idx = arg_sel_step_to_idx(arg, sel, step)
     starts = idx[:-step]
     stops = idx[step:]
     return as_num_array([func(arg[lo + 1:hi + 1]) for lo, hi in izip(starts, stops)])
示例#20
0
 def masksel(dep,indep,cutpoints=None,**kwargs):
   """Evaluate ``gini2`` per cutpoint by splitting *dep* with a boolean mask.

   For each cut, rows where ``indep < cut`` form the first argument and
   the remaining rows the second.  Returns one result per cutpoint.
   """
   dep = as_num_array(dep)
   indep = as_num_array(indep)
   if cutpoints is None:
     cutpoints = midpoints_integer(indep)
   results = []
   for cut in cutpoints:
     below = indep < cut
     results.append(gini2(dep[below],dep[~below]))
   return results
示例#21
0
 def masksel(dep, indep, cutpoints=None, **kwargs):
     """Evaluate ``gini2`` per cutpoint by splitting *dep* with a boolean mask.

     For each cut, rows where ``indep < cut`` form the first argument and
     the remaining rows the second.  Returns one result per cutpoint.
     """
     dep = as_num_array(dep)
     indep = as_num_array(indep)
     if cutpoints is None:
         cutpoints = midpoints_integer(indep)
     results = []
     for cut in cutpoints:
         below = indep < cut
         results.append(gini2(dep[below], dep[~below]))
     return results
示例#22
0
 def idxsel(dep, indep, cutpoints=None, **kwargs):
     """Evaluate ``gini2`` per cutpoint by splitting *dep* with index vectors.

     For each cut the mask ``indep < cut`` and its complement are turned
     into index vectors via ``nonzero`` and used to select from *dep*.
     Returns one result per cutpoint.
     """
     dep = as_num_array(dep)
     indep = as_num_array(indep)
     if cutpoints is None:
         cutpoints = midpoints_integer(indep)
     results = []
     for cut in cutpoints:
         below = indep < cut
         lower_idx = nonzero(below)[0]
         upper_idx = nonzero(~below)[0]
         results.append(gini2(dep[lower_idx], dep[upper_idx]))
     return results
示例#23
0
 def idxsel(dep,indep,cutpoints=None,**kwargs):
   """Evaluate ``gini2`` per cutpoint by splitting *dep* with index vectors.

   For each cut the mask ``indep < cut`` and its complement are turned
   into index vectors via ``nonzero`` and used to select from *dep*.
   Returns one result per cutpoint.
   """
   dep = as_num_array(dep)
   indep = as_num_array(indep)
   if cutpoints is None:
     cutpoints = midpoints_integer(indep)
   results = []
   for cut in cutpoints:
     below = indep < cut
     lower_idx = nonzero(below)[0]
     upper_idx = nonzero(~below)[0]
     results.append(gini2(dep[lower_idx],dep[upper_idx]))
   return results
示例#24
0
 def smart(dep, indep, cutpoints=None, **kwargs):
     """Dispatch to ``naive`` for fewer than 100 rows, otherwise to ``isort``.

     Both implementations are looked up on ``PartitionIntegerGini`` and are
     called with the converted vectors and the resolved cutpoints.
     """
     # not needed unless high penalty for small datasets
     dep = as_num_array(dep)
     indep = as_num_array(indep)
     if cutpoints is None:
         cutpoints = midpoints_integer(indep)
     if len(dep) < 100:
         impl = PartitionIntegerGini.naive
     else:
         impl = PartitionIntegerGini.isort
     return impl(dep, indep, cutpoints=cutpoints, **kwargs)
示例#25
0
 def naive_comp(arg,out=None):
   """Write first differences ``arg[i]-arg[i-1]`` into ``out[1:]``.

   arg -- input converted with ``as_num_array``
   out -- optional output array; when None a new one is allocated with
          ``arg.new()`` and its first element is set to 0

   Returns *out*.
   """
   arg = as_num_array(arg)
   # Test identity with None: truth-testing an array is ambiguous (or an
   # error) and would discard a caller's empty or all-zero buffer.
   if out is None:
     out = arg.new()
     out[0] = 0
   out[1:] = [arg[i]-arg[i-1] for i in xrange(1,len(arg))]
   return out
示例#26
0
 def fast(arg,out=None):
   """Write first differences of *arg* into ``out[1:]`` with one ``subtract``.

   arg -- input converted with ``as_num_array``
   out -- optional output array; when None a new one is allocated with
          ``arg.new()`` and its first element is set to 0

   Returns *out*.
   """
   arg = as_num_array(arg)
   # Test identity with None: truth-testing an array is ambiguous (or an
   # error) and would discard a caller's empty or all-zero buffer.
   if out is None:
     out = arg.new()
     out[0] = 0
   # the third argument is the destination slice
   subtract(arg[1:],arg[:-1],out[1:])
   return out
示例#27
0
 def naive_comp(arg,out=None):
   """Write first differences ``arg[i]-arg[i-1]`` into ``out[1:]``.

   arg -- input converted with ``as_num_array``
   out -- optional output array; when None a new one is allocated with
          ``arg.new()`` and its first element is set to 0

   Returns *out*.
   """
   arg = as_num_array(arg)
   # Test identity with None: truth-testing an array is ambiguous (or an
   # error) and would discard a caller's empty or all-zero buffer.
   if out is None:
     out = arg.new()
     out[0] = 0
   out[1:] = [arg[i]-arg[i-1] for i in xrange(1,len(arg))]
   return out
示例#28
0
 def fast(arg,out=None):
   """Write first differences of *arg* into ``out[1:]`` with one ``subtract``.

   arg -- input converted with ``as_num_array``
   out -- optional output array; when None a new one is allocated with
          ``arg.new()`` and its first element is set to 0

   Returns *out*.
   """
   arg = as_num_array(arg)
   # Test identity with None: truth-testing an array is ambiguous (or an
   # error) and would discard a caller's empty or all-zero buffer.
   if out is None:
     out = arg.new()
     out[0] = 0
   # the third argument is the destination slice
   subtract(arg[1:],arg[:-1],out[1:])
   return out
示例#29
0
 def simple1(arg1,arg2):
   """Return ``Gini()`` of the combined data minus ``Gini2()`` of the two parts.

   Returns 0.0 when either input has length zero.
   """
   whole_gini = Gini()
   split_gini = Gini2()
   # convert only the non-empty inputs
   parts = []
   for raw in (arg1,arg2):
     if len(raw):
       parts.append(as_num_array(raw))
   if len(parts) != 2:
     return 0.0
   return whole_gini(concatenate(parts)) - split_gini(*parts)
示例#30
0
文件: gini.py 项目: soedjais/augustus
 def simple1(arg1, arg2):
     """Return ``Gini()`` of the combined data minus ``Gini2()`` of the two parts.

     Returns 0.0 when either input has length zero.
     """
     whole_gini = Gini()
     split_gini = Gini2()
     # convert only the non-empty inputs
     parts = []
     for raw in (arg1, arg2):
         if len(raw):
             parts.append(as_num_array(raw))
     if len(parts) != 2:
         return 0.0
     return whole_gini(concatenate(parts)) - split_gini(*parts)
示例#31
0
 def naive(arg,lower=None,upper=None):
   """Limit *arg* from below with ``maximum`` and from above with ``minimum``.

   A bound that is None is not applied.  The lower bound is applied first.
   """
   result = as_num_array(arg)
   for bound,limiter in ((lower,maximum),(upper,minimum)):
     if bound is not None:
       result = limiter(result,bound)
   return result
示例#32
0
 def naive_loop(arg,out=None):
   """Write first differences ``arg[i]-arg[i-1]`` into *out*, one at a time.

   arg -- input converted with ``as_num_array``
   out -- optional output array; when None a new one is allocated with
          ``arg.new()`` and its first element is set to 0

   Returns *out*.
   """
   arg = as_num_array(arg)
   # Test identity with None: truth-testing an array is ambiguous (or an
   # error) and would discard a caller's empty or all-zero buffer.
   if out is None:
     out = arg.new()
     out[0] = 0
   for i in xrange(1,len(arg)):
     out[i] = arg[i] - arg[i-1]
   return out
示例#33
0
 def iterloop(arg, reset_value=0.0, out=None):
     """Running sum of *arg* that is never allowed below *reset_value*.

     Each output element is ``max(reset_value, previous_output + value)``,
     with the previous output taken as 0.0 for the first element.

     arg         -- input converted with ``as_num_array``
     reset_value -- floor applied after every addition
     out         -- optional output array; allocated with ``arg.new()``
                    when None

     Returns *out*.
     """
     arg = as_num_array(arg)
     # Test identity with None: truth-testing an array is ambiguous (or an
     # error) and would discard a caller's empty or all-zero buffer.
     if out is None:
         out = arg.new()
     last = 0.0
     for i, value in enumerate(arg):
         # carry the running value forward; without this update every
         # element would just be max(reset_value, value)
         last = max(reset_value, last + value)
         out[i] = last
     return out
示例#34
0
 def _prep_testdata(self,*args,**kwargs):
   out = []
   for arg in args:
     enum = {}
     for val in arg:
       enum[val] = 1 + enum.setdefault(val,0)
     out.append(as_num_array(enum.values()))
   return out
示例#35
0
 def naive_loop(arg,out=None):
   """Write first differences ``arg[i]-arg[i-1]`` into *out*, one at a time.

   arg -- input converted with ``as_num_array``
   out -- optional output array; when None a new one is allocated with
          ``arg.new()`` and its first element is set to 0

   Returns *out*.
   """
   arg = as_num_array(arg)
   # Test identity with None: truth-testing an array is ambiguous (or an
   # error) and would discard a caller's empty or all-zero buffer.
   if out is None:
     out = arg.new()
     out[0] = 0
   for i in xrange(1,len(arg)):
     out[i] = arg[i] - arg[i-1]
   return out
示例#36
0
 def iterloop(arg, reset_value=0.0, out=None):
     """Running sum of *arg* that is never allowed below *reset_value*.

     Each output element is ``max(reset_value, previous_output + value)``,
     with the previous output taken as 0.0 for the first element.

     arg         -- input converted with ``as_num_array``
     reset_value -- floor applied after every addition
     out         -- optional output array; allocated with ``arg.new()``
                    when None

     Returns *out*.
     """
     arg = as_num_array(arg)
     # Test identity with None: truth-testing an array is ambiguous (or an
     # error) and would discard a caller's empty or all-zero buffer.
     if out is None:
         out = arg.new()
     last = 0.0
     for i, value in enumerate(arg):
         # carry the running value forward; without this update every
         # element would just be max(reset_value, value)
         last = max(reset_value, last + value)
         out[i] = last
     return out
示例#37
0
 def groupby2(arg):
   """Return ``1 - sum(p**2)`` over the value frequencies of *arg*.

   Frequencies come from run lengths in the sorted input; the squares are
   subtracted from 1.0 one at a time.
   """
   arg = as_num_array(arg)
   total = float(len(arg))
   impurity = 1.0
   for _,run in groupby(sorted(arg)):
     frac = len(list(run))/total
     impurity -= frac * frac
   return impurity
示例#38
0
文件: gini.py 项目: soedjais/augustus
 def groupby2(arg):
     """Return ``1 - sum(p**2)`` over the value frequencies of *arg*.

     Frequencies come from run lengths in the sorted input; the squares are
     subtracted from 1.0 one at a time.
     """
     arg = as_num_array(arg)
     total = float(len(arg))
     impurity = 1.0
     for _, run in groupby(sorted(arg)):
         frac = len(list(run)) / total
         impurity -= frac * frac
     return impurity
示例#39
0
文件: gini.py 项目: soedjais/augustus
 def _prep_testdata(self, *args, **kwargs):
     out = []
     for arg in args:
         enum = {}
         for val in arg:
             enum[val] = 1 + enum.setdefault(val, 0)
         out.append(as_num_array(enum.values()))
     return out
示例#40
0
 def fast(arg,boundaries=[0,100,1000],values=None):
   """Locate each element of *arg* among *boundaries* with ``searchsorted``.

   Returns the bin indices when *values* is None; otherwise returns the
   entries of *values* selected by those indices.  *values* must have one
   more entry than *boundaries*.
   """
   assert len(boundaries), "at least one boundary is required"
   have_values = values is not None
   if have_values:
     assert len(boundaries)+1 == len(values), "len(values) must be len(boundaries)+1, (%s,%s)" % (len(values),len(boundaries))
   bins = searchsorted(boundaries,arg)
   if have_values:
     return as_num_array(values).take(bins)
   return bins
示例#41
0
 def presort(dep,indep,cutpoints=None,dep_sorted=False):
   """Evaluate ``gini2_presorted`` per cutpoint on vectors ordered by *dep*.

   Unless *dep_sorted* is true, both vectors are first reordered by the
   argsort of *dep*.  Each cut splits *dep* by ``indep < cut``.
   Returns one result per cutpoint.
   """
   dep = as_num_array(dep)
   indep = as_num_array(indep)
   if not dep_sorted:
     order = argsort(dep)
     dep = take(dep,order)
     indep = take(indep,order)
   if cutpoints is None:
     cutpoints = midpoints_integer(indep)
   results = []
   for cut in cutpoints:
     below = indep < cut
     lower_idx = nonzero(below)[0]
     upper_idx = nonzero(~below)[0]
     results.append(gini2_presorted(dep[lower_idx],dep[upper_idx]))
   return results
示例#42
0
 def presort(dep, indep, cutpoints=None, dep_sorted=False):
     """Evaluate ``gini2_presorted`` per cutpoint on vectors ordered by *dep*.

     Unless *dep_sorted* is true, both vectors are first reordered by the
     argsort of *dep*.  Each cut splits *dep* by ``indep < cut``.
     Returns one result per cutpoint.
     """
     dep = as_num_array(dep)
     indep = as_num_array(indep)
     if not dep_sorted:
         order = argsort(dep)
         dep = take(dep, order)
         indep = take(indep, order)
     if cutpoints is None:
         cutpoints = midpoints_integer(indep)
     results = []
     for cut in cutpoints:
         below = indep < cut
         lower_idx = nonzero(below)[0]
         upper_idx = nonzero(~below)[0]
         results.append(gini2_presorted(dep[lower_idx], dep[upper_idx]))
     return results
示例#43
0
 def naive_loop(arg,first=False):
   """Flag the boundary element of every run of equal neighbours in *arg*.

   With *first* true the leading flag is always 1 and every later element
   is flagged when it differs from its neighbour; otherwise the trailing
   flag is always 1.  The flags are returned as a 'Bool' array.
   """
   arg = as_num_array(arg)
   if first:
     # fixed leading 1, then one flag per adjacent pair
     flags = [1]
     flags.extend([1 if arg[i] != arg[i+1] else 0 for i in xrange(len(arg)-1)])
   else:
     # one flag per adjacent pair, then a fixed trailing 1
     flags = [1 if arg[i] != arg[i-1] else 0 for i in xrange(1,len(arg))]
     flags.append(1)
   return as_num_array(flags,type='Bool')
示例#44
0
 def groupby1(arg):
   """Return ``1 - sum(p**2)`` over the value frequencies of *arg*.

   A (value, count) histogram is built from runs in the sorted input and
   each squared frequency is subtracted from 1.0 in turn.
   """
   arg = as_num_array(arg)
   counts = []
   for key,run in groupby(sorted(arg)):
     counts.append((key,len(list(run))))
   total = float(len(arg))
   impurity = 1.0
   for _,count in counts:
     frac = count/total
     impurity -= frac * frac
   return impurity
示例#45
0
 def gsl(arg, mean=0.0, variance=0.0, out=None):
     """Fill *out* with ``gaussian_pdf`` evaluated on *arg* shifted by *mean*.

     arg      -- input converted with ``as_num_array``
     mean     -- subtracted from *arg* before evaluation
     variance -- forwarded unchanged as the second ``gaussian_pdf`` argument
                 (NOTE(review): whether ``gaussian_pdf`` expects a variance
                 or a standard deviation is not visible here -- confirm)
     out      -- optional output array; allocated with ``arg.new()`` when
                 None

     Returns *out*.
     """
     arg = as_num_array(arg)
     # Test identity with None: truth-testing an array is ambiguous (or an
     # error) and would discard a caller's empty or all-zero buffer.
     if out is None:
         out = arg.new()
     if mean == 0.0:
         # nothing to shift, skip the subtraction
         out[:] = gaussian_pdf(arg, variance)
     else:
         # centre the data; the branches used to be swapped, so a nonzero
         # mean was silently ignored
         out[:] = gaussian_pdf(arg - mean, variance)
     return out
示例#46
0
 def gsl(arg,mean=0.0,variance=0.0,out=None):
   """Fill *out* with ``gaussian_pdf`` evaluated on *arg* shifted by *mean*.

   arg      -- input converted with ``as_num_array``
   mean     -- subtracted from *arg* before evaluation
   variance -- forwarded unchanged as the second ``gaussian_pdf`` argument
               (NOTE(review): whether ``gaussian_pdf`` expects a variance
               or a standard deviation is not visible here -- confirm)
   out      -- optional output array; allocated with ``arg.new()`` when
               None

   Returns *out*.
   """
   arg = as_num_array(arg)
   # Test identity with None: truth-testing an array is ambiguous (or an
   # error) and would discard a caller's empty or all-zero buffer.
   if out is None:
     out = arg.new()
   if mean == 0.0:
     # nothing to shift, skip the subtraction
     out[:] = gaussian_pdf(arg,variance)
   else:
     # centre the data; the branches used to be swapped, so a nonzero
     # mean was silently ignored
     out[:] = gaussian_pdf(arg-mean,variance)
   return out
示例#47
0
文件: gini.py 项目: soedjais/augustus
 def groupby1(arg):
     """Return ``1 - sum(p**2)`` over the value frequencies of *arg*.

     A (value, count) histogram is built from runs in the sorted input and
     each squared frequency is subtracted from 1.0 in turn.
     """
     arg = as_num_array(arg)
     counts = []
     for key, run in groupby(sorted(arg)):
         counts.append((key, len(list(run))))
     total = float(len(arg))
     impurity = 1.0
     for _, count in counts:
         frac = count / total
         impurity -= frac * frac
     return impurity
示例#48
0
 def naive_loop(arg, first=False):
     """Flag the boundary element of every run of equal neighbours in *arg*.

     With *first* true the leading flag is always 1 and every later element
     is flagged when it differs from its neighbour; otherwise the trailing
     flag is always 1.  The flags are returned as a 'Bool' array.
     """
     arg = as_num_array(arg)
     if first:
         # fixed leading 1, then one flag per adjacent pair
         flags = [1]
         flags.extend([1 if arg[i] != arg[i + 1] else 0
                       for i in xrange(len(arg) - 1)])
     else:
         # one flag per adjacent pair, then a fixed trailing 1
         flags = [1 if arg[i] != arg[i - 1] else 0
                  for i in xrange(1, len(arg))]
         flags.append(1)
     return as_num_array(flags, type='Bool')
示例#49
0
 def naive_loop(arg,out=None):
   """Replace each zero in *arg* with the most recent nonzero value.

   Zeros before the first nonzero element stay 0.

   arg -- input converted with ``as_num_array``
   out -- optional output array; allocated with ``arg.new()`` when None

   Returns *out*.
   """
   arg = as_num_array(arg)
   # Test identity with None: truth-testing an array is ambiguous (or an
   # error) and would discard a caller's empty or all-zero buffer.
   if out is None:
     out = arg.new()
   last = 0
   for i,value in enumerate(arg):
     if value != 0:
       last = value
     out[i] = last
   return out
示例#50
0
 def fast(arg,first=False):
   """Flag run boundaries in *arg* with vector operations.

   Starts from an all-ones 'Bool' vector and subtracts the positions whose
   neighbour is equal: positions 1.. when *first* is true, positions ..-1
   otherwise.
   """
   arg = as_num_array(arg)
   keep = ones(len(arg),type='Bool')
   if first:
     # an element equal to its predecessor is not the first of its run
     keep[1:] -= (arg[1:] == arg[:-1])
   else:
     # an element equal to its successor is not the last of its run
     keep[:-1] -= (arg[:-1] == arg[1:])
   return keep
示例#51
0
 def naive_loop(arg,out=None):
   """Replace each zero in *arg* with the most recent nonzero value.

   Zeros before the first nonzero element stay 0.

   arg -- input converted with ``as_num_array``
   out -- optional output array; allocated with ``arg.new()`` when None

   Returns *out*.
   """
   arg = as_num_array(arg)
   # Test identity with None: truth-testing an array is ambiguous (or an
   # error) and would discard a caller's empty or all-zero buffer.
   if out is None:
     out = arg.new()
   last = 0
   for i,value in enumerate(arg):
     if value != 0:
       last = value
     out[i] = last
   return out
示例#52
0
 def clip(arg,lower=None,upper=None):
   """Limit *arg* to the given bounds; a bound that is None is not applied.

   With both bounds present a single ``clip`` call is used, otherwise
   ``maximum`` or ``minimum`` handles the one bound that is given.
   """
   arg = as_num_array(arg)
   has_lower = lower is not None
   has_upper = upper is not None
   if has_lower and has_upper:
     # NOTE(review): this name is looked up outside the method, presumably
     # the array library's clip rather than this function -- confirm
     return clip(arg,lower,upper)
   if has_lower:
     return maximum(arg,lower)
   if has_upper:
     return minimum(arg,upper)
   return arg
示例#53
0
 def fast(arg, first=False):
     """Flag run boundaries in *arg* with vector operations.

     Starts from an all-ones 'Bool' vector and subtracts the positions whose
     neighbour is equal: positions 1.. when *first* is true, positions ..-1
     otherwise.
     """
     arg = as_num_array(arg)
     keep = ones(len(arg), type='Bool')
     if first:
         # an element equal to its predecessor is not the first of its run
         keep[1:] -= (arg[1:] == arg[:-1])
     else:
         # an element equal to its successor is not the last of its run
         keep[:-1] -= (arg[:-1] == arg[1:])
     return keep
示例#54
0
 def _prep_testdata(self, *args, **kwargs):
     # benchmark for inputs that are already vectors
     # simplification for tests: dep == indep
     out = [as_num_array(arg) for arg in args]
     if len(out) == 1:
         out.append(out[0].copy())
     if not kwargs.get('dep_sorted'):
         idx = argsort(out[0])
         out = [take(vec, idx) for vec in out]
         kwargs['dep_sorted'] = True
     return (out, kwargs)
示例#55
0
 def loop3(arg):
   """Return ``1 - sum(p**2)`` over the value frequencies of *arg*.

   Occurrences are tallied in a dictionary; each squared frequency is then
   subtracted from 1.0 in turn.
   """
   arg = as_num_array(arg)
   total = float(len(arg))
   tally = {}
   for item in arg:
     tally[item] = tally.get(item,0) + 1
   impurity = 1.0
   for count in tally.itervalues():
     # total is already a float, so this is a true division
     frac = count/total
     impurity -= frac * frac
   return impurity
示例#56
0
 def naive_iter(arg,out=None):
   """Replace each zero in *arg* with the most recent nonzero value.

   Zeros before the first nonzero element stay 0.

   arg -- input converted with ``as_num_array``
   out -- optional output array; allocated with ``arg.new()`` when None

   Returns *out*.
   """
   arg = as_num_array(arg)
   # Test identity with None: truth-testing an array is ambiguous (or an
   # error) and would discard a caller's empty or all-zero buffer.
   if out is None:
     out = arg.new()
   last = 0
   for i,value in enumerate(arg):
     if value != 0:
       last = value
     out[i] = last
   return out