def aov_padded(arows, feature_name, min_N_per_user=20, formula="%s ~ cat + participant"):
    """Build an R formula whose environment holds padded per-note data.

    The formula text is ``formula % feature_name``. Its environment is
    populated with three parallel R vectors: the feature values (bound to
    ``feature_name``), ``cat`` (a factor of categories) and ``participant``
    (a factor of owner ids).

    Owners that appear in fewer than ``min_N_per_user`` rows of ``arows`` are
    padded with randomly sampled notes of their own that are not already in
    ``arows`` and have non-blank contents; padded notes get the category
    ``'uncategorized'``.

    Args:
        arows: Re-iterable sequence of rows (it is traversed three times).
            ``row[0]`` is used as the note id, ``row[aci('primary')]`` as the
            category and ``row[aci('owner_id')]`` as the owner id.
        feature_name: Feature to look up / compute; also the variable name
            bound in the formula environment.
        min_N_per_user: Target minimum number of rows per owner.
        formula: ``%``-template for the formula; receives ``feature_name``.

    Returns:
        The ``ro.Formula`` object with its environment filled in.
    """
    fmla = ro.Formula(formula % feature_name)
    fmla.environment = ro.Environment()
    env = fmla.environment

    feats = r.c()
    owners = r.c()
    cats = r.c()
    for row in arows:
        rf = get_feature_for_note(row[0], feature_name)
        if rf is None:
            # No feature value: drop the row so the three vectors stay aligned.
            continue
        feats = r.c(feats, rf)
        cats = r.c(cats, row[aci('primary')])
        owners = r.c(owners, row[aci('owner_id')])

    # If we have fewer than N notes per person, we have less than a
    # representative sample, so we fill in with other notes from that owner.
    # NOTE: the count below includes rows that were skipped above for having
    # no feature value, exactly as before.
    notes_per_owner_in_arows = nltk.FreqDist([row[aci('owner_id')] for row in arows])
    note_ids = set(row[0] for row in arows)
    for owner_id, v in notes_per_owner_in_arows.iteritems():
        if v >= min_N_per_user:
            # Enough rows already; skip the database lookup entirely.
            continue

        owner = User.objects.filter(id=owner_id)[0]
        owned_notes_not_yet_chosen = [
            x for x in owner.note_owner.all()
            if x.id not in note_ids and len(x.contents.strip()) > 0
        ]
        # Clamp to what is available so random.sample can never over-ask.
        to_choose_k = min(len(owned_notes_not_yet_chosen), min_N_per_user - v)

        added = 0
        for chnote in random.sample(owned_notes_not_yet_chosen, to_choose_k):
            # NOTE(review): other call sites pass compute_feature_named a
            # .values() dict, here it gets a model instance -- confirm both
            # are supported.
            feat = nl.compute_feature_named(feature_name, chnote)
            if feat is None:
                # Mirror the first loop: never append a category/owner
                # without a matching feature value.
                continue
            feats = r.c(feats, feat)
            cats = r.c(cats, 'uncategorized')
            owners = r.c(owners, owner_id)
            added += 1
        print("adding  %s  to  %s" % (added, owner_id))

    env[feature_name] = feats
    env['cat'] = r('as.factor')(cats)
    env['participant'] = r('as.factor')(owners)
    print("feats %s" % (env[feature_name],))
    print("cat %s" % (env['cat'],))
    print("part %s" % (env['participant'],))
    return fmla
def compute_avg_for_overall_interesting(interesting_users, feature_name):
    """Print summary statistics of one feature over the given users' notes.

    Every note reachable through ``user.note_owner.all().values()`` is run
    through ``nl.compute_feature_named``; values that are ``None`` or
    negative are discarded. The remaining values are summarised as a list of
    ``(label, value)`` tuples (len, mean, median, min, max, stdev), which is
    printed. Nothing is returned.

    With no users, or no usable feature values, only the length entry is
    printed instead of raising.

    Args:
        interesting_users: Iterable of user objects exposing ``note_owner``.
        feature_name: Name of the feature passed to
            ``nl.compute_feature_named``.
    """
    # Flatten with extend: linear time, and safe when there are no users.
    interesting_notes = []
    for user in interesting_users:
        interesting_notes.extend(user.note_owner.all().values())

    def printstats(varr):
        if not varr:
            # min()/max() reject empty input, so report just the count.
            return [("len: ", 0)]
        return [
            ("len: ", len(varr)),
            ("mean: ", mean(varr)),
            ("median: ", median(varr)),
            ("min ", min(varr)),
            ("max ", max(varr)),
            ("stdev:", pow(ca.var(varr), 0.5) if len(varr) > 1 else "CANT COMPUTE len = 1"),
        ]

    # Compute each note's feature once, then filter on the cached value.
    values = []
    for note in interesting_notes:
        value = nl.compute_feature_named(feature_name, note)
        if value is not None and value >= 0:
            values.append(value)

    print(printstats(values))
def get_feature_for_note(nid, feature_name, coerce_fn=lambda x: float(x)):
    """Return the value of ``feature_name`` for the note with id ``nid``.

    The value is taken from the data returned by ``nl.read()`` when that data
    contains both the note and the feature name (among its ``note_fields``,
    ``feature_fields`` or ``label_fields``); it is then passed through
    ``coerce_fn``. Otherwise the feature is computed from the database record
    via ``nl.compute_feature_named`` and returned without coercion.

    Args:
        nid: Note id; compared as ``str(nid)`` against each note's ``"id"``.
        feature_name: Name of the feature to fetch or compute.
        coerce_fn: Conversion applied to values read from ``nl.read()``.
    """
    data = nl.read()
    wanted_id = str(nid)
    matches = [note for note in data["notes"] if note["id"] == wanted_id]

    # Known note: check that the feature is one of the stored fields.
    if len(matches) != 0:
        known_fields = data['note_fields'] + data['feature_fields'] + data['label_fields']
        if feature_name in known_fields:
            return coerce_fn(matches[0][feature_name])

    # Unknown note or unknown feature name: compute it from the database row.
    record = Note.objects.filter(id=nid).values()[0]
    return nl.compute_feature_named(feature_name, record)