def main():
    user_df = get_user_data("*")
    user_list = list(pandas.unique(user_df[user_df["role"] == "STUDENT"]["id"]))
    zpdp = ZPDPredictor()

    # Accumulators for one training batch: semi-static features (S), experience
    # vectors (X), success vectors (U), question vectors (Qv), pass labels and
    # attempt counts.
    S_list = []
    X_list = []
    U_list = []
    Qv_list = []
    pass_list = []
    atts_list = []

    i_users = 0
    n_users = 50
    # Training pass: first n_users students, flushing to the model whenever
    # more than 10,000 rows have accumulated (or on the final user).
    for u in user_list[0:n_users]:
        i_users += 1
        print(i_users)
        attempts = get_attempts_from_db(u)
        ts_list = list(attempts["timestamp"])
        tX = gen_experience(u, ts_list)
        X_list += tX
        U_list += gen_success(u, ts_list)
        Qv_list += encode_q_vectors(attempts)
        S_list += [numpy.zeros(1) for t in tX]  # semi-static features currently zeroed out
        pass_list += list(attempts["correct"] == True)
        atts_list += [numpy.ones(1) for t in tX]
        if (len(X_list) > 10000) or (i_users == n_users):
            S_list = numpy.array(S_list)
            X_list = numpy.array(X_list)
            U_list = numpy.array(U_list)
            Qv_list = numpy.array(Qv_list)
            pass_list = numpy.array(pass_list)
            atts_list = numpy.array(atts_list)
            zpdp.train((S_list, X_list, U_list), Qv_list, pass_list, atts_list)
            S_list = []
            X_list = []
            U_list = []
            Qv_list = []
            pass_list = []
            atts_list = []

    # Held-out pass: the next n_users students are used to evaluate the pass model.
    for u in user_list[n_users:2 * n_users]:
        attempts = get_attempts_from_db(u)
        ts_list = list(attempts["timestamp"])
        delta_x = gen_experience(u, ts_list)
        X_list += delta_x
        U_list += gen_success(u, ts_list)
        Qv_list += encode_q_vectors(attempts)
        S_list += [numpy.zeros(1) for t in delta_x]
        pass_list += list(attempts["correct"] == True)

    S_list = numpy.array(S_list)
    X_list = numpy.array(X_list)
    U_list = numpy.array(U_list)
    Qv_list = numpy.array(Qv_list)
    pass_list = numpy.array(pass_list)
    metrics = zpdp.pass_model.evaluate([S_list, X_list, U_list, Qv_list], pass_list)
    print(metrics)
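# The two loops in main() repeat the same accumulate-then-stack pattern. A
# small helper could factor that out; this is a sketch only, and stack_batch()
# is a hypothetical name, not an existing function in this codebase:
def stack_batch(*columns):
    """Stack parallel per-attempt lists into numpy arrays in one call."""
    return tuple(numpy.array(col) for col in columns)

# e.g. S_arr, X_arr, U_arr, Qv_arr = stack_batch(S_list, X_list, U_list, Qv_list)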
def build_dob_cache(dob_cache, assts):
    # Walk every assignment and cache each student's date of birth, so that
    # ages can later be computed for any (student, timestamp) pair without
    # re-querying the database.
    for ix, ass in enumerate(assts.iterrows()):
        id, ts, gb_id, gr_id = ass_extract(ass)
        students = list(get_student_list(gr_id)["user_id"])
        group_df = get_user_data(students)
        for psi in students:
            if psi not in dob_cache:
                age_df = get_age_df(ts, group_df)
                age_df["dob"] = pandas.to_datetime(age_df["dob"])
                # Fill the cache for the whole group while we have its age_df.
                for psi_inner in students:
                    dob = age_df.loc[psi_inner, "dob"]
                    assert isinstance(dob, Timestamp)
                    dob_cache[psi_inner] = dob
    return dob_cache
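# Minimal usage sketch for build_dob_cache(), assuming `assts` is the same
# assignments DataFrame passed to the loader below:
#
#   dob_cache = build_dob_cache({}, assts)
#   print(len(dob_cache), "dates of birth cached")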
def __init__(self, assts, batch_size=512, FRESSSH=False, return_qhist=False):
    self.assts: pandas.DataFrame = assts
    self.assts.loc[:, 'creation_date'] = pandas.to_datetime(assts['creation_date'])
    self.gb_qmap = make_gb_question_map()
    # batch_size == "assignment" means "one batch per assignment", signalled internally by 0.
    self.batch_size = batch_size if batch_size != "assignment" else 0
    self.return_qhist = return_qhist

    if not FRESSSH:
        print("APPEND mode")  # recycle old pap
        try:
            with open(prof_fname, 'rb') as f:
                self.profiles = pickle.load(f)
            print("got this many profiles:", len(self.profiles))
        except (FileNotFoundError, EOFError):
            self.profiles = {}
    else:
        print("Baking FRESH, like cinnamon!")
        self.profiles = {}

    self.ts_cache = {}
    self.assid_list = []
    self.ts_master_list = []
    self.gb_id_list = []
    self.gr_id_list = []
    self.students_list = []

    print("building dob_cache")
    self.dob_cache = build_dob_cache({}, assts)
    print(len(self.dob_cache))
    print("done")

    # First pass: index every assignment and record, per student, the
    # timestamps of all assignments they have been set.
    for ix, ass in enumerate(self.assts.iterrows()):
        id, ts, gb_id, gr_id = ass_extract(ass)
        self.assid_list.append(id)
        self.ts_master_list.append(ts)
        self.gb_id_list.append(gb_id)
        self.gr_id_list.append(gr_id)
        students = list(get_student_list(gr_id)["user_id"])
        self.students_list.append(students)
        for psi in students:
            self.ts_cache.setdefault(psi, []).append(ts)

    # Second pass: build (or fetch from cache) a profile for every
    # (student, assignment-timestamp) pair.
    c = -1
    for i, ts, gb_id, gr_id in zip(self.assid_list, self.ts_master_list,
                                   self.gb_id_list, self.gr_id_list):
        c += 1
        has_changed = False
        students = list(get_student_list(gr_id)["user_id"])
        for psi in students:
            # set up the training arrays here
            fn = "prof_{}_{}".format(psi, ts)
            if fn not in self.profiles:
                print("{}- - - - profile for {} .. not found .. will create all ={}".format(
                    c, psi, SAVE_TO_PROF_CACHE))
                has_changed = True
                group_df = get_user_data(students)
                ts_list = self.ts_cache[psi]
                print("ts_list", ts_list)
                print("s..")
                s_psi_list = gen_semi_static(psi, self.dob_cache, ts_list)
                print("done")
                print("x..")
                x_psi_list = gen_experience(psi, ts_list)
                print("done")
                print("u..")
                u_psi_list = gen_success(psi, ts_list)
                print("done")
                for ts_i, s_psi, x_psi, u_psi in zip(sorted(ts_list), s_psi_list,
                                                     x_psi_list, u_psi_list):
                    key = "prof_{}_{}".format(psi, ts_i)
                    self.profiles[key] = zlib.compress(pickle.dumps((s_psi, x_psi, u_psi)))
                    print("created profile for", key, "xp=", numpy.sum(x_psi),
                          "sxp=", numpy.sum(u_psi), "S=", s_psi)
            else:
                print(".. {} f/cache".format(fn))
        if has_changed:
            with open(prof_fname, 'wb') as f:
                pickle.dump(self.profiles, f)
            print("*** *** *** SAVED")
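# Profiles are stored zlib-compressed; reading one back inverts the
# pickle/zlib pipeline used in __init__ above. A hypothetical helper
# (load_profile is not an existing name in this codebase):
def load_profile(profiles, psi, ts):
    """Return the (s_psi, x_psi, u_psi) triple cached for one student/timestamp."""
    return pickle.loads(zlib.decompress(profiles["prof_{}_{}".format(psi, ts)]))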
def evaluate_phybook_loss(tt, sxua, model, sc):
    aid_list, s_list, x_list, u_list, a_list, y_list = augment_data(tt, sxua)

    # Append one assignment id per student so that aid_list lines up with the
    # per-student rows in the other lists.
    for row in tt.iterrows():
        aid = row[1]["id"]
        gr_id = row[1]["group_id"]
        gb_id = row[1]["gameboard_id"]
        student_ids = list(get_student_list(gr_id)["user_id"])
        print(student_ids)
        student_data = get_user_data(student_ids)
        hexes = list(gb_qmap[gb_id])
        print(hexes)
        for _ in student_ids:
            aid_list.append(aid)

    s_list = sc.transform(s_list)
    s_list = numpy.array(s_list)
    x_list = numpy.array(x_list)
    u_list = numpy.array(u_list)
    a_list = numpy.array(a_list)
    print(s_list.shape, x_list.shape, u_list.shape, a_list.shape)
    print("results")
    print(model.get_input_shape_at(0))
    predictions = model.predict([s_list, u_list])

    # Sweep the inclusion threshold and keep the one that maximises direct
    # hits (the top prediction matching the first true hex).
    j_max = 0
    thresh_max = 0
    dir_hits_max = 0
    for j_thresh in [0.01, 0.025, 0.05, 0.075, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]:
        j_sum = 0
        incl_sum = 0
        dir_hits = 0
        N = len(predictions)
        this_ai = None
        for ai, p, s, x, a, y in zip(aid_list, predictions, s_list, x_list, a_list, y_list):
            t = [pid_override[yix] for yix, yval in enumerate(y) if yval == 1]
            if ai != this_ai:
                print("\n...new asst", ai)
                this_ai = ai
            # Always include the single most probable hex, then every hex over j_thresh.
            phxs = []
            probs = []
            print("pshape", p.shape)
            maxpox = numpy.argmax(p)
            print(maxpox, len(pid_override))
            max_guess = pid_override[maxpox]
            phxs.append(max_guess)
            probs.append(p[maxpox])
            for ix, el in enumerate(p):
                if el > j_thresh and pid_override[ix] not in phxs:
                    phxs.append(pid_override[ix])
                    probs.append(p[ix])
            probs_shortlist = list(reversed(sorted(probs)))
            Z = list(reversed([hx for _, hx in sorted(zip(probs, phxs))]))
            print(t, Z)
            print(probs_shortlist)
            if max_guess not in t:
                robot = "BAD ROBOT"
            elif max_guess == t[0]:
                robot = "GREAT ROBOT"
                dir_hits += 1
            else:
                robot = "GOOD ROBOT"
            print("{} {}, XP={}".format(robot, sc.inverse_transform(s), numpy.sum(x)))
            t = set(t)
            phxs = set(phxs)
            if len(t.intersection(phxs)) > 0:
                incl_sum += 1
            j_sum += len(t.intersection(phxs)) / len(t.union(phxs))
        j_score = j_sum / N
        if dir_hits > dir_hits_max:
            j_max = j_score
            thresh_max = j_thresh
            dir_hits_max = dir_hits
        print("j_thresh =", j_thresh)
        print("Jaccard:", j_score)
        print("Incl:", incl_sum / N)
        print("D/H:", dir_hits / N)
        print("~ ~ ~ ~")
    print("max thresh/jacc:", thresh_max, j_max, dir_hits_max / N)
    print("num examples", N)
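# The per-row score accumulated above is the Jaccard index between the true
# and predicted hex sets. As a standalone helper (a sketch, not currently
# called by evaluate_phybook_loss):
def jaccard(true_hexes, predicted_hexes):
    """|intersection| / |union| of two collections of page ids; 0.0 if both empty."""
    t, p = set(true_hexes), set(predicted_hexes)
    union = t | p
    return len(t & p) / len(union) if union else 0.0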
def create_student_scorecards(tt, sxua, model, sc):
    names_df = get_q_names()
    names_df.index = names_df["question_id"]
    aids = []
    for row in tt.iterrows():
        a_list = []
        x_list = []
        u_list = []
        y_list = []
        s_list = []
        hex_list = []
        psi_list = []
        print(row)
        aid = row[1]["id"]
        ts = row[1]["creation_date"]
        gr_id = row[1]["group_id"]
        gb_id = row[1]["gameboard_id"]
        student_ids = list(get_student_list(gr_id)["user_id"])
        print(student_ids)
        student_data = get_user_data(student_ids)
        hexes = list(gb_qmap[gb_id])
        print(hexes)
        # n-hot binarise the y vector here
        y_true = numpy.zeros(len(pid_override), dtype=numpy.int8)
        for hx in hexes:
            hxix = pid_override.index(hx)
            y_true[hxix] = 1
        aids.append(aid)
        incl_psis = []
        for psi in student_ids:
            S, X, U, A = pickle.loads(zlib.decompress(sxua[psi][ts]))
            # Filter out students with too little history to score meaningfully.
            if S[0] < 10:
                print("s0 under 10")
                continue
            if S[1] == 0:
                print("no time on platform recorded")
                continue
            psi_list.append(psi)
            hex_list.append(hexes)
            y_list.append(y_true)
            s_list.append(S)
            x_list.append(X)
            u_list.append(U)
            a_list.append(A)
            incl_psis.append(psi)
            print("student {} done".format(psi))
        if len(s_list) == 0:
            continue
        s_arr = numpy.array(s_list)
        x_arr = numpy.array(x_list)
        u_arr = numpy.array(u_list)
        a_arr = numpy.array(a_list)
        predictions = model.predict([s_arr, u_arr])
        save_class_report_card(ts, aid, gr_id, s_list, x_list, u_list, a_list,
                               y_list, predictions, incl_psis, names_df)
    with open("a_ids.txt", "w+") as f:
        f.write("({})\n".format(len(aids)))
        f.writelines([str(a) + "\n" for a in sorted(aids)])
        f.write("\n")
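# The n-hot target construction used in create_student_scorecards(), factored
# into a sketch helper (n_hot is a hypothetical name; pid_override is the
# module-level list of page ids):
def n_hot(hexes, page_ids):
    """Indicator vector with a 1 at the index of each assigned hex."""
    y = numpy.zeros(len(page_ids), dtype=numpy.int8)
    for hx in hexes:
        y[page_ids.index(hx)] = 1
    return y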
print(group_ids[0:20])
for gr_id in group_ids:
    gr_ass = assignments[assignments["group_id"] == gr_id]
    for row in gr_ass.iterrows():
        aid = row[1]["id"]
        ts = row[1]["creation_date"]
        gc.collect()
        gb_id = row[1]["gameboard_id"]
        student_ids = list(get_student_list(gr_id)["user_id"])
        student_data = get_user_data(student_ids)
        now_hexes = list(gb_qmap[gb_id])
        for psi in student_ids:
            if psi not in SXUA:
                # New student: initialise empty state vectors.
                S = numpy.zeros(6)
                X = numpy.zeros(len(all_qids), dtype=numpy.int16)
                U = numpy.zeros(len(all_qids), dtype=numpy.int8)
                A = numpy.zeros(len(pid_override), dtype=numpy.int8)
                SXUA[psi] = {}
                print("+", psi, S, numpy.sum(X), numpy.sum(U), numpy.sum(A))
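# For reference: SXUA maps student id -> {timestamp: (S, X, U, A)}, where,
# judging from the initialisation above, S holds 6 semi-static features,
# X counts attempts per question id, U flags successes per question id, and
# A flags assigned hexes (indexed by pid_override). The precise meaning of
# each S slot is not defined in this fragment.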
def make_data(ass_n, pickle_at, APPEND=True):
    user_cache = {}
    ass_df = get_all_assignments()
    gb_qmap = make_gb_question_map()
    ass_ct = 0
    ass_df["creation_date"] = pandas.to_datetime(ass_df["creation_date"])
    profile_df = get_user_data("*")
    profile_df["date_of_birth"] = pandas.to_datetime(profile_df["date_of_birth"])

    if APPEND:
        print("APPEND mode")  # recycle old pap
        with open(asst_fname, 'rb') as f:
            asses = pickle.load(f)
        tracking = open("tracking.dat", "w+")
        print("loaded {} existing assignments".format(len(asses)))
    else:
        print("FRESH mode")  # bake it fresh
        with open(asst_fname, 'wb') as f:
            f.truncate(0)
        tracking = open("tracking.dat", "w")
        asses = OrderedDict()

    start_at = len(asses)
    number_to_do = ass_n - start_at
    if number_to_do <= 0:
        print("We already have {}>{} samples".format(start_at, ass_n))
        exit(1)

    # If ass_n is -1, this overrides the trimming of the assignments.
    ass_df = ass_df.iloc[start_at:, :] if (ass_n > 0) else ass_df
    for ass in ass_df.iterrows():
        id = ass[1]["id"]
        if id in asses and not FORCE_OVERWRITE:
            continue  # this assignment has already been processed
        print("assct {} of {} ({} users cached)".format(ass_ct, ass_n, len(user_cache)))
        ts = ass[1]['creation_date']
        gb_id = ass[1]["gameboard_id"]
        if gb_id not in gb_qmap:
            print("gb id unknown")
            continue
        this_concepts = set()
        raw_qns = gb_qmap[gb_id]
        this_levels = []
        this_qns = raw_qns
        if type(raw_qns) is str:
            this_qns = eval(raw_qns)  # TODO make sure this works hitting the database as well
        for q in this_qns:
            if "|" in q:
                q = q.split("|")[0]
            this_levels.append(lev_page_lookup[q])
            cs = concept_extract(q)
            this_concepts.update(cs)
        gr_id = ass[1]["group_id"]
        students = get_student_list([gr_id])
        if students.empty:
            print(gr_id, "no students")
            continue
        else:
            print(gr_id, "students!")
        students = list(students["user_id"])
        profile_df = get_user_data(list(students))
        profiles = profile_students(students, profile_df, ts, concepts_all,
                                    hwdf, user_cache, attempts_df=None)
        print(len(profiles), len(students))
        assert len(profiles) == len(students)
        assert len(profiles) > 0
        print("compressing_profiles")
        c_profiles = zlib.compress(pickle.dumps(profiles))
        print("compressed")
        ass_entry = (ts, gb_id, gr_id, this_qns, this_concepts, this_levels,
                     students, c_profiles)
        tracking.write(str(ass_entry[0:7] + (len(profiles),)))
        tracking.write("\n")
        asses[id] = ass_entry
        ass_ct += 1
        print("...{} students".format(len(profiles)))
        print("ass_ct", ass_ct)
        print("pickle at", pickle_at)
        print("%", (ass_ct % pickle_at))
        if (ass_ct == number_to_do) or (ass_ct % pickle_at) == 0:
            with open(asst_fname, 'wb') as f:
                pickle.dump(asses, f)
            print("***SAVED (hallelujah)")
        if ass_ct == number_to_do:
            print("we have hit maximum ass limit")
            break
    print("We now have {} assignments on disc".format(len(asses)))
    tracking.close()
    return
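# Example invocation (a sketch; asst_fname and FORCE_OVERWRITE are
# module-level settings assumed to be configured before calling):
#
#   make_data(ass_n=20000, pickle_at=500, APPEND=True)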
model = load_model(base + "hwg_model.hd5")
(ylb, clb) = joblib.load(base + 'hwg_mlb.pkl')
up_to_ts = pandas.Timestamp.now()
fout = open("predictions.out", "w")
for t in teacher_ids:
    class_list = get_group_list(t)["id"]
    print("groups:", class_list)
    for c in class_list:
        print("get student list for =>", c)
        students = get_student_list(c)
        students = list(students["user_id"])
        print("students:", students)
        if not students:
            continue
        profile_df = get_user_data(students)
        # Build one experience vector per student, up to the current timestamp.
        X = []
        for u in students:
            x_psi = gen_experience(u, up_to_ts)
            X.append(x_psi)
        X = numpy.array(X)
        predictions = model.predict(X)
        ymax = ylb.inverse_transform(predictions)
        fout.write("TEACHER {} / CLASS {}\n".format(t, c))
        for s, p in zip(students, ymax):
            fout.write("{}\t{}\n".format(s, p))
fout.close()