示例#1
0
def main():
    """Train a ``ZPDPredictor`` on the first 50 students, evaluate on the next 50.

    Training data is accumulated user by user and handed to ``zpdp.train`` in
    batches: a batch is flushed once more than 10000 samples are buffered, and
    always after the last training user.  The evaluation metrics returned by
    ``zpdp.pass_model.evaluate`` are printed.
    """
    user_df = get_user_data("*")
    user_list = list(pandas.unique(user_df[(user_df["role"] == "STUDENT")]["id"]))
    zpdp = ZPDPredictor()

    n_users = 50
    train_users = user_list[0:n_users]
    eval_users = user_list[n_users:2 * n_users]

    S_list, X_list, U_list = [], [], []
    Qv_list, pass_list, atts_list = [], [], []

    for i_users, u in enumerate(train_users, start=1):
        print(i_users)
        attempts = get_attempts_from_db(u)
        ts_list = list(attempts["timestamp"])
        tX = gen_experience(u, ts_list)
        X_list += tX
        U_list += gen_success(u, ts_list)
        Qv_list += encode_q_vectors(attempts)
        # Semi-static features are not generated here; a zero placeholder is
        # used for every sample instead.
        S_list += [numpy.zeros(1) for _ in tX]
        # Element-wise comparison on the pandas column (not a plain bool test).
        pass_list += list(attempts["correct"] == True)
        atts_list += [numpy.ones(1) for _ in tX]

        # Flush when the buffer is large, and always on the last training
        # user.  Comparing against the real slice length (not n_users) makes
        # sure the final batch is trained even when fewer than n_users
        # students exist.
        if (len(X_list) > 10000) or (i_users == len(train_users)):
            zpdp.train(
                (numpy.array(S_list), numpy.array(X_list), numpy.array(U_list)),
                numpy.array(Qv_list),
                numpy.array(pass_list),
                numpy.array(atts_list),
            )
            S_list, X_list, U_list = [], [], []
            Qv_list, pass_list, atts_list = [], [], []

    # Evaluation set: built from scratch so no training samples leak in.
    S_list, X_list, U_list, Qv_list, pass_list = [], [], [], [], []
    for u in eval_users:
        attempts = get_attempts_from_db(u)
        ts_list = list(attempts["timestamp"])
        delta_x = gen_experience(u, ts_list)
        X_list += delta_x
        U_list += gen_success(u, ts_list)
        Qv_list += encode_q_vectors(attempts)
        S_list += [numpy.zeros(1) for _ in delta_x]
        pass_list += list(attempts["correct"] == True)

    metrics = zpdp.pass_model.evaluate(
        [numpy.array(S_list), numpy.array(X_list), numpy.array(U_list),
         numpy.array(Qv_list)],
        numpy.array(pass_list),
    )
    print(metrics)
示例#2
0
def build_dob_cache(dob_cache, assts):
    """Fill ``dob_cache`` with a date of birth for every student in ``assts``.

    For each assignment row the group's student ids are fetched.  If at least
    one of them is missing from the cache, the group's age table is built once
    and the date of birth of *every* student in the group is written to the
    cache (existing entries for that group are refreshed).

    Args:
        dob_cache: dict mapping student id to a pandas ``Timestamp``; mutated
            in place.
        assts: DataFrame of assignments; each ``iterrows()`` item is handed to
            ``ass_extract``.

    Returns:
        The same ``dob_cache`` object that was passed in.

    Raises:
        AssertionError: if a looked-up date of birth is not a ``Timestamp``.
    """
    for ass in assts.iterrows():
        _, ts, _, gr_id = ass_extract(ass)
        students = list(get_student_list(gr_id)["user_id"])
        if all(psi in dob_cache for psi in students):
            # Whole group already cached: skip the user-data fetch entirely.
            continue

        group_df = get_user_data(students)
        age_df = get_age_df(ts, group_df)
        age_df["dob"] = pandas.to_datetime(age_df["dob"])
        for psi in students:
            # Look up each student's OWN row; previously every student in the
            # group was given the date of birth of the first uncached one.
            dob = age_df.loc[psi, "dob"]
            assert isinstance(dob, Timestamp)
            dob_cache[psi] = dob
    return dob_cache
示例#3
0
    def __init__(self, assts, batch_size=512, FRESSSH=False, return_qhist=False):
        """Index the assignments and make sure every student profile is cached.

        Args:
            assts: DataFrame of assignments; its ``creation_date`` column is
                converted to datetimes in place.
            batch_size: batch size to store; the string ``"assignment"`` is
                stored as ``0``.
            FRESSSH: when true, ignore any pickled profile cache and start
                with an empty one; otherwise try to load ``prof_fname``.
            return_qhist: stored on the instance unchanged.

        Side effects:
            Prints progress, and rewrites the pickle at ``prof_fname`` after
            every assignment for which at least one profile was created.
        """
        self.assts: pandas.DataFrame = assts
        self.assts.loc[:, 'creation_date'] = pandas.to_datetime(assts['creation_date'])

        self.gb_qmap = make_gb_question_map()

        self.batch_size = batch_size if batch_size != "assignment" else 0
        self.return_qhist = return_qhist

        if not FRESSSH:
            print("APPEND mode")
            # Reuse previously pickled profiles.  Loading is best-effort: any
            # failure (missing or corrupt file) falls back to an empty cache,
            # but the reason is reported and the handle is always closed.
            try:
                with open(prof_fname, 'rb') as f:
                    self.profiles = pickle.load(f)
                print("got this many profiles:", len(self.profiles))
            except Exception as err:
                print("could not load cached profiles ({!r}); starting empty".format(err))
                self.profiles = {}
        else:
            print("Baking FRESH, like cinnamon!")
            self.profiles = {}

        self.ts_cache = {}
        self.assid_list = []
        self.ts_master_list = []
        self.gb_id_list = []
        self.gr_id_list = []
        self.students_list = []

        print("building dob_cache")
        self.dob_cache = build_dob_cache({}, assts)
        print(len(self.dob_cache))
        print("done")

        # First pass: record each assignment's ids and collect, per student,
        # every assignment timestamp that student is involved in.
        for ass in self.assts.iterrows():
            ass_id, ts, gb_id, gr_id = ass_extract(ass)
            self.assid_list.append(ass_id)
            self.ts_master_list.append(ts)
            self.gb_id_list.append(gb_id)
            self.gr_id_list.append(gr_id)
            students = list(get_student_list(gr_id)["user_id"])
            self.students_list.append(students)
            for psi in students:
                self.ts_cache.setdefault(psi, []).append(ts)

        # Second pass: create any profile that is missing from the cache.
        for c, (asst_ts, gr_id) in enumerate(zip(self.ts_master_list, self.gr_id_list)):
            has_changed = False
            students = list(get_student_list(gr_id)["user_id"])
            for psi in students:  # set up the training arrays here
                fn = "prof_{}_{}".format(psi, asst_ts)
                if fn not in self.profiles:
                    print("{}- - - -   profile for {} .. not found .. will create all ={}".format(c,psi, SAVE_TO_PROF_CACHE))
                    has_changed = True
                    # NOTE(review): result is never used below; kept in case
                    # the call has side effects -- confirm and remove.
                    group_df = get_user_data(students)
                    ts_list = self.ts_cache[psi]
                    print("ts_list", ts_list)
                    print("s..")
                    s_psi_list = gen_semi_static(psi, self.dob_cache, ts_list)
                    print("done")
                    print("x..")
                    x_psi_list = gen_experience(psi, ts_list)
                    print("done")
                    print("u..")
                    u_psi_list = gen_success(psi, ts_list)
                    print("done")
                    # Presumably the generators return one entry per timestamp
                    # in ascending order, hence the sorted() -- TODO confirm.
                    # A separate loop variable is used so the assignment
                    # timestamp is not overwritten for the remaining students,
                    # and each profile is stored under its own key instead of
                    # all of them overwriting ``fn``.
                    for prof_ts, s_psi, x_psi, u_psi in zip(sorted(ts_list), s_psi_list, x_psi_list, u_psi_list):
                        prof_key = "prof_{}_{}".format(psi, prof_ts)
                        self.profiles[prof_key] = zlib.compress(pickle.dumps((s_psi, x_psi, u_psi)))
                        print("created profile for ",prof_key, "xp=",numpy.sum(x_psi),"sxp=",numpy.sum(u_psi),"S=",s_psi)
                else:
                    print(".. {} f/cache".format(fn))
            if has_changed:
                with open(prof_fname, 'wb') as f:
                    pickle.dump(self.profiles, f)
                print("*** *** *** SAVED")
示例#4
0
def evaluate_phybook_loss(tt, sxua, model, sc):
    """Print threshold-sweep diagnostics for ``model``'s page predictions.

    The inputs produced by ``augment_data(tt, sxua)`` are fed to
    ``model.predict`` and, for each threshold in a fixed list, three
    figures are printed: mean Jaccard overlap, the fraction of examples
    where at least one true page was predicted ("Incl"), and the fraction
    where the top prediction equals the first true page ("D/H").

    Args:
        tt: DataFrame of assignments; rows are read for "id", "group_id"
            and "gameboard_id".
        sxua: passed straight to ``augment_data``.
        model: object exposing ``get_input_shape_at`` and ``predict``.
        sc: scaler exposing ``transform`` and ``inverse_transform``.

    Returns:
        None; all results are printed.
    """
    aid_list, s_list, x_list, u_list, a_list, y_list = augment_data(tt, sxua)

    # hex_list = []
    # all_page_ids = pid_override
    # ailist = []
    # NOTE(review): this loop appends to the aid_list already returned by
    # augment_data; whether that duplicates entries depends on augment_data,
    # which is not visible here -- confirm.
    for row in tt.iterrows():
        aid = row[1]["id"]
        # ts = row[1]["creation_date"]
        gr_id = row[1]["group_id"]
        gb_id = row[1]["gameboard_id"]
        student_ids = list(get_student_list(gr_id)["user_id"])
        print(student_ids)
        # NOTE(review): student_data is never read afterwards.
        student_data = get_user_data(student_ids)
        hexes = list(gb_qmap[gb_id])
        print(hexes)

        # One assignment id per student in the group.
        for _ in student_ids:
            aid_list.append(aid)
            # hex_list.append(hexes)

    s_list = sc.transform(s_list)
    s_list = numpy.array(s_list)

    x_list = numpy.array(x_list)
    u_list = numpy.array(u_list)
    a_list = numpy.array(a_list)

    print(s_list.shape, x_list.shape, u_list.shape, a_list.shape)

    print("results")
    print(model.get_input_shape_at(0))
    # Only the s and u arrays are passed to the model; x and a are used for
    # reporting below.
    predictions = model.predict([s_list, u_list])
    j_max = 0
    thresh_max = 0
    dir_hits_max = 0
    for j_thresh in [0.01, 0.025, .05, 0.075, .1, .2, 0.3, 0.4, 0.5, 0.6, 0.7]:
        # for j_thresh in [0.4]:
        j_sum = 0
        # dir_sum = 0
        incl_sum = 0
        dir_hits = 0
        # N is the number of predictions; the zip below stops at the shortest
        # of its inputs, so N may exceed the number of examples actually
        # scored if the lists differ in length.
        N = len(predictions)
        this_ai = None
        for ai, p, s, x, a, y in zip(aid_list, predictions, s_list, x_list,
                                     a_list, y_list):
            # True page ids: positions where the n-hot target vector is 1.
            t = [pid_override[yix] for yix, yval in enumerate(y) if yval == 1]
            if ai != this_ai:
                print("\n...new asst", ai)
                this_ai = ai
            phxs = []
            probs = []
            print("pshape", p.shape)
            # The arg-max page is always included, whatever the threshold.
            maxpox = numpy.argmax(p)
            print(maxpox, len(pid_override))
            max_guess = pid_override[maxpox]
            phxs.append(max_guess)

            probs.append(p[maxpox])
            # Add every other page whose score exceeds the threshold.
            for ix, el in enumerate(p):
                if el > j_thresh and pid_override[ix] not in phxs:
                    phxs.append(pid_override[ix])
                    probs.append(p[ix])
            probs_shortlist = list(reversed(sorted(probs)))
            # Predicted pages ordered by descending score.
            Z = list(reversed([x for _, x in sorted(zip(probs, phxs))]))
            # if Z:
            #     for t_el in t:
            #         if t_el in Z:#'direct hit'
            #             dir_sum += 1.0/len(t)
            print(t, Z)
            print(probs_shortlist)
            # print([all_page_ids[hx] for hx,el in enumerate(a) if el==1])
            if max_guess not in t:
                robot = "BAD ROBOT"
            else:
                if max_guess == t[0]:
                    robot = "GREAT ROBOT"
                    dir_hits += 1
                else:
                    robot = "GOOD ROBOT"
            print("{} {}, XP={}".format(robot, sc.inverse_transform(s),
                                        numpy.sum(x)))
            t = set(t)
            phxs = set(phxs)
            if len(t.intersection(phxs)) > 0:
                incl_sum += 1
            # Jaccard index of true vs predicted page sets; the union is never
            # empty because phxs always holds the arg-max page.
            j_sum += len(t.intersection(phxs)) / len(t.union(phxs))
        j_score = j_sum / N
        # dir_score = dir_sum/N
        # NOTE(review): dir_hits depends only on the arg-max page and the true
        # pages, neither of which varies with j_thresh, so this condition can
        # only fire on the first threshold -- confirm whether j_score was the
        # intended selection criterion.
        if dir_hits > dir_hits_max:
            j_max = j_score
            thresh_max = j_thresh
            dir_hits_max = dir_hits
            # dir_for_j_max = dir_score
        print("j_thresh =", j_thresh)
        print("Jaccard:", j_score)
        print("Incl:", incl_sum / N)
        print("D/H:", dir_hits / N)
        print("~ ~ ~ ~")
    print("max thresh/jacc:", thresh_max, j_max, dir_hits_max / N)
    print("num examples", N)
示例#5
0
def create_student_scorecards(tt, sxua, model, sc):
    """Produce one class report card per assignment and log the assignment ids.

    For every assignment row in ``tt`` the students of its group are looked
    up, students failing the two ``S`` checks are skipped, the model is run on
    the remaining students and ``save_class_report_card`` is called.  Finally
    the ids of all visited assignments are written to ``a_ids.txt`` (a count
    in parentheses, then one sorted id per line).

    Args:
        tt: DataFrame of assignments; rows are read for "id",
            "creation_date", "group_id" and "gameboard_id".
        sxua: mapping ``student id -> timestamp -> zlib-compressed pickle`` of
            an ``(S, X, U, A)`` tuple.
        model: object exposing ``predict``.
        sc: unused; kept so existing callers keep working.

    Returns:
        None.
    """
    names_df = get_q_names()
    names_df.index = names_df["question_id"]
    # Ids of every assignment visited.  Previously this list was never filled
    # and the per-row list was reset each iteration, so the output file
    # always reported a count of 1 and no ids.
    aids = []
    for row in tt.iterrows():
        print(row)
        asst = row[1]
        aid = asst["id"]
        ts = asst["creation_date"]
        gr_id = asst["group_id"]
        gb_id = asst["gameboard_id"]
        student_ids = list(get_student_list(gr_id)["user_id"])
        print(student_ids)
        # NOTE(review): result is never used; kept in case the call has side
        # effects -- confirm and remove.
        student_data = get_user_data(student_ids)
        hexes = list(gb_qmap[gb_id])
        print(hexes)

        # n-hot binarise the y vector here
        y_true = numpy.zeros(len(pid_override), dtype=numpy.int8)
        for hx in hexes:
            y_true[pid_override.index(hx)] = 1

        aids.append(aid)

        a_list = []
        x_list = []
        u_list = []
        y_list = []
        s_list = []
        incl_psis = []
        for psi in student_ids:
            # The blobs come from the project's own cache; pickle must not be
            # used on untrusted data.
            S, X, U, A = pickle.loads(zlib.decompress(sxua[psi][ts]))
            if S[0] < 10:
                print("s0 under 10")
                continue
            if S[1] == 0:
                print("no time on plaform recorded")
                continue
            y_list.append(y_true)
            s_list.append(S)
            x_list.append(X)
            u_list.append(U)
            a_list.append(A)
            incl_psis.append(psi)
            print("student {} done".format(psi))

        if len(s_list) == 0:
            # Nobody passed the filters: no report card for this assignment.
            continue

        predictions = model.predict([numpy.array(s_list), numpy.array(u_list)])

        save_class_report_card(ts, aid, gr_id, s_list, x_list, u_list, a_list,
                               y_list, predictions, incl_psis, names_df)
    with open("a_ids.txt", "w+") as f:
        f.write("({})\n".format(len(aids)))
        f.writelines(str(a) + "\n" for a in sorted(aids))
        f.write("\n")
示例#6
0
        print(group_ids[0:20])
        # exit()

        for gr_id in group_ids:
            gr_ass = assignments[assignments["group_id"] == gr_id]
            for row in gr_ass.iterrows():
                # for row in assignments.iterrows():
                aid = row[1]["id"]
                # print(row)
                ts = row[1]["creation_date"]
                # gr_id = row[1]["group_id"]
                gc.collect()
                gb_id = row[1]["gameboard_id"]
                student_ids = list(get_student_list(gr_id)["user_id"])
                # print(student_ids)
                student_data = get_user_data(student_ids)
                now_hexes = list(gb_qmap[gb_id])
                # print(now_hexes)
                # if 118651 not in student_ids:
                #     continue
                for psi in student_ids:
                    # if psi != 118651:
                    #     continue
                    # print(psi)
                    if psi not in SXUA:
                        S = numpy.zeros(6)
                        X = numpy.zeros(len(all_qids), dtype=numpy.int16)
                        U = numpy.zeros(len(all_qids), dtype=numpy.int8)
                        A = numpy.zeros(len(pid_override), dtype=numpy.int8)
                        SXUA[psi] = {}
                        print("+", psi, S, numpy.sum(X), numpy.sum(U),
示例#7
0
def make_data(ass_n, pickle_at, APPEND=True):
    """Build per-assignment student profiles and pickle them to ``asst_fname``.

    Each processed assignment is stored in an ordered mapping under its id as
    ``(ts, gb_id, gr_id, questions, concepts, levels, students,
    compressed_profiles)``.  The mapping is re-pickled every ``pickle_at``
    newly processed assignments and once more when the target is reached.  A
    one-line summary per assignment is written to ``tracking.dat``.

    Args:
        ass_n: total number of assignments wanted on disc.
        pickle_at: save after every this-many newly processed assignments.
        APPEND: when true, load the existing pickle and continue after it;
            otherwise truncate the pickle and start from scratch.

    Returns:
        None.

    Raises:
        SystemExit: (exit status 1) when the existing data already holds
            ``ass_n`` or more assignments.
        AssertionError: if ``profile_students`` returns no profiles or a
            different number of profiles than students.
    """
    user_cache = {}

    ass_df = get_all_assignments()
    gb_qmap = make_gb_question_map()
    ass_ct = 0

    ass_df["creation_date"] = pandas.to_datetime(ass_df["creation_date"])

    # NOTE(review): this full-table load is overwritten inside the loop before
    # it is ever read; kept in case get_user_data has side effects -- confirm
    # and remove.
    profile_df = get_user_data("*")
    profile_df["date_of_birth"] = pandas.to_datetime(
        profile_df["date_of_birth"])

    if APPEND:
        print("APPEND mode")
        # recycle old pap
        with open(asst_fname, 'rb') as f:
            asses = pickle.load(f)
        # NOTE(review): "w+" truncates tracking.dat even in APPEND mode --
        # confirm whether "a" was intended.
        tracking = open("tracking.dat", "w+")
        print("loaded {} existing assignments".format(len(asses)))
    else:
        # Opening for writing empties any previous pickle.
        with open(asst_fname, 'wb'):
            pass
        tracking = open("tracking.dat", "w")
        print("FRESH mode")
        # bake it fresh
        asses = OrderedDict()

    # The tracking file used to be closed only by an unreachable statement
    # after ``return``; try/finally closes it on every exit path.
    try:
        start_at = len(asses)
        number_to_do = ass_n - start_at
        if number_to_do <= 0:
            print("We already have {}>{} samples".format(start_at, ass_n))
            exit(1)

        # if ass_n is -1 then this overrides the trimming of the assts
        # NOTE(review): the check above exits for any ass_n <= 0, so the
        # untrimmed branch is currently unreachable -- confirm intent.
        ass_df = ass_df.iloc[start_at:, :] if (ass_n > 0) else ass_df

        for ass in ass_df.iterrows():
            ass_row = ass[1]
            ass_id = ass_row["id"]
            if ass_id in asses and not FORCE_OVERWRITE:
                # already processed, skip
                continue

            print("assct {} of {} ({} users cached)".format(
                ass_ct, ass_n, len(user_cache)))
            ts = ass_row['creation_date']
            gb_id = ass_row["gameboard_id"]
            if gb_id not in gb_qmap:
                print("gb id unknown")
                continue

            this_concepts = set()
            raw_qns = gb_qmap[gb_id]
            this_levels = []
            this_qns = raw_qns
            if isinstance(raw_qns, str):
                # SECURITY NOTE(review): eval on stored data executes
                # arbitrary code; ast.literal_eval is safer if the value is
                # always a literal.
                # TODO make sure this works hitting the database as well
                this_qns = eval(raw_qns)
            for q in this_qns:
                if "|" in q:
                    # Keep only the part before the first "|".
                    q = q.split("|")[0]
                this_levels.append(lev_page_lookup[q])
                this_concepts.update(concept_extract(q))

            gr_id = ass_row["group_id"]
            students = get_student_list([gr_id])

            if students.empty:
                print(gr_id, "no students")
                continue
            print(gr_id, "students!")

            students = list(students["user_id"])
            profile_df = get_user_data(list(students))

            profiles = profile_students(students,
                                        profile_df,
                                        ts,
                                        concepts_all,
                                        hwdf,
                                        user_cache,
                                        attempts_df=None)
            print(len(profiles), len(students))
            assert len(profiles) == len(students)
            assert len(profiles) > 0
            print("compressing_profiles")
            c_profiles = zlib.compress(pickle.dumps(profiles))
            print("compressed")

            ass_entry = (ts, gb_id, gr_id, this_qns, this_concepts,
                         this_levels, students, c_profiles)
            # Log everything except the compressed blob, plus a profile count.
            tracking.write(str(ass_entry[0:7] + (len(profiles), )))
            tracking.write("\n")
            asses[ass_id] = ass_entry
            ass_ct += 1

            print("...{} students".format(len(profiles)))
            print("ass_ct", ass_ct)
            print("pickle at", pickle_at)
            print("%", (ass_ct % pickle_at))
            if (ass_ct == number_to_do) or (ass_ct % pickle_at) == 0:
                # ``with`` closes each save handle; previously every periodic
                # save left an open file behind.
                with open(asst_fname, 'wb') as f:
                    pickle.dump(asses, f)
                print("***SAVED (hallelujah)")

            if ass_ct == number_to_do:
                print("we have hit maximum ass limit")
                break
    finally:
        tracking.close()

    print("We now have {} assignments on disc".format(len(asses)))
    return
示例#8
0
    model = load_model(base + "hwg_model.hd5")
    (ylb, clb) = joblib.load(base + 'hwg_mlb.pkl')

    up_to_ts = pandas.datetime.now()
    fout = open("predictions.out", "w")
    for t in teacher_ids:
        class_list = get_group_list(t)["id"]
        print("groups:", class_list)
        for c in class_list:
            print("get student lsit for =>", c)
            students = get_student_list(c)
            students = list(students["user_id"])
            print("students:", students)
            if not students:
                continue
            # students = list(students)
            profile_df = get_user_data(students)
            # print("profiles:",profile_df)

            X = []
            for u in students:
                x_psi = gen_experience(u, up_to_ts)
                X.append(x_psi)
            X = numpy.array(X)
            predictions = model.predict(X)
            ymax = ylb.inverse_transform(predictions)

            fout.write("TEACHER {} / CLASS {}\n".format(t, c))
            for s, p in zip(students, ymax):
                fout.write("{}\t{}\n".format(s, p))
    fout.close()