link_function = lambda x: np.log(1 + x) elif options.pfa: suffix = 'swf' link_function = identity if options.tw or options.pfa: # Build time windows features df = full if 'skill_id' in full.columns: df = df.dropna(subset=['skill_id']) df['skill_ids'] = df['skill_id'].astype(str) else: df['skill_ids'] = [None] * len(df) dt = time.time() # Prepare counters for time windows q = defaultdict(lambda: OurQueue(only_forever=options.pfa)) # Using zip is the fastest way to iterate DataFrames # Source: https://stackoverflow.com/a/34311080 for i_sample, user, item_id, t, correct, skill_ids in zip( df['i'], df['user'], df['item_id'], df['timestamp'], df['correct'], df['skill_ids']): for skill_id in q_mat[item_id]: # Fallback skill_id = int(skill_id) add(i_sample, codes[skill_id], 1) for pos, value in enumerate(q[user, skill_id].get_counters(t)): if value > 0: add(i_sample, extra_codes['attempts', skill_id, pos], link_function(1 + value)) for pos, value in enumerate(q[user, skill_id, 'correct'].get_counters(t)): if value > 0:
def df_to_sparse(df, Q_mat, active_features, tw=None, verbose=True):
    """Build sparse features dataset from dense dataset and q-matrix.

    Arguments:
    df -- dense dataset, output from one function from prepare_data.py
        (pandas DataFrame); must contain the columns "user_id", "item_id",
        "timestamp", "correct" and "inter_id"
    Q_mat -- q-matrix, output from one function from prepare_data.py
        (sparse array of shape (n_items, n_skills))
    active_features -- features used to build the dataset (list of strings)
    tw -- time-window mode (None, "tw_kc" or "tw_items"); useful when script
        is *not* called from command line.
    verbose -- if True, print information on the encoding process (bool)

    Output:
    sparse_df -- sparse dataset. The 5 first columns of sparse_df are just
        the same columns as in df.

    Notes:
    * tw_kc and tw_items respectively encode time windows features instead of
      regular counter features at the skill and at the item level for wins and
      attempts, as described in our paper. As a consequence, these arguments
      can only be used along with the wins and/or attempts arguments. With
      tw_kc, one column per time window x skill is encoded, whereas with
      tw_items, one column per time window is encoded (it is assumed that
      items share the same time window biases).
    """
    # Transform q-matrix into dictionary: item index -> set of its skill ids.
    dict_q_mat = {i:set() for i in range(Q_mat.shape[0])}
    for elt in np.argwhere(Q_mat == 1):
        dict_q_mat[elt[0]].add(elt[1])

    # X maps each feature family to a growing block; every block starts with
    # 0 rows and the right column count so per-student rows can be vstacked.
    X={}
    if 'skills' in active_features:
        X["skills"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
    if 'attempts' in active_features:
        if tw == "tw_kc":
            # One column per (skill, time window) pair.
            X["attempts"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1]*NB_OF_TIME_WINDOWS)))
        elif tw == "tw_items":
            # One column per time window, shared across all items.
            X["attempts"] = sparse.csr_matrix(np.empty((0, NB_OF_TIME_WINDOWS)))
        else:
            # Plain cumulative counters: one column per skill.
            X["attempts"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
    if 'wins' in active_features:
        if tw == "tw_kc":
            X["wins"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1]*NB_OF_TIME_WINDOWS)))
        elif tw == "tw_items":
            X["wins"] = sparse.csr_matrix(np.empty((0, NB_OF_TIME_WINDOWS)))
        else:
            X["wins"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
    if 'fails' in active_features:
        # Fails never use time windows: always one column per skill.
        X["fails"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
    X['df'] = np.empty((0,5)) # Keep track of the original dataset
    q = defaultdict(lambda: OurQueue()) # Prepare counters for time windows
    for stud_id in df["user_id"].unique():
        # Column layout of df_stud once converted to np.array:
        # 0=user_id, 1=item_id, 2=timestamp, 3=correct, 4=inter_id.
        df_stud = df[df["user_id"]==stud_id][["user_id", "item_id", "timestamp", "correct", "inter_id"]].copy()
        df_stud.sort_values(by="timestamp", inplace=True) # Sort values chronologically
        df_stud = np.array(df_stud)
        X['df'] = np.vstack((X['df'], df_stud))
        if 'skills' in active_features:
            # One-hot skill rows looked up directly in the q-matrix.
            skills_temp = Q_mat[df_stud[:,1].astype(int)].copy()
            X['skills'] = sparse.vstack([X["skills"],sparse.csr_matrix(skills_temp)])
        if "attempts" in active_features:
            skills_temp = Q_mat[df_stud[:,1].astype(int)].copy()
            if tw == "tw_kc":
                attempts = np.zeros((df_stud.shape[0], NB_OF_TIME_WINDOWS*Q_mat.shape[1]))
                for l, (item_id, t) in enumerate(zip(df_stud[:,1], df_stud[:,2])):
                    for skill_id in dict_q_mat[item_id]:
                        # Read the window counters *before* pushing the current
                        # timestamp, so the current interaction is not counted
                        # among its own past attempts.
                        attempts[l, skill_id*NB_OF_TIME_WINDOWS:(skill_id+1)*NB_OF_TIME_WINDOWS] = np.log(1 + \
                            np.array(q[stud_id, skill_id].get_counters(t)))
                        q[stud_id, skill_id].push(t)
                # NOTE(review): the commented-out code below is an earlier
                # dense implementation of the same counters, kept for reference.
                #attempts = np.empty((df_stud.shape[0],0))
                #for l in LIST_OF_BOUNDARIES:
                #    attempts_temp = np.zeros((df_stud.shape[0],Q_mat.shape[1])) # a_sw array
                #    for i in range(1,attempts_temp.shape[0]): # 1st line is always full of zeros
                #        list_of_indices = np.where(df_stud[i,2] - df_stud[:i,2] < l)
                #        skills_temp = Q_mat[df_stud[:i,1].astype(int)][list_of_indices]
                #        attempts_temp[i] = np.sum(skills_temp,0)
                #    skills = Q_mat[df_stud[:,1].astype(int)]
                #    attempts_temp = np.log(1+np.multiply(attempts_temp,skills)) # only keep KCs involved
                #    attempts = np.hstack((attempts,attempts_temp))
            elif tw == "tw_items":
                attempts = np.zeros((df_stud.shape[0], NB_OF_TIME_WINDOWS))
                for l, (item_id, t) in enumerate(zip(df_stud[:,1], df_stud[:,2])):
                    attempts[l] = np.log(1 + np.array(q[stud_id, item_id].get_counters(t)))
                    q[stud_id, item_id].push(t)
                #attempts = np.empty((df_stud.shape[0],0))
                #for l in LIST_OF_BOUNDARIES:
                #    attempts_temp = np.zeros(df_stud.shape[0]) # a_sw array
                #    for i in range(1,attempts_temp.shape[0]): # 1st line is always full of zeros
                #        list_of_indices = np.where((df_stud[i,2] - df_stud[:i,2] < l) & (df_stud[i,1] == df_stud[:i,1]))
                #        attempts_temp[i] = len(list_of_indices[0])
                #    attempts_temp = np.log(1+attempts_temp)
                #    attempts = np.hstack((attempts,attempts_temp.reshape(-1,1)))
            else:
                # Cumulative attempt counts per skill: prepend a zero row and
                # drop the last one so the current interaction is excluded,
                # then mask with skills_temp to keep only the item's own KCs.
                attempts = np.multiply(np.cumsum(np.vstack((np.zeros(skills_temp.shape[1]),skills_temp)),0)[:-1],skills_temp)
            X['attempts'] = sparse.vstack([X['attempts'],sparse.csr_matrix(attempts)])
        if "wins" in active_features:
            skills_temp = Q_mat[df_stud[:,1].astype(int)].copy()
            if tw == "tw_kc":
                wins = np.zeros((df_stud.shape[0], NB_OF_TIME_WINDOWS*Q_mat.shape[1]))
                for l, (item_id, t, correct) in enumerate(zip(df_stud[:,1], df_stud[:,2], df_stud[:,3])):
                    for skill_id in dict_q_mat[item_id]:
                        wins[l, skill_id*NB_OF_TIME_WINDOWS:(skill_id+1)*NB_OF_TIME_WINDOWS] = np.log(1 + \
                            np.array(q[stud_id, skill_id, "correct"].get_counters(t)))
                        if correct:
                            # Only correct answers feed the "wins" queues.
                            q[stud_id, skill_id, "correct"].push(t)
                #wins = np.empty((df_stud.shape[0],0))
                #for l in LIST_OF_BOUNDARIES:
                #    wins_temp = np.zeros((df_stud.shape[0],Q_mat.shape[1])) # c_sw array
                #    for i in range(1,wins_temp.shape[0]): # 1st line is always full of zeros
                #        list_of_indices = np.where(df_stud[i,2] - df_stud[:i,2] < l)
                #        skills_temp = Q_mat[df_stud[:i,1].astype(int)][list_of_indices]
                #        wins_temp[i] = np.sum(np.multiply(skills_temp,df_stud[:i,3][list_of_indices].reshape(-1,1)),0)
                #    skills = Q_mat[df_stud[:,1].astype(int)]
                #    wins_temp = np.log(1+np.multiply(wins_temp,skills)) # only keep KCs involved
                #    wins = np.hstack((wins,wins_temp))
            elif tw == "tw_items":
                wins = np.zeros((df_stud.shape[0], NB_OF_TIME_WINDOWS))
                for l, (item_id, t, correct) in enumerate(zip(df_stud[:,1], df_stud[:,2], df_stud[:,3])):
                    wins[l] = np.log(1 + np.array(q[stud_id, item_id, "correct"].get_counters(t)))
                    if correct:
                        q[stud_id, item_id, "correct"].push(t)
                #wins = np.empty((df_stud.shape[0],0))
                #for l in LIST_OF_BOUNDARIES:
                #    wins_temp = np.zeros(df_stud.shape[0]) # c_sw array
                #    for i in range(1,wins_temp.shape[0]): # 1st line is always full of zeros
                #        list_of_indices = np.where((df_stud[i,2] - df_stud[:i,2] < l) & (df_stud[i,1] == df_stud[:i,1]))
                #        wins_temp[i] = np.log(1+np.sum(df_stud[:i,3][list_of_indices]))
                #    wins = np.hstack((wins,wins_temp.reshape(-1,1)))
            else:
                # Cumulative wins per skill: same shifted-cumsum scheme as
                # attempts, with each row weighted by the correctness column.
                wins = np.multiply(np.cumsum(np.multiply(np.vstack((np.zeros(skills_temp.shape[1]),skills_temp)),
                    np.hstack((np.array([0]),df_stud[:,3])).reshape(-1,1)),0)[:-1],skills_temp)
            X['wins'] = sparse.vstack([X['wins'],sparse.csr_matrix(wins)])
        if "fails" in active_features:
            skills_temp = Q_mat[df_stud[:,1].astype(int)].copy()
            # Cumulative fails per skill: same scheme as wins with 1-correct.
            fails = np.multiply(np.cumsum(np.multiply(np.vstack((np.zeros(skills_temp.shape[1]),skills_temp)),
                np.hstack((np.array([0]),1-df_stud[:,3])).reshape(-1,1)),0)[:-1],skills_temp)
            X["fails"] = sparse.vstack([X["fails"],sparse.csr_matrix(fails)])
    if verbose:
        print(X["df"].shape)
    onehot = OneHotEncoder()
    if 'users' in active_features:
        # One-hot encode user ids (column 0 of the stacked raw df).
        X['users'] = onehot.fit_transform(X["df"][:,0].reshape(-1,1))
        if verbose:
            print("Users encoded.")
    if 'items' in active_features:
        # One-hot encode item ids (column 1 of the stacked raw df).
        X['items'] = onehot.fit_transform(X["df"][:,1].reshape(-1,1))
        if verbose:
            print("Items encoded.")
    # Final layout: the 5 original columns first, then the active feature
    # blocks in the order given by active_features.
    sparse_df = sparse.hstack([sparse.csr_matrix(X['df']),sparse.hstack([X[agent] for agent in active_features])]).tocsr()
    return sparse_df
cols.append(c) data.append(d) suffix = 'ui' if options.tw: # Build time windows features suffix = 'das3h' df = full if 'skill_id' in full.columns: df = df.dropna(subset=['skill_id']) df['skill_ids'] = df['skill_id'].astype(str) else: df['skill_ids'] = [None] * len(df) dt = time.time() q = defaultdict(lambda: OurQueue()) # Prepare counters for time windows # Using zip is the fastest way to iterate DataFrames # Source: https://stackoverflow.com/a/34311080 for i_sample, user, item_id, t, correct, skill_ids in zip( df['i'], df['user'], df['item_id'], df['timestamp'], df['correct'], df['skill_ids']): for skill_id in skill_ids.split('~~') or q_mat[item_id]: # Fallback skill_id = int(skill_id) add(i_sample, codes[skill_id], 1) for pos, value in enumerate(q[user, skill_id].get_counters(t)): if value > 0: add(i_sample, extra_codes['attempts', skill_id, pos], log(1 + value)) for pos, value in enumerate(q[user, skill_id, 'correct'].get_counters(t)): if value > 0:
def test_simple(self):
    """Counters after four pushes spread across several time windows."""
    queue = OurQueue()
    day = 3600 * 24
    for timestamp in (0, 0.8 * day, 5 * day, 40 * day):
        queue.push(timestamp)
    self.assertEqual(queue.get_counters(40 * day), [4, 1, 1, 1, 1])
def test_complex(self):
    """Counters with events clustered just inside/outside window boundaries."""
    queue = OurQueue()
    hour = 3600
    day = hour * 24
    week = day * 7
    timestamps = [
        0, 10,
        hour - 1, hour, hour + 1,
        day, day + 1,
        week, week + 1,
        week * 30, week * 30 + 1,
    ]
    for timestamp in timestamps:
        queue.push(timestamp)
    self.assertEqual(queue.get_counters(week * 30 + 1), [11, 2, 2, 2, 2])
def df_to_sparse(df, Q_mat, active_features, tw=None, skip_sucessive=True, log_counts=False):
    """Build sparse features dataset from dense dataset and q-matrix.

    Parallel variant: the per-student feature encoding is delegated to
    encode_single_student and fanned out over all CPU cores with joblib.

    Arguments:
    df -- dense dataset, output from one function from prepare_data.py (pandas DataFrame)
    Q_mat -- q-matrix, output from one function from prepare_data.py (sparse array)
    active_features -- features used to build the dataset (list of strings)
    tw -- useful when script is *not* called from command line.
    skip_sucessive -- NOTE(review): not read anywhere in this function body;
        presumably meant to be forwarded to encode_single_student — confirm.
    log_counts -- forwarded to encode_single_student; presumably switches the
        counters to a log scale — confirm against that function.

    Output:
    sparse_df -- sparse dataset. The 5 first columns of sparse_df are just
        the same columns as in df.

    Notes:
    * tw_kc and tw_items respectively encode time windows features instead of
      regular counter features at the skill and at the item level for wins and
      attempts, as described in our paper. As a consequence, these arguments
      can only be used along with the wins and/or attempts arguments. With
      tw_kc, one column per time window x skill is encoded, whereas with
      tw_items, one column per time window is encoded (it is assumed that
      items share the same time window biases).
    """
    # Transform q-matrix into dictionary: item index -> set of its skill ids.
    dt = time.time()  # wall-clock start, reported at the end
    dict_q_mat = {i:set() for i in range(Q_mat.shape[0])}
    for elt in np.argwhere(Q_mat == 1):
        dict_q_mat[elt[0]].add(elt[1])
    # Empty (0-row) sparse blocks with the right column counts, to be filled
    # from the per-student results below.
    X={}
    if 'skills' in active_features:
        X["skills"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
    if 'attempts' in active_features:
        if tw == "tw_kc":
            # One column per (skill, time window) pair.
            X["attempts"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1]*NB_OF_TIME_WINDOWS)))
        elif tw == "tw_items":
            # One column per time window, shared across all items.
            X["attempts"] = sparse.csr_matrix(np.empty((0, NB_OF_TIME_WINDOWS)))
        else:
            # Plain cumulative counters: one column per skill.
            X["attempts"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
    if 'wins' in active_features:
        if tw == "tw_kc":
            X["wins"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1]*NB_OF_TIME_WINDOWS)))
        elif tw == "tw_items":
            X["wins"] = sparse.csr_matrix(np.empty((0, NB_OF_TIME_WINDOWS)))
        else:
            X["wins"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
    if 'fails' in active_features:
        X["fails"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
    X['df'] = np.empty((0,4)) # Keep only track of line index + user/item id + correctness
    q = defaultdict(lambda: OurQueue()) # Prepare counters for time windows
    wf_counters = defaultdict(lambda: 0)
    if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
        # Encode each student independently in parallel; each worker returns
        # a dict of feature blocks for that student's interactions.
        res = Parallel(n_jobs=-1,verbose=10)(delayed(encode_single_student)(df, stud_id, Q_mat,
            active_features, NB_OF_TIME_WINDOWS, q, dict_q_mat, tw, wf_counters,
            log_counts, X) for stud_id in df["user_id"].unique())
        # Merge the per-student blocks back into X, preserving worker order.
        for X_stud in res:
            for key in X_stud.keys():
                if key == "df":
                    X[key] = np.vstack((X[key],X_stud[key]))
                else:
                    X[key] = sparse.vstack([X[key],X_stud[key]]).tocsr()
    # NOTE(review): the commented-out code below is an earlier assembly
    # strategy, kept for reference.
    #sparse_df = sparse.vstack([sparse.csr_matrix(X_stud) for X_stud in res]).tocsr()
    #df["correct"].values.reshape(-1,1)),
    #    sparse.hstack([X[agent] for agent in active_features])]).tocsr()
    #sparse_df = sparse_df[np.argsort(sparse_df[:,3])] # sort matrix by original index
    #X_df = sparse_df[:,:5]
    #sparse_df = sparse_df[:,5:]
    onehot = OneHotEncoder()
    if 'users' in active_features:
        if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
            # Rows were re-collected per student: take user ids from X['df'].
            X['users'] = onehot.fit_transform(X["df"][:,0].reshape(-1,1))
        else:
            # No counter features requested: df order was never disturbed.
            X['users'] = onehot.fit_transform(df["user_id"].values.reshape(-1,1))
    if 'items' in active_features:
        if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
            X['items'] = onehot.fit_transform(X["df"][:,1].reshape(-1,1))
        else:
            X['items'] = onehot.fit_transform(df["item_id"].values.reshape(-1,1))
    if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
        # Column -2 of X['df'] is the correctness label; it becomes the first
        # column of the output, followed by the active feature blocks.
        sparse_df = sparse.hstack([sparse.csr_matrix(X['df'])[:,-2].reshape(-1,1),
            sparse.hstack([X[agent] for agent in active_features])]).tocsr()
        #sparse_df = sparse_df[np.argsort(sparse.csr_matrix(X["df"])[:,-1])] # sort matrix by original index
        # Column -1 of X['df'] is the original line index: restore df order.
        sparse_df = sparse_df[np.argsort(X["df"][:,-1])] # sort matrix by original index
    else:
        sparse_df = sparse.hstack([sparse.csr_matrix(df["correct"].values.reshape(-1,1)),
            sparse.hstack([X[agent] for agent in active_features])]).tocsr() # No need to sort sparse matrix here
    print("Preprocessed data in: ", time.time()-dt)
    #return sparse_df
    #if 'users' in active_features:
    #    if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
    #        sparse_df = sparse.hstack([onehot.fit_transform(X_df[:,0].reshape(-1,1))])
    #    else:
    #        X_users = onehot.fit_transform(df["user_id"].values.reshape(-1,1))
    #if 'items' in active_features:
    #    if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
    #        X_items = onehot.fit_transform(X_df[:,1].reshape(-1,1))
    #    else:
    #        X_items = onehot.fit_transform(df["item_id"].values.reshape(-1,1))
    #if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
    #    sparse_df = sparse.hstack([])
    #    sparse_df = sparse.hstack([sparse.csr_matrix(X['df'][:,-2].reshape(-1,1)),
    #        sparse.hstack([X[agent] for agent in active_features])]).tocsr()
    #    sparse_df = sparse_df[np.argsort(X["df"][:,-1])] # sort matrix by original index
    #else:
    #    sparse_df = sparse.hstack([sparse.csr_matrix(df["correct"].values.reshape(-1,1)),
    #        sparse.hstack([X[agent] for agent in active_features])]).tocsr() # No need to sort sparse matrix here
    #print("Preprocessed data in: ", time.time()-dt)
    return sparse_df