예제 #1
0
    link_function = lambda x: np.log(1 + x)
elif options.pfa:
    suffix = 'swf'
    link_function = identity

if options.tw or options.pfa:  # Build time windows features
    df = full
    if 'skill_id' in full.columns:
        df = df.dropna(subset=['skill_id'])
        df['skill_ids'] = df['skill_id'].astype(str)
    else:
        df['skill_ids'] = [None] * len(df)

    dt = time.time()
    # Prepare counters for time windows
    q = defaultdict(lambda: OurQueue(only_forever=options.pfa))
    # Using zip is the fastest way to iterate DataFrames
    # Source: https://stackoverflow.com/a/34311080
    for i_sample, user, item_id, t, correct, skill_ids in zip(
            df['i'], df['user'], df['item_id'], df['timestamp'], df['correct'],
            df['skill_ids']):
        for skill_id in q_mat[item_id]:  # Fallback
            skill_id = int(skill_id)
            add(i_sample, codes[skill_id], 1)
            for pos, value in enumerate(q[user, skill_id].get_counters(t)):
                if value > 0:
                    add(i_sample, extra_codes['attempts', skill_id, pos],
                        link_function(1 + value))
            for pos, value in enumerate(q[user, skill_id,
                                          'correct'].get_counters(t)):
                if value > 0:
예제 #2
0
def df_to_sparse(df, Q_mat, active_features, tw=None, verbose=True):
	"""Build sparse features dataset from dense dataset and q-matrix.

	Arguments:
	df -- dense dataset, output from one function from prepare_data.py (pandas DataFrame).
	      Must contain the columns "user_id", "item_id", "timestamp", "correct", "inter_id".
	Q_mat -- q-matrix, output from one function from prepare_data.py (sparse array)
	active_features -- features used to build the dataset (list of strings); handled
	      values are "users", "items", "skills", "attempts", "wins" and "fails"
	tw -- time-window encoding mode: "tw_kc", "tw_items" or None (plain cumulative
	      counters); useful when script is *not* called from command line.
	verbose -- if True, print information on the encoding process (bool)

	Output:
	sparse_df -- sparse dataset. The 5 first columns of sparse_df are just the same columns as in df.

	Notes:
	* tw_kc and tw_items respectively encode time windows features instead of regular counter features
	  at the skill and at the item level for wins and attempts, as described in our paper. As a consequence,
	  these arguments can only be used along with the wins and/or attempts arguments. With tw_kc, one column
	  per time window x skill is encoded, whereas with tw_items, one column per time window is encoded (it is
	  assumed that items share the same time window biases).
	"""

	# Transform q-matrix into a dictionary: item index -> set of skill indices it involves.
	dict_q_mat = {i:set() for i in range(Q_mat.shape[0])}
	for elt in np.argwhere(Q_mat == 1):
		dict_q_mat[elt[0]].add(elt[1])

	# Pre-allocate 0-row sparse matrices with the right column counts so that
	# per-student feature blocks can simply be vstacked in the loop below.
	X={}
	if 'skills' in active_features:
		X["skills"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
	if 'attempts' in active_features:
		if tw == "tw_kc":
			# One column per (skill, time window) pair.
			X["attempts"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1]*NB_OF_TIME_WINDOWS)))
		elif tw == "tw_items":
			# One column per time window, shared across items.
			X["attempts"] = sparse.csr_matrix(np.empty((0, NB_OF_TIME_WINDOWS)))
		else:
			# Plain cumulative counters: one column per skill.
			X["attempts"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
	if 'wins' in active_features:
		if tw == "tw_kc":
			X["wins"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1]*NB_OF_TIME_WINDOWS)))
		elif tw == "tw_items":
			X["wins"] = sparse.csr_matrix(np.empty((0, NB_OF_TIME_WINDOWS)))
		else:
			X["wins"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
	if 'fails' in active_features:
		# Fails only exist as plain per-skill counters (no time-window variant).
		X["fails"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))

	X['df'] = np.empty((0,5)) # Keep track of the original dataset

	q = defaultdict(lambda: OurQueue())  # Prepare counters for time windows
	# Encode one student at a time so the counters only ever see that student's
	# own chronological history.
	for stud_id in df["user_id"].unique():
		df_stud = df[df["user_id"]==stud_id][["user_id", "item_id", "timestamp", "correct", "inter_id"]].copy()
		df_stud.sort_values(by="timestamp", inplace=True) # Chronological order is required by the counters
		df_stud = np.array(df_stud)
		X['df'] = np.vstack((X['df'], df_stud))

		if 'skills' in active_features:
			# One-hot skill involvement straight from the q-matrix rows.
			skills_temp = Q_mat[df_stud[:,1].astype(int)].copy()
			X['skills'] = sparse.vstack([X["skills"],sparse.csr_matrix(skills_temp)])
		if "attempts" in active_features:
			skills_temp = Q_mat[df_stud[:,1].astype(int)].copy()
			if tw == "tw_kc":
				attempts = np.zeros((df_stud.shape[0], NB_OF_TIME_WINDOWS*Q_mat.shape[1]))
				for l, (item_id, t) in enumerate(zip(df_stud[:,1], df_stud[:,2])):
					for skill_id in dict_q_mat[item_id]:
						# log(1 + past attempt count) for this skill, one slot per time window;
						# the counters are read *before* pushing the current attempt.
						attempts[l, skill_id*NB_OF_TIME_WINDOWS:(skill_id+1)*NB_OF_TIME_WINDOWS] = np.log(1 + \
							np.array(q[stud_id, skill_id].get_counters(t)))
						q[stud_id, skill_id].push(t)
				# Legacy vectorized implementation kept for reference:
				#attempts = np.empty((df_stud.shape[0],0))
				#for l in LIST_OF_BOUNDARIES:
				#	attempts_temp = np.zeros((df_stud.shape[0],Q_mat.shape[1])) # a_sw array
				#	for i in range(1,attempts_temp.shape[0]): # 1st line is always full of zeros
				#		list_of_indices = np.where(df_stud[i,2] - df_stud[:i,2] < l)
				#		skills_temp = Q_mat[df_stud[:i,1].astype(int)][list_of_indices]
				#		attempts_temp[i] = np.sum(skills_temp,0)
				#	skills = Q_mat[df_stud[:,1].astype(int)]
				#	attempts_temp = np.log(1+np.multiply(attempts_temp,skills)) # only keep KCs involved
				#	attempts = np.hstack((attempts,attempts_temp))
			elif tw == "tw_items":
				attempts = np.zeros((df_stud.shape[0], NB_OF_TIME_WINDOWS))
				for l, (item_id, t) in enumerate(zip(df_stud[:,1], df_stud[:,2])):
					# Item-level counters: one queue per (student, item).
					attempts[l] = np.log(1 + np.array(q[stud_id, item_id].get_counters(t)))
					q[stud_id, item_id].push(t)
				# Legacy vectorized implementation kept for reference:
				#attempts = np.empty((df_stud.shape[0],0))
				#for l in LIST_OF_BOUNDARIES:
				#	attempts_temp = np.zeros(df_stud.shape[0]) # a_sw array
				#	for i in range(1,attempts_temp.shape[0]): # 1st line is always full of zeros
				#		list_of_indices = np.where((df_stud[i,2] - df_stud[:i,2] < l) & (df_stud[i,1] == df_stud[:i,1]))
				#		attempts_temp[i] = len(list_of_indices[0])
				#	attempts_temp = np.log(1+attempts_temp)
				#	attempts = np.hstack((attempts,attempts_temp.reshape(-1,1)))
			else:
				# Cumulative number of prior attempts per skill (shifted by one row so the
				# current interaction is excluded), masked to the skills of the current item.
				attempts = np.multiply(np.cumsum(np.vstack((np.zeros(skills_temp.shape[1]),skills_temp)),0)[:-1],skills_temp)
			X['attempts'] = sparse.vstack([X['attempts'],sparse.csr_matrix(attempts)])
		if "wins" in active_features:
			skills_temp = Q_mat[df_stud[:,1].astype(int)].copy()
			if tw == "tw_kc":
				wins = np.zeros((df_stud.shape[0], NB_OF_TIME_WINDOWS*Q_mat.shape[1]))
				for l, (item_id, t, correct) in enumerate(zip(df_stud[:,1], df_stud[:,2], df_stud[:,3])):
					for skill_id in dict_q_mat[item_id]:
						wins[l, skill_id*NB_OF_TIME_WINDOWS:(skill_id+1)*NB_OF_TIME_WINDOWS] = np.log(1 + \
							np.array(q[stud_id, skill_id, "correct"].get_counters(t)))
						# Only correct answers feed the win counters.
						if correct:
							q[stud_id, skill_id, "correct"].push(t)
				# Legacy vectorized implementation kept for reference:
				#wins = np.empty((df_stud.shape[0],0))
				#for l in LIST_OF_BOUNDARIES:
				#	wins_temp = np.zeros((df_stud.shape[0],Q_mat.shape[1])) # c_sw array
				#	for i in range(1,wins_temp.shape[0]): # 1st line is always full of zeros
				#		list_of_indices = np.where(df_stud[i,2] - df_stud[:i,2] < l)
				#		skills_temp = Q_mat[df_stud[:i,1].astype(int)][list_of_indices]
				#		wins_temp[i] = np.sum(np.multiply(skills_temp,df_stud[:i,3][list_of_indices].reshape(-1,1)),0)
				#	skills = Q_mat[df_stud[:,1].astype(int)]
				#	wins_temp = np.log(1+np.multiply(wins_temp,skills)) # only keep KCs involved
				#	wins = np.hstack((wins,wins_temp))
			elif tw == "tw_items":
				wins = np.zeros((df_stud.shape[0], NB_OF_TIME_WINDOWS))
				for l, (item_id, t, correct) in enumerate(zip(df_stud[:,1], df_stud[:,2], df_stud[:,3])):
					wins[l] = np.log(1 + np.array(q[stud_id, item_id, "correct"].get_counters(t)))
					if correct:
						q[stud_id, item_id, "correct"].push(t)
				# Legacy vectorized implementation kept for reference:
				#wins = np.empty((df_stud.shape[0],0))
				#for l in LIST_OF_BOUNDARIES:
				#	wins_temp = np.zeros(df_stud.shape[0]) # c_sw array
				#	for i in range(1,wins_temp.shape[0]): # 1st line is always full of zeros
				#		list_of_indices = np.where((df_stud[i,2] - df_stud[:i,2] < l) & (df_stud[i,1] == df_stud[:i,1]))
				#		wins_temp[i] = np.log(1+np.sum(df_stud[:i,3][list_of_indices]))
				#	wins = np.hstack((wins,wins_temp.reshape(-1,1)))
			else:
				# Cumulative prior wins per skill: correctness-weighted cumsum, shifted by one
				# row, masked to the current item's skills.
				wins = np.multiply(np.cumsum(np.multiply(np.vstack((np.zeros(skills_temp.shape[1]),skills_temp)),
					np.hstack((np.array([0]),df_stud[:,3])).reshape(-1,1)),0)[:-1],skills_temp)
			X['wins'] = sparse.vstack([X['wins'],sparse.csr_matrix(wins)])
		if "fails" in active_features:
			skills_temp = Q_mat[df_stud[:,1].astype(int)].copy()
			# Same as the plain "wins" counter but weighted by (1 - correct).
			fails = np.multiply(np.cumsum(np.multiply(np.vstack((np.zeros(skills_temp.shape[1]),skills_temp)),
				np.hstack((np.array([0]),1-df_stud[:,3])).reshape(-1,1)),0)[:-1],skills_temp)
			X["fails"] = sparse.vstack([X["fails"],sparse.csr_matrix(fails)])
		if verbose:
			print(X["df"].shape)

	# One-hot encode user and item ids from the re-ordered bookkeeping columns.
	onehot = OneHotEncoder()
	if 'users' in active_features:
		X['users'] = onehot.fit_transform(X["df"][:,0].reshape(-1,1))
		if verbose:
			print("Users encoded.")
	if 'items' in active_features:
		X['items'] = onehot.fit_transform(X["df"][:,1].reshape(-1,1))
		if verbose:
			print("Items encoded.")
	# Final layout: the original df columns first, then the active feature blocks.
	sparse_df = sparse.hstack([sparse.csr_matrix(X['df']),sparse.hstack([X[agent] for agent in active_features])]).tocsr()
	return sparse_df
예제 #3
0
    cols.append(c)
    data.append(d)


suffix = 'ui'
if options.tw:  # Build time windows features
    suffix = 'das3h'
    df = full
    if 'skill_id' in full.columns:
        df = df.dropna(subset=['skill_id'])
        df['skill_ids'] = df['skill_id'].astype(str)
    else:
        df['skill_ids'] = [None] * len(df)

    dt = time.time()
    q = defaultdict(lambda: OurQueue())  # Prepare counters for time windows
    # Using zip is the fastest way to iterate DataFrames
    # Source: https://stackoverflow.com/a/34311080
    for i_sample, user, item_id, t, correct, skill_ids in zip(
            df['i'], df['user'], df['item_id'], df['timestamp'], df['correct'],
            df['skill_ids']):
        for skill_id in skill_ids.split('~~') or q_mat[item_id]:  # Fallback
            skill_id = int(skill_id)
            add(i_sample, codes[skill_id], 1)
            for pos, value in enumerate(q[user, skill_id].get_counters(t)):
                if value > 0:
                    add(i_sample, extra_codes['attempts', skill_id, pos],
                        log(1 + value))
            for pos, value in enumerate(q[user, skill_id,
                                          'correct'].get_counters(t)):
                if value > 0:
예제 #4
0
 def test_simple(self):
     """Four pushes spread over 40 days: all land in the all-time counter,
     only the last one in each sliding window."""
     one_day = 3600 * 24
     queue = OurQueue()
     for timestamp in (0, 0.8 * one_day, 5 * one_day, 40 * one_day):
         queue.push(timestamp)
     self.assertEqual(queue.get_counters(40 * one_day), [4, 1, 1, 1, 1])
예제 #5
0
 def test_complex(self):
     """Eleven pushes clustered in pairs around window boundaries: eleven in
     the all-time counter, two in each sliding window."""
     timestamps = [
         0,
         10,
         3599,
         3600,
         3601,
         3600 * 24,
         3600 * 24 + 1,
         3600 * 24 * 7,
         3600 * 24 * 7 + 1,
         3600 * 24 * 7 * 30,
         3600 * 24 * 7 * 30 + 1,
     ]
     queue = OurQueue()
     for timestamp in timestamps:
         queue.push(timestamp)
     self.assertEqual(queue.get_counters(timestamps[-1]), [11, 2, 2, 2, 2])
예제 #6
0
def df_to_sparse(df, Q_mat, active_features, tw=None, skip_sucessive=True, log_counts=False):
	"""Build sparse features dataset from dense dataset and q-matrix.

	Parallel variant: per-student sequence features are computed by
	encode_single_student via joblib, then re-assembled and re-sorted.

	Arguments:
	df -- dense dataset, output from one function from prepare_data.py (pandas DataFrame)
	Q_mat -- q-matrix, output from one function from prepare_data.py (sparse array)
	active_features -- features used to build the dataset (list of strings)
	tw -- time-window encoding mode ("tw_kc", "tw_items" or None); useful when
	      script is *not* called from command line.
	skip_sucessive -- NOTE(review): not referenced anywhere in this function body;
	      confirm whether it is dead or consumed elsewhere before removing.
	log_counts -- forwarded to encode_single_student (bool)

	Output:
	sparse_df -- sparse dataset. The first column is the correctness label
	      (df["correct"] in the no-sequence-features branch; presumably the
	      matching column of the per-student encoding otherwise — verify
	      against encode_single_student), followed by the active feature columns.

	Notes:
	* tw_kc and tw_items respectively encode time windows features instead of regular counter features
	  at the skill and at the item level for wins and attempts, as described in our paper. As a consequence,
	  these arguments can only be used along with the wins and/or attempts arguments. With tw_kc, one column
	  per time window x skill is encoded, whereas with tw_items, one column per time window is encoded (it is
	  assumed that items share the same time window biases).
	"""

	# Transform q-matrix into a dictionary: item index -> set of skill indices.
	dt = time.time()
	dict_q_mat = {i:set() for i in range(Q_mat.shape[0])}
	for elt in np.argwhere(Q_mat == 1):
		dict_q_mat[elt[0]].add(elt[1])

	# Pre-allocate 0-row sparse matrices with the right column counts so that
	# per-student feature blocks can be vstacked below.
	X={}
	if 'skills' in active_features:
		X["skills"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
	if 'attempts' in active_features:
		if tw == "tw_kc":
			# One column per (skill, time window) pair.
			X["attempts"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1]*NB_OF_TIME_WINDOWS)))
		elif tw == "tw_items":
			# One column per time window, shared across items.
			X["attempts"] = sparse.csr_matrix(np.empty((0, NB_OF_TIME_WINDOWS)))
		else:
			X["attempts"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
	if 'wins' in active_features:
		if tw == "tw_kc":
			X["wins"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1]*NB_OF_TIME_WINDOWS)))
		elif tw == "tw_items":
			X["wins"] = sparse.csr_matrix(np.empty((0, NB_OF_TIME_WINDOWS)))
		else:
			X["wins"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))
	if 'fails' in active_features:
		# Fails only exist as plain per-skill counters (no time-window variant).
		X["fails"] = sparse.csr_matrix(np.empty((0, Q_mat.shape[1])))

	X['df'] = np.empty((0,4)) # Keep only track of line index + user/item id + correctness

	q = defaultdict(lambda: OurQueue())  # Prepare counters for time windows
	wf_counters = defaultdict(lambda: 0)
	# Sequence features require per-student encoding; fan out one job per student.
	if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
		res = Parallel(n_jobs=-1,verbose=10)(delayed(encode_single_student)(df, stud_id, Q_mat, active_features, NB_OF_TIME_WINDOWS, q, dict_q_mat, tw,
			wf_counters, log_counts, X) for stud_id in df["user_id"].unique())
		# Re-assemble the per-student results into the global feature matrices.
		for X_stud in res:
			for key in X_stud.keys():
				if key == "df":
					X[key] = np.vstack((X[key],X_stud[key]))
				else:
					X[key] = sparse.vstack([X[key],X_stud[key]]).tocsr()
		# Legacy assembly code kept for reference:
		#sparse_df = sparse.vstack([sparse.csr_matrix(X_stud) for X_stud in res]).tocsr() #df["correct"].values.reshape(-1,1)),
		#		sparse.hstack([X[agent] for agent in active_features])]).tocsr()
		#sparse_df = sparse_df[np.argsort(sparse_df[:,3])] # sort matrix by original index
		#X_df = sparse_df[:,:5]
		#sparse_df = sparse_df[:,5:]
	# One-hot encode user/item ids. Ids come from the per-student bookkeeping
	# columns when sequence features were computed, otherwise straight from df.
	onehot = OneHotEncoder()
	if 'users' in active_features:
		if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
			X['users'] = onehot.fit_transform(X["df"][:,0].reshape(-1,1))
		else:
			X['users'] = onehot.fit_transform(df["user_id"].values.reshape(-1,1))
	if 'items' in active_features:
		if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
			X['items'] = onehot.fit_transform(X["df"][:,1].reshape(-1,1))
		else:
			X['items'] = onehot.fit_transform(df["item_id"].values.reshape(-1,1))
	if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
		# Rows came back grouped by student, so restore the original row order
		# using the last bookkeeping column (original line index).
		sparse_df = sparse.hstack([sparse.csr_matrix(X['df'])[:,-2].reshape(-1,1),
			sparse.hstack([X[agent] for agent in active_features])]).tocsr()
		#sparse_df = sparse_df[np.argsort(sparse.csr_matrix(X["df"])[:,-1])] # sort matrix by original index
		sparse_df = sparse_df[np.argsort(X["df"][:,-1])] # sort matrix by original index
	else:
		sparse_df = sparse.hstack([sparse.csr_matrix(df["correct"].values.reshape(-1,1)),
			sparse.hstack([X[agent] for agent in active_features])]).tocsr()
		# No need to sort sparse matrix here
	print("Preprocessed data in: ", time.time()-dt)
	# Legacy one-hot/assembly code kept for reference:
	#return sparse_df
	#if 'users' in active_features:
	#	if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
	#		sparse_df = sparse.hstack([onehot.fit_transform(X_df[:,0].reshape(-1,1))])
	#	else:
	#		X_users = onehot.fit_transform(df["user_id"].values.reshape(-1,1))
	#if 'items' in active_features:
	#	if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
	#		X_items = onehot.fit_transform(X_df[:,1].reshape(-1,1))
	#	else:
	#		X_items = onehot.fit_transform(df["item_id"].values.reshape(-1,1))
	#if len(set(active_features).intersection({"skills","attempts","wins","fails"})) > 0:
	#	sparse_df = sparse.hstack([])
	#	sparse_df = sparse.hstack([sparse.csr_matrix(X['df'][:,-2].reshape(-1,1)),
	#		sparse.hstack([X[agent] for agent in active_features])]).tocsr()
	#	sparse_df = sparse_df[np.argsort(X["df"][:,-1])] # sort matrix by original index
	#else:
	#	sparse_df = sparse.hstack([sparse.csr_matrix(df["correct"].values.reshape(-1,1)),
	#		sparse.hstack([X[agent] for agent in active_features])]).tocsr()
		# No need to sort sparse matrix here
	#print("Preprocessed data in: ", time.time()-dt)
	return sparse_df