def runAssist(datasetName = 'assistments12', isTest = True, isAll = False, TmpDir = "./data"):
    """Build a _DataProcessor configured for the ASSISTments dataset.

    Args:
        datasetName: dataset identifier (kept for interface symmetry with the
            other run* factories; the processor type is hard-wired to 'assist').
        isTest: when True (and isAll is False), use a small one-week slice
            with tight user/problem limits for quick smoke runs.
        isAll: when True, overrides isTest and selects the full one-year range
            with effectively unbounded upper limits.
        TmpDir: working directory handed to the data processor.

    Returns:
        A configured _DataProcessor instance.
    """
    #######################################
    # LC parameters
    #######################################
    # Extrema of the raw assistments12 data; kept for reference, do not delete:
    # low_time = "2012-09-01 00:00:00"
    # high_time = "2013-09-01 00:00:00"
    if isAll:
        # Full year, essentially no upper cap on users/problems.
        userLC = [10, 1e9]
        problemLC = [10, 1e9]
        timeLC = ["2012-09-01 00:00:00", "2013-09-01 00:00:00"]
    elif isTest:
        # Small smoke-test slice: first ten days of September.
        userLC = [10, 300]
        problemLC = [10, 300]
        timeLC = ["2012-09-01 00:00:00", "2012-09-10 00:00:00"]
    else:
        # Standard experiment slice: September 2012.
        userLC = [10, 3000]
        problemLC = [10, 3000]
        timeLC = ["2012-09-01 00:00:00", "2012-09-30 00:00:00"]
    return _DataProcessor(userLC, problemLC, timeLC, 'assist', TmpDir = TmpDir)
def runOJ(fold_id, is_test=True):
    """Train and evaluate a DKVMN model on one fold of the OJ (hdu) dataset.

    Loads the fold's train/test tf.data datasets, trains the model, then
    writes per-fold metrics and a JSON results summary under the LC data dir.

    Args:
        fold_id: which of the 5 cross-validation folds to run.
        is_test: when True, truncate both datasets to a few batches for a
            quick smoke run.
    """
    #######################################
    # LC parameters
    #######################################
    userLC = [30,3600,0.1,1]
    problemLC = [30,1e9,0,1]
    # Extrema of the raw hdu data; may be commented out, do not delete.
    low_time = "2018-06-01 00:00:00"
    high_time = "2018-11-29 00:00:00"
    timeLC = [low_time, high_time]
    data_processor = _DataProcessor(userLC, problemLC, timeLC, 'oj', TmpDir = "./DataProcessor/data")
    LCDataDir = data_processor.LCDataDir
    saveDir = os.path.join(LCDataDir, 'DKVMN')
    print("===================================")
    print("metrics save path: ", saveDir)
    print("===================================")
    prepareFolder(saveDir)
    LC_params = data_processor.LC_params
    # Dataset parameters extend the LC filter parameters (deep-copied so the
    # processor's own LC_params stay untouched).
    dataset_params = copy.deepcopy(LC_params)
    dataset_params["trainRate"] = 0.8
    dataset_params["batch_size"] = 32
    dataset_params['kFold'] = 5
    [train_dataset, test_dataset, problem_num] = data_processor.loadDKVMNbatchData_5F(dataset_params, fold_id)
    #######################################
    # model parameters
    #######################################
    model_params = {}
    model_params['m_N'] = 40
    model_params['mk_dim'] = 60
    model_params['mv_dim'] = 60
    model_params['threshold'] = 0.5
    # NOTE(review): this materializes the whole train_dataset just to read one
    # batch shape; sibling runners use train_dataset.take(1) — confirm intent.
    model_params['data_shape'] = [data for data, label in train_dataset][0].shape.as_list()
    model_params['problem_num'] = problem_num
    model_params['epoch'] = 200
    model_params['metrics_path'] = saveDir + '/metrics' + str(fold_id) + '.csv'
    model = DKVMN(model_params)
    if is_test:
        # Smoke run: a handful of batches only.
        train_dataset = train_dataset.take(10)
        test_dataset = test_dataset.take(8)
    #######################################
    # train parameters
    #######################################
    train(epoch=model_params['epoch'], model=model, train_dataset=train_dataset, test_dataset=test_dataset)
    #######################################
    # save model
    #######################################
    results={'LC_params':LC_params, 'model_params':model_params,'results':{}}
    temp = results['results']
    [temp['tf_Accuracy'], temp['tf_Precision'], temp['tf_Recall'], temp['tf_AUC'], temp['tf_MAE'], temp['tf_RMSE']] = get_last_epoch_data(model, test_dataset)
    # metrics_path is per-fold bookkeeping, not a model hyper-parameter; drop
    # it before it becomes part of the results filename legend.
    model_params.pop("metrics_path")
    saveDict(results, saveDir, 'results'+ getLegend(model_params)+'.json')
def runAssist(is_test=True):
    """Plot sequence-length histograms for ASSISTments train/test splits.

    Compares the length distribution of user interaction sequences under a
    naive 80/20 no-shuffle split against each of the 5 pre-computed folds,
    showing one matplotlib figure per split.

    Args:
        is_test: NOTE(review) — currently unused in this function; kept for
            signature symmetry with the other runners. Confirm before removal.
    """
    #######################################
    # LC parameters
    #######################################
    userLC = [10, 3000]
    problemLC = [10, 3000]
    # Extrema of the raw data; may be commented out, do not delete.
    low_time = "2012-09-01 00:00:00"
    high_time = "2012-09-30 00:00:00"
    timeLC = [low_time, high_time]
    data_processor = _DataProcessor(userLC, problemLC, timeLC, 'assist', TmpDir="./DataProcessor/data")
    # Pre-computed 5-fold split; presumably keyed by fold index as a string.
    split_dict_data = data_processor.loadSplitInfo(kFold=5)
    [df, QMatrix, StaticInformation, DictList] = data_processor.dataprocessor.loadLCData()
    # others
    # Naive split: first 80% of users (in encounter order) vs the rest.
    all_user = list(df["user_id"].unique())
    num_user = len(all_user)
    train_uid = all_user[:int(num_user * 0.8)]
    test_uid = all_user[int(num_user * 0.8):]
    plt.title("no shuffle")
    # Sequence length = number of interaction rows per user.
    train_length = [len(df[df['user_id'] == uid]) for uid in train_uid]
    test_length = [len(df[df['user_id'] == uid]) for uid in test_uid]
    plt.hist(train_length, bins=20, density=True, label="train")
    plt.hist(test_length, bins=20, density=True, label="test")
    plt.legend()
    plt.show()
    # Same comparison for each pre-computed fold.
    for i in range(5):
        train_uid = split_dict_data[str(i)]['train']
        train_length = [len(df[df['user_id'] == uid]) for uid in train_uid]
        plt.title("FOLD: " + str(i))
        plt.hist(train_length, bins=20, density=True, label="train")
        test_uid = split_dict_data[str(i)]['test']
        test_length = [len(df[df['user_id'] == uid]) for uid in test_uid]
        plt.hist(test_length, bins=20, density=True, label="test")
        plt.legend()
        plt.show()
def runOJ(datasetName = 'hdu', isTest = True, isAll = False, TmpDir = "./data"):
    """Build a _DataProcessor configured for the OJ (hdu) dataset.

    Args:
        datasetName: dataset identifier (interface symmetry with the other
            run* factories; the processor type is hard-wired to 'oj').
        isTest: when True, use a small one-week slice with tight limits for
            quick smoke runs; otherwise the full June-November range.
        isAll: accepted for interface symmetry; not consulted here.
        TmpDir: working directory handed to the data processor.

    Returns:
        A configured _DataProcessor instance.
    """
    #######################################
    # LC parameters
    #######################################
    # Extrema of the raw hdu data; kept for reference, do not delete:
    # low_time = "2018-06-01 00:00:00"
    # high_time = "2018-11-29 00:00:00"
    if isTest:
        # Smoke-test slice: final week of the data range.
        userLC, problemLC = [10, 500, 0.1, 1], [10, 500, 0, 1]
        timeLC = ["2018-11-22 00:00:00", "2018-11-29 00:00:00"]
    else:
        # Standard experiment slice: June through November 2018.
        userLC, problemLC = [30, 3600, 0.1, 1], [30, 1e9, 0, 1]
        timeLC = ["2018-06-01 00:00:00", "2018-11-29 00:00:00"]
    return _DataProcessor(userLC, problemLC, timeLC, 'oj', TmpDir = TmpDir)
def runKDD(datasetName, isTest = True, isAll = False, TmpDir = "./data"):
    """Build a _DataProcessor configured for one of the KDD Cup datasets.

    Args:
        datasetName: one of 'algebra08', 'algebra05', 'bridge_algebra06'.
        isTest: when True, use a short time slice with tight user/problem
            limits for quick smoke runs.
        isAll: only consulted for 'algebra08' — overrides the slice with the
            full data range and essentially unbounded limits.
        TmpDir: working directory handed to the data processor.

    Returns:
        A configured _DataProcessor instance.
    """
    #######################################
    # LC parameters
    #######################################
    # Extrema of the raw algebra08 data; kept for reference, do not delete:
    # low_time = "2008-09-08 14:46:48"
    # high_time = "2009-07-06 18:02:12"
    if datasetName == 'algebra08':
        if isTest:
            userLC, problemLC = [10, 3000], [10, 5000]
            timeLC = ["2008-12-21 14:46:48", "2009-01-01 00:00:00"]
        else:
            userLC, problemLC = [30, 3600], [30, 1e9]
            timeLC = ["2008-09-08 14:46:48", "2009-01-01 00:00:00"]
        if isAll:
            # Full range overrides whichever slice was chosen above.
            userLC, problemLC = [20, 1e9], [0, 1e9]
            timeLC = ["2008-09-08 14:46:48", "2009-07-06 18:02:12"]
    elif datasetName == 'algebra05':
        if isTest:
            userLC, problemLC = [10, 3000], [10, 5000]
            timeLC = ["2006-06-01 00:00:00", "2006-06-07 11:12:38"]
        else:
            userLC, problemLC = [20, 1e9], [0, 1e9]
            timeLC = ["2005-08-30 09:50:35", "2006-06-07 11:12:38"]
    elif datasetName == 'bridge_algebra06':
        if isTest:
            userLC, problemLC = [10, 3000], [10, 5000]
            timeLC = ["2006-06-10 08:26:16", "2007-06-20 13:36:57"]
        else:
            userLC, problemLC = [20, 1e9], [0, 1e9]
            timeLC = ["2006-10-05 08:26:16", "2007-06-20 13:36:57"]
    return _DataProcessor(userLC, problemLC, timeLC, 'kdd', datasetName = datasetName, TmpDir = TmpDir)
def runAssist(fold_id, is_test=True):
    """Train and evaluate an AKT model on one fold of the ASSISTments dataset.

    Loads the fold's train/test tf.data datasets plus the Q-matrix, trains
    the model, then writes metrics and a JSON results summary under the LC
    data dir.

    Args:
        fold_id: which of the 5 cross-validation folds to run.
        is_test: when True, truncate both datasets to a few batches for a
            quick smoke run.
    """
    #######################################
    # LC parameters
    #######################################
    userLC = [10, 3000]
    problemLC = [10, 3000]
    # Extrema of the raw data; may be commented out, do not delete.
    low_time = "2012-09-01 00:00:00"
    high_time = "2012-09-30 00:00:00"
    timeLC = [low_time, high_time]
    data_processor = _DataProcessor(userLC, problemLC, timeLC, 'assist', TmpDir="./DataProcessor/data")
    LCDataDir = data_processor.LCDataDir
    saveDir = os.path.join(LCDataDir, 'AKT')
    print("===================================")
    print("metrics save path: ", saveDir)
    print("===================================")
    prepareFolder(saveDir)
    LC_params = data_processor.LC_params
    # Dataset parameters extend the LC filter parameters (deep-copied so the
    # processor's own LC_params stay untouched).
    dataset_params = copy.deepcopy(LC_params)
    dataset_params["trainRate"] = 0.8
    dataset_params["batch_size"] = 32
    dataset_params['kFold'] = 5
    [train_dataset, test_dataset, Q_matrix] = data_processor.loadAKTData_5F(dataset_params, fold_id)
    # Sparse problem-by-concept matrix -> dense float32 for the model.
    Q_matrix = Q_matrix.toarray().astype(np.float32)
    #######################################
    # model parameters
    #######################################
    model_params = {}
    model_params['problem_num'] = Q_matrix.shape[0]
    model_params['concept_num'] = Q_matrix.shape[1]
    model_params['embed_dim'] = 20
    model_params['epoch'] = 200
    model_params['threshold'] = 0.5
    model_params['metrics_path'] = saveDir + '/metrics.csv'
    # Batch shape read from a single batch of the training set.
    model_params["data_shape"] = [
        data for data, label in train_dataset.take(1)
    ][0].shape.as_list()
    model_params['Q_matrix'] = Q_matrix
    model_params['num_head'] = 5
    model = AKT(model_params)
    if is_test:
        # Smoke run: a handful of batches only.
        train_dataset = train_dataset.take(10)
        test_dataset = test_dataset.take(8)
    #######################################
    # train parameters
    #######################################
    train(epoch=model_params['epoch'],
          model=model,
          train_dataset=train_dataset,
          test_dataset=test_dataset)
    #######################################
    # save model
    #######################################
    results = {
        'LC_params': LC_params,
        'model_params': model_params,
        'results': {}
    }
    temp = results['results']
    [
        temp['tf_Accuracy'], temp['tf_Precision'], temp['tf_Recall'],
        temp['tf_AUC'], temp['tf_MAE'], temp['tf_RMSE']
    ] = get_last_epoch_data(model, test_dataset)
    # metrics_path is run bookkeeping, not a model hyper-parameter; drop it
    # before it becomes part of the results filename legend. The Q_matrix is
    # likewise not useful in the legend, but popping it would change behavior
    # relied on elsewhere, so it is left as-is.
    model_params.pop("metrics_path")
    # FIX: previously the results dict was built but never written out; the
    # sibling DKVMN/DKT runners all persist it, so do the same here.
    saveDict(results, saveDir, 'results' + getLegend(model_params) + '.json')
def runAssist():
    """Prepare KTM feature data for the ASSISTments dataset.

    Builds the active-feature configuration, loads KTM train/test datasets,
    and would train a KTM model — but execution currently stops right after
    data loading via os._exit(0) (see NOTE below).
    """
    # Feature switches: True = include this feature group in the KTM encoding.
    Features = {}
    Features['users'] = True
    Features['items'] = True
    Features['skills'] = True
    Features['lasttime_1kc'] = False
    Features['lasttime_2items'] = False
    Features['lasttime_3sequence'] = False
    Features['interval_1kc'] = False
    Features['interval_2items'] = False
    Features['interval_3sequence'] = False
    Features['wins_1kc'] = True
    Features['wins_2items'] = False
    Features['fails_1kc'] = True
    Features['fails_2items'] = False
    Features['attempts_1kc'] = False
    Features['attempts_2items'] = False
    active = [key for key, value in Features.items() if value]
    all_features = list(Features.keys())
    features_suffix = getFeaturesSuffix(active)
    # One-year window (in seconds) for windowed count features.
    window_lengths = [365 * 24 * 3600]
    #######################################
    # LC parameters
    #######################################
    userLC = [10, 3000]
    problemLC = [10, 3000]
    # Extrema of the raw algebra08 data; may be commented out, do not delete.
    # NOTE(review): low_time has a trailing space — confirm the parser
    # tolerates it.
    low_time = "2012-09-01 14:46:48 "
    high_time = "2012-09-30 00:00:00"
    timeLC = [low_time, high_time]
    a = _DataProcessor(userLC, problemLC, timeLC, 'assist', TmpDir="./DataProcessor/data")
    LCDataDir = a.LCDataDir
    saveDir = os.path.join(LCDataDir, 'KTM')
    print("===================================")
    print("metrics save path: ", saveDir)
    print("===================================")
    prepareFolder(saveDir)
    batch_size = 32
    train_fraction = 0.8
    dataset_params = {
        'active': active,
        'window_lengths': window_lengths,
        'batch_size': batch_size,
        'train_fraction': train_fraction
    }
    train_dataset, test_dataset = a.loadKTMData(dataset_params, all_features)
    # NOTE(review): hard process exit — everything below is unreachable.
    # Looks like a deliberate debug stop after data preparation; remove it
    # to actually train the model. Confirm before deleting.
    os._exit(0)
    feature_num = [d for d, l in train_dataset.take(1)][0].shape[-1]
    print('feature_num: ', feature_num)
    model_params = {}
    model_params["feature_num"] = feature_num
    model_params["embed_dim"] = 20
    model_params["threshold"] = 0.5
    model_params["metrics_path"] = saveDir + "/metrics.csv"
    model = KTM(model_params)
    train(epoch=30,
          model=model,
          train_dataset=train_dataset,
          test_dataset=test_dataset)
def runOJ(is_test=True):
    """Prepare KTM feature data for the OJ (hdu) dataset.

    Builds the active-feature configuration and loads KTM train/test
    datasets. Execution currently stops right after data loading via
    os._exit(0); the model section below it is unreachable (see NOTEs).

    Args:
        is_test: when True, truncate both datasets to a few batches for a
            quick smoke run (only relevant once the exit is removed).
    """
    # Feature switches: True = include this feature group in the KTM encoding.
    Features = {}
    Features['users'] = True
    Features['items'] = True
    Features['skills'] = True
    Features['lasttime_1kc'] = False
    Features['lasttime_2items'] = False
    Features['lasttime_3sequence'] = False
    Features['interval_1kc'] = False
    Features['interval_2items'] = False
    Features['interval_3sequence'] = False
    Features['wins_1kc'] = True
    Features['wins_2items'] = False
    Features['fails_1kc'] = True
    Features['fails_2items'] = False
    Features['attempts_1kc'] = False
    Features['attempts_2items'] = False
    active = [key for key, value in Features.items() if value]
    all_features = list(Features.keys())
    features_suffix = getFeaturesSuffix(active)
    # One-year window (in seconds) for windowed count features.
    window_lengths = [365 * 24 * 3600]
    #######################################
    # LC parameters
    #######################################
    userLC = [30, 3600, 0.1, 1]
    problemLC = [30, 1e9, 0, 1]
    # Extrema of the raw hdu data; may be commented out, do not delete.
    low_time = "2018-06-01 00:00:00"
    high_time = "2018-11-29 00:00:00"
    timeLC = [low_time, high_time]
    data_processor = _DataProcessor(userLC, problemLC, timeLC, 'oj', TmpDir="./DataProcessor/data")
    LCDataDir = data_processor.LCDataDir
    saveDir = os.path.join(LCDataDir, 'KTM')
    print("===================================")
    print("metrics save path: ", saveDir)
    print("===================================")
    prepareFolder(saveDir)
    LC_params = data_processor.LC_params
    train_fraction = 0.8
    batch_size = 32
    # (The previous copy.deepcopy(LC_params) assignment was dead code — it
    # was immediately overwritten by this literal, so it has been removed.)
    dataset_params = {
        'active': active,
        'window_lengths': window_lengths,
        'batch_size': batch_size,
        'train_fraction': train_fraction
    }
    # FIX: this previously called a.loadKTMData(...) where `a` is undefined
    # in this function (NameError); the processor is named data_processor.
    train_dataset, test_dataset = data_processor.loadKTMData(dataset_params, all_features=all_features)
    # NOTE(review): hard process exit — everything below is unreachable.
    # Looks like a deliberate debug stop after data preparation; confirm
    # before deleting.
    os._exit(0)
    #######################################
    # model parameters
    #######################################
    model_params = {}
    model_params['trainRate'] = 0.8
    model_params['lstm_units'] = 40
    model_params['dropout'] = 0.01
    model_params['l2'] = 0.01
    model_params['problem_embed_dim'] = 20
    # NOTE(review): problem_num is undefined in this function (currently
    # masked by the os._exit above); it must be obtained from the data
    # loader before this section can run. Also note a DKT model is built in
    # a KTM runner — confirm which model is intended.
    model_params['problem_num'] = problem_num
    model_params['epoch'] = 200
    model_params['threshold'] = 0.5
    model_params['metrics_path'] = saveDir + '/metrics.csv'
    model_params["data_shape"] = [
        data for data, label in train_dataset.take(1)
    ][0].shape.as_list()
    model = DKT(model_params)
    if is_test:
        # Smoke run: a handful of batches only.
        train_dataset = train_dataset.take(10)
        test_dataset = test_dataset.take(8)
    #######################################
    # train parameters
    #######################################
    train(epoch=model_params['epoch'],
          model=model,
          train_dataset=train_dataset,
          test_dataset=test_dataset)
    #######################################
    # save model
    #######################################
    results = {
        'LC_params': LC_params,
        'model_params': model_params,
        'results': {}
    }
    temp = results['results']
    [
        temp['tf_Accuracy'], temp['tf_Precision'], temp['tf_Recall'],
        temp['tf_AUC'], temp['tf_MAE'], temp['tf_RMSE']
    ] = get_last_epoch_data(model, test_dataset)
    # metrics_path is run bookkeeping, not a hyper-parameter; drop it before
    # it becomes part of the results filename legend.
    model_params.pop("metrics_path")
    saveDict(results, saveDir, 'results' + getLegend(model_params) + '.json')
def runKDD(is_test=True, datasetName='algebra05', fold_id=0):
    """Train and evaluate a DKT model on one fold of a KDD Cup dataset.

    Loads the fold's train/test tf.data datasets, trains the model, then
    writes per-fold metrics and a JSON results summary under the LC data dir.

    Args:
        is_test: when True, truncate both datasets to a few batches for a
            quick smoke run.
        datasetName: one of 'algebra08', 'algebra05', 'bridge_algebra06'.
        fold_id: which of the 5 cross-validation folds to run. FIX: this was
            previously used without being a parameter, so every call raised
            NameError; added with a backward-compatible default of 0.

    Raises:
        ValueError: if datasetName is not one of the supported datasets
            (previously this fell through and raised NameError later).
    """
    #######################################
    # LC parameters
    #######################################
    if datasetName == 'algebra08':
        userLC = [30, 3600]
        problemLC = [30, 1e9]
        low_time = "2008-09-08 14:46:48"
        high_time = "2009-01-01 00:00:00"
        timeLC = [low_time, high_time]
    elif datasetName == 'algebra05':
        userLC = [20, 1e9]
        problemLC = [0, 1e9]
        low_time = "2005-08-30 09:50:35"
        high_time = "2006-06-07 11:12:38"
        timeLC = [low_time, high_time]
    elif datasetName == 'bridge_algebra06':
        userLC = [20, 1e9]
        problemLC = [0, 1e9]
        low_time = "2006-10-05 08:26:16"
        high_time = "2007-06-20 13:36:57"
        timeLC = [low_time, high_time]
    else:
        raise ValueError(f"unknown datasetName: {datasetName!r}")
    # NOTE(review): unlike the other runners, no TmpDir is passed here, so
    # the processor's default is used — confirm that is intended.
    data_processor = _DataProcessor(userLC, problemLC, timeLC, 'kdd', datasetName=datasetName)
    LCDataDir = data_processor.LCDataDir
    saveDir = os.path.join(LCDataDir, 'DKT')
    print("===================================")
    print("metrics save path: ", saveDir)
    print("dataset is: ", datasetName)
    print("===================================")
    prepareFolder(saveDir)
    LC_params = data_processor.LC_params
    # Dataset parameters extend the LC filter parameters (deep-copied so the
    # processor's own LC_params stay untouched).
    dataset_params = copy.deepcopy(LC_params)
    dataset_params["trainRate"] = 0.8
    dataset_params["batch_size"] = 32
    dataset_params['kFold'] = 5
    [train_dataset, test_dataset, problem_num] = data_processor.loadDKTbatchData_5F(dataset_params, fold_id)
    #######################################
    # model parameters
    #######################################
    model_params = {}
    model_params['trainRate'] = 0.8
    model_params['lstm_units'] = 40
    model_params['dropout'] = 0.01
    model_params['l2'] = 0.01
    model_params['problem_embed_dim'] = 20
    model_params['problem_num'] = problem_num
    model_params['epoch'] = 200
    model_params['threshold'] = 0.5
    model_params['metrics_path'] = saveDir + '/metrics' + str(fold_id) + '.csv'
    # Batch shape read from a single batch of the training set.
    model_params["data_shape"] = [
        data for data, label in train_dataset.take(1)
    ][0].shape.as_list()
    model = DKT(model_params)
    if is_test:
        # Smoke run: a handful of batches only.
        train_dataset = train_dataset.take(10)
        test_dataset = test_dataset.take(8)
    #######################################
    # train parameters
    #######################################
    train(epoch=model_params['epoch'],
          model=model,
          train_dataset=train_dataset,
          test_dataset=test_dataset)
    #######################################
    # save model
    #######################################
    results = {
        'LC_params': LC_params,
        'model_params': model_params,
        'results': {}
    }
    temp = results['results']
    [
        temp['tf_Accuracy'], temp['tf_Precision'], temp['tf_Recall'],
        temp['tf_AUC'], temp['tf_MAE'], temp['tf_RMSE']
    ] = get_last_epoch_data(model, test_dataset)
    # metrics_path is per-fold bookkeeping, not a hyper-parameter; drop it
    # before it becomes part of the results filename legend.
    model_params.pop("metrics_path")
    saveDict(results, saveDir, 'results' + getLegend(model_params) + '.json')