string = re.sub(r"n\'t", " n\'t", string) string = re.sub(r"\'re", " \'re", string) string = re.sub(r"\'d", " \'d", string) string = re.sub(r"\'ll", " \'ll", string) string = re.sub(r",", " , ", string) string = re.sub(r"!", " ! ", string) string = re.sub(r"\(", " \( ", string) string = re.sub(r"\)", " \) ", string) string = re.sub(r"\?", " ? ", string) string = re.sub(r"\s{2,}", " ", string) return string # load data data = DATA(args, tokenizer) train_iter = data.train_iter test_iter = data.test_iter # vocab wordvocab = data.TEXT.vocab.itos # full vocab word_dic_full = {} word_invdic_full = {} for ii, ww in enumerate(wordvocab): word_dic_full[ww] = ii word_invdic_full[ii] = ww args.embed_num = len(data.TEXT.vocab) args.class_num = len(data.LABEL.vocab)
import pickle

from load_data import DATA
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.manifold import TSNE
from scipy.stats import kurtosis

DATA_RED = {key: {} for key in DATA}
for key, val in DATA.items():
    print(f"Transforming {key} data...")
    X, y = val

    # pca
    pca = PCA(n_components=0.95, whiten=True, random_state=0)
    X_pca = pca.fit_transform(X - X.mean())
    DATA_RED[key]["pca"] = X_pca, y

    # ica
    ica = FastICA(n_components=X_pca.shape[1], whiten=True, random_state=0)
    X_ica = ica.fit_transform(X)
    DATA_RED[key]["ica"] = X_ica, y

    # rca
    rca = GaussianRandomProjection(n_components=X_pca.shape[1], random_state=0)
    X_rca = rca.fit_transform(X)
    kurt = kurtosis(X_ica)
    kurt_rank = (-kurt).argsort()
    DATA_RED[key]["rca"] = X_rca[:, kurt_rank], y

    # tsne
    tsne = TSNE(n_components=3, random_state=0, n_jobs=-1)
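# A minimal sketch (not from the original file) of how the t-SNE embedding could be
# completed and the reduced datasets persisted; the original snippet stops after
# constructing the TSNE object, so the final fit_transform call, the helper name, and
# the output path "data_reduced.pkl" are assumptions.
import pickle

from sklearn.manifold import TSNE


def embed_and_save(DATA, DATA_RED, out_path="data_reduced.pkl"):
    # embed each dataset into 3 t-SNE components, mirroring the loop above
    for key, (X, y) in DATA.items():
        tsne = TSNE(n_components=3, random_state=0, n_jobs=-1)
        DATA_RED[key]["tsne"] = tsne.fit_transform(X), y
    # serialize all reduced representations for later reuse
    with open(out_path, "wb") as f:
        pickle.dump(DATA_RED, f)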
params.data_name = dataset

if dataset in {"synthetic"}:
    params.n_question = 50
    params.batch_size = 128
    params.seqlen = 200
    params.data_dir = '../dataset/' + dataset
    params.data_name = 'naive_c5_q50_s4000_v0'

params.save = params.data_name
params.load = params.data_name

# Setup
if "pid" not in params.data_name:
    dat = DATA(n_question=params.n_question,
               seqlen=params.seqlen, separate_char=',')
else:
    dat = PID_DATA(n_question=params.n_question,
                   seqlen=params.seqlen, separate_char=',')

seedNum = params.seed
np.random.seed(seedNum)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seedNum)
np.random.seed(seedNum)
file_name_identifier = get_file_name_identifier(params)

###Train- Test
d = vars(params)
params.lr = params.init_lr
params.memory_key_state_dim = params.q_embed_dim
params.memory_value_state_dim = params.qa_embed_dim
params.dataset = dataset

if not params.gpus:
    ctx = mx.cpu()
    print("Training with cpu ...")
else:
    ctx = mx.gpu(int(params.gpus))
    print("Training with gpu(" + params.gpus + ") ...")
params.ctx = ctx

# Read data
dat = DATA(n_question=params.n_question, seqlen=params.seqlen, separate_char=',')

seedNum = 224
np.random.seed(seedNum)

if not params.test:
    params.memory_key_state_dim = params.q_embed_dim
    params.memory_value_state_dim = params.qa_embed_dim
    d = vars(params)
    for key in d:
        print('\t', key, '\t', d[key])
    file_name = 'b' + str(params.batch_size) + \
        '_q' + str(params.q_embed_dim) + '_qa' + str(params.qa_embed_dim) + \
        '_m' + str(params.memory_size) + '_std' + str(params.init_std) + \
        '_lr' + str(params.init_lr) + '_gn' + str(params.maxgradnorm) + \
        '_f' + str(params.final_fc_dim) + '_s' + str(seedNum)
def get_auc(fold_num):
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Script to test KT')

    # Basic Parameters
    parser.add_argument('--max_iter', type=int, default=500,
                        help='number of iterations')
    parser.add_argument('--train_set', type=int, default=fold_num)
    parser.add_argument('--seed', type=int, default=224, help='default seed')

    # Common parameters
    parser.add_argument('--optim', type=str, default='adam',
                        help='Default Optimizer')
    parser.add_argument('--batch_size', type=int, default=24,
                        help='the batch size')
    parser.add_argument('--lr', type=float, default=1e-5, help='learning rate')
    parser.add_argument('--maxgradnorm', type=float, default=-1,
                        help='maximum gradient norm')
    parser.add_argument('--final_fc_dim', type=int, default=512,
                        help='hidden state dim for final fc layer')

    # AKT Specific Parameter
    parser.add_argument('--d_model', type=int, default=256,
                        help='Transformer d_model shape')
    parser.add_argument('--d_ff', type=int, default=1024,
                        help='Transformer d_ff shape')
    parser.add_argument('--dropout', type=float, default=0.05,
                        help='Dropout rate')
    parser.add_argument('--n_block', type=int, default=1,
                        help='number of blocks')
    parser.add_argument('--n_head', type=int, default=8,
                        help='number of heads in multihead attention')
    parser.add_argument('--kq_same', type=int, default=1)

    # AKT-R Specific Parameter
    parser.add_argument('--l2', type=float, default=1e-5,
                        help='l2 penalty for difficulty')

    # DKVMN Specific Parameter
    parser.add_argument('--s_embed_dim', type=int, default=50,
                        help='question embedding dimensions')
    parser.add_argument('--sa_embed_dim', type=int, default=256,
                        help='skill-response embedding dimensions')
    parser.add_argument('--memory_size', type=int, default=50,
                        help='memory size')
    parser.add_argument('--init_std', type=float, default=0.1,
                        help='weight initialization std')

    # DKT Specific Parameter
    parser.add_argument('--hidden_dim', type=int, default=512)
    parser.add_argument('--lamda_r', type=float, default=0.1)
    parser.add_argument('--lamda_w1', type=float, default=0.1)
    parser.add_argument('--lamda_w2', type=float, default=0.1)

    # Datasets and Model
    parser.add_argument(
        '--model', type=str, default='akt_eid',
        help="combination of akt, eid (mandatory) separated by underscore '_'."
    )
    parser.add_argument('--dataset', type=str, default='assist2009_eid')

    params = parser.parse_args()
    dataset = params.dataset

    if dataset in {'assist2009_eid'}:
        params.batch_size = 24
        params.seqlen = 400
        params.data_dir = 'data/' + dataset
        params.data_name = dataset
        params.n_skill = 124
        params.n_eid = 26688
        params.n_tid = 214  # maximum true response count in past
        params.n_fid = 214  # maximum false response count in past
        params.n_xid = 0
        params.n_yid = 0

    if dataset in {'assist2017_eid'}:
        params.batch_size = 24
        params.seqlen = 200
        params.data_dir = 'data/' + dataset
        params.data_name = dataset
        params.n_skill = 102
        params.n_eid = 3162
        params.n_tid = 12  # maximum true response count in past
        params.n_fid = 90  # maximum false response count in past
        params.n_xid = 0
        params.n_yid = 0

    params.save = params.data_name
    params.load = params.data_name

    # Setup
    if 'eid' in params.data_name:
        dat = EID_DATA(n_skill=params.n_skill,
                       seqlen=params.seqlen, separate_char=',')
    else:
        dat = DATA(n_skill=params.n_skill,
                   seqlen=params.seqlen, separate_char=',')

    seedNum = params.seed
    np.random.seed(seedNum)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seedNum)
    np.random.seed(seedNum)
    file_name_identifier = get_file_name_identifier(params)

    ###Train- Test
    d = vars(params)
    for key in d:
        print('\t', key, '\t', d[key])

    # model path
    file_name = ''
    for item_ in file_name_identifier:
        file_name = file_name + item_[0] + str(item_[1])

    train_data_path = params.data_dir + '/' + \
        params.data_name + '_train' + str(params.train_set) + '.csv'
    valid_data_path = params.data_dir + "/" + \
        params.data_name + '_valid' + str(params.train_set) + '.csv'

    train_s_data, train_sa_data, train_eid, train_tid, train_fid, train_xid, train_yid = dat.load_data(
        train_data_path)
    valid_s_data, valid_sa_data, valid_eid, valid_tid, valid_fid, valid_xid, valid_yid = dat.load_data(
        valid_data_path)

    print('\n')
    print('train_s_data.shape', train_s_data.shape)
    print('train_sa_data.shape', train_sa_data.shape)
    print('train_eid.shape', train_eid.shape)
    print('train_tid.shape', train_tid.shape)
    print('train_fid.shape', train_fid.shape)
    print('valid_s_data.shape', valid_s_data.shape)
    print('valid_sa_data.shape', valid_sa_data.shape)
    print('valid_eid.shape', valid_eid.shape)
    print('valid_tid.shape', valid_tid.shape)
    print('valid_fid.shape', valid_fid.shape)
    print('\n')

    # Train and get the best episode
    best_epoch = train_one_dataset(params, file_name, train_s_data, train_sa_data,
                                   train_eid, train_tid, train_fid, train_xid, train_yid,
                                   valid_s_data, valid_sa_data, valid_eid, valid_tid,
                                   valid_fid, valid_xid, valid_yid)

    test_data_path = params.data_dir + '/' + \
        params.data_name + '_test' + str(params.train_set) + '.csv'
    test_s_data, test_sa_data, test_eid, test_tid, test_fid, test_xid, test_yid, test_s_num = dat.load_test_data(
        test_data_path)
    auc = test_one_dataset(params, file_name, test_s_data, test_sa_data, test_eid,
                           test_tid, test_fid, test_xid, test_yid, best_epoch)
    return test_s_num, auc
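# Hedged usage sketch (not in the original): average the test AUC of the eid variant
# over cross-validation folds, weighting each fold by its number of test sequences.
# The fold range 1..5 is an assumption based on the _train{fold}.csv naming used above.
if __name__ == '__main__':
    fold_results = [get_auc(fold) for fold in range(1, 6)]
    total = sum(n for n, _ in fold_results)
    weighted_auc = sum(n * auc for n, auc in fold_results) / total
    print('weighted mean test AUC:', weighted_auc)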
def get_auc(fold_num):
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Script to test KT')

    # Basic Parameters
    parser.add_argument('--max_iter', type=int, default=1000,
                        help='number of iterations')
    parser.add_argument('--train_set', type=int, default=fold_num)
    parser.add_argument('--seed', type=int, default=224, help='default seed')

    # Common parameters
    parser.add_argument('--optim', type=str, default='adam',
                        help='Default Optimizer')
    parser.add_argument('--batch_size', type=int, default=24,
                        help='the batch size')
    parser.add_argument('--lr', type=float, default=1e-5, help='learning rate')
    parser.add_argument('--maxgradnorm', type=float, default=-1,
                        help='maximum gradient norm')
    parser.add_argument('--final_fc_dim', type=int, default=512,
                        help='hidden state dim for final fc layer')

    # AKT Specific Parameter
    parser.add_argument('--d_model', type=int, default=256,
                        help='Transformer d_model shape')
    parser.add_argument('--d_ff', type=int, default=1024,
                        help='Transformer d_ff shape')
    parser.add_argument('--dropout', type=float, default=0.05,
                        help='Dropout rate')
    parser.add_argument('--n_block', type=int, default=1,
                        help='number of blocks')
    parser.add_argument('--n_head', type=int, default=8,
                        help='number of heads in multihead attention')
    parser.add_argument('--kq_same', type=int, default=1)

    # AKT-R Specific Parameter
    parser.add_argument('--l2', type=float, default=1e-5,
                        help='l2 penalty for difficulty')

    # Datasets and Model
    parser.add_argument('--model', type=str, default='akt',
                        help="combination of akt(mandatory), e_p_f_a (mandatory) separated by underscore '_'.")
    parser.add_argument('--dataset', type=str, default="assist2009")
    parser.add_argument('--test', type=bool, default=False, help='enable testing')

    params = parser.parse_args()
    dataset = params.dataset

    if dataset in {"assist2009"}:
        params.n_question = 124
        params.batch_size = 24
        params.seqlen = 400
        params.data_dir = 'data/' + dataset
        params.data_name = dataset
        params.n_pid = 19932
        params.n_tid = 8
        params.n_fid = 8
        params.n_sd = 14  # sequence_delay
        params.n_rd = 11  # repeat_delay
        params.n_xid = 816
        params.n_yid = 4

    if dataset in {"assist2017"}:
        params.batch_size = 24
        params.seqlen = 200
        params.data_dir = 'data/' + dataset
        params.data_name = dataset
        params.n_question = 102
        params.n_pid = 0  # 3162
        params.n_tid = 4
        params.n_fid = 7
        params.n_sd = 18  # sequence_delay
        params.n_rd = 20  # repeat_delay
        params.n_xid = 16
        params.n_yid = 6

    if dataset in {"statics"}:
        params.n_question = 1223
        params.batch_size = 24
        params.seqlen = 200
        params.data_dir = 'data/' + dataset
        params.data_name = dataset
        params.n_pid = 0
        params.n_tid = 8
        params.n_fid = 9
        params.n_sd = 16
        params.n_rd = 17
        params.n_xid = 382
        params.n_yid = 19

    if dataset in {"slepemapy"}:
        params.n_question = 1277
        params.batch_size = 24
        params.seqlen = 200
        params.data_dir = 'data/' + dataset
        params.data_name = dataset
        params.n_pid = 56030
        params.n_tid = 7
        params.n_fid = 5
        params.n_sd = 14
        params.n_rd = 15
        params.n_xid = 21
        params.n_yid = 0  # 56030

    params.save = params.data_name
    params.load = params.data_name

    # Setup
    dat = DATA(n_question=params.n_question,
               seqlen=params.seqlen, separate_char=',')

    seedNum = params.seed
    np.random.seed(seedNum)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seedNum)
    np.random.seed(seedNum)
    file_name_identifier = get_file_name_identifier(params)

    ###Train- Test
    d = vars(params)
    for key in d:
        print('\t', key, '\t', d[key])

    # model path
    file_name = ''
    for item_ in file_name_identifier:
        file_name = file_name + item_[0] + str(item_[1])

    train_data_path = params.data_dir + "/" + \
        params.data_name + "_train" + str(params.train_set) + ".csv"
    valid_data_path = params.data_dir + "/" + \
        params.data_name + "_valid" + str(params.train_set) + ".csv"
    train_data = dat.load_data(train_data_path)
    valid_data = dat.load_data(valid_data_path)

    # Train and get the best episode
    best_epoch = train_one_dataset(
        params, file_name, train_data, valid_data)

    test_data_path = params.data_dir + "/" + \
        params.data_name + "_test" + str(params.train_set) + ".csv"
    test_data = dat.load_test_data(test_data_path)
    auc, acc, loss = test_one_dataset(params, file_name, test_data, best_epoch)
    return test_data[-1], auc, acc, loss
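# Hedged usage sketch (not part of the original script): run get_auc over the
# cross-validation folds and report a size-weighted mean of the test metrics. The fold
# range 1..5 and the interpretation of the first return value (test set size used as a
# weight) are assumptions based on the *_train{fold}.csv / *_test{fold}.csv naming above.
if __name__ == '__main__':
    fold_results = [get_auc(fold) for fold in range(1, 6)]
    total = sum(n for n, _, _, _ in fold_results)
    weighted_auc = sum(n * auc for n, auc, _, _ in fold_results) / total
    print('weighted mean test AUC over folds:', weighted_auc)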
def adjust_param(max_iter, batch_size, seqlen, test_seqlen, min_seqlen, max_seqlen):
    parser = argparse.ArgumentParser(description='Script to test KVMN.')
    parser.add_argument('--gpus', type=str, default='0',
                        help='the gpus will be used, e.g "0,1,2,3"')
    parser.add_argument('--max_iter', type=int, default=max_iter,
                        help='number of iterations')  # default=50
    # NOTE: argparse's type=bool converts any non-empty string to True (e.g. "--test False"
    # still enables testing), so these flags are only reliable through their defaults.
    parser.add_argument('--test', type=bool, default=False, help='enable testing')
    parser.add_argument('--train_test', type=bool, default=True, help='test after training')
    parser.add_argument('--show', type=bool, default=True, help='print progress')

    dataset = "STATICS"  # assist2009_updated / assist2015 / KDDal0506 / STATICS

    if dataset == "assist2009_updated":
        parser.add_argument('--batch_size', type=int, default=batch_size,
                            help='the batch size')  # 32
        parser.add_argument('--q_embed_dim', type=int, default=50,
                            help='question embedding dimensions')  # 50
        parser.add_argument('--qa_embed_dim', type=int, default=10,
                            help='answer and question embedding dimensions')  # 200
        parser.add_argument('--memory_size', type=int, default=10, help='memory size')
        parser.add_argument('--init_std', type=float, default=0.1,
                            help='weight initialization std')
        parser.add_argument('--init_lr', type=float, default=0.05,
                            help='initial learning rate')
        parser.add_argument('--final_lr', type=float, default=1E-5,
                            help='learning rate will not decrease after hitting this threshold')
        parser.add_argument('--momentum', type=float, default=0.9, help='momentum rate')
        parser.add_argument('--maxgradnorm', type=float, default=50.0,
                            help='maximum gradient norm')
        parser.add_argument('--final_fc_dim', type=float, default=50,
                            help='hidden state dim for final fc layer')
        parser.add_argument('--n_question', type=int, default=110,
                            help='the number of unique questions in the dataset')
        parser.add_argument('--seqlen', type=int, default=seqlen,
                            help='the allowed maximum length of a sequence')  # 200
        parser.add_argument('--data_dir', type=str, default='../../data/assist2009_updated',
                            help='data directory')
        parser.add_argument('--data_name', type=str, default='assist2009_updated',
                            help='data set name')
        parser.add_argument('--load', type=str, default='assist2009_updated',
                            help='model file to load')
        parser.add_argument('--save', type=str, default='assist2009_updated',
                            help='path to save model')
    elif dataset == "assist2015":
        parser.add_argument('--batch_size', type=int, default=batch_size,
                            help='the batch size')
        parser.add_argument('--q_embed_dim', type=int, default=50,
                            help='question embedding dimensions')
        parser.add_argument('--qa_embed_dim', type=int, default=10,
                            help='answer and question embedding dimensions')
        parser.add_argument('--memory_size', type=int, default=10, help='memory size')
        parser.add_argument('--init_std', type=float, default=0.1,
                            help='weight initialization std')
        parser.add_argument('--init_lr', type=float, default=0.1,
                            help='initial learning rate')
        parser.add_argument('--final_lr', type=float, default=1E-5,
                            help='learning rate will not decrease after hitting this threshold')
        parser.add_argument('--momentum', type=float, default=0.9, help='momentum rate')
        parser.add_argument('--maxgradnorm', type=float, default=50.0,
                            help='maximum gradient norm')
        parser.add_argument('--final_fc_dim', type=float, default=50,
                            help='hidden state dim for final fc layer')
        parser.add_argument('--n_question', type=int, default=100,
                            help='the number of unique questions in the dataset')
        parser.add_argument('--seqlen', type=int, default=seqlen,
                            help='the allowed maximum length of a sequence')
        parser.add_argument('--data_dir', type=str, default='../../data/assist2015',
                            help='data directory')
        parser.add_argument('--data_name', type=str, default='assist2015',
                            help='data set name')
        parser.add_argument('--load', type=str, default='assist2015',
                            help='model file to load')
        parser.add_argument('--save', type=str, default='assist2015',
                            help='path to save model')
    elif dataset == "STATICS":
        parser.add_argument('--batch_size', type=int, default=batch_size,
                            help='the batch size')
        parser.add_argument('--q_embed_dim', type=int, default=50,
                            help='question embedding dimensions')
        parser.add_argument('--qa_embed_dim', type=int, default=100,
                            help='answer and question embedding dimensions')
        parser.add_argument('--memory_size', type=int, default=10, help='memory size')
        parser.add_argument('--init_std', type=float, default=0.1,
                            help='weight initialization std')
        parser.add_argument('--init_lr', type=float, default=0.01,
                            help='initial learning rate')
        parser.add_argument('--final_lr', type=float, default=1E-5,
                            help='learning rate will not decrease after hitting this threshold')
        parser.add_argument('--momentum', type=float, default=0.9, help='momentum rate')
        parser.add_argument('--maxgradnorm', type=float, default=50.0,
                            help='maximum gradient norm')
        parser.add_argument('--final_fc_dim', type=float, default=50,
                            help='hidden state dim for final fc layer')
        parser.add_argument('--n_question', type=int, default=1223,
                            help='the number of unique questions in the dataset')
        parser.add_argument('--seqlen', type=int, default=seqlen,
                            help='the allowed maximum length of a sequence')
        parser.add_argument('--data_dir', type=str, default='../../data/STATICS',
                            help='data directory')
        parser.add_argument('--data_name', type=str, default='STATICS',
                            help='data set name')
        parser.add_argument('--load', type=str, default='STATICS',
                            help='model file to load')
        parser.add_argument('--save', type=str, default='STATICS',
                            help='path to save model')

    params = parser.parse_args()
    params.lr = params.init_lr
    params.memory_key_state_dim = params.q_embed_dim
    params.memory_value_state_dim = params.qa_embed_dim
    params.ctx = mx.cpu()
    # test_seqlen = params.seqlen

    # Read data
    train_dat = DATA(n_question=params.n_question,
                     seqlen=params.seqlen, separate_char=',')
    test_dat = DATA(n_question=params.n_question,
                    seqlen=test_seqlen, separate_char=',')

    seedNum = 224
    np.random.seed(seedNum)

    if not params.test:
        params.memory_key_state_dim = params.q_embed_dim
        params.memory_value_state_dim = params.qa_embed_dim
        train_seqlen = params.seqlen
        d = vars(params)

        train_data_path = params.data_dir + "/" + params.data_name + \
            "_sub_train_" + str(min_seqlen) + "_" + str(max_seqlen) + ".csv"
        valid_data_path = params.data_dir + "/" + params.data_name + \
            "_sub_valid_" + str(min_seqlen) + "_" + str(max_seqlen) + ".csv"
        test_data_path = params.data_dir + "/" + params.data_name + \
            "_test_" + str(min_seqlen) + "_" + str(max_seqlen) + ".csv"

        train_u2q_data, train_u2qa_data = train_dat.load_data(train_data_path)
        valid_u2q_data, valid_u2qa_data, valid_u2tf_data = train_dat.load_test_data(
            valid_data_path, 0.111)  # 0.1/0.9
        test_u2q_data, test_u2qa_data, test_u2tf_data = test_dat.load_test_data(
            test_data_path, 0.1)

        total_train_valid_acc = 0
        total_train_valid_loss = 0
        total_test_valid_auc = 0
        total_test_valid_acc = 0
        total_test_valid_loss = 0
        user_count = 0
        best_epoch = 30
        all_pred_list = []
        all_target_list = []
        i = 0

        for user_id in train_u2q_data:
            params.seqlen = train_seqlen
            file_name = 'u' + user_id + '_b' + str(params.batch_size) + \
                '_q' + str(params.q_embed_dim) + '_qa' + str(params.qa_embed_dim) + \
                '_m' + str(params.memory_size) + '_std' + str(params.init_std) + \
                '_lr' + str(params.init_lr) + '_gn' + str(params.maxgradnorm) + \
                '_f' + str(params.final_fc_dim) + '_s' + str(seedNum)

            train_q_data = train_u2q_data[user_id]
            train_qa_data = train_u2qa_data[user_id]
            valid_q_data = valid_u2q_data[user_id]
            valid_qa_data = valid_u2qa_data[user_id]
            valid_tf_data = valid_u2tf_data[user_id]

            train_valid_acc, train_valid_loss = train_one_dataset(
                params, file_name, train_q_data, train_qa_data,
                valid_q_data, valid_qa_data, valid_tf_data)
            total_train_valid_acc += train_valid_acc
            total_train_valid_loss += train_valid_loss

            if params.train_test:
                params.seqlen = test_seqlen
                test_q_data = test_u2q_data[user_id]
                test_qa_data = test_u2qa_data[user_id]
                test_tf_data = test_u2tf_data[user_id]
                pred_list, target_list = test_one_dataset(
                    params, file_name, test_q_data, test_qa_data,
                    test_tf_data, best_epoch, user_id)
                all_pred_list += pred_list
                all_target_list += target_list

            user_count += 1

        average_train_valid_acc = total_train_valid_acc / user_count
        average_train_valid_loss = total_train_valid_loss / user_count
        # print("average_train_valid_acc: ", average_train_valid_acc)
        # print("average_train_valid_loss: ", average_train_valid_loss)

        all_pred = np.concatenate(all_pred_list, axis=0)
        all_target = np.concatenate(all_target_list, axis=0)
        loss = run.binaryEntropy(all_target, all_pred)
        auc = run.compute_auc(all_target, all_pred)
        acc = run.compute_accuracy(all_target, all_pred)
        # print("valid_auc: ", auc)
        # print("valid_acc: ", acc)
        # print("valid_loss: ", loss)
        return auc
    else:
        params.memory_key_state_dim = params.q_embed_dim
        params.memory_value_state_dim = params.qa_embed_dim
        params.seqlen = test_seqlen
        d = vars(params)

        train_data_path = params.data_dir + "/" + params.data_name + \
            "_sub_train_" + str(min_seqlen) + "_" + str(max_seqlen) + ".csv"
        valid_data_path = params.data_dir + "/" + params.data_name + \
            "_sub_valid_" + str(min_seqlen) + "_" + str(max_seqlen) + ".csv"
        test_data_path = params.data_dir + "/" + params.data_name + \
            "_test_" + str(min_seqlen) + "_" + str(max_seqlen) + ".csv"

        train_u2q_data, train_u2qa_data = train_dat.load_data(train_data_path)
        test_u2q_data, test_u2qa_data, test_u2tf_data = test_dat.load_test_data(
            test_data_path, 0.1)

        user_count = 0
        best_epoch = 30
        all_pred_list = []
        all_target_list = []
        i = 0

        for user_id in train_u2q_data:
            file_name = params.save + '-dkvmn_initialization'
            test_q_data = test_u2q_data[user_id]
            test_qa_data = test_u2qa_data[user_id]
            test_tf_data = test_u2tf_data[user_id]
            pred_list, target_list = test_one_dataset(
                params, file_name, test_q_data, test_qa_data,
                test_tf_data, best_epoch, user_id)
            all_pred_list += pred_list
            all_target_list += target_list
            user_count += 1

        all_pred = np.concatenate(all_pred_list, axis=0)
        all_target = np.concatenate(all_target_list, axis=0)
        loss = run.binaryEntropy(all_target, all_pred)
        auc = run.compute_auc(all_target, all_pred)
        acc = run.compute_accuracy(all_target, all_pred)
        # print("valid_auc: ", auc)
        # print("valid_acc: ", acc)
        # print("valid_loss: ", loss)
        return auc
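# A hedged usage sketch (not from the original file): sweep batch size and training
# sequence length for one STATICS length bucket and keep the configuration with the best
# validation AUC returned by adjust_param. The bucket bounds (min_seqlen=1, max_seqlen=50),
# the candidate grids, and max_iter=30 are assumptions for illustration only.
if __name__ == '__main__':
    best_auc, best_cfg = 0.0, None
    for batch_size in (8, 16, 32):
        for seqlen in (50, 100, 200):
            auc = adjust_param(max_iter=30, batch_size=batch_size, seqlen=seqlen,
                               test_seqlen=200, min_seqlen=1, max_seqlen=50)
            if auc > best_auc:
                best_auc, best_cfg = auc, (batch_size, seqlen)
    print('best AUC', best_auc, 'with (batch_size, seqlen) =', best_cfg)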
params = parser.parse_args()
params.lr = params.init_lr
params.memory_key_state_dim = params.q_embed_dim
params.memory_value_state_dim = params.qa_embed_dim
params.dataset = dataset

if params.gpus is None:
    ctx = mx.cpu()
    print("Training with cpu ...")
else:
    ctx = mx.gpu(int(params.gpus))
    print("Training with gpu(" + params.gpus + ") ...")
params.ctx = ctx

# Read data
dat = DATA(n_question=params.n_question, seqlen=params.seqlen, separate_char=',')

seedNum = 224
np.random.seed(seedNum)

if not params.test:
    params.memory_key_state_dim = params.q_embed_dim
    params.memory_value_state_dim = params.qa_embed_dim
    d = vars(params)
    for key in d:
        print('\t', key, '\t', d[key])
    file_name = 'b' + str(params.batch_size) + \
        '_q' + str(params.q_embed_dim) + '_qa' + str(params.qa_embed_dim) + \
        '_m' + str(params.memory_size) + '_std' + str(params.init_std) + \
        '_lr' + str(params.init_lr) + '_gn' + str(params.maxgradnorm) + \
        '_f' + str(params.final_fc_dim) + '_s' + str(seedNum)

    train_data_path = params.data_dir + "/" + params.data_name + "_train1.csv"