def load_tt_datas(config=None, reload=True):
    """Load train/test data and embedding tables for a sentiment dataset.

    With ``reload=True`` the raw dataset is parsed, the GloVe / SSWE /
    random embedding dictionaries are built, and everything is dumped to
    the mid-data cache. With ``reload=False`` the cached pretreatment
    files are loaded instead.

    Side effects: ``config`` receives the ``'pre_embedding'``,
    ``'sswe_embedding'`` and ``'rand_embedding'`` entries in place.

    :param config: dict with at least 'dataset' ('lap'/'res'/'dong'),
        'class_num' and 'emb_stddev'; mutated in place.
    :param reload: True to re-parse the raw data, False to read the
        cached pretreatment files.
    :return: (train_data, test_data)
    :raises ValueError: if config['dataset'] is not one of
        'lap', 'res' or 'dong'.
    """
    # Avoid the shared-mutable-default pitfall: `config` is written to below.
    if config is None:
        config = {}

    dataset = config['dataset']
    if dataset not in ('lap', 'res', 'dong'):
        raise ValueError('unknown dataset: %r' % (dataset,))

    # Cached mid-data lives in a per-class-count directory.
    if config['class_num'] == 2:
        path = 'datas/mid_data_2classes/'
    else:
        path = 'datas/mid_data_3classes/'

    if reload:
        print("reload the datasets.")
        if dataset == 'lap':
            train_data, test_data, word2idx = load_data(
                lap_train_path, lap_test_path, class_num=config['class_num'])
        elif dataset == 'res':
            train_data, test_data, word2idx = load_data(
                res_train_path, res_test_path, class_num=config['class_num'])
        else:  # 'dong' — validated above
            train_data, test_data, word2idx = load_data_dong(
                dong_train_path, dong_test_path, class_num=config['class_num'])

        emb_dict = load_glove(pre_train, word2idx, init_std=config['emb_stddev'])
        sswe_dict = load_ssweu(sswe_path, word2idx, init_std=config['emb_stddev'])
        rand_dict = load_random('', word2idx, init_std=config['emb_stddev'])

        # Dump the pretreatment data so the next run can skip parsing.
        if dataset == 'lap':
            dump_file([train_data, path + mid_lap_train_data],
                      [test_data, path + mid_lap_test_data],
                      [emb_dict, path + mid_lap_emb_dict],
                      [word2idx, path + mid_lap_word2idx],
                      [sswe_dict, path + mid_lap_sswe_dict],
                      [rand_dict, path + mid_lap_rand_dict])
        elif dataset == 'res':
            dump_file([train_data, path + mid_res_train_data],
                      [test_data, path + mid_res_test_data],
                      [emb_dict, path + mid_res_emb_dict],
                      [word2idx, path + mid_res_word2idx],
                      [sswe_dict, path + mid_res_sswe_dict],
                      [rand_dict, path + mid_res_rand_dict])
        else:
            dump_file([train_data, path + mid_dong_train_data],
                      [test_data, path + mid_dong_test_data],
                      [emb_dict, path + mid_dong_emb_dict],
                      [word2idx, path + mid_dong_word2idx],
                      [sswe_dict, path + mid_dong_sswe_dict],
                      [rand_dict, path + mid_dong_rand_dict])
    else:
        print("not reload the datasets.")
        # NOTE(review): word2idx is dumped above but deliberately not
        # reloaded here — the cached read only needs these five objects.
        if dataset == 'lap':
            datas = load_file(path + mid_lap_train_data,
                              path + mid_lap_test_data,
                              path + mid_lap_emb_dict,
                              path + mid_lap_sswe_dict,
                              path + mid_lap_rand_dict)
        elif dataset == 'res':
            datas = load_file(path + mid_res_train_data,
                              path + mid_res_test_data,
                              path + mid_res_emb_dict,
                              path + mid_res_sswe_dict,
                              path + mid_res_rand_dict)
        else:
            datas = load_file(path + mid_dong_train_data,
                              path + mid_dong_test_data,
                              path + mid_dong_emb_dict,
                              path + mid_dong_sswe_dict,
                              path + mid_dong_rand_dict)
        train_data = datas[0]
        test_data = datas[1]
        emb_dict = datas[2]
        sswe_dict = datas[3]
        rand_dict = datas[4]

    # Hand the embedding tables back through the config dict.
    config['pre_embedding'] = emb_dict
    config['sswe_embedding'] = sswe_dict
    config['rand_embedding'] = rand_dict
    return train_data, test_data
def load_tt_datas(config=None, reload=True):
    """Load train/test session data and the item-embedding table.

    With ``reload=True`` the raw dataset is parsed, a fresh random
    embedding table is created and dumped to the mid-data cache.
    With ``reload=False`` the splits are re-parsed but the embedding
    table is read back from the cache. Note: 'home' is only handled on
    the reload path in the original code; it has no cached branch.

    Side effects: sets ``config['n_items']`` and
    ``config['pre_embedding']`` in place.

    :param config: dict with 'dataset', 'hidden_size', 'emb_stddev'
        (and 'class_num' for cikm16); mutated in place.
    :param reload: True to rebuild and dump the embedding, False to
        load the cached one.
    :return: (train_data, test_data)
    :raises ValueError: for an unsupported dataset name.
    """
    # Avoid the shared-mutable-default pitfall: `config` is written to below.
    if config is None:
        config = {}

    dataset = config['dataset']
    path = 'datas/mid_data'

    if reload:
        print("reload the datasets.")
        print(dataset)
        # Parse the requested split and remember its cache file name.
        if dataset == 'home':
            train_data, test_data, item2idx, n_items = load_data_p(
                home_train, home_test, pro=None)
            emb_file = home_emb_dict
        elif dataset == 'rsc15_4':
            train_data, test_data, item2idx, n_items = load_data_p(
                rsc15_train, rsc15_test, pro=4)
            emb_file = mid_rsc15_4_emb_dict
        elif dataset == 'rsc15_64':
            train_data, test_data, item2idx, n_items = load_data_p(
                rsc15_train, rsc15_test, pro=64)
            emb_file = mid_rsc15_64_emb_dict
        elif dataset == 'cikm16':
            train_data, test_data, item2idx, n_items = load_data2(
                cikm16_train, cikm16_test, class_num=config['class_num'])
            emb_file = mid_cikm16_emb_dict
        else:
            raise ValueError('unknown dataset: %r' % (dataset,))

        # The original consistently stores n_items - 1; presumably one id
        # is a padding item — TODO confirm against the loaders.
        config["n_items"] = n_items - 1
        emb_dict = load_random(item2idx, edim=config['hidden_size'],
                               init_std=config['emb_stddev'])
        config['pre_embedding'] = emb_dict
        dump_file([emb_dict, path + emb_file])
        print("-----")
    else:
        print("not reload the datasets.")
        print(dataset)
        if dataset == 'rsc15_4':
            train_data, test_data, item2idx, n_items = load_data_p(
                rsc15_train, rsc15_test, pro=4)
            emb_file = mid_rsc15_4_emb_dict
        elif dataset == 'rsc15_64':
            train_data, test_data, item2idx, n_items = load_data_p(
                rsc15_train, rsc15_test, pro=64)
            emb_file = mid_rsc15_64_emb_dict
        elif dataset == 'cikm16':
            train_data, test_data, item2idx, n_items = load_data2(
                cikm16_train, cikm16_test, class_num=config['class_num'])
            emb_file = mid_cikm16_emb_dict
        else:
            # 'home' (and anything else) was never handled here; the
            # original fell through to a NameError — fail clearly instead.
            raise ValueError('no cached embedding for dataset: %r' % (dataset,))

        config["n_items"] = n_items - 1
        # load_file returns a list of the dumped objects; the embedding
        # table is the first (only) entry.
        emb_dict = load_file(path + emb_file)
        config['pre_embedding'] = emb_dict[0]
        print("-----")

    return train_data, test_data
def load_tt_datas(config=None, reload=True):
    """Load train/test session data and the item-embedding table.

    ``reload=True``: parse the raw data, build a fresh random embedding
    table and dump it to the mid-data cache. ``reload=False``: parse the
    raw data the same way but read the embedding back from the cache.
    The split-parsing code was byte-identical in both branches of the
    original, so it is factored out here; only the embedding source
    differs between the two modes.

    Side effects: sets ``config['n_items']`` and
    ``config['pre_embedding']`` in place.

    :param config: dict with 'dataset', 'hidden_size', 'emb_stddev'
        (and 'class_num' for rsc15/cikm16); mutated in place.
    :param reload: True to rebuild and dump the embedding, False to
        load the cached one.
    :return: (train_data, test_data)
    :raises ValueError: for an unsupported dataset name.
    """
    # Avoid the shared-mutable-default pitfall: `config` is written to below.
    if config is None:
        config = {}

    dataset = config['dataset']
    path = 'datas/mid_data'
    print("reload the datasets." if reload else "not reload the datasets.")
    print(dataset)

    # Parse the requested split; remember the cached-embedding file name.
    if dataset == 'rsc15':
        train_data, test_data, item2idx = load_data(
            rsc15_train, rsc15_test, class_num=config['class_num'])
        # Count distinct items straight from the raw training file.
        data = pd.read_csv(rsc15_train, sep='\t', dtype={'ItemId': np.int64})
        itemids = data["ItemId"].unique()  # deduplicated item-id sequence
        n_items = len(itemids)
        config["n_items"] = n_items  # NOTE: no "- 1" for the full rsc15 set
        emb_file = mid_rsc15_emb_dict
    elif dataset == 'rsc15_4':
        train_data, test_data, item2idx, n_items = load_data_p(
            rsc15_train, rsc15_test, pro=4)
        config["n_items"] = n_items - 1
        emb_file = mid_rsc15_4_emb_dict
    elif dataset == 'rsc15_64':
        train_data, test_data, item2idx, n_items = load_data_p(
            rsc15_train, rsc15_test, pro=64)
        config["n_items"] = n_items - 1
        emb_file = mid_rsc15_64_emb_dict
    elif dataset == 'cikm16':
        train_data, test_data, item2idx, n_items = load_data2(
            cikm16_train, cikm16_test, class_num=config['class_num'])
        config["n_items"] = n_items - 1
        emb_file = mid_cikm16_emb_dict
    else:
        # The original fell through to a NameError at return; fail clearly.
        raise ValueError('unknown dataset: %r' % (dataset,))

    if reload:
        # Fresh random embedding table, dumped for later runs.
        emb_dict = load_random(item2idx, edim=config['hidden_size'],
                               init_std=config['emb_stddev'])
        config['pre_embedding'] = emb_dict
        dump_file([emb_dict, path + emb_file])
    else:
        # load_file returns a list of the dumped objects; the embedding
        # table is the first (only) entry.
        emb_dict = load_file(path + emb_file)
        config['pre_embedding'] = emb_dict[0]
    print("-----")
    return train_data, test_data