def test_all(data_path, file_prefix, density=1, slic=(0,)):
    # Aggregate reachability statistics over the given data slices.
    # (tuple default avoids the mutable-default-argument pitfall)
    #data_path = 'data/tmall/slices/'
    #file_prefix = 'dataset15'
    #data_path = 'data/clef/slices/'
    #file_prefix = 'ds'
    #data_path = 'data/rsc15/slices/'
    #file_prefix = 'rsc15-clicks'
    #data_path = 'data/nowplaying/slices/'
    #file_prefix = 'nowplaying'
    #data_path = 'data/aotm/slices/'
    #file_prefix = 'playlists-aotm'
    #data_path = 'data/30music/slices/'
    #file_prefix = '30music-200ks'
    #data_path = 'data/retailrocket/slices/'
    #file_prefix = 'events'

    all_stats = defaultdict(int)
    for i in slic:
        train, test = loader.load_data(data_path, file_prefix, rows_train=None,
                                       rows_test=None, density=density, slice_num=i)
        s, i2s = load_sessions(train)
        print(data_path, file_prefix, i)
        stats = test_reachability(s, i2s, test)
        # sum the per-slice counters into the overall statistics
        for k, v in stats.items():
            all_stats[k] += v

    for k, v in all_stats.items():
        print(k, v)
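# Hedged usage sketch (not in the original file): test_all would presumably be
# driven like this for one of the datasets referenced in the comments above;
# the concrete slice indices are an assumption.
if __name__ == '__main__':
    test_all('data/rsc15/slices/', 'rsc15-clicks', density=1, slic=[0, 1, 2, 3, 4])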
def dump_sequence(data_path, file_prefix, out_fn, density=1, slic=0):
    """
    Convert training/testing slices into a sequence format suitable for
    entropy rate estimation
    """
    #data_path = "data/tmall/slices/"
    #file_prefix = "dataset15"
    #data_path = "data/clef/slices/"
    #file_prefix = "ds"
    #data_path = "data/nowplaying/slices/"
    #file_prefix = "nowplaying"
    #data_path = "data/aotm/slices/"
    #file_prefix = "playlists-aotm"
    #data_path = "data/rsc15/slices/"
    #file_prefix = "rsc15-clicks"
    #data_path = "data/30music/slices/"
    #file_prefix = "30music-200ks"
    #data_path = "data/retailrocket/slices/"
    #file_prefix = "events"

    train, test = loader.load_data(data_path, file_prefix, rows_train=None,
                                   rows_test=None, density=density, slice_num=slic)

    # concatenate train and test into one frame
    # (DataFrame.append was removed in pandas 2.0; pd.concat is the replacement)
    all_data = pd.concat([train, test])

    # group by session, then sort each session's events by timestamp
    groupby = all_data.groupby("SessionId")
    with open(out_fn, "w") as f:
        for session_id, session in groupby:
            for item_id in session.sort_values("Time")["ItemId"]:
                f.write("{}\n".format(item_id))
            f.write("-1\n")  # session delimiter
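# Hedged companion sketch (not in the original file): reads the file written by
# dump_sequence back into per-session item lists, assuming one item id per line
# and "-1" as the session delimiter, exactly as emitted above.
def load_sequence(fn):
    sessions, current = [], []
    with open(fn) as f:
        for line in f:
            token = line.strip()
            if token == "-1":  # end-of-session marker written by dump_sequence
                if current:
                    sessions.append(current)
                current = []
            else:
                current.append(token)
    return sessions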
    adpt = ad.Adapter(algo='fism')
    algs['fism'] = adpt
    adpt = ad.Adapter(algo='fossil')
    algs['fossil'] = adpt
    adpt = ad.Adapter(algo='fpmc')
    algs['fpmc'] = adpt
    '''

    #load data
    train, test = loader.load_data(data_path, file_prefix, rows_train=limit_train,
                                   rows_test=limit_test, density=density_value)
    buys = loader.load_buys(data_path, buys_prefix)
    item_ids = train.ItemId.unique()

    #init metrics
    for m in metric:
        m.init(train)

    # result dicts
    res = {}
    res_buys = {}

    #train algorithms
    for k, a in algs.items():
        ts = time.time()
    else:
        return suff + ".w2v"


if __name__ == '__main__':
    # for testing in main
    import sys
    sys.path.append('../../')
    from evaluation import loader as loader

    data_path = '../../data/retailrocket/slices/'
    file_prefix = 'events'
    data_trained = '../../data/retailrocket/prepared2d/'

    train, test = loader.load_data(data_path, file_prefix, slice_num=0,
                                   rows_train=None, rows_test=None, density=1)
    items_to_predict = test['ItemId'].unique()

    # word2vec hyperparameters: embedding size, context window,
    # skip-gram (sg=1), hierarchical softmax (hs=1)
    factors = 100
    window = 5
    sg = 1
    epochs = 10

    model = Item2Vec(factors=factors, window=window, sg=sg, workers=4, hs=1, epochs=epochs)
    model.fit(train)
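    # Hedged continuation sketch (not in the original file): session-rec style
    # algorithms usually expose predict_next(session_id, input_item_id,
    # predict_for_item_ids) returning per-item scores; that Item2Vec follows
    # this exact signature and returns a pandas Series is an assumption here.
    first_event = test.sort_values(['SessionId', 'Time']).iloc[0]
    scores = model.predict_next(first_event['SessionId'], first_event['ItemId'], items_to_predict)
    print(scores.sort_values(ascending=False).head(10))  # top-10 scored items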
def create_w2v_features(train, size=10, pos=False):
    start = time.time()

    train['ItemId'] = train['ItemId'].astype('str')
    print(train['ItemId'].min())  # sanity check on the item id range
    sequences = train.groupby('SessionId')['ItemId'].apply(list)

    print('prepared features in ', (time.time() - start))

    # Learn decomposition ----------------------------------------------------
    print('ITEM2VEC FEATURES')
    start = time.time()

    # note: gensim < 4.0 API (in gensim 4 these became vector_size=, epochs=,
    # model.wv.vectors and model.wv.key_to_index)
    model = gensim.models.Word2Vec(sequences, size=size, window=5, min_count=1,
                                   workers=4, iter=50)

    weights = model.wv.syn0
    np.save(open(FOLDER + 'w2v.' + str(size) + '.wght', 'wb'), weights)

    vocab = {k: v.index for k, v in model.wv.vocab.items()}
    with open(FOLDER + 'w2v.' + str(size) + '.voc', 'w') as f:
        f.write(json.dumps(vocab))


if __name__ == '__main__':
    train, test = ld.load_data(FOLDER, FILE)
    # create_latent_factors( combi, size=32, pos=False )
    create_w2v_features(train, size=64)
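# Hedged companion sketch (not in the original file): loads the weight matrix
# and vocabulary written by create_w2v_features and returns the k most similar
# items by cosine similarity; the file naming mirrors the convention above.
def most_similar(item_id, size=64, k=5):
    weights = np.load(FOLDER + 'w2v.' + str(size) + '.wght')
    with open(FOLDER + 'w2v.' + str(size) + '.voc') as f:
        vocab = json.load(f)                         # item id (str) -> row index
    index_to_item = {v: key for key, v in vocab.items()}
    vec = weights[vocab[str(item_id)]]
    sims = weights @ vec / (np.linalg.norm(weights, axis=1) * np.linalg.norm(vec))
    top = np.argsort(-sims)[1:k + 1]                 # position 0 is the item itself
    return [(index_to_item[i], float(sims[i])) for i in top]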
def main():
    train, test = dl.load_data(FOLDER, PREFIX)
    split_data(train, FOLDER + PREFIX, DAYS_TEST)
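# Hedged sketch (the real split_data lives elsewhere in this repo): it
# presumably holds out the final DAYS_TEST days of events as the test split and
# writes both parts next to the input prefix; the 'Time' column being Unix
# seconds and the output file names are assumptions.
def split_data_sketch(data, output_prefix, days_test):
    boundary = data['Time'].max() - days_test * 86400      # 86400 s per day
    session_end = data.groupby('SessionId')['Time'].max()  # keep sessions intact
    train_ids = session_end[session_end < boundary].index
    test_ids = session_end[session_end >= boundary].index
    data[data['SessionId'].isin(train_ids)].to_csv(output_prefix + '_train.txt', sep='\t', index=False)
    data[data['SessionId'].isin(test_ids)].to_csv(output_prefix + '_test.txt', sep='\t', index=False)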