def process_meta(file):
    """Extract an ``asin<TAB>category`` table from a metadata file.

    Every line of ``datafilename(category, file)`` is evaluated as a Python
    expression that must yield a mapping with ``"asin"`` and ``"categories"``
    keys.  For each record the last entry of its first ``"categories"`` list
    is written next to the ``"asin"`` value into
    ``datafilename(category, "item-info")``.

    Args:
        file: name handed to ``datafilename`` to locate the input file.
    """
    # The context manager closes (and therefore flushes) both handles, even
    # when a record fails to parse half-way through the file.
    with open(datafilename(category, file), "r") as fi, \
            open(datafilename(category, "item-info"), "w") as fo:
        for line in fi:
            # SECURITY: eval() executes whatever the line contains.  Only run
            # this on trusted dumps; if the records are plain literals,
            # ast.literal_eval would be a safer parser.
            obj = eval(line)
            leaf_category = obj["categories"][0][-1]
            fo.write(obj["asin"] + "\t" + leaf_category + "\n")
def process_reviews(file):
    """Flatten a review dump into a tab-separated ``reviews-info`` file.

    Every line of ``datafilename(category, file)`` is evaluated as a Python
    expression that must yield a mapping.  For each record one line
    ``reviewerID<TAB>asin<TAB>overall<TAB>unixReviewTime`` is written to
    ``datafilename(category, "reviews-info")``.

    Args:
        file: name handed to ``datafilename`` to locate the input file.
    """
    # Both handles are closed on exit so the output is fully flushed before
    # any later step reads it back.
    with open(datafilename(category, file), "r") as fi, \
            open(datafilename(category, "reviews-info"), "w") as fo:
        for line in fi:
            # SECURITY: eval() executes whatever the line contains.  Only run
            # this on trusted dumps; if the records are plain literals,
            # ast.literal_eval would be a safer parser.
            obj = eval(line)
            fields = (
                obj["reviewerID"],
                obj["asin"],
                str(obj["overall"]),
                str(obj["unixReviewTime"]),
            )
            fo.write("\t".join(fields) + "\n")
def split_test_by_time(cut_time):
    """Split ``local_all_sample_by_time`` into train and test files by time.

    The last tab-separated field of every line is parsed as a float
    timestamp.  Lines whose timestamp is ``<= cut_time`` go to
    ``local_train_by_time``; all other lines go to ``local_test_by_time``.

    Args:
        cut_time: numeric threshold compared against each line's timestamp.
    """
    with open(datafilename(category, "local_all_sample_by_time"), "r") as fi, \
            open(datafilename(category, "local_train_by_time"), "w") as ftrain, \
            open(datafilename(category, "local_test_by_time"), "w") as ftest:
        for line in fi:
            line = line.strip()
            timestamp = float(line.split("\t")[-1])
            target = ftrain if timestamp <= cut_time else ftest
            # The last two characters of the stripped line are dropped, exactly
            # as before.
            # NOTE(review): this cuts into the trailing timestamp field instead
            # of removing it cleanly -- confirm that is intended.
            target.write(line[:-2] + "\n")
def get_cut_timestamp(train_percent=0.85):
    """Return the timestamp that separates the train and test portions.

    Timestamps are read from the last tab-separated field of every line of
    ``local_all_sample_by_time``.  The value returned is the one found at
    position ``int(file_len(path) * train_percent)`` of the ascending
    timestamp order.

    Args:
        train_percent: fraction of the samples that should fall on the
            training side of the cut.

    Returns:
        The selected timestamp as a float.

    Raises:
        IndexError: if the computed position is not smaller than the number
            of lines read (for example ``train_percent=1.0`` or an empty
            file).
    """
    path = datafilename(category, "local_all_sample_by_time")
    samples_count = file_len(path)
    train_size = int(samples_count * train_percent)

    # Read inside a context manager so the handle is released right away.
    with open(path, "r") as fi:
        time_list = [float(line.strip().split("\t")[-1]) for line in fi]

    order = np.argsort(time_list, axis=-1)
    return time_list[order[train_size]]
def get_all_samples():
    """Attach each user's click history to every joined sample line.

    Reads ``jointed-new-by-time``; consecutive lines with the same user id
    (field 1) are treated as one user's stream.  For every line after the
    first of a stream, if the user already has at least one clicked item, a
    line is written to ``local_all_sample_by_time`` with these tab-separated
    fields::

        clk, user, item, category, history items, history categories,
        field 6, field 7, history day-gap buckets, timestamp

    A line whose ``clk`` field (field 0) is non-zero is appended to the
    history after it has been processed.
    """
    # One-character separator used inside list-valued fields.  The literal
    # showed up empty in this file, but the old ``[:-1]`` trimming and the
    # ``split(...)`` in split_test_by_seqlen only work with a single-character
    # separator, so it is spelled out as an explicit escape here.
    # NOTE(review): "\x02" is assumed -- confirm against the downstream reader.
    sep = "\x02"
    gap = np.array(
        [1.1, 1.4, 1.7, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096])
    last_user = "******"

    with open(datafilename(category, "jointed-new-by-time"), "r") as fin, \
            open(datafilename(category, "local_all_sample_by_time"), "w") as fall:
        for line in fin:
            items = line.strip().split("\t")
            clk = int(items[0])
            user = items[1]
            movie_id = items[2]
            dt = items[4]
            cat1 = items[5]
            user_list = items[6]
            user_t_list = items[7]

            if user != last_user:
                # A new user starts: reset the running click history.
                movie_id_list = []
                cate1_list = []
                movie_id_t_list = []
            elif movie_id_list:
                # Only samples with at least one earlier click are emitted.
                now_days = float(dt) / 3600.0 / 24.0
                dt_gap = []
                for t in movie_id_t_list:
                    # Age of the earlier click in days (+1), mapped to the
                    # number of `gap` thresholds it reaches.
                    age = now_days - float(t) / 3600.0 / 24.0 + 1.
                    dt_gap.append(str(np.sum(age >= gap)))
                fields = (
                    items[0],
                    user,
                    movie_id,
                    cat1,
                    sep.join(movie_id_list),
                    sep.join(cate1_list),
                    user_list,
                    user_t_list,
                    # Bucket ids can have two digits, so they need the
                    # separator as well to stay unambiguous.
                    sep.join(dt_gap),
                    dt,
                )
                fall.write("\t".join(fields) + "\n")

            last_user = user
            if clk:
                movie_id_list.append(movie_id)
                cate1_list.append(cat1)
                movie_id_t_list.append(dt)
def split_test_by_seqlen():
    """Bucket the test samples by the length of their item history.

    Field 4 of every line of ``local_test_by_time`` is split on the list
    separator.  Lines with fewer than 5 history entries go to
    ``local_test_u1``, lines with fewer than 15 go to ``local_test_u2`` and
    the rest go to ``local_test_u3``.
    """
    # One-character list separator.  The literal showed up empty in this file,
    # and ``str.split("")`` raises ValueError, so it is spelled out as an
    # explicit escape.
    # NOTE(review): "\x02" is assumed -- confirm it matches the writer side.
    sep = "\x02"

    with open(datafilename(category, "local_test_by_time"), "r") as fi, \
            open(datafilename(category, "local_test_u1"), "w") as ftest_u1, \
            open(datafilename(category, "local_test_u2"), "w") as ftest_u2, \
            open(datafilename(category, "local_test_u3"), "w") as ftest_u3:
        for line in fi:
            line = line.strip()
            item_seq = line.split("\t")[4]
            sl = len(item_seq.split(sep))
            if sl < 5:
                target = ftest_u1
            elif sl < 15:
                target = ftest_u2
            else:
                target = ftest_u3
            target.write(line + "\n")
# Script fragment: loads the train and test sample files and starts walking
# over every sample while setting up three (initially empty) lookup dicts.
import cPickle
import random
import numpy as np
category = 'Amazon_Clothing_Shoes_and_Jewelry'
from path import datafilename
# Fixed seeds for both random number generators.
np.random.seed(1234)
random.seed(1234)
# Both sample files are read fully into memory as lists of lines.
f_train = open(datafilename(category, "local_train_by_time"), "r").readlines()
f_test = open(datafilename(category, "local_test_by_time"), "r").readlines()
f_all = f_train + f_test
uid_dict = {}
mid_dict = {}
cat_dict = {}
iddd = 0
for line in f_all:
    # Tab-separated sample: the first six fields are picked out by position.
    arr = line.strip("\n").split("\t")
    clk = arr[0]
    uid = arr[1]
    mid = arr[2]
    cat = arr[3]
    mid_list = arr[4]
    cat_list = arr[5]
    # NOTE(review): the body of this condition is not part of the visible
    # source.
    if uid not in uid_dict:
def _clicker_history(clickers, user_id, review_time, cur_day, buckets, sep):
    """Build the clicker-id and day-gap-bucket strings for one item.

    Args:
        clickers: ``(user_id, unix_time)`` pairs sorted ascending by time.
        user_id: user of the current review; skipped in the history.
        review_time: time of the current review (string or number accepted by
            ``int``); iteration stops at the first later click.
        cur_day: day index of the current review.
        buckets: array of thresholds a day gap is compared against.
        sep: separator placed between the entries of each string.

    Returns:
        ``(user_str, user_t_str)``; ``("default_user", "-1")`` when no other
        user is found.
    """
    users = []
    day_buckets = []
    cutoff = int(review_time)
    for u, t in clickers:
        if int(t) > cutoff:
            break
        if u == user_id:
            continue
        users.append(u)
        user_t = float(cur_day) - t // 3600 // 24 + 1.
        day_buckets.append(str(np.sum(user_t >= buckets)))
    user_str = sep.join(users)
    if not user_str:
        return "default_user", "-1"
    return user_str, sep.join(day_buckets)


def manual_join():
    """Join reviews with item categories and draw one negative per review.

    Reads ``reviews-info`` (``user<TAB>item<TAB>rating<TAB>time``) and
    ``item-info`` (``item<TAB>category``).  For every review, in per-user
    ascending time order, two lines are written to ``jointed-new-by-time``:

    * label ``0``: the review with its item replaced by a randomly drawn
      different item, and
    * label ``1``: the review itself.

    Each line carries, after the four review fields, the item category
    (``default_cat`` if unknown), the other users who clicked that item up to
    the review time, their day-gap buckets, and the review time again.
    """
    # One-character list separator.  The literal showed up empty in this file,
    # but the old ``[:-1]`` trimming only works with a single-character
    # separator, so it is spelled out as an explicit escape.
    # NOTE(review): "\x02" is assumed -- confirm against the downstream reader.
    sep = "\x02"
    # NOTE(review): `gap` is not defined inside this function; it is looked up
    # at module scope.  Confirm that a module-level `gap` array exists.
    buckets = gap

    user_map = {}  # user id -> [(review line, time)]
    item_list = []  # item id of every review; sampled for negatives
    useridToClickItem = {}  # item id -> [(user id, time)]
    with open(datafilename(category, "reviews-info"), "r") as f_rev:
        for line in f_rev:
            items = line.strip().split("\t")
            user_map.setdefault(items[0], []).append(
                ("\t".join(items), float(items[-1])))
            item_list.append(items[1])
            # Keyed from the unstripped line, as the former second pass did.
            data = line.split("\t")
            useridToClickItem.setdefault(data[1], []).append(
                (data[0], float(data[-1])))

    # Sort every clicker list once instead of once per review; list.sort is
    # stable, so the order matches what sorted() produced before.
    for clickers in useridToClickItem.values():
        clickers.sort(key=lambda x: x[1])

    meta_map = {}  # item id -> category; the first occurrence wins
    with open(datafilename(category, "item-info"), "r") as f_meta:
        for line in f_meta:
            arr = line.strip().split("\t")
            if arr[0] not in meta_map:
                meta_map[arr[0]] = arr[1]

    with open(datafilename(category, "jointed-new-by-time"), "w") as fo:
        for key in user_map:
            sorted_user_bh = sorted(user_map[key], key=lambda x: x[1])
            for line, _ in sorted_user_bh:
                items = line.split("\t")
                asin = items[1]
                cur_t = float(items[3]) // 3600 // 24

                # Negative sample: redraw until the item differs from the
                # reviewed one.  This never terminates if every review is
                # about the same item.
                while True:
                    asin_neg = item_list[random.randint(0, len(item_list) - 1)]
                    if asin_neg != asin:
                        break
                items[1] = asin_neg
                user_str, user_t_str = _clicker_history(
                    useridToClickItem[asin_neg], items[0], items[-1], cur_t,
                    buckets, sep)
                fo.write(
                    "0" + "\t" + "\t".join(items) + "\t"
                    + meta_map.get(asin_neg, "default_cat") + "\t"
                    + user_str + "\t" + user_t_str + "\t" + items[3] + "\n")

                # Positive sample: the review line itself.
                user_str, user_t_str = _clicker_history(
                    useridToClickItem[asin], items[0], items[-1], cur_t,
                    buckets, sep)
                fo.write(
                    "1" + "\t" + line + "\t"
                    + meta_map.get(asin, "default_cat") + "\t"
                    + user_str + "\t" + user_t_str + "\t" + items[3] + "\n")