def cal_comm_mat_UBB(path_str):
    '''
    200k ratings
    calculate the commuting matrix in U-B-*-B style
    in fact, only need to calculate BB

    Reads the user/item ids from ``uids.txt`` / ``bids.txt`` and the positive
    ratings from ``uid_pos_bid.txt`` (all under ``dir_``), builds the
    commuting matrix with ``cal_mat_ubb`` and writes the triplets returned by
    ``get_topK_items(..., topK=500)`` to
    ``sim_res/path_count/<path_str>_top500.res`` through ``save_triplets``.

    NOTE: this name is defined a second time further down in this file; the
    later definition replaces this one when the module is loaded.

    :param path_str: path identifier; forwarded to ``get_bo`` and
        ``cal_mat_ubb`` and used in the output file name
    '''
    print('%s %s' % ('path str:', path_str))

    uid_filename = dir_ + 'uids.txt'  # users
    print('%s %s' % ('run cal_comm_mat_samples for 10k users in ', uid_filename))
    # one integer id per line; ``with`` closes the handle the original leaked
    with open(uid_filename, 'r') as uid_file:
        uids = [int(line.strip()) for line in uid_file]
    uid2ind = {v: k for k, v in enumerate(uids)}
    ind2uid = reverse_map(uid2ind)

    bid_filename = dir_ + 'bids.txt'  # items
    with open(bid_filename, 'r') as bid_file:
        bids = [int(line.strip()) for line in bid_file]
    bid2ind = {v: k for k, v in enumerate(bids)}
    ind2bid = reverse_map(bid2ind)

    upb_filename = dir_ + 'uid_pos_bid.txt'  # positive rating
    upb = np.loadtxt(upb_filename, dtype=int)

    # generate users items adjacency matrix
    adj_ub, adj_ub_t = generate_adj_mat(upb, uid2ind, bid2ind)

    # generate items object adjacency matrix (cat, state, city, star)
    adj_bo, adj_bo_t = get_bo(path_str, bid2ind)

    t1 = time.time()
    # compute u -> b -> o(cat,city) <- b
    comm_res = cal_mat_ubb(path_str, adj_ub, adj_bo, adj_bo_t)
    t2 = time.time()

    # '%.2f' (two decimals) matches the other timing messages in this file;
    # the previous '%2.f' meant "width 2, zero decimals" and dropped the fraction.
    print('cal res of %s cost %.2f seconds' % (path_str, t2 - t1))
    density = comm_res.nnz * 1.0 / comm_res.shape[0] / comm_res.shape[1]
    print('comm_res shape=%s,densit=%s' % (comm_res.shape, density))

    K = 500
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    triplets = get_topK_items(comm_res, ind2uid, ind2bid, topK=K)
    save_triplets(wfilename, triplets)
    t3 = time.time()
    print('save res of %s cost %.2f seconds' % (path_str, t3 - t2))
def cal_comm_mat_sm(path_str):
    '''
    calculate commuting matrix for U-*-U-pos-B style in merge way
    with 7 simple motifs (sm)

    For alpha in 0.0, 0.1, ..., 1.0 the user-user matrix
    ``(1 - alpha) * base_matrix + alpha * motif_matrix`` is multiplied by the
    user-item adjacency matrix, and the triplets returned by
    ``get_topK_items(..., topK=500)`` are saved to
    ``sim_res/path_count/<path_str>_<alpha>_top500.res``.

    ``base_matrix`` is the social adjacency matrix when ``path_str`` starts
    with ``'UUB'`` and ``adj_ub.dot(adj_ub_t)`` when it starts with ``'UBUB'``.

    :param path_str: path identifier; also forwarded to
        ``compute_motif_matrix`` and used in the output file names
    :raises ValueError: if ``path_str`` starts with neither ``'UUB'`` nor
        ``'UBUB'`` (previously this surfaced as a NameError on the unset
        ``base_matrix`` only after all inputs had been loaded)
    '''
    use_social_base = path_str.startswith('UUB')
    use_co_rating_base = path_str.startswith('UBUB')
    if not (use_social_base or use_co_rating_base):
        raise ValueError(
            "unsupported path_str %r: expected a name starting with "
            "'UUB' or 'UBUB'" % (path_str,))

    uid_filename = dir_ + 'uids.txt'
    bid_filename = dir_ + 'bids.txt'
    ub_filename = dir_ + 'uid_bid.txt'
    print('cal commut mat with motif for %s, filenames: %s, %s, %s' % (
        path_str, uid_filename, bid_filename, ub_filename))

    uids, uid2ind, ind2uid = load_eids(uid_filename, 'user')
    bids, bid2ind, ind2bid = load_eids(bid_filename, 'biz')

    ub = np.loadtxt(ub_filename, dtype=np.int64)
    adj_ub, adj_ub_t = generate_adj_mat(ub, uid2ind, bid2ind)

    social_filename = dir_ + 'user_social.txt'
    uu = np.loadtxt(social_filename, dtype=np.int64)
    adj_uu, adj_uu_t = generate_adj_mat(uu, uid2ind, uid2ind)

    motif_matrix = compute_motif_matrix(adj_uu, adj_uu_t, path_str)

    if use_social_base:
        base_matrix = adj_uu
    else:
        base_matrix = adj_ub.dot(adj_ub_t)

    K = 500
    for n in range(11):
        alpha = n * 0.1
        UBU_merge = (1 - alpha) * base_matrix + alpha * motif_matrix

        start = time.time()
        UBUB = UBU_merge.dot(adj_ub)
        print('UBUB(%s), density=%.5f cost %.2f seconds' % (
            UBUB.shape, UBUB.nnz * 1.0 / UBUB.shape[0] / UBUB.shape[1],
            time.time() - start))

        start = time.time()
        # normal way
        triplets = get_topK_items(UBUB, ind2uid, ind2bid, topK=K)
        # '%.1f' yields 0.0 ... 1.0, the same text Python 2's str() produced
        # for these values, and stays stable where str(0.1 * 3) would be
        # '0.30000000000000004'.
        wfilename = dir_ + 'sim_res/path_count/%s_%.1f_top%s.res' % (
            path_str, alpha, K)
        save_triplets(wfilename, triplets)
        print('finish saving %s %s entries in %s, cost %.2f seconds' % (
            len(triplets), path_str, wfilename, time.time() - start))
def batch_save_comm_res(path_str, wfilename, comm_res, ind2row, ind2col):
    '''
    Append the non-zero entries of ``comm_res`` to ``wfilename`` in chunks of
    10M entries, translating matrix indices back to original ids.

    :param path_str: path identifier, used only in the progress message
    :param wfilename: output file, handed to ``save_triplets`` with
        ``is_append=True`` on every chunk
    :param comm_res: sparse matrix providing ``tocoo``
    :param ind2row: mapping from row index to the id written in column 1
    :param ind2col: mapping from column index to the id written in column 2
    '''
    # NOTE(review): every chunk is written with is_append=True, so an existing
    # file is presumably extended rather than replaced -- confirm callers
    # start from a fresh file.
    coo = comm_res.tocoo(copy=False)
    step = 10000000
    total = len(coo.data)

    # Stepping the range directly avoids the old ``len / step`` arithmetic,
    # which ran one extra, empty chunk whenever ``total`` was an exact multiple
    # of ``step``. ``max(total, 1)`` keeps the single (empty) write that was
    # performed for a matrix without non-zero entries.
    for start in range(0, max(total, 1), step):
        start_time = time.time()
        end = start + step
        triplets = [
            (ind2row[r], ind2col[c], v)
            for r, c, v in zip(coo.row[start:end],
                               coo.col[start:end],
                               coo.data[start:end])
        ]
        save_triplets(wfilename, triplets, is_append=True)
        # clamp so the reported progress never exceeds the number of entries
        done = min(end, total)
        print('finish saving 10M %s triplets in %s, progress: %s/%s, cost %.2f seconds' % (
            path_str, wfilename, done, total, time.time() - start_time))
def cal_comm_mat_UBB(path_str):
    '''
    200k ratings
    calculate the commuting matrix in U-B-*-B style
    in fact, only need to calculate BB

    Loads the user ids (``uids.txt``), item ids (``bids.txt``) and positive
    ratings (``uid_pos_bid.txt``) found under ``dir_``, computes the
    commuting matrix through ``cal_mat_ubb`` and stores the triplets produced
    by ``get_topK_items(..., topK=500)`` in
    ``sim_res/path_count/<path_str>_top500.res`` using ``save_triplets``.

    NOTE: an earlier definition with the same name exists in this file; this
    later one is the definition that remains bound after the module loads.

    :param path_str: path identifier; forwarded to ``get_bo`` and
        ``cal_mat_ubb`` and used in the output file name
    '''
    uid_filename = dir_ + 'uids.txt'
    print('%s %s' % ('run cal_comm_mat_samples for 10k users in ', uid_filename))
    # one integer id per line; the context manager closes the file, which the
    # bare ``open(...).readlines()`` did not
    with open(uid_filename, 'r') as id_file:
        uids = [int(row.strip()) for row in id_file]
    uid2ind = {v: k for k, v in enumerate(uids)}
    ind2uid = reverse_map(uid2ind)

    bid_filename = dir_ + 'bids.txt'
    with open(bid_filename, 'r') as id_file:
        bids = [int(row.strip()) for row in id_file]
    bid2ind = {v: k for k, v in enumerate(bids)}
    ind2bid = reverse_map(bid2ind)

    upb_filename = dir_ + 'uid_pos_bid.txt'
    upb = np.loadtxt(upb_filename, dtype=int)

    adj_ub, adj_ub_t = generate_adj_mat(upb, uid2ind, bid2ind)
    adj_bo, adj_bo_t = get_bo(path_str, bid2ind)

    t1 = time.time()
    comm_res = cal_mat_ubb(path_str, adj_ub, adj_bo, adj_bo_t)
    t2 = time.time()

    # '%.2f' prints two decimals like the other timing messages in this file;
    # the former '%2.f' (width 2, precision 0) discarded the fractional part.
    print('cal res of %s cost %.2f seconds' % (path_str, t2 - t1))
    density = comm_res.nnz * 1.0 / comm_res.shape[0] / comm_res.shape[1]
    print('comm_res shape=%s,densit=%s' % (comm_res.shape, density))

    K = 500
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    triplets = get_topK_items(comm_res, ind2uid, ind2bid, topK=K)
    save_triplets(wfilename, triplets)
    t3 = time.time()
    print('save res of %s cost %.2f seconds' % (path_str, t3 - t2))
def save_comm_res(path_str, filename, comm_res, ind2row, ind2col):
    '''
    Write every stored entry of ``comm_res`` to ``filename`` as
    ``(row id, column id, value)`` triplets via ``save_triplets``.

    :param path_str: not used by this function
    :param filename: destination passed to ``save_triplets``
    :param comm_res: sparse matrix providing ``tocoo``
    :param ind2row: mapping from row index to the id to write
    :param ind2col: mapping from column index to the id to write
    '''
    entries = comm_res.tocoo()
    labelled = [
        (ind2row[row_idx], ind2col[col_idx], weight)
        for row_idx, col_idx, weight in zip(entries.row, entries.col, entries.data)
    ]
    save_triplets(filename, labelled)
def cal_comm_mat_UUB(path_str, cikm=False):
    '''
    calculate commuting matrix for U-*-U-pos-B style

    A user-user matrix (named ``UBU`` for every path, for convenience) is
    built according to ``path_str``, multiplied by the positive user-item
    adjacency matrix, and the triplets returned by
    ``get_topK_items(..., topK=500)`` are saved to
    ``sim_res/path_count/<path_str>_top500.res``.

    Supported ``path_str`` values:
      - ``UPBUB``: ``adj_upb . adj_upb_t``
      - ``UPBCatBUB`` / ``UPBCityBUB``: positive ratings through ``get_bo``
      - ``UNBUB``: ``adj_unb . adj_unb_t`` (``uid_neg_bid.txt``)
      - ``UNBCatBUB`` / ``UNBCityBUB``: negative ratings through ``get_bo``
      - ``UUB``: copy of the social adjacency matrix (``user_social.txt``)
      - ``UCompUB``: ``uid_comp.txt``
      - ``URPARUB`` / ``URNARUB``: user-review-aspect, from
        ``uid_rid_pos_aid.txt`` / ``uid_rid_neg_aid.txt``

    :param path_str: one of the path names listed above
    :param cikm: when true, ``rids.txt`` / ``aids.txt`` are not loaded, so
        the review based paths are unavailable
    :raises ValueError: for an unsupported ``path_str``, or a review based
        path combined with ``cikm=True`` (both previously failed with a
        NameError after the input files had already been loaded)
    '''
    print('%s %s' % ('path str:', path_str))

    review_paths = ('URPARUB', 'URNARUB')
    supported_paths = (
        'UPBUB', 'UPBCatBUB', 'UPBCityBUB',
        'UNBUB', 'UNBCatBUB', 'UNBCityBUB',
        'UUB', 'UCompUB',
    ) + review_paths
    if path_str not in supported_paths:
        raise ValueError('unsupported path_str %r, expected one of %s' % (
            path_str, ', '.join(supported_paths)))
    if cikm and path_str in review_paths:
        raise ValueError(
            'path_str %r needs review/aspect ids, which are not loaded '
            'when cikm=True' % (path_str,))

    uid_filename = dir_ + 'uids.txt'
    bid_filename = dir_ + 'bids.txt'
    upb_filename = dir_ + 'uid_pos_bid.txt'
    if not cikm:
        rid_filename = dir_ + 'rids.txt'
        aid_filename = dir_ + 'aids.txt'

    print('cal commut mat for %s, filenames: %s, %s, %s' % (
        path_str, uid_filename, bid_filename, upb_filename))
    uids, uid2ind, ind2uid = load_eids(uid_filename, 'user')
    bids, bid2ind, ind2bid = load_eids(bid_filename, 'biz')
    if not cikm:
        rids, rid2ind, ind2rid = load_eids(rid_filename, 'review')
        aids, aid2ind, ind2aid = load_eids(aid_filename, 'aspect')

    upb = np.loadtxt(upb_filename, dtype=np.int64)
    adj_upb, adj_upb_t = generate_adj_mat(upb, uid2ind, bid2ind)

    # In every branch ``start`` is taken after the branch's input files are
    # loaded, so the reported cost covers the same work as before.
    if path_str == 'UPBUB':
        start = time.time()
        UBU = adj_upb.dot(adj_upb_t)

    elif path_str in ('UPBCatBUB', 'UPBCityBUB'):
        start = time.time()
        adj_bo, adj_bo_t = get_bo(path_str, bid2ind)
        UBO = adj_upb.dot(adj_bo)
        UBU = UBO.dot(UBO.transpose())

    elif path_str in ('UNBUB', 'UNBCatBUB', 'UNBCityBUB'):
        unb_filename = dir_ + 'uid_neg_bid.txt'
        unb = np.loadtxt(unb_filename, dtype=np.int64)
        adj_unb, adj_unb_t = generate_adj_mat(unb, uid2ind, bid2ind)
        start = time.time()
        if path_str == 'UNBUB':
            UBU = adj_unb.dot(adj_unb_t)
        else:
            adj_bo, adj_bo_t = get_bo(path_str, bid2ind)
            UBO = adj_unb.dot(adj_bo)
            UBU = UBO.dot(UBO.transpose())

    elif path_str == 'UUB':
        social_filename = dir_ + 'user_social.txt'
        uu = np.loadtxt(social_filename, dtype=np.int64)
        adj_uu, adj_uu_t = generate_adj_mat(uu, uid2ind, uid2ind)
        start = time.time()
        UBU = adj_uu.copy()

    elif path_str == 'UCompUB':
        uid_comp_filename = dir_ + 'uid_comp.txt'
        uc = np.loadtxt(uid_comp_filename, dtype=np.int64)
        # indices for the distinct values in the second column
        cids = set(uc[:, 1])
        cid2ind = {v: k for k, v in enumerate(cids)}
        adj_uc, adj_uc_t = generate_adj_mat(uc, uid2ind, cid2ind)
        start = time.time()
        UBU = adj_uc.dot(adj_uc_t)

    else:  # 'URPARUB' / 'URNARUB', guaranteed by the validation above
        if path_str == 'URPARUB':
            urpa_filename = dir_ + 'uid_rid_pos_aid.txt'
        else:
            urpa_filename = dir_ + 'uid_rid_neg_aid.txt'
        # NOTE(review): loaded without dtype (float64), unlike the int64
        # loads above -- confirm ids survive the float round trip.
        urpa = np.loadtxt(urpa_filename)
        # u, r multiple aspects, thus u-r can be duplicate
        ur = list(set([(u, r) for u, r in urpa[:, (0, 1)]]))
        adj_ur, adj_ur_t = generate_adj_mat(ur, uid2ind, rid2ind)
        ra = urpa[:, (1, 2)]
        adj_ra, adj_ra_t = generate_adj_mat(ra, rid2ind, aid2ind)
        start = time.time()
        URA = adj_ur.dot(adj_ra)
        # it should be URARU, here we use UBU for convenience
        UBU = URA.dot(URA.transpose())

    print('UBU(%s), density=%.5f cost %.2f seconds' % (
        UBU.shape, UBU.nnz * 1.0 / UBU.shape[0] / UBU.shape[1],
        time.time() - start))

    start = time.time()
    UBUB = UBU.dot(adj_upb)
    print('UBUB(%s), density=%.5f cost %.2f seconds' % (
        UBUB.shape, UBUB.nnz * 1.0 / UBUB.shape[0] / UBUB.shape[1],
        time.time() - start))

    start = time.time()
    K = 500
    triplets = get_topK_items(UBUB, ind2uid, ind2bid, topK=K)
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    save_triplets(wfilename, triplets)
    print('finish saving %s %s entries in %s, cost %.2f seconds' % (
        len(triplets), path_str, wfilename, time.time() - start))