Exemplo n.º 1
0
def cal_comm_mat_sm(path_str):
    '''
        Calculate commuting matrices for U-*-U-pos-B style paths in merge way
        with 7 simple motifs (sm).

        For alpha = 0.0, 0.1, ..., 1.0 the base user-user matrix is blended
        with the motif matrix as (1 - alpha) * base + alpha * motif, the
        result is multiplied by the user-business adjacency matrix, and the
        triplets returned by get_topK_items(..., topK=500) are written with
        save_triplets to
        dir_ + 'sim_res/path_count/<path_str>_<alpha>_top500.res'.

        path_str: path name; it is also forwarded to compute_motif_matrix.
            A 'UUB' prefix uses the social user-user adjacency matrix as the
            base matrix, a 'UBUB' prefix uses adj_ub * adj_ub^T.

        Raises ValueError when path_str has neither prefix.
    '''
    # Validate before any I/O: an unsupported path_str used to surface only as
    # a NameError on base_matrix after every input file had been loaded.
    if path_str[:3] == 'UUB':
        social_base = True
    elif path_str[:4] == 'UBUB':
        social_base = False
    else:
        raise ValueError(
            "unsupported path_str %r: expected a 'UUB' or 'UBUB' prefix"
            % (path_str,))

    uid_filename = dir_ + 'uids.txt'
    bid_filename = dir_ + 'bids.txt'
    ub_filename = dir_ + 'uid_bid.txt'

    print('cal commut mat with motif for %s, filenames: %s, %s, %s' % (
        path_str, uid_filename, bid_filename, ub_filename))
    _, uid2ind, ind2uid = load_eids(uid_filename, 'user')
    _, bid2ind, ind2bid = load_eids(bid_filename, 'biz')

    ub = np.loadtxt(ub_filename, dtype=np.int64)
    adj_ub, adj_ub_t = generate_adj_mat(ub, uid2ind, bid2ind)

    social_filename = dir_ + 'user_social.txt'
    uu = np.loadtxt(social_filename, dtype=np.int64)
    adj_uu, adj_uu_t = generate_adj_mat(uu, uid2ind, uid2ind)

    motif_matrix = compute_motif_matrix(adj_uu, adj_uu_t, path_str)

    if social_base:
        base_matrix = adj_uu
    else:
        base_matrix = adj_ub.dot(adj_ub_t)

    K = 500  # loop-invariant, so set once
    for n in range(11):
        # NOTE(review): n * 0.1 carries float rounding (e.g. 3 * 0.1); the
        # '%s' rendering of alpha in the filename below is kept unchanged so
        # existing result files keep their names.
        alpha = n * 0.1
        UBU_merge = (1 - alpha) * base_matrix + alpha * motif_matrix

        start = time.time()
        UBUB = UBU_merge.dot(adj_ub)
        print('UBUB(%s), density=%.5f cost %.2f seconds' % (
            UBUB.shape, UBUB.nnz * 1.0 / UBUB.shape[0] / UBUB.shape[1],
            time.time() - start))

        start = time.time()
        triplets = get_topK_items(UBUB, ind2uid, ind2bid, topK=K)
        wfilename = dir_ + 'sim_res/path_count/%s_%s_top%s.res' % (
            path_str, alpha, K)
        save_triplets(wfilename, triplets)
        print('finish saving %s %s entries in %s, cost %.2f seconds' % (
            len(triplets), path_str, wfilename, time.time() - start))
Exemplo n.º 2
0
def cal_comm_mat_UBB(path_str):
    '''
        200k ratings
        calculate the commuting matrix in U-B-*-B style
        in fact, only need to calculate BB

        Reads user ids, business ids and positive user-business ratings from
        files under dir_, builds the adjacency matrices, computes the
        commuting matrix with cal_mat_ubb and saves the triplets returned by
        get_topK_items(..., topK=500) to
        dir_ + 'sim_res/path_count/<path_str>_top500.res'.

        path_str: path name; forwarded to get_bo and cal_mat_ubb and used in
            the output filename.
    '''

    def _load_id_maps(filename):
        # One integer id per line. The `with` block closes the handle, which
        # the previous open(...).readlines() never did.
        with open(filename, 'r') as f:
            ids = [int(l.strip()) for l in f]
        id2ind = {v: k for k, v in enumerate(ids)}
        return id2ind, reverse_map(id2ind)

    print('path str: %s' % (path_str,))

    uid_filename = dir_ + 'uids.txt'  # users
    print('run cal_comm_mat_samples for 10k users in  %s' % uid_filename)
    uid2ind, ind2uid = _load_id_maps(uid_filename)

    bid_filename = dir_ + 'bids.txt'  # items
    bid2ind, ind2bid = _load_id_maps(bid_filename)

    upb_filename = dir_ + 'uid_pos_bid.txt'  # positive rating
    upb = np.loadtxt(upb_filename, dtype=int)

    # generate users items adjacency matrix
    adj_ub, _ = generate_adj_mat(upb, uid2ind, bid2ind)

    # generate items object adjacency matrix (cat, state, city, star)
    adj_bo, adj_bo_t = get_bo(path_str, bid2ind)

    t1 = time.time()
    # compute u-> b -> o(cat,city) <- b
    comm_res = cal_mat_ubb(path_str, adj_ub, adj_bo, adj_bo_t)

    t2 = time.time()
    # '%.2f' (two decimals) replaces the original '%2.f', which printed the
    # elapsed time with zero decimals.
    print('cal res of %s cost %.2f seconds' % (path_str, t2 - t1))
    print('comm_res shape=%s,densit=%s' % (
        comm_res.shape,
        comm_res.nnz * 1.0 / comm_res.shape[0] / comm_res.shape[1]))
    K = 500
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    triplets = get_topK_items(comm_res, ind2uid, ind2bid, topK=K)

    save_triplets(wfilename, triplets)
    t3 = time.time()
    print('save res of %s cost %.2f seconds' % (path_str, t3 - t2))
Exemplo n.º 3
0
def get_bo(path_str, bid2ind):
    '''
        Build the business-object adjacency matrices for the object type
        named in path_str (e.g. U-pos-B-Cat-B).

        The object file is chosen by the first of 'State', 'Cat', 'City',
        'Star' that occurs in path_str. Each line of that file holds two
        whitespace-separated integers: a business id and an object id.

        path_str: path name containing one of the object type tags above.
        bid2ind: mapping from business id to matrix index, passed through to
            generate_adj_mat.

        Returns the (adj_bo, adj_bo_t) pair produced by generate_adj_mat.
        Raises ValueError when path_str contains none of the tags.
    '''
    # Ordered so that the first matching tag wins, exactly like the original
    # if/elif chain.
    object_files = (
        ('State', 'bid_state.txt'),
        ('Cat', 'bid_cat.txt'),
        ('City', 'bid_city.txt'),
        ('Star', 'bid_stars.txt'),
    )
    for tag, basename in object_files:
        if tag in path_str:
            break
    else:
        # Previously this fell through and crashed later with an unbound
        # sfilename.
        raise ValueError(
            "unsupported path_str %r: expected one of 'State', 'Cat', "
            "'City', 'Star' in it" % (path_str,))
    sfilename = dir_ + basename

    # `with` closes the file; the old open(...).readlines() leaked the handle.
    with open(sfilename, 'r') as f:
        bos = [(int(b), int(o)) for b, o in (l.strip().split() for l in f)]

    # Index the distinct object ids in set-iteration order, as before.
    ond2ind = {v: k for k, v in enumerate(set([o for _, o in bos]))}
    adj_bo, adj_bo_t = generate_adj_mat(bos, bid2ind, ond2ind)
    return adj_bo, adj_bo_t
Exemplo n.º 4
0
def cal_rar(path_str):
    '''
        Compute the review-aspect-review matrix block by block and pickle it.

        Loads aspect ids and review ids (one integer per line) and the
        weighted user-review-aspect file selected by path_str, builds the
        weighted review-aspect adjacency matrix, converts it to a dense
        array and passes it to cal_rar_block. The result and the
        index-to-review-id map are pickled to
        dir_ + 'sim_res/path_count/<path_str>_spa_mat.pickle' and
        dir_ + 'sim_res/path_count/<path_str>_spa_mat_id_map.pickle'.

        path_str: path name; 'P' in it selects the positive file, otherwise
            'N' selects the negative file.

        Raises ValueError when path_str contains neither 'P' nor 'N'.
        Errors while saving are printed and not re-raised (best effort, as
        before).
    '''
    # Validate first: this used to fail only with an unbound ura_filename
    # after the id files had already been read.
    if 'P' in path_str:
        ura_basename = 'uid_rid_pos_aid_weight.txt'
    elif 'N' in path_str:
        ura_basename = 'uid_rid_neg_aid_weight.txt'
    else:
        raise ValueError(
            "unsupported path_str %r: expected 'P' or 'N' in it" % (path_str,))

    def _load_id2ind(filename):
        # One integer id per line, mapped to its line position (global ind).
        # `with` closes the handle that open(...).readlines() used to leak.
        with open(filename, 'r') as f:
            ids = [int(l.strip()) for l in f]
        return {eid: ind for ind, eid in enumerate(ids)}

    aid2ind = _load_id2ind(dir_ + 'aids.txt')
    rid2ind = _load_id2ind(dir_ + 'rids.txt')
    ind2rid = reverse_map(rid2ind)

    ura = np.loadtxt(dir_ + ura_basename, dtype=np.float64)
    # Columns 1, 2, 3 are read as (review id, aspect id, weight).
    ra = [(int(r), int(a), w) for r, a, w in ura[:, (1, 2, 3)]]
    adj_ra, _ = generate_adj_mat(ra, rid2ind, aid2ind, is_weight=True)

    t1 = time.time()
    RA = adj_ra.toarray()
    t2 = time.time()
    print('to dense RA%s cost %.2f seconds' % (RA.shape, t2 - t1))
    RAR_csr = cal_rar_block(RA, len(rid2ind), ind2rid, step=20000)
    print('finish cal rar by blocks, cost %.2f minutes' % (
        (time.time() - t2) / 60.0))

    wfilename = dir_ + 'sim_res/path_count/%s_spa_mat.pickle' % path_str
    map_filename = dir_ + 'sim_res/path_count/%s_spa_mat_id_map.pickle' % path_str
    try:
        # HIGHEST_PROTOCOL is a binary format, so the files must be opened in
        # binary mode; `with` also closes both handles (the first one used to
        # be leaked when fw was rebound).
        with open(wfilename, 'wb') as fw:
            pickle.dump(RAR_csr, fw, pickle.HIGHEST_PROTOCOL)
        with open(map_filename, 'wb') as fw:
            pickle.dump(ind2rid, fw, pickle.HIGHEST_PROTOCOL)
        print('finish saving sparse mat in  %s' % wfilename)
    except Exception as e:
        # Deliberately best effort: report the failure and return normally.
        print(e)
Exemplo n.º 5
0
def cal_comm_mat_UBB(path_str):
    '''
        200k ratings
        calculate the commuting matrix in U-B-*-B style
        in fact, only need to calculate BB

        Reads user ids, business ids and positive user-business ratings from
        files under dir_, builds the adjacency matrices, computes the
        commuting matrix with cal_mat_ubb and saves the triplets returned by
        get_topK_items(..., topK=500) to
        dir_ + 'sim_res/path_count/<path_str>_top500.res'.

        path_str: path name; forwarded to get_bo and cal_mat_ubb and used in
            the output filename.
    '''

    def _read_index(filename):
        # One integer id per line -> (id2ind, ind2id). Using `with` closes
        # the file, which open(...).readlines() left open.
        with open(filename, 'r') as f:
            ids = [int(l.strip()) for l in f]
        forward = {v: k for k, v in enumerate(ids)}
        return forward, reverse_map(forward)

    uid_filename = dir_ + 'uids.txt'
    print('run cal_comm_mat_samples for 10k users in  %s' % uid_filename)
    uid2ind, ind2uid = _read_index(uid_filename)

    bid_filename = dir_ + 'bids.txt'
    bid2ind, ind2bid = _read_index(bid_filename)

    upb_filename = dir_ + 'uid_pos_bid.txt'
    upb = np.loadtxt(upb_filename, dtype=int)
    adj_ub, _ = generate_adj_mat(upb, uid2ind, bid2ind)

    adj_bo, adj_bo_t = get_bo(path_str, bid2ind)

    t1 = time.time()
    comm_res = cal_mat_ubb(path_str, adj_ub, adj_bo, adj_bo_t)

    t2 = time.time()
    # '%.2f' replaces the original '%2.f', which dropped the decimals of the
    # elapsed time.
    print('cal res of %s cost %.2f seconds' % (path_str, t2 - t1))
    density = comm_res.nnz * 1.0 / comm_res.shape[0] / comm_res.shape[1]
    print('comm_res shape=%s,densit=%s' % (comm_res.shape, density))

    K = 500
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    triplets = get_topK_items(comm_res, ind2uid, ind2bid, topK=K)
    save_triplets(wfilename, triplets)
    t3 = time.time()
    print('save res of %s cost %.2f seconds' % (path_str, t3 - t2))
Exemplo n.º 6
0
def cal_comm_mat_USUB(path_str):
    '''
        Given meta_structure_str, generate the commuting matrix
        e.g. 'user-review-business,t10_aspect-review-user'

        Computes U-R-(RBR .* RAR)-R-U-B, where RBR is built from the
        review-business adjacency matrix and RAR is loaded from the pickle
        dir_ + 'sim_res/path_count/<path_str>_spa_mat.pickle' (together with
        its '<path_str>_spa_mat_id_map.pickle' index-to-review-id map). The
        result is handed to batch_save_comm_res with the output filename
        dir_ + 'sim_res/path_count/<path_str>_top500.res'.

        path_str: path name; 'P' in it selects the positive review files,
            otherwise 'N' selects the negative ones.

        Raises ValueError when path_str contains neither 'P' nor 'N'.
    '''
    # Validate first: without a match the filenames below used to be unbound
    # and the function failed with a NameError after loading the id files.
    if 'P' in path_str:
        polarity = 'pos'
    elif 'N' in path_str:
        polarity = 'neg'
    else:
        raise ValueError(
            "unsupported path_str %r: expected 'P' or 'N' in it" % (path_str,))

    def _report(label, mat, since):
        # Shared shape/density/timing log line for each intermediate matrix.
        print('%s(%s), density=%.5f cost %.2f seconds' % (
            label, mat.shape, mat.nnz * 1.0 / mat.shape[0] / mat.shape[1],
            time.time() - since))

    uid_filename = dir_ + 'uids.txt'
    bid_filename = dir_ + 'bids.txt'
    aid_filename = dir_ + 'aids.txt'
    upb_filename = dir_ + 'uid_pos_bid.txt'

    print('cal commut mat for %s, filenames: %s, %s, %s' % (
        path_str, uid_filename, bid_filename, upb_filename))
    _, uid2ind, ind2uid = load_eids(uid_filename, 'user')
    _, bid2ind, ind2bid = load_eids(bid_filename, 'biz')
    _, aid2ind, _ = load_eids(aid_filename, 'aspect')

    upb = np.loadtxt(upb_filename, dtype=np.int64)
    adj_upb, _ = generate_adj_mat(upb, uid2ind, bid2ind)

    urb_filename = dir_ + 'uid_rid_%s_bid.txt' % polarity
    ura_filename = dir_ + 'uid_rid_%s_aid.txt' % polarity
    # The pickle names depend only on path_str, not on the polarity.
    ind2rid_filename = dir_ + 'sim_res/path_count/%s_spa_mat_id_map.pickle' % path_str
    rar_mat_filename = dir_ + 'sim_res/path_count/%s_spa_mat.pickle' % path_str

    # Pickles are binary data: open in 'rb' and close the handle when done.
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # pickles this project produced itself.
    with open(ind2rid_filename, 'rb') as f:
        ind2rid = pickle.load(f)
    rid2ind = reverse_map(ind2rid)

    urb = np.loadtxt(urb_filename, dtype=np.int64)
    ura = np.loadtxt(ura_filename, dtype=np.int64)

    adj_ur, adj_ur_t = generate_adj_mat(urb[:, (0, 1)], uid2ind, rid2ind)
    adj_rb, adj_rb_t = generate_adj_mat(urb[:, (1, 2)], rid2ind, bid2ind)
    # NOTE(review): the review-aspect adjacency is not used below because RAR
    # comes from the pickle; the call is kept so bad input still fails here.
    generate_adj_mat(ura[:, (1, 2)], rid2ind, aid2ind)

    start = time.time()
    RBR = adj_rb.dot(adj_rb_t)
    _report('RBR', RBR, start)

    start = time.time()
    with open(rar_mat_filename, 'rb') as f:
        RAR = pickle.load(f)
    _report('load RAR', RAR, start)

    start = time.time()
    RSR = RBR.multiply(RAR)
    _report('RSR', RSR, start)

    start = time.time()
    URSR = adj_ur.dot(RSR)
    _report('URSR', URSR, start)

    start = time.time()
    URSRU = URSR.dot(adj_ur_t)
    _report('URSRU', URSRU, start)

    start = time.time()
    URSRUB = URSRU.dot(adj_upb)
    _report('URSRUB', URSRUB, start)

    start = time.time()
    K = 500
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    batch_save_comm_res(path_str, wfilename, URSRUB, ind2uid, ind2bid)
    print('finish saving %s %s entries in %s, cost %.2f seconds' % (
        URSRUB.nnz, path_str, wfilename, time.time() - start))
0
def cal_comm_mat_UUB(path_str, cikm=False):
    '''
        calculate commuting matrix for U-*-U-pos-B style

        Builds a user-user matrix that depends on path_str, multiplies it by
        the positive user-business adjacency matrix and saves the triplets
        returned by get_topK_items(..., topK=500) to
        dir_ + 'sim_res/path_count/<path_str>_top500.res'.

        path_str: one of 'UPBUB', 'UPBCatBUB', 'UPBCityBUB', 'UNBCatBUB',
            'UNBCityBUB', 'UNBUB', 'UUB', 'UCompUB', 'URPARUB', 'URNARUB'.
        cikm: when True the review and aspect id files are not loaded, so
            the review-based paths 'URPARUB' and 'URNARUB' are unavailable.

        Raises ValueError for an unsupported path_str, or for a review-based
        path combined with cikm=True.
    '''
    review_files = {
        'URPARUB': 'uid_rid_pos_aid.txt',
        'URNARUB': 'uid_rid_neg_aid.txt',
    }
    object_paths = ('UPBCatBUB', 'UPBCityBUB', 'UNBCatBUB', 'UNBCityBUB')
    supported = ('UPBUB', 'UNBUB', 'UUB', 'UCompUB') + object_paths + tuple(
        sorted(review_files))

    # Fail fast with a clear message. Both cases used to end in a NameError
    # (unbound UBU, or unbound rid2ind/aid2ind) after the inputs were loaded.
    if path_str not in supported:
        raise ValueError('unsupported path_str %r; expected one of %s' % (
            path_str, ', '.join(supported)))
    if cikm and path_str in review_files:
        raise ValueError(
            'path_str %r needs review/aspect ids, which are not loaded when '
            'cikm=True' % (path_str,))

    print('path str: %s' % (path_str,))

    uid_filename = dir_ + 'uids.txt'
    bid_filename = dir_ + 'bids.txt'
    upb_filename = dir_ + 'uid_pos_bid.txt'

    print('cal commut mat for %s, filenames: %s, %s, %s' % (
        path_str, uid_filename, bid_filename, upb_filename))
    _, uid2ind, ind2uid = load_eids(uid_filename, 'user')
    _, bid2ind, ind2bid = load_eids(bid_filename, 'biz')
    if not cikm:
        _, rid2ind, _ = load_eids(dir_ + 'rids.txt', 'review')
        _, aid2ind, _ = load_eids(dir_ + 'aids.txt', 'aspect')

    upb = np.loadtxt(upb_filename, dtype=np.int64)
    adj_upb, adj_upb_t = generate_adj_mat(upb, uid2ind, bid2ind)

    def _load_neg_adj():
        # Negative user-business ratings as an (adjacency, transpose) pair.
        unb = np.loadtxt(dir_ + 'uid_neg_bid.txt', dtype=np.int64)
        return generate_adj_mat(unb, uid2ind, bid2ind)

    # Each branch builds the user-user matrix; it is named UBU for
    # convenience even where the middle node type is not a business.
    if path_str == 'UPBUB':
        start = time.time()
        UBU = adj_upb.dot(adj_upb_t)

    elif path_str in object_paths:
        # U-(pos|neg)-B-(Cat|City)-B-U: positive or negative ratings first.
        if path_str.startswith('UP'):
            adj_xb = adj_upb
        else:
            adj_xb, _ = _load_neg_adj()
        start = time.time()
        adj_bo, _ = get_bo(path_str, bid2ind)
        UBO = adj_xb.dot(adj_bo)
        UBU = UBO.dot(UBO.transpose())

    elif path_str == 'UNBUB':
        adj_unb, adj_unb_t = _load_neg_adj()
        start = time.time()
        UBU = adj_unb.dot(adj_unb_t)

    elif path_str == 'UUB':
        uu = np.loadtxt(dir_ + 'user_social.txt', dtype=np.int64)
        adj_uu, _ = generate_adj_mat(uu, uid2ind, uid2ind)
        start = time.time()
        UBU = adj_uu.copy()

    elif path_str == 'UCompUB':
        uc = np.loadtxt(dir_ + 'uid_comp.txt', dtype=np.int64)
        cid2ind = {v: k for k, v in enumerate(set(uc[:, 1]))}
        adj_uc, adj_uc_t = generate_adj_mat(uc, uid2ind, cid2ind)
        start = time.time()
        UBU = adj_uc.dot(adj_uc_t)

    else:
        # 'URPARUB' / 'URNARUB': identical apart from the input file.
        urpa = np.loadtxt(dir_ + review_files[path_str])
        # u, r multiple aspects, thus u-r can be duplicate
        ur = list(set([(u, r) for u, r in urpa[:, (0, 1)]]))
        adj_ur, _ = generate_adj_mat(ur, uid2ind, rid2ind)
        adj_ra, _ = generate_adj_mat(urpa[:, (1, 2)], rid2ind, aid2ind)
        start = time.time()
        URA = adj_ur.dot(adj_ra)
        # it should be URARU, here we use UBU for convenience
        UBU = URA.dot(URA.transpose())

    print('UBU(%s), density=%.5f cost %.2f seconds' % (
        UBU.shape, UBU.nnz * 1.0 / UBU.shape[0] / UBU.shape[1],
        time.time() - start))

    start = time.time()
    UBUB = UBU.dot(adj_upb)
    print('UBUB(%s), density=%.5f cost %.2f seconds' % (
        UBUB.shape, UBUB.nnz * 1.0 / UBUB.shape[0] / UBUB.shape[1],
        time.time() - start))

    start = time.time()
    K = 500
    triplets = get_topK_items(UBUB, ind2uid, ind2bid, topK=K)
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    save_triplets(wfilename, triplets)
    print('finish saving %s %s entries in %s, cost %.2f seconds' % (
        len(triplets), path_str, wfilename, time.time() - start))