예제 #1
0
    # Build (or reload from pickle) the project-project *negative*
    # co-occurrence matrix, then the user-user one.
    # NOTE(review): fragment of a larger routine — DATA_DIR, FOLD, n_users,
    # n_projects, batch_size, time, text_utils and _load_coord_matrix are all
    # defined outside this view; the final if-block is truncated below.
    BOOLEAN_LOAD_PP_COOCC_FROM_FILE = True
    X, Y = None, None
    if BOOLEAN_LOAD_PP_COOCC_FROM_FILE:
        print 'Loading project project negative_co-occurrence matrix'
        t1 = time.time()
        # Batch boundaries over user rows: [0, b, 2b, ...] zipped with
        # [b, 2b, ..., n_users].
        start_idx = range(0, n_users, batch_size)
        end_idx = start_idx[1:] + [n_users]
        X = _load_coord_matrix(start_idx, end_idx, n_projects, n_projects, prefix = 'project') #project project co-occurrence matrix
        print 'dumping matrix ...'
        # Cache the assembled matrix so later runs can take the else-branch.
        text_utils.save_pickle(X, os.path.join(DATA_DIR,'negative_pro_pro_cooc_fold%d.dat'%FOLD))
        t2 = time.time()
        print 'Time : %d seconds'%(t2-t1)
    else:
        print 'test loading model from pickle file'
        t1 = time.time()
        X = text_utils.load_pickle(os.path.join(DATA_DIR,'negative_pro_pro_cooc_fold%d.dat'%FOLD))
        t2 = time.time()
        # Approximate CSR memory footprint (data + indices + indptr) in MB.
        print '[INFO]: sparse matrix size of project project negative_co-occurrence matrix: %d mb\n' % (
                                                        (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / (1024 * 1024))
        print 'Time : %d seconds'%(t2-t1)

    #X = None
    BOOLEAN_LOAD_UU_COOCC_FROM_FILE = True
    if BOOLEAN_LOAD_UU_COOCC_FROM_FILE:
        print 'Loading user user negative_co-occurrence matrix'
        t1 = time.time()
        # Same batching pattern, now over project rows, for the user-user matrix.
        start_idx = range(0, n_projects, batch_size)
        end_idx = start_idx[1:] + [n_projects]
        Y = _load_coord_matrix(start_idx, end_idx, n_users, n_users, prefix = 'backer') #user user co-occurrence matrix

        t2 = time.time()
def produce_neg_embeddings(DATA_DIR,
                           train_data,
                           n_users,
                           n_items,
                           batch_size=5000,
                           iter=0):
    print n_users, n_items

    #clear the negative-co-temp folder:
    if os.path.exists(os.path.join(DATA_DIR, 'negative-co-temp')):
        for f in glob.glob(os.path.join(DATA_DIR, 'negative-co-temp',
                                        '*.npy')):
            os.remove(f)

    GENERATE_ITEM_ITEM_COOCCURENCE_FILE = True
    if GENERATE_ITEM_ITEM_COOCCURENCE_FILE:
        t1 = time.time()
        print 'Generating item item negative_co-occurrence matrix'
        start_idx = range(0, n_users, batch_size)
        end_idx = start_idx[1:] + [n_users]
        Parallel(n_jobs=1)(
            delayed(_coord_batch)(DATA_DIR, lo, hi, train_data, prefix='item')
            for lo, hi in zip(start_idx, end_idx))
        t2 = time.time()
        print 'Time : %d seconds' % (t2 - t1)
        pass
    ########################################################################################################################
    ####################Generate user-user co-occurrence matrix based on the same items they backed######################
    #####################        This will build a user-user co-occurrence matrix ##########################################

    def _load_coord_matrix(start_idx, end_idx, nrow, ncol, prefix='item'):
        X = sparse.csr_matrix((nrow, ncol), dtype='float32')

        for lo, hi in zip(start_idx, end_idx):
            coords = np.load(
                os.path.join(DATA_DIR, 'negative-co-temp',
                             'negative_%s_coo_%d_%d.npy' % (prefix, lo, hi)))

            rows = coords[:, 0]
            cols = coords[:, 1]

            tmp = sparse.coo_matrix((np.ones_like(rows), (rows, cols)),
                                    shape=(nrow, ncol),
                                    dtype='float32').tocsr()
            X = X + tmp

            print("%s %d to %d finished" % (prefix, lo, hi))
            sys.stdout.flush()
        return X

    BOOLEAN_LOAD_PP_COOCC_FROM_FILE = True
    X, Y = None, None
    if BOOLEAN_LOAD_PP_COOCC_FROM_FILE:
        print 'Loading item item negative_co-occurrence matrix'
        t1 = time.time()
        start_idx = range(0, n_users, batch_size)
        end_idx = start_idx[1:] + [n_users]
        X = _load_coord_matrix(start_idx,
                               end_idx,
                               n_items,
                               n_items,
                               prefix='item')  #item item co-occurrence matrix
        print 'dumping matrix ...'
        text_utils.save_pickle(
            X,
            os.path.join(DATA_DIR,
                         'negative_item_item_cooc_iter%d.dat' % (iter)))
        t2 = time.time()
        print 'Time : %d seconds' % (t2 - t1)
    else:
        print 'test loading model from pickle file'
        t1 = time.time()
        X = text_utils.load_pickle(
            os.path.join(DATA_DIR,
                         'negative_item_item_cooc_iter%d.dat' % (iter)))
        t2 = time.time()
        print '[INFO]: sparse matrix size of item item negative_co-occurrence matrix: %d mb\n' % (
            (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) /
            (1024 * 1024))
        print 'Time : %d seconds' % (t2 - t1)

    if os.path.exists(os.path.join(DATA_DIR, 'negative-co-temp')):
        for f in glob.glob(os.path.join(DATA_DIR, 'negative-co-temp',
                                        '*.npy')):
            os.remove(f)
    return X, None
예제 #3
0
        # return df

    # NOTE(review): fragment of a larger routine — load_data, wmf, rec_eval,
    # convert_to_SPPMI_matrix, n_components, n_items, n_users and
    # SHIFTED_K_VALUE are defined outside this view; the while-loop body is
    # truncated below.
    U, V = None, None

    vad_data, vad_raw, vad_df = load_data(
        os.path.join(DATA_DIR, 'validation.csv'))
    train_data, train_raw, train_df = load_data(
        os.path.join(DATA_DIR, 'train.csv'))
    test_data, test_raw, test_df = load_data(os.path.join(
        DATA_DIR, 'test.csv'))
    # Warm-start user/item factors with weighted matrix factorization.
    U, V = wmf.decompose(train_data, vad_data, num_factors=n_components)
    VT = V.T
    iter, max_iter = 0, 10  # NOTE(review): `iter` shadows the builtin

    # Load positive co-occurrence matrices and convert them to shifted
    # positive PMI (SPPMI) form.
    X = text_utils.load_pickle(os.path.join(DATA_DIR, 'item_item_cooc.dat'))
    Y = text_utils.load_pickle(os.path.join(DATA_DIR, 'user_user_cooc.dat'))
    X_sppmi = convert_to_SPPMI_matrix(X,
                                      max_row=n_items,
                                      shifted_K=SHIFTED_K_VALUE)
    Y_sppmi = convert_to_SPPMI_matrix(Y,
                                      max_row=n_users,
                                      shifted_K=SHIFTED_K_VALUE)

    best_ndcg100 = 0.0
    best_iter = 1
    early_stopping = False
    # EM-style training loop; the remainder of the body is outside this view.
    while (iter < max_iter and not early_stopping):
        ################ Expectation step: ######################
        user_slices = rec_eval.user_idx_generator(n_users, batch_users=5000)
        print 'GENERATING NEGATIVE INSTANCES ...'
예제 #4
0
# Build (or reload from pickle) the mode-specific *negative* project-project
# co-occurrence matrix, then the user-user one.
# NOTE(review): fragment — BOOLEAN_NEGATIVE_LOAD_PP_COOCC_FROM_FILE,
# NEGATIVE_SELECTION_MODE, DATA_DIR, n_users, n_projects, batch_size and
# _load_negative_coord_matrix come from outside this view; the final if-block
# is truncated below.
if BOOLEAN_NEGATIVE_LOAD_PP_COOCC_FROM_FILE:
    print 'Loading negative project project co-occurrence matrix'
    t1 = time.time()
    # User-row batch boundaries.
    start_idx = range(0, n_users, batch_size)
    end_idx = start_idx[1:] + [n_users]
    X_neg = _load_negative_coord_matrix(start_idx, end_idx, n_projects, n_projects,
                           prefix='%s-project'%NEGATIVE_SELECTION_MODE)  # project project co-occurrence matrix
    print X_neg
    print 'dumping matrix ...'
    # Cache so later runs can take the else-branch instead of rebuilding.
    text_utils.save_pickle(X_neg, os.path.join(DATA_DIR, '%s_negative_pro_pro_cooc.dat'%NEGATIVE_SELECTION_MODE))
    t2 = time.time()
    print 'Time : %d seconds' % (t2 - t1)
else:
    print 'test loading model from pickle file'
    t1 = time.time()
    X_neg = text_utils.load_pickle(os.path.join(DATA_DIR, '%s_negative_pro_pro_cooc.dat'%NEGATIVE_SELECTION_MODE))
    t2 = time.time()
    # Approximate CSR memory footprint (data + indices + indptr) in MB.
    print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % (
        (X_neg.data.nbytes + X_neg.indices.nbytes + X_neg.indptr.nbytes) / (1024 * 1024))
    print 'Time : %d seconds' % (t2 - t1)

# X = None
BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE = False
if BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE:
    print 'Loading negative user user co-occurrence matrix'
    t1 = time.time()
    # Same pattern over project rows for the user-user matrix.
    start_idx = range(0, n_projects, batch_size)
    end_idx = start_idx[1:] + [n_projects]
    Y_neg = _load_negative_coord_matrix(start_idx, end_idx, n_users, n_users, prefix='backer')  # user user co-occurrence matrix

    t2 = time.time()
예제 #5
0
    # NOTE(review): corrupted fragment — the opening `if ...:` header for this
    # branch is cut off above, and a _load_coord_matrix call further down is
    # truncated mid-argument-list where a different snippet was spliced in.
    print 'Loading project project co-occurrence matrix'
    t1 = time.time()
    # User-row batch boundaries.
    start_idx = range(0, n_users, batch_size)
    end_idx = start_idx[1:] + [n_users]
    X = _load_coord_matrix(
        start_idx, end_idx, n_projects, n_projects,
        prefix='project')  #project project co-occurrence matrix
    print X
    print 'dumping matrix ...'
    # Cache so later runs can take the else-branch instead of rebuilding.
    text_utils.save_pickle(X, os.path.join(DATA_DIR, 'pro_pro_cooc.dat'))
    t2 = time.time()
    print 'Time : %d seconds' % (t2 - t1)
else:
    print 'test loading model from pickle file'
    t1 = time.time()
    X = text_utils.load_pickle(os.path.join(DATA_DIR, 'pro_pro_cooc.dat'))
    t2 = time.time()
    # Approximate CSR memory footprint (data + indices + indptr) in MB.
    print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % (
        (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / (1024 * 1024))
    print 'Time : %d seconds' % (t2 - t1)

#X = None
BOOLEAN_LOAD_UU_COOCC_FROM_FILE = False
if BOOLEAN_LOAD_UU_COOCC_FROM_FILE:
    print 'Loading user user co-occurrence matrix'
    t1 = time.time()
    start_idx = range(0, n_projects, batch_size)
    end_idx = start_idx[1:] + [n_projects]
    Y = _load_coord_matrix(start_idx,
                           end_idx,
                           n_users,
# NOTE(review): the call above is truncated — its remaining arguments and the
# rest of that branch are missing; the text resumes mid-way through a
# different snippet below.
if BOOLEAN_NEGATIVE_LOAD_PP_COOCC_FROM_FILE:
    print 'Loading negative project project co-occurrence matrix'
    t1 = time.time()
    start_idx = range(0, n_users, batch_size)
    end_idx = start_idx[1:] + [n_users]
    X_neg = _load_negative_coord_matrix(start_idx, end_idx, n_projects, n_projects,
                           prefix='cate-project')  # project project co-occurrence matrix
    print X_neg
    print 'dumping matrix ...'
    text_utils.save_pickle(X_neg, os.path.join(DATA_DIR, 'cate_negative_pro_pro_cooc_%d.dat'%NEGATIVE_NEIGHBOR_WORDS))
    t2 = time.time()
    print 'Time : %d seconds' % (t2 - t1)
else:
    print 'test loading model from pickle file'
    t1 = time.time()
    X_neg = text_utils.load_pickle(os.path.join(DATA_DIR, 'cate_negative_pro_pro_cooc_%d.dat'%NEGATIVE_NEIGHBOR_WORDS))
    t2 = time.time()
    # Approximate CSR memory footprint (data + indices + indptr) in MB.
    print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % (
        (X_neg.data.nbytes + X_neg.indices.nbytes + X_neg.indptr.nbytes) / (1024 * 1024))
    print 'Time : %d seconds' % (t2 - t1)

# X = None
BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE = True
if BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE:
    print 'Loading negative user user co-occurrence matrix'
    t1 = time.time()
    # Project-row batch boundaries for the user-user matrix.
    start_idx = range(0, n_projects, batch_size)
    end_idx = start_idx[1:] + [n_projects]
    Y_neg = _load_negative_coord_matrix(start_idx, end_idx, n_users, n_users, prefix='cate-backer')  # user user co-occurrence matrix

    t2 = time.time()
예제 #7
0
#train_data, train_raw, train_df =  load_data(os.path.join(DATA_DIR, 'train.num.sub.csv'))
# NOTE(review): fragment — load_data, text_utils and DATA_DIR are defined
# outside this view; the final if-block is truncated below.
LOAD_NEGATIVE_MATRIX = True
#for i in range(10):
for i in [2]:  # only fold 2 is processed; other fold ranges left commented out
# for i in range(9,-1,-1):
    FOLD = i
    print '*************************************FOLD %d ******************************************'%FOLD
    # train_data, train_raw, train_df = load_data(os.path.join(DATA_DIR, 'train_fold%d.csv'%FOLD))
    # Load the per-fold validation / test / train splits.
    vad_data, vad_raw, vad_df = load_data(os.path.join(DATA_DIR, 'vad.num.sub.fold%d.csv'%FOLD))
    test_data, test_raw, test_df = load_data(os.path.join(DATA_DIR, 'test.num.sub.fold%d.csv'%FOLD))
    train_data, train_raw, train_df =  load_data(os.path.join(DATA_DIR, 'train.num.sub.fold%d.csv'%FOLD))

    print 'loading pro_pro_cooc_fold%d.dat'%FOLD
    t1 = time.time()
    X = text_utils.load_pickle(os.path.join(DATA_DIR,'pro_pro_cooc_fold%d.dat'%FOLD))
    t2 = time.time()
    # Approximate CSR memory footprint (data + indices + indptr) in MB.
    print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % (
                                                    (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / (1024 * 1024))
    print 'Time : %d seconds'%(t2-t1)

    print 'loading user_user_cooc_fold%d.dat'%FOLD
    t1 = time.time()
    Y = text_utils.load_pickle(os.path.join(DATA_DIR, 'user_user_cooc_fold%d.dat'%FOLD))
    t2 = time.time()
    print '[INFO]: sparse matrix size of user user co-occurrence matrix: %d mb\n' % (
                                                    (Y.data.nbytes + Y.indices.nbytes + Y.indptr.nbytes) / (1024 * 1024))
    print 'Time : %d seconds'%(t2-t1)
    ################# LOADING NEGATIVE CO-OCCURRENCE MATRIX ########################################

    # NOTE(review): body of this branch is cut off below.
    if LOAD_NEGATIVE_MATRIX:
예제 #8
0
    
   
    # NOTE(review): fragment of a larger routine — load_data, wmf, rec_eval,
    # gen_neg_instances, n_components, n_users, n_items, SHIFTED_K_VALUE and
    # NEGATIVE_SAMPLE_RATIO are defined outside this view; the while-loop body
    # is truncated below.
    vad_data, vad_raw, vad_df = load_data(os.path.join(DATA_DIR, 'validation.csv'))
    
    train_data, train_raw, train_df = load_data(os.path.join(DATA_DIR, 'train.csv'))
    
    test_data, test_raw, test_df = load_data(os.path.join(DATA_DIR, 'test.csv'))
    
    # Warm-start user/item factors with weighted matrix factorization.
    U, V = wmf.decompose(train_data, vad_data, num_factors= n_components)
    
    VT = V.T
    iter, max_iter = 0, 10  # NOTE(review): `iter` shadows the builtin

    # Load positive co-occurrence matrices and convert them to shifted
    # positive PMI (SPPMI) form.
    X = text_utils.load_pickle(os.path.join(DATA_DIR, 'item_item_cooc.dat'))
    Y = text_utils.load_pickle(os.path.join(DATA_DIR, 'user_user_cooc.dat'))
    X_sppmi = convert_to_SPPMI_matrix(X, max_row=n_items, shifted_K=SHIFTED_K_VALUE)
    Y_sppmi = convert_to_SPPMI_matrix(Y, max_row=n_users, shifted_K=SHIFTED_K_VALUE)

    best_ndcg100 = 0.0
    best_iter = 1
    early_stopping = False
    # EM-style training loop; the remainder of the body is outside this view.
    while (iter < max_iter and not early_stopping):
        ################ Expectation step: ######################
        user_slices = rec_eval.user_idx_generator(n_users, batch_users=5000)
        print 'GENERATING NEGATIVE INSTANCES ...'
        t1 = time.time()
        # Sample negative instances for each user batch in parallel.
        df = Parallel(n_jobs=16)(delayed(gen_neg_instances)(train_data, U, VT, user_idx, neg_ratio = NEGATIVE_SAMPLE_RATIO, iter = iter)
                                      for user_idx in user_slices)
        t2 = time.time()