Exemplo n.º 1
0
def getDataLoader(data_path, batch_size=2048):
    """Load the ratings file, split it 80/20, and wrap both halves in DataLoaders.

    Returns ((n_users, n_items), {'train': train_loader, 'valid': test_loader}).
    """
    # The raw file is tab-separated with exactly these four columns.
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    data_df = pd.read_table(data_path, names=columns)

    if SMALL:
        # Quick-experiment mode: keep a random 10% of the rows.
        data_df = data_df.sample(n=int(len(data_df) * 0.1), replace=False)
    if IMPLICT:
        # Implicit feedback: ratings >= 4 become 1.0, everything else 0.0.
        data_df.rating = (data_df.rating >= 4).astype(np.float32)

    # Random 80/20 train/test split.
    df_train = data_df.sample(n=int(len(data_df) * 0.8), replace=False)
    df_test = data_df.drop(df_train.index, axis=0)
    if IMPLICT:
        df_train = drop_df(df_train)
        df_test = drop_df(df_test)

    # +1 because raw ids are used directly as indices.
    n_users = max(data_df['user_id'].values) + 1
    n_items = max(data_df['item_id'].values) + 1
    print("Initialize end.The user number is:%d,item number is:%d" % (n_users, n_items))

    loaders = {
        'train': data.DataLoader(Interactions(df_train),
                                 batch_size=batch_size, shuffle=True),
        'valid': data.DataLoader(Interactions(df_test),
                                 batch_size=batch_size, shuffle=False),
    }
    return (n_users, n_items), loaders
Exemplo n.º 2
0
def getDataLoader(data_path, batch_size=2048):
    """Load the ratings file, remap ids to a dense 1-based range, split 90/10
    into train/test DataLoaders, and precompute heterogeneous-information-
    network similarity matrices ('UIU', 'IUI', 'UI') via calSim.

    Returns ((n_users, n_items, hinSim, hinSimI, metaPath), loaders) where
    loaders maps 'train'/'valid' to torch DataLoaders over Interactions.
    """
    # load train data
    data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
    # all data file
    # NOTE(review): `+ ''` is a no-op — presumably a leftover filename suffix.
    data_df = pd.read_table(data_path + '', names=data_fields)
    # data_df=pd.read_csv(data_path+'user_ratedmovies.dat',sep='\t')
    # # embed()
    # data_df=data_df[['userID','movieID','rating']]
    # data_df.columns=['user_id', 'item_id', 'rating']
    if SMALL:
        # Quick-experiment mode: keep a random 10% sample.
        data_df = data_df.sample(n=int(len(data_df) * 0.1), replace=False)
    if IMPLICT:
        # Implicit feedback: only ratings >= 5 count as positive.
        data_df.rating = (data_df.rating >= 5).astype(np.float32)

    # Remap raw user/item ids to a dense 1-based range — hence the
    # index_from_one=True below and the -1 when building the sparse matrix.
    for i in ['user_id', 'item_id']:
        data_df[i] = data_df[i].map(
            dict(zip(data_df[i].unique(), range(1, data_df[i].nunique() + 1))))

    # print(data_df.describe())

    # Random 90/10 train/test split.
    df_train = data_df.sample(n=int(len(data_df) * 0.9), replace=False)
    df_test = data_df.drop(df_train.index, axis=0)
    if IMPLICT:
        df_train = drop_df(df_train)
        df_test = drop_df(df_test)
    # get user number (ids are 1..n after the remap, so max == count)
    n_users = max(data_df['user_id'].values)
    # get item number
    n_items = max(data_df['item_id'].values)

    print("Initialize end.The user number is:%d,item number is:%d" %
          (n_users, n_items))
    train_loader = torch.utils.data.DataLoader(Interactions(
        df_train, index_from_one=True),
                                               batch_size=batch_size,
                                               shuffle=True)

    test_loader = torch.utils.data.DataLoader(Interactions(
        df_test, index_from_one=True),
                                              batch_size=batch_size,
                                              shuffle=False)

    loaders = {'train': train_loader, 'valid': test_loader}

    # Build the binary user-item interaction matrix over the FULL data set
    # (train + test) and derive meta-path matrices from it.
    from scipy.sparse import coo_matrix
    # NOTE(review): this `data` shadows the torch.utils `data` module name
    # used elsewhere in the file; harmless here since DataLoader above is
    # fully qualified.
    data = np.ones((data_df.shape[0]))
    row = data_df.user_id - 1
    col = data_df.item_id - 1
    UI = coo_matrix((data, (row, col)), shape=(n_users, n_items))
    UIU = UI.dot(UI.transpose())  # user-user co-interaction counts
    IUI = UI.transpose().dot(UI)  # item-item co-interaction counts
    hinSim = {}
    hinSimI = {}
    calSim('UIU', UIU, hinSim, hinSimI)
    calSim('IUI', IUI, hinSim, hinSimI)
    calSim('UI', UI, hinSim, hinSimI)
    metaPath = {'II': ['IUI'], 'UU': ['UIU'], 'UI': 'UI'}  # might also be IUI
    return (n_users, n_items, hinSim, hinSimI, metaPath), loaders
Exemplo n.º 3
0
def getDataLoader(data_path, trainset_rate=0.8, batch_size=4096):
    """Load the (preprocessed) Epinions ratings plus trust network.

    Runs epinionsPreprocessing first if the two cached text files are
    missing. Ratings are scaled into [0, 1]; user and item ids are
    re-encoded to contiguous 0-based integers.

    Returns ((n_users, n_items, social_user), loaders) where social_user
    maps each user id to the set of users they trust, and loaders maps
    'train'/'valid' to DataLoaders over Interactions.
    """
    # Preprocess the raw Epinions dump once; results are cached as text.
    if os.path.exists(data_path +
                      'data_df.txt') and os.path.exists(data_path +
                                                        'social_df.txt'):
        print("Epinions dataset has been preprocessed.")
    else:
        print("Epinions dataset has not been preprocessed.")
        epinionsPreprocessing(data_path)
    data_df = pd.read_table(data_path + 'data_df.txt', sep=' ')
    # Normalize ratings into [0, 1].
    data_df['rating'] /= max(data_df['rating'])
    social_df = pd.read_table(data_path + 'social_df.txt', sep=' ')

    # Re-encode ids to contiguous 0-based integers.
    # NOTE(review): the user encoder is fit on data_df only, so a user that
    # appears exclusively in social_df would make transform() raise —
    # presumably preprocessing guarantees this cannot happen; verify.
    le = preprocessing.LabelEncoder()
    le.fit(data_df['user_id'])
    data_df['user_id'] = le.transform(data_df['user_id'])
    social_df['user_id'] = le.transform(social_df['user_id'])
    social_df['user_id2'] = le.transform(social_df['user_id2'])
    le.fit(data_df['item_id'])
    data_df['item_id'] = le.transform(data_df['item_id'])

    # user -> set of trusted users. defaultdict creates the empty set on
    # first access, so the previous setdefault() call was redundant.
    social_user = defaultdict(set)
    for (user, user2, _record) in social_df.itertuples(index=False):
        social_user[user].add(user2)

    # Random train/validation split.
    df_train = data_df.sample(n=int(len(data_df) * trainset_rate),
                              replace=False)
    df_test = data_df.drop(df_train.index, axis=0)
    # Ids are 0-based after LabelEncoder, so max + 1 equals the count.
    n_users = max(data_df['user_id'].values) + 1
    n_items = max(data_df['item_id'].values) + 1

    print("Initialize end. The user number is: %d, item number is: %d" %
          (n_users, n_items))
    train_loader = data.DataLoader(Interactions(df_train),
                                   batch_size=batch_size,
                                   shuffle=True)

    test_loader = data.DataLoader(Interactions(df_test),
                                  batch_size=batch_size,
                                  shuffle=False)

    loaders = {'train': train_loader, 'valid': test_loader}

    return (n_users, n_items, social_user), loaders
Exemplo n.º 4
0
    def loadData(self, data_path, batch_size=2048):
        """Load ratings, binarize them, split 80/20, and build DataLoaders.

        Side effects: sets self.n_users, self.n_items, self.user_item (a
        train-set user -> {item: rating} lookup) and self.loader with keys
        'train', 'valid' and 'valid_simple' (non-shuffled validation view).
        """
        # The raw file is tab-separated with exactly these four columns.
        data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
        data_df = pd.read_table(data_path, names=data_fields)
        # Implicit feedback: only ratings >= 5 count as positive.
        data_df.rating = (data_df.rating >= 5).astype(np.float32)

        # Re-encode user and item ids to contiguous 0-based integers.
        le = preprocessing.LabelEncoder()
        le.fit(data_df['user_id'])
        data_df['user_id'] = le.transform(data_df['user_id'])
        le.fit(data_df['item_id'])
        data_df['item_id'] = le.transform(data_df['item_id'])

        # Ids are 0-based after LabelEncoder, so max + 1 equals the count.
        self.n_users = max(data_df['user_id'].values) + 1
        self.n_items = max(data_df['item_id'].values) + 1

        # (The duplicate of this print further down was removed.)
        print("Initialize end.The user number is:%d,item number is:%d" %
              (self.n_users, self.n_items))

        # Random 80/20 train/validation split.
        df = {}
        df['train'] = data_df.sample(n=int(len(data_df) * 0.8), replace=False)
        df['valid'] = data_df.drop(df['train'].index, axis=0)

        # user -> {item: rating} lookup, built from the training split only.
        # defaultdict creates the inner dict on first access, so no
        # setdefault() is needed.
        self.user_item = defaultdict(dict)
        for (user, item, record,
             _timestamp) in df['train'].itertuples(index=False):
            self.user_item[user][item] = record

        self.loader = {}
        for phase in ['train', 'valid']:
            self.loader[phase] = data.DataLoader(Interactions(df[phase]),
                                                 batch_size=batch_size,
                                                 shuffle=(phase == 'train'))

        # Extra, non-shuffled view of the validation split.
        self.loader['valid_simple'] = data.DataLoader(Interactions(
            df['valid']),
                                                      batch_size=batch_size,
                                                      shuffle=False)