示例#1
0
    data = data[['user_id', 'track_id', 'created_at'] +
                track_feature_cols].dropna()

    users = data[['user_id']].drop_duplicates()
    tracks = data[['track_id'] + track_feature_cols].drop_duplicates()
    assert tracks['track_id'].value_counts().max() == 1
    tracks = tracks.astype({
        'mode': 'int64',
        'key': 'int64',
        'artist_id': 'category'
    })
    events = data[['user_id', 'track_id', 'created_at']]
    events['created_at'] = events['created_at'].values.astype(
        'datetime64[s]').astype('int64')

    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(users, 'user_id', 'user')
    graph_builder.add_entities(tracks, 'track_id', 'track')
    graph_builder.add_binary_relations(events, 'user_id', 'track_id',
                                       'listened')
    graph_builder.add_binary_relations(events, 'track_id', 'user_id',
                                       'listened-by')

    g = graph_builder.build()

    float_cols = []
    for col in tracks.columns:
        if col == 'track_id':
            continue
        elif col == 'artist_id':
            g.nodes['track'].data[col] = torch.LongTensor(
示例#2
0
            })
    ratings = pd.DataFrame(ratings)

    # Filter the users and items that never appear in the rating table.
    distinct_users_in_ratings = ratings['user_id'].unique()
    distinct_movies_in_ratings = ratings['movie_id'].unique()
    users = users[users['user_id'].isin(distinct_users_in_ratings)]
    movies = movies[movies['movie_id'].isin(distinct_movies_in_ratings)]

    # Group the movie features into genres (a vector), year (a category), title (a string)
    genre_columns = movies.columns.drop(['movie_id', 'title', 'year'])
    movies[genre_columns] = movies[genre_columns].fillna(False).astype('bool')
    movies_categorical = movies.drop('title', axis=1)

    # Build graph
    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(users, 'user_id', 'user')
    graph_builder.add_entities(movies_categorical, 'movie_id', 'movie')
    graph_builder.add_binary_relations(ratings, 'user_id', 'movie_id',
                                       'watched')
    graph_builder.add_binary_relations(ratings, 'movie_id', 'user_id',
                                       'watched-by')

    g = graph_builder.build()

    # Assign features.
    # Note that variable-sized features such as texts or images are handled elsewhere.
    g.nodes['user'].data['gender'] = torch.LongTensor(
        users['gender'].cat.codes.values)
    g.nodes['user'].data['age'] = torch.LongTensor(
        users['age'].cat.codes.values)
示例#3
0
# Give the freshly loaded tables explicit column names and discard the
# placeholder '_' column (presumably the unnamed index column written out
# with the CSV -- TODO confirm against the loading code).
games.columns = ['_', 'game']
games = games.drop(columns='_', axis=1)
games_play.columns = ['_', 'game_play']
games_play = games_play.drop(columns='_', axis=1)

play2game.columns = ['_', 'game_play', 'game']
play2game = play2game.drop(columns='_', axis=1)
comments.columns = [
    '_', 'channel', 'user', 'timestamp', 'message', 'game_play'
]
# The 'message' column is dropped as well; only the id columns and the
# timestamp are kept for the relations registered below.
comments = comments.drop(columns=['_', 'message'], axis=1)

print("Finish reading csv files")

# Build the graph: four node types ('user', 'game', 'channel', 'game_play'),
# each keyed by the column of the same name.
builder = PandasGraphBuilder()
builder.add_entities(users, 'user', 'user')
builder.add_entities(games, 'game', 'game')
builder.add_entities(channels, 'channel', 'channel')
builder.add_entities(games_play, 'game_play', 'game_play')

# Register one edge type per (table, source column, destination column,
# relation name). The commented-out calls are additional relations that are
# currently disabled.
builder.add_binary_relations(play2game, 'game_play', 'game', 'corresponds')
# builder.add_binary_relations(play2game, 'game_play', 'game', 'corresponded')
builder.add_binary_relations(comments, 'user', 'game_play', 'comments')
# builder.add_binary_relations(comments, 'game', 'user', 'commented-by')
builder.add_binary_relations(comments, 'user', 'channel', 'watches')
# builder.add_binary_relations(comments, 'user', 'channel', 'watched-by')
builder.add_binary_relations(comments, 'channel', 'game_play', 'contains')
# builder.add_binary_relations(comments, 'game', 'channel', 'contained-by')
builder.add_binary_relations(subscribes, 'user', 'channel', 'subscribes')
# builder.add_binary_relations(subscribes, 'channel', 'user', 'subscribed-by')
    if is_feature_hasher:
        print('feature hasing ...')
        mlb = MultiLabelBinarizer()
        encodings = mlb.fit_transform( [ cat_str.split(',') for cat_str in item_df['categories'].values ] )
        fea_hasher = FeatureHasher(n_features=hash_dim)
        # wrap 'encodings' into dict
        all_categories = list(mlb.classes_)
        encode_dict_list = [ dict(zip(all_categories, list(instance_encoding)))  for instance_encoding in encodings] 
        hash_encodings = fea_hasher.transform(encode_dict_list).toarray()
    else:
        mlb = MultiLabelBinarizer()
        hash_encodings = mlb.fit_transform( [ cat_str.split(',') for cat_str in item_df['categories'].values ] )

    # Build graph
    print('building graph ...')
    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(user_df, 'user_id', 'user')
    graph_builder.add_entities(item_df, 'business_id', 'item')
    graph_builder.add_binary_relations(review_df, 'user_id', 'business_id', 'reviewed')
    graph_builder.add_binary_relations(review_df, 'business_id', 'user_id', 'reviewed-by')

    g = graph_builder.build()

    print('Assigning feature ...')
    # Assign features.
    g.nodes['user'].data['review_count'] = torch.FloatTensor(user_df['review_count'].values)
    g.nodes['user'].data['average_stars'] = torch.FloatTensor(user_df['average_stars'].values)

    g.nodes['item'].data['city'] = torch.LongTensor(item_df['city'].cat.codes.values)
    g.nodes['item'].data['is_open'] = torch.LongTensor(item_df['is_open'].cat.codes.values)
    g.nodes['item'].data['stars'] = torch.FloatTensor(item_df['stars'].values)
def movielens_graph_building(args):
    """Build a user-movie graph from the MovieLens ``.dat`` files.

    Reads ``users.dat``, ``movies.dat`` and ``ratings.dat`` (``::``-separated,
    latin-1 encoded) from ``args.directory``, drops users and movies that
    never occur in a rating, and builds a graph with 'user' and 'movie'
    nodes joined by 'watched' (user -> movie) and 'watched-by'
    (movie -> user) edges.

    Args:
        args: Object exposing a ``directory`` attribute that holds the path
            of the folder containing the three ``.dat`` files.

    Returns:
        The graph returned by ``PandasGraphBuilder.build()``, with 'year'
        (category codes) and 'genre' (0/1 float flags) stored on movie nodes
        and 'rating' / 'timestamp' stored on both edge types.

    Raises:
        AssertionError: If a movie title does not end with ``(YYYY)``.
        ValueError: If a line does not have the expected number of fields.
    """
    directory = args.directory

    users = []
    with open(os.path.join(directory, 'users.dat'), encoding='latin1') as f:
        for line in f:
            id_, gender, age, occupation, zip_ = line.strip().split('::')
            users.append({
                'user_id': int(id_),
                'gender': gender,
                'age': age,
                'occupation': occupation,
                'zip': zip_,
            })
    users = pd.DataFrame(users).astype('category')

    movies = []
    with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as f:
        for line in f:
            id_, title, genres = line.strip().split('::')

            # The title must end with "(YYYY)"; the year is sliced out of
            # that suffix and the suffix is removed from the title.
            assert re.match(r'.*\([0-9]{4}\)$', title)
            year = title[-5:-1]
            title = title[:-6].strip()

            record = {'movie_id': int(id_), 'title': title, 'year': year}
            # dict.fromkeys de-duplicates like a set but keeps file order.
            # Iterating a set of strings depends on the per-process hash
            # seed, which made the genre column order (and therefore the
            # layout of the 'genre' feature tensor) differ between runs.
            for genre in dict.fromkeys(genres.split('|')):
                record[genre] = True
            movies.append(record)
    movies = pd.DataFrame(movies).astype({'year': 'category'})

    ratings = []
    with open(os.path.join(directory, 'ratings.dat'), encoding='latin1') as f:
        for line in f:
            # int() tolerates the trailing newline on the last field.
            user_id, movie_id, rating, timestamp = [
                int(field) for field in line.split('::')
            ]
            ratings.append({
                'user_id': user_id,
                'movie_id': movie_id,
                'rating': rating,
                'timestamp': timestamp,
            })
    ratings = pd.DataFrame(ratings)

    # Keep only the users and movies that appear in the rating table.
    distinct_users_in_ratings = ratings['user_id'].unique()
    distinct_movies_in_ratings = ratings['movie_id'].unique()
    users = users[users['user_id'].isin(distinct_users_in_ratings)]
    # Explicit copy: columns are assigned on this frame just below, which
    # would otherwise trigger pandas' SettingWithCopyWarning.
    movies = movies[movies['movie_id'].isin(distinct_movies_in_ratings)].copy()

    # Every column other than id/title/year is a per-genre flag holding True
    # or NaN; turn those into plain booleans.
    genre_columns = movies.columns.drop(['movie_id', 'title', 'year'])
    movies[genre_columns] = movies[genre_columns].fillna(False).astype('bool')
    movies_categorical = movies.drop('title', axis=1)

    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(users, 'user_id', 'user')
    graph_builder.add_entities(movies_categorical, 'movie_id', 'movie')
    graph_builder.add_binary_relations(ratings, 'user_id', 'movie_id',
                                       'watched')
    graph_builder.add_binary_relations(ratings, 'movie_id', 'user_id',
                                       'watched-by')
    g = graph_builder.build()

    # Node features. NOTE(review): assumes the builder keeps nodes in the
    # row order of the entity tables -- confirm against PandasGraphBuilder.
    g.nodes['movie'].data['year'] = torch.LongTensor(
        movies['year'].cat.codes.values)
    g.nodes['movie'].data['genre'] = torch.FloatTensor(
        movies[genre_columns].values)

    # Both edge types were registered from the same rating table, so they
    # receive the same per-row rating and timestamp values.
    for edge_type in ('watched', 'watched-by'):
        g.edges[edge_type].data['rating'] = torch.LongTensor(
            ratings['rating'].values)
        g.edges[edge_type].data['timestamp'] = torch.LongTensor(
            ratings['timestamp'].values)

    return g
示例#6
0
def movielens_graph_building(args):
    """Build a genre-movie graph from the MovieLens ``.dat`` files.

    Reads ``movies.dat`` and ``ratings.dat`` (``::``-separated, latin-1
    encoded) from ``args.directory``. Every rating is joined with its movie
    and expanded to one row per genre of that movie; those rows become the
    'define' (genre -> movie) and 'define-by' (movie -> genre) edges.

    Args:
        args: Object exposing a ``directory`` attribute that holds the path
            of the folder containing the ``.dat`` files.

    Returns:
        The graph returned by ``PandasGraphBuilder.build()``, with 'id' on
        genre nodes, 'year' (category codes) and 'genre' (0/1 float flags)
        on movie nodes, and 'rating' on both edge types.

    Raises:
        AssertionError: If a movie title does not end with ``(YYYY)``.
        ValueError: If a line does not have the expected number of fields.
    """
    directory = args.directory

    movies = []
    with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as f:
        for line in f:
            id_, title, genre_field = line.strip().split('::')
            genre_names = genre_field.split('|')

            # The title must end with "(YYYY)"; the year is sliced out of
            # that suffix and the suffix is removed from the title.
            assert re.match(r'.*\([0-9]{4}\)$', title)
            year = title[-5:-1]
            title = title[:-6].strip()

            record = {
                'movie_id': int(id_),
                'title': title,
                'year': year,
                'genre': genre_names,
            }
            # dict.fromkeys de-duplicates like a set but keeps file order.
            # Iterating a set of strings depends on the per-process hash
            # seed, which made the genre flag column order (and therefore
            # the layout of the 'genre' feature tensor) differ between runs.
            for name in dict.fromkeys(genre_names):
                record[name] = True
            movies.append(record)
    movies = pd.DataFrame(movies).astype({'year': 'category'})

    ratings = []
    with open(os.path.join(directory, 'ratings.dat'), encoding='latin1') as f:
        for line in f:
            # int() tolerates the trailing newline on the last field.
            user_id, movie_id, rating, timestamp = [
                int(field) for field in line.split('::')
            ]
            ratings.append({
                'user_id': user_id,
                'movie_id': movie_id,
                'rating': rating,
                'timestamp': timestamp,
            })
    ratings = pd.DataFrame(ratings)

    # One row per (rating, genre of the rated movie).
    merged_ratings = pd.merge(ratings, movies, on=['movie_id'])
    merged_ratings = merged_ratings[['movie_id', 'rating', 'genre']]
    merged_ratings = merged_ratings.explode('genre')

    # Number the genres in order of first appearance, then attach the id to
    # every expanded rating row.
    genres = pd.DataFrame(merged_ratings['genre'].unique()).reset_index()
    genres.columns = ['genre_id', 'genre']
    merged_ratings = pd.merge(merged_ratings, genres, on='genre')

    # Keep only movies that received at least one rating. The explicit copy
    # makes the later column assignment safe on this filtered frame.
    distinct_movies_in_ratings = merged_ratings['movie_id'].unique()
    movies = movies[movies['movie_id'].isin(distinct_movies_in_ratings)].copy()
    genres = genres.astype({'genre_id': 'category'})

    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(genres, 'genre_id', 'genre')
    graph_builder.add_entities(movies, 'movie_id', 'movie')
    graph_builder.add_binary_relations(merged_ratings, 'genre_id', 'movie_id',
                                       'define')
    graph_builder.add_binary_relations(merged_ratings, 'movie_id', 'genre_id',
                                       'define-by')
    g = graph_builder.build()

    # Node features. NOTE(review): assumes the builder keeps nodes in the
    # row order of the entity tables -- confirm against PandasGraphBuilder.
    g.nodes['genre'].data['id'] = torch.LongTensor(
        genres['genre_id'].cat.codes.values)

    # Every column other than id/title/year/genre is a per-genre flag
    # holding True or NaN; turn those into plain booleans.
    genre_columns = movies.columns.drop(['movie_id', 'title', 'year', 'genre'])
    movies[genre_columns] = movies[genre_columns].fillna(False).astype('bool')
    g.nodes['movie'].data['year'] = torch.LongTensor(
        movies['year'].cat.codes.values)
    g.nodes['movie'].data['genre'] = torch.FloatTensor(
        movies[genre_columns].values)

    # Both edge types were registered from the same table, so they receive
    # the same per-row rating values.
    for edge_type in ('define', 'define-by'):
        g.edges[edge_type].data['rating'] = torch.LongTensor(
            merged_ratings['rating'].values)

    return g
示例#7
0
# Keep only the rows whose ids occur in user_intersect / item_intersect.
new_users = users[users['userID'].isin(user_intersect)]
new_items = items[items['wine_id'].isin(item_intersect)]
new_ratings = ratings[
    ratings['userID'].isin(user_intersect)
    & ratings['wine_id'].isin(item_intersect)
].sort_values('userID')

# Per user, flag the first 80% of their rating rows with 0 and the rest
# with 1. The flags are stored in the 'timestamp' column.
label = []
for userID, df in new_ratings.groupby('userID'):
    idx = int(df.shape[0] * 0.8)
    timestamp = [0] * idx + [1] * (df.shape[0] - idx)
    label.extend(timestamp)
new_ratings['timestamp'] = label

# Build graph
graph_builder = PandasGraphBuilder()
graph_builder.add_entities(new_users, 'userID', 'user')
graph_builder.add_entities(new_items, 'wine_id', 'wine')
graph_builder.add_binary_relations(new_ratings, 'userID', 'wine_id', 'rated')
graph_builder.add_binary_relations(
    new_ratings, 'wine_id', 'userID', 'rated-by')
g = graph_builder.build()

# Assign features.
# Each entry maps a node type to [table, column names, column kinds].
node_dict = {
    'user': [
        new_users,
        ['userID', 'user_feats'],
        ['cat', 'int'],
    ],
    'wine': [
        new_items,
        ['wine_id', 'grapes_id', 'wine_feats'],
        ['cat', 'cat', 'int'],
    ],
}
edge_dict = {
    'rated': [new_ratings, ['rating_per_user', 'timestamp']],