data = data[['user_id', 'track_id', 'created_at'] + track_feature_cols].dropna() users = data[['user_id']].drop_duplicates() tracks = data[['track_id'] + track_feature_cols].drop_duplicates() assert tracks['track_id'].value_counts().max() == 1 tracks = tracks.astype({ 'mode': 'int64', 'key': 'int64', 'artist_id': 'category' }) events = data[['user_id', 'track_id', 'created_at']] events['created_at'] = events['created_at'].values.astype( 'datetime64[s]').astype('int64') graph_builder = PandasGraphBuilder() graph_builder.add_entities(users, 'user_id', 'user') graph_builder.add_entities(tracks, 'track_id', 'track') graph_builder.add_binary_relations(events, 'user_id', 'track_id', 'listened') graph_builder.add_binary_relations(events, 'track_id', 'user_id', 'listened-by') g = graph_builder.build() float_cols = [] for col in tracks.columns: if col == 'track_id': continue elif col == 'artist_id': g.nodes['track'].data[col] = torch.LongTensor(
}) ratings = pd.DataFrame(ratings) # Filter the users and items that never appear in the rating table. distinct_users_in_ratings = ratings['user_id'].unique() distinct_movies_in_ratings = ratings['movie_id'].unique() users = users[users['user_id'].isin(distinct_users_in_ratings)] movies = movies[movies['movie_id'].isin(distinct_movies_in_ratings)] # Group the movie features into genres (a vector), year (a category), title (a string) genre_columns = movies.columns.drop(['movie_id', 'title', 'year']) movies[genre_columns] = movies[genre_columns].fillna(False).astype('bool') movies_categorical = movies.drop('title', axis=1) # Build graph graph_builder = PandasGraphBuilder() graph_builder.add_entities(users, 'user_id', 'user') graph_builder.add_entities(movies_categorical, 'movie_id', 'movie') graph_builder.add_binary_relations(ratings, 'user_id', 'movie_id', 'watched') graph_builder.add_binary_relations(ratings, 'movie_id', 'user_id', 'watched-by') g = graph_builder.build() # Assign features. # Note that variable-sized features such as texts or images are handled elsewhere. g.nodes['user'].data['gender'] = torch.LongTensor( users['gender'].cat.codes.values) g.nodes['user'].data['age'] = torch.LongTensor( users['age'].cat.codes.values)
# Rename the columns of each loaded table and drop the first column, which is
# labelled '_' here (presumably an index column written out with the CSV --
# confirm against the files).
games.columns = ['_', 'game']
games = games.drop(columns='_', axis=1)
games_play.columns = ['_', 'game_play']
games_play = games_play.drop(columns='_', axis=1)
play2game.columns = ['_', 'game_play', 'game']
play2game = play2game.drop(columns='_', axis=1)
comments.columns = [
    '_', 'channel', 'user', 'timestamp', 'message', 'game_play'
]
# The message text is dropped as well; only channel, user, timestamp and
# game_play remain in `comments`.
comments = comments.drop(columns=['_', 'message'], axis=1)
print("Finish reading csv files")

# Build trirelation graph
builder = PandasGraphBuilder()
# Four node types; in each call the primary-key column and the node type
# share the same name.
builder.add_entities(users, 'user', 'user')
builder.add_entities(games, 'game', 'game')
builder.add_entities(channels, 'channel', 'channel')
builder.add_entities(games_play, 'game_play', 'game_play')
# Only the forward relations below are registered; every reverse relation is
# commented out.
# NOTE(review): several commented-out lines would not work if re-enabled
# as-is: 'corresponded' and 'watched-by' keep the same source/destination
# order as their forward relation, and 'commented-by' / 'contained-by' refer
# to a 'game' column that `comments` does not have after the renaming above.
builder.add_binary_relations(play2game, 'game_play', 'game', 'corresponds')
# builder.add_binary_relations(play2game, 'game_play', 'game', 'corresponded')
builder.add_binary_relations(comments, 'user', 'game_play', 'comments')
# builder.add_binary_relations(comments, 'game', 'user', 'commented-by')
builder.add_binary_relations(comments, 'user', 'channel', 'watches')
# builder.add_binary_relations(comments, 'user', 'channel', 'watched-by')
builder.add_binary_relations(comments, 'channel', 'game_play', 'contains')
# builder.add_binary_relations(comments, 'game', 'channel', 'contained-by')
builder.add_binary_relations(subscribes, 'user', 'channel', 'subscribes')
# builder.add_binary_relations(subscribes, 'channel', 'user', 'subscribed-by')
# Encode each item's comma-separated 'categories' string as a multi-label
# indicator matrix. With is_feature_hasher set, each row is additionally
# turned into a {category: indicator} dict and passed through a FeatureHasher
# configured with n_features=hash_dim; otherwise the indicator matrix is used
# directly. Either way the result is left in `hash_encodings`, which is not
# consumed in this part of the code.
# NOTE(review): split(',') keeps surrounding whitespace, so a ', '-separated
# string yields labels with a leading space -- confirm the input format.
if is_feature_hasher:
    print('feature hasing ...')
    mlb = MultiLabelBinarizer()
    encodings = mlb.fit_transform(
        [
            cat_str.split(',')
            for cat_str in item_df['categories'].values
        ]
    )
    fea_hasher = FeatureHasher(n_features=hash_dim)
    # wrap 'encodings' into dict
    all_categories = list(mlb.classes_)
    encode_dict_list = [
        dict(zip(all_categories, list(instance_encoding)))
        for instance_encoding in encodings]
    hash_encodings = fea_hasher.transform(encode_dict_list).toarray()
else:
    mlb = MultiLabelBinarizer()
    hash_encodings = mlb.fit_transform(
        [
            cat_str.split(',')
            for cat_str in item_df['categories'].values
        ]
    )

# Build graph
print('building graph ...')
graph_builder = PandasGraphBuilder()
graph_builder.add_entities(user_df, 'user_id', 'user')
graph_builder.add_entities(item_df, 'business_id', 'item')
# The same review table is registered in both directions
# (user -> item and item -> user).
graph_builder.add_binary_relations(review_df, 'user_id', 'business_id', 'reviewed')
graph_builder.add_binary_relations(review_df, 'business_id', 'user_id', 'reviewed-by')
g = graph_builder.build()

print('Assigning feature ...')
# Assign features.
# Numeric columns are stored as float tensors.
g.nodes['user'].data['review_count'] = torch.FloatTensor(user_df['review_count'].values)
g.nodes['user'].data['average_stars'] = torch.FloatTensor(user_df['average_stars'].values)
# 'city' and 'is_open' are read through the .cat accessor, so both columns
# must already have a categorical dtype; their integer codes are stored.
g.nodes['item'].data['city'] = torch.LongTensor(item_df['city'].cat.codes.values)
g.nodes['item'].data['is_open'] = torch.LongTensor(item_df['is_open'].cat.codes.values)
g.nodes['item'].data['stars'] = torch.FloatTensor(item_df['stars'].values)
def movielens_graph_building(args):
    """Build the MovieLens user/movie graph from the raw ``.dat`` files.

    Reads ``users.dat``, ``movies.dat`` and ``ratings.dat`` (latin-1 encoded,
    ``::``-separated fields) from ``args.directory``, keeps only the users
    and movies that occur in at least one rating, and builds a graph with
    ``user`` and ``movie`` node types connected by ``watched`` (user to
    movie) and ``watched-by`` (movie to user) relations.

    Args:
        args: Object exposing a ``directory`` attribute that points at the
            folder containing the three ``.dat`` files.

    Returns:
        The graph produced by ``PandasGraphBuilder.build()`` with these
        features attached: ``year`` (category codes) and ``genre``
        (one float column per genre) on ``movie`` nodes, and ``rating`` /
        ``timestamp`` on both the ``watched`` and ``watched-by`` edges.

    Raises:
        AssertionError: If a movie title does not end in ``(YYYY)``.
        ValueError: If a line does not split into the expected number of
            fields, or an id / rating / timestamp is not an integer.
    """
    directory = args.directory

    # users.dat: id::gender::age::occupation::zip
    users = []
    with open(os.path.join(directory, 'users.dat'), encoding='latin1') as f:
        for line in f:
            id_, gender, age, occupation, zip_ = line.strip().split('::')
            users.append({
                'user_id': int(id_),
                'gender': gender,
                'age': age,
                'occupation': occupation,
                'zip': zip_,
            })
    # Every user column, including the id, becomes categorical.
    users = pd.DataFrame(users).astype('category')

    # movies.dat: id::title (YYYY)::genre|genre|...
    movies = []
    with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as f:
        for line in f:
            id_, title, genres = line.strip().split('::')
            # The release year is the trailing "(YYYY)" of the title.
            assert re.match(r'.*\([0-9]{4}\)$', title), (
                'movie title does not end with a "(YYYY)" year: %r' % title)
            year = title[-5:-1]
            title = title[:-6].strip()
            record = {'movie_id': int(id_), 'title': title, 'year': year}
            # Walk the genres in file order rather than through a set: str
            # hashing is randomized per process, so set iteration order (and
            # with it the order of the genre columns / feature dimensions)
            # would differ from run to run.
            for genre in genres.split('|'):
                record[genre] = True
            movies.append(record)
    movies = pd.DataFrame(movies).astype({'year': 'category'})

    # ratings.dat: user_id::movie_id::rating::timestamp (all integers)
    ratings = []
    with open(os.path.join(directory, 'ratings.dat'), encoding='latin1') as f:
        for line in f:
            user_id, movie_id, rating, timestamp = [
                int(field) for field in line.split('::')
            ]
            ratings.append({
                'user_id': user_id,
                'movie_id': movie_id,
                'rating': rating,
                'timestamp': timestamp,
            })
    ratings = pd.DataFrame(ratings)

    # Drop users and movies that never appear in the rating table.
    rated_user_ids = ratings['user_id'].unique()
    rated_movie_ids = ratings['movie_id'].unique()
    users = users[users['user_id'].isin(rated_user_ids)]
    # Take an explicit copy: the genre columns are assigned to below, and
    # writing into a boolean-mask slice triggers pandas' chained-assignment
    # warning.
    movies = movies[movies['movie_id'].isin(rated_movie_ids)].copy()

    # Every column other than id/title/year is a genre flag; a missing value
    # means the movie does not have that genre.
    genre_columns = movies.columns.drop(['movie_id', 'title', 'year'])
    movies[genre_columns] = movies[genre_columns].fillna(False).astype('bool')
    movies_categorical = movies.drop('title', axis=1)

    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(users, 'user_id', 'user')
    graph_builder.add_entities(movies_categorical, 'movie_id', 'movie')
    graph_builder.add_binary_relations(ratings, 'user_id', 'movie_id', 'watched')
    graph_builder.add_binary_relations(ratings, 'movie_id', 'user_id', 'watched-by')
    graph = graph_builder.build()

    # Node features.
    graph.nodes['movie'].data['year'] = torch.LongTensor(
        movies['year'].cat.codes.values)
    graph.nodes['movie'].data['genre'] = torch.FloatTensor(
        movies[genre_columns].values)

    # Edge features: both directions carry the same rating and timestamp.
    # NOTE(review): assumes the builder keeps edges in `ratings` row order.
    for etype in ('watched', 'watched-by'):
        graph.edges[etype].data['rating'] = torch.LongTensor(
            ratings['rating'].values)
        graph.edges[etype].data['timestamp'] = torch.LongTensor(
            ratings['timestamp'].values)

    return graph
def movielens_graph_building(args):
    """Build the MovieLens genre/movie graph from the raw ``.dat`` files.

    Reads ``movies.dat`` and ``ratings.dat`` (latin-1 encoded,
    ``::``-separated fields) from ``args.directory``. Each rating is joined
    with its movie and expanded into one row per genre of that movie; those
    rows define the ``define`` (genre to movie) and ``define-by`` (movie to
    genre) relations between ``genre`` and ``movie`` nodes.

    Args:
        args: Object exposing a ``directory`` attribute that points at the
            folder containing the ``.dat`` files.

    Returns:
        The graph produced by ``PandasGraphBuilder.build()`` with these
        features attached: ``id`` on ``genre`` nodes, ``year`` (category
        codes) and ``genre`` (one float column per genre) on ``movie``
        nodes, and ``rating`` on both ``define`` and ``define-by`` edges.

    Raises:
        AssertionError: If a movie title does not end in ``(YYYY)``.
        ValueError: If a line does not split into the expected number of
            fields, or an id / rating / timestamp is not an integer.
    """
    directory = args.directory

    # movies.dat: id::title (YYYY)::genre|genre|...
    movies = []
    with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as f:
        for line in f:
            id_, title, genre_field = line.strip().split('::')
            genre_names = genre_field.split('|')
            # The release year is the trailing "(YYYY)" of the title.
            assert re.match(r'.*\([0-9]{4}\)$', title), (
                'movie title does not end with a "(YYYY)" year: %r' % title)
            year = title[-5:-1]
            title = title[:-6].strip()
            record = {
                'movie_id': int(id_),
                'title': title,
                'year': year,
                # List of genre names, exploded into one row each below.
                'genre': genre_names,
            }
            # Walk the genres in file order rather than through a set: str
            # hashing is randomized per process, so set iteration order (and
            # with it the order of the genre columns / feature dimensions)
            # would differ from run to run.
            for name in genre_names:
                record[name] = True
            movies.append(record)
    movies = pd.DataFrame(movies).astype({'year': 'category'})

    # ratings.dat: user_id::movie_id::rating::timestamp (all integers)
    ratings = []
    with open(os.path.join(directory, 'ratings.dat'), encoding='latin1') as f:
        for line in f:
            user_id, movie_id, rating, timestamp = [
                int(field) for field in line.split('::')
            ]
            ratings.append({
                'user_id': user_id,
                'movie_id': movie_id,
                'rating': rating,
                'timestamp': timestamp,
            })
    ratings = pd.DataFrame(ratings)

    # One row per (rating, genre of the rated movie).
    merged_ratings = pd.merge(ratings, movies, on=['movie_id'])
    merged_ratings = merged_ratings[['movie_id', 'rating', 'genre']]
    merged_ratings = merged_ratings.explode('genre')

    # Number the genres in order of first appearance and attach the id to
    # every exploded rating row.
    genres = pd.DataFrame(merged_ratings['genre'].unique()).reset_index()
    genres.columns = ['genre_id', 'genre']
    merged_ratings = pd.merge(merged_ratings, genres, on='genre')

    # Keep only the movies that were rated at least once.
    rated_movie_ids = merged_ratings['movie_id'].unique()
    movies = movies[movies['movie_id'].isin(rated_movie_ids)]
    genres = pd.DataFrame(genres).astype({'genre_id': 'category'})

    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(genres, 'genre_id', 'genre')
    graph_builder.add_entities(movies, 'movie_id', 'movie')
    graph_builder.add_binary_relations(merged_ratings, 'genre_id', 'movie_id', 'define')
    graph_builder.add_binary_relations(merged_ratings, 'movie_id', 'genre_id', 'define-by')
    graph = graph_builder.build()

    graph.nodes['genre'].data['id'] = torch.LongTensor(
        genres['genre_id'].cat.codes.values)

    # Work on an explicit copy so the assignment below neither writes into a
    # boolean-mask slice nor touches the table handed to the builder.
    movie_features = movies.copy()
    # Every column other than id/title/year/genre-list is a genre flag; a
    # missing value means the movie does not have that genre.
    genre_columns = movie_features.columns.drop(
        ['movie_id', 'title', 'year', 'genre'])
    movie_features[genre_columns] = (
        movie_features[genre_columns].fillna(False).astype('bool'))

    graph.nodes['movie'].data['year'] = torch.LongTensor(
        movie_features['year'].cat.codes.values)
    graph.nodes['movie'].data['genre'] = torch.FloatTensor(
        movie_features[genre_columns].values)

    # Both directions carry the rating of the originating row.
    # NOTE(review): assumes the builder keeps edges in `merged_ratings` row
    # order.
    for etype in ('define', 'define-by'):
        graph.edges[etype].data['rating'] = torch.LongTensor(
            merged_ratings['rating'].values)

    return graph
new_users = users[users['userID'].isin(user_intersect)] new_items = items[items['wine_id'].isin(item_intersect)] new_ratings = ratings[ratings['userID'].isin(user_intersect) & ratings['wine_id'].isin(item_intersect)] new_ratings = new_ratings.sort_values('userID') label = [] for userID, df in new_ratings.groupby('userID'): idx = int(df.shape[0] * 0.8) timestamp = [0] * df.shape[0] timestamp = [x if i < idx else 1 for i, x in enumerate(timestamp)] label.extend(timestamp) new_ratings['timestamp'] = label # Build graph graph_builder = PandasGraphBuilder() graph_builder.add_entities(new_users, 'userID', 'user') graph_builder.add_entities(new_items, 'wine_id', 'wine') graph_builder.add_binary_relations(new_ratings, 'userID', 'wine_id', 'rated') graph_builder.add_binary_relations(new_ratings, 'wine_id', 'userID', 'rated-by') g = graph_builder.build() # Assign features. node_dict = { 'user': [new_users, ['userID', 'user_feats'], ['cat', 'int']], 'wine': [new_items, ['wine_id', 'grapes_id', 'wine_feats'], ['cat', 'cat', 'int']] } edge_dict = { 'rated': [new_ratings, ['rating_per_user', 'timestamp']],