Пример #1
0
def cluster_users_by_interactions_count(clip):
    """
    Partition the target playlists in 2 groups, depending on whether their
    interactions count is above the given threshold or not.

    Only target playlists that appear in the playlists dataframe are considered:
    the merge is an inner one, so a target playlist without any interaction
    ends up in neither group.

    Parameters
    ----------
    clip : (int), threshold on the interactions count. The first group holds the
        playlists with an interactions count <= clip, the second one those with
        an interactions count > clip.

    Returns
    -------
    2 arrays holding, for each group, the values of the 'index' column, i.e. the
    index of the playlist inside the target playlists dataframe (NOT the
    playlist ids)
    """
    interactions_df = data.get_playlists_df()

    # playlist_id | index (row label of the playlist in the target dataframe)
    targets_df = pd.DataFrame({'playlist_id': data.get_target_playlists()})
    targets_df['index'] = targets_df.index

    # playlist_id | index | counts
    target_interactions = targets_df.merge(interactions_df)
    group_sizes = target_interactions.groupby(['playlist_id', 'index']).size()
    counts_df = group_sizes.reset_index(name='counts')

    # split based on the interactions counts
    at_most_clip = counts_df['counts'] <= clip
    above_clip = counts_df['counts'] > clip
    return (counts_df.loc[at_most_clip, 'index'].values,
            counts_df.loc[above_clip, 'index'].values)
Пример #2
0
def histogram_of_interactions():
    """
    Show a bar chart of the distribution of the interactions counts over the
    target playlists:
    x axis: number of interactions
    y axis: how many target playlists have exactly that number of interactions
    """
    interactions_df = data.get_playlists_df()
    targets_df = pd.DataFrame({'playlist_id': data.get_target_playlists()})

    # playlist_id | interactions (restricted to the target playlists)
    target_interactions = interactions_df.merge(targets_df)
    per_playlist = target_interactions.groupby('playlist_id').size()
    per_playlist = per_playlist.reset_index(name='interactions')

    # interactions | counts
    distribution = per_playlist.groupby('interactions').size()
    distribution = distribution.reset_index(name='counts')

    plot_options = dict(x='interactions', y='counts', kind='bar',
                        fontsize=7, figsize=(150, 100))
    distribution.plot(**plot_options)

    # display the figure
    plt.show(block=True)
Пример #3
0
def histogram_of_top_pop_items(top_n, only_target=True):
    """
    Show a bar chart with the occurrences count of the most popular tracks:
    x axis: track_id
    y axis: number of rows of the playlists dataframe containing that track

    Parameters
    ----------
    top_n : (int) how many of the most frequent tracks are plotted
    only_target : (bool) if True, only the interactions of the target playlists
        are counted
    """
    interactions_df = data.get_playlists_df()
    if only_target:
        # keep only the rows belonging to a target playlist
        targets_df = pd.DataFrame({'playlist_id': data.get_target_playlists()})
        interactions_df = interactions_df.merge(targets_df)

    # track_id | count, most frequent tracks first
    occurrences = interactions_df.groupby('track_id').size()
    occurrences = occurrences.reset_index(name='count')
    most_popular = occurrences.sort_values('count', ascending=False).head(top_n)

    plot_options = dict(x='track_id', y='count', kind='bar',
                        fontsize=6, figsize=(150, 100))
    most_popular.plot(**plot_options)

    # display the figure
    plt.show(block=True)
Пример #4
0
def cluster_users_by_top_pop_count(clip_perc, top_n=100, only_target=True):
    """
    Return the ids of the playlists whose percentage of top popular tracks is at
    least clip_perc, sorted by that percentage in descending order (ties are
    broken by the number of contained top pop tracks, descending as well).

    The popularity of a track is its number of occurrences among the considered
    interactions (those of the target playlists only when only_target is True).
    Playlists that contain none of the top_n tracks are never returned, because
    they are dropped by the inner merge with the top pop counts.

    Parameters
    ----------
    clip_perc: (float) returns only playlist with a percentage of top pop tracks over the total
                tracks count >= clip_perc
    top_n: consider only the most popular tracks (it should be set equal to the max
            track count among all playlists)
    only_target: (bool) consider only the target playlist

    Returns
    -------
    Array of playlist_id
    """
    playlists_df = data.get_playlists_df()
    if only_target:
        # filter only target playlist
        target_playlist_df = pd.DataFrame({'playlist_id': data.get_target_playlists()})
        playlists_df = playlists_df.merge(target_playlist_df)

    # track_id | count (restricted to the top_n most frequent tracks)
    toptracks_df = playlists_df.groupby('track_id').size().reset_index(name='count')
    toptracks_df = toptracks_df.sort_values('count', ascending=False).head(top_n)

    # playlist_id | top_pop_count
    filtered_df = playlists_df.merge(toptracks_df)
    filtered_df = filtered_df.groupby('playlist_id').size().reset_index(name='top_pop_count')

    # playlist_id | count
    playlists_count_df = playlists_df.groupby('playlist_id').size().reset_index(name='count')

    # playlist_id | count | top_pop_count | perc
    final_df = playlists_count_df.merge(filtered_df)
    final_df['perc'] = np.divide(final_df['top_pop_count'], final_df['count'])

    # filter only playlist with top pop perc >= clip_perc
    final_df = final_df[final_df['perc'] >= clip_perc]
    # sort into a new frame: an in-place sort of a filtered frame can raise
    # SettingWithCopyWarning
    final_df = final_df.sort_values(['perc', 'top_pop_count'], ascending=False)
    return final_df['playlist_id'].values
Пример #5
0
def cluster_users_by_interactions_count(clip):
    """
    Partition all the playlists in 2 groups, depending on whether their
    interactions count is above the given threshold or not.

    Parameters
    ----------
    clip : (int), threshold on the interactions count. The first group holds the
    playlists with an interactions count <= clip, the second one those with an
    interactions count > clip.

    Returns
    -------
    2 arrays of playlists ids
    """
    # playlist_id | counts
    group_sizes = data.get_playlists_df().groupby('playlist_id').size()
    counts_df = group_sizes.reset_index(name='counts')

    # split based on the interactions counts
    at_most_clip = counts_df['counts'] <= clip
    above_clip = counts_df['counts'] > clip
    return (counts_df.loc[at_most_clip, 'playlist_id'].values,
            counts_df.loc[above_clip, 'playlist_id'].values)
Пример #6
0
    if len(df_test[df_test['playlist_id'].isin(p)].groupby(
            'playlist_id')) != len(p):
        if target == 'all':
            print(
                "WARNING: not all the target playlists (JUST THE TARGETS) have a song in the training set"
            )
        elif target == 'target':
            print(
                "WARNING: not all the playlists (ALL OF THEM) have a song in the training set"
            )


def _create_urm(df):
    """
    Utility method: build the urm out of a dataframe of interactions.
    :param df: (pandas dataframe) a set of playlists and tracks (train.csv-like); the columns
               'rating', 'playlist_id' and 'track_id' are read
    :return:   (csr matrix) the urm of shape (d.N_PLAYLISTS, d.N_TRACKS), where the value of
               each row's 'rating' is stored at position (playlist_id, track_id)
    """
    ratings = df['rating'].values
    playlist_ids = df['playlist_id'].values
    track_ids = df['track_id'].values
    urm_shape = (d.N_PLAYLISTS, d.N_TRACKS)
    return csr_matrix((ratings, (playlist_ids, track_ids)), shape=urm_shape)


if __name__ == "__main__":
    # Script entry point: build the urms from the playlists dataframe, using an
    # explicit-ratings step and a SplitRandom(0.2) splitter
    # (presumably a 20% random hold-out -- TODO confirm against SplitRandom).
    interactions_df = d.get_playlists_df()
    create_urms(ProcessInteractions(interactions_df),
                ExplicateBase(),
                SplitRandom(0.2))
Пример #7
0
    elif target == 'target':
        p = d.get_target_playlists()

    if len(df_test[df_test['playlist_id'].isin(p)].groupby(
            'playlist_id')) != len(p):
        if target == 'all':
            print(
                "WARNING: not all the target playlists (JUST THE TARGETS) have a song in the training set"
            )
        elif target == 'target':
            print(
                "WARNING: not all the playlists (ALL OF THEM) have a song in the training set"
            )


def _create_urm(df):
    """
    Utility method: build the implicit urm out of a dataframe of interactions.
    :param df: (pandas dataframe) a set of playlists and tracks (train.csv-like); the columns
               'playlist_id' and 'track_id' are read
    :return:   (csr matrix) the urm of shape (d.N_PLAYLISTS, d.N_TRACKS), built with a 1 for
               every row of df at position (playlist_id, track_id)
    """
    # one unit of interaction for each row of the dataframe
    ones = np.ones(len(df), dtype=int)
    coordinates = (df['playlist_id'].values, df['track_id'].values)
    urm_shape = (d.N_PLAYLISTS, d.N_TRACKS)
    return csr_matrix((ones, coordinates), shape=urm_shape)


# Driver statements: load the playlists dataframe, wrap it in a
# ProcessInteractions object and hand it to create_urms together with a
# SplitRandom(0.2) splitter.
# NOTE(review): these statements are not protected by an
# `if __name__ == "__main__":` guard, so they also run when the module is
# imported -- confirm this is intended.
df = d.get_playlists_df()  # reads train.csv path
pi = ProcessInteractions(df)
# presumably 0.2 is the fraction of interactions held out -- TODO confirm against SplitRandom
s = SplitRandom(0.2)
create_urms(pi, s)
Пример #8
0
# #### Append the 'profile_length' column to the recommendation dataframe

#%%
target_ids = data.get_target_playlists()
# keep only the rows of the train urm that belong to the target playlists
targetURM = data.get_urm_train_1()[target_ids]
# row-wise sum flattened to a 1-D array: one value per target playlist
# (the number of interactions if the urm is binary -- TODO confirm)
user_profile_lengths = np.array(targetURM.sum(axis=1)).flatten()
# playlist_id | profile_length
profile_lengths_df = pd.DataFrame({'playlist_id': target_ids, 'profile_length': user_profile_lengths})

#%%
# default (inner) merge: recommendation rows whose playlist_id is not among
# the target ids are dropped
rec_lengths_df = recs_df.merge(profile_lengths_df, on='playlist_id')

#%% [markdown]
# #### Popularity feature

#%%
df = data.get_playlists_df()
# track_id | popularity (number of rows of df in which the track appears)
popularity = df.groupby(['track_id']).size().reset_index(name='popularity')

#%%
# default (left) join on track_id: every row of rec_lengths_df is kept, and
# tracks that never appear in df get a missing popularity
rec_pop_df = rec_lengths_df.join(popularity.set_index('track_id'), on='track_id')

#%% [markdown]
# #### Append the 'label' column 

#%%
urm_test = data.get_urm_test_1()
test_labels = []

# -1 is presumably a sentinel meaning "no playlist visited yet" for the loop
# over the recommendations that follows -- TODO confirm
last_playlist_id = -1
for idx,row in recs_df.iterrows():
    current_playlist_id = row['playlist_id']