def get_recordings_df(mapped_listens_df, metadata, save_path):
    """ Build the dataframe of distinct recordings, record its size and save it.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            metadata (dict-like): updated in place; 'recordings_count' is set to the
                number of rows in recordings_df.
            save_path (str): path where recordings_df should be saved.

        Returns:
            recordings_df: Dataframe containing distinct recordings and corresponding mbids
                and names, with an added 'recording_id' column.
    """
    recording_columns = [
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'mb_release_mbid',
        'msb_artist_credit_name_matchable',
        'msb_recording_name_matchable',
    ]
    distinct_recordings_df = mapped_listens_df.select(*recording_columns).distinct()

    # recording_id is the rank of the row when ordered by mb_recording_mbid.
    # NOTE(review): rank() gives the same value to rows that share an mb_recording_mbid,
    # so ids are only unique if each mbid appears once after distinct() - confirm.
    id_window = Window.orderBy('mb_recording_mbid')
    recordings_df = distinct_recordings_df.withColumn('recording_id', rank().over(id_window))

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, save_path)
    return recordings_df
def save_playcounts_df(listens_df, recordings_df, users_df, threshold, metadata, save_path):
    """ Compute per-user, per-recording play counts, filter them and save the result.

        Args:
            listens_df (dataframe): Dataframe containing recording_mbids corresponding to a user.
            recordings_df (dataframe): Dataframe containing distinct recordings and corresponding
                mbids and names.
            users_df (dataframe): Dataframe containing user names and user ids.
            threshold (int): only (user_id, recording_id) pairs whose count is strictly greater
                than this value are kept.
            metadata (dict): updated in place; 'playcounts_count' is set to the number of rows
                in the saved dataframe.
            save_path (str): path where playcounts_df should be saved.
    """
    # Step 1: attach user_id to every listen by joining with users_df on user_name.
    listens_with_users_df = listens_df.join(users_df, 'user_name', 'inner')

    # Step 2: attach recording_id by joining with recordings_df on mb_recording_mbid.
    listens_with_ids_df = listens_with_users_df.join(recordings_df, 'mb_recording_mbid', 'inner')

    # Step 3: one row per (user_id, recording_id) holding the number of times
    # that user has listened to that recording.
    counts_df = listens_with_ids_df \
        .groupBy('user_id', 'recording_id') \
        .agg(func.count('recording_id').alias('count'))

    # Step 4: drop the pairs whose count does not exceed the threshold.
    count_filter = 'count > {}'.format(threshold)
    playcounts_df = counts_df.where(count_filter)

    metadata['playcounts_count'] = playcounts_df.count()
    save_dataframe(playcounts_df, save_path)
def test_save_dataframe(self):
    """ After save_dataframe is called with a path, utils.path_exists must report it. """
    target_path = '/test_df.parquet'
    sample_df = utils.create_dataframe(Row(column1=1, column2=2), schema=None)

    dataframe_utils.save_dataframe(sample_df, target_path)

    self.assertTrue(utils.path_exists(target_path))
def get_users_dataframe(mapped_listens_df, metadata):
    """ Build the users dataframe, record its size and save it.

        The dataframe is saved to path.USERS_DATAFRAME_PATH.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            metadata (dict-like): updated in place; 'users_count' is set to the number
                of rows in users_df.

        Returns:
            users_df : Dataframe containing user names and user ids.
    """
    distinct_users_df = mapped_listens_df.select('user_name').distinct()

    # user_id is the rank of the user_name in sorted order. This relies on the names
    # being distinct (ensured above): repeated names would share a rank and the ids
    # would no longer identify a single row.
    id_window = Window.orderBy('user_name')
    users_df = distinct_users_df.withColumn('user_id', rank().over(id_window))

    metadata['users_count'] = users_df.count()
    save_dataframe(users_df, path.USERS_DATAFRAME_PATH)
    return users_df
def get_threshold_listens_df(mapped_listens_df, mapped_listens_path: str, threshold: int):
    """ Threshold mapped listens dataframe

        Keeps only the listens of users whose total number of listens in
        mapped_listens_df is strictly greater than the threshold, saves the result
        and returns it.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            mapped_listens_path: Path to store mapped listens.
            threshold: a user's listens are kept only if the user has more than
                this many listens.

        Returns:
            threshold_listens_df: mapped listens dataframe after dropping data below threshold
    """
    # Users with enough listens. This stays a (lazy) dataframe on the cluster instead of
    # being collected to the driver, so it scales with the number of users.
    threshold_users_df = mapped_listens_df \
        .groupBy('user_name') \
        .agg(func.count('user_name').alias('listen_count')) \
        .where('listen_count > {}'.format(threshold)) \
        .select('user_name')

    # A left semi join keeps the rows of mapped_listens_df whose user_name appears in
    # threshold_users_df, without adding columns or duplicating rows. The select restores
    # the original column order, since joining on a column name moves that column first.
    threshold_listens_df = mapped_listens_df \
        .join(threshold_users_df, 'user_name', 'left_semi') \
        .select(*mapped_listens_df.columns)

    save_dataframe(threshold_listens_df, mapped_listens_path)
    return threshold_listens_df
def save_playcounts_df(listens_df, recordings_df, users_df, metadata):
    """ Compute per-user, per-recording play counts and save the result.

        The dataframe is saved to path.PLAYCOUNTS_DATAFRAME_PATH.

        Args:
            listens_df : Dataframe containing recording_mbids corresponding to a user.
            recordings_df : Dataframe containing distinct recordings and corresponding
                mbids and names.
            users_df : Dataframe containing user names and user ids.
            metadata : updated in place; 'playcounts_count' is set to the number of rows
                in the saved dataframe.
    """
    # Step 1: attach user_id to every listen by joining with users_df on user_name.
    listens_with_users_df = listens_df.join(users_df, 'user_name', 'inner')

    # Step 2: attach recording_id by joining with recordings_df on mb_recording_mbid.
    listens_with_ids_df = listens_with_users_df.join(recordings_df, 'mb_recording_mbid', 'inner')

    # Step 3: one row per (user_id, recording_id) holding the number of times
    # that user has listened to that recording.
    play_count = func.count('recording_id').alias('count')
    playcounts_df = listens_with_ids_df.groupBy('user_id', 'recording_id').agg(play_count)

    metadata['playcounts_count'] = playcounts_df.count()
    save_dataframe(playcounts_df, path.PLAYCOUNTS_DATAFRAME_PATH)
def get_recordings_df(mapped_listens_df, metadata):
    """ Build the dataframe of distinct recordings, record its size and save it.

        The dataframe is saved to path.RECOMMENDATION_RECORDINGS_DATAFRAME.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            metadata (dict-like): updated in place; 'recordings_count' is set to the
                number of rows in recordings_df.

        Returns:
            recordings_df: Dataframe containing distinct recordings and corresponding mbids
                and names, with an added 'recording_id' column.
    """
    recording_columns = [
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'mb_release_mbid',
        'msb_artist_credit_name_matchable',
        'msb_recording_name_matchable',
    ]
    distinct_recordings_df = mapped_listens_df.select(*recording_columns).distinct()

    # recording_id is the rank of the row when ordered by mb_recording_mbid.
    # NOTE(review): rank() gives the same value to rows that share an mb_recording_mbid,
    # so ids are only unique if each mbid appears once after distinct() - confirm.
    id_window = Window.orderBy('mb_recording_mbid')
    recordings_df = distinct_recordings_df.withColumn('recording_id', rank().over(id_window))

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, path.RECOMMENDATION_RECORDINGS_DATAFRAME)
    return recordings_df