示例#1
0
def get_recordings_df(mapped_listens_df, metadata, save_path):
    """ Build, count and save the dataframe of distinct recordings.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            metadata: mapping that receives the number of rows under the
                'recordings_count' key.
            save_path (str): path where recordings_df should be saved.

        Returns:
            recordings_df: Dataframe with the distinct combinations of the selected
                recording columns plus a 'recording_id' column.
    """
    recording_columns = [
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'mb_release_mbid',
        'msb_artist_credit_name_matchable',
        'msb_recording_name_matchable',
    ]
    distinct_recordings = mapped_listens_df.select(*recording_columns).distinct()

    # recording_id is the rank of the row when ordering by mb_recording_mbid, so
    # rows that share an mb_recording_mbid also share a recording_id.
    id_column = rank().over(Window.orderBy('mb_recording_mbid'))
    recordings_df = distinct_recordings.withColumn('recording_id', id_column)

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, save_path)
    return recordings_df
示例#2
0
def save_playcounts_df(listens_df, recordings_df, users_df, threshold,
                       metadata, save_path):
    """ Compute per-user play counts for every recording and save them.

        Args:
            listens_df (dataframe): Dataframe containing recording_mbids corresponding to a user.
            recordings_df (dataframe): Dataframe containing distinct recordings and corresponding
                                       mbids and names.
            users_df (dataframe): Dataframe containing user names and user ids.
            threshold (int): only (user_id, recording_id) pairs whose count is strictly
                             greater than this value are kept.
            metadata (dict): receives the number of saved rows under 'playcounts_count'.
            save_path (str): path where playcounts_df should be saved.
    """
    # Attach user_id (via user_name) and recording_id (via mb_recording_mbid) to each listen.
    listens_with_ids = listens_df \
        .join(users_df, 'user_name', 'inner') \
        .join(recordings_df, 'mb_recording_mbid', 'inner')

    # One row per (user_id, recording_id) holding how many joined listen rows fall in that group.
    counts_per_pair = listens_with_ids \
        .groupBy('user_id', 'recording_id') \
        .agg(func.count('recording_id').alias('count'))

    threshold_condition = 'count > {}'.format(threshold)
    playcounts_df = counts_per_pair.where(threshold_condition)

    metadata['playcounts_count'] = playcounts_df.count()
    save_dataframe(playcounts_df, save_path)
    def test_save_dataframe(self):
        """ After save_dataframe is called, the target path should exist. """
        target_path = '/test_df.parquet'
        sample_row = Row(column1=1, column2=2)
        dataframe = utils.create_dataframe(sample_row, schema=None)

        dataframe_utils.save_dataframe(dataframe, target_path)

        self.assertTrue(utils.path_exists(target_path))
def get_users_dataframe(mapped_listens_df, metadata):
    """ Build, count and save the users dataframe.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            metadata: mapping that receives the number of users under the
                'users_count' key.

        Returns:
            users_df : Dataframe containing user names and user ids.
    """
    distinct_user_names = mapped_listens_df.select('user_name').distinct()

    # user_id is the rank of the user_name in sorted order. The names must be
    # distinct at this point, otherwise equal names would tie and ids would repeat.
    user_id_column = rank().over(Window.orderBy('user_name'))
    users_df = distinct_user_names.withColumn('user_id', user_id_column)

    metadata['users_count'] = users_df.count()
    save_dataframe(users_df, path.USERS_DATAFRAME_PATH)
    return users_df
示例#5
0
def get_threshold_listens_df(mapped_listens_df, mapped_listens_path: str, threshold: int):
    """ Keep only the listens of users whose listen count exceeds the threshold.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            mapped_listens_path: Path to store the filtered listens.
            threshold: a user's listens are kept only when the user has strictly
                       more listens than this value.
        Returns:
             threshold_listens_df: mapped listens dataframe after dropping data below threshold
    """
    listen_counts = mapped_listens_df \
        .groupBy('user_name') \
        .agg(func.count('user_name').alias('listen_count'))

    # The qualifying rows are collected to the driver so that their names can be
    # used in an isin() filter below.
    qualifying_rows = listen_counts \
        .where('listen_count > {}'.format(threshold)) \
        .collect()
    qualifying_names = [row.user_name for row in qualifying_rows]

    is_qualifying_user = col('user_name').isin(qualifying_names)
    threshold_listens_df = mapped_listens_df.where(is_qualifying_user)

    save_dataframe(threshold_listens_df, mapped_listens_path)
    return threshold_listens_df
def save_playcounts_df(listens_df, recordings_df, users_df, metadata):
    """ Compute per-user play counts for every recording and save them.

        Args:
            listens_df : Dataframe containing recording_mbids corresponding to a user.
            recordings_df : Dataframe containing distinct recordings and corresponding
                                       mbids and names.
            users_df : Dataframe containing user names and user ids.
            metadata : mapping that receives the number of saved rows under the
                                       'playcounts_count' key.
    """
    # Step 1: attach user_id to every listen by joining on user_name.
    listens_with_users = listens_df.join(users_df, 'user_name', 'inner')

    # Step 2: attach recording_id by joining on mb_recording_mbid.
    listens_with_ids = listens_with_users.join(recordings_df, 'mb_recording_mbid', 'inner')

    # Step 3: one row per (user_id, recording_id) with the number of joined listen
    # rows in that group, i.e. how many times the user listened to the track.
    play_count = func.count('recording_id').alias('count')
    playcounts_df = listens_with_ids.groupBy('user_id', 'recording_id').agg(play_count)

    metadata['playcounts_count'] = playcounts_df.count()
    save_dataframe(playcounts_df, path.PLAYCOUNTS_DATAFRAME_PATH)
def get_recordings_df(mapped_listens_df, metadata):
    """ Build, count and save the dataframe of distinct recordings.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            metadata: mapping that receives the number of rows under the
                'recordings_count' key.

        Returns:
            recordings_df: Dataframe with the distinct combinations of the selected
                recording columns plus a 'recording_id' column.
    """
    distinct_recordings = mapped_listens_df.select(
        'mb_artist_credit_id',
        'mb_artist_credit_mbids',
        'mb_recording_mbid',
        'mb_release_mbid',
        'msb_artist_credit_name_matchable',
        'msb_recording_name_matchable',
    ).distinct()

    # Rows are ranked by mb_recording_mbid and the rank becomes the recording_id;
    # rows with the same mb_recording_mbid receive the same id.
    by_recording_mbid = Window.orderBy('mb_recording_mbid')
    recordings_df = distinct_recordings.withColumn('recording_id', rank().over(by_recording_mbid))

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, path.RECOMMENDATION_RECORDINGS_DATAFRAME)
    return recordings_df