def test_get_similar_artist_candidate_set_df(self):
        # Feed two similar-artist rows (same artist credit, two different
        # users) through get_similar_artist_candidate_set and verify the
        # column names and row count of both returned values.
        input_rows = [
            Row(similar_artist_credit_id=2, similar_artist_name='martinkemp', user_name=user_name)
            for user_name in ('rob', 'vansika_1')
        ]
        similar_artist_df = listenbrainz_spark.session.createDataFrame(input_rows)

        result = candidate_sets.get_similar_artist_candidate_set(
            similar_artist_df,
            self.recordings_df,
            self.users_df,
            self.mapped_listens_subset,
        )
        # The call is expected to yield exactly two items: the candidate set
        # itself and its HTML-oriented counterpart.
        candidate_set_df, candidate_set_html_df = result

        # Column order is irrelevant, so compare as unordered collections.
        expected_candidate_columns = ['recording_id', 'user_id', 'user_name']
        self.assertCountEqual(expected_candidate_columns, candidate_set_df.columns)
        self.assertEqual(candidate_set_df.count(), 2)

        expected_html_columns = [
            'similar_artist_credit_id',
            'artist_credit_id',
            'recording_mbid',
            'recording_id',
            'user_name',
            'user_id',
        ]
        self.assertCountEqual(expected_html_columns, candidate_set_html_df.columns)
        self.assertEqual(candidate_set_html_df.count(), 2)
# Example #2
0
    def test_get_similar_artist_candidate_set_df(self):
        # Load the mapped listens from HDFS, derive the recordings and users
        # dataframes from them, then run get_similar_artist_candidate_set on
        # two similar-artist rows and verify the column names and row count
        # of both returned values.
        listens_df = utils.read_files_from_HDFS(self.mapped_listens_path)
        recordings_df = create_dataframes.get_recordings_df(
            listens_df, {}, self.recordings_path
        )
        users_df = create_dataframes.get_users_dataframe(
            listens_df, {}, self.users_path
        )
        listens_subset_df = utils.read_files_from_HDFS(
            self.mapped_listens_subset_path
        )

        # One single-row dataframe per user, both pointing at the same
        # similar artist; they are created in this order and then unioned.
        rob_df, vansika_df = [
            utils.create_dataframe(
                Row(
                    similar_artist_credit_id=2,
                    similar_artist_name='martinkemp',
                    user_name=user_name,
                ),
                schema=None,
            )
            for user_name in ('rob', 'vansika_1')
        ]
        similar_artist_df = rob_df.union(vansika_df)

        result = candidate_sets.get_similar_artist_candidate_set(
            similar_artist_df, recordings_df, users_df, listens_subset_df
        )
        candidate_set_df, candidate_set_html_df = result

        # Column order is irrelevant, so both sides are sorted before the
        # list comparison.
        expected_candidate_columns = ['recording_id', 'user_id', 'user_name']
        self.assertListEqual(
            sorted(expected_candidate_columns),
            sorted(candidate_set_df.columns),
        )
        self.assertEqual(candidate_set_df.count(), 2)

        expected_html_columns = [
            'similar_artist_credit_id',
            'similar_artist_name',
            'mb_artist_credit_id',
            'mb_artist_credit_mbids',
            'mb_recording_mbid',
            'msb_artist_credit_name_matchable',
            'msb_recording_name_matchable',
            'recording_id',
            'user_name',
            'user_id',
        ]
        self.assertListEqual(
            sorted(expected_html_columns),
            sorted(candidate_set_html_df.columns),
        )
        self.assertEqual(candidate_set_html_df.count(), 2)