def test_get_top_artists(self):
        """Check get_top_artists output and its failure on filtered listens.

        With a top artist limit of 1 and an empty user list, the returned
        dataframe must expose the four expected columns in order, contain
        two rows, and both rows must carry top_artist_credit_id 2.

        When the listens are restricted to user 'lala' (treated by this
        test as an empty dataframe), get_top_artists must raise
        TopArtistNotFetchedException, both with an empty user list and
        with ['lala'].
        """
        listens_df = utils.read_files_from_HDFS(self.mapped_listens_path)
        limit = 1
        top_artists_df = candidate_sets.get_top_artists(listens_df, limit, [])

        expected_columns = [
            'top_artist_credit_id',
            'top_artist_name',
            'user_name',
            'total_count',
        ]
        self.assertListEqual(expected_columns, top_artists_df.columns)
        self.assertEqual(top_artists_df.count(), 2)

        credit_ids = [
            row.top_artist_credit_id for row in top_artists_df.collect()
        ]
        credit_ids.sort()
        # Two rows were asserted above; both must reference artist credit 2.
        for position in (0, 1):
            self.assertEqual(credit_ids[position], 2)

        # Listens restricted to user 'lala' -- expected to be an empty df.
        lala_filter = f.col('user_name') == 'lala'
        no_listens_df = listens_df.select('*').where(lala_filter)
        for requested_users in ([], ['lala']):
            with self.assertRaises(TopArtistNotFetchedException):
                candidate_sets.get_top_artists(no_listens_df, limit,
                                               requested_users)
    def test_filter_last_x_days_recordings(self):
        """Check the rows kept by filter_last_x_days_recordings.

        Builds the top artist candidate set from the mapped listens subset
        (top artist limit 1, empty user list), passes its second returned
        dataframe through filter_last_x_days_recordings together with the
        same listens subset, and verifies the user names and recording
        MBIDs of the resulting rows, ignoring row order.
        """
        mapped_listens_df = utils.read_files_from_HDFS(
            self.mapped_listens_path)
        # Read the subset once; it is reused for every call below.
        mapped_listens_subset = utils.read_files_from_HDFS(
            self.mapped_listens_subset_path)
        recordings_df = create_dataframes.get_recordings_df(
            mapped_listens_df, {})
        users = create_dataframes.get_users_dataframe(mapped_listens_df, {})

        top_artist_limit = 1
        top_artist_df = candidate_sets.get_top_artists(mapped_listens_subset,
                                                       top_artist_limit, [])

        # Only the second dataframe of the returned pair is needed here.
        _, candidate_set_df = candidate_sets.get_top_artist_candidate_set(
            top_artist_df, recordings_df, users, mapped_listens_subset)

        df = candidate_sets.filter_last_x_days_recordings(
            candidate_set_df, mapped_listens_subset)

        # Collect once: each collect() triggers a separate Spark job, and
        # both assertions below inspect the same rows.
        rows = df.collect()

        received_user_names = sorted(row.user_name for row in rows)
        self.assertEqual(received_user_names, ['rob', 'rob', 'vansika_1'])

        received_recording_mbid = sorted(
            row.mb_recording_mbid for row in rows)
        expected_recording_mbid = sorted([
            "sf5a56f4-1f83-4681-b319-70a734d0d047",
            "af5a56f4-1f83-4681-b319-70a734d0d047",
            "sf5a56f4-1f83-4681-b319-70a734d0d047"
        ])
        self.assertEqual(expected_recording_mbid, received_recording_mbid)
    def test_get_top_artist_candidate_set(self):
        """Check both dataframes returned by get_top_artist_candidate_set.

        The top artists are computed from the mapped listens subset with a
        limit of 1 and an empty user list. The first returned dataframe
        must have exactly the columns recording_id, user_id and user_name;
        the second must have the ten columns listed below. Column order is
        not checked, and each dataframe must hold three rows.
        """
        listens_df = utils.read_files_from_HDFS(self.mapped_listens_path)
        recordings = create_dataframes.get_recordings_df(listens_df, {})
        user_df = create_dataframes.get_users_dataframe(listens_df, {})
        listens_subset = utils.read_files_from_HDFS(
            self.mapped_listens_subset_path)

        limit = 1
        top_artists = candidate_sets.get_top_artists(
            listens_subset, limit, [])

        candidate_df, candidate_html_df = (
            candidate_sets.get_top_artist_candidate_set(
                top_artists, recordings, user_df, listens_subset))

        # Compare sorted names so that column order does not matter.
        expected_columns = sorted(['recording_id', 'user_id', 'user_name'])
        self.assertListEqual(expected_columns, sorted(candidate_df.columns))
        self.assertEqual(candidate_df.count(), 3)

        expected_html_columns = sorted([
            'top_artist_credit_id',
            'top_artist_name',
            'mb_artist_credit_id',
            'mb_artist_credit_mbids',
            'mb_recording_mbid',
            'msb_artist_credit_name_matchable',
            'msb_recording_name_matchable',
            'recording_id',
            'user_name',
            'user_id',
        ])
        self.assertListEqual(expected_html_columns,
                             sorted(candidate_html_df.columns))
        self.assertEqual(candidate_html_df.count(), 3)
Пример #4
0
 def test_get_top_artists(self):
     """Check the columns and the single row returned by get_top_artists.

     get_top_artists is called with the mapped listens and 'vansika'
     (presumably a user name -- confirm against candidate_sets).
     """
     mapped_df = self.get_mapped_listens()
     test_top_artist_df = candidate_sets.get_top_artists(mapped_df, 'vansika')
     # Both the column names and their order are checked.
     self.assertListEqual(['mb_artist_credit_id', 'artist_name', 'count'], test_top_artist_df.columns)
     self.assertEqual(test_top_artist_df.count(), 1)
     # Exactly one row was asserted above, so index 0 is that row.
     row = test_top_artist_df.collect()[0]
     self.assertEqual(row.mb_artist_credit_id, 1)