def get_top_artist_rec_df(self):
    """Build a three-row DataFrame of top-artist recommendation fixtures.

    Each row carries ``recording_mbid``, ``rating``, ``recording_id``,
    ``user_id`` and ``user_name``; two rows belong to 'vansika' and one
    to 'rob'.
    """
    fixture_rows = [
        Row(recording_mbid="2acb406f-c716-45f8-a8bd-96ca3939c2e5", rating=1.8,
            recording_id=5, user_id=6, user_name='vansika'),
        Row(recording_mbid="8acb406f-c716-45f8-a8bd-96ca3939c2e5", rating=-0.8,
            recording_id=6, user_id=6, user_name='vansika'),
        Row(recording_mbid="8acb406f-c716-45f8-a8bd-96ca3939c2e5", rating=0.99,
            recording_id=6, user_id=7, user_name='rob'),
    ]

    # One single-row frame per fixture, chained in declaration order.
    combined = None
    for fixture in fixture_rows:
        frame = utils.create_dataframe(fixture, schema=None)
        combined = frame if combined is None else combined.union(frame)
    return combined
def test_recommendation_params_init(self):
    """RecommendationParams should expose each constructor argument as an attribute."""

    def column_names(frame):
        # DataFrames are compared through their sorted column names.
        return sorted(frame.columns)

    recordings = utils.create_dataframe(Row(col1=3, col2=9), schema=None)
    mock_model = MagicMock()
    top_artist_candidates = utils.create_dataframe(Row(col1=4, col2=5, col3=5), schema=None)
    similar_artist_candidates = utils.create_dataframe(Row(col1=1), schema=None)
    top_artist_limit = 20
    similar_artist_limit = 40

    params = recommend.RecommendationParams(
        recordings,
        mock_model,
        top_artist_candidates,
        similar_artist_candidates,
        top_artist_limit,
        similar_artist_limit,
    )

    self.assertEqual(column_names(params.recordings_df), column_names(recordings))
    self.assertEqual(params.model, mock_model)
    self.assertEqual(
        column_names(params.top_artist_candidate_set_df),
        column_names(top_artist_candidates),
    )
    self.assertEqual(
        column_names(params.similar_artist_candidate_set_df),
        column_names(similar_artist_candidates),
    )
    self.assertEqual(params.recommendation_top_artist_limit, top_artist_limit)
    self.assertEqual(params.recommendation_similar_artist_limit, similar_artist_limit)
def test_get_user_count(self):
    """Three rows holding two distinct user ids should yield a user count of 2."""
    users_df = None
    for uid in (3, 3, 2):
        single = utils.create_dataframe(Row(user_id=uid), schema=None)
        users_df = single if users_df is None else users_df.union(single)

    self.assertEqual(recommend.get_user_count(users_df), 2)
def get_recordings_df(cls):
    """Return a two-row recordings DataFrame used as a test fixture.

    Both rows share the same field names: ``mb_artist_credit_id``,
    ``mb_artist_credit_mbids``, ``mb_recording_mbid``, ``mb_release_mbid``,
    ``msb_artist_credit_name_matchable``, ``recording_id`` and
    ``msb_recording_name_matchable``.
    """
    first = utils.create_dataframe(
        Row(
            mb_artist_credit_id=1,
            mb_artist_credit_mbids=["181c4177-f33a-441d-b15d-910acaf18b07"],
            mb_recording_mbid="3acb406f-c716-45f8-a8bd-96ca3939c2e5",
            mb_release_mbid="xxxxxx",
            msb_artist_credit_name_matchable="lessthanjake",
            recording_id=1,
            msb_recording_name_matchable="Al's War",
        ),
        schema=None
    )
    second = utils.create_dataframe(
        Row(
            mb_artist_credit_id=2,
            # Previously spelled ``mb_artist_mbids``. DataFrame.union pairs
            # columns by position, so the mismatch went unnoticed; the name
            # now agrees with the first row.
            mb_artist_credit_mbids=["281c4177-f33a-441d-b15d-910acaf18b07"],
            mb_recording_mbid="2acb406f-c716-45f8-a8bd-96ca3939c2e5",
            mb_release_mbid="xxxxxx",
            msb_artist_credit_name_matchable="kishorekumar",
            recording_id=2,
            msb_recording_name_matchable="Mere Sapno ki Rani",
        ),
        schema=None
    )
    recordings_df = first.union(second)
    return recordings_df
def test_get_user_name_and_user_id(self):
    """Check row count and columns with an empty and a single-name user list."""
    params = self.get_recommendation_params()

    # (user_id, user_name, recording_id) for each candidate-set row.
    candidates = ((1, 'vansika', 1), (1, 'vansika', 2), (2, 'rob', 1))
    candidate_df = None
    for uid, name, rec_id in candidates:
        single = utils.create_dataframe(
            Row(user_id=uid, user_name=name, recording_id=rec_id), schema=None)
        candidate_df = single if candidate_df is None else candidate_df.union(single)
    params.top_artist_candidate_set_df = candidate_df

    all_users = recommend.get_user_name_and_user_id(params, [])
    self.assertEqual(all_users.count(), 2)
    self.assertEqual(sorted(all_users.columns), sorted(['user_id', 'user_name']))

    only_vansika = recommend.get_user_name_and_user_id(params, ['vansika'])
    self.assertEqual(only_vansika.count(), 1)
    self.assertEqual(sorted(only_vansika.columns), sorted(['user_id', 'user_name']))
def get_top_artist(self):
    """Return a three-row top-artist fixture DataFrame.

    Columns: ``top_artist_credit_id``, ``top_artist_name``, ``total_count``
    and ``user_name``.
    """
    # (top_artist_credit_id, top_artist_name, total_count, user_name)
    entries = (
        (2, "blahblah", 10, 'vansika_1'),
        (2, "Less Than Jake", 2, 'vansika'),
        (1, "Less Than Jake", 4, 'vansika'),
    )

    top_artist_df = None
    for credit_id, artist_name, listen_count, listener in entries:
        frame = utils.create_dataframe(
            Row(
                top_artist_credit_id=credit_id,
                top_artist_name=artist_name,
                total_count=listen_count,
                user_name=listener
            ),
            schema=None
        )
        top_artist_df = frame if top_artist_df is None else top_artist_df.union(frame)
    return top_artist_df
def get_similar_artist_candidate_set_df_html(self):
    """Return a two-row similar-artist candidate-set fixture (HTML variant).

    One row belongs to 'vansika_1' and one to 'vansika'.
    """
    first_row = Row(
        similar_artist_credit_id=2,
        similar_artist_name="blahblah",
        mb_artist_credit_id=1,
        mb_artist_credit_mbids=['xxx'],
        mb_recording_mbid='yyy',
        msb_artist_credit_name_matchable='blahblah',
        msb_recording_name_matchable='looloo',
        recording_id=2,
        user_name='vansika_1',
    )
    base = utils.create_dataframe(first_row, schema=None)

    second_row = Row(
        similar_artist_credit_id=1,
        similar_artist_name="Less Than Jake",
        mb_artist_credit_id=1,
        mb_artist_credit_mbids=['xxx'],
        mb_recording_mbid='yyy',
        msb_artist_credit_name_matchable='lessthanjake',
        msb_recording_name_matchable='lalal',
        recording_id=2,
        user_name='vansika',
    )
    return base.union(utils.create_dataframe(second_row, schema=None))
def get_mapped_listens(cls):
    """Return a two-row mapped-listens fixture DataFrame.

    One listen belongs to 'vansika' and one to 'rob'; both are stamped with
    ``datetime.utcnow()`` at call time and share the same field names.
    """
    mapped_listens_row_1 = Row(
        listened_at=datetime.utcnow(),
        mb_artist_credit_id=1,
        mb_artist_credit_mbids=["181c4177-f33a-441d-b15d-910acaf18b07"],
        mb_recording_mbid="3acb406f-c716-45f8-a8bd-96ca3939c2e5",
        mb_release_mbid="xxxxxx",
        msb_artist_credit_name_matchable="lessthanjake",
        msb_recording_name_matchable="Al's War",
        user_name='vansika',
    )
    df = utils.create_dataframe(mapped_listens_row_1, schema=None)

    mapped_listens_row_2 = Row(
        listened_at=datetime.utcnow(),
        mb_artist_credit_id=2,
        # Previously spelled ``mb_artist_mbids``. DataFrame.union pairs
        # columns by position, so the mismatch went unnoticed; the name now
        # agrees with the first row.
        mb_artist_credit_mbids=["281c4177-f33a-441d-b15d-910acaf18b07"],
        mb_recording_mbid="2acb406f-c716-45f8-a8bd-96ca3939c2e5",
        mb_release_mbid="xxxxxx",
        msb_artist_credit_name_matchable="kishorekumar",
        msb_recording_name_matchable="Mere Sapno ki Rani",
        user_name='rob',
    )
    mapped_listens_df = df.union(
        utils.create_dataframe(mapped_listens_row_2, schema=None))
    return mapped_listens_df
def get_similar_artist_df_html(self):
    """Return a three-row fixture pairing top artists with similar artists.

    Columns: ``top_artist_credit_id``, ``top_artist_name``,
    ``similar_artist_credit_id``, ``similar_artist_name`` and ``user_name``.
    """
    # (top id, top name, similar id, similar name, user)
    pairings = (
        (2, "blahblah", 10, 'Monali', 'vansika_1'),
        (2, "Less Than Jake", 1, 'shan', 'vansika'),
        (1, "Less Than Jake", 90, 'john', 'vansika'),
    )

    similar_artist_df_html = None
    for top_id, top_name, similar_id, similar_name, listener in pairings:
        frame = utils.create_dataframe(
            Row(
                top_artist_credit_id=top_id,
                top_artist_name=top_name,
                similar_artist_credit_id=similar_id,
                similar_artist_name=similar_name,
                user_name=listener
            ),
            schema=None
        )
        if similar_artist_df_html is None:
            similar_artist_df_html = frame
        else:
            similar_artist_df_html = similar_artist_df_html.union(frame)
    return similar_artist_df_html
def get_candidate_set(cls):
    """Return a two-row candidate set: (user 1, recording 1) and (user 2, recording 2)."""
    first = utils.create_dataframe(Row(user_id=1, recording_id=1), schema=None)
    second = utils.create_dataframe(Row(user_id=2, recording_id=2), schema=None)
    return first.union(second)
def get_similar_artist_rec_df(self):
    """Build a three-row DataFrame of similar-artist recommendation fixtures.

    Each row carries ``mb_recording_mbid``, ``rating``, ``recording_id``,
    ``user_id`` and ``user_name``; two rows belong to 'vansika_1' and one
    to 'rob'.
    """
    fixture_rows = (
        Row(mb_recording_mbid="2acb406f-c716-45f8-a8bd-96ca3939c2e5", rating=0.8,
            recording_id=5, user_id=8, user_name='vansika_1'),
        Row(mb_recording_mbid="8acb406f-c716-45f8-a8bd-96ca3939c2e5", rating=-2.8,
            recording_id=6, user_id=8, user_name='vansika_1'),
        Row(mb_recording_mbid="7acb406f-c716-45f8-a8bd-96ca3939c2e5", rating=0.19,
            recording_id=11, user_id=7, user_name='rob'),
    )

    combined = None
    for fixture in fixture_rows:
        frame = utils.create_dataframe(fixture, schema=None)
        combined = frame if combined is None else combined.union(frame)
    return combined
def test_append_artists_from_collaborations(self, mock_explode, mock_read_hdfs):
    """append_artists_from_collaborations should return two rows, 'vansika' then 'rob'.

    ``mock_explode`` and ``mock_read_hdfs`` are injected mocks; the test also
    verifies each is called exactly once with the expected argument.
    """
    top_artist_df = utils.create_dataframe(Row(
        top_artist_credit_id=2,
        top_artist_name='kishorekumar',
        user_name='vansika',
        mb_artist_credit_mbids=["6a70b322-9aa9-41b3-9dce-824733633a1c"],
        total_count=4),
        schema=None)

    mock_explode.return_value = utils.create_dataframe(Row(
        mb_artist_credit_mbids=["6a70b322-9aa9-41b3-9dce-824733633a1c"],
        user_name='rob',
        total_count=7),
        schema=None)

    mapping_df = utils.create_dataframe(Row(
        mb_artist_credit_mbids=["6a70b322-9aa9-41b3-9dce-824733633a1c"],
        msb_artist_credit_name_matchable='kishorekumar',
        mb_artist_credit_id=2,
    ), schema=None)
    mock_read_hdfs.return_value = mapping_df

    res_df = candidate_sets.append_artists_from_collaborations(top_artist_df)

    mock_explode.assert_called_once_with(top_artist_df)
    mock_read_hdfs.assert_called_once_with(path.MBID_MSID_MAPPING)
    self.assertEqual(res_df.count(), 2)

    # collect() is a Spark action: run it once so both assertions inspect
    # the same materialised rows instead of recomputing the frame.
    rows = res_df.collect()
    self.assertEqual(rows[0].user_name, 'vansika')
    self.assertEqual(rows[1].user_name, 'rob')
def get_users_df(cls):
    """Return a two-row users DataFrame: 'vansika' (id 1) and 'rob' (id 2)."""
    vansika = utils.create_dataframe(Row(user_name='vansika', user_id=1), schema=None)
    rob = utils.create_dataframe(Row(user_name='rob', user_id=2), schema=None)
    return vansika.union(rob)
def test_copy(self):
    """utils.copy should reproduce parquet files from nested directories.

    Three single-row frames are saved under ``self.path_`` (two in
    sub-directories, one at the top level), copied to ``self.temp_path_`` and
    compared row-for-row with the originals.
    """
    # Test directories
    for directory in (self.path_,
                      os.path.join(self.path_, "a"),
                      os.path.join(self.path_, "b")):
        utils.create_dir(directory)

    # Relative parquet location paired with the single row it will hold.
    fixtures = (
        (("a", "df_a.parquet"), Row(column1=1, column2=2)),
        (("b", "df_b.parquet"), Row(column1=3, column2=4)),
        (("df_c.parquet",), Row(column1=5, column2=6)),
    )

    # DataFrames to create parquets
    originals = [utils.create_dataframe([row], schema=None) for _, row in fixtures]

    # Save DataFrames in respective directories
    for (parts, _), frame in zip(fixtures, originals):
        utils.save_parquet(frame, os.path.join(self.path_, *parts))

    utils.copy(self.path_, self.temp_path_, overwrite=True)

    # Read copied DataFrames
    copies = [
        utils.read_files_from_HDFS(os.path.join(self.temp_path_, *parts))
        for parts, _ in fixtures
    ]

    # Check that each copy holds the same rows as its original
    for frame, copied in zip(originals, copies):
        self.assertListEqual(frame.rdd.map(list).collect(), copied.rdd.map(list).collect())
def create_df(self):
    """Return a three-row listens DataFrame: one row for 'user2', two for 'user1'.

    Apart from ``user_name`` every field is identical across the rows.
    """
    listens_df = None
    for listener in ('user2', 'user1', 'user1'):
        frame = utils.create_dataframe(
            Row(user_name=listener, artist_name='artist1', artist_msid='1', artist_mbids='1',
                track_name='test', recording_msid='1', recording_mbid='1',
                release_name='test', release_msid='1', release_mbid='1'),
            schema=None)
        listens_df = frame if listens_df is None else listens_df.union(frame)
    return listens_df
def test_get_top_artists_recording_ids(self):
    """The result should have columns [user_id, recording_id] and two rows."""
    recordings_df = self.get_recordings_df()

    less_than_jake = utils.create_dataframe(
        Row(mb_artist_credit_id=1, artist_name="Less Than Jake", count=1), schema=None)
    kishore_kumar = utils.create_dataframe(
        Row(mb_artist_credit_id=2, artist_name="Kishore Kumar", count=1), schema=None)
    top_artist_df = less_than_jake.union(kishore_kumar)

    result = candidate_sets.get_top_artists_recording_ids(top_artist_df, recordings_df, 1)

    self.assertListEqual(['user_id', 'recording_id'], result.columns)
    self.assertEqual(result.count(), 2)
def get_recommendation_df(self):
    """Return a two-row DataFrame of (recording_id, rating) recommendation fixtures."""
    first = utils.create_dataframe(Row(recording_id=1, rating=3.13456), schema=None)
    second = utils.create_dataframe(Row(recording_id=2, rating=6.994590001), schema=None)
    return first.union(second)
def test_get_latest_listen_ts(self):
    """get_latest_listen_ts should return 2020-05-18 for the saved fixture.

    The fixture holds that date plus ``offset_days(date, 7)``.
    NOTE(review): this presumes ``offset_days`` shifts backwards by default,
    making the base date the latest one; confirm against its definition.
    """
    base_date = datetime(2020, 5, 18)

    listens = utils.create_dataframe(Row(listened_at=base_date), schema=None)
    shifted = utils.create_dataframe(
        Row(listened_at=offset_days(base_date, 7)), schema=None)
    listens = listens.union(shifted)

    utils.save_parquet(listens, '{}/2020/5.parquet'.format(self.path_))

    latest = stats_utils.get_latest_listen_ts()
    self.assertEqual(base_date, latest)
def get_listens(cls):
    """Return four listen rows: two for 'vansika' and two for 'rob'.

    ``cls.date`` is set to the current UTC time as a side effect; the other
    three listens are placed relative to it with ``stats.offset_days``.
    """
    cls.date = datetime.utcnow()
    at_base = utils.create_dataframe(cls.get_listen_row(cls.date, 'vansika', 1), schema=None)

    window_plus_one = stats.offset_days(cls.date, cls.recommendation_generation_window + 1)
    at_window_plus_one = utils.create_dataframe(
        cls.get_listen_row(window_plus_one, 'vansika', 1), schema=None)

    one_day = stats.offset_days(cls.date, 1)
    at_one_day = utils.create_dataframe(cls.get_listen_row(one_day, 'rob', 2), schema=None)

    two_days = stats.offset_days(cls.date, 2)
    at_two_days = utils.create_dataframe(cls.get_listen_row(two_days, 'rob', 2), schema=None)

    return at_base.union(at_window_plus_one).union(at_one_day).union(at_two_days)
def test_append_dataframe(self):
    """Appending twice to the same parquet path should yield one row, then two."""
    target = self.path_ + '/test_df.parquet'

    first_batch = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
    utils.append(first_batch, target)
    after_first = utils.read_files_from_HDFS(target)
    self.assertEqual(after_first.count(), 1)

    second_batch = utils.create_dataframe([Row(column1=3, column2=4)], schema=None)
    utils.append(second_batch, target)
    after_second = utils.read_files_from_HDFS(target)
    self.assertEqual(after_second.count(), 2)
def get_listens(cls):
    """Return four listen rows: two for 'vansika' and two for 'rob'.

    ``cls.date`` is set to the current UTC time as a side effect; the other
    three listens are placed relative to it with ``stats.adjust_days``
    (the third one with ``shift_backwards=False``).
    """
    cls.date = datetime.utcnow()
    at_base = utils.create_dataframe(cls.get_listen_row(cls.date, 'vansika', 1), schema=None)

    window_plus_one = stats.adjust_days(cls.date, config.RECOMMENDATION_GENERATION_WINDOW + 1)
    at_window_plus_one = utils.create_dataframe(
        cls.get_listen_row(window_plus_one, 'vansika', 1), schema=None)

    one_day_forward = stats.adjust_days(cls.date, 1, shift_backwards=False)
    at_one_day_forward = utils.create_dataframe(
        cls.get_listen_row(one_day_forward, 'rob', 2), schema=None)

    two_days = stats.adjust_days(cls.date, 2)
    at_two_days = utils.create_dataframe(cls.get_listen_row(two_days, 'rob', 2), schema=None)

    return at_base.union(at_window_plus_one).union(at_one_day_forward).union(at_two_days)
def test_get_similar_artists(self):
    """Check get_similar_artists output shapes and its failure on unrelated relations."""
    # (id_0, name_0, id_1, name_1); every relation has score 1.0.
    relations = (
        (1, "Less Than Jake", 2, "blahblah"),
        (2, "blahblah", 3, "Katty Peri"),
        (3, "Katty Peri", 1, "Less Than Jake"),
    )
    artist_relation_df = None
    for left_id, left_name, right_id, right_name in relations:
        edge = utils.create_dataframe(
            Row(score=1.0, id_0=left_id, name_0=left_name, id_1=right_id, name_1=right_name),
            schema=None)
        artist_relation_df = edge if artist_relation_df is None else artist_relation_df.union(edge)

    top_artist_df = self.get_top_artist()
    similar_artist_limit = 10

    result = candidate_sets.get_similar_artists(
        top_artist_df, artist_relation_df, similar_artist_limit)
    similar_artist_df, similar_artist_df_html = result

    self.assertEqual(similar_artist_df.count(), 3)
    expected_columns = ['similar_artist_credit_id', 'similar_artist_name', 'user_name']
    self.assertListEqual(expected_columns, similar_artist_df.columns)

    self.assertEqual(similar_artist_df_html.count(), 4)
    expected_html_columns = [
        'top_artist_credit_id',
        'top_artist_name',
        'similar_artist_credit_id',
        'similar_artist_name',
        'user_name'
    ]
    self.assertListEqual(expected_html_columns, similar_artist_df_html.columns)

    # Relation ids 6 and 7 do not appear in the top-artist fixture.
    unrelated_df = utils.create_dataframe(Row(
        score=1.0, id_0=6, name_0="Less Than Jake", id_1=7, name_1="Wolfgang Amadeus Mozart"),
        schema=None)
    with self.assertRaises(SimilarArtistNotFetchedException):
        candidate_sets.get_similar_artists(top_artist_df, unrelated_df, similar_artist_limit)
def test_get_user_name_and_user_id(self):
    """Exercise get_user_name_and_user_id with empty, mixed and invalid user lists.

    Expectations checked here:
      * an empty list gives both users of the candidate set,
      * ['vansika', 'invalid'] gives only 'vansika',
      * ['invalid'] raises EmptyDataframeExcpetion.
    """
    params = self.get_recommendation_params()

    # (user_id, user_name, recording_id) for each candidate-set row.
    candidate_df = None
    for uid, name, rec_id in ((1, 'vansika', 1), (1, 'vansika', 2), (2, 'rob', 1)):
        single = utils.create_dataframe(
            Row(
                user_id=uid,
                user_name=name,
                recording_id=rec_id
            ),
            schema=None
        )
        candidate_df = single if candidate_df is None else candidate_df.union(single)
    params.top_artist_candidate_set_df = candidate_df

    expected_columns = sorted(['user_id', 'user_name'])

    users = []
    users_df = recommend.get_user_name_and_user_id(params, users)
    self.assertEqual(users_df.count(), 2)
    # collect() is a Spark action: run it once and reuse the rows.
    rows = users_df.collect()
    self.assertEqual(sorted(users_df.columns), expected_columns)
    self.assertEqual(['rob', 'vansika'], sorted(row.user_name for row in rows))
    self.assertEqual([1, 2], sorted(row.user_id for row in rows))

    users = ['vansika', 'invalid']
    users_df = recommend.get_user_name_and_user_id(params, users)
    self.assertEqual(users_df.count(), 1)
    self.assertEqual(sorted(users_df.columns), expected_columns)
    rows = users_df.collect()
    self.assertEqual(['vansika'], [row.user_name for row in rows])
    self.assertEqual([1], [row.user_id for row in rows])

    # Keep only the call under test inside the assertRaises body.
    users = ['invalid']
    with self.assertRaises(EmptyDataframeExcpetion):
        recommend.get_user_name_and_user_id(params, users)
def save_dataframe(self):
    """Save a three-row listens fixture to '/data/listenbrainz/2019/12.parquet'.

    The rows differ only in ``user_name``: one 'user2' followed by two 'user1'.
    """
    frames = [
        utils.create_dataframe(
            Row(user_name=listener, artist_name='artist1', artist_msid='1', artist_mbids='1',
                track_name='test', recording_msid='1', recording_mbid='1',
                release_name='test', release_msid='1', release_mbid='1'),
            schema=None)
        for listener in ('user2', 'user1', 'user1')
    ]
    first, second, third = frames
    listens_df = first.union(second).union(third)
    utils.save_parquet(listens_df, '/data/listenbrainz/2019/12.parquet')
def test_append_dataframe(self):
    """Appending twice to the same HDFS parquet path should yield one row, then two."""
    target = os.path.join(config.HDFS_CLUSTER_URI, 'test_df.parquet')

    first_batch = utils.create_dataframe(Row(column1=1, column2=2), schema=None)
    utils.append(first_batch, target)
    after_first = utils.read_files_from_HDFS(target)
    self.assertEqual(after_first.count(), 1)

    second_batch = utils.create_dataframe(Row(column1=3, column2=4), schema=None)
    utils.append(second_batch, target)
    after_second = utils.read_files_from_HDFS(target)
    self.assertEqual(after_second.count(), 2)
def get_recordings_df(cls):
    """Return a two-row recordings fixture.

    Columns: ``mb_recording_mbid``, ``mb_artist_credit_id`` and ``recording_id``.
    """
    first = utils.create_dataframe(
        Row(
            mb_recording_mbid="3acb406f-c716-45f8-a8bd-96ca3939c2e5",
            mb_artist_credit_id=1,
            recording_id=1,
        ),
        schema=None)
    second = utils.create_dataframe(
        Row(
            mb_recording_mbid="2acb406f-c716-45f8-a8bd-96ca3939c2e5",
            mb_artist_credit_id=2,
            recording_id=2,
        ),
        schema=None)
    return first.union(second)
def test_get_latest_dataframe_id(self):
    """get_latest_dataframe_id should report the id of the second metadata row."""

    def metadata_frame(dataframe_id):
        # Build a single-row metadata DataFrame for the given id.
        metadata = self.get_dataframe_metadata(dataframe_id)
        return utils.create_dataframe(
            schema.convert_dataframe_metadata_to_row(metadata),
            schema.dataframe_metadata_schema)

    first_id = "a36d6fc9-49d0-4789-a7dd-a2b72369ca45"
    first_frame = metadata_frame(first_id)

    second_id = "bbbd6fc9-49d0-4789-a7dd-a2b72369ca45"
    second_frame = metadata_frame(second_id)

    all_metadata = first_frame.union(second_frame)

    latest_id = train_models.get_latest_dataframe_id(all_metadata)
    self.assertEqual(latest_id, second_id)
def save_dataframe(self):
    """Expand ``user_top_artists.json`` into one row per listen and save it as parquet.

    Every JSON entry is repeated ``entry['count']`` times. The combined frame
    is written under ``self.path_`` as ``<year>/<month>.parquet`` for the
    current local date.
    """
    now = datetime.now()

    with open(self.path_to_data_file('user_top_artists.json')) as f:
        data = json.load(f)

    schema = StructType((
        StructField('user_name', StringType()),
        StructField('artist_name', StringType()),
        StructField('artist_msid', StringType()),
        StructField('artist_mbids', ArrayType(StringType())),
    ))

    # NOTE(review): df stays None when the fixture has no entries with a
    # positive count; save_parquet then receives None.
    df = None
    for entry in data:
        for _ in range(entry['count']):
            # One single-row frame per listen of this artist.
            row = utils.create_dataframe(Row(
                user_name=entry['user_name'],
                artist_name=entry['artist_name'],
                artist_msid=entry['artist_msid'],
                artist_mbids=entry['artist_mbids']),
                schema=schema)
            # Explicit None check: only "no frame yet" should take the first
            # branch, rather than relying on DataFrame truthiness.
            df = row if df is None else df.union(row)

    utils.save_parquet(
        df, os.path.join(self.path_, '{}/{}.parquet'.format(now.year, now.month)))
def test_delete_model(self):
    """After delete_model, RECOMMENDATION_RECORDING_DATA_DIR should no longer exist."""
    target_dir = path.RECOMMENDATION_RECORDING_DATA_DIR

    # Put something at the path so there is data to delete.
    placeholder = utils.create_dataframe(Row(col1=1, col2=1), None)
    utils.save_parquet(placeholder, target_dir)

    train_models.delete_model()

    self.assertFalse(utils.path_exists(target_dir))
def test_convert_string_datatype_to_array(self):
    """A string ``mbids`` value should come back as a one-element list in ``mb_artist_credit_mbids``."""
    mbid = "6a70b322-9aa9-41b3-9dce-824733633a1c"
    source_df = utils.create_dataframe(Row(mbids=mbid), schema=None)

    converted = candidate_sets.convert_string_datatype_to_array(source_df)
    first_row = converted.collect()[0]

    self.assertEqual(first_row.mb_artist_credit_mbids, [mbid])