Пример #1
0
    def get_top_artist_rec_df(self):
        """Return a three-row DataFrame of top-artist recommendation rows.

        Every row has the fields recording_mbid, rating, recording_id,
        user_id and user_name. Each row is turned into its own DataFrame
        and the frames are unioned in the order they are listed.
        """
        rows = (
            Row(recording_mbid="2acb406f-c716-45f8-a8bd-96ca3939c2e5",
                rating=1.8,
                recording_id=5,
                user_id=6,
                user_name='vansika'),
            Row(recording_mbid="8acb406f-c716-45f8-a8bd-96ca3939c2e5",
                rating=-0.8,
                recording_id=6,
                user_id=6,
                user_name='vansika'),
            Row(recording_mbid="8acb406f-c716-45f8-a8bd-96ca3939c2e5",
                rating=0.99,
                recording_id=6,
                user_id=7,
                user_name='rob'),
        )

        combined = None
        for row in rows:
            frame = utils.create_dataframe(row, schema=None)
            combined = frame if combined is None else combined.union(frame)
        return combined
    def test_recommendation_params_init(self):
        """RecommendationParams must expose each constructor argument as an attribute.

        DataFrames are compared by their sorted column names; the model and
        the two limits are compared by equality.
        """
        recordings = utils.create_dataframe(Row(col1=3, col2=9), schema=None)
        model = MagicMock()
        top_candidates = utils.create_dataframe(
            Row(col1=4, col2=5, col3=5), schema=None)
        similar_candidates = utils.create_dataframe(Row(col1=1), schema=None)
        top_limit = 20
        similar_limit = 40

        params = recommend.RecommendationParams(
            recordings,
            model,
            top_candidates,
            similar_candidates,
            top_limit,
            similar_limit,
        )

        self.assertEqual(sorted(params.recordings_df.columns),
                         sorted(recordings.columns))
        self.assertEqual(params.model, model)
        self.assertEqual(sorted(params.top_artist_candidate_set_df.columns),
                         sorted(top_candidates.columns))
        self.assertEqual(sorted(params.similar_artist_candidate_set_df.columns),
                         sorted(similar_candidates.columns))
        self.assertEqual(params.recommendation_top_artist_limit, top_limit)
        self.assertEqual(params.recommendation_similar_artist_limit,
                         similar_limit)
Пример #3
0
    def test_get_user_count(self):
        """get_user_count is expected to return 2 for rows with user ids 3, 3 and 2."""
        users_df = None
        for user_id in (3, 3, 2):
            frame = utils.create_dataframe(Row(user_id=user_id), schema=None)
            users_df = frame if users_df is None else users_df.union(frame)

        self.assertEqual(recommend.get_user_count(users_df), 2)
Пример #4
0
 def get_recordings_df(cls):
     """Return a two-row recordings DataFrame for use as test data.

     Both rows carry the same fields: mb_artist_credit_id,
     mb_artist_credit_mbids, mb_recording_mbid, mb_release_mbid,
     msb_artist_credit_name_matchable, recording_id and
     msb_recording_name_matchable.
     """
     df = utils.create_dataframe(
         Row(
             mb_artist_credit_id=1,
             mb_artist_credit_mbids=["181c4177-f33a-441d-b15d-910acaf18b07"],
             mb_recording_mbid="3acb406f-c716-45f8-a8bd-96ca3939c2e5",
             mb_release_mbid="xxxxxx",
             msb_artist_credit_name_matchable="lessthanjake",
             recording_id=1,
             msb_recording_name_matchable="Al's War",
         ),
         schema=None
     )
     recordings_df = df.union(utils.create_dataframe(
         Row(
             mb_artist_credit_id=2,
             # Same field name as the first row. union() matches columns by
             # position, so a differently named field here would be
             # silently folded into the first row's column.
             mb_artist_credit_mbids=["281c4177-f33a-441d-b15d-910acaf18b07"],
             mb_recording_mbid="2acb406f-c716-45f8-a8bd-96ca3939c2e5",
             mb_release_mbid="xxxxxx",
             msb_artist_credit_name_matchable="kishorekumar",
             recording_id=2,
             msb_recording_name_matchable="Mere Sapno ki Rani",
         ),
         schema=None
     ))
     return recordings_df
Пример #5
0
    def test_get_user_name_and_user_id(self):
        """Check row count and columns of get_user_name_and_user_id.

        The candidate set holds three rows for two users. An empty user list
        is expected to yield two rows, and filtering on 'vansika' one row;
        in both cases the columns are user_id and user_name.
        """
        params = self.get_recommendation_params()

        candidate_df = None
        for user_id, user_name, recording_id in ((1, 'vansika', 1),
                                                 (1, 'vansika', 2),
                                                 (2, 'rob', 1)):
            frame = utils.create_dataframe(
                Row(user_id=user_id,
                    user_name=user_name,
                    recording_id=recording_id),
                schema=None)
            candidate_df = frame if candidate_df is None else candidate_df.union(frame)

        params.top_artist_candidate_set_df = candidate_df
        expected_columns = sorted(['user_id', 'user_name'])

        all_users = recommend.get_user_name_and_user_id(params, [])
        self.assertEqual(all_users.count(), 2)
        self.assertEqual(sorted(all_users.columns), expected_columns)

        filtered_users = recommend.get_user_name_and_user_id(params, ['vansika'])
        self.assertEqual(filtered_users.count(), 1)
        self.assertEqual(sorted(filtered_users.columns), expected_columns)
    def get_top_artist(self):
        """Return a three-row top-artist DataFrame.

        Fields per row: top_artist_credit_id, top_artist_name, total_count
        and user_name. Rows are unioned in the order they are listed.
        """
        rows = [
            Row(top_artist_credit_id=2,
                top_artist_name="blahblah",
                total_count=10,
                user_name='vansika_1'),
            Row(top_artist_credit_id=2,
                top_artist_name="Less Than Jake",
                total_count=2,
                user_name='vansika'),
            Row(top_artist_credit_id=1,
                top_artist_name="Less Than Jake",
                total_count=4,
                user_name='vansika'),
        ]

        top_artist_df = None
        for row in rows:
            frame = utils.create_dataframe(row, schema=None)
            top_artist_df = frame if top_artist_df is None else top_artist_df.union(frame)
        return top_artist_df
    def get_similar_artist_candidate_set_df_html(self):
        """Return a two-row similar-artist candidate set DataFrame.

        One row belongs to user 'vansika_1' and one to 'vansika'; both share
        the same nine fields and are unioned in that order.
        """
        first_row = Row(
            similar_artist_credit_id=2,
            similar_artist_name="blahblah",
            mb_artist_credit_id=1,
            mb_artist_credit_mbids=['xxx'],
            mb_recording_mbid='yyy',
            msb_artist_credit_name_matchable='blahblah',
            msb_recording_name_matchable='looloo',
            recording_id=2,
            user_name='vansika_1',
        )
        second_row = Row(
            similar_artist_credit_id=1,
            similar_artist_name="Less Than Jake",
            mb_artist_credit_id=1,
            mb_artist_credit_mbids=['xxx'],
            mb_recording_mbid='yyy',
            msb_artist_credit_name_matchable='lessthanjake',
            msb_recording_name_matchable='lalal',
            recording_id=2,
            user_name='vansika',
        )

        first_frame = utils.create_dataframe(first_row, schema=None)
        second_frame = utils.create_dataframe(second_row, schema=None)
        return first_frame.union(second_frame)
Пример #8
0
    def get_mapped_listens(cls):
        """Return a two-row mapped-listens DataFrame for use as test data.

        One listen belongs to 'vansika' and one to 'rob'. Each is stamped
        with ``datetime.utcnow()`` at the moment its row is built. Both rows
        carry the same fields: listened_at, mb_artist_credit_id,
        mb_artist_credit_mbids, mb_recording_mbid, mb_release_mbid,
        msb_artist_credit_name_matchable, msb_recording_name_matchable and
        user_name.
        """
        mapped_listens_row_1 = Row(
            listened_at=datetime.utcnow(),
            mb_artist_credit_id=1,
            mb_artist_credit_mbids=["181c4177-f33a-441d-b15d-910acaf18b07"],
            mb_recording_mbid="3acb406f-c716-45f8-a8bd-96ca3939c2e5",
            mb_release_mbid="xxxxxx",
            msb_artist_credit_name_matchable="lessthanjake",
            msb_recording_name_matchable="Al's War",
            user_name='vansika',
        )
        df = utils.create_dataframe(mapped_listens_row_1, schema=None)

        mapped_listens_row_2 = Row(
            listened_at=datetime.utcnow(),
            mb_artist_credit_id=2,
            # Same field name as row 1. union() matches columns by position,
            # so a differently named field here would be silently folded
            # into row 1's column.
            mb_artist_credit_mbids=["281c4177-f33a-441d-b15d-910acaf18b07"],
            mb_recording_mbid="2acb406f-c716-45f8-a8bd-96ca3939c2e5",
            mb_release_mbid="xxxxxx",
            msb_artist_credit_name_matchable="kishorekumar",
            msb_recording_name_matchable="Mere Sapno ki Rani",
            user_name='rob',
        )
        mapped_listens_df = df.union(
            utils.create_dataframe(mapped_listens_row_2, schema=None))
        return mapped_listens_df
    def get_similar_artist_df_html(self):
        """Return a three-row DataFrame pairing top artists with similar artists.

        Fields per row: top_artist_credit_id, top_artist_name,
        similar_artist_credit_id, similar_artist_name and user_name.
        Rows are unioned in the order they are listed.
        """
        rows = [
            Row(top_artist_credit_id=2,
                top_artist_name="blahblah",
                similar_artist_credit_id=10,
                similar_artist_name='Monali',
                user_name='vansika_1'),
            Row(top_artist_credit_id=2,
                top_artist_name="Less Than Jake",
                similar_artist_credit_id=1,
                similar_artist_name='shan',
                user_name='vansika'),
            Row(top_artist_credit_id=1,
                top_artist_name="Less Than Jake",
                similar_artist_credit_id=90,
                similar_artist_name='john',
                user_name='vansika'),
        ]

        similar_artist_df_html = None
        for row in rows:
            frame = utils.create_dataframe(row, schema=None)
            if similar_artist_df_html is None:
                similar_artist_df_html = frame
            else:
                similar_artist_df_html = similar_artist_df_html.union(frame)
        return similar_artist_df_html
Пример #10
0
 def get_candidate_set(cls):
     """Return a two-row DataFrame of (user_id, recording_id): (1, 1) and (2, 2)."""
     first = utils.create_dataframe(Row(user_id=1, recording_id=1), schema=None)
     second = utils.create_dataframe(Row(user_id=2, recording_id=2), schema=None)
     return first.union(second)
Пример #11
0
    def get_similar_artist_rec_df(self):
        """Return a three-row DataFrame of similar-artist recommendation rows.

        Fields per row: mb_recording_mbid, rating, recording_id, user_id and
        user_name. Rows are unioned in the order they are listed.
        """
        rows = (
            Row(mb_recording_mbid="2acb406f-c716-45f8-a8bd-96ca3939c2e5",
                rating=0.8,
                recording_id=5,
                user_id=8,
                user_name='vansika_1'),
            Row(mb_recording_mbid="8acb406f-c716-45f8-a8bd-96ca3939c2e5",
                rating=-2.8,
                recording_id=6,
                user_id=8,
                user_name='vansika_1'),
            Row(mb_recording_mbid="7acb406f-c716-45f8-a8bd-96ca3939c2e5",
                rating=0.19,
                recording_id=11,
                user_id=7,
                user_name='rob'),
        )

        combined = None
        for row in rows:
            frame = utils.create_dataframe(row, schema=None)
            combined = frame if combined is None else combined.union(frame)
        return combined
    def test_append_artists_from_collaborations(self, mock_explode,
                                                mock_read_hdfs):
        """Check append_artists_from_collaborations against mocked helpers.

        Args:
            mock_explode: mock whose return value is a one-row frame for
                user 'rob'; it is expected to be called once with the
                top-artist frame.
            mock_read_hdfs: mock returning the mapping frame; it is expected
                to be called once with ``path.MBID_MSID_MAPPING``.

        The result is expected to hold two rows, 'vansika' first and 'rob'
        second.
        """
        mbid = "6a70b322-9aa9-41b3-9dce-824733633a1c"

        top_artist_df = utils.create_dataframe(Row(
            top_artist_credit_id=2,
            top_artist_name='kishorekumar',
            user_name='vansika',
            mb_artist_credit_mbids=[mbid],
            total_count=4),
                                               schema=None)
        mock_explode.return_value = utils.create_dataframe(Row(
            mb_artist_credit_mbids=[mbid],
            user_name='rob',
            total_count=7),
                                                           schema=None)

        mapping_df = utils.create_dataframe(Row(
            mb_artist_credit_mbids=[mbid],
            msb_artist_credit_name_matchable='kishorekumar',
            mb_artist_credit_id=2,
        ),
                                            schema=None)

        mock_read_hdfs.return_value = mapping_df
        res_df = candidate_sets.append_artists_from_collaborations(
            top_artist_df)

        mock_explode.assert_called_once_with(top_artist_df)
        mock_read_hdfs.assert_called_once_with(path.MBID_MSID_MAPPING)

        self.assertEqual(res_df.count(), 2)
        # Collect once so both row checks read the same materialised result
        # instead of running a separate Spark job for each assertion.
        rows = res_df.collect()
        self.assertEqual(rows[0].user_name, 'vansika')
        self.assertEqual(rows[1].user_name, 'rob')
Пример #13
0
 def get_users_df(cls):
     """Return a two-row users DataFrame: ('vansika', 1) and ('rob', 2)."""
     vansika = utils.create_dataframe(Row(user_name='vansika', user_id=1), schema=None)
     rob = utils.create_dataframe(Row(user_name='rob', user_id=2), schema=None)
     return vansika.union(rob)
Пример #14
0
    def test_copy(self):
        """utils.copy should reproduce a tree of parquet files at the destination.

        Three single-row frames are saved under ``self.path_`` (two in the
        sub-directories "a" and "b", one at the top level), the tree is
        copied to ``self.temp_path_`` and each copy is compared row by row
        with the frame it was saved from.
        """
        # Source directory tree.
        utils.create_dir(self.path_)
        for sub_dir in ("a", "b"):
            utils.create_dir(os.path.join(self.path_, sub_dir))

        # Each fixture pairs a frame with its path components below the root.
        fixtures = [
            (utils.create_dataframe([Row(column1=1, column2=2)], schema=None),
             ("a", "df_a.parquet")),
            (utils.create_dataframe([Row(column1=3, column2=4)], schema=None),
             ("b", "df_b.parquet")),
            (utils.create_dataframe([Row(column1=5, column2=6)], schema=None),
             ("df_c.parquet",)),
        ]

        for frame, parts in fixtures:
            utils.save_parquet(frame, os.path.join(self.path_, *parts))

        utils.copy(self.path_, self.temp_path_, overwrite=True)

        # Read every copied file back before comparing any of them.
        copies = [
            utils.read_files_from_HDFS(os.path.join(self.temp_path_, *parts))
            for _, parts in fixtures
        ]

        for (frame, _), copied in zip(fixtures, copies):
            self.assertListEqual(frame.rdd.map(list).collect(),
                                 copied.rdd.map(list).collect())
Пример #15
0
 def create_df(self):
     """Return a three-row listens DataFrame: one row for 'user2', then two for 'user1'.

     Apart from user_name, every field has the same value in all rows.
     """
     shared_fields = dict(
         artist_name='artist1', artist_msid='1', artist_mbids='1',
         track_name='test', recording_msid='1', recording_mbid='1',
         release_name='test', release_msid='1', release_mbid='1',
     )

     listens_df = None
     for user_name in ('user2', 'user1', 'user1'):
         frame = utils.create_dataframe(
             Row(user_name=user_name, **shared_fields), schema=None)
         listens_df = frame if listens_df is None else listens_df.union(frame)
     return listens_df
Пример #16
0
 def test_get_top_artists_recording_ids(self):
     """get_top_artists_recording_ids should return user_id/recording_id columns and two rows."""
     recordings_df = self.get_recordings_df()

     less_than_jake = utils.create_dataframe(
         Row(mb_artist_credit_id=1, artist_name="Less Than Jake", count=1),
         schema=None)
     kishore_kumar = utils.create_dataframe(
         Row(mb_artist_credit_id=2, artist_name="Kishore Kumar", count=1),
         schema=None)
     top_artist_df = less_than_jake.union(kishore_kumar)

     result = candidate_sets.get_top_artists_recording_ids(
         top_artist_df, recordings_df, 1)

     self.assertListEqual(['user_id', 'recording_id'], result.columns)
     self.assertEqual(result.count(), 2)
Пример #17
0
    def get_recommendation_df(self):
        """Return a two-row DataFrame of (recording_id, rating) pairs."""
        first = utils.create_dataframe(
            Row(recording_id=1, rating=3.13456), schema=None)
        second = utils.create_dataframe(
            Row(recording_id=2, rating=6.994590001), schema=None)
        return first.union(second)
    def test_get_latest_listen_ts(self):
        """get_latest_listen_ts should return 2020-05-18 for the saved listens.

        Two listens are written to ``<self.path_>/2020/5.parquet``: one at
        2020-05-18 and one at ``offset_days(date, 7)``.
        """
        date = datetime(2020, 5, 18)
        # NOTE(review): the expected value implies offset_days shifts backwards
        # by default - confirm against its definition.
        listens_df = utils.create_dataframe(
            Row(listened_at=date), schema=None
        ).union(
            utils.create_dataframe(
                Row(listened_at=offset_days(date, 7)), schema=None)
        )
        parquet_path = '{}/2020/5.parquet'.format(self.path_)
        utils.save_parquet(listens_df, parquet_path)

        self.assertEqual(date, stats_utils.get_latest_listen_ts())
 def get_listens(cls):
     """Return a four-row listens DataFrame and store the base timestamp on ``cls.date``.

     Two listens belong to 'vansika' (at ``cls.date`` and offset by the
     recommendation generation window plus one day) and two to 'rob'
     (offset by one and by two days).
     """
     def listen_frame(listened_at, user_name, user_id):
         # One single-row frame per listen.
         return utils.create_dataframe(
             cls.get_listen_row(listened_at, user_name, user_id), schema=None)

     cls.date = datetime.utcnow()
     frames = [listen_frame(cls.date, 'vansika', 1)]
     frames.append(listen_frame(
         stats.offset_days(cls.date, cls.recommendation_generation_window + 1),
         'vansika', 1))
     frames.append(listen_frame(stats.offset_days(cls.date, 1), 'rob', 2))
     frames.append(listen_frame(stats.offset_days(cls.date, 2), 'rob', 2))

     test_mapped_df = frames[0]
     for frame in frames[1:]:
         test_mapped_df = test_mapped_df.union(frame)
     return test_mapped_df
Пример #20
0
    def test_append_dataframe(self):
        """utils.append should add one row to the parquet file on every call.

        After the first append the file is expected to hold one row, after
        the second two rows.
        """
        hdfs_path = self.path_ + '/test_df.parquet'

        for expected_count, (first, second) in enumerate(((1, 2), (3, 4)), start=1):
            frame = utils.create_dataframe(
                [Row(column1=first, column2=second)], schema=None)
            utils.append(frame, hdfs_path)
            stored = utils.read_files_from_HDFS(hdfs_path)
            self.assertEqual(stored.count(), expected_count)
Пример #21
0
 def get_listens(cls):
     """Return a four-row listens DataFrame and store the base timestamp on ``cls.date``.

     Two listens belong to 'vansika' (at ``cls.date`` and adjusted by
     ``config.RECOMMENDATION_GENERATION_WINDOW + 1`` days) and two to 'rob'
     (adjusted by one day with ``shift_backwards=False`` and by two days
     with the default direction).
     """
     def listen_frame(listened_at, user_name, user_id):
         # One single-row frame per listen.
         return utils.create_dataframe(
             cls.get_listen_row(listened_at, user_name, user_id), schema=None)

     cls.date = datetime.utcnow()
     frames = [listen_frame(cls.date, 'vansika', 1)]
     frames.append(listen_frame(
         stats.adjust_days(cls.date, config.RECOMMENDATION_GENERATION_WINDOW + 1),
         'vansika', 1))
     frames.append(listen_frame(
         stats.adjust_days(cls.date, 1, shift_backwards=False), 'rob', 2))
     frames.append(listen_frame(stats.adjust_days(cls.date, 2), 'rob', 2))

     test_mapped_df = frames[0]
     for frame in frames[1:]:
         test_mapped_df = test_mapped_df.union(frame)
     return test_mapped_df
    def test_get_similar_artists(self):
        """Check get_similar_artists on a cyclic relation and on an unrelated one.

        With three artists related in a cycle, the call is expected to return
        a 3-row frame (similar artist id, name, user) and a 4-row "html"
        frame that also carries the top artist columns. With a relation frame
        containing only ids 6 and 7 it is expected to raise
        SimilarArtistNotFetchedException.
        """
        relations = (
            (1, "Less Than Jake", 2, "blahblah"),
            (2, "blahblah", 3, "Katty Peri"),
            (3, "Katty Peri", 1, "Less Than Jake"),
        )
        artist_relation_df = None
        for id_0, name_0, id_1, name_1 in relations:
            frame = utils.create_dataframe(
                Row(score=1.0,
                    id_0=id_0,
                    name_0=name_0,
                    id_1=id_1,
                    name_1=name_1),
                schema=None)
            if artist_relation_df is None:
                artist_relation_df = frame
            else:
                artist_relation_df = artist_relation_df.union(frame)

        top_artist_df = self.get_top_artist()
        similar_artist_limit = 10

        result = candidate_sets.get_similar_artists(
            top_artist_df, artist_relation_df, similar_artist_limit)
        similar_artist_df, similar_artist_df_html = result

        self.assertEqual(similar_artist_df.count(), 3)
        self.assertListEqual(
            ['similar_artist_credit_id', 'similar_artist_name', 'user_name'],
            similar_artist_df.columns)

        self.assertEqual(similar_artist_df_html.count(), 4)
        self.assertListEqual(
            ['top_artist_credit_id', 'top_artist_name',
             'similar_artist_credit_id', 'similar_artist_name', 'user_name'],
            similar_artist_df_html.columns)

        # A relation frame whose ids (6, 7) do not appear in the top artist frame.
        unrelated_df = utils.create_dataframe(
            Row(score=1.0,
                id_0=6,
                name_0="Less Than Jake",
                id_1=7,
                name_1="Wolfgang Amadeus Mozart"),
            schema=None)
        with self.assertRaises(SimilarArtistNotFetchedException):
            candidate_sets.get_similar_artists(
                top_artist_df, unrelated_df, similar_artist_limit)
Пример #23
0
    def test_get_user_name_and_user_id(self):
        """Check get_user_name_and_user_id for three different user filters.

        The candidate set holds three rows for two users ('vansika' with
        id 1, 'rob' with id 2). Expected outcomes:

        * empty user list: both users are returned;
        * ['vansika', 'invalid']: only 'vansika' is returned;
        * ['invalid']: EmptyDataframeExcpetion is raised.
        """
        params = self.get_recommendation_params()
        df = utils.create_dataframe(
            Row(
                user_id=1,
                user_name='vansika',
                recording_id=1
            ),
            schema=None
        )

        df = df.union(utils.create_dataframe(
            Row(
                user_id=1,
                user_name='vansika',
                recording_id=2
            ),
            schema=None
        ))

        df = df.union(utils.create_dataframe(
            Row(
                user_id=2,
                user_name='rob',
                recording_id=1
            ),
            schema=None
        ))

        params.top_artist_candidate_set_df = df
        expected_columns = sorted(['user_id', 'user_name'])

        # Case 1: no filter. Pass the variable itself rather than a second
        # literal so the setup and the call cannot drift apart.
        users = []
        users_df = recommend.get_user_name_and_user_id(params, users)

        self.assertEqual(users_df.count(), 2)
        # Collect once and derive both lists from the same rows.
        rows = users_df.collect()
        user_name = sorted(row.user_name for row in rows)
        user_id = sorted(row.user_id for row in rows)
        self.assertEqual(sorted(users_df.columns), expected_columns)
        self.assertEqual(['rob', 'vansika'], user_name)
        self.assertEqual([1, 2], user_id)

        # Case 2: one known and one unknown user name.
        users = ['vansika', 'invalid']
        users_df = recommend.get_user_name_and_user_id(params, users)
        self.assertEqual(users_df.count(), 1)
        self.assertEqual(sorted(users_df.columns), expected_columns)
        rows = users_df.collect()
        user_name = [row.user_name for row in rows]
        user_id = [row.user_id for row in rows]
        self.assertEqual(['vansika'], user_name)
        self.assertEqual([1], user_id)

        # Case 3: only unknown user names. Keep the assertRaises body down to
        # the single call that is expected to raise.
        users = ['invalid']
        with self.assertRaises(EmptyDataframeExcpetion):
            recommend.get_user_name_and_user_id(params, users)
Пример #24
0
 def save_dataframe(self):
     """Save a three-row listens DataFrame to '/data/listenbrainz/2019/12.parquet'.

     The rows are one listen for 'user2' followed by two for 'user1'; apart
     from user_name, every field has the same value in all rows.
     """
     shared_fields = dict(
         artist_name='artist1', artist_msid='1', artist_mbids='1',
         track_name='test', recording_msid='1', recording_mbid='1',
         release_name='test', release_msid='1', release_mbid='1',
     )
     frames = [
         utils.create_dataframe(Row(user_name=user_name, **shared_fields), schema=None)
         for user_name in ('user2', 'user1', 'user1')
     ]

     listens_df = frames[0]
     for frame in frames[1:]:
         listens_df = listens_df.union(frame)
     utils.save_parquet(listens_df, '/data/listenbrainz/2019/12.parquet')
Пример #25
0
    def test_append_dataframe(self):
        """utils.append should add one row to the parquet file on every call.

        The target is 'test_df.parquet' joined onto ``config.HDFS_CLUSTER_URI``;
        it is expected to hold one row after the first append and two after
        the second.
        """
        path_ = 'test_df.parquet'
        hdfs_path = os.path.join(config.HDFS_CLUSTER_URI, path_)

        for expected_count, (first, second) in enumerate(((1, 2), (3, 4)), start=1):
            frame = utils.create_dataframe(
                Row(column1=first, column2=second), schema=None)
            utils.append(frame, hdfs_path)
            stored = utils.read_files_from_HDFS(hdfs_path)
            self.assertEqual(stored.count(), expected_count)
Пример #26
0
 def get_recordings_df(cls):
     """Return a two-row recordings DataFrame.

     Fields per row: mb_recording_mbid, mb_artist_credit_id and recording_id.
     """
     first = utils.create_dataframe(
         Row(mb_recording_mbid="3acb406f-c716-45f8-a8bd-96ca3939c2e5",
             mb_artist_credit_id=1,
             recording_id=1),
         schema=None)
     second = utils.create_dataframe(
         Row(mb_recording_mbid="2acb406f-c716-45f8-a8bd-96ca3939c2e5",
             mb_artist_credit_id=2,
             recording_id=2),
         schema=None)
     return first.union(second)
Пример #27
0
    def test_get_latest_dataframe_id(self):
        """get_latest_dataframe_id is expected to return the id of the second metadata row."""
        df_id_1 = "a36d6fc9-49d0-4789-a7dd-a2b72369ca45"
        metadata_1 = self.get_dataframe_metadata(df_id_1)
        frame_1 = utils.create_dataframe(
            schema.convert_dataframe_metadata_to_row(metadata_1),
            schema.dataframe_metadata_schema)

        df_id_2 = "bbbd6fc9-49d0-4789-a7dd-a2b72369ca45"
        metadata_2 = self.get_dataframe_metadata(df_id_2)
        frame_2 = utils.create_dataframe(
            schema.convert_dataframe_metadata_to_row(metadata_2),
            schema.dataframe_metadata_schema)

        latest_id = train_models.get_latest_dataframe_id(frame_1.union(frame_2))
        self.assertEqual(latest_id, df_id_2)
Пример #28
0
    def save_dataframe(self):
        """Expand user_top_artists.json into listens and save them as parquet.

        Every entry in the JSON data file contributes ``entry['count']``
        identical rows (user_name, artist_name, artist_msid, artist_mbids).
        The rows are unioned into one DataFrame and written to
        ``<self.path_>/<year>/<month>.parquet`` for the current local date.
        """
        now = datetime.now()

        with open(self.path_to_data_file('user_top_artists.json')) as f:
            data = json.load(f)

        schema = StructType((
            StructField('user_name', StringType()),
            StructField('artist_name', StringType()),
            StructField('artist_msid', StringType()),
            StructField('artist_mbids', ArrayType(StringType())),
        ))

        df = None
        for entry in data:
            # The row content does not depend on the repetition index, so
            # build it once per entry.
            listen = Row(
                user_name=entry['user_name'],
                artist_name=entry['artist_name'],
                artist_msid=entry['artist_msid'],
                artist_mbids=entry['artist_mbids'])
            # One single-row frame per listen: count N contributes N rows.
            for _ in range(entry['count']):
                row = utils.create_dataframe(listen, schema=schema)
                # Test against None explicitly instead of relying on the
                # truthiness of a DataFrame object.
                df = row if df is None else df.union(row)

        utils.save_parquet(
            df,
            os.path.join(self.path_,
                         '{}/{}.parquet'.format(now.year, now.month)))
Пример #29
0
    def test_delete_model(self):
        """After delete_model, RECOMMENDATION_RECORDING_DATA_DIR should no longer exist."""
        target_dir = path.RECOMMENDATION_RECORDING_DATA_DIR
        placeholder_df = utils.create_dataframe(Row(col1=1, col2=1), None)
        utils.save_parquet(placeholder_df, target_dir)

        train_models.delete_model()

        self.assertFalse(utils.path_exists(target_dir))
    def test_convert_string_datatype_to_array(self):
        """A string ``mbids`` value should come back as a one-element list.

        The converted frame is read through the ``mb_artist_credit_mbids``
        attribute of its first row.
        """
        mbid = "6a70b322-9aa9-41b3-9dce-824733633a1c"
        source_df = utils.create_dataframe(Row(mbids=mbid), schema=None)

        converted_df = candidate_sets.convert_string_datatype_to_array(source_df)
        first_row = converted_df.collect()[0]

        self.assertEqual(first_row.mb_artist_credit_mbids, [mbid])