Пример #1
0
    def upload_test_mapped_listens_to_HDFS(cls):
        """Create the mapped-listens dataframe and store it for later tests.

        Listens are fetched for the window ``cls.date``..``cls.date`` from
        ``cls.listens_path``, combined with the mapping read from
        ``cls.mapping_path`` and handed to ``utils.save_parquet`` with
        ``cls.mapped_listens_path`` as the destination.
        """
        # Start and end of the window are the same date.
        day = cls.date
        # NOTE(review): the meaning of the empty dict argument is not visible
        # from here -- confirm against get_listens_for_training_model_window.
        listens = create_dataframes.get_listens_for_training_model_window(
            day, day, {}, cls.listens_path
        )
        mapping = utils.read_files_from_HDFS(cls.mapping_path)

        utils.save_parquet(
            create_dataframes.get_mapped_artist_and_recording_mbids(listens, mapping),
            cls.mapped_listens_path,
        )
    def upload_test_mapped_listens_to_HDFS(cls):
        """Create the mapped-listens dataframe and save it to MAPPED_LISTENS_PATH.

        Listens are fetched for the window ``cls.date``..``cls.date`` from
        ``LISTENS_PATH`` and mapped against the data read from
        ``MAPPING_PATH``; the result is passed to ``utils.save_parquet``.
        """
        # Start and end of the window are the same date.
        day = cls.date
        # NOTE(review): the meaning of the empty dict argument is not visible
        # from here -- confirm against get_listens_for_training_model_window.
        listens = create_dataframes.get_listens_for_training_model_window(
            day, day, {}, LISTENS_PATH
        )
        mapping = utils.read_files_from_HDFS(MAPPING_PATH)

        utils.save_parquet(
            create_dataframes.get_mapped_artist_and_recording_mbids(listens, mapping),
            MAPPED_LISTENS_PATH,
        )
Пример #3
0
    def test_get_mapped_artist_and_recording_mbids(self):
        """Verify the mapped listens: row count, column names and saved path.

        The expected column names come from ``self.get_mapped_listens()``;
        both sides are sorted, so column order is not part of the check.
        """
        day = self.date
        listens = create_dataframes.get_listens_for_training_model_window(
            day, day, {}, self.listens_path
        )
        mapping = utils.read_files_from_HDFS(self.mapping_path)
        result = create_dataframes.get_mapped_artist_and_recording_mbids(
            listens, mapping
        )

        self.assertEqual(result.count(), 1)

        expected_columns = sorted(self.get_mapped_listens().columns)
        actual_columns = sorted(result.columns)
        self.assertListEqual(expected_columns, actual_columns)

        # path.MAPPED_LISTENS must exist once the mapping call has run.
        self.assertTrue(utils.path_exists(path.MAPPED_LISTENS))
    def test_get_mapped_artist_and_recording_mbids(self):
        """Verify the mapped listens: row count, column names and saved path.

        Listens for the window ``self.date``..``self.date`` are read from
        ``LISTENS_PATH`` and mapped against the data read from
        ``MAPPING_PATH``. The result must contain exactly one row, expose
        the expected set of columns, and ``path.MAPPED_LISTENS`` must exist
        afterwards.
        """
        partial_listen_df = create_dataframes.get_listens_for_training_model_window(
            self.date, self.date, {}, LISTENS_PATH)
        mapping_df = utils.read_files_from_HDFS(MAPPING_PATH)

        mapped_df = create_dataframes.get_mapped_artist_and_recording_mbids(
            partial_listen_df, mapping_df)
        self.assertEqual(mapped_df.count(), 1)

        complete_listen_col = [
            'artist_msid', 'artist_name', 'listened_at', 'recording_msid',
            'release_mbid', 'release_msid', 'release_name', 'tags',
            'track_name', 'user_name', 'mb_artist_credit_id',
            'mb_artist_credit_mbids', 'mb_recording_mbid', 'mb_release_mbid',
            'msb_artist_msid', 'msb_recording_msid', 'msb_release_msid',
        ]
        # Compare sorted names: the test is about which columns are present,
        # not the order in which the mapping step happens to emit them.
        self.assertListEqual(sorted(complete_listen_col),
                             sorted(mapped_df.columns))

        status = utils.path_exists(path.MAPPED_LISTENS)
        self.assertTrue(status)
    def test_get_mapped_artist_and_recording_mbids(self):
        """Verify the mapped listens: row count, column names and saved path.

        Eight rows are expected. Column names are compared after sorting,
        so column order is not part of the check.
        """
        day = self.date
        listens = create_dataframes.get_listens_for_training_model_window(
            day, day, {}, self.listens_path
        )
        mapping = utils.read_files_from_HDFS(self.mapping_path)
        result = create_dataframes.get_mapped_artist_and_recording_mbids(
            listens, mapping
        )

        self.assertEqual(result.count(), 8)

        expected_columns = sorted((
            'listened_at',
            'mb_artist_credit_id',
            'mb_artist_credit_mbids',
            'mb_recording_mbid',
            'mb_release_mbid',
            'msb_artist_credit_name_matchable',
            'msb_recording_name_matchable',
            'user_name',
        ))
        self.assertListEqual(expected_columns, sorted(result.columns))

        # path.MAPPED_LISTENS must exist once the mapping call has run.
        self.assertTrue(utils.path_exists(path.MAPPED_LISTENS))