def test_get_available_dumps(self, mock_ftp):
        """get_available_dumps keeps only names matching the given mapping
        prefix and raises DumpNotFoundException when nothing matches."""
        dump = [
            'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
            'msid-mbid-mapping-with-text-20180603-202000.tar.bz2',
            'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
            # NOTE(review): a missing trailing comma previously merged the next
            # two entries into one string via implicit literal concatenation;
            # restored as two distinct (non-matching) entries.
            'msid-mbid-mapping-with-matchable-xxxx-20200603-202732.tar.bz2',
            'msid-mbid-mapping-with-matchable-20100603-202732.tar.bz2.md5',
        ]

        mapping = ListenbrainzDataDownloader().get_available_dumps(
            dump, 'msid-mbid-mapping-with-matchable')

        # Only the well-formed "matchable" archives should survive filtering.
        expected_mapping = [
            'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
            'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
        ]

        self.assertEqual(mapping, expected_mapping)

        dump = [
            'msid-mbid-mapping-with-text-20180603-202000.tar.bz2',
            'msid-mbid-mapping-with-matchable-20100603-202732.tar.bz2.md5',
        ]

        # No matching dump at all -> the downloader must raise.
        with self.assertRaises(DumpNotFoundException):
            ListenbrainzDataDownloader().get_available_dumps(
                dump, 'msid-mbid-mapping-with-matchable')
示例#2
0
def upload_mapping():
    """Download the MSID-MBID mapping over FTP and push it to HDFS."""
    from listenbrainz_spark.ftp.download import ListenbrainzDataDownloader
    from listenbrainz_spark.hdfs.upload import ListenbrainzDataUploader
    archive, _ = ListenbrainzDataDownloader().download_msid_mbid_mapping(path.FTP_FILES_PATH)
    ListenbrainzDataUploader().upload_mapping(archive)
示例#3
0
def upload_artist_relation():
    """Download the artist relation dump over FTP and push it to HDFS."""
    from listenbrainz_spark.ftp.download import ListenbrainzDataDownloader
    from listenbrainz_spark.hdfs.upload import ListenbrainzDataUploader
    archive, _ = ListenbrainzDataDownloader().download_artist_relation(path.FTP_FILES_PATH)
    ListenbrainzDataUploader().upload_artist_relation(archive)
示例#4
0
    def test_get_dump_name_to_download(self, mock_ftp_cons):
        """get_dump_name_to_download picks the dump by id when one is given,
        the latest otherwise, and raises for an unknown id."""
        available = ['listenbrainz-01-00000', 'listenbrainz-02-00000']

        # explicit dump id -> the matching entry
        self.assertEqual(
            ListenbrainzDataDownloader().get_dump_name_to_download(available, '01', 1),
            'listenbrainz-01-00000')

        # no id -> the latest available dump
        self.assertEqual(
            ListenbrainzDataDownloader().get_dump_name_to_download(available, None, 1),
            'listenbrainz-02-00000')

        # unknown id -> DumpNotFoundException
        with self.assertRaises(DumpNotFoundException):
            ListenbrainzDataDownloader().get_dump_name_to_download(available, '03', 1)
def upload_artist_relation(force):
    """Fetch the artist relation dump over FTP and upload it to HDFS.

    Args:
        force: passed through to the uploader (overwrite existing data).
    """
    from listenbrainz_spark.ftp.download import ListenbrainzDataDownloader
    from listenbrainz_spark.hdfs.upload import ListenbrainzDataUploader
    with app.app_context():
        archive = ListenbrainzDataDownloader().download_artist_relation(path.FTP_FILES_PATH)
        ListenbrainzDataUploader().upload_artist_relation(archive, force=force)
def upload_mapping(force):
    """Fetch the MSID-MBID mapping over FTP and upload it to HDFS.

    Args:
        force: passed through to the uploader (overwrite existing data).
    """
    from listenbrainz_spark.ftp.download import ListenbrainzDataDownloader
    from listenbrainz_spark.hdfs.upload import ListenbrainzDataUploader
    with app.app_context():
        archive = ListenbrainzDataDownloader().download_msid_mbid_mapping(path.FTP_FILES_PATH)
        ListenbrainzDataUploader().upload_mapping(archive, force=force)
def upload_listens(force, incremental, id):
    """Fetch a listens dump over FTP and upload it to HDFS.

    Args:
        force: passed through to the uploader (overwrite existing data).
        incremental: fetch an incremental dump instead of a full one.
        id: id of the listens dump to fetch; latest when None.
    """
    from listenbrainz_spark.ftp.download import ListenbrainzDataDownloader
    from listenbrainz_spark.hdfs.upload import ListenbrainzDataUploader
    with app.app_context():
        if incremental:
            dump_type = 'incremental'
        else:
            dump_type = 'full'
        archive, _ = ListenbrainzDataDownloader().download_listens(
            directory=path.FTP_FILES_PATH, listens_dump_id=id, dump_type=dump_type)
        ListenbrainzDataUploader().upload_listens(archive, force=force)
示例#8
0
    def test_get_listens_dump_file_name(self, mock_ftp_cons):
        """A dump directory name maps to the matching spark archive name."""
        full_name = ListenbrainzDataDownloader().get_listens_dump_file_name(
            'listenbrainz-dump-17-20190101-000001-full/')
        self.assertEqual('listenbrainz-spark-dump-17-20190101-000001-full.tar',
                         full_name)

        incremental_name = ListenbrainzDataDownloader().get_listens_dump_file_name(
            'listenbrainz-dump-17-20190101-000001-incremental/')
        self.assertEqual('listenbrainz-spark-dump-17-20190101-000001-incremental.tar',
                         incremental_name)
    def test_download_msid_mbid_mapping(self, mock_ftp_cons, mock_available_dump, mock_list_dir,
                                        mock_latest_mapping, mock_download_dump):
        """download_msid_mbid_mapping lists the FTP dir, filters the matchable
        dumps, picks the latest one and downloads it."""
        directory = '/fakedir'
        mock_list_dir.return_value = [
            'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
            'msid-mbid-mapping-with-text-20180603-202000.tar.bz2',
            'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
            # NOTE(review): a missing trailing comma previously merged the next
            # two entries into one string via implicit literal concatenation.
            # Harmless here (the filter is mocked), but restored for clarity.
            'msid-mbid-mapping-with-matchable-xxxx-20200603-202732.tar.bz2',
            'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2.md5',
        ]
        mock_available_dump.return_value = [
            'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
            'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
        ]
        mock_latest_mapping.return_value = 'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2'
        dest_path, filename = ListenbrainzDataDownloader().download_msid_mbid_mapping(directory)

        # The downloader must cwd into the mapping dir and filter what it lists.
        mock_ftp_cons.return_value.cwd.assert_called_once_with(config.FTP_MSID_MBID_DIR)
        mock_available_dump.assert_called_once_with(mock_list_dir.return_value, 'msid-mbid-mapping-with-matchable')

        # The latest mapping is chosen from the filtered list and downloaded.
        mock_latest_mapping.assert_called_once_with([
            'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
            'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
        ])
        mock_download_dump.assert_called_once_with('msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2', directory)
        self.assertEqual(dest_path, mock_download_dump.return_value)
        self.assertEqual(filename, 'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2')
示例#10
0
 def test_download_msid_mbid_mapping(self, mock_ftp_cons, mock_spark_dump):
     """Mapping download delegates to download_spark_dump_and_get_path."""
     result = ListenbrainzDataDownloader().download_msid_mbid_mapping('/fakedir', 1)
     mock_spark_dump.assert_called_once_with(
         '/fakedir', 1, config.FTP_MSID_MBID_DIR, MAPPING_DUMP_ID_POS)
     self.assertEqual(result, mock_spark_dump.return_value)
示例#11
0
def import_incremental_dump_to_hdfs(dump_id: int = None) -> str:
    """ Import the incremental dump with the given dump_id if specified otherwise the
     latest incremental dump.

    Notes:
        All incremental dumps are stored together in incremental.parquet inside the
        listens directory.
    Args:
        dump_id: id of the incremental dump to be imported
    Returns:
        the name of the imported dump
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        downloader = ListenbrainzDataDownloader()
        src, dump_name, dump_id = downloader.download_listens(
            directory=temp_dir,
            dump_type=DumpType.INCREMENTAL,
            listens_dump_id=dump_id
        )
        # Close the FTP connection before the upload, consistent with
        # import_full_dump_to_hdfs (previously the connection was left open).
        downloader.connection.close()

        # instantiating ListenbrainzDataUploader creates a spark session which
        # is a bit non-intuitive.
        # FIXME in future to make initializing of spark session more explicit?
        ListenbrainzDataUploader().upload_new_listens_incremental_dump(src)
    utils.insert_dump_data(dump_id, DumpType.INCREMENTAL, datetime.utcnow())
    return dump_name
    def test_download_artist_relation(self, mock_ftp_cons, mock_list_dir,
                                      mock_download_dump, mock_dump_archive):
        """Artist relation download walks into the chosen dump dir and
        fetches its archive."""
        directory = '/fakedir'
        chosen_dir = 'artist-credit-artist-credit-relations-02-20191230-134806/'
        archive = 'artist-credit-artist-credit-relations-02-20191230-134806.tar.bz2'
        mock_list_dir.return_value = [
            'artist-credit-artist-credit-relations-01-20191230-134806/',
            chosen_dir,
        ]
        mock_dump_archive.return_value = archive

        dest_path, filename = ListenbrainzDataDownloader().download_artist_relation(directory)

        mock_list_dir.assert_called_once()
        mock_ftp_cons.return_value.cwd.assert_has_calls([
            call(config.FTP_ARTIST_RELATION_DIR),
            call(chosen_dir),
        ])
        self.assertEqual(archive, filename)
        mock_dump_archive.assert_called_once_with(chosen_dir)
        mock_download_dump.assert_called_once_with(mock_dump_archive.return_value, directory)
        self.assertEqual(dest_path, mock_download_dump.return_value)
示例#13
0
def import_newest_incremental_dump_handler():
    """Import every incremental dump not yet imported since the last full dump.

    Falls back to importing just the latest incremental dump when no full dump
    has ever been imported. Returns a single-element list summarising the
    imported dumps and any errors encountered.
    """
    errors = []
    imported_dumps = []
    latest_full_dump = utils.get_latest_full_dump()
    if latest_full_dump is None:
        # If no prior full dump is present, just import the latest incremental dump
        imported_dumps.append(import_incremental_dump_to_hdfs(dump_id=None))

        error_msg = "No previous full dump found, importing latest incremental dump"
        errors.append(error_msg)
        logger.warning(error_msg, exc_info=True)
    else:
        # Import all missing dumps from last full dump import
        start_id = latest_full_dump["dump_id"] + 1
        imported_at = latest_full_dump["imported_at"]
        end_id = ListenbrainzDataDownloader().get_latest_dump_id(DumpType.INCREMENTAL) + 1

        for dump_id in range(start_id, end_id):
            if not utils.search_dump(dump_id, DumpType.INCREMENTAL, imported_at):
                try:
                    imported_dumps.append(import_incremental_dump_to_hdfs(dump_id))
                except Exception as e:
                    # Skip current dump if any error occurs during import
                    error_msg = f"Error while importing incremental dump with ID {dump_id}: {e}"
                    errors.append(error_msg)
                    logger.error(error_msg, exc_info=True)
                    continue
            # (a no-op `dump_id += 1` was removed here: the loop variable is
            # rebound by `range` on every iteration, so it had no effect)
            request_consumer.rc.ping()
    return [{
        'type': 'import_incremental_dump',
        'imported_dump': imported_dumps,
        'errors': errors,
        'time': str(datetime.utcnow()),
    }]
示例#14
0
 def test_download_artist_relation(self, mock_ftp_cons, mock_spark_dump):
     """Artist relation download delegates to download_spark_dump_and_get_path."""
     result = ListenbrainzDataDownloader().download_artist_relation('/fakedir', 1)
     mock_spark_dump.assert_called_once_with(
         '/fakedir', 1, config.FTP_ARTIST_RELATION_DIR, ARTIST_RELATION_DUMP_ID_POS)
     self.assertEqual(result, mock_spark_dump.return_value)
    def test_download_listens_incremental_dump_by_id(self, mock_ftp,
                                                     mock_list_dir,
                                                     mock_get_f_name,
                                                     mock_download_dump):
        """Requesting a specific incremental dump id downloads exactly that dump."""
        wanted_dir = 'listenbrainz-dump-45-20190201-000000'
        archive = 'listenbrainz-listens-dump-45-20190201-000000-spark-incremental.tar.xz'
        mock_list_dir.return_value = [
            'listenbrainz-dump-123-20190101-000000/',
            wanted_dir,
        ]
        mock_get_f_name.return_value = archive

        dest_path, filename, dump_id = ListenbrainzDataDownloader().download_listens(
            'fakedir', listens_dump_id=45, dump_type='incremental')

        mock_list_dir.assert_called_once()
        mock_ftp.return_value.cwd.assert_has_calls([
            call(config.FTP_LISTENS_DIR + 'incremental/'),
            call(wanted_dir),
        ])
        self.assertEqual(archive, filename)
        mock_get_f_name.assert_called_once()
        mock_download_dump.assert_called_once_with(mock_get_f_name.return_value, 'fakedir')
        self.assertEqual(dest_path, mock_download_dump.return_value)
        self.assertEqual(dump_id, 45)
def import_dump_to_hdfs(dump_type, force, dump_id=None):
    """Download a listens dump and upload it to HDFS.

    Args:
        dump_type: 'incremental' for an incremental dump; any other value is
            treated as 'full'.
        force: passed through to the uploader (overwrite existing data).
        dump_id: id of the dump to import; latest when None.
    Returns:
        the name of the imported dump.
    """
    # TemporaryDirectory guarantees cleanup even when download/upload raises;
    # the previous mkdtemp + trailing rmtree leaked the directory on error.
    with tempfile.TemporaryDirectory() as temp_dir:
        dump_type = 'incremental' if dump_type == 'incremental' else 'full'
        src, dump_name = ListenbrainzDataDownloader().download_listens(
            directory=temp_dir, dump_type=dump_type, listens_dump_id=dump_id)
        ListenbrainzDataUploader().upload_listens(src, force=force)
    return dump_name
示例#17
0
def import_dump_to_hdfs(dump_type, overwrite, dump_id=None):
    """Download a listens dump, upload it to HDFS and record the import.

    Args:
        dump_type: 'incremental' for an incremental dump; any other value is
            treated as 'full'.
        overwrite: passed through to the uploader (overwrite existing data).
        dump_id: id of the dump to import; latest when None.
    Returns:
        the name of the imported dump.
    """
    # TemporaryDirectory guarantees cleanup even when download/upload raises;
    # the previous mkdtemp + trailing rmtree leaked the directory on error.
    with tempfile.TemporaryDirectory() as temp_dir:
        dump_type = 'incremental' if dump_type == 'incremental' else 'full'
        src, dump_name, dump_id = ListenbrainzDataDownloader().download_listens(
            directory=temp_dir, dump_type=dump_type, listens_dump_id=dump_id)
        ListenbrainzDataUploader().upload_listens(src, overwrite=overwrite)
        utils.insert_dump_data(dump_id, dump_type, datetime.utcnow())
    return dump_name
示例#18
0
    def test_download_listens(self, mock_ftp_cons, mock_list_dir, mock_get_f_name, mock_download_dump):
        """Listens download changes into the temp listens dir and fetches the dump file."""
        dest_path = ListenbrainzDataDownloader().download_listens('fakedir', None)

        mock_list_dir.assert_called_once()
        mock_ftp_cons.return_value.cwd.assert_has_calls(
            [call(config.FTP_LISTENS_DIR), call(config.TEMP_LISTENS_DIR)])
        mock_get_f_name.assert_called_once()
        mock_download_dump.assert_called_once_with(mock_get_f_name.return_value, 'fakedir')
        self.assertEqual(dest_path, mock_download_dump.return_value)
def import_mapping_to_hdfs():
    """Download the latest MSID-MBID mapping and upload it to HDFS.

    Returns:
        a single-element list describing the imported mapping.
    """
    # TemporaryDirectory guarantees cleanup even when download/upload raises;
    # the previous mkdtemp + trailing rmtree leaked the directory on error.
    with tempfile.TemporaryDirectory() as temp_dir:
        src, mapping_name = ListenbrainzDataDownloader().download_msid_mbid_mapping(directory=temp_dir)
        ListenbrainzDataUploader().upload_mapping(archive=src)

    return [{
        'type': 'import_mapping',
        'imported_mapping': mapping_name,
        'time': str(datetime.utcnow())
    }]
def import_artist_relation_to_hdfs():
    """Download the latest artist relation dump and upload it to HDFS.

    Returns:
        a single-element list describing the imported artist relation.
    """
    # TemporaryDirectory guarantees cleanup even when download/upload raises;
    # the previous mkdtemp + trailing rmtree leaked the directory on error.
    with tempfile.TemporaryDirectory() as temp_dir:
        src, artist_relation_name = ListenbrainzDataDownloader().download_artist_relation(directory=temp_dir)
        ListenbrainzDataUploader().upload_artist_relation(archive=src)

    return [{
        'type': 'import_artist_relation',
        'imported_artist_relation': artist_relation_name,
        'time': str(datetime.utcnow())
    }]
示例#21
0
    def download_spark_dump_and_get_path(self, mock_ftp_cons, mock_list_dir, mock_req_dir,
                                         mock_get_f_name, mock_download_dump):
        """Generic spark dump download: resolve the dump dir, cwd into it,
        then fetch the dump's archive."""
        dest_path = ListenbrainzDataDownloader().download_spark_dump_and_get_path(
            'fakedir', None, 'fakeftpdir', 4)

        mock_list_dir.assert_called_once()
        mock_req_dir.assert_called_once_with(mock_list_dir.return_value, None, 4)
        mock_ftp_cons.return_value.cwd.assert_has_calls(
            [call('fakeftpdir'), call(mock_req_dir.return_value)])
        mock_get_f_name.assert_called_once_with(mock_req_dir.return_value)
        mock_download_dump.assert_called_once_with(mock_get_f_name.return_value, 'fakedir')
        self.assertEqual(dest_path, mock_download_dump.return_value)
def import_full_dump_to_hdfs(dump_id: int = None) -> str:
    """Import a full listens dump into HDFS.

    Notes:
        Existing listens are deleted and replaced by the new dump's contents.
    Args:
        dump_id: id of the full dump to import; the latest full dump when None.
    Returns:
        the name of the imported dump.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        downloader = ListenbrainzDataDownloader()
        src, dump_name, dump_id = downloader.download_listens(
            directory=temp_dir, dump_type=DumpType.FULL, listens_dump_id=dump_id)
        # close the FTP connection before handing off to the uploader
        downloader.connection.close()
        ListenbrainzDataUploader().upload_new_listens_full_dump(src)
    utils.insert_dump_data(dump_id, DumpType.FULL, datetime.utcnow())
    return dump_name
    def test_get_latest_mapping(self, mock_ftp):
        """get_latest_mapping returns the mapping with the newest timestamp."""
        mapping = [
            'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
            'msid-mbid-mapping-with-text-20180603-202000.tar.bz2',
            'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
            # NOTE(review): a missing trailing comma previously merged the next
            # two entries into one string via implicit literal concatenation;
            # restored as two distinct entries (both older than the expected one).
            'msid-mbid-mapping-with-matchable-xxxx-20200603-202732.tar.bz2',
            'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2.md5',
        ]

        expected_mapping = 'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2'
        latest_mapping = ListenbrainzDataDownloader().get_latest_mapping(mapping)
        self.assertEqual(latest_mapping, expected_mapping)
示例#24
0
def import_artist_relation_to_hdfs():
    """Download the latest artist relation dump and upload it to HDFS.

    Returns:
        a single-element list describing the import, including the
        wall-clock time taken.
    """
    ts = time.monotonic()
    # TemporaryDirectory guarantees cleanup even when download/upload raises;
    # the previous mkdtemp + trailing rmtree leaked the directory on error.
    with tempfile.TemporaryDirectory() as temp_dir:
        src, artist_relation_name = ListenbrainzDataDownloader().download_artist_relation(directory=temp_dir)
        ListenbrainzDataUploader().upload_artist_relation(archive=src)

    return [{
        'type': 'import_artist_relation',
        'imported_artist_relation': artist_relation_name,
        'import_time': str(datetime.utcnow()),
        'time_taken_to_import': '{:.2f}'.format(time.monotonic() - ts)
    }]
示例#25
0
    def test_download_listens_full_dump(self, mock_ftp, mock_list_dir,
                                        mock_get_f_name, mock_download_dump):
        """Without an explicit id, the dump with the highest id is downloaded."""
        chosen_dir = 'listenbrainz-dump-123-20190101-000000/'
        archive = 'listenbrainz-spark-dump-123-20190101-000000-full.tar'
        mock_list_dir.return_value = [
            chosen_dir,
            'listenbrainz-dump-45-20190201-000000',
        ]
        mock_get_f_name.return_value = archive

        dest_path, filename, dump_id = ListenbrainzDataDownloader().download_listens(
            'fakedir', None, dump_type=DumpType.FULL)

        mock_list_dir.assert_called_once()
        mock_ftp.return_value.cwd.assert_has_calls([
            call(config.FTP_LISTENS_DIR + 'fullexport/'),
            call(chosen_dir),
        ])
        self.assertEqual(archive, filename)
        mock_get_f_name.assert_called_once()
        mock_download_dump.assert_called_once_with(mock_get_f_name.return_value, 'fakedir')
        self.assertEqual(dest_path, mock_download_dump.return_value)
        self.assertEqual(dump_id, 123)
示例#26
0
 def test_get_listens_file_name(self, mock_ftp_cons):
     """The temporary listens dump file name comes straight from config."""
     self.assertEqual(
         ListenbrainzDataDownloader().get_listens_file_name(),
         config.TEMP_LISTENS_DUMP)
示例#27
0
 def test_get_dump_archive_name(self, mock_ftp_cons):
     """The archive name is the dump name with a .tar.bz2 suffix appended."""
     name = 'listenbrainz-01-00000'
     self.assertEqual(
         name + '.tar.bz2',
         ListenbrainzDataDownloader().get_dump_archive_name(name))
def import_release_json_dump_to_hdfs():
    """Download the release JSON dump and upload it to HDFS."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        ftp_downloader = ListenbrainzDataDownloader()
        local_dump = ftp_downloader.download_release_json_dump(tmp_dir)
        # done with FTP once the dump is on local disk
        ftp_downloader.connection.close()
        ListenbrainzDataUploader().upload_release_json_dump(local_dump)