def test_get_available_dumps(self, mock_ftp):
    """get_available_dumps returns only the dumps matching the given prefix
    and raises DumpNotFoundException when no dump matches."""
    dump = [
        'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
        'msid-mbid-mapping-with-text-20180603-202000.tar.bz2',
        'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
        # FIX: a missing trailing comma previously fused the next two entries
        # into one string via implicit concatenation; they are meant to be two
        # distinct non-matching entries (a malformed name and an md5 file).
        'msid-mbid-mapping-with-matchable-xxxx-20200603-202732.tar.bz2',
        'msid-mbid-mapping-with-matchable-20100603-202732.tar.bz2.md5',
    ]
    mapping = ListenbrainzDataDownloader().get_available_dumps(
        dump, 'msid-mbid-mapping-with-matchable')
    expected_mapping = [
        'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
        'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
    ]
    self.assertEqual(mapping, expected_mapping)

    # no dump with the requested prefix -> DumpNotFoundException
    dump = [
        'msid-mbid-mapping-with-text-20180603-202000.tar.bz2',
        'msid-mbid-mapping-with-matchable-20100603-202732.tar.bz2.md5',
    ]
    with self.assertRaises(DumpNotFoundException):
        ListenbrainzDataDownloader().get_available_dumps(
            dump, 'msid-mbid-mapping-with-matchable')
def upload_mapping():
    """ Invoke script to upload mapping to HDFS. """
    from listenbrainz_spark.ftp.download import ListenbrainzDataDownloader
    from listenbrainz_spark.hdfs.upload import ListenbrainzDataUploader

    # fetch the mapping archive from the FTP server, then push it into HDFS
    archive_path, _ = ListenbrainzDataDownloader().download_msid_mbid_mapping(path.FTP_FILES_PATH)
    ListenbrainzDataUploader().upload_mapping(archive_path)
def upload_artist_relation():
    """ Invoke script to upload artist relation to HDFS. """
    from listenbrainz_spark.ftp.download import ListenbrainzDataDownloader
    from listenbrainz_spark.hdfs.upload import ListenbrainzDataUploader

    # fetch the artist relation archive from the FTP server, then push it into HDFS
    archive_path, _ = ListenbrainzDataDownloader().download_artist_relation(path.FTP_FILES_PATH)
    ListenbrainzDataUploader().upload_artist_relation(archive_path)
def test_get_dump_name_to_download(self, mock_ftp_cons):
    """get_dump_name_to_download picks by explicit id, falls back to the
    latest dump, and raises for an unknown id."""
    available = ['listenbrainz-01-00000', 'listenbrainz-02-00000']

    # explicit id selects the matching dump
    self.assertEqual(
        ListenbrainzDataDownloader().get_dump_name_to_download(available, '01', 1),
        'listenbrainz-01-00000')

    # no id falls back to the most recent dump
    self.assertEqual(
        ListenbrainzDataDownloader().get_dump_name_to_download(available, None, 1),
        'listenbrainz-02-00000')

    # unknown id raises
    with self.assertRaises(DumpNotFoundException):
        ListenbrainzDataDownloader().get_dump_name_to_download(available, '03', 1)
def upload_artist_relation(force):
    """ Invoke script to upload artist relation to HDFS. """
    from listenbrainz_spark.ftp.download import ListenbrainzDataDownloader
    from listenbrainz_spark.hdfs.upload import ListenbrainzDataUploader

    with app.app_context():
        # download the relation archive and hand it to the uploader
        archive = ListenbrainzDataDownloader().download_artist_relation(path.FTP_FILES_PATH)
        ListenbrainzDataUploader().upload_artist_relation(archive, force=force)
def upload_mapping(force):
    """ Invoke script to upload mapping to HDFS. """
    from listenbrainz_spark.ftp.download import ListenbrainzDataDownloader
    from listenbrainz_spark.hdfs.upload import ListenbrainzDataUploader

    with app.app_context():
        # download the mapping archive and hand it to the uploader
        archive = ListenbrainzDataDownloader().download_msid_mbid_mapping(path.FTP_FILES_PATH)
        ListenbrainzDataUploader().upload_mapping(archive, force=force)
def upload_listens(force, incremental, id):
    """ Invoke script to upload listens to HDFS. """
    from listenbrainz_spark.ftp.download import ListenbrainzDataDownloader
    from listenbrainz_spark.hdfs.upload import ListenbrainzDataUploader

    with app.app_context():
        # map the boolean flag onto the dump-type string the downloader expects
        kind = 'incremental' if incremental else 'full'
        archive, _ = ListenbrainzDataDownloader().download_listens(
            directory=path.FTP_FILES_PATH, listens_dump_id=id, dump_type=kind)
        ListenbrainzDataUploader().upload_listens(archive, force=force)
def test_get_listens_dump_file_name(self, mock_ftp_cons):
    """The spark archive name is derived from the dump directory name for
    both full and incremental dumps."""
    cases = [
        ('listenbrainz-dump-17-20190101-000001-full/',
         'listenbrainz-spark-dump-17-20190101-000001-full.tar'),
        ('listenbrainz-dump-17-20190101-000001-incremental/',
         'listenbrainz-spark-dump-17-20190101-000001-incremental.tar'),
    ]
    for dump_dir, expected in cases:
        self.assertEqual(
            expected,
            ListenbrainzDataDownloader().get_listens_dump_file_name(dump_dir))
def test_download_msid_mbid_mapping(self, mock_ftp_cons, mock_available_dump, mock_list_dir,
                                    mock_latest_mapping, mock_download_dump):
    """download_msid_mbid_mapping lists the FTP directory, filters the
    matchable mappings, picks the latest one and downloads it."""
    directory = '/fakedir'
    mock_list_dir.return_value = [
        'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
        'msid-mbid-mapping-with-text-20180603-202000.tar.bz2',
        'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
        # FIX: a missing trailing comma previously fused the next two fixture
        # entries into one string via implicit concatenation.
        'msid-mbid-mapping-with-matchable-xxxx-20200603-202732.tar.bz2',
        'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2.md5',
    ]
    mock_available_dump.return_value = [
        'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
        'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
    ]
    mock_latest_mapping.return_value = 'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2'

    dest_path, filename = ListenbrainzDataDownloader().download_msid_mbid_mapping(directory)

    mock_ftp_cons.return_value.cwd.assert_called_once_with(config.FTP_MSID_MBID_DIR)
    mock_available_dump.assert_called_once_with(mock_list_dir.return_value,
                                               'msid-mbid-mapping-with-matchable')
    mock_latest_mapping.assert_called_once_with([
        'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
        'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
    ])
    mock_download_dump.assert_called_once_with('msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
                                               directory)
    self.assertEqual(dest_path, mock_download_dump.return_value)
    self.assertEqual(filename, 'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2')
def test_download_msid_mbid_mapping(self, mock_ftp_cons, mock_spark_dump):
    """download_msid_mbid_mapping delegates to the generic spark-dump helper."""
    result = ListenbrainzDataDownloader().download_msid_mbid_mapping('/fakedir', 1)
    mock_spark_dump.assert_called_once_with(
        '/fakedir', 1, config.FTP_MSID_MBID_DIR, MAPPING_DUMP_ID_POS)
    self.assertEqual(result, mock_spark_dump.return_value)
def import_incremental_dump_to_hdfs(dump_id: int = None) -> str:
    """ Import the incremental dump with the given dump_id if specified
    otherwise the latest incremental dump.

    Notes:
        All incremental dumps are stored together in incremental.parquet inside the listens directory.
    Args:
        dump_id: id of the incremental dump to be imported
    Returns:
        the name of the imported dump
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        downloader = ListenbrainzDataDownloader()
        src, dump_name, dump_id = downloader.download_listens(
            directory=temp_dir,
            dump_type=DumpType.INCREMENTAL,
            listens_dump_id=dump_id
        )
        # FIX: close the FTP connection once the archive is local, mirroring
        # the full-dump import path which closes its connection here.
        downloader.connection.close()
        # instantiating ListenbrainzDataUploader creates a spark session which
        # is a bit non-intuitive.
        # FIXME in future to make initializing of spark session more explicit?
        ListenbrainzDataUploader().upload_new_listens_incremental_dump(src)
        utils.insert_dump_data(dump_id, DumpType.INCREMENTAL, datetime.utcnow())
        return dump_name
def test_download_artist_relation(self, mock_ftp_cons, mock_list_dir, mock_download_dump, mock_dump_archive):
    """Verify FTP navigation and delegation when downloading the artist relation."""
    target_dir = '/fakedir'
    latest_dump_dir = 'artist-credit-artist-credit-relations-02-20191230-134806/'
    archive_name = 'artist-credit-artist-credit-relations-02-20191230-134806.tar.bz2'
    mock_list_dir.return_value = [
        'artist-credit-artist-credit-relations-01-20191230-134806/',
        latest_dump_dir,
    ]
    mock_dump_archive.return_value = archive_name

    dest_path, filename = ListenbrainzDataDownloader().download_artist_relation(target_dir)

    mock_list_dir.assert_called_once()
    mock_ftp_cons.return_value.cwd.assert_has_calls(
        [call(config.FTP_ARTIST_RELATION_DIR), call(latest_dump_dir)])
    self.assertEqual(archive_name, filename)
    mock_dump_archive.assert_called_once_with(latest_dump_dir)
    mock_download_dump.assert_called_once_with(archive_name, target_dir)
    self.assertEqual(dest_path, mock_download_dump.return_value)
def import_newest_incremental_dump_handler():
    """Import every incremental dump released since the last imported full dump.

    Falls back to importing only the latest incremental dump when no full dump
    has been imported before. Returns a single-element list summarizing the
    imported dumps and any errors for the request consumer.
    """
    errors = []
    imported_dumps = []
    latest_full_dump = utils.get_latest_full_dump()
    if latest_full_dump is None:
        # If no prior full dump is present, just import the latest incremental dump
        imported_dumps.append(import_incremental_dump_to_hdfs(dump_id=None))
        error_msg = "No previous full dump found, importing latest incremental dump"
        errors.append(error_msg)
        logger.warning(error_msg, exc_info=True)
    else:
        # Import all missing dumps from last full dump import
        start_id = latest_full_dump["dump_id"] + 1
        imported_at = latest_full_dump["imported_at"]
        end_id = ListenbrainzDataDownloader().get_latest_dump_id(DumpType.INCREMENTAL) + 1

        for dump_id in range(start_id, end_id, 1):
            if not utils.search_dump(dump_id, DumpType.INCREMENTAL, imported_at):
                try:
                    imported_dumps.append(import_incremental_dump_to_hdfs(dump_id))
                except Exception as e:
                    # Skip current dump if any error occurs during import
                    error_msg = f"Error while importing incremental dump with ID {dump_id}: {e}"
                    errors.append(error_msg)
                    logger.error(error_msg, exc_info=True)
            # FIX: removed a dead `dump_id += 1` (the for statement reassigns
            # the loop variable each iteration) and the redundant trailing
            # `continue` in the except block — neither had any effect.

    request_consumer.rc.ping()
    return [{
        'type': 'import_incremental_dump',
        'imported_dump': imported_dumps,
        'errors': errors,
        'time': str(datetime.utcnow()),
    }]
def test_download_artist_relation(self, mock_ftp_cons, mock_spark_dump):
    """download_artist_relation delegates to the generic spark-dump helper."""
    result = ListenbrainzDataDownloader().download_artist_relation('/fakedir', 1)
    mock_spark_dump.assert_called_once_with(
        '/fakedir', 1, config.FTP_ARTIST_RELATION_DIR, ARTIST_RELATION_DUMP_ID_POS)
    self.assertEqual(result, mock_spark_dump.return_value)
def test_download_listens_incremental_dump_by_id(self, mock_ftp, mock_list_dir, mock_get_f_name, mock_download_dump):
    """An explicit dump id selects that incremental dump directory."""
    expected_file = 'listenbrainz-listens-dump-45-20190201-000000-spark-incremental.tar.xz'
    mock_list_dir.return_value = [
        'listenbrainz-dump-123-20190101-000000/',
        'listenbrainz-dump-45-20190201-000000'
    ]
    mock_get_f_name.return_value = expected_file

    dest_path, filename, dump_id = ListenbrainzDataDownloader().download_listens(
        'fakedir', listens_dump_id=45, dump_type='incremental')

    mock_list_dir.assert_called_once()
    mock_ftp.return_value.cwd.assert_has_calls([
        call(config.FTP_LISTENS_DIR + 'incremental/'),
        call('listenbrainz-dump-45-20190201-000000'),
    ])
    self.assertEqual(expected_file, filename)
    mock_get_f_name.assert_called_once()
    mock_download_dump.assert_called_once_with(expected_file, 'fakedir')
    self.assertEqual(dest_path, mock_download_dump.return_value)
    self.assertEqual(dump_id, 45)
def import_dump_to_hdfs(dump_type, force, dump_id=None):
    """Download a listens dump and upload it to HDFS.

    Args:
        dump_type: 'incremental' for an incremental dump; any other value is
            treated as a full dump
        force: passed through to the uploader
        dump_id: specific dump id to import, or None for the latest
    Returns:
        the name of the imported dump
    """
    # normalize: anything that isn't 'incremental' means a full dump
    dump_type = 'incremental' if dump_type == 'incremental' else 'full'
    # FIX: use a context manager so the scratch directory is removed even when
    # the download or upload raises (mkdtemp + trailing rmtree leaked it).
    with tempfile.TemporaryDirectory() as temp_dir:
        src, dump_name = ListenbrainzDataDownloader().download_listens(
            directory=temp_dir, dump_type=dump_type, listens_dump_id=dump_id)
        ListenbrainzDataUploader().upload_listens(src, force=force)
    return dump_name
def import_dump_to_hdfs(dump_type, overwrite, dump_id=None):
    """Download a listens dump, upload it to HDFS and record the import.

    Args:
        dump_type: 'incremental' for an incremental dump; any other value is
            treated as a full dump
        overwrite: passed through to the uploader
        dump_id: specific dump id to import, or None for the latest
    Returns:
        the name of the imported dump
    """
    # normalize: anything that isn't 'incremental' means a full dump
    dump_type = 'incremental' if dump_type == 'incremental' else 'full'
    # FIX: use a context manager so the scratch directory is removed even when
    # the download or upload raises (mkdtemp + trailing rmtree leaked it).
    with tempfile.TemporaryDirectory() as temp_dir:
        src, dump_name, dump_id = ListenbrainzDataDownloader().download_listens(
            directory=temp_dir, dump_type=dump_type, listens_dump_id=dump_id)
        ListenbrainzDataUploader().upload_listens(src, overwrite=overwrite)
        utils.insert_dump_data(dump_id, dump_type, datetime.utcnow())
    return dump_name
def test_download_listens(self, mock_ftp_cons, mock_list_dir, mock_get_f_name, mock_download_dump):
    """download_listens walks into the temp listens directory and fetches the dump."""
    result = ListenbrainzDataDownloader().download_listens('fakedir', None)

    mock_list_dir.assert_called_once()
    mock_ftp_cons.return_value.cwd.assert_has_calls(
        [call(config.FTP_LISTENS_DIR), call(config.TEMP_LISTENS_DIR)])
    mock_get_f_name.assert_called_once()
    mock_download_dump.assert_called_once_with(mock_get_f_name.return_value, 'fakedir')
    self.assertEqual(result, mock_download_dump.return_value)
def import_mapping_to_hdfs():
    """Download the latest MSID-MBID mapping and upload it to HDFS.

    Returns:
        a single-element list summarizing the import for the caller
    """
    # FIX: use a context manager so the scratch directory is removed even when
    # the download or upload raises (mkdtemp + trailing rmtree leaked it).
    with tempfile.TemporaryDirectory() as temp_dir:
        src, mapping_name = ListenbrainzDataDownloader().download_msid_mbid_mapping(directory=temp_dir)
        ListenbrainzDataUploader().upload_mapping(archive=src)
    return [{
        'type': 'import_mapping',
        'imported_mapping': mapping_name,
        'time': str(datetime.utcnow())
    }]
def import_artist_relation_to_hdfs():
    """Download the latest artist relation and upload it to HDFS.

    Returns:
        a single-element list summarizing the import for the caller
    """
    # FIX: use a context manager so the scratch directory is removed even when
    # the download or upload raises (mkdtemp + trailing rmtree leaked it).
    with tempfile.TemporaryDirectory() as temp_dir:
        src, artist_relation_name = ListenbrainzDataDownloader().download_artist_relation(directory=temp_dir)
        ListenbrainzDataUploader().upload_artist_relation(archive=src)
    return [{
        'type': 'import_artist_relation',
        'imported_artist_relation': artist_relation_name,
        'time': str(datetime.utcnow())
    }]
def download_spark_dump_and_get_path(self, mock_ftp_cons, mock_list_dir, mock_req_dir, mock_get_f_name, mock_download_dump):
    """Exercise the generic spark-dump download flow end to end against mocks."""
    ftp_conn = mock_ftp_cons.return_value
    result = ListenbrainzDataDownloader().download_spark_dump_and_get_path(
        'fakedir', None, 'fakeftpdir', 4)

    mock_list_dir.assert_called_once()
    mock_req_dir.assert_called_once_with(mock_list_dir.return_value, None, 4)
    ftp_conn.cwd.assert_has_calls([call('fakeftpdir'), call(mock_req_dir.return_value)])
    mock_get_f_name.assert_called_once_with(mock_req_dir.return_value)
    mock_download_dump.assert_called_once_with(mock_get_f_name.return_value, 'fakedir')
    self.assertEqual(result, mock_download_dump.return_value)
def import_full_dump_to_hdfs(dump_id: int = None) -> str:
    """ Import the full dump with the given dump_id if specified otherwise the
    latest full dump.

    Notes:
        Deletes all the existing listens and uploads listens from new dump.
    Args:
        dump_id: id of the full dump to be imported
    Returns:
        the name of the imported dump
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        ftp_downloader = ListenbrainzDataDownloader()
        src, dump_name, dump_id = ftp_downloader.download_listens(
            directory=scratch_dir,
            dump_type=DumpType.FULL,
            listens_dump_id=dump_id,
        )
        # the FTP connection is no longer needed once the archive is local
        ftp_downloader.connection.close()

        ListenbrainzDataUploader().upload_new_listens_full_dump(src)
        utils.insert_dump_data(dump_id, DumpType.FULL, datetime.utcnow())
    return dump_name
def test_get_latest_mapping(self, mock_ftp):
    """get_latest_mapping returns the mapping dump with the newest timestamp."""
    mapping = [
        'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2',
        'msid-mbid-mapping-with-text-20180603-202000.tar.bz2',
        'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2',
        # FIX: a missing trailing comma previously fused the next two fixture
        # entries into one string via implicit concatenation; both carry
        # timestamps older than the expected latest (203731), so the expected
        # result is unchanged.
        'msid-mbid-mapping-with-matchable-xxxx-20200603-202732.tar.bz2',
        'msid-mbid-mapping-with-matchable-20200603-202732.tar.bz2.md5',
    ]
    expected_mapping = 'msid-mbid-mapping-with-matchable-20200603-203731.tar.bz2'
    latest_mapping = ListenbrainzDataDownloader().get_latest_mapping(mapping)
    self.assertEqual(latest_mapping, expected_mapping)
def import_artist_relation_to_hdfs():
    """Download the latest artist relation, upload it to HDFS and time the import.

    Returns:
        a single-element list with the imported relation name and timing info
    """
    ts = time.monotonic()
    # FIX: use a context manager so the scratch directory is removed even when
    # the download or upload raises (mkdtemp + trailing rmtree leaked it).
    with tempfile.TemporaryDirectory() as temp_dir:
        src, artist_relation_name = ListenbrainzDataDownloader().download_artist_relation(directory=temp_dir)
        ListenbrainzDataUploader().upload_artist_relation(archive=src)
    return [{
        'type': 'import_artist_relation',
        'imported_artist_relation': artist_relation_name,
        'import_time': str(datetime.utcnow()),
        'time_taken_to_import': '{:.2f}'.format(time.monotonic() - ts)
    }]
def test_download_listens_full_dump(self, mock_ftp, mock_list_dir, mock_get_f_name, mock_download_dump):
    """Without an explicit id, the newest full dump directory is used."""
    expected_file = 'listenbrainz-spark-dump-123-20190101-000000-full.tar'
    mock_list_dir.return_value = [
        'listenbrainz-dump-123-20190101-000000/',
        'listenbrainz-dump-45-20190201-000000'
    ]
    mock_get_f_name.return_value = expected_file

    dest_path, filename, dump_id = ListenbrainzDataDownloader().download_listens(
        'fakedir', None, dump_type=DumpType.FULL)

    mock_list_dir.assert_called_once()
    mock_ftp.return_value.cwd.assert_has_calls([
        call(config.FTP_LISTENS_DIR + 'fullexport/'),
        call('listenbrainz-dump-123-20190101-000000/'),
    ])
    self.assertEqual(expected_file, filename)
    mock_get_f_name.assert_called_once()
    mock_download_dump.assert_called_once_with(expected_file, 'fakedir')
    self.assertEqual(dest_path, mock_download_dump.return_value)
    self.assertEqual(dump_id, 123)
def test_get_listens_file_name(self, mock_ftp_cons):
    """The temp listens dump name comes straight from the configuration."""
    self.assertEqual(
        config.TEMP_LISTENS_DUMP,
        ListenbrainzDataDownloader().get_listens_file_name())
def test_get_dump_archive_name(self, mock_ftp_cons):
    """The archive name is the dump directory name plus a .tar.bz2 suffix."""
    result = ListenbrainzDataDownloader().get_dump_archive_name('listenbrainz-01-00000')
    self.assertEqual('listenbrainz-01-00000.tar.bz2', result)
def import_release_json_dump_to_hdfs():
    """Download the release JSON dump into a scratch directory and upload it to HDFS."""
    with tempfile.TemporaryDirectory() as scratch_dir:
        ftp_downloader = ListenbrainzDataDownloader()
        local_dump = ftp_downloader.download_release_json_dump(scratch_dir)
        # the FTP connection is no longer needed once the dump is local
        ftp_downloader.connection.close()
        ListenbrainzDataUploader().upload_release_json_dump(local_dump)