def import_mapping_to_hdfs():
    ts = time.monotonic()
    temp_dir = tempfile.mkdtemp()
    src, mapping_name = ListenbrainzDataDownloader().download_msid_mbid_mapping(directory=temp_dir)
    ListenbrainzDataUploader().upload_mapping(archive=src)
    shutil.rmtree(temp_dir)

    return [{
        'type': 'import_mapping',
        'imported_mapping': mapping_name,
        'import_time': str(datetime.utcnow()),
        'time_taken_to_import': '{:.2f}'.format(time.monotonic() - ts)
    }]
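# A minimal usage sketch (hypothetical caller, not part of this module): the
# function returns a single-element list of metadata describing the import,
# which a caller can log or forward unchanged, e.g.
#
#   messages = import_mapping_to_hdfs()
#   print(messages[0]['imported_mapping'], messages[0]['time_taken_to_import'])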
def test_upload_archive_failed(self):
    faulty_tar = MagicMock()
    faulty_tar.extract.side_effect = tarfile.ReadError()
    member = MagicMock()
    faulty_tar.__iter__.return_value = [member]

    tmp_dump_dir = tempfile.mkdtemp()
    self.assertRaises(DumpInvalidException, ListenbrainzHDFSUploader().upload_archive,
                      tmp_dump_dir, faulty_tar, '/test', schema.listen_schema,
                      ListenbrainzDataUploader().process_json_listens)

    status = utils.path_exists('/test')
    self.assertFalse(status)
def import_full_dump_to_hdfs(dump_id: int = None) -> str:
    """ Import the full dump with the given dump_id if specified, otherwise the latest full dump.

        Notes:
            Deletes all existing listens and uploads listens from the new dump.

        Args:
            dump_id: id of the full dump to be imported

        Returns:
            the name of the imported dump
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        src, dump_name, dump_id = ListenbrainzDataDownloader().download_listens(
            directory=temp_dir, dump_type=DumpType.FULL, listens_dump_id=dump_id)
        ListenbrainzDataUploader().upload_new_listens_full_dump(src)
    utils.insert_dump_data(dump_id, DumpType.FULL, datetime.utcnow())
    return dump_name
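# A hypothetical invocation sketch (the calling context and the dump id used
# here are assumptions, not part of this module): pass a dump_id to pin a
# specific full dump, or omit it to import the latest one.
#
#   latest_dump = import_full_dump_to_hdfs()
#   pinned_dump = import_full_dump_to_hdfs(dump_id=256)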
def test_upload_archive(self):
    archive_path = self.create_test_tar()
    pxz = ListenbrainzHDFSUploader().get_pxz_output(archive_path)
    tmp_dump_dir = tempfile.mkdtemp()
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        ListenbrainzHDFSUploader().upload_archive(tmp_dump_dir, tar, '/test',
                                                  schema.listen_schema,
                                                  ListenbrainzDataUploader().process_json_listens)

    walk = utils.hdfs_walk('/test', depth=1)
    dirs = next(walk)[1]
    self.assertEqual(len(dirs), 1)

    df = utils.read_files_from_HDFS('/test/2020/1.parquet')
    self.assertEqual(df.count(), 1)

    status = utils.path_exists(tmp_dump_dir)
    self.assertFalse(status)

    utils.delete_dir('/test', recursive=True)
def test_process_json_listens_append(self, mock_read_json):
    fakeschema = StructType([StructField('column_1', StringType()), StructField('column_2', StringType())])

    # Save old dataframe in HDFS
    old_df = utils.create_dataframe(Row(column_1='row_a', column_2='row_a'), fakeschema)
    old_df = old_df.union(utils.create_dataframe(Row(column_1='row_b', column_2='row_b'), fakeschema))
    utils.save_parquet(old_df, os.path.join(self.path_, '/2020/1.parquet'))

    # Mock read_json to return new dataframe
    new_df = utils.create_dataframe(Row(column_1='row_c', column_2='row_c'), fakeschema)
    mock_read_json.return_value = new_df

    ListenbrainzDataUploader().process_json_listens('/2020/1.json', self.path_, self.path_,
                                                    append=True, schema=fakeschema)

    received = utils.read_files_from_HDFS(os.path.join(self.path_, '/2020/1.parquet')) \
        .rdd \
        .map(list) \
        .collect()

    # DataFrame.union returns a new dataframe, so the result must be assigned
    # for the expected rows to include both the old and the appended data.
    expected_df = old_df.union(new_df)
    expected = expected_df.rdd.map(list).collect()

    self.assertCountEqual(received, expected)
def import_incremental_dump_to_hdfs(dump_id: int = None) -> str:
    """ Import the incremental dump with the given dump_id if specified, otherwise the latest incremental dump.

        Notes:
            All incremental dumps are stored together in incremental.parquet inside the listens directory.

        Args:
            dump_id: id of the incremental dump to be imported

        Returns:
            the name of the imported dump
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        src, dump_name, dump_id = ListenbrainzDataDownloader().download_listens(
            directory=temp_dir, dump_type=DumpType.INCREMENTAL, listens_dump_id=dump_id)
        # Instantiating ListenbrainzDataUploader creates a spark session, which
        # is a bit non-intuitive.
        # FIXME: make initialization of the spark session more explicit in future?
        ListenbrainzDataUploader().upload_new_listens_incremental_dump(src)
    utils.insert_dump_data(dump_id, DumpType.INCREMENTAL, datetime.utcnow())
    return dump_name
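# A hedged sketch of how a caller might sequence the two importers above (the
# ordering shown is an assumption, not something this module enforces): refresh
# the full dump first, then layer the newest incremental dump on top of it.
#
#   full_dump_name = import_full_dump_to_hdfs()
#   incremental_dump_name = import_incremental_dump_to_hdfs()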
def test_process_json(self, mock_save, mock_read):
    fakeschema = StructType([StructField('xxxxx', StringType(), nullable=True)])
    ListenbrainzDataUploader().process_json('_', '/fakedestpath', '/fakehdfspath', '__', fakeschema)
    mock_read.assert_called_once_with('/fakehdfspath', schema=fakeschema)
    mock_save.assert_called_once_with(mock_read.return_value, '/fakedestpath')
def import_release_json_dump_to_hdfs():
    with tempfile.TemporaryDirectory() as temp_dir:
        downloader = ListenbrainzDataDownloader()
        dest = downloader.download_release_json_dump(temp_dir)
        downloader.connection.close()
        ListenbrainzDataUploader().upload_release_json_dump(dest)
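# A minimal usage sketch (assumed caller, not part of this module). The
# download connection is closed before the upload begins, presumably so it
# does not sit idle during a potentially long upload; that reading of the
# intent is an assumption, not something stated in the code.
#
#   import_release_json_dump_to_hdfs()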
@classmethod
def setUpClass(cls) -> None:
    listenbrainz_spark.init_test_session(f"spark-test-run-{uuid.uuid4()}")
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    cls.uploader = ListenbrainzDataUploader()