Example #1
def import_mapping_to_hdfs():
    ts = time.monotonic()
    temp_dir = tempfile.mkdtemp()
    src, mapping_name = ListenbrainzDataDownloader().download_msid_mbid_mapping(directory=temp_dir)
    ListenbrainzDataUploader().upload_mapping(archive=src)
    shutil.rmtree(temp_dir)

    return [{
        'type': 'import_mapping',
        'imported_mapping': mapping_name,
        'import_time': str(datetime.utcnow()),
        'time_taken_to_import': '{:.2f}'.format(time.monotonic() - ts)
    }]
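
A note on cleanup: shutil.rmtree above only runs if the download and upload succeed, so a failed import leaves the temporary directory behind. A minimal sketch of the same flow using tempfile.TemporaryDirectory, as Examples #3 and #6 do, assuming the same downloader/uploader API shown above:

def import_mapping_to_hdfs():
    ts = time.monotonic()
    # TemporaryDirectory removes the scratch directory even if the
    # download or upload below raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        src, mapping_name = ListenbrainzDataDownloader().download_msid_mbid_mapping(directory=temp_dir)
        ListenbrainzDataUploader().upload_mapping(archive=src)

    return [{
        'type': 'import_mapping',
        'imported_mapping': mapping_name,
        'import_time': str(datetime.utcnow()),
        'time_taken_to_import': '{:.2f}'.format(time.monotonic() - ts)
    }]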
Example #2
    def test_upload_archive_failed(self):
        faulty_tar = MagicMock()
        faulty_tar.extract.side_effect = tarfile.ReadError()
        member = MagicMock()
        faulty_tar.__iter__.return_value = [member]

        tmp_dump_dir = tempfile.mkdtemp()
        self.assertRaises(DumpInvalidException,
                          ListenbrainzHDFSUploader().upload_archive,
                          tmp_dump_dir, faulty_tar, '/test',
                          schema.listen_schema,
                          ListenbrainzDataUploader().process_json_listens)

        status = utils.path_exists('/test')
        self.assertFalse(status)
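
The test above pins down the error contract: a tarfile.ReadError raised during extraction must surface as DumpInvalidException, and nothing should be left at the destination path. A hedged sketch of extraction logic consistent with that contract (the real upload_archive is not shown here; the function name and body below are illustrative):

def extract_dump_archive(tmp_dump_dir, tar):
    """Illustrative only: extract every member of the tar stream, translating
    low-level tar errors into the uploader's DumpInvalidException."""
    try:
        for member in tar:
            tar.extract(member, path=tmp_dump_dir)
    except tarfile.ReadError as e:
        # Remove anything half-extracted so no partial data gets uploaded.
        shutil.rmtree(tmp_dump_dir, ignore_errors=True)
        raise DumpInvalidException(f"Invalid dump archive: {e}")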
Example #3
def import_full_dump_to_hdfs(dump_id: int = None) -> str:
    """ Import the full dump with the given dump_id if specified otherwise the
     latest full dump.

    Notes:
        Deletes all existing listens and uploads the listens from the new dump.
    Args:
        dump_id: id of the full dump to be imported
    Returns:
        the name of the imported dump
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        src, dump_name, dump_id = ListenbrainzDataDownloader().download_listens(
            directory=temp_dir, dump_type=DumpType.FULL, listens_dump_id=dump_id)
        ListenbrainzDataUploader().upload_new_listens_full_dump(src)
    utils.insert_dump_data(dump_id, DumpType.FULL, datetime.utcnow())
    return dump_name
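
A minimal usage sketch: dump_id defaults to None, which means the latest available full dump is imported (the id 742 below is only an arbitrary illustration):

# Import the latest full dump...
latest_dump_name = import_full_dump_to_hdfs()
# ...or pin a specific dump by id.
pinned_dump_name = import_full_dump_to_hdfs(dump_id=742)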
Example #4
    def test_upload_archive(self):
        archive_path = self.create_test_tar()
        pxz = ListenbrainzHDFSUploader().get_pxz_output(archive_path)
        tmp_dump_dir = tempfile.mkdtemp()

        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            ListenbrainzHDFSUploader().upload_archive(tmp_dump_dir, tar, '/test', schema.listen_schema,
                                                      ListenbrainzDataUploader().process_json_listens)

        walk = utils.hdfs_walk('/test', depth=1)
        dirs = next(walk)[1]
        self.assertEqual(len(dirs), 1)

        df = utils.read_files_from_HDFS('/test/2020/1.parquet')
        self.assertEqual(df.count(), 1)

        status = utils.path_exists(tmp_dump_dir)
        self.assertFalse(status)

        utils.delete_dir('/test', recursive=True)
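
get_pxz_output is not shown in these examples; a hedged sketch of the streaming-decompression pattern the test relies on, using only subprocess and tarfile (the function name below is illustrative, and pxz accepts the same -d/-c flags as xz):

import subprocess
import tarfile


def open_tar_xz_stream(archive_path):
    # 'pxz -d -c' writes the decompressed tar to stdout without creating a
    # temporary file on disk.
    pxz = subprocess.Popen(['pxz', '-d', '-c', archive_path], stdout=subprocess.PIPE)
    # mode='r|' reads stdout as a non-seekable stream, matching the
    # tarfile.open(fileobj=pxz.stdout, mode='r|') call in the test above.
    return tarfile.open(fileobj=pxz.stdout, mode='r|')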
Example #5
    def test_process_json_listens_append(self, mock_read_json):
        fakeschema = StructType([StructField('column_1', StringType()), StructField('column_2', StringType())])

        # Save old dataframe in HDFS
        old_df = utils.create_dataframe(Row(column_1='row_a', column_2='row_a'), fakeschema)
        old_df = old_df.union(utils.create_dataframe(Row(column_1='row_b', column_2='row_b'), fakeschema))
        utils.save_parquet(old_df, os.path.join(self.path_, '/2020/1.parquet'))

        # Mock read_json to return new dataframe
        new_df = utils.create_dataframe(Row(column_1='row_c', column_2='row_c'), fakeschema)
        mock_read_json.return_value = new_df

        ListenbrainzDataUploader().process_json_listens('/2020/1.json', self.path_, self.path_, append=True, schema=fakeschema)

        received = utils.read_files_from_HDFS(os.path.join(self.path_, '/2020/1.parquet')) \
            .rdd \
            .map(list) \
            .collect()

        expected_df = old_df.union(new_df)
        expected = expected_df.rdd.map(list).collect()

        self.assertCountEqual(received, expected)
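
The append behaviour being tested maps directly onto Spark's writer API; a minimal self-contained sketch with a plain SparkSession instead of the project's utils helpers (the path and rows are illustrative):

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").appName("append-sketch").getOrCreate()
path = "/tmp/listens-sketch/2020/1.parquet"  # illustrative local path

old_df = spark.createDataFrame([Row(column_1='row_a', column_2='row_a')])
old_df.write.mode("overwrite").parquet(path)

new_df = spark.createDataFrame([Row(column_1='row_c', column_2='row_c')])
# mode("append") adds new part files instead of replacing the dataset,
# which is the behaviour the assertions above rely on.
new_df.write.mode("append").parquet(path)

assert spark.read.parquet(path).count() == 2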
Example #6
def import_incremental_dump_to_hdfs(dump_id: int = None) -> str:
    """ Import the incremental dump with the given dump_id if specified otherwise the
     latest incremental dump.

    Notes:
        All incremental dumps are stored together in incremental.parquet inside the
        listens directory.
    Args:
        dump_id: id of the incremental dump to be imported
    Returns:
        the name of the imported dump
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        src, dump_name, dump_id = ListenbrainzDataDownloader().download_listens(
            directory=temp_dir, dump_type=DumpType.INCREMENTAL, listens_dump_id=dump_id)

        # Instantiating ListenbrainzDataUploader creates a Spark session as a
        # side effect, which is a bit non-intuitive.
        # FIXME: make Spark session initialization more explicit in the future?
        ListenbrainzDataUploader().upload_new_listens_incremental_dump(src)
    utils.insert_dump_data(dump_id, DumpType.INCREMENTAL, datetime.utcnow())
    return dump_name
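
Per the Notes above, every incremental dump is appended to the same incremental.parquet, so repeated imports accumulate. A small usage sketch (the ids are arbitrary illustrations):

for dump_id in (1001, 1002, 1003):  # arbitrary example ids
    dump_name = import_incremental_dump_to_hdfs(dump_id=dump_id)
    print(f'imported {dump_name}')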
Example #7
    def test_process_json(self, mock_save, mock_read):
        fakeschema = StructType([StructField('xxxxx', StringType(), nullable=True)])
        ListenbrainzDataUploader().process_json('_', '/fakedestpath', '/fakehdfspath', '__', fakeschema)
        mock_read.assert_called_once_with('/fakehdfspath', schema=fakeschema)
        mock_save.assert_called_once_with(mock_read.return_value, '/fakedestpath')
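
The two assertions pin down the shape of process_json: read the JSON at the temporary HDFS path with the supplied schema, then save the resulting dataframe to the destination path. A sketch consistent with that contract (the patched read/save helpers are assumed here to be utils.read_json and utils.save_parquet; the real method body may differ):

    def process_json(self, _, dest_path, tmp_hdfs_path, __, schema=None):
        # Read the raw JSON into a dataframe using the given schema...
        df = utils.read_json(tmp_hdfs_path, schema=schema)
        # ...and persist it as parquet at the destination path.
        utils.save_parquet(df, dest_path)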
Example #8
def import_release_json_dump_to_hdfs():
    with tempfile.TemporaryDirectory() as temp_dir:
        downloader = ListenbrainzDataDownloader()
        dest = downloader.download_release_json_dump(temp_dir)
        downloader.connection.close()
        ListenbrainzDataUploader().upload_release_json_dump(dest)
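
The connection is closed by hand here; a small sketch making that exception-safe with contextlib.closing, assuming downloader.connection exposes a standard close() (which the explicit call above suggests):

from contextlib import closing


def import_release_json_dump_to_hdfs():
    with tempfile.TemporaryDirectory() as temp_dir:
        downloader = ListenbrainzDataDownloader()
        # closing() releases the connection even if the download raises.
        with closing(downloader.connection):
            dest = downloader.download_release_json_dump(temp_dir)
        ListenbrainzDataUploader().upload_release_json_dump(dest)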
Example #9
    def setUpClass(cls) -> None:
        listenbrainz_spark.init_test_session(f"spark-test-run-{uuid.uuid4()}")
        hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
        cls.uploader = ListenbrainzDataUploader()
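
A possible counterpart to the setup above, sketched only as an illustration: clean up whatever the tests wrote to HDFS, using the same utils helpers that appear in Example #4 ('/test' is the path those tests write to; the real teardown may differ):

    @classmethod
    def tearDownClass(cls) -> None:
        # Remove test data written under /test during this class's tests.
        if utils.path_exists('/test'):
            utils.delete_dir('/test', recursive=True)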