Example #1
    def download_dump(self, filename, directory):
        """ Download file with `filename` from FTP.

            Args:
                filename (str): File name of FTP dump.
                directory (str): Dir to save dump locally.

            Returns:
                dest_path (str): Local path where dump has been downloaded.
        """
        # Require a companion .sha256 file to validate the download; abort if it is missing
        sha_filename = filename + '.sha256'
        dir_content = self.list_dir()
        sha_dest_path = os.path.join(directory, sha_filename)
        if sha_filename in dir_content:
            self.download_file_binary(sha_filename, sha_dest_path)
        else:
            raise DumpInvalidException("SHA256 checksum for the given file missing, aborting download.")
        dest_path = os.path.join(directory, filename)
        self.download_file_binary(filename, dest_path)

        logger.info("Verifying dump integrity...")
        calculated_sha = self._calc_sha256(dest_path)
        received_sha = self._read_sha_file(sha_dest_path)

        os.remove(sha_dest_path)
        if calculated_sha != received_sha:
            # Cleanup
            os.remove(dest_path)
            raise DumpInvalidException("""Received SHA256 checksum doesn't match the calculated checksum, aborting.
                                       Calculated SHA: {calculated_sha}. Received SHA: {received_sha}""".format(
                calculated_sha=calculated_sha, received_sha=received_sha))

        self.connection.cwd('/')
        return dest_path
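
A minimal usage sketch, assuming a hypothetical FTPDownloader client class that exposes the method above; the host and dump file name are placeholders:

import tempfile

# Hedged usage sketch: FTPDownloader is a hypothetical client class exposing
# download_dump(); the host and dump file name are placeholders.
downloader = FTPDownloader(host="ftp.example.org")
with tempfile.TemporaryDirectory() as tmp_dir:
    try:
        dump_path = downloader.download_dump("dump.tar.xz", tmp_dir)
        print(f"Dump verified and saved to {dump_path}")
    except DumpInvalidException as err:
        print(f"Download aborted: {err}")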
Example #2
def mock_import_dump_to_hdfs_error(dump_id: int):
    """ Mock function returning dump name all dump ids less than 210, else raise DumpInvalidException"""
    if (dump_id < 210) or (dump_id > 210 and dump_id < 213):
        return f"listenbrainz-spark-dump-{dump_id}-incremental.tar"
    elif dump_id == 210:
        raise DumpInvalidException("Invalid Dump")
    else:
        raise DumpNotFoundException("Dump not found")
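
A sketch of how this mock could be wired into a test; the module path importer.import_dump_to_hdfs is an assumption:

import pytest
from unittest import mock

import importer  # hypothetical module containing import_dump_to_hdfs

# Hedged test sketch: the patch target is an assumption; side_effect routes
# calls through the mock defined above, so dump id 210 raises.
@mock.patch("importer.import_dump_to_hdfs", side_effect=mock_import_dump_to_hdfs_error)
def test_dump_id_210_is_invalid(mock_import):
    with pytest.raises(DumpInvalidException):
        importer.import_dump_to_hdfs(210)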
Example #3
    def extract_and_upload_archive(self,
                                   archive,
                                   local_dir,
                                   hdfs_dir,
                                   cleanup_on_failure=True):
        """
        Extract the archive and upload it to the given hdfs directory.
        Args:
            archive: path to the tar archive to be uploaded
            local_dir: path to local dir to be used for extraction
            hdfs_dir: path to hdfs dir where contents of tar should be uploaded
            cleanup_on_failure: whether to delete the local and hdfs directories
                if an error occurs during extraction
        """
        total_files = 0
        total_time = 0.0
        with tarfile.open(archive, mode='r') as tar:
            for member in tar:
                if member.isfile() and member.name.endswith(".parquet"):
                    logger.info(f"Uploading {member.name}...")
                    t0 = time.monotonic()

                    try:
                        tar.extract(member, path=local_dir)
                    except tarfile.TarError as err:
                        if cleanup_on_failure:
                            if utils.path_exists(hdfs_dir):
                                utils.delete_dir(hdfs_dir, recursive=True)
                            shutil.rmtree(local_dir, ignore_errors=True)
                        raise DumpInvalidException(
                            f"{type(err).__name__} while extracting {member.name}, aborting import"
                        )

                    hdfs_path = os.path.join(hdfs_dir, member.name)
                    local_path = os.path.join(local_dir, member.name)
                    utils.upload_to_HDFS(hdfs_path, local_path)

                    time_taken = time.monotonic() - t0
                    total_files += 1
                    total_time += time_taken
                    logger.info(
                        f"Done! Current file processed in {time_taken:.2f} sec"
                    )
        if total_files:
            logger.info(f"Done! Total files processed: {total_files}. "
                        f"Average time taken: {total_time / total_files:.2f} sec")
        else:
            logger.info("Done! No parquet files found in the archive.")
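
A usage sketch; `uploader` is assumed to be an instance of the class that defines this method, and the paths are placeholders:

import tempfile

# Hedged usage sketch: uploader, the archive path, and the HDFS directory
# below are all assumptions.
with tempfile.TemporaryDirectory() as scratch_dir:
    uploader.extract_and_upload_archive(
        archive="/tmp/listenbrainz-spark-dump-123-incremental.tar",
        local_dir=scratch_dir,
        hdfs_dir="/data/listenbrainz",
        cleanup_on_failure=True,
    )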
Example #4
    def upload_archive(self,
                       tmp_dump_dir,
                       tar,
                       dest_path,
                       schema,
                       callback=None,
                       overwrite=False):
        """ Upload data dump to HDFS.

            Args:
                tmp_dump_dir (str): Path to temporary directory to upload JSON.
                tar: Uncompressed tar object.
                dest_path (str): HDFS path to upload data dump.
                schema: Schema of parquet to be uploaded.
                callback: Function to process JSON files.
                overwrite: If True, overwrite the data at dest_path instead of merging with it.
        """
        if callback is None:
            raise NotImplementedError(
                'Callback to process JSON missing. Aborting...')

        # Delete TEMP_DIR_PATH if it exists
        if utils.path_exists(TEMP_DIR_PATH):
            utils.delete_dir(TEMP_DIR_PATH, recursive=True)

        # Copy data from dest_path to TEMP_DIR_PATH to be merged with new data
        if not overwrite and utils.path_exists(dest_path):
            t0 = time.monotonic()
            logger.info("Copying old listens into '{}'".format(TEMP_DIR_PATH))
            utils.copy(dest_path, TEMP_DIR_PATH, overwrite=True)
            logger.info("Done! Time taken: {:.2f}".format(time.monotonic() -
                                                          t0))

        logger.info("Uploading listens to temporary directory in HDFS...")
        total_files = 0
        total_time = 0.0
        for member in tar:
            if member.isfile() and self._is_json_file(member.name):
                logger.info("Uploading {}...".format(member.name))
                t0 = time.monotonic()

                try:
                    tar.extract(member)
                except TarError as err:
                    # Cleanup
                    if utils.path_exists(TEMP_DIR_PATH):
                        utils.delete_dir(TEMP_DIR_PATH, recursive=True)
                    if utils.path_exists(tmp_dump_dir):
                        utils.delete_dir(tmp_dump_dir, recursive=True)
                    raise DumpInvalidException(
                        "{} while extracting {}, aborting import".format(
                            type(err).__name__, member.name))

                tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
                utils.upload_to_HDFS(tmp_hdfs_path, member.name)
                callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path,
                         not overwrite, schema)
                utils.delete_dir(tmp_hdfs_path, recursive=True)
                os.remove(member.name)
                time_taken = time.monotonic() - t0
                total_files += 1
                total_time += time_taken
                logger.info(
                    "Done! Current file processed in {:.2f} sec".format(
                        time_taken))
        if total_files:
            logger.info("Done! Total files processed: {}. Average time taken: "
                        "{:.2f} sec".format(total_files, total_time / total_files))
        else:
            logger.info("Done! No JSON files found in the archive.")

        # Delete dest_path if present
        if utils.path_exists(dest_path):
            logger.info('Removing {} from HDFS...'.format(dest_path))
            utils.delete_dir(dest_path, recursive=True)
            logger.info('Done!')

        logger.info("Moving the processed files to {}".format(dest_path))
        t0 = time.monotonic()

        # Check if parent directory exists, if not create a directory
        dest_path_parent = pathlib.Path(dest_path).parent
        if not utils.path_exists(dest_path_parent):
            utils.create_dir(dest_path_parent)

        utils.rename(TEMP_DIR_PATH, dest_path)
        logger.info("Done! Time taken: {:.2f} sec".format(time.monotonic() - t0))

        # Cleanup
        utils.delete_dir(tmp_dump_dir, recursive=True)
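
A sketch of a caller, including a callback with the five-argument signature this method actually invokes; uploader, listens_schema, and the paths are assumptions:

import tarfile

# Hedged usage sketch: process_json mirrors the callback signature used in
# upload_archive above; uploader and listens_schema are hypothetical.
def process_json(file_name, local_dir, tmp_hdfs_path, append, schema):
    # Hypothetical processor: read the extracted JSON from tmp_hdfs_path,
    # convert it to parquet using `schema`, and merge it into local_dir
    # (TEMP_DIR_PATH) when `append` is True.
    ...

with tarfile.open("/tmp/listenbrainz-listens-dump.tar", mode="r") as tar:
    uploader.upload_archive(
        tmp_dump_dir="/tmp/listens_dump",
        tar=tar,
        dest_path="/data/listenbrainz/listens",
        schema=listens_schema,
        callback=process_json,
        overwrite=False,
    )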