class FileManagerTest(unittest.TestCase):
    """Checks FileManager behaviour with the fake file manager plugged in.

    The class-level fixtures point the CUSTOMIZED_FILE_MANAGER environment
    variable at testing.fake_file_manager:FakeFileManager for the lifetime of
    the test case and remove it again afterwards.
    """

    @classmethod
    def setUpClass(cls):
        # Must be set before any FileManager instance is created in setUp.
        fake_manager = 'testing.fake_file_manager:FakeFileManager'
        os.environ['CUSTOMIZED_FILE_MANAGER'] = fake_manager

    @classmethod
    def tearDownClass(cls):
        os.environ.pop('CUSTOMIZED_FILE_MANAGER')

    def setUp(self):
        self._fm = FileManager()

    def test_can_handle(self):
        """fake:// and local paths are accepted, hdfs:// is rejected."""
        fm = self._fm
        self.assertTrue(fm.can_handle('fake://123'))
        # Falls back to default manager
        self.assertTrue(fm.can_handle('/data/123'))
        self.assertFalse(fm.can_handle('hdfs:///123'))

    def test_ls(self):
        """Listing a fake directory yields the single expected entry."""
        expected = [{'path': 'fake://data/f1.txt', 'size': 0}]
        listing = self._fm.ls('fake://data')
        self.assertEqual(listing, expected)

    def test_move(self):
        """move() reports success/failure and raises for unhandled paths."""
        fm = self._fm
        moved = fm.move('fake://move/123', 'fake://move/234')
        self.assertTrue(moved)
        not_moved = fm.move('fake://do_not_move/123', 'fake://move/234')
        self.assertFalse(not_moved)
        # No file manager can handle this
        with self.assertRaises(RuntimeError):
            fm.move('hdfs://123', 'fake://abc')

    def test_remove(self):
        """remove() reports success/failure and raises for unhandled paths."""
        fm = self._fm
        self.assertTrue(fm.remove('fake://remove/123'))
        self.assertFalse(fm.remove('fake://do_not_remove/123'))
        # No file manager can handle this
        with self.assertRaises(RuntimeError):
            fm.remove('hdfs://123')

    def test_copy(self):
        """copy() reports success/failure and raises for unhandled paths."""
        fm = self._fm
        copied = fm.copy('fake://copy/123', 'fake://copy/234')
        self.assertTrue(copied)
        not_copied = fm.copy('fake://do_not_copy/123', 'fake://copy/234')
        self.assertFalse(not_copied)
        # No file manager can handle this
        with self.assertRaises(RuntimeError):
            fm.copy('hdfs://123', 'fake://abc')

    def test_mkdir(self):
        """mkdir() reports success/failure and raises for unhandled paths."""
        fm = self._fm
        self.assertTrue(fm.mkdir('fake://mkdir/123'))
        self.assertFalse(fm.mkdir('fake://do_not_mkdir/123'))
        # No file manager can handle this
        with self.assertRaises(RuntimeError):
            fm.mkdir('hdfs:///123')
# Example #2
# 0
class ImportHandler(object):
    """Schedules data batch imports and runs them on a thread pool.

    Batch ids are queued with `schedule_to_handle` and submitted to the
    executor by `handle`. Each batch is processed by `_import_batch`, which
    copies or moves every file of the batch through the file manager and
    stores the resulting per-file state back on the batch.
    """

    def __init__(self):
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to a single CPU instead of failing on `None * 3`.
        self._executor = ThreadPoolExecutor(
            max_workers=(os.cpu_count() or 1) * 3)
        self._file_manager = FileManager()
        self._pending_imports = set()
        self._running_imports = set()
        # Guards _running_imports, which is touched from worker threads.
        self._import_lock = threading.Lock()
        self._app = None

    def __del__(self):
        # __init__ may have failed before the executor was created.
        executor = getattr(self, '_executor', None)
        if executor is not None:
            executor.shutdown()

    def init(self, app):
        """Stores the application whose context is used by the workers."""
        self._app = app

    def schedule_to_handle(self, dataset_batch_ids):
        """Queues one batch id (int) or an iterable of batch ids."""
        if isinstance(dataset_batch_ids, int):
            dataset_batch_ids = [dataset_batch_ids]
        self._pending_imports.update(dataset_batch_ids)

    def _copy_file(self, source_path, destination_path,
                   move=False, num_retry=3):
        """Copies (or moves) one file, retrying on failure.

        Args:
            source_path: path of the file to import.
            destination_path: path the file is copied/moved to.
            move: moves the file instead of copying it when true.
            num_retry: maximum number of attempts.

        Returns:
            A dataset_pb2.File carrying both paths. Its state is COMPLETED
            with `size` filled in on success, otherwise FAILED with
            `error_message` set to the last recorded error.
        """
        logging.info('%s from %s to %s',
                     'moving' if move else 'copying',
                     source_path,
                     destination_path)
        # Creates parent folders if needed
        parent_folder = os.path.dirname(destination_path)
        self._file_manager.mkdir(parent_folder)
        transfer = (self._file_manager.move
                    if move else self._file_manager.copy)
        success = False
        error_message = ''
        for _ in range(num_retry):
            try:
                success = transfer(source_path, destination_path)
                if success:
                    break
                # The file manager reported failure without raising.
                error_message = 'Unknown error'
            except Exception as e:  # pylint: disable=broad-except
                logging.error(
                    'Error occurred when importing file from %s to %s',
                    source_path,
                    destination_path)
                error_message = str(e)
        file = dataset_pb2.File(
            source_path=source_path,
            destination_path=destination_path
        )
        if not success:
            file.error_message = error_message
            file.state = dataset_pb2.File.State.FAILED
        else:
            # NOTE(review): assumes ls() entries expose a `.size` attribute
            # (not a dict key) -- confirm against FileManager.ls.
            file.size = self._file_manager.ls(
                destination_path)[0].size
            file.state = dataset_pb2.File.State.COMPLETED
        return file

    def _import_batch(self, batch_id):
        """Imports every not-yet-processed file of one batch.

        Does nothing when the batch is already being imported by another
        worker. The batch id is always removed from the running set when
        the import ends, whether it succeeded or raised.
        """
        # `with` guarantees the lock is released on the early return too.
        with self._import_lock:
            if batch_id in self._running_imports:
                return
            self._running_imports.add(batch_id)

        try:
            # App context is required to make the db session work; using it
            # as a context manager pops it again when the import is done.
            with self._app.app_context():
                logging.info('Importing batch %d', batch_id)
                batch = DataBatch.query.get(batch_id)
                if batch is None:
                    logging.error('Batch %d not found', batch_id)
                    return
                batch.state = BatchState.IMPORTING
                db.session.commit()
                db.session.refresh(batch)
                details = batch.get_details()

                for file in details.files:
                    if file.state != dataset_pb2.File.State.UNSPECIFIED:
                        continue
                    # Recovers the state: an existing destination is
                    # treated as an already completed import.
                    if self._file_manager.ls(file.destination_path):
                        file.state = dataset_pb2.File.State.COMPLETED
                        continue
                    # Moves/Copies
                    file.MergeFrom(self._copy_file(
                        source_path=file.source_path,
                        destination_path=file.destination_path,
                        move=batch.move))

                batch.set_details(details)
                db.session.commit()
        finally:
            with self._import_lock:
                self._running_imports.discard(batch_id)

    def handle(self, pull=False):
        """Handles all the batches in the queue or all batches which
        should be imported.

        Args:
            pull: when true, additionally picks up batches in NEW or
                IMPORTING state whose `updated_at` is older than one hour.
        """
        batches_to_run = self._pending_imports
        self._pending_imports = set()
        if pull:
            # TODO: should separate pull logic to a cron job,
            # otherwise there will be a race condition that two handlers
            # are trying to move the same batch
            one_hour_ago = datetime.utcnow() - timedelta(hours=1)
            pulled_batches = db.session.query(DataBatch.id).filter(
                    (DataBatch.state == BatchState.NEW) |
                    (DataBatch.state == BatchState.IMPORTING))\
                .filter(DataBatch.updated_at < one_hour_ago)\
                .all()
            # Each row is a 1-tuple holding the batch id.
            pulled_ids = [bid for bid, in pulled_batches]
            batches_to_run.update(pulled_ids)

        for batch_id in batches_to_run:
            self._executor.submit(self._import_batch, batch_id)
# Example #3
# 0
class SparkAppService(object):
    """Uploads sparkapp files and manages sparkapplications through k8s.

    Files of each sparkapp live under `<UPLOAD_PATH>/sparkapp/<name>` and
    are accessed through a FileManager.
    """

    def __init__(self) -> None:
        self._base_dir = os.path.join(UPLOAD_PATH, 'sparkapp')
        self._file_client = FileManager()

        self._file_client.mkdir(self._base_dir)

    def _clear_and_make_an_empty_dir(self, dir_name: str):
        """Removes `dir_name` (best effort) and creates it again.

        A failing removal is only logged; the directory is created in
        either case.
        """
        try:
            self._file_client.remove(dir_name)
        except Exception as err:  # pylint: disable=broad-except
            logging.error('failed to remove %s with exception %s', dir_name,
                          err)
        finally:
            self._file_client.mkdir(dir_name)

    def _get_sparkapp_upload_path(self, name: str) -> Tuple[bool, str]:
        """get upload path for specific sparkapp

        Args:
            name (str): sparkapp name

        Returns:
            Tuple[bool, str]:
                bool: True if this directory already exists
                str:  upload path for this sparkapp

        """
        sparkapp_path = os.path.join(self._base_dir, name)
        try:
            self._file_client.ls(sparkapp_path)
        except ValueError:
            # A ValueError from ls is treated as "path does not exist".
            return False, sparkapp_path

        return True, sparkapp_path

    def _copy_files_to_target_filesystem(self, source_filesystem_path: str,
                                         target_filesystem_path: str) -> bool:
        """ copy files to remote filesystem
            - untar if file is tared
            - copy files to remote filesystem

        Args:
            source_filesystem_path (str): local filesystem
            target_filesystem_path (str): remote filesystem

        Returns:
            bool: always True; the results of the individual copy/mkdir
                calls are not checked
        """
        temp_path = source_filesystem_path
        if '.tar' in source_filesystem_path:
            # Extract into a `tmp` directory next to the tar file.
            # os.makedirs raises FileExistsError if it already exists.
            temp_path = os.path.abspath(
                os.path.join(source_filesystem_path, '../tmp'))
            os.makedirs(temp_path)
            TarCli.untar_file(source_filesystem_path, temp_path)

        for root, dirs, files in os.walk(temp_path):
            relative_path = os.path.relpath(root, temp_path)
            # relpath yields '.' for the walk root; skip it so top-level
            # entries do not get a 'target/./name' remote path.
            if relative_path == os.curdir:
                remote_root = target_filesystem_path
            else:
                remote_root = os.path.join(target_filesystem_path,
                                           relative_path)
            for f in files:
                self._file_client.copy(os.path.join(root, f),
                                       os.path.join(remote_root, f))
            # Sub-directories are created before os.walk descends into
            # them, so they exist when their files are copied.
            for d in dirs:
                self._file_client.mkdir(os.path.join(remote_root, d))

        return True

    def submit_sparkapp(self, config: SparkAppConfig) -> SparkAppInfo:
        """submit sparkapp

        When `config.files_path` is None, the bytes in `config.files` are
        written to a temporary tar file and uploaded to a freshly emptied
        upload directory of this sparkapp.

        Args:
            config (SparkAppConfig): sparkapp config

        Raises:
            InternalException: if fail to get sparkapp

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        sparkapp_path = config.files_path
        if config.files_path is None:
            _, sparkapp_path = self._get_sparkapp_upload_path(config.name)
            self._clear_and_make_an_empty_dir(sparkapp_path)

            with tempfile.TemporaryDirectory() as temp_dir:
                tar_path = os.path.join(temp_dir, 'files.tar')
                with open(tar_path, 'wb') as fwrite:
                    fwrite.write(config.files)
                self._copy_files_to_target_filesystem(
                    source_filesystem_path=tar_path,
                    target_filesystem_path=sparkapp_path)

        config_dict = config.build_config(sparkapp_path)
        logging.info('submit sparkapp, config: %s', config_dict)
        resp = k8s_client.create_sparkapplication(config_dict)
        return SparkAppInfo.from_k8s_resp(resp)

    def get_sparkapp_info(self, name: str) -> SparkAppInfo:
        """ get sparkapp info

        Args:
            name (str): sparkapp name

        Raises:
            WebConsoleApiException

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        resp = k8s_client.get_sparkapplication(name)
        return SparkAppInfo.from_k8s_resp(resp)

    def delete_sparkapp(self, name: str) -> SparkAppInfo:
        """delete sparkapp
            - delete the uploaded files of the sparkapp, if any
            - delete sparkapp. If failed, raise exception

        NOTE(review): the files are removed before the sparkapplication is
        deleted, so they are gone even when the k8s deletion fails --
        confirm this order is intended.

        Args:
            name (str): sparkapp name

        Raises:
            WebConsoleApiException

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        existable, sparkapp_path = self._get_sparkapp_upload_path(name)
        if existable:
            self._file_client.remove(sparkapp_path)

        resp = k8s_client.delete_sparkapplication(name)
        return SparkAppInfo.from_k8s_resp(resp)