class FileManagerTest(unittest.TestCase):
    """Checks FileManager dispatching with a fake manager plugged in.

    The fake manager is registered through the CUSTOMIZED_FILE_MANAGER
    environment variable for the lifetime of this test class.
    """

    @classmethod
    def setUpClass(cls):
        # Register the fake manager before any FileManager is constructed.
        os.environ['CUSTOMIZED_FILE_MANAGER'] = (
            'testing.fake_file_manager:FakeFileManager')

    @classmethod
    def tearDownClass(cls):
        # Raises KeyError if the variable vanished, exactly like `del`.
        os.environ.pop('CUSTOMIZED_FILE_MANAGER')

    def setUp(self):
        self._fm = FileManager()

    def test_can_handle(self):
        fake_handled = self._fm.can_handle('fake://123')
        self.assertTrue(fake_handled)
        # Falls back to default manager
        local_handled = self._fm.can_handle('/data/123')
        self.assertTrue(local_handled)
        hdfs_handled = self._fm.can_handle('hdfs:///123')
        self.assertFalse(hdfs_handled)

    def test_ls(self):
        expected = [{'path': 'fake://data/f1.txt', 'size': 0}]
        self.assertEqual(self._fm.ls('fake://data'), expected)

    def test_move(self):
        moved = self._fm.move('fake://move/123', 'fake://move/234')
        self.assertTrue(moved)
        not_moved = self._fm.move('fake://do_not_move/123',
                                  'fake://move/234')
        self.assertFalse(not_moved)
        # No file manager can handle this
        with self.assertRaises(RuntimeError):
            self._fm.move('hdfs://123', 'fake://abc')

    def test_remove(self):
        removed = self._fm.remove('fake://remove/123')
        self.assertTrue(removed)
        not_removed = self._fm.remove('fake://do_not_remove/123')
        self.assertFalse(not_removed)
        # No file manager can handle this
        with self.assertRaises(RuntimeError):
            self._fm.remove('hdfs://123')

    def test_copy(self):
        copied = self._fm.copy('fake://copy/123', 'fake://copy/234')
        self.assertTrue(copied)
        not_copied = self._fm.copy('fake://do_not_copy/123',
                                   'fake://copy/234')
        self.assertFalse(not_copied)
        # No file manager can handle this
        with self.assertRaises(RuntimeError):
            self._fm.copy('hdfs://123', 'fake://abc')

    def test_mkdir(self):
        created = self._fm.mkdir('fake://mkdir/123')
        self.assertTrue(created)
        not_created = self._fm.mkdir('fake://do_not_mkdir/123')
        self.assertFalse(not_created)
        # No file manager can handle this
        with self.assertRaises(RuntimeError):
            self._fm.mkdir('hdfs:///123')
class ImportHandler(object):
    """Copies/moves the files of dataset batches on a thread pool.

    Batch ids are queued with `schedule_to_handle` and dispatched to the
    executor by `handle`. `init` must be called with the application object
    before any batch is imported, because the workers need an app context
    for the db session.
    """

    def __init__(self):
        # os.cpu_count() returns None when the count cannot be determined.
        self._executor = ThreadPoolExecutor(
            max_workers=(os.cpu_count() or 1) * 3)
        self._file_manager = FileManager()
        self._pending_imports = set()
        self._running_imports = set()
        # Guards _running_imports, which is touched from worker threads.
        self._import_lock = threading.Lock()
        self._app = None

    def __del__(self):
        # __init__ may have failed before the executor was created.
        executor = getattr(self, '_executor', None)
        if executor is not None:
            executor.shutdown()

    def init(self, app):
        """Stores the application whose context the workers will enter."""
        self._app = app

    def schedule_to_handle(self, dataset_batch_ids):
        """Queues one batch id (int) or an iterable of batch ids."""
        if isinstance(dataset_batch_ids, int):
            dataset_batch_ids = [dataset_batch_ids]
        self._pending_imports.update(dataset_batch_ids)

    def _copy_file(self, source_path, destination_path,
                   move=False, num_retry=3):
        """Copies (or moves) one file, retrying up to `num_retry` times.

        Args:
            source_path: path to read from.
            destination_path: path to write to; its parent is created.
            move: moves instead of copying when True.
            num_retry: maximum number of attempts.

        Returns:
            A dataset_pb2.File whose state is COMPLETED (with size filled)
            or FAILED (with error_message filled).
        """
        logging.info('%s from %s to %s',
                     'moving' if move else 'copying',
                     source_path,
                     destination_path)
        # Creates parent folders if needed
        parent_folder = os.path.dirname(destination_path)
        self._file_manager.mkdir(parent_folder)
        success = False
        error_message = ''
        for _ in range(num_retry):
            try:
                if move:
                    success = self._file_manager.move(
                        source_path, destination_path)
                else:
                    success = self._file_manager.copy(
                        source_path, destination_path)
                if not success:
                    error_message = 'Unknown error'
                else:
                    break
            except Exception as e:  # pylint: disable=broad-except
                logging.error(
                    'Error occurred when importing file from %s to %s',
                    source_path,
                    destination_path)
                error_message = str(e)
        file_info = dataset_pb2.File(
            source_path=source_path,
            destination_path=destination_path
        )
        if not success:
            file_info.error_message = error_message
            file_info.state = dataset_pb2.File.State.FAILED
        else:
            entry = self._file_manager.ls(destination_path)[0]
            # ls() entries are read via `.size` here, but a dict-shaped
            # entry ({'path': ..., 'size': ...}) is accepted as well.
            file_info.size = (entry['size'] if isinstance(entry, dict)
                              else entry.size)
            file_info.state = dataset_pb2.File.State.COMPLETED
        return file_info

    def _import_batch(self, batch_id):
        """Imports every not-yet-processed file of one batch.

        Returns immediately when the batch is already being imported.
        """
        # `with` releases the lock on the early return as well; a bare
        # acquire() followed by `return` would block all later imports.
        with self._import_lock:
            if batch_id in self._running_imports:
                return
            self._running_imports.add(batch_id)

        try:
            # Enters app context to make db session work; leaving the
            # `with` pops it so pooled threads do not accumulate contexts.
            with self._app.app_context():
                logging.info('Importing batch %d', batch_id)
                batch = DataBatch.query.get(batch_id)
                batch.state = BatchState.IMPORTING
                db.session.commit()
                db.session.refresh(batch)
                details = batch.get_details()

                for file in details.files:
                    if file.state == dataset_pb2.File.State.UNSPECIFIED:
                        # Recovers the state
                        destination_existed = len(
                            self._file_manager.ls(
                                file.destination_path)) > 0
                        if destination_existed:
                            file.state = dataset_pb2.File.State.COMPLETED
                            continue
                        # Moves/Copies
                        file.MergeFrom(self._copy_file(
                            source_path=file.source_path,
                            destination_path=file.destination_path,
                            move=batch.move))

                batch.set_details(details)
                db.session.commit()
        finally:
            # Always unmark the batch so that it can be retried after a
            # failure instead of being skipped forever.
            with self._import_lock:
                self._running_imports.discard(batch_id)

    def handle(self, pull=False):
        """Handles all the batches in the queue or all batches which
        should be imported.

        Args:
            pull: when True, also picks up NEW/IMPORTING batches that have
                not been updated for more than one hour.
        """
        batches_to_run = self._pending_imports
        self._pending_imports = set()
        if pull:
            # TODO: should separate pull logic to a cron job,
            # otherwise there will be a race condition that two handlers
            # are trying to move the same batch
            one_hour_ago = datetime.utcnow() - timedelta(hours=1)
            pulled_batches = db.session.query(DataBatch.id).filter(
                (DataBatch.state == BatchState.NEW) |
                (DataBatch.state == BatchState.IMPORTING))\
                .filter(DataBatch.updated_at < one_hour_ago)\
                .all()
            pulled_ids = [bid for bid, in pulled_batches]
            batches_to_run.update(pulled_ids)

        for batch in batches_to_run:
            self._executor.submit(self._import_batch, batch)
class SparkAppService(object):
    """Uploads sparkapp files and manages sparkapplications through k8s."""

    def __init__(self) -> None:
        self._base_dir = os.path.join(UPLOAD_PATH, 'sparkapp')
        self._file_client = FileManager()

        self._file_client.mkdir(self._base_dir)

    def _clear_and_make_an_empty_dir(self, dir_name: str):
        """Removes `dir_name` (best effort) and recreates it."""
        try:
            self._file_client.remove(dir_name)
        except Exception as err:  # pylint: disable=broad-except
            logging.error('failed to remove %s with exception %s', dir_name,
                          err)
        finally:
            # The directory is recreated even when the removal failed.
            self._file_client.mkdir(dir_name)

    def _get_sparkapp_upload_path(self, name: str) -> Tuple[bool, str]:
        """get upload path for specific sparkapp

        Args:
            name (str): sparkapp name

        Returns:
            Tuple[bool, str]:
                bool: True if this directory already exists
                str: upload path for this sparkapp
        """
        sparkapp_path = os.path.join(self._base_dir, name)
        try:
            self._file_client.ls(sparkapp_path)
        except ValueError:
            # A ValueError from ls() is treated as "path does not exist".
            return False, sparkapp_path
        return True, sparkapp_path

    def _copy_files_to_target_filesystem(self, source_filesystem_path: str,
                                         target_filesystem_path: str) -> bool:
        """ copy files to remote filesystem
            - untar if file is tared
            - copy files to remote filesystem

        Args:
            source_filesystem_path (str): local filesystem
            target_filesystem_path (str): remote filesystem

        Returns:
            bool: True only if every copy and mkdir reported success
        """
        temp_path = source_filesystem_path
        if '.tar' in source_filesystem_path:
            # Extract next to the archive, into a sibling `tmp` folder.
            temp_path = os.path.abspath(
                os.path.join(source_filesystem_path, '../tmp'))
            os.makedirs(temp_path)
            TarCli.untar_file(source_filesystem_path, temp_path)

        success = True
        for root, dirs, files in os.walk(temp_path):
            relative_path = os.path.relpath(root, temp_path)
            for f in files:
                file_path = os.path.join(root, f)
                remote_file_path = os.path.join(target_filesystem_path,
                                                relative_path, f)
                # Record failures instead of silently dropping the result.
                if not self._file_client.copy(file_path, remote_file_path):
                    logging.error('failed to copy %s to %s', file_path,
                                  remote_file_path)
                    success = False
            for d in dirs:
                remote_dir_path = os.path.join(target_filesystem_path,
                                               relative_path, d)
                if not self._file_client.mkdir(remote_dir_path):
                    logging.error('failed to mkdir %s', remote_dir_path)
                    success = False

        return success

    def submit_sparkapp(self, config: SparkAppConfig) -> SparkAppInfo:
        """submit sparkapp

        Args:
            config (SparkAppConfig): sparkapp config

        Raises:
            InternalException: if fail to get sparkapp

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        sparkapp_path = config.files_path
        if config.files_path is None:
            # No pre-uploaded path was given: upload the bundled files into
            # a fresh per-sparkapp directory.
            _, sparkapp_path = self._get_sparkapp_upload_path(config.name)
            self._clear_and_make_an_empty_dir(sparkapp_path)

            with tempfile.TemporaryDirectory() as temp_dir:
                tar_path = os.path.join(temp_dir, 'files.tar')
                with open(tar_path, 'wb') as fwrite:
                    fwrite.write(config.files)
                self._copy_files_to_target_filesystem(
                    source_filesystem_path=tar_path,
                    target_filesystem_path=sparkapp_path)

        config_dict = config.build_config(sparkapp_path)
        logging.info('submit sparkapp, config: %s', config_dict)
        resp = k8s_client.create_sparkapplication(config_dict)
        return SparkAppInfo.from_k8s_resp(resp)

    def get_sparkapp_info(self, name: str) -> SparkAppInfo:
        """ get sparkapp info

        Args:
            name (str): sparkapp name

        Raises:
            WebConsoleApiException

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        resp = k8s_client.get_sparkapplication(name)
        return SparkAppInfo.from_k8s_resp(resp)

    def delete_sparkapp(self, name: str) -> SparkAppInfo:
        """delete sparkapp
            - delete the uploaded files if they exist
            - delete sparkapp. If failed, raise exception

        Args:
            name (str): sparkapp name

        Raises:
            WebConsoleApiException

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        existable, sparkapp_path = self._get_sparkapp_upload_path(name)
        if existable:
            self._file_client.remove(sparkapp_path)

        resp = k8s_client.delete_sparkapplication(name)
        return SparkAppInfo.from_k8s_resp(resp)