async def run(self, server):
    # Connect to dashboard.
    self._stub = await self._connect_to_dashboard()
    # Start monitor task.
    self._monitor = monitor_events(
        self._event_dir,
        lambda data: create_task(self._cached_events.put(data)),
        source_types=event_consts.EVENT_AGENT_MONITOR_SOURCE_TYPES)
    # Start reporting events.
    await self.report_events()
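
# --- Example (not part of the agent) ----------------------------------------
# run() wires monitor_events() into a producer/consumer pair: the lambda
# callback enqueues each batch of event lines, and report_events() drains the
# queue. Below is a minimal, self-contained sketch of that pattern in plain
# asyncio; the names here are illustrative, not the agent's API.
import asyncio


async def _sketch_event_queue():
    cached_events = asyncio.Queue()

    def on_new_events(lines):
        # Mirrors the lambda in run(): schedule the put() as a task so the
        # callback returns immediately even if the queue is bounded.
        asyncio.create_task(cached_events.put(lines))

    on_new_events(["event-1", "event-2"])
    # A reporter (report_events() in the agent) would loop on get() and
    # forward each batch to the dashboard over the gRPC stub.
    batch = await cached_events.get()
    assert batch == ["event-1", "event-2"]

# asyncio.run(_sketch_event_queue())
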
def monitor_events(
        event_dir,
        callback,
        scan_interval_seconds=event_consts.SCAN_EVENT_DIR_INTERVAL_SECONDS,
        # Note: this default is evaluated once, at import time.
        start_mtime=time.time() + event_consts.SCAN_EVENT_START_OFFSET_SECONDS,
        monitor_files=None,
        source_types=None,
):
    """Monitor event logs in a directory.

    New events are read and passed to the callback.

    Args:
        event_dir (str): The event log directory.
        callback (Callable[[List[str]], None]): A callback that accepts a
            list of event strings.
        scan_interval_seconds (float): The interval in seconds between two
            scans.
        start_mtime (float): Only event log files whose last modification
            time is greater than start_mtime are monitored.
        monitor_files (Dict[int, MonitorFile]): A map from event log file id
            to MonitorFile object. If None, monitor all files from the
            beginning.
        source_types (List[str]): A list of source type names from
            event_pb2.Event.SourceType.keys(). If None, monitor all source
            types.
    """
    loop = asyncio.get_event_loop()
    if monitor_files is None:
        monitor_files = {}

    logger.info(
        "Monitoring event logs modified after %s on %s, "
        "the source types are %s.", start_mtime, event_dir,
        "all" if source_types is None else source_types)

    MonitorFile = collections.namedtuple("MonitorFile",
                                         ["size", "mtime", "position"])

    def _source_file_filter(source_file):
        stat = os.stat(source_file)
        return stat.st_mtime > start_mtime

    def _read_monitor_file(file, pos):
        assert isinstance(
            file, str), f"File should be a str, but got {type(file)}({file})"
        fd = os.open(file, os.O_RDONLY)
        try:
            stat = os.stat(fd)
            # Check the file size to avoid raising the exception
            # ValueError: cannot mmap an empty file
            if stat.st_size <= 0:
                return []
            fid = stat.st_ino or file
            monitor_file = monitor_files.get(fid)
            if monitor_file:
                if (monitor_file.position == monitor_file.size
                        and monitor_file.size == stat.st_size
                        and monitor_file.mtime == stat.st_mtime):
                    logger.debug(
                        "Skip reading the file because "
                        "there is no change: %s", file)
                    return []
                position = monitor_file.position
            else:
                logger.info("Found new event log file: %s", file)
                position = pos
            # Close the fd in finally.
            r = _read_file(fd, position, closefd=False)
            # It should be fine to update the dict in executor thread.
            monitor_files[r.fid] = MonitorFile(r.size, r.mtime, r.position)
            loop.call_soon_threadsafe(callback, r.lines)
        except Exception as e:
            raise Exception(f"Read event file failed: {file}") from e
        finally:
            os.close(fd)

    @async_loop_forever(scan_interval_seconds, cancellable=True)
    async def _scan_event_log_files():
        # Scan event files.
        source_files = await loop.run_in_executor(None, _get_source_files,
                                                  event_dir, source_types,
                                                  _source_file_filter)

        # Limit concurrent read to avoid fd exhaustion.
        semaphore = asyncio.Semaphore(event_consts.CONCURRENT_READ_LIMIT)

        async def _concurrent_coro(filename):
            async with semaphore:
                return await loop.run_in_executor(None, _read_monitor_file,
                                                  filename, 0)

        # Read files.
        await asyncio.gather(*[
            _concurrent_coro(filename)
            for filename in list(itertools.chain(*source_files.values()))
        ])

    return create_task(_scan_event_log_files())
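
# --- Example (not part of the module) ---------------------------------------
# Hypothetical usage of monitor_events(): start the scan task inside a running
# event loop and cancel it on shutdown. The directory layout and the "GCS"
# source type below are assumptions for illustration.
async def _sketch_monitor_usage():
    def on_new_events(lines):
        for line in lines:
            print("new event:", line)

    task = monitor_events(
        "/tmp/ray/session_latest/logs/events",  # assumed event_dir
        on_new_events,
        source_types=["GCS"])  # assumes "GCS" is a SourceType key
    try:
        await asyncio.sleep(60)  # let a few scan intervals elapse
    finally:
        # The scan loop is cancellable
        # (@async_loop_forever(..., cancellable=True)).
        task.cancel()
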
async def InitializeJobEnv(self, request, context):
    # TODO(fyrestone): Handle duplicated InitializeJobEnv requests
    # when initializing job environment.
    # TODO(fyrestone): Support reinitialize job environment.
    # TODO(fyrestone): Use job id instead of unique id.
    unique_id = secrets.token_hex(6)

    # Parse the job description from the request.
    try:
        job_description_data = json.loads(request.job_description)
        job_info = JobInfo(
            unique_id=unique_id,
            temp_dir=self._dashboard_agent.temp_dir,
            log_dir=self._dashboard_agent.log_dir,
            **job_description_data)
    except json.JSONDecodeError as ex:
        error_message = str(ex)
        error_message += f", job_payload:\n{request.job_description}"
        logger.error("[%s] Initialize job environment failed, %s.", unique_id,
                     error_message)
        return job_agent_pb2.InitializeJobEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
            error_message=error_message)
    except Exception as ex:
        logger.exception(ex)
        return job_agent_pb2.InitializeJobEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
            error_message=traceback.format_exc())

    async def _initialize_job_env():
        os.makedirs(
            job_consts.JOB_DIR.format(
                temp_dir=job_info.temp_dir, unique_id=unique_id),
            exist_ok=True)
        # Download the job package.
        await DownloadPackage(job_info,
                              self._dashboard_agent.http_session).run()
        # Start the driver.
        logger.info("[%s] Starting driver.", unique_id)
        language = job_info.language
        if language == job_consts.PYTHON:
            driver = await StartPythonDriver(
                job_info, self._dashboard_agent.redis_address,
                self._dashboard_agent.redis_password).run()
        else:
            raise Exception(f"Unsupported language type: {language}")
        job_info.driver = driver

    initialize_task = create_task(_initialize_job_env())
    try:
        await initialize_task
    except asyncio.CancelledError:
        logger.error("[%s] Initialize job environment has been cancelled.",
                     unique_id)
        return job_agent_pb2.InitializeJobEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
            error_message="InitializeJobEnv has been cancelled, "
            "did you call CleanJobEnv?")
    except Exception as ex:
        logger.exception(ex)
        return job_agent_pb2.InitializeJobEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
            error_message=traceback.format_exc())

    driver_pid = 0
    if job_info.driver:
        driver_pid = job_info.driver.pid
    logger.info(
        "[%s] Job environment initialized, "
        "the driver (pid=%s) started.", unique_id, driver_pid)
    return job_agent_pb2.InitializeJobEnvReply(
        status=agent_manager_pb2.AGENT_RPC_STATUS_OK, driver_pid=driver_pid)
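
# --- Example (not part of the servicer) --------------------------------------
# InitializeJobEnv runs the setup inside create_task() so that a concurrent
# CleanJobEnv can cancel it; the CancelledError branch then returns
# AGENT_RPC_STATUS_FAILED instead of leaving the RPC hanging. A minimal
# sketch of that cancellation pattern in plain asyncio, with no gRPC:
import asyncio


async def _sketch_cancellable_init():
    async def initialize():
        await asyncio.sleep(3600)  # stands in for download + driver start

    task = asyncio.create_task(initialize())
    task.cancel()  # what a concurrent CleanJobEnv would trigger
    try:
        await task
    except asyncio.CancelledError:
        return "FAILED: InitializeJobEnv has been cancelled"
    return "OK"

# asyncio.run(_sketch_cancellable_init())  # -> "FAILED: ..."
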