def __init__(self, config=None, executors=None, lazy_fail=True,
             rundir=None, fail_retries=2):
    """Initialize the DataFlowKernel.

    Please note that keyword args passed to the DFK here will always
    override options passed in via the config.

    KWargs:
        config (dict): A single data object encapsulating all config attributes.
        executors (list of Executor objects): Optional, kept for (somewhat)
            backward compatibility with 0.2.0.
        lazy_fail (bool): Default=True, determines failure behavior.
        rundir (str): Path to the run directory. Defaults to ./runinfo/runNNN.
        fail_retries (int): Default=2, the number of retry attempts in case of failure.

    Returns:
        DataFlowKernel object
    """
    # Create the run directory for this run
    self.rundir = make_rundir(config=config, path=rundir)

    # Update config with defaults
    self._config = update_config(config, self.rundir)

    # Start the anonymized usage tracker and send the init message
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    if self._config:
        self._executors_managed = True
        # Create the executors
        epf = EPF()
        self.executors = epf.make(self.rundir, self._config)

        # Set global vars from config
        self.lazy_fail = self._config["globals"].get("lazyFail", lazy_fail)
        self.fail_retries = self._config["globals"].get("fail_retries", fail_retries)
        self.flowcontrol = FlowControl(self, self._config)
    else:
        self._executors_managed = False
        self.fail_retries = fail_retries
        self.lazy_fail = lazy_fail
        self.executors = {i: x for i, x in enumerate(executors)}
        self.flowcontrol = FlowNoControl(self, None)

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}

    logger.debug("Using executors: {0}".format(self.executors))
    atexit.register(self.cleanup)
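A minimal usage sketch of this legacy keyword path. The `ThreadPoolExecutor` import path and its `max_workers` argument are hypothetical stand-ins for a 0.2-era executor class, not taken from the code above; passing an explicit executor list (and no config) drives the `else` branch, so the executors run unmanaged under `FlowNoControl`.

from parsl.executors.threads import ThreadPoolExecutor  # assumed import path

workers = ThreadPoolExecutor(max_workers=4)  # hypothetical constructor signature
dfk = DataFlowKernel(executors=[workers], lazy_fail=True, fail_retries=2)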
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details
        see the :class:`~parsl.config.Config` documentation.
    """
    # This is used to ensure that cleanup happens only once
    self.cleanup_called = False
    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
    self._config = config
    self.run_dir = make_rundir(config.run_dir)
    parsl.set_file_logger("{}/parsl.log".format(self.run_dir), level=logging.DEBUG)
    logger.debug("Starting DataFlowKernel with config\n{}".format(config))
    logger.info("Parsl version: {}".format(get_version()))

    self.checkpoint_lock = threading.Lock()

    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # ES logging
    self.tasks_completed_count = 0
    self.tasks_failed_count = 0
    self.monitoring_config = config.monitoring_config
    if self.monitoring_config is not None \
            and self.monitoring_config.database_type == 'local_database' \
            and self.monitoring_config.eng_link is None:
        # Use the run_dir as the default database location
        logger.info('Local monitoring database can be found inside the run_dir at: {}'.format(self.run_dir))
        self.monitoring_config.eng_link = "sqlite:///{}".format(
            os.path.join(os.path.abspath(self.run_dir), 'monitoring.db'))
    if self.monitoring_config is None:
        self.db_logger = get_db_logger()
    else:
        self.db_logger = get_db_logger(monitoring_config=self.monitoring_config)

    self.workflow_name = None
    if self.monitoring_config is not None and self.monitoring_config.workflow_name is not None:
        self.workflow_name = self.monitoring_config.workflow_name
    else:
        for frame in inspect.stack():
            fname = os.path.basename(str(frame.filename))
            parsl_file_names = ['dflow.py']
            # Find the first file name that is not considered a parsl file
            if fname not in parsl_file_names:
                self.workflow_name = fname
                break

    self.workflow_version = None
    if self.monitoring_config is not None and self.monitoring_config.version is not None:
        self.workflow_version = self.monitoring_config.version

    self.time_began = time.time()
    self.time_completed = None
    self.run_id = str(uuid4())
    self.dashboard = self.monitoring_config.dashboard_link if self.monitoring_config is not None else None
    # TODO: make configurable
    logger.info("Run id is: " + self.run_id)
    if self.dashboard is not None:
        logger.info("Dashboard is found at " + self.dashboard)

    # Start the tornado logging server
    if self.monitoring_config is not None and self.monitoring_config.database_type == 'local_database':
        self.logging_server = multiprocessing.Process(
            target=logging_server.run,
            kwargs={'monitoring_config': self.monitoring_config})
        self.logging_server.start()
        self.web_app = multiprocessing.Process(
            target=index.run,
            kwargs={'monitoring_config': self.monitoring_config})
        self.web_app.start()
    else:
        self.logging_server = None
        self.web_app = None

    workflow_info = {
        'python_version': sys.version_info,
        'parsl_version': get_version(),
        'time_began': str(self.time_began),
        'time_completed': str(None),
        'run_id': self.run_id,
        'workflow_name': self.workflow_name,
        'workflow_version': self.workflow_version,
        'rundir': self.run_dir,
        'tasks_completed_count': self.tasks_completed_count,
        'tasks_failed_count': self.tasks_failed_count,
        'user': getuser(),
        'host': gethostname(),
    }
    self.db_logger.info("DFK start", extra=workflow_info)
    # ES logging end

    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    data_manager = DataManager(max_threads=config.data_management_max_threads,
                               executors=config.executors)
    self.executors = {e.label: e for e in config.executors + [data_manager]}
    for executor in self.executors.values():
        executor.run_dir = self.run_dir
        if hasattr(executor, 'provider'):
            if hasattr(executor.provider, 'script_dir'):
                executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
                if executor.provider.channel.script_dir is None:
                    executor.provider.channel.script_dir = os.path.join(self.run_dir, 'submit_scripts')
                    if not executor.provider.channel.isdir(self.run_dir):
                        parent, child = pathlib.Path(self.run_dir).parts[-2:]
                        remote_run_dir = os.path.join(parent, child)
                        executor.provider.channel.script_dir = os.path.join(remote_run_dir, 'remote_submit_scripts')
                        executor.provider.script_dir = os.path.join(self.run_dir, 'local_submit_scripts')
                executor.provider.channel.makedirs(executor.provider.channel.script_dir, exist_ok=True)
                os.makedirs(executor.provider.script_dir, exist_ok=True)
        executor.start()

    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period)
        except Exception:
            logger.error("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60))

    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    self.task_count = 0
    self.tasks = {}
    self.submitter_lock = threading.Lock()

    atexit.register(self.atexit_cleanup)
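A minimal sketch of driving this Config-based constructor. The `ThreadPoolExecutor` import path and its `label` argument are assumptions inferred from the attributes this `__init__` relies on (`e.label`, `executor.start()`), not confirmed API for this exact version.

from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor  # assumed import path

config = Config(
    executors=[ThreadPoolExecutor(label='local_threads')],  # label keys the executors dict
    app_cache=True,
)
dfk = DataFlowKernel(config=config)  # raises ConfigurationError if given a plain dict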
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details
        see the :class:`~parsl.config.Config` documentation.
    """
    # This is used to ensure that cleanup happens only once
    self.cleanup_called = False
    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
    self._config = config
    self.run_dir = make_rundir(config.run_dir)

    if config.initialize_logging:
        parsl.set_file_logger("{}/parsl.log".format(self.run_dir), level=logging.DEBUG)

    logger.debug("Starting DataFlowKernel with config\n{}".format(config))

    if sys.version_info < (3, 6):
        logger.warning("Support for python versions < 3.6 is deprecated and will be removed after parsl 0.10")

    logger.info("Parsl version: {}".format(get_version()))

    self.checkpoint_lock = threading.Lock()

    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # Monitoring
    self.run_id = str(uuid4())
    self.tasks_completed_count = 0
    self.tasks_failed_count = 0
    self.tasks_dep_fail_count = 0

    self.monitoring = config.monitoring
    # Hub address and port for the interchange to connect to
    self.hub_address = None
    self.hub_interchange_port = None
    if self.monitoring:
        if self.monitoring.logdir is None:
            self.monitoring.logdir = self.run_dir
        self.hub_address = self.monitoring.hub_address
        self.hub_interchange_port = self.monitoring.start(self.run_id)

    self.time_began = datetime.datetime.now()
    self.time_completed = None

    # TODO: make configurable
    logger.info("Run id is: " + self.run_id)

    self.workflow_name = None
    if self.monitoring is not None and self.monitoring.workflow_name is not None:
        self.workflow_name = self.monitoring.workflow_name
    else:
        for frame in inspect.stack():
            fname = os.path.basename(str(frame.filename))
            parsl_file_names = ['dflow.py', 'typeguard.py']
            # Find the first file name that is not considered a parsl file
            if fname not in parsl_file_names:
                self.workflow_name = fname
                break

    self.workflow_version = str(self.time_began.replace(microsecond=0))
    if self.monitoring is not None and self.monitoring.workflow_version is not None:
        self.workflow_version = self.monitoring.workflow_version

    workflow_info = {
        'python_version': "{}.{}.{}".format(sys.version_info.major,
                                            sys.version_info.minor,
                                            sys.version_info.micro),
        'parsl_version': get_version(),
        'time_began': self.time_began,
        'time_completed': None,
        'workflow_duration': None,
        'run_id': self.run_id,
        'workflow_name': self.workflow_name,
        'workflow_version': self.workflow_version,
        'rundir': self.run_dir,
        'tasks_completed_count': self.tasks_completed_count,
        'tasks_failed_count': self.tasks_failed_count,
        'user': getuser(),
        'host': gethostname(),
    }

    if self.monitoring:
        self.monitoring.send(MessageType.WORKFLOW_INFO, workflow_info)

    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    self.data_manager = DataManager(self)
    self.executors = {}
    data_manager_executor = ThreadPoolExecutor(max_threads=config.data_management_max_threads,
                                               label='data_manager')
    self.add_executors(config.executors + [data_manager_executor])

    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
        except Exception:
            logger.error("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60), name="Checkpoint")

    # If we use the functionality of dynamically adding executors,
    # all executors should be managed.
    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    self.task_count = 0
    self.tasks = {}
    self.submitter_lock = threading.Lock()

    atexit.register(self.atexit_cleanup)
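A sketch of enabling periodic checkpointing against this version, assuming the same hypothetical `ThreadPoolExecutor` import as above; the `'00:30:00'` string is parsed by the HH:MM:SS logic in `__init__` into 1800 seconds for the checkpoint `Timer`.

from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor  # assumed import path

config = Config(
    executors=[ThreadPoolExecutor(label='local_threads')],
    checkpoint_mode='periodic',
    checkpoint_period='00:30:00',  # parsed as h*3600 + m*60 + s = 1800 seconds
)
dfk = DataFlowKernel(config=config)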
def __init__(self, config=None, executors=None, lazyErrors=True,
             appCache=True, rundir=None, retries=0, checkpointFiles=None,
             checkpointMode=None, data_manager=None):
    """Initialize the DataFlowKernel.

    Please note that keyword args passed to the DFK here will always
    override options passed in via the config.

    KWargs:
        - config (dict): A single data object encapsulating all config attributes.
        - executors (list of Executor objects): Optional, kept for (somewhat)
          backward compatibility with 0.2.0.
        - lazyErrors (bool): Default=True, allow workflow to continue on app failures.
        - appCache (bool): Enable caching of apps.
        - rundir (str): Path to the run directory. Defaults to ./runinfo/runNNN.
        - retries (int): Default=0, set the number of retry attempts in case of failure.
        - checkpointFiles (list of str): List of filepaths to checkpoint files.
        - checkpointMode (None, 'dfk_exit', 'task_exit', 'periodic'): Method to use.
        - data_manager (DataManager): User created DataManager.

    Returns:
        DataFlowKernel object
    """
    # Create the run directory for this run
    self.rundir = make_rundir(config=config, path=rundir)
    parsl.set_file_logger("{}/parsl.log".format(self.rundir), level=logging.DEBUG)

    logger.info("Parsl version: {}".format(parsl.__version__))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))

    # Update config with defaults
    self._config = update_config(config, self.rundir)

    # Set the data manager
    if data_manager:
        self.data_manager = data_manager
    else:
        self.data_manager = DataManager(config=self._config)

    # Start the anonymized usage tracker and send the init message
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # Load checkpoints, if any
    cpts = self.load_checkpoints(checkpointFiles)

    # Initialize the memoizer
    self.memoizer = Memoizer(self, memoize=appCache, checkpoint=cpts)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None

    if self._config:
        self._executors_managed = True
        # Create the executors
        epf = EPF()
        self.executors = epf.make(self.rundir, self._config)

        # Set global vars from config
        self.lazy_fail = self._config["globals"].get("lazyErrors", lazyErrors)
        self.fail_retries = self._config["globals"].get("retries", retries)
        self.flowcontrol = FlowControl(self, self._config)
        self.checkpoint_mode = self._config["globals"].get("checkpointMode",
                                                           checkpointMode)
        if self.checkpoint_mode == "periodic":
            period = self._config["globals"].get("checkpointPeriod", "00:30:00")
            try:
                h, m, s = map(int, period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period)
            except Exception:
                logger.error("invalid checkpointPeriod provided: {0} expected HH:MM:SS".format(period))
                self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60))
    else:
        self._executors_managed = False
        self.fail_retries = retries
        self.lazy_fail = lazyErrors
        self.executors = {i: x for i, x in enumerate(executors)}
        self.flowcontrol = FlowNoControl(self, None)
        self.checkpoint_mode = checkpointMode

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    logger.debug("Using executors: {0}".format(self.executors))
    atexit.register(self.cleanup)
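The checkpointPeriod handling above can be read as a small standalone helper. This is an illustrative sketch (the name `parse_checkpoint_period` is hypothetical) mirroring the parse-or-fall-back-to-30-minutes behavior of the `Timer` setup.

def parse_checkpoint_period(period, default=30 * 60):
    # Hypothetical helper mirroring the HH:MM:SS parsing in __init__ above.
    try:
        h, m, s = map(int, period.split(':'))
        return (h * 3600) + (m * 60) + s
    except (ValueError, AttributeError):
        # Malformed or missing input falls back to 30 minutes, as in __init__.
        return default

assert parse_checkpoint_period("00:30:00") == 1800
assert parse_checkpoint_period("01:00:30") == 3630
assert parse_checkpoint_period("not-a-period") == 1800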
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details
        see the :class:`~parsl.config.Config` documentation.
    """
    # This is used to ensure that cleanup happens only once
    self.cleanup_called = False
    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
    self._config = config
    logger.debug("Starting DataFlowKernel with config\n{}".format(config))
    self.run_dir = make_rundir(config.run_dir)
    parsl.set_file_logger("{}/parsl.log".format(self.run_dir), level=logging.DEBUG)
    logger.info("Parsl version: {}".format(get_version()))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))

    self.checkpoint_lock = threading.Lock()

    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    data_manager = DataManager.get_data_manager(
        max_threads=config.data_management_max_threads,
        executors=config.executors)
    self.executors = {e.label: e for e in config.executors + [data_manager]}
    for executor in self.executors.values():
        executor.run_dir = self.run_dir  # FIXME we should have a real interface for this
        executor.start()

    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period)
        except Exception:
            logger.error("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60))

    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    atexit.register(self.atexit_cleanup)
def __init__(self, config=None, executors=None, lazyErrors=True,
             appCache=True, rundir=None, retries=0, checkpointFiles=None):
    """Initialize the DataFlowKernel.

    Please note that keyword args passed to the DFK here will always
    override options passed in via the config.

    KWargs:
        - config (dict): A single data object encapsulating all config attributes.
        - executors (list of Executor objects): Optional, kept for (somewhat)
          backward compatibility with 0.2.0.
        - lazyErrors (bool): Default=True, allow workflow to continue on app failures.
        - appCache (bool): Enable caching of apps.
        - rundir (str): Path to the run directory. Defaults to ./runinfo/runNNN.
        - retries (int): Default=0, set the number of retry attempts in case of failure.
        - checkpointFiles (list of str): List of filepaths to checkpoint files.

    Returns:
        DataFlowKernel object
    """
    # Create the run directory for this run
    self.rundir = make_rundir(config=config, path=rundir)
    parsl.set_file_logger("{}/parsl.log".format(self.rundir), level=logging.INFO)

    logger.info("Parsl version: {}".format(parsl.__version__))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))

    # Update config with defaults
    self._config = update_config(config, self.rundir)

    # Start the anonymized usage tracker and send the init message
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # Load checkpoints, if any
    cpts = self.load_checkpoints(checkpointFiles)

    # Initialize the memoizer
    self.memoizer = Memoizer(self, memoize=appCache, checkpoint=cpts)

    if self._config:
        self._executors_managed = True
        # Create the executors
        epf = EPF()
        self.executors = epf.make(self.rundir, self._config)

        # Set global vars from config
        self.lazy_fail = self._config["globals"].get("lazyErrors", lazyErrors)
        self.fail_retries = self._config["globals"].get("retries", retries)
        self.flowcontrol = FlowControl(self, self._config)
    else:
        self._executors_managed = False
        self.fail_retries = retries
        self.lazy_fail = lazyErrors
        self.executors = {i: x for i, x in enumerate(executors)}
        self.flowcontrol = FlowNoControl(self, None)

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    logger.debug("Using executors: {0}".format(self.executors))
    atexit.register(self.cleanup)
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details
        see the :class:`~parsl.config.Config` documentation.
    """
    # This is used to ensure that cleanup happens only once
    self.cleanup_called = False
    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
    self._config = config
    logger.debug("Starting DataFlowKernel with config\n{}".format(config))
    self.run_dir = make_rundir(config.run_dir)
    parsl.set_file_logger("{}/parsl.log".format(self.run_dir), level=logging.DEBUG)
    logger.info("Parsl version: {}".format(get_version()))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))

    self.checkpoint_lock = threading.Lock()

    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # ES logging
    self.db_logger_config = config.db_logger_config
    self.db_logger = get_db_logger(enable_es_logging=False) \
        if self.db_logger_config is None \
        else get_db_logger(**self.db_logger_config)
    self.workflow_name = str(inspect.stack()[1][1])
    self.time_began = datetime.now()
    self.time_completed = None
    self.run_id = self.workflow_name + "-" + str(self.time_began.minute)
    self.dashboard = self.db_logger_config.get('dashboard_link', None) \
        if self.db_logger_config is not None else None
    # TODO: make configurable
    logger.info("Run id is: " + self.run_id)
    if self.dashboard is not None:
        logger.info("Dashboard is found at " + self.dashboard)
    self.db_logger.info("Python version: {}".format(sys.version_info))
    self.db_logger.info("Parsl version: {}".format(get_version()))
    self.db_logger.info("Libsubmit version: {}".format(libsubmit.__version__))
    self.db_logger.info(
        "DFK start",
        extra={
            'time_began': str(self.time_began.strftime('%Y-%m-%d %H:%M:%S')),
            'time_completed': str(self.time_completed),
            'task_run_id': self.run_id,
            'rundir': self.run_dir
        })
    self.db_logger.info("Name of script/workflow: " + self.run_id,
                        extra={'task_run_id': self.run_id})
    for executor in self._config.executors:
        self.db_logger.info("Listed executor: " + executor.label,
                            extra={'task_run_id': self.run_id})
    # ES logging end

    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    data_manager = DataManager.get_data_manager(
        max_threads=config.data_management_max_threads,
        executors=config.executors)
    self.executors = {e.label: e for e in config.executors + [data_manager]}
    for executor in self.executors.values():
        executor.run_dir = self.run_dir  # FIXME we should have a real interface for this
        executor.start()

    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period)
        except Exception:
            logger.error("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60))

    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    atexit.register(self.atexit_cleanup)
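A sketch of switching on the ES logging path in this version. `db_logger_config` is read as a plain dict by this `__init__` (note the `.get('dashboard_link')` and `**self.db_logger_config` calls), but the full set of keys `get_db_logger` accepts beyond `dashboard_link` is not shown above, and the URL and executor import here are hypothetical.

from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor  # assumed import path

config = Config(
    executors=[ThreadPoolExecutor(label='local_threads')],
    db_logger_config={'dashboard_link': 'http://localhost:8899'},  # hypothetical URL
)
dfk = DataFlowKernel(config=config)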