def __init__(self,
             vm_reference,
             init_blocks=1,
             min_blocks=0,
             max_blocks=10,
             parallelism=1,
             worker_init='',
             location='westus',
             group_name='parsl.auto',
             key_name=None,
             key_file=None,
             vnet_name="parsl.auto",
             linger=False,
             launcher=SingleNodeLauncher()):
    if not _api_enabled:
        raise OptionalModuleMissing(
            ['azure', 'msrestazure'],
            "Azure Provider requires the azure module.")

    self._label = 'azure'
    self.init_blocks = init_blocks
    self.min_blocks = min_blocks
    self.max_blocks = max_blocks
    self.max_nodes = max_blocks
    self.parallelism = parallelism
    self.nodes_per_block = 1

    self.worker_init = worker_init
    self.vm_reference = vm_reference
    self.region = location
    self.vnet_name = vnet_name

    self.key_name = key_name
    self.key_file = key_file
    self.location = location
    self.group_name = group_name

    self.launcher = launcher
    self.linger = linger
    self.resources = {}
    self.instances = []

    env_specified = (os.getenv("AZURE_CLIENT_ID") is not None
                     and os.getenv("AZURE_CLIENT_SECRET") is not None
                     and os.getenv("AZURE_TENANT_ID") is not None
                     and os.getenv("AZURE_SUBSCRIPTION_ID") is not None)

    if key_file is None and not env_specified:
        raise ConfigurationError(
            "Must specify either 'key_file', or the "
            "`AZURE_CLIENT_ID`, `AZURE_CLIENT_SECRET`, "
            "`AZURE_TENANT_ID`, and `AZURE_SUBSCRIPTION_ID` "
            "environment variables.")

    if key_file is None:
        self.clientid = os.getenv("AZURE_CLIENT_ID")
        self.clientsecret = os.getenv("AZURE_CLIENT_SECRET")
        self.tenantid = os.getenv("AZURE_TENANT_ID")
        self.subid = os.getenv("AZURE_SUBSCRIPTION_ID")
    else:
        with open(key_file) as fh:
            keys = json.load(fh)
            self.clientid = keys.get("AZURE_CLIENT_ID")
            self.clientsecret = keys.get("AZURE_CLIENT_SECRET")
            self.tenantid = keys.get("AZURE_TENANT_ID")
            self.subid = keys.get("AZURE_SUBSCRIPTION_ID")

    self.get_clients()
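
# A minimal usage sketch for the provider above, assuming it is exposed as
# `AzureProvider` in `parsl.providers`. The `vm_reference` keys shown here
# (publisher/offer/sku/version) are illustrative assumptions about the image
# specification, not a confirmed schema; the key file path is a placeholder.
from parsl.providers import AzureProvider
from parsl.launchers import SingleNodeLauncher

provider = AzureProvider(
    vm_reference={
        "publisher": "Canonical",
        "offer": "UbuntuServer",
        "sku": "16.04-LTS",
        "version": "latest",
    },
    location='westus',
    key_file='azure_keys.json',  # JSON file holding the four AZURE_* credentials
    init_blocks=1,
    max_blocks=2,
    launcher=SingleNodeLauncher(),
)
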
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details,
        see the :class:`~parsl.config.Config` documentation.
    """

    # This flag ensures that cleanup only happens once.
    self.cleanup_called = False

    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
    self._config = config
    self.run_dir = make_rundir(config.run_dir)
    parsl.set_file_logger("{}/parsl.log".format(self.run_dir), level=logging.DEBUG)
    logger.debug("Starting DataFlowKernel with config\n{}".format(config))
    logger.info("Parsl version: {}".format(get_version()))

    self.checkpoint_lock = threading.Lock()

    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # Monitoring
    self.run_id = str(uuid4())
    self.tasks_completed_count = 0
    self.tasks_failed_count = 0

    self.monitoring = config.monitoring
    # Hub address and port for the interchange to connect to.
    self.hub_address = None
    self.hub_interchange_port = None
    if self.monitoring:
        if self.monitoring.logdir is None:
            self.monitoring.logdir = self.run_dir
        self.hub_address = self.monitoring.hub_address
        self.hub_interchange_port = self.monitoring.start(self.run_id)

    self.time_began = datetime.datetime.now()
    self.time_completed = None

    # TODO: make configurable
    logger.info("Run id is: " + self.run_id)

    self.workflow_name = None
    if self.monitoring is not None and self.monitoring.workflow_name is not None:
        self.workflow_name = self.monitoring.workflow_name
    else:
        for frame in inspect.stack():
            fname = os.path.basename(str(frame.filename))
            parsl_file_names = ['dflow.py', 'typeguard.py']
            # Use the first file name that is not considered a parsl file.
            if fname not in parsl_file_names:
                self.workflow_name = fname
                break

    self.workflow_version = str(self.time_began.replace(microsecond=0))
    if self.monitoring is not None and self.monitoring.workflow_version is not None:
        self.workflow_version = self.monitoring.workflow_version

    workflow_info = {
        'python_version': "{}.{}.{}".format(sys.version_info.major,
                                            sys.version_info.minor,
                                            sys.version_info.micro),
        'parsl_version': get_version(),
        'time_began': self.time_began,
        'time_completed': None,
        'workflow_duration': None,
        'run_id': self.run_id,
        'workflow_name': self.workflow_name,
        'workflow_version': self.workflow_version,
        'rundir': self.run_dir,
        'tasks_completed_count': self.tasks_completed_count,
        'tasks_failed_count': self.tasks_failed_count,
        'user': getuser(),
        'host': gethostname(),
    }

    if self.monitoring:
        self.monitoring.send(MessageType.WORKFLOW_INFO, workflow_info)

    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    self.data_manager = DataManager(self)
    self.executors = {}
    data_manager_executor = ThreadPoolExecutor(
        max_threads=config.data_management_max_threads, label='data_manager')
    self.add_executors(config.executors + [data_manager_executor])

    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint,
                                           interval=checkpoint_period,
                                           name="Checkpoint")
        except Exception:
            logger.error("Invalid checkpoint_period provided: {0}. "
                         "Expected HH:MM:SS; defaulting to 30 minutes.".format(
                             config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint,
                                           interval=(30 * 60),
                                           name="Checkpoint")

    # If the functionality of dynamically adding executors is used,
    # all executors should be managed.
    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    self.task_count = 0
    self.tasks = {}
    self.submitter_lock = threading.Lock()

    atexit.register(self.atexit_cleanup)
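
# A minimal sketch of constructing the kernel above. In ordinary Parsl use the
# DataFlowKernel is created indirectly via `parsl.load(config)` rather than
# instantiated by hand; both paths run this __init__. The executor choice and
# checkpoint settings below are illustrative.
import parsl
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor

config = Config(
    executors=[ThreadPoolExecutor(max_threads=4, label='local_threads')],
    checkpoint_mode='periodic',
    checkpoint_period='00:30:00',  # HH:MM:SS, parsed by the periodic branch above
)
dfk = parsl.load(config)
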
def __init__(self,
             label='HighThroughputExecutor',
             provider=LocalProvider(),
             launch_cmd=None,
             address="127.0.0.1",
             worker_ports=None,
             worker_port_range=(54000, 55000),
             interchange_port_range=(55000, 56000),
             storage_access=None,
             working_dir=None,
             worker_debug=False,
             cores_per_worker=1.0,
             max_workers=float('inf'),
             heartbeat_threshold=120,
             heartbeat_period=30,
             poll_period=10,
             container_image=None,
             worker_mode="singularity_reuse",
             suppress_failure=False,
             endpoint_id=None,
             endpoint_db=None,
             managed=True,
             task_status_queue=None):

    logger.debug("Initializing HighThroughputExecutor")

    self.label = label
    self.launch_cmd = launch_cmd
    self.provider = provider
    self.worker_debug = worker_debug
    self.storage_access = storage_access if storage_access is not None else []
    if len(self.storage_access) > 1:
        raise ConfigurationError('Multiple storage access schemes are not supported')
    self.working_dir = working_dir
    self.managed = managed
    self.blocks = []
    self.tasks = {}
    self.cores_per_worker = cores_per_worker
    self.max_workers = max_workers
    self.endpoint_db = endpoint_db
    if self.endpoint_db is not None:
        # Guard against the default of endpoint_db=None, which would
        # otherwise raise an AttributeError here.
        self.endpoint_db.connect()
    self.endpoint_id = endpoint_id
    self._task_counter = 0
    self.address = address
    self.worker_ports = worker_ports
    self.worker_port_range = worker_port_range
    self.interchange_port_range = interchange_port_range
    self.heartbeat_threshold = heartbeat_threshold
    self.heartbeat_period = heartbeat_period
    self.poll_period = poll_period
    self.suppress_failure = suppress_failure
    self.run_dir = '.'
    self.queue_proc = None
    self.task_status_queue = task_status_queue

    # FuncX-specific options
    self.container_image = container_image
    self.worker_mode = worker_mode
    self.last_response_time = time.time()

    if not launch_cmd:
        self.launch_cmd = ("process_worker_pool.py {debug} {max_workers} "
                           "-c {cores_per_worker} "
                           "--poll {poll_period} "
                           "--task_url={task_url} "
                           "--result_url={result_url} "
                           "--logdir={logdir} "
                           "--hb_period={heartbeat_period} "
                           "--hb_threshold={heartbeat_threshold} "
                           "--mode={worker_mode} "
                           "--container_image={container_image} ")

    self.ix_launch_cmd = ("htex-interchange {debug} -c={client_address} "
                          "--client_ports={client_ports} "
                          "--worker_port_range={worker_port_range} "
                          "--logdir={logdir} "
                          "{suppress_failure} "
                          "--hb_threshold={heartbeat_threshold} ")
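
# A hedged sketch of constructing this funcX-flavoured executor. The
# `endpoint_db` argument only needs to expose a `connect()` method (or be left
# as None, handled by the guard above); `FakeEndpointDB` is a hypothetical
# stand-in for illustration, not a real funcX class, and the endpoint id is a
# placeholder.
class FakeEndpointDB:
    def connect(self):
        # A real implementation would open a connection to the endpoint store.
        print("connected to endpoint database")

htex = HighThroughputExecutor(
    label='funcx_htex',
    worker_mode="singularity_reuse",
    container_image=None,
    endpoint_db=FakeEndpointDB(),
    endpoint_id='my-endpoint-id',  # placeholder identifier
)
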
def __init__(self,
             image_id,
             key_name,
             init_blocks=1,
             min_blocks=0,
             max_blocks=10,
             nodes_per_block=1,
             parallelism=1,
             worker_init='',
             instance_type='t2.small',
             region='us-east-2',
             spot_max_bid=0,
             key_file=None,
             profile=None,
             iam_instance_profile_arn='',
             state_file=None,
             walltime="01:00:00",
             linger=False,
             launcher=SingleNodeLauncher()):
    if not _boto_enabled:
        raise OptionalModuleMissing(['boto3'], "AWS Provider requires the boto3 module.")

    self.image_id = image_id
    self._label = 'ec2'
    self.init_blocks = init_blocks
    self.min_blocks = min_blocks
    self.max_blocks = max_blocks
    self.nodes_per_block = nodes_per_block
    self.max_nodes = max_blocks * nodes_per_block
    self.parallelism = parallelism

    self.worker_init = worker_init
    self.instance_type = instance_type
    self.region = region
    self.spot_max_bid = spot_max_bid

    self.key_name = key_name
    self.key_file = key_file
    self.profile = profile
    self.iam_instance_profile_arn = iam_instance_profile_arn

    self.walltime = walltime
    self.launcher = launcher
    self.linger = linger
    self.resources = {}

    self.state_file = state_file if state_file is not None else 'awsproviderstate.json'

    env_specified = (os.getenv("AWS_ACCESS_KEY_ID") is not None
                     and os.getenv("AWS_SECRET_ACCESS_KEY") is not None)
    if profile is None and key_file is None and not env_specified:
        raise ConfigurationError(
            "Must specify either 'profile', 'key_file', or the "
            "'AWS_ACCESS_KEY_ID' and 'AWS_SECRET_ACCESS_KEY' environment variables.")

    try:
        self.initialize_boto_client()
    except Exception as e:
        logger.error("{} failed to initialize.".format(self))
        raise e

    state_file_exists = False
    try:
        self.read_state_file(self.state_file)
        state_file_exists = True
    except Exception:
        logger.info("No state file found. Cannot load previous options. "
                    "Creating new infrastructure.")

    if not state_file_exists:
        try:
            self.create_vpc()
        except Exception as e:
            logger.info("Failed to create ec2 infrastructure: {0}".format(e))
            raise
        else:
            self.write_state_file()
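
# A minimal sketch of configuring the provider above, assuming the standard
# Parsl import path. The AMI id and key pair name are placeholders; credentials
# come from an AWS profile here, but the key_file or the AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY environment variables work as well, per the check above.
from parsl.providers import AWSProvider

provider = AWSProvider(
    image_id='ami-0123456789abcdef0',  # placeholder AMI
    key_name='my-ec2-keypair',         # placeholder key pair
    instance_type='t2.small',
    region='us-east-2',
    nodes_per_block=1,
    init_blocks=1,
    max_blocks=4,
    profile='default',
)
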
def __init__(self,
             label: str = 'HighThroughputExecutor',
             provider: ExecutionProvider = LocalProvider(),
             launch_cmd: Optional[str] = None,
             address: str = "127.0.0.1",
             worker_ports: Optional[Tuple[int, int]] = None,
             worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
             interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
             storage_access: Optional[List[Any]] = None,
             working_dir: Optional[str] = None,
             worker_debug: bool = False,
             cores_per_worker: float = 1.0,
             mem_per_worker: Optional[float] = None,
             max_workers: Union[int, float] = float('inf'),
             prefetch_capacity: int = 0,
             heartbeat_threshold: int = 120,
             heartbeat_period: int = 30,
             poll_period: int = 10,
             suppress_failure: bool = False,
             managed: bool = True,
             worker_logdir_root: Optional[str] = None):

    logger.debug("Initializing HighThroughputExecutor")

    self.label = label
    self.launch_cmd = launch_cmd
    self.provider = provider
    self.worker_debug = worker_debug
    self.storage_access = storage_access if storage_access is not None else []
    if len(self.storage_access) > 1:
        raise ConfigurationError('Multiple storage access schemes are not supported')
    self.working_dir = working_dir
    self.managed = managed
    self.blocks = {}  # type: Dict[str, str]
    self.tasks = {}  # type: Dict[str, Future]
    self.cores_per_worker = cores_per_worker
    self.mem_per_worker = mem_per_worker
    self.max_workers = max_workers
    self.prefetch_capacity = prefetch_capacity

    self._task_counter = 0
    self.address = address
    self.hub_address = None  # set to the correct hub address in dfk
    self.hub_port = None  # set to the correct hub port in dfk
    self.worker_ports = worker_ports
    self.worker_port_range = worker_port_range
    self.interchange_port_range = interchange_port_range
    self.heartbeat_threshold = heartbeat_threshold
    self.heartbeat_period = heartbeat_period
    self.poll_period = poll_period
    self.suppress_failure = suppress_failure
    self.run_dir = '.'
    self.worker_logdir_root = worker_logdir_root

    if not launch_cmd:
        self.launch_cmd = ("process_worker_pool.py {debug} {max_workers} "
                           "-p {prefetch_capacity} "
                           "-c {cores_per_worker} "
                           "-m {mem_per_worker} "
                           "--poll {poll_period} "
                           "--task_url={task_url} "
                           "--result_url={result_url} "
                           "--logdir={logdir} "
                           "--block_id={{block_id}} "
                           "--hb_period={heartbeat_period} "
                           "--hb_threshold={heartbeat_threshold} ")
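
# A small illustration of the launch_cmd templating above: the doubled braces
# in "--block_id={{block_id}}" survive the first str.format() pass, so the
# block id can be substituted in a later pass when each block is launched.
# The flag values below are made up for demonstration.
cmd = ("process_worker_pool.py {debug} {max_workers} "
       "--block_id={{block_id}} ").format(debug='--debug',
                                          max_workers='--max_workers=4')
print(cmd)  # process_worker_pool.py --debug --max_workers=4 --block_id={block_id}
cmd = cmd.format(block_id='0')
print(cmd)  # process_worker_pool.py --debug --max_workers=4 --block_id=0
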