Example #1
    def __init__(self, args):
        self.MAX_IDLE_TIME = 120.0
        self.DELAY_PERIOD = 0.2
        self.idle_time = None
        self.EXIT_FLAG = False
        self.num_workers = args.num_workers
        self.active_ids = set()
        self.is_persistent = args.persistent

        self.remaining_timer = remaining_time_minutes(args.time_limit_min)
        next(self.remaining_timer)

        if args.db_prefetch_count == 0:
            prefetch = self.num_workers * 96
        else:
            prefetch = args.db_prefetch_count

        logger.debug("Master creating source/status updater")
        self.job_source = BalsamJobSource(prefetch, args.wf_name)
        self.status_updater = BalsamDBStatusUpdater()
        self.status_updater.start()
        self.job_source.start()
        logger.debug("source/status updater created")

        logger.debug("Master ZMQ binding...")
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REP)
        self.socket.bind(f"tcp://*:{args.master_port}")
        logger.debug("Master ZMQ socket bound.")
Example #2
    def __init__(self, args, hostname):
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REQ)
        self.socket.connect(f"tcp://{args.master_address}")
        self.remaining_timer = remaining_time_minutes(args.time_limit_min)
        self.hostname = hostname
        next(self.remaining_timer)
        self.EXIT_FLAG = False

        self.gpus_per_node = args.gpus_per_node
        self.prefetch_count = args.worker_prefetch_count
        config_logging('serial-launcher',
                       filename=args.log_filename,
                       use_buffer=True)
        self.processes = {}
        self.outfiles = {}
        self.cuteids = {}
        self.start_times = {}
        self.retry_counts = {}
        self.job_specs = {}
        self.runnable_cache = {}
        self.occupancy = 0.0
        self.all_affinity = [
            i * SERIAL_HYPERTHREAD_STRIDE for i in range(SERIAL_CORES_PER_NODE)
        ]
        self.used_affinity = []
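
Examples #1 and #2 are the two halves of a ZMQ REQ/REP pair: the master binds a REP socket on args.master_port, and each worker connects a REQ socket to args.master_address. A minimal sketch of one worker-side exchange (the address and message format are assumptions for illustration, not Balsam's wire protocol):

import zmq

context = zmq.Context()
socket = context.socket(zmq.REQ)
socket.connect("tcp://master-host:19876")  # hypothetical master address

# REQ sockets strictly alternate send/recv, mirroring the master's REP side
socket.send_json({"source": "worker-0", "request": "get_jobs"})
reply = socket.recv_json()

The all_affinity list pins one process per physical core: with hypothetical values SERIAL_CORES_PER_NODE = 64 and SERIAL_HYPERTHREAD_STRIDE = 2, it comes out to [0, 2, 4, ..., 126], skipping hyperthread siblings.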
Example #3
    def __init__(self, wf_name, time_limit_minutes, gpus_per_node):
        self.jobsource = BalsamJob.source
        self.jobsource.workflow = wf_name
        if wf_name:
            logger.info(f'Filtering jobs with workflow matching {wf_name}')
        else:
            logger.info('No workflow filter')

        self.jobsource.clear_stale_locks()
        self.jobsource.start_tick()
        self.worker_group = worker.WorkerGroup()
        self.total_nodes = sum(w.num_nodes for w in self.worker_group)
        os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
        os.environ['BALSAM_JOB_MODE'] = "mpi"

        self.timer = remaining_time_minutes(time_limit_minutes)
        self.delayer = delay_generator()
        self.last_report = 0
        self.exit_counter = 0
        self.mpi_runs = []
        self.jobsource.check_qLaunch()
        if self.jobsource.qLaunch is not None:
            sched_id = self.jobsource.qLaunch.scheduler_id
            self.RUN_MESSAGE = f'Batch Scheduler ID: {sched_id}'
        else:
            self.RUN_MESSAGE = 'Not scheduled by service'
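
The delayer attribute suggests the same generator idiom as the timer: a generator that sleeps between yields, so the main loop can throttle itself with a bare next(). A sketch under that assumption (not Balsam's actual delay_generator):

import time

def delay_generator(period=1.0):
    # next(delayer) blocks for `period` seconds, throttling the caller's loop
    while True:
        time.sleep(period)
        yield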
Example #4
    def __init__(self, args):
        self.MAX_IDLE_TIME = 120.0
        self.DELAY_PERIOD = 0.2
        self.idle_time = 0.0
        self.EXIT_FLAG = False

        config_logging('serial-launcher',
                       filename=args.log_filename,
                       use_buffer=True)
        self.remaining_timer = remaining_time_minutes(args.time_limit_min)
        next(self.remaining_timer)

        if args.db_prefetch_count == 0:
            prefetch = args.num_workers * 96
        else:
            prefetch = args.db_prefetch_count

        self.job_source = BalsamJobSource(prefetch, args.wf_name)
        self.status_updater = BalsamDBStatusUpdater()
        self.status_updater.start()
        self.job_source.start()

        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REP)
        self.socket.bind(f"tcp://*:{args.master_port}")
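
With the default db_prefetch_count == 0 and, say, 8 workers, the master prefetches 8 × 96 = 768 jobs from the database; passing a nonzero --db-prefetch-count overrides that heuristic. This constructor is otherwise the same as Example #1's, minus the debug logging and the num_workers/active_ids/persistent bookkeeping.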
Example #5
    def __init__(self,
                 wf_name=None,
                 time_limit_minutes=60,
                 gpus_per_node=None,
                 persistent=False,
                 limit_nodes=None,
                 offset_nodes=None):
        self.wf_name = wf_name
        self.gpus_per_node = gpus_per_node
        self.is_persistent = persistent

        timer = remaining_time_minutes(time_limit_minutes)
        minutes_left = max(0.1, next(timer) - 1)
        self.worker_group = worker.WorkerGroup(limit=limit_nodes,
                                               offset=offset_nodes)
        self.total_nodes = sum(w.num_nodes for w in self.worker_group)
        os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
        os.environ['BALSAM_JOB_MODE'] = "serial"

        self.app_cmd = f"{sys.executable} {self.ZMQ_ENSEMBLE_EXE}"
        self.app_cmd += f" --time-limit-min={minutes_left}"
        if self.wf_name:
            self.app_cmd += f" --wf-name={self.wf_name}"
        if self.gpus_per_node:
            self.app_cmd += f" --gpus-per-node={self.gpus_per_node}"
Example #6
    def __init__(self):
        self.MAX_IDLE_TIME = 120.0
        self.DELAY_PERIOD = 0.2
        self.idle_time = 0.0
        self.EXIT_FLAG = False

        args = self.parse_args()
        log_filename = config_logging('serial-launcher')
        bcast_msg = {
            "gpus_per_node": args.gpus_per_node,
            "worker_prefetch": args.worker_prefetch_count,
            "log_fname": log_filename,
        }
        comm.bcast(bcast_msg, root=0)
        self.remaining_timer = remaining_time_minutes(args.time_limit_min)
        next(self.remaining_timer)

        if args.db_prefetch_count == 0:
            prefetch = (comm.size - 1) * 128
        else:
            prefetch = args.db_prefetch_count

        job_source = BalsamJobSource(prefetch, args.wf_name)
        status_updater = BalsamDBStatusUpdater()
        self.manager = ResourceManager(job_source, status_updater)
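
The settings broadcast on rank 0 only completes once every other rank posts a matching bcast. A minimal mpi4py sketch of both sides (illustrative; the worker-side code is not shown in these examples):

from mpi4py import MPI

comm = MPI.COMM_WORLD
if comm.rank == 0:
    msg = {"gpus_per_node": 1, "worker_prefetch": 64, "log_fname": "run.log"}
    comm.bcast(msg, root=0)         # master sends the settings dict
else:
    msg = comm.bcast(None, root=0)  # every worker rank receives the same dict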
Example #7
    def __init__(self, wf_name=None, time_limit_minutes=60, gpus_per_node=None,
                 persistent=False, limit_nodes=None, offset_nodes=None):
        self.wf_name = wf_name
        self.gpus_per_node = gpus_per_node
        self.is_persistent = persistent

        timer = remaining_time_minutes(time_limit_minutes)
        minutes_left = max(0.1, next(timer) - 1)
        self.worker_group = worker.WorkerGroup(limit=limit_nodes, offset=offset_nodes)
        num_workers = len(self.worker_group)

        hostnames = sorted([w.hostname for w in self.worker_group])
        master_host = hostnames[0]
        master_port = 19876
        timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
        log_fname = f'serial-ensemble_{timestamp}.log'

        self.total_nodes = sum(w.num_nodes for w in self.worker_group)
        os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
        os.environ['BALSAM_JOB_MODE'] = "serial"

        self.app_cmd = f"{sys.executable} {self.ZMQ_ENSEMBLE_EXE}"
        self.app_cmd += f" --time-limit-min={minutes_left}"
        self.app_cmd += f" --master-address {master_host}:{master_port}"
        self.app_cmd += f" --log-filename {log_fname}"
        self.app_cmd += f" --num-workers {num_workers}"
        if self.wf_name:
            self.app_cmd += f" --wf-name={self.wf_name}"
        if self.gpus_per_node:
            self.app_cmd += f" --gpus-per-node={self.gpus_per_node}"
        if self.is_persistent:
            self.app_cmd += " --persistent"
Example #8
    def __init__(self):
        self.MAX_IDLE_TIME = 20.0
        self.DELAY_PERIOD = 1.0
        self.idle_time = 0.0
        self.EXIT_FLAG = False

        args = self.parse_args()
        comm.bcast(args.gpus_per_node, root=0)
        self.remaining_timer = remaining_time_minutes(args.time_limit_min)
        next(self.remaining_timer)

        job_source = BalsamJob.source
        job_source.workflow = args.wf_name
        job_source.start_tick()
        job_source.clear_stale_locks()
        self.manager = ResourceManager(job_source)

        if job_source.workflow:
            logger.info(f'MPI Ensemble pulling jobs with WF {args.wf_name}')
        else:
            logger.info('MPI Ensemble consuming jobs matching any WF name')
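
Unlike the ZMQ masters of Examples #1 and #4, this MPI variant performs no socket setup: it hands the shared BalsamJob.source directly to the ResourceManager and distributes settings with comm.bcast alone.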