def __init__(self, args, hostname):
    """Worker-side setup: connect a ZeroMQ REQ socket to the master and
    prime all per-job bookkeeping structures.

    args     -- parsed CLI namespace (master_address, time_limit_min, ...)
    hostname -- name of the node this worker runs on
    """
    config_logging('serial-launcher', filename=args.log_filename, use_buffer=True)

    # REQ end of the REQ/REP pair: this worker issues requests to the master.
    self.context = zmq.Context()
    self.socket = self.context.socket(zmq.REQ)
    self.socket.connect(f"tcp://{args.master_address}")

    self.hostname = hostname
    self.EXIT_FLAG = False
    self.gpus_per_node = args.gpus_per_node
    self.prefetch_count = args.worker_prefetch_count

    # Prime the countdown generator so later next() calls yield time left.
    self.remaining_timer = remaining_time_minutes(args.time_limit_min)
    next(self.remaining_timer)

    # Per-job bookkeeping (presumably keyed by job pk — see runnable_cache
    # usage in the worker main loop).
    self.processes = {}
    self.outfiles = {}
    self.cuteids = {}
    self.start_times = {}
    self.retry_counts = {}
    self.job_specs = {}
    self.runnable_cache = {}
    self.occupancy = 0.0

    # One affinity slot per physical core, striding over hyperthread siblings.
    self.all_affinity = list(
        range(
            0,
            SERIAL_CORES_PER_NODE * SERIAL_HYPERTHREAD_STRIDE,
            SERIAL_HYPERTHREAD_STRIDE,
        )
    )
    self.used_affinity = []
def __init__(self, args):
    """Master-side setup: start the DB-backed job source and status updater,
    then bind a ZeroMQ REP socket for incoming worker requests.

    args -- parsed CLI namespace (master_port, wf_name, num_workers, ...)
    """
    self.MAX_IDLE_TIME = 120.0
    self.DELAY_PERIOD = 0.2
    self.idle_time = 0.0
    self.EXIT_FLAG = False

    config_logging('serial-launcher', filename=args.log_filename, use_buffer=True)

    # Prime the countdown generator so later next() calls yield time left.
    self.remaining_timer = remaining_time_minutes(args.time_limit_min)
    next(self.remaining_timer)

    # A db_prefetch_count of 0 means "auto-size": 96 jobs per worker.
    prefetch_depth = (
        args.num_workers * 96
        if args.db_prefetch_count == 0
        else args.db_prefetch_count
    )
    self.job_source = BalsamJobSource(prefetch_depth, args.wf_name)
    self.status_updater = BalsamDBStatusUpdater()
    self.status_updater.start()
    self.job_source.start()

    # REP end of the REQ/REP pair: workers connect to this port.
    self.context = zmq.Context()
    self.socket = self.context.socket(zmq.REP)
    self.socket.bind(f"tcp://*:{args.master_port}")
def main(self):
    """Worker main loop (MPI variant).

    Receives launch configuration from rank 0 via broadcast, then repeatedly:
    polls running processes, starts new jobs from the local cache, and
    exchanges a pipelined WorkerRequest/response with the master.  Exits when
    the master's response carries 'exit'.
    """
    # Workers pass an empty dict; bcast replaces it with rank 0's payload.
    bcast_msg = {}
    bcast_msg = comm.bcast(bcast_msg, root=0)
    self.gpus_per_node = bcast_msg["gpus_per_node"]
    self.prefetch_count = bcast_msg["worker_prefetch"]
    log_filename = bcast_msg["log_fname"]
    config_logging('serial-launcher', filename=log_filename)

    # Two-slot pipeline: `current_request` is in flight awaiting a response;
    # `next_request` accumulates results until the slot frees up.
    current_request, next_request = None, WorkerRequest()
    while True:
        done_pks, errors, active = self.poll_processes()
        started_pks = self.start_jobs()
        # Ask for enough jobs to top the local cache back up to prefetch_count.
        request_num_jobs = max(
            0, self.prefetch_count - len(self.runnable_cache))
        next_request.add_started(started_pks)
        next_request.add_done(done_pks)
        next_request.add_error(errors)
        next_request.set_request_num_jobs(request_num_jobs)
        if active:
            next_request.set_active()

        if current_request is None:
            # Nothing in flight: promote next_request and send it off.
            current_request = next_request
            next_request = WorkerRequest()
            current_request.send()
            response_msg = {}
        else:
            # A request is outstanding; try to collect its response.
            response_msg = current_request.get_response()
            if response_msg is not None:
                current_request = None
            else:
                # No response yet — back off briefly before re-polling.
                time.sleep(0.2)
                response_msg = {}

        if response_msg.get('exit', False):
            logger.debug(f"rank {RANK} received EXIT")
            break
        if response_msg.get('new_jobs'):
            # Merge freshly assigned jobs into the runnable cache, keyed by pk.
            self.runnable_cache.update(
                {job['pk']: job for job in response_msg["new_jobs"]})
        logger.debug(f"rank {RANK} occupancy: {self.occupancy} "
                     f"[{len(self.runnable_cache)} additional prefetched "
                     f"jobs in cache]")
    self.exit()
def __init__(self):
    """MPI master setup: broadcast launch configuration to all worker ranks
    and construct the ResourceManager that owns scheduling state.
    """
    self.MAX_IDLE_TIME = 120.0
    self.DELAY_PERIOD = 0.2
    self.idle_time = 0.0
    self.EXIT_FLAG = False

    opts = self.parse_args()
    log_filename = config_logging('serial-launcher')

    # Worker ranks block on this broadcast in their main(); key names must
    # match what the workers read.
    comm.bcast(
        {
            "gpus_per_node": opts.gpus_per_node,
            "worker_prefetch": opts.worker_prefetch_count,
            "log_fname": log_filename,
        },
        root=0,
    )

    # Prime the countdown generator so later next() calls yield time left.
    self.remaining_timer = remaining_time_minutes(opts.time_limit_min)
    next(self.remaining_timer)

    # 0 means "auto-size": 128 jobs per worker rank (rank 0 is the master).
    prefetch_depth = opts.db_prefetch_count or (comm.size - 1) * 128
    self.manager = ResourceManager(
        BalsamJobSource(prefetch_depth, opts.wf_name),
        BalsamDBStatusUpdater(),
    )
# NOTE(review): fragment — the lines below are the tail of the service's main
# loop body (the enclosing `while` starts outside this view), followed by the
# script entry point.
        # Sync local launch records with the batch scheduler's view.
        QueuedLaunch.refresh_from_scheduler()
        open_queues = get_open_queues()
        if open_queues:
            logger.info(f"Open queues: {list(open_queues.keys())}")
            qlaunch = jobpacker.create_qlaunch(open_queues)
            if qlaunch:
                submit_qlaunch(qlaunch)
        # Losing the advisory lock means another service owns the DB; abort.
        if not QueuedLaunch.acquire_advisory():
            logger.error('Failed to refresh advisory lock; aborting')
            break
        elif not EXIT_FLAG:
            # Idle housekeeping between iterations.
            source.clear_stale_locks()
            time.sleep(10)


if __name__ == "__main__":
    setup()
    config_logging('service')
    logger.info(f"Balsam Service starting on {gethostname()}")
    parser = service_subparser()
    # 5 transition workers, empty workflow filter (match all).
    transition_pool = transitions.TransitionProcessPool(5, '')
    source.start_tick()
    try:
        main(parser.parse_args())
    except:
        # NOTE(review): bare except/raise is redundant — `finally` alone would
        # run the cleanup; kept as-is since this is a cut fragment.
        raise
    finally:
        transition_pool.terminate()
        source.release_all_owned()
        logger.info(f"Balsam Service shutdown: released all locks OK")
# NOTE(review): fragment — starts inside parse_args() and is cut off mid-call
# at the end (the trailing `config_logging(` is completed outside this view).
    parser.add_argument('--persistent', action='store_true')
    args = parser.parse_args()
    # Derive host/port from "host:port".  NOTE(review): split(':') breaks on
    # IPv6 literals — presumably only hostnames/IPv4 are expected; confirm.
    args.master_host = args.master_address.split(':')[0]
    args.master_port = int(args.master_address.split(':')[1])
    return args


if __name__ == "__main__":
    args = parse_args()
    hostname = socket.gethostname()
    if args.run_master:
        # Master rank: log to a dedicated ".master" file with buffered output.
        log_fname = args.log_filename + ".master"
        config_logging(
            'serial-launcher',
            filename=log_fname,
            buffer_capacity=128,
        )
        master = Master(args)

        # TODO(KGF): factor out signal handling to SigHandler class
        # (util/sighandler.py) like in B2 1fc1824c
        def handle_term(signum, stack):
            # Request a graceful shutdown; the main loop checks EXIT_FLAG.
            master.EXIT_FLAG = True
        signal.signal(signal.SIGINT, handle_term)
        signal.signal(signal.SIGTERM, handle_term)
        master.main()
    else:
        # Worker: one log file per node, suffixed with the local hostname.
        log_fname = args.log_filename + "." + hostname
        config_logging(
# NOTE(review): fragment — the lines below are the tail of main()'s try block
# (the `try:` opens outside this view), then get_args() and the entry point.
    if nthread > 0:
        transition_pool = transitions.TransitionProcessPool(
            nthread, wf_filter)
    else:
        # No transition threads requested; nothing to terminate later.
        transition_pool = None
    launcher = Launcher(wf_filter, timelimit_min, gpus_per_node)
    launcher.run()
    except:
        # Re-raise so the failure propagates after cleanup in `finally`.
        raise
    finally:
        if transition_pool is not None:
            transition_pool.terminate()
        logger.info("Exit: Launcher exit graceful\n\n")


def get_args(inputcmd=None):
    '''Parse command line arguments'''
    # inputcmd: optional argv-style list for testing; defaults to sys.argv.
    parser = config_launcher_subparser()
    if inputcmd:
        return parser.parse_args(inputcmd)
    else:
        return parser.parse_args()


if __name__ == "__main__":
    setup()
    args = get_args()
    config_logging('launcher')
    logger.info("Loading Balsam Launcher")
    main(args)
# NOTE(review): fragment — module prologue of the MPI ensemble runner; the
# ResourceManager.__init__ body continues past the end of this view.
import signal
import time
import psutil
from mpi4py import MPI
from django.db import transaction, connections

# setup() must run before importing Balsam model modules below (configures
# Django settings).
from balsam import config_logging, settings, setup
setup()

from balsam.launcher.exceptions import *
from balsam.launcher.util import cd, get_tail, remaining_time_minutes
from balsam.core.models import BalsamJob, safe_select, PROCESSABLE_STATES

# NOTE(review): `logging` is used here but not imported in this visible
# chunk — presumably `import logging` appears earlier in the file; confirm.
logger = logging.getLogger('balsam.launcher.mpi_ensemble')
config_logging('serial-launcher')
comm = MPI.COMM_WORLD
RANK = comm.Get_rank()
MSG_BUFSIZE = 2**16
# Drop inherited DB connections so each MPI rank opens its own.
connections.close_all()


class ResourceManager:
    # Seconds between job-source fetches.
    FETCH_PERIOD = 2.0
    # Seconds between refreshes of the killed-job list.
    KILLED_REFRESH_PERIOD = 3.0

    def __init__(self, job_source):
        self.job_source = job_source
        # One occupancy fraction per MPI rank (index 0 = master rank).
        self.node_occupancy = [0.0 for i in range(comm.size)]