def run_cmd(args, timer=None, timeout=None):
    """Run the 'run' dcop command: solve a DCOP while playing a scenario.

    Loads the DCOP, computes or loads the computation distribution, loads
    the scenario, deploys the computations on local agents (threads or
    processes) and runs the orchestrator until it finishes or times out.

    :param args: parsed command-line arguments namespace
    :param timer: optional timer, cancelled once the run completes
    :param timeout: optional timeout (seconds) forwarded to the
        orchestrator's run loop
    """
    logger.debug('dcop command "run" with arguments {}'.format(args))

    global INFINITY, collect_on, output_file
    INFINITY = args.infinity
    collect_on = args.collect_on
    output_file = args.output

    # A collection period is only meaningful when collecting on "period".
    period = None
    if args.collect_on == "period":
        period = 1 if args.period is None else args.period
    else:
        if args.period is not None:
            _error('Cannot use "period" argument when collect_on is not '
                   '"period"')

    csv_cb = prepare_metrics_files(args.run_metrics, args.end_metrics,
                                   collect_on)

    _, algo_module, graph_module = _load_modules(None, args.algo)

    global dcop
    logger.info("loading dcop from {}".format(args.dcop_files))
    dcop = load_dcop_from_file(args.dcop_files)
    dcop = filter_dcop(dcop)

    # A known distribution-method name means the distribution must be
    # computed; any other value is treated as a distribution file path.
    if args.distribution in DISTRIBUTION_METHODS:
        dist_module, algo_module, graph_module = _load_modules(
            args.distribution, args.algo)
    else:
        dist_module, algo_module, graph_module = _load_modules(None,
                                                               args.algo)

    logger.info("loading scenario from {}".format(args.scenario))
    scenario = load_scenario_from_file(args.scenario)

    logger.info("Building computation graph ")
    cg = graph_module.build_computation_graph(dcop)

    logger.info("Distributing computation graph ")
    if dist_module is not None:
        distribution = dist_module.distribute(
            cg,
            dcop.agents.values(),
            hints=dcop.dist_hints,
            computation_memory=algo_module.computation_memory,
            communication_load=algo_module.communication_load,
        )
    else:
        distribution = load_dist_from_file(args.distribution)
    logger.debug("Distribution Computation graph: %s ", distribution)

    algo = build_algo_def(algo_module, args.algo, dcop.objective,
                          args.algo_params)

    # Setup metrics collection: a daemon thread drains the queue and
    # writes metrics through the csv callback.
    collector_queue = Queue()
    collect_t = Thread(target=collect_tread,
                       args=[collector_queue, csv_cb], daemon=True)
    collect_t.start()

    global orchestrator
    if args.mode == "thread":
        orchestrator = run_local_thread_dcop(
            algo,
            cg,
            distribution,
            dcop,
            INFINITY,
            collector=collector_queue,
            collect_moment=args.collect_on,
            period=period,
            replication=args.replication_method,
        )
    elif args.mode == "process":
        # Disable logs from agents, they are in other processes anyway
        agt_logs = logging.getLogger("pydcop.agent")
        agt_logs.disabled = True

        # When using the (default) 'fork' start method, http servers on agent's
        # processes do not work (why ?)
        multiprocessing.set_start_method("spawn")

        orchestrator = run_local_process_dcop(
            algo,
            cg,
            distribution,
            dcop,
            INFINITY,
            collector=collector_queue,
            collect_moment=args.collect_on,
            period=period,
        )

    orchestrator.set_error_handler(_orchestrator_error)

    try:
        orchestrator.deploy_computations()
        orchestrator.start_replication(args.ktarget)
        if orchestrator.wait_ready():
            orchestrator.run(scenario, timeout=timeout)
        if timer:
            timer.cancel()
        if not timeout_stopped:
            if orchestrator.status == "TIMEOUT":
                _results("TIMEOUT")
                sys.exit(0)
            elif orchestrator.status != "STOPPED":
                _results("FINISHED")
                sys.exit(0)
    except Exception as e:
        logger.error(e, exc_info=1)
        print(e)
        # Dump the stack of every live thread for post-mortem debugging.
        # Snapshot the frames once and use .get(): a thread may terminate
        # between enumerate() and the frame lookup, and a direct indexing
        # would raise KeyError here, masking the original error.
        frames = sys._current_frames()
        for th in threading.enumerate():
            print(th)
            frame = frames.get(th.ident)
            if frame is not None:
                traceback.print_stack(frame)
            print()
        orchestrator.stop_agents(5)
        orchestrator.stop()
        _results("ERROR")
def run_cmd(args, timer=None, timeout=None):
    """Run the 'orchestrator' dcop command: start an orchestrator over HTTP.

    Loads the DCOP (and optional scenario), computes or loads the
    distribution, then starts an :class:`Orchestrator` communicating with
    remote agents through an HTTP communication layer.

    :param args: parsed command-line arguments namespace
    :param timer: optional timer (unused here, kept for a uniform signature)
    :param timeout: optional timeout (seconds) forwarded to the
        orchestrator's run loop
    """
    logger.debug('dcop command "orchestrator" with arguments {} '.format(args))

    global collect_on, output_file
    output_file = args.output
    collect_on = args.collect_on
    dcop_yaml_files = args.dcop_files
    # NOTE: the original code re-assigned output_file and collect_on a
    # second time right here; the duplicate dead assignments were removed.

    # A collection period is only meaningful when collecting on "period".
    period = None
    if args.collect_on == "period":
        period = 1 if args.period is None else args.period
    else:
        if args.period is not None:
            _error('Cannot use "period" argument when collect_on is not '
                   '"period"')

    csv_cb = prepare_metrics_files(args.run_metrics, args.end_metrics,
                                   collect_on)

    # A known distribution-method name means the distribution must be
    # computed; any other value is treated as a distribution file path.
    if args.distribution in ["oneagent", "adhoc", "ilp_fgdp", "heur_comhost"]:
        dist_module, algo_module, graph_module = _load_modules(
            args.distribution, args.algo)
    else:
        dist_module, algo_module, graph_module = _load_modules(None,
                                                               args.algo)

    logger.info("loading dcop from {}".format(dcop_yaml_files))
    dcop = load_dcop_from_file(dcop_yaml_files)

    if args.scenario:
        logger.info("loading scenario from {}".format(args.scenario))
        scenario = load_scenario_from_file(args.scenario)
    else:
        logger.debug("No scenario")
        scenario = None

    # Build factor-graph computation graph
    logger.info(
        "Building computation graph for dcop {}".format(dcop_yaml_files))
    cg = graph_module.build_computation_graph(dcop)

    logger.info("Distributing computation graph ")
    if dist_module is not None:
        # Some algorithms do not provide memory/load estimators: fall back
        # to zero-cost functions so the distribution can still be computed.
        if not hasattr(algo_module, "computation_memory"):
            algo_module.computation_memory = lambda *v, **k: 0
        if not hasattr(algo_module, "communication_load"):
            algo_module.communication_load = lambda *v, **k: 0
        distribution = dist_module.distribute(
            cg,
            dcop.agents.values(),
            hints=dcop.dist_hints,
            computation_memory=algo_module.computation_memory,
            communication_load=algo_module.communication_load,
        )
    else:
        distribution = load_dist_from_file(args.distribution)
    logger.info("Dcop distribution : {}".format(distribution))

    algo = build_algo_def(algo_module, args.algo, dcop.objective,
                          args.algo_params)

    # When using the (default) 'fork' start method, http servers on agent's
    # processes did not work (why ?), but seems to be ok now ?!
    # multiprocessing.set_start_method('spawn')

    # FIXME
    infinity = 10000

    # Setup metrics collection: a daemon thread drains the queue and
    # writes metrics through the csv callback.
    collector_queue = Queue()
    collect_t = Thread(target=collect_tread,
                       args=[collector_queue, csv_cb], daemon=True)
    collect_t.start()

    # Always bind ktarget: previously it was only set when a scenario was
    # given, leaving it unbound (latent NameError) otherwise.
    if args.ktarget:
        ktarget = args.ktarget
    else:
        ktarget = 3
        if scenario:
            logger.debug("Scenario without k target, use 3 as default level")

    global orchestrator, start_time
    port = args.port if args.port else 9000
    addr = args.address if args.address else None
    comm = HttpCommunicationLayer((addr, port))
    orchestrator = Orchestrator(
        algo,
        cg,
        distribution,
        comm,
        dcop,
        infinity,
        collector=collector_queue,
        collect_moment=args.collect_on,
        collect_period=period,
        ui_port=args.uiport,
    )
    try:
        start_time = time()
        logger.debug("Starting Orchestrator")
        orchestrator.start()
        logger.debug("Deploying computations")
        orchestrator.deploy_computations()
        if scenario:
            # Replication only makes sense when agents may disappear,
            # i.e. when a scenario is played.
            logger.debug("Starting Replication, target {}".format(ktarget))
            orchestrator.start_replication(ktarget)
            if orchestrator.wait_ready():
                orchestrator.run(scenario=scenario, timeout=timeout)
        else:
            logger.debug("No scenario, run the problem directly")
            orchestrator.run(timeout=timeout)
        if not timeout_stopped:
            if orchestrator.status == "TIMEOUT":
                _results("TIMEOUT")
                sys.exit(0)
            else:
                _results("FINISHED")
                sys.exit(0)
    except Exception as e:
        logger.error(e, exc_info=1)
        orchestrator.stop_agents(5)
        orchestrator.stop()
        _results("ERROR")
def run_cmd(args, timer):
    """Run the 'run' dcop command using a pre-computed distribution file.

    Loads the DCOP, a distribution from file, and a scenario, deploys the
    computations on local agents (threads or processes) and runs the
    orchestrator on the scenario.

    :param args: parsed command-line arguments namespace
    :param timer: timer argument kept for a uniform command signature
    """
    logger.debug('dcop command "run" with arguments {}'.format(args))

    global INFINITY, collect_on
    INFINITY = args.infinity
    collect_on = args.collect_on

    # A collection period is only valid when collecting on 'period'.
    if args.collect_on == 'period':
        collect_period = args.period if args.period is not None else 1
    else:
        if args.period is not None:
            _error('Cannot use "period" argument when collect_on is not '
                   '"period"')
        collect_period = None

    metrics_cb = prepare_metrics_files(args.run_metrics, args.end_metrics,
                                       collect_on)

    _, algo_module, graph_module = _load_modules(None, args.algo)

    global dcop
    logger.info('loading dcop from {}'.format(args.dcop_files))
    dcop = load_dcop_from_file(args.dcop_files)

    logger.info('Loading distribution from {}'.format(args.distribution))
    dist = load_dist_from_file(args.distribution)

    # FIXME: the replica distribution should also be loaded from a file
    # (load_replica_dist_from_file(args.replica_dist)) and handed over to
    # the orchestrator.

    logger.info('loading scenario from {}'.format(args.scenario))
    events = load_scenario_from_file(args.scenario)

    logger.info('Building computation graph ')
    computation_graph = graph_module.build_computation_graph(dcop)

    algo_def = build_algo_def(algo_module, args.algo, dcop.objective,
                              args.algo_params)

    # Metrics are drained from this queue by a daemon thread.
    metrics_queue = Queue()
    metrics_thread = Thread(target=collect_tread,
                            args=[metrics_queue, metrics_cb], daemon=True)
    metrics_thread.start()

    global orchestrator
    if args.mode == 'thread':
        orchestrator = run_local_thread_dcop(
            algo_def, computation_graph, dist, dcop, INFINITY,
            collector=metrics_queue,
            collect_moment=args.collect_on,
            period=collect_period,
            replication=args.replication_method)
    elif args.mode == 'process':
        # Disable logs from agents, they are in other processes anyway
        logging.getLogger('pydcop.agent').disabled = True
        # When using the (default) 'fork' start method, http servers on
        # agent's processes do not work (why ?)
        multiprocessing.set_start_method('spawn')
        orchestrator = run_local_process_dcop(
            algo_def, computation_graph, dist, dcop, INFINITY,
            collector=metrics_queue,
            collect_moment=args.collect_on,
            period=collect_period)

    orchestrator.set_error_handler(_orchestrator_error)
    try:
        orchestrator.deploy_computations()
        orchestrator.start_replication(args.ktarget)
        if orchestrator.wait_ready():
            orchestrator.run(events)
        # orchestrator.run(scenario) # FIXME
    except Exception as exc:
        logger.error(exc, exc_info=1)
        print(exc)
        # Dump every live thread's stack for post-mortem debugging.
        for thread in threading.enumerate():
            print(thread)
            traceback.print_stack(sys._current_frames()[thread.ident])
            print()
        orchestrator.stop_agents(5)
        orchestrator.stop()
        _results('ERROR', exc)