def run_cmd(args):
    """Solve the DCOP given on the command line with an HTTP orchestrator.

    Loads the problem and modules from ``args``, builds and distributes the
    computation graph, then starts an :class:`Orchestrator` listening on
    localhost:9000 and runs the computations.

    Side effects: sets the module-level ``orchestrator`` and ``start_time``
    globals and switches multiprocessing to the 'spawn' start method.
    """
    logger.debug('dcop command "solve" with arguments {} '.format(args))

    dcop_yaml_files = args.dcop_files

    # A known distribution method means we must load the matching module;
    # otherwise args.distribution is treated later as a distribution file.
    known_dist = args.distribution in ['oneagent', 'adhoc', 'ilp_fgdp']
    dist_module, algo_module, graph_module = _load_modules(
        args.distribution if known_dist else None, args.algo)

    logger.info('loading dcop from {}'.format(dcop_yaml_files))
    dcop = load_dcop_from_file(dcop_yaml_files)

    # Build factor-graph computation graph
    logger.info(
        'Building computation graph for dcop {}'.format(dcop_yaml_files))
    cg = graph_module.build_computation_graph(dcop)

    logger.info('Distributing computation graph ')
    if dist_module is not None:
        distribution = dist_module.distribute(
            cg, dcop.agents.values(), hints=dcop.dist_hints,
            computation_memory=algo_module.computation_memory,
            communication_load=algo_module.communication_load)
    else:
        # No distribution module: args.distribution is a pre-computed
        # distribution file.
        distribution = load_dist_from_file(args.distribution)
    logger.debug('Distribution Computation graph: %s ', distribution)
    logger.info('Dcop distribution : {}'.format(distribution))

    algo = build_algo_def(algo_module, args.algo, dcop.objective,
                          args.algo_params)

    # When using the (default) 'fork' start method, http servers on agent's
    # processes do not work (why ?)
    multiprocessing.set_start_method('spawn')

    # FIXME
    infinity = 10000

    global orchestrator, start_time
    port = 9000
    comm = HttpCommunicationLayer(('127.0.0.1', port))
    orchestrator = Orchestrator(algo, cg, distribution, comm, dcop, infinity)
    start_time = time()
    orchestrator.start()
    orchestrator.deploy_computations()
    orchestrator.run()
def run_cmd(args, timer=None):
    """Solve the DCOP given on the command line, in thread or process mode.

    Sets up metrics collection (periodic or event-based), builds and
    distributes the computation graph, then runs all computations locally,
    either with one thread per agent or one process per agent depending on
    ``args.mode``.

    Side effects: sets the module-level ``INFINITY``, ``collect_on``,
    ``dcop`` and ``orchestrator`` globals.
    """
    logger.debug('dcop command "solve" with arguments {}'.format(args))

    global INFINITY, collect_on
    INFINITY = args.infinity
    collect_on = args.collect_on

    # "period" is only meaningful for periodic metrics collection.
    collect_period = None
    if args.collect_on == 'period':
        collect_period = 1 if args.period is None else args.period
    elif args.period is not None:
        _error('Cannot use "period" argument when collect_on is not '
               '"period"')

    csv_cb = prepare_metrics_files(args.run_metrics, args.end_metrics,
                                   collect_on)

    # A known distribution method means loading its module; otherwise
    # args.distribution is treated later as a distribution file.
    if args.distribution in ['oneagent', 'adhoc', 'ilp_fgdp']:
        dist_module, algo_module, graph_module = \
            _load_modules(args.distribution, args.algo)
    else:
        dist_module, algo_module, graph_module = _load_modules(None,
                                                               args.algo)

    global dcop
    logger.info('loading dcop from {}'.format(args.dcop_files))
    dcop = load_dcop_from_file(args.dcop_files)

    # Build factor-graph computation graph
    logger.info('Building computation graph ')
    cg = graph_module.build_computation_graph(dcop)
    logger.debug('Computation graph: %s ', cg)

    logger.info('Distributing computation graph ')
    if dist_module is not None:
        distribution = dist_module.distribute(
            cg, dcop.agents.values(), hints=dcop.dist_hints,
            computation_memory=algo_module.computation_memory,
            communication_load=algo_module.communication_load)
    else:
        distribution = load_dist_from_file(args.distribution)
    logger.debug('Distribution Computation graph: %s ', distribution)
    logger.info('Dcop distribution : {}'.format(distribution))

    algo = build_algo_def(algo_module, args.algo, dcop.objective,
                          args.algo_params)

    # Setup metrics collection: a daemon thread drains the queue and
    # forwards metrics to the csv callback.
    metrics_queue = Queue()
    metrics_thread = Thread(target=collect_tread,
                            args=[metrics_queue, csv_cb],
                            daemon=True)
    metrics_thread.start()

    global orchestrator
    if args.mode == 'thread':
        orchestrator = run_local_thread_dcop(
            algo, cg, distribution, dcop, INFINITY,
            collector=metrics_queue,
            collect_moment=args.collect_on,
            period=collect_period)
    elif args.mode == 'process':
        # Disable logs from agents, they are in other processes anyway
        agt_logs = logging.getLogger('pydcop.agent')
        agt_logs.disabled = True

        # When using the (default) 'fork' start method, http servers on
        # agent's processes do not work (why ?)
        multiprocessing.set_start_method('spawn')
        orchestrator = run_local_process_dcop(
            algo, cg, distribution, dcop, INFINITY,
            collector=metrics_queue,
            collect_moment=args.collect_on,
            period=collect_period)

    try:
        orchestrator.deploy_computations()
        orchestrator.run()
    except Exception as e:
        logger.error(e, exc_info=1)
        orchestrator.stop_agents(5)
        orchestrator.stop()
        _results('ERROR')
def run_cmd(args, timer=None, timeout=None):
    """Run the "orchestrator" command: serve a DCOP over HTTP.

    Builds and distributes the computation graph, starts an
    :class:`Orchestrator` with an :class:`HttpCommunicationLayer`, optionally
    replicates computations when a scenario is given, then runs until
    completion or ``timeout``.

    Fixes vs previous version: removed the duplicated ``output_file`` /
    ``collect_on`` assignments, made sure ``ktarget`` is always bound, and
    fixed the "targert" typo in a debug message.

    Side effects: sets the module-level ``collect_on``, ``output_file``,
    ``orchestrator`` and ``start_time`` globals; may call ``sys.exit``.
    """
    logger.debug('dcop command "orchestrator" with arguments {} '.format(args))

    global collect_on, output_file
    output_file = args.output
    collect_on = args.collect_on
    dcop_yaml_files = args.dcop_files
    # NOTE: output_file / collect_on were previously assigned twice in a row;
    # the duplicates were removed.

    # "period" is only meaningful for periodic metrics collection.
    period = None
    if args.collect_on == "period":
        period = 1 if args.period is None else args.period
    else:
        if args.period is not None:
            _error('Cannot use "period" argument when collect_on is not '
                   '"period"')

    csv_cb = prepare_metrics_files(args.run_metrics, args.end_metrics,
                                   collect_on)

    # Known distribution methods are loaded as modules; anything else is
    # treated later as a distribution file.
    if args.distribution in ["oneagent", "adhoc", "ilp_fgdp", "heur_comhost"]:
        dist_module, algo_module, graph_module = _load_modules(
            args.distribution, args.algo)
    else:
        dist_module, algo_module, graph_module = _load_modules(None, args.algo)

    logger.info("loading dcop from {}".format(dcop_yaml_files))
    dcop = load_dcop_from_file(dcop_yaml_files)

    if args.scenario:
        logger.info("loading scenario from {}".format(args.scenario))
        scenario = load_scenario_from_file(args.scenario)
    else:
        logger.debug("No scenario")
        scenario = None

    # Build factor-graph computation graph
    logger.info(
        "Building computation graph for dcop {}".format(dcop_yaml_files))
    cg = graph_module.build_computation_graph(dcop)

    logger.info("Distributing computation graph ")
    if dist_module is not None:
        # Some algorithms do not define footprint / communication-load
        # estimators; default to zero-cost so distribution still works.
        if not hasattr(algo_module, "computation_memory"):
            algo_module.computation_memory = lambda *v, **k: 0
        if not hasattr(algo_module, "communication_load"):
            algo_module.communication_load = lambda *v, **k: 0
        distribution = dist_module.distribute(
            cg,
            dcop.agents.values(),
            hints=dcop.dist_hints,
            computation_memory=algo_module.computation_memory,
            communication_load=algo_module.communication_load,
        )
    else:
        distribution = load_dist_from_file(args.distribution)
    logger.info("Dcop distribution : {}".format(distribution))

    algo = build_algo_def(algo_module, args.algo, dcop.objective,
                          args.algo_params)

    # When using the (default) 'fork' start method, http servers on agent's
    # processes did not work (why ?), but seems to be ok now ?!
    # multiprocessing.set_start_method('spawn')

    # FIXME
    infinity = 10000

    # Setup metrics collection: daemon thread drains the queue into csv_cb.
    collector_queue = Queue()
    collect_t = Thread(target=collect_tread,
                       args=[collector_queue, csv_cb],
                       daemon=True)
    collect_t.start()

    # Replication level: explicit argument wins, otherwise default to 3.
    # (Previously ktarget could be left unbound when no scenario was given.)
    if args.ktarget:
        ktarget = args.ktarget
    else:
        if scenario:
            logger.debug("Scenario without k target, use 3 as default level")
        ktarget = 3

    global orchestrator, start_time
    port = args.port if args.port else 9000
    addr = args.address if args.address else None
    comm = HttpCommunicationLayer((addr, port))
    orchestrator = Orchestrator(
        algo,
        cg,
        distribution,
        comm,
        dcop,
        infinity,
        collector=collector_queue,
        collect_moment=args.collect_on,
        collect_period=period,
        ui_port=args.uiport,
    )
    try:
        start_time = time()
        logger.debug("Starting Orchestrator")
        orchestrator.start()
        logger.debug("Deploying computations")
        orchestrator.deploy_computations()
        if scenario:
            logger.debug(f"Starting Replication, target {ktarget}")
            orchestrator.start_replication(ktarget)
            if orchestrator.wait_ready():
                orchestrator.run(scenario=scenario, timeout=timeout)
        else:
            logger.debug("No scenario, run the problem directly")
            orchestrator.run(timeout=timeout)
        if not timeout_stopped:
            if orchestrator.status == "TIMEOUT":
                _results("TIMEOUT")
                sys.exit(0)
            else:
                _results("FINISHED")
                sys.exit(0)
    except Exception as e:
        logger.error(e, exc_info=1)
        orchestrator.stop_agents(5)
        orchestrator.stop()
        _results("ERROR")
def run_cmd(args, timer=None, timeout=None):
    """Solve the DCOP locally, with one thread or one process per agent.

    Sets up metrics collection, builds and distributes the computation
    graph, runs the computations until completion or ``timeout`` and
    reports results through ``_results``.

    Side effects: sets the module-level ``INFINITY``, ``collect_on``,
    ``output_file``, ``dcop`` and ``orchestrator`` globals; may call
    ``sys.exit``.
    """
    logger.debug('dcop command "solve" with arguments {}'.format(args))

    global INFINITY, collect_on, output_file
    INFINITY = args.infinity
    output_file = args.output
    collect_on = args.collect_on

    # Periodic collection needs a period; other modes must not set one.
    period = None
    if args.collect_on == "period":
        period = 1 if args.period is None else args.period
    elif args.period is not None:
        _error('Cannot use "period" argument when collect_on is not '
               '"period"')

    csv_cb = prepare_metrics_files(args.run_metrics, args.end_metrics,
                                   collect_on)

    # Known distribution methods are loaded as modules; anything else is
    # treated later as a distribution file.
    use_dist_module = args.distribution in DISTRIBUTION_METHODS
    dist_module, algo_module, graph_module = _load_modules(
        args.distribution if use_dist_module else None, args.algo)

    global dcop
    logger.info("loading dcop from {}".format(args.dcop_files))
    dcop = load_dcop_from_file(args.dcop_files)
    logger.debug(f"dcop {dcop} ")

    # Build factor-graph computation graph
    logger.info("Building computation graph ")
    cg = graph_module.build_computation_graph(dcop)
    logger.debug("Computation graph: %s ", cg)

    logger.info("Distributing computation graph ")
    if dist_module is not None:
        # Algorithms without footprint / communication-load estimators get
        # zero-cost defaults so distribution still works.
        if not hasattr(algo_module, "computation_memory"):
            algo_module.computation_memory = lambda *v, **k: 0
        if not hasattr(algo_module, "communication_load"):
            algo_module.communication_load = lambda *v, **k: 0
        distribution = dist_module.distribute(
            cg,
            dcop.agents.values(),
            hints=dcop.dist_hints,
            computation_memory=algo_module.computation_memory,
            communication_load=algo_module.communication_load,
        )
    else:
        distribution = load_dist_from_file(args.distribution)
    logger.debug("Distribution Computation graph: %s ", distribution)
    logger.info("Dcop distribution : {}".format(distribution))

    algo = build_algo_def(algo_module, args.algo, dcop.objective,
                          args.algo_params)

    # Setup metrics collection: daemon thread drains the queue into csv_cb.
    metrics_queue = Queue()
    metrics_thread = Thread(target=collect_tread,
                            args=[metrics_queue, csv_cb],
                            daemon=True)
    metrics_thread.start()

    global orchestrator
    if args.mode == "thread":
        orchestrator = run_local_thread_dcop(
            algo,
            cg,
            distribution,
            dcop,
            INFINITY,
            collector=metrics_queue,
            collect_moment=args.collect_on,
            period=period,
            delay=args.delay,
            uiport=args.uiport,
        )
    elif args.mode == "process":
        # Disable logs from agents, they are in other processes anyway
        agt_logs = logging.getLogger("pydcop.agent")
        agt_logs.disabled = True

        # When using the (default) 'fork' start method, http servers on
        # agent's processes do not work (why ?)
        multiprocessing.set_start_method("spawn")
        orchestrator = run_local_process_dcop(
            algo,
            cg,
            distribution,
            dcop,
            INFINITY,
            collector=metrics_queue,
            collect_moment=args.collect_on,
            period=period,
            delay=args.delay,
            uiport=args.uiport,
        )

    try:
        orchestrator.deploy_computations()
        orchestrator.run(timeout=timeout)
        if timer:
            timer.cancel()
        if not timeout_stopped:
            if orchestrator.status == "TIMEOUT":
                _results("TIMEOUT")
                sys.exit(0)
            elif orchestrator.status != "STOPPED":
                _results("FINISHED")
                sys.exit(0)
    # in case it did not stop, dump remaining threads
    except Exception as e:
        logger.error(e, exc_info=1)
        orchestrator.stop_agents(5)
        orchestrator.stop()
        _results("ERROR")
def run_cmd(args, timer=None, timeout=None):
    """Run the "run" command: solve a dynamic DCOP with a scenario.

    Loads the DCOP and scenario, builds and distributes the computation
    graph, replicates computations to ``args.ktarget`` agents, then plays
    the scenario until completion or ``timeout``.

    Fix vs previous version: removed a dead ``_load_modules(None, args.algo)``
    call whose results were unconditionally overwritten a few lines later.

    Side effects: sets the module-level ``INFINITY``, ``collect_on``,
    ``output_file``, ``dcop`` and ``orchestrator`` globals; may call
    ``sys.exit``.
    """
    logger.debug('dcop command "run" with arguments {}'.format(args))

    global INFINITY, collect_on, output_file
    INFINITY = args.infinity
    collect_on = args.collect_on
    output_file = args.output

    # Periodic collection needs a period; other modes must not set one.
    period = None
    if args.collect_on == "period":
        period = 1 if args.period is None else args.period
    else:
        if args.period is not None:
            _error('Cannot use "period" argument when collect_on is not '
                   '"period"')

    csv_cb = prepare_metrics_files(args.run_metrics, args.end_metrics,
                                   collect_on)

    global dcop
    logger.info("loading dcop from {}".format(args.dcop_files))
    dcop = load_dcop_from_file(args.dcop_files)
    dcop = filter_dcop(dcop)

    # Known distribution methods are loaded as modules; anything else is
    # treated later as a distribution file.
    if args.distribution in DISTRIBUTION_METHODS:
        dist_module, algo_module, graph_module = _load_modules(
            args.distribution, args.algo)
    else:
        dist_module, algo_module, graph_module = _load_modules(None, args.algo)

    logger.info("loading scenario from {}".format(args.scenario))
    scenario = load_scenario_from_file(args.scenario)

    logger.info("Building computation graph ")
    cg = graph_module.build_computation_graph(dcop)

    logger.info("Distributing computation graph ")
    if dist_module is not None:
        distribution = dist_module.distribute(
            cg,
            dcop.agents.values(),
            hints=dcop.dist_hints,
            computation_memory=algo_module.computation_memory,
            communication_load=algo_module.communication_load,
        )
    else:
        distribution = load_dist_from_file(args.distribution)
    logger.debug("Distribution Computation graph: %s ", distribution)

    algo = build_algo_def(algo_module, args.algo, dcop.objective,
                          args.algo_params)

    # Setup metrics collection: daemon thread drains the queue into csv_cb.
    collector_queue = Queue()
    collect_t = Thread(target=collect_tread,
                       args=[collector_queue, csv_cb],
                       daemon=True)
    collect_t.start()

    global orchestrator
    if args.mode == "thread":
        orchestrator = run_local_thread_dcop(
            algo,
            cg,
            distribution,
            dcop,
            INFINITY,
            collector=collector_queue,
            collect_moment=args.collect_on,
            period=period,
            replication=args.replication_method,
        )
    elif args.mode == "process":
        # Disable logs from agents, they are in other processes anyway
        agt_logs = logging.getLogger("pydcop.agent")
        agt_logs.disabled = True

        # When using the (default) 'fork' start method, http servers on
        # agent's processes do not work (why ?)
        multiprocessing.set_start_method("spawn")
        # NOTE(review): unlike thread mode, no replication method is passed
        # here — confirm whether process mode supports replication.
        orchestrator = run_local_process_dcop(
            algo,
            cg,
            distribution,
            dcop,
            INFINITY,
            collector=collector_queue,
            collect_moment=args.collect_on,
            period=period,
        )

    orchestrator.set_error_handler(_orchestrator_error)

    try:
        orchestrator.deploy_computations()
        orchestrator.start_replication(args.ktarget)
        if orchestrator.wait_ready():
            orchestrator.run(scenario, timeout=timeout)
        if timer:
            timer.cancel()
        if not timeout_stopped:
            if orchestrator.status == "TIMEOUT":
                _results("TIMEOUT")
                sys.exit(0)
            elif orchestrator.status != "STOPPED":
                _results("FINISHED")
                sys.exit(0)
    except Exception as e:
        logger.error(e, exc_info=1)
        print(e)
        # Dump the stack of every live thread to help diagnose hangs.
        for th in threading.enumerate():
            print(th)
            traceback.print_stack(sys._current_frames()[th.ident])
            print()
        orchestrator.stop_agents(5)
        orchestrator.stop()
        _results("ERROR")
def run_cmd(args, timer):
    """Run a dynamic DCOP from a pre-computed distribution file.

    Loads the DCOP, its distribution and a scenario from files, replicates
    computations to ``args.ktarget`` agents and plays the scenario, in
    thread or process mode.

    Side effects: sets the module-level ``INFINITY``, ``collect_on``,
    ``dcop`` and ``orchestrator`` globals.
    """
    logger.debug('dcop command "run" with arguments {}'.format(args))

    global INFINITY, collect_on
    INFINITY = args.infinity
    collect_on = args.collect_on

    # Periodic collection needs a period; other modes must not set one.
    period = None
    if args.collect_on == 'period':
        period = 1 if args.period is None else args.period
    elif args.period is not None:
        _error('Cannot use "period" argument when collect_on is not '
               '"period"')

    csv_cb = prepare_metrics_files(args.run_metrics, args.end_metrics,
                                   collect_on)

    _, algo_module, graph_module = _load_modules(None, args.algo)

    global dcop
    logger.info('loading dcop from {}'.format(args.dcop_files))
    dcop = load_dcop_from_file(args.dcop_files)

    logger.info('Loading distribution from {}'.format(args.distribution))
    distribution = load_dist_from_file(args.distribution)

    # FIXME: load replica dist from file and pass to orchestrator
    # logger.info('Loading replica distribution from {}'.format(
    #     args.distribution))
    # replica_dist = load_replica_dist_from_file(args.replica_dist)
    # logger.info('Dcop distribution : %s', replica_dist)

    logger.info('loading scenario from {}'.format(args.scenario))
    scenario = load_scenario_from_file(args.scenario)

    logger.info('Building computation graph ')
    cg = graph_module.build_computation_graph(dcop)

    algo = build_algo_def(algo_module, args.algo, dcop.objective,
                          args.algo_params)

    # Setup metrics collection: daemon thread drains the queue into csv_cb.
    metrics_queue = Queue()
    metrics_thread = Thread(target=collect_tread,
                            args=[metrics_queue, csv_cb],
                            daemon=True)
    metrics_thread.start()

    global orchestrator
    if args.mode == 'thread':
        orchestrator = run_local_thread_dcop(
            algo, cg, distribution, dcop, INFINITY,
            collector=metrics_queue,
            collect_moment=args.collect_on,
            period=period,
            replication=args.replication_method)
    elif args.mode == 'process':
        # Disable logs from agents, they are in other processes anyway
        agt_logs = logging.getLogger('pydcop.agent')
        agt_logs.disabled = True

        # When using the (default) 'fork' start method, http servers on
        # agent's processes do not work (why ?)
        multiprocessing.set_start_method('spawn')
        orchestrator = run_local_process_dcop(
            algo, cg, distribution, dcop, INFINITY,
            collector=metrics_queue,
            collect_moment=args.collect_on,
            period=period)

    orchestrator.set_error_handler(_orchestrator_error)

    try:
        orchestrator.deploy_computations()
        orchestrator.start_replication(args.ktarget)
        if orchestrator.wait_ready():
            orchestrator.run(scenario)
        # orchestrator.run(scenario) # FIXME
    except Exception as e:
        logger.error(e, exc_info=1)
        print(e)
        # Dump the stack of every live thread to help diagnose hangs.
        for th in threading.enumerate():
            print(th)
            traceback.print_stack(sys._current_frames()[th.ident])
            print()
        orchestrator.stop_agents(5)
        orchestrator.stop()
        _results('ERROR', e)
def run_cmd(args, timer: Timer):
    """Compute and output a replica distribution for a DCOP.

    Builds the computation graph for ``args.algo``, loads an agent
    distribution from file, runs the replication protocol at level
    ``args.ktarget`` and dumps the resulting replica hosts as YAML
    (to ``args.output`` if given, and to stdout).

    Fixes vs previous version: removed the duplicated ``global orchestrator``
    declaration and folded the post-hoc ``distribution`` key into the result
    dict literal.

    Side effects: sets the module-level ``orchestrator`` global; calls
    ``sys.exit(0)`` on success.
    """
    logger.debug('Distribution replicas : %s', args)
    global orchestrator

    # global dcop
    logger.info('loading dcop from {}'.format(args.dcop_files))
    dcop = load_dcop_from_file(args.dcop_files)

    try:
        algo_module = import_module('pydcop.algorithms.{}'.format(args.algo))
        algo = build_algo_def(algo_module, args.algo, dcop.objective,
                              [])  # FIXME : algo params needed?

        graph_module = import_module('pydcop.computations_graph.{}'.format(
            algo_module.GRAPH_TYPE))
        logger.info('Building computation graph ')
        cg = graph_module.build_computation_graph(dcop)
        logger.info('Computation graph : %s', cg)
    except ImportError:
        _error('Could not find module for algorithm {} or graph model '
               'for this algorithm'.format(args.algo))

    logger.info('loading distribution from {}'.format(args.distribution))
    distribution = load_dist_from_file(args.distribution)

    INFINITY = 10000  # FIXME should not be mandatory

    if args.mode == 'thread':
        orchestrator = run_local_thread_dcop(algo, cg, distribution, dcop,
                                             INFINITY,
                                             replication=args.replication)
    elif args.mode == 'process':
        # Disable logs from agents, they are in other processes anyway
        agt_logs = logging.getLogger('pydcop.agent')
        agt_logs.disabled = True

        # When using the (default) 'fork' start method, http servers on
        # agent's processes do not work (why ?)
        multiprocessing.set_start_method('spawn')
        orchestrator = run_local_process_dcop(algo, cg, distribution, dcop,
                                              INFINITY,
                                              replication=args.replication)

    try:
        orchestrator.deploy_computations()
        orchestrator.start_replication(args.ktarget)
        orchestrator.wait_ready()
        orchestrator.stop_agents(5)
        orchestrator.stop()
        timer.cancel()

        # Map each computation to the list of agents hosting its replicas.
        rep_dist = {
            c: list(hosts)
            for c, hosts in orchestrator.mgt.replica_hosts.items()
        }
        result = {
            'inputs': {
                'dcop': args.dcop_files,
                'algo': args.algo,
                'replication': args.replication,
                'k': args.ktarget,
                'distribution': args.distribution,
            },
            'replica_dist': rep_dist,
        }
        if args.output is not None:
            with open(args.output, encoding='utf-8', mode='w') as fo:
                fo.write(yaml.dump(result))
        print(yaml.dump(result))
        sys.exit(0)

        # TODO : retrieve and display replica distribution
        # Each agent should send back to the orchestrator the agents hosting
        # the replicas for each of it's computations
    except Exception as e:
        orchestrator.stop_agents(5)
        orchestrator.stop()
        _error('ERROR', e)
def run_cmd(args, timer: Timer = None, timeout=None):
    """Compute a replica distribution and report replication metrics.

    Builds the computation graph for ``args.algo``, loads an agent
    distribution from file, runs the replication protocol at level
    ``args.ktarget``, collects per-agent message counts/sizes and the
    replication duration, and dumps everything as YAML (to ``args.output``
    if given, otherwise to stdout).

    Fix vs previous version: removed the duplicated ``global orchestrator``
    declaration.

    Side effects: sets the module-level ``orchestrator`` global; calls
    ``sys.exit(0)`` on success.
    """
    logger.debug("Distribution replicas : %s", args)
    global orchestrator

    # global dcop
    logger.info("loading dcop from {}".format(args.dcop_files))
    dcop = load_dcop_from_file(args.dcop_files)

    try:
        algo_module = load_algorithm_module(args.algo)
        algo = build_algo_def(algo_module, args.algo, dcop.objective,
                              [])  # FIXME : algo params needed?

        graph_module = import_module("pydcop.computations_graph.{}".format(
            algo_module.GRAPH_TYPE))
        logger.info("Building computation graph ")
        cg = graph_module.build_computation_graph(dcop)
        logger.info("Computation graph : %s", cg)
    except ImportError:
        _error("Could not find module for algorithm {} or graph model "
               "for this algorithm".format(args.algo))

    logger.info("loading distribution from {}".format(args.distribution))
    distribution = load_dist_from_file(args.distribution)

    INFINITY = 10000  # FIXME should not be mandatory

    if args.mode == "thread":
        orchestrator = run_local_thread_dcop(algo, cg, distribution, dcop,
                                             INFINITY,
                                             replication=args.replication)
    elif args.mode == "process":
        # Disable logs from agents, they are in other processes anyway
        agt_logs = logging.getLogger("pydcop.agent")
        agt_logs.disabled = True

        # When using the (default) 'fork' start method, http servers on
        # agent's processes do not work (why ?)
        multiprocessing.set_start_method("spawn")
        orchestrator = run_local_process_dcop(algo, cg, distribution, dcop,
                                              INFINITY,
                                              replication=args.replication)

    try:
        orchestrator.deploy_computations()
        start_t = time.time()
        orchestrator.start_replication(args.ktarget)
        orchestrator.wait_ready()

        # Aggregate per-agent replication message counts and sizes.
        # print(f" Replication Metrics {orchestrator.replication_metrics()}")
        metrics = orchestrator.replication_metrics()
        msg_count, msg_size = 0, 0
        for a in metrics:
            msg_count += metrics[a]["count_ext_msg"]
            msg_size += metrics[a]["size_ext_msg"]
        # print(f" Count: {msg_count} - Size {msg_size}")
        duration = time.time() - start_t
        if timer:
            timer.cancel()

        # Map each computation to the list of agents hosting its replicas.
        rep_dist = {
            c: list(hosts)
            for c, hosts in orchestrator.mgt.replica_hosts.items()
        }
        orchestrator.stop_agents(5)
        orchestrator.stop()
        result = {
            "inputs": {
                "dcop": args.dcop_files,
                "algo": args.algo,
                "replication": args.replication,
                "k": args.ktarget,
            },
            "metrics": {
                "duration": duration,
                "msg_size": msg_size,
                "msg_count": msg_count,
            },
            "replica_dist": rep_dist,
        }
        result["inputs"]["distribution"] = args.distribution
        if args.output is not None:
            with open(args.output, encoding="utf-8", mode="w") as fo:
                fo.write(yaml.dump(result))
        else:
            print(yaml.dump(result))
        sys.exit(0)

        # TODO : retrieve and display replica distribution
        # Each agent should send back to the orchestrator the agents hosting
        # the replicas for each of it's computations
    except Exception as e:
        orchestrator.stop_agents(5)
        orchestrator.stop()
        _error("ERROR", e)