def proc_run(sync_barrier: multiprocessing.Barrier, fin_event: multiprocessing.Event,
             args: argparse.Namespace):
    if args.e is None:
        experiment_names = list(_EXPERIMENTS.keys())
    else:
        experiment_names = args.e
    try:
        pid = sync_barrier.wait()
        device_string = str(args.cuda_device[pid])
        os.environ['CUDA_VISIBLE_DEVICES'] = device_string
        # Init torch in order to occupy GPU.
        torch.cuda.init()
        for experiment_name in experiment_names:
            sync_barrier.wait()
            out_log_path = os.path.join(
                args.dir, f'pid{os.getpid()}-{pid}_{experiment_name}.log')
            err_log_path = os.path.join(
                args.dir, f'pid{os.getpid()}-{pid}_{experiment_name}.err.log')
            sys.stdout = utils_io.LogFile(out_log_path, lazy_create=True)
            sys.stderr = utils_io.LogFile(err_log_path, lazy_create=True)
            print(f'CUDA_VISIBLE_DEVICES = {device_string}')
            experiment = _EXPERIMENTS[experiment_name]
            experiment(sync_barrier, pid, sync_barrier.parties, args)
    except threading.BrokenBarrierError:
        print('Aborted from outside!')
    finally:
        fin_event.set()
class TestBlockingSocketTransferer(unittest.TestCase):
    TEST_PORT = 8000

    def setUp(self) -> None:
        try:
            from pytest_cov.embed import cleanup_on_sigterm
        except ImportError:
            pass
        else:
            cleanup_on_sigterm()
        self.barrier = Barrier(2)
        self.p = None

    def tearDown(self) -> None:
        self.p.join()
        TestBlockingSocketTransferer.TEST_PORT += 1

    def test_send_text(self):
        self.p = Process(target=message_sender,
                         args=(self.barrier, TestBlockingSocketTransferer.TEST_PORT))
        self.p.start()
        self.barrier.wait()
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.connect(('localhost', TestBlockingSocketTransferer.TEST_PORT))
        socket_transferer = BlockingSocketTransferer(sock)
        self.assertEqual(socket_transferer.receive_plain_text(), "Hola uacho")
        socket_transferer.close()

    def test_send_file(self):
        with open('/tmp/big_dummy_file_test', 'wb') as dummy_file:
            for i in range(100000):
                dummy_file.write(("%d%d%d" % (i, i, i)).encode('utf-8'))
        sha256 = hashlib.sha256()
        with open('/tmp/big_dummy_file_test', 'rb') as dummy_file:
            while True:
                data = dummy_file.read(2048)
                if not data:
                    break
                sha256.update(data)
        original_hash = sha256.hexdigest()
        self.p = Process(target=file_sender,
                         args=(self.barrier, TestBlockingSocketTransferer.TEST_PORT,
                               '/tmp/big_dummy_file_test'))
        self.p.start()
        self.barrier.wait()
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.connect(('localhost', TestBlockingSocketTransferer.TEST_PORT))
        socket_transferer = BlockingSocketTransferer(sock)
        with open('/tmp/big_dummy_file_test_out', 'wb') as write_file:
            socket_transferer.receive_file_data(write_file)
        sha256 = hashlib.sha256()
        with open('/tmp/big_dummy_file_test_out', 'rb') as dummy_file:
            while True:
                data = dummy_file.read(2048)
                if not data:
                    break
                sha256.update(data)
        self.assertEqual(sha256.hexdigest(), original_hash)
        os.remove('/tmp/big_dummy_file_test')
        os.remove('/tmp/big_dummy_file_test_out')
        socket_transferer.close()
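# Hedged sketch of the message_sender helper the tests above assume.
# send_plain_text is an assumption mirroring the receive_plain_text call in the
# test; a file_sender would follow the same bind/wait/accept pattern with a
# send_file_data counterpart of receive_file_data.
def message_sender(barrier, port):
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    # Bind and listen before hitting the barrier so the client cannot
    # connect before the server is ready.
    server.bind(('localhost', port))
    server.listen(1)
    barrier.wait()  # releases the test process, which then connects
    conn, _ = server.accept()
    socket_transferer = BlockingSocketTransferer(conn)
    socket_transferer.send_plain_text("Hola uacho")  # assumed API
    socket_transferer.close()
    server.close()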
def test(self):
    filename = str(tempfile.mktemp())

    def process(cur_count: int, barrier: Barrier):
        try:
            logger = logging.Logger('a logger')
            handler = rolling.MPRotatingFileHandler(
                filename, 'a', self.FILE_SIZE, self.FILE_COUNT
            )
            logger.setLevel(20)
            logger.addHandler(handler)
            sleep(1)  # This is just to simulate presence of handlers
            s = 'Proc {}, Pid {}'.format(cur_count, os.getpid())
            s += '*' * (self.FILE_SIZE - len(s) - 2)
            logger.info(s)
        finally:
            barrier.wait()

    b = Barrier(self.PROCESS_COUNT + 1)
    processes = [Process(target=process, args=(i, b,))
                 for i in range(self.PROCESS_COUNT)]
    for p in processes:
        p.start()
    b.wait()
    base_filename = os.path.basename(filename)
    count = sum([_file_len('{}/{}'.format(os.path.dirname(filename), x))
                 for x in os.listdir(os.path.dirname(filename))
                 if base_filename in x]) - 1
    self.assertEqual(self.PROCESS_COUNT, count)
def _to_device_loop(self, initialization_barrier: multiprocessing.Barrier):
    try:
        to_ser_sock = self.zmq_ctx.socket(zmq.SUB)
        addr = 'tcp://{}:{}'.format(self._base_addr, self._to_port_num)
        to_ser_sock.bind(addr)
        to_ser_sock.setsockopt(zmq.SUBSCRIBE, b'')
        logger(__name__).info('Bound to device broadcaster as a subscriber to {}'.format(addr))
        watchdog = threading.Timer(10, self.kill)
        initialization_barrier.wait()
        watchdog.start()
        while not self.dying.is_set():
            msg = to_ser_sock.recv_multipart()
            if not msg or self.dying.is_set():
                continue
            if msg[0] == b'kick':
                logger(__name__).debug('Kicking watchdog on server {}'.format(threading.current_thread()))
                watchdog.cancel()
                watchdog = threading.Timer(
                    msg[1][1] if len(msg) > 1 and len(msg[1]) > 0 else 5,
                    self.kill)
                watchdog.start()
            elif msg[0] == b'send':
                logger(self).debug('Writing {} to {}'.format(bytes_to_str(msg[1]), self.port.port_name))
                self.port.write(msg[1])
    except Exception as e:
        initialization_barrier.abort()
        logger(__name__).exception(e)
    logger(__name__).warning('To Device Broadcaster is dying now.')
    try:
        self.kill(do_join=False)
    except:
        sys.exit(0)
def train(self, num_episodes: int, batch_size: int, decay: float, n_steps: int,
          experience_queue: Queue, queue_barrier: Barrier,
          exit_condition: Optional[Callable[[], bool]] = None) -> None:
    """
    Trains the algorithm on the environment for the given number of episodes.

    Args:
        num_episodes: The number of episodes to train for.
        batch_size: The number of ready experiences to train on at a time.
        decay: The decay factor applied to the n-step returns.
        n_steps: The number of steps per n-step return.
        experience_queue: The queue to send experiences to.
        queue_barrier: A barrier to wait on once all queue tasks are complete
            on all processes.
        exit_condition: An alternative exit condition to num_episodes, used
            if given.
    """
    self.om.train(num_episodes, batch_size, decay, n_steps, experience_queue,
                  exit_condition=exit_condition)

    # Wait for all processes to finish using queues
    queue_barrier.wait()
def request_concurrent_wrapper(self, method_name: str, method_args: dict,
                               thread_num: int, num_threads: int,
                               barrier: mp.Barrier, receive_pipe: mp.Pipe):
    # wait for all threads
    barrier.wait()
    # call the appropriate request method
    if method_name == 'Getter':
        if method_args['command'] == 'list':
            response = self.request_getter(command='list')
        else:
            response = self.request_getter(command='get_file_url',
                                           filename=method_args['filename'])
    elif method_name == 'FeedGenerator':
        response = self.request_feed_generator(num_items=method_args['num_items'])
    elif method_name == 'FeedWebView':
        response = self.request_feed_webview(num_items=method_args['num_items'])
    elif method_name == 'Putter':
        # TODO
        raise NotImplementedError()
    # add concurrent metadata
    response.concurrent = True
    response.thread_num = thread_num
    response.num_threads = num_threads
    # return TestData
    receive_pipe.send(response)
def run(test, count, concurrency, *, loop, verbose, profile):
    if verbose:
        print("Prepare")
    else:
        print('.', end='', flush=True)
    host, port = find_port()
    barrier = Barrier(2)
    server = Process(target=test, args=(host, port, barrier, profile))
    server.start()
    barrier.wait()

    url = 'http://{}:{}'.format(host, port)
    connector = aiohttp.TCPConnector(loop=loop)
    with aiohttp.ClientSession(connector=connector) as client:
        for i in range(10):
            # make server hot
            resp = yield from client.get(url + '/prepare')
            assert resp.status == 200, resp.status
            yield from resp.release()
        if verbose:
            test_name = test.__name__
            print("Attack", test_name)
        rps, data = yield from attack(count, concurrency, client, loop, url)
        if verbose:
            print("Done")
        resp = yield from client.get(url + '/stop')
        assert resp.status == 200, resp.status
        yield from resp.release()
    server.join()
    return rps, data
class BarrierNameFilter(Filter):
    def __init__(self):
        self._barrier = Barrier(2)

    def filter(self, items: Iterable[Any]) -> Iterable[Any]:
        self._barrier.wait()
        yield f"pid-{current_process().pid}"
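# Hedged usage sketch for BarrierNameFilter: the filter (and the Barrier it
# holds) is passed to each Process at creation time, so both workers share the
# same two-party barrier; _consume and the worker count are illustrative.
def _consume(name_filter: BarrierNameFilter):
    # Advancing the generator blocks at the barrier until both consumers arrive.
    for item in name_filter.filter([]):
        print(item)

if __name__ == '__main__':
    shared_filter = BarrierNameFilter()
    consumers = [Process(target=_consume, args=(shared_filter,)) for _ in range(2)]
    for c in consumers:
        c.start()
    for c in consumers:
        c.join()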
def _start_single_replay(
    self,
    replay_index: int,
    offset: int,
    skip_offset_barrier: multiprocessing.Barrier,
    latest_timestamp: ValueProxy,
):
    setup_logger("info")
    self._create_producer()
    self.processor.set_latest_timestamp_valueproxy(latest_timestamp)

    _, lines = open_recording(self.bucket, self.key)

    # Skip header
    for _ in range(4):
        next(lines)

    # "Fast-forward" recording to offset
    for _ in range(offset):
        next(lines)

    logger.info(
        f"Replay {replay_index+1} of {skip_offset_barrier.parties} ready")
    skip_offset_barrier.wait()

    # Scheduler is used to replay messages with original relative timing
    scheduler = MultithreadingScheduler()
    scheduler.start()

    if replay_index == 0:
        _start_latency_marker_generator(
            self.config, self.processor.generate_latency_markers, self._ingest)

    # Necessary to start immediately despite "fast-forwarding"
    recording_start_offset = None

    message_regex = re.compile(r'(\S+) "(.+)" (\d) (\d) (\S*)')
    for i, line in enumerate(lines):
        # Prevent queue from growing too fast
        # Only check every 100 iterations for performance reasons
        if i % 100 == 0 and scheduler.is_queue_full():
            time.sleep(0.2)
            continue
        t_offset, topic, _, _, payload = message_regex.match(line).groups()
        t_offset = float(t_offset)
        payload = json.loads(base64.b64decode(payload))
        if recording_start_offset is None:
            recording_start_offset = t_offset
        scheduler.schedule(
            t_offset - recording_start_offset,
            functools.partial(self._process_and_ingest, topic, payload,
                              replay_index),
        )
    scheduler.stop()
def count_down_m(cnt: int, b: multiprocessing.Barrier):
    b.wait()
    print(f'Starting {time.ctime()}')
    t = time.time()
    while cnt > 0:
        cnt -= 1
    delta = time.time() - t
    print(f"time taken {delta}")
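# Possible driver for count_down_m (process and iteration counts are
# illustrative): the shared barrier releases all counters at once so their
# timed loops start together.
if __name__ == '__main__':
    n_procs = 4
    start_together = multiprocessing.Barrier(n_procs)
    procs = [multiprocessing.Process(target=count_down_m,
                                     args=(1_000_000, start_together))
             for _ in range(n_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()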
class SNMPAgent(Process):
    '''Execute a SNMP agent Process'''

    def __init__(self, port, responder):
        Process.__init__(self, daemon=True)
        timeout_s = 5
        self.__listening_port = port
        self.__responder = responder
        self.__barrier = Barrier(parties=2, timeout=timeout_s)

    def run(self):
        snmpEngine = engine.SnmpEngine()
        config.addSocketTransport(
            snmpEngine,
            udp.domainName,
            udp.UdpTransport().openServerMode(('127.0.0.1', self.__listening_port))
        )
        config.addV1System(
            snmpEngine, 'my-area', 'public', contextName='my-context')
        config.addVacmUser(snmpEngine=snmpEngine,
                           securityModel=2,
                           securityName='my-area',
                           securityLevel='noAuthNoPriv',
                           readSubTree=SNMPAgentResponder.OID_PREFIX,
                           writeSubTree=(),
                           notifySubTree=())
        snmpContext = context.SnmpContext(snmpEngine)
        snmpContext.registerContextName(
            v2c.OctetString('my-context'),  # Context Name
            self.__responder                # Management Instrumentation
        )
        cmdrsp.GetCommandResponder(snmpEngine, snmpContext)
        snmpEngine.transportDispatcher.jobStarted(1)
        self.__barrier.wait()
        # TODO with statement here!
        try:
            snmpEngine.transportDispatcher.runDispatcher()
        except:
            snmpEngine.transportDispatcher.closeDispatcher()
            raise

    def __enter__(self):
        self.start()
        self.__barrier.wait()
        return self

    def __exit__(self, type, value, traceback):
        self.terminate()
class RabbitConnectionExample:
    """RabbitMQ operations"""

    def __init__(self):
        """Initializes the class"""
        self._url = os.environ['RABBITMQ_URL']
        self._barrier = Barrier(2, timeout=120)

    def connection_callback(self, conn):
        """
        Run on connecting to the server

        :param conn: The connection created in the previous step
        """
        self._connection.channel(on_open_callback=self.channel_callback)

    def channel_callback(self, ch):
        """
        Publish on the channel. You can use other methods with callbacks,
        but only the channel creation method provides a channel. Other
        methods provide a frame you can choose to discard.

        :param ch: The channel established
        """
        properties = pika.BasicProperties(content_type='application/json')
        ch.basic_publish(exchange='test_exchange',
                         routing_key='tests',
                         properties=properties,
                         body='Hello CloudAMQP!')
        self._barrier.wait(timeout=1)
        ch.close()
        self._connection.close()

    def run(self):
        """Runs the example"""
        print("Running")

        def run_io_loop(conn):
            conn.ioloop.start()

        params = pika.URLParameters(self._url)
        self._connection = pika.SelectConnection(
            params, on_open_callback=self.connection_callback)
        if self._connection:
            t = threading.Thread(target=run_io_loop, args=(self._connection, ))
            t.start()
            print("Waiting on Barrier")
            self._barrier.wait(timeout=30)
            self._connection.ioloop.stop()
        else:
            raise ValueError
def manage_data_m(b: multiprocessing.Barrier):
    app_ = new_app()
    name = multiprocessing.process.current_process().name
    b.wait()
    t = time.time()
    result = app_.send_task('tasks.data', args=())
    data = result.get()
    t2 = time.time()
    delta = t2 - t
    logging.info(f'The overall time taken is {delta}')
def ext_pot_init(ext_model, ext_kwargs, Potential_Q_list: list, barrier: Barrier,
                 N_NEURON: int, ticks: int):
    N_N_THREAD = len(Potential_Q_list)
    for count in range(ticks):
        total_potentials = [[] for _ in range(N_N_THREAD)]
        external = ext_model(**ext_kwargs)
        for n in external:
            total_potentials[n[-1] // N_NEURON].append(n)
        barrier.wait()
        for idx, ftn in enumerate(total_potentials):
            Potential_Q_list[idx].put(ftn)
def startBackgroundStreaming(self, isMaster, intervalInMicroseconds, filePath,
                             bufferSize=2 << 20, axis0=False, axis1=False,
                             axis2=False):
    """
    Starts concurrent and permanent position streaming to file in background.

    The program must run inside a main function:

        def main():
            do_everything()

        if __name__ == '__main__':
            main()

    Parameters
    ----------
    isMaster : bool
        Master
    intervalInMicroseconds : int
        Sample interval (in us) of the position samples
    filePath : str
        Target file
    bufferSize : int
        Size of each buffer in bytes
    axis0 : bool, default: False
        Should axis 0 be recorded?
    axis1 : bool, default: False
        Should axis 1 be recorded?
    axis2 : bool, default: False
        Should axis 2 be recorded?
    """
    if self.background_process is not None:
        raise Exception("Stream recording already started")
    barrier = Barrier(2)
    stopped = Value('b', False)
    self.background_process = Process(
        target=fileWriter,
        args=(self.device.address, isMaster, intervalInMicroseconds, filePath,
              bufferSize, axis0, axis1, axis2, barrier, stopped))
    self.background_process.daemon = True
    self.background_process.stopped = stopped
    self.background_process.start()
    barrier.wait()
    sleep(0.1)
    return
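# Hedged companion sketch for stopping the stream. It assumes fileWriter polls
# the shared `stopped` flag and exits when it is set, which the setup above
# suggests but does not show.
def stopBackgroundStreaming(self):
    if self.background_process is None:
        raise Exception("Stream recording not started")
    # Signal the writer process via the shared flag, then wait for it to exit.
    self.background_process.stopped.value = True
    self.background_process.join()
    self.background_process = None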
def publish_a_file(dir, port) -> Process:
    barrier = Barrier(2)

    def func(dir, port, barrier):
        with TCPServer(("", port), simple_http_handler(dir)) as server:
            barrier.wait()
            server.handle_request()

    proc = Process(target=func, args=(dir, port, barrier))
    proc.start()
    barrier.wait()
    time.sleep(0.05)
    return proc
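# Illustrative call; directory and port are placeholders. The returned process
# serves exactly one request, so join it after the single fetch.
if __name__ == '__main__':
    server_proc = publish_a_file('.', 8000)
    # ... issue exactly one HTTP request against localhost:8000 here ...
    server_proc.join()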
def __init__(self, bully_connections_config: Dict[int, Tuple], workers_config: Dict,
             lowest_port: int, host_id: int):
    """
    Initializes connections and bully.

    :param bully_connections_config: Dictionary that maps numerical host ids to a
        tuple with the host ip and port that will have a listening socket to
        receive messages.
    :param workers_config: Dictionary with all the configuration of the running
        workers in the system.
    :param lowest_port: Integer representing the lowest port on which to listen
        to other nodes.
    :param host_id: Numerical id of this host for the bully algorithm.
    """
    self._workers_config = workers_config
    self._bully_connections_config = bully_connections_config
    self._host_id = host_id
    self._sockets_to_send_messages = {}

    bully_leader_election = BullyLeaderElection(
        host_id, list(bully_connections_config.keys()) + [host_id])

    manager = Manager()
    concurrent_dict = manager.dict()
    concurrent_dict['bully'] = bully_leader_election
    self._bully_leader_election_dict = concurrent_dict
    self._bully_leader_election_lock = Lock()
    self._sending_connections = manager.dict()

    open_sockets_barrier = Barrier(len(bully_connections_config) + 1)

    for i in range(len(bully_connections_config)):
        bully_message_receiver = BullyMessageReceiver(
            host_id, lowest_port + i, self._bully_leader_election_dict,
            self._bully_leader_election_lock, self._sending_connections,
            open_sockets_barrier)
        listening_process = Process(
            target=bully_message_receiver.start_listening)
        listening_process.start()

    for h_id, host_and_port in bully_connections_config.items():
        self._sending_connections[h_id] = open_sending_socket_connection(
            host_and_port[0], host_and_port[1])

    # This barrier ensures that a listening process only tries to access the
    # sending connections after they are all initialized.
    open_sockets_barrier.wait()
def test_for_each_task(self):
    NUM_PROCS = 12
    barrier = Barrier(NUM_PROCS + 1)

    def proc_func():
        barrier.wait()

    try:
        procs = [Process(target=proc_func) for _ in range(NUM_PROCS)]
        for proc in procs:
            proc.start()
        pids = {task.pid.value_() for task in for_each_task(self.prog)}
        for proc in procs:
            self.assertIn(proc.pid, pids)
        self.assertIn(os.getpid(), pids)
        barrier.wait()
    except BaseException:
        barrier.abort()
        for proc in procs:
            proc.terminate()
        raise
def test_threads(self):
    NUM_PROCS = 12
    barrier = Barrier(NUM_PROCS + 1)

    def proc_func():
        barrier.wait()

    try:
        procs = [Process(target=proc_func) for _ in range(NUM_PROCS)]
        for proc in procs:
            proc.start()
        pids = {thread.tid for thread in self.prog.threads()}
        for proc in procs:
            self.assertIn(proc.pid, pids)
        self.assertIn(os.getpid(), pids)
        barrier.wait()
    except BaseException:
        barrier.abort()
        for proc in procs:
            proc.terminate()
        raise
def run_http(*messages):
    def run(message, queue, bar):
        server = create_server(message)
        queue.put(server.server_port)
        bar.wait()
        server.serve_forever()

    queue = Queue()
    bar = Barrier(len(messages) + 1, timeout=5)
    processes = [
        Process(target=run, args=(message, queue, bar))
        for message in messages
    ]
    for p in processes:
        p.start()
    bar.wait()
    return processes, [
        'localhost:' + str(queue.get()) for x in range(len(processes))
    ]
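# A matching teardown helper could look like this sketch: serve_forever()
# never returns in the children, so terminating them is the assumed
# shutdown path.
def stop_http(processes):
    for p in processes:
        p.terminate()
    for p in processes:
        p.join()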
def _from_device_loop(self, initialization_barrier: multiprocessing.Barrier):
    errors = 0
    rxd = 0
    try:
        from_ser_sock = self.zmq_ctx.socket(zmq.PUB)
        addr = 'tcp://{}:{}'.format(self._base_addr, self._from_port_num)
        from_ser_sock.bind(addr)
        logger(__name__).info('Bound from device broadcaster as a publisher to {}'.format(addr))
        initialization_barrier.wait()
        buffer = bytearray()
        while not self.dying.is_set():
            try:
                # Read one byte as a blocking call so that we aren't just polling,
                # which sucks up a lot of CPU, then read everything available.
                buffer.extend(self.port.read(1))
                buffer.extend(self.port.read(-1))
                while b'\0' in buffer and not self.dying.is_set():
                    msg, buffer = buffer.split(b'\0', 1)
                    msg = cobs.decode(msg)
                    from_ser_sock.send_multipart((msg[:4], msg[4:]))
                    rxd += 1
                time.sleep(0)
            except Exception as e:
                # TODO: when getting a COBS decode error, rebroadcast the bytes on sout
                logger(__name__).error('Unexpected error handling {}'.format(bytes_to_str(msg[:-1])))
                logger(__name__).exception(e)
                errors += 1
                logger(__name__).info('Current from device broadcasting error rate: {} errors. {} successful. {}%'
                                      .format(errors, rxd, errors / (errors + rxd)))
    except Exception as e:
        initialization_barrier.abort()
        logger(__name__).exception(e)
    logger(__name__).warning('From Device Broadcaster is dying now.')
    logger(__name__).info('Current from device broadcasting error rate: {} errors. {} successful. {}%'
                          .format(errors, rxd, errors / (errors + rxd)))
    try:
        self.kill(do_join=False)
    except:
        sys.exit(0)
def get_benchmark(optimization_config, test_params=None):
    manager = Manager()
    test_result = manager.dict()
    if optimization_config.throughput_tuning_enabled:
        main_process = Process(name="main_process", target=get_throughput,
                               args=(optimization_config, test_params, test_result))
    else:
        main_process = Process(name="main_process", target=get_latency,
                               args=(optimization_config, test_params, test_result))

    process_list = []
    num_of_background = optimization_config.concurrency_num - 1
    if num_of_background > 0:
        synchronizer = Barrier(num_of_background + 1)
        for i in range(0, num_of_background):
            p = Process(name="{}_{}".format(SUB_PROCESS_NAME_PREFIX, i),
                        target=concurrent_inference,
                        args=(synchronizer, optimization_config, test_params))
            p.start()
            process_list.append(p)
        synchronizer.wait()

    # Execute the main func; we only collect the benchmark from this main func.
    main_process.start()
    main_process.join()

    # Once the main func has finished, stop the child processes.
    for p in process_list:
        if p.is_alive():
            p.terminate()
            logger.info("PID {} is killed".format(p.pid))
    return dict(test_result)
def test_when_parent_died(self):
    port = get_available_tcp_port()
    mediator_logger.info('starting test pid=%s', os.getpid())
    kill_parent = Barrier(2)

    def run_threaded_mediator():
        mediator = MockedPiperMediator(port=port)
        mediator.start()
        mediator.join()

    def run_doomed_parent_browser():
        mediator_logger.info('doomed_parent_browser pid=%s', os.getpid())
        mediator_process = Process(target=run_threaded_mediator)
        mediator_process.start()
        mediator_process.join()

    def on_sig_child(signum, frame):
        pid, status = os.wait()
        mediator_logger.info('reaped child signum=%s pid=%s status=%s',
                             signum, pid, status)

    def run_supervisor():
        signal.signal(signal.SIGCHLD, on_sig_child)
        doomed_parent_browser = Process(target=run_doomed_parent_browser)
        doomed_parent_browser.start()
        kill_parent.wait()
        doomed_parent_browser.terminate()
        doomed_parent_browser.join()

    signal.signal(signal.SIGCHLD, on_sig_child)
    supervisor = Process(target=run_supervisor)
    supervisor.start()
    api = api_must_ready(port, 'mocked')

    # kill parent and expect mediator to terminate as well
    kill_parent.wait()
    self.assertTrue(Waiter(api.pid_not_ready).wait(timeout=1.0))
    supervisor.join()
def main():
    print("Doing setup...")
    conn = psycopg2.connect(dsn)
    curs = conn.cursor()
    curs.execute("CREATE TABLE IF NOT EXISTS concurrent_tx(id integer primary key);")
    conn.commit()
    print("done")

    print("Creating workers...")
    run_barrier = Barrier(3, timeout=10)
    # Race to insert a row
    runner1 = ConcurrentRunner("runner1", "INSERT INTO concurrent_tx(id) VALUES (1);",
                               run_barrier, delay_seconds=0)
    runner2 = ConcurrentRunner("runner2", "INSERT INTO concurrent_tx(id) VALUES (1);",
                               run_barrier, delay_seconds=0.01)

    # Start them waiting on the barrier, letting them get their connections set up:
    print("Starting workers...")
    runner1.start()
    runner2.start()

    # and release the barrier. It won't actually get released until all workers
    # have connected and are also waiting on the barrier.
    print("Releasing barrier...")
    run_barrier.wait()

    print("Waiting for results...")
    # OK, we're running. Wait until both workers finish.
    workers = [runner1, runner2]
    for worker in workers:
        worker.get_result()

    # Wait for termination
    for worker in workers:
        worker.join()

    # and report results
    for worker in workers:
        if worker.exception is not None:
            print("Worker {0} got exception: {1}".format(worker.name, worker.exception))
        elif worker.result is not None:
            print("Worker {0} got result: {1}".format(worker.name, worker.result))
        else:
            print("Worker {0} succeeded silently.".format(worker.name))
def synapse_init(s_list: list, pre_Q_list: list, post_Q_list: list,
                 Potential_Q_list: list, barrier: Barrier, N_NEURON: int,
                 num: int, ticks: int, log_begin=-1):
    weight_log = []
    fired_log = []
    start_index = s_list[0].get_id()
    N_N_THREAD = len(Potential_Q_list)
    fired_to_neurons = [[] for _ in range(N_N_THREAD)]
    if num < N_N_THREAD:
        put_order = np.arange(num, num + N_N_THREAD) % N_N_THREAD
        get_order = np.arange(num + N_N_THREAD, num, -1) % N_N_THREAD
    else:
        put_order = np.arange(N_N_THREAD)
        get_order = np.arange(N_N_THREAD - 1, -1, -1)

    for count in range(ticks):
        pre_fired = []
        post_fired = []
        # Get from pre/post queue, sent by Neurons
        for idx in get_order:
            pre_fired.extend(pre_Q_list[idx].get())
            post_fired.extend(post_Q_list[idx].get())
        # Tell finished getting from queues
        barrier.wait()
        # Put items BEFORE calculation starts, i.e. putting the items from the past iter.
        for idx in put_order:
            Potential_Q_list[idx].put(fired_to_neurons[idx])
        if pre_fired == MULTI_sentinel or post_fired == MULTI_sentinel:
            # Log_q.put({
            #     MULTI_weight_log: weight_log,
            #     MULTI_fired_synapse_log: fired_log,
            # })
            break
        fired_to_neurons = [[] for _ in range(N_N_THREAD)]
        for s in s_list:
            s.tick()  # 2
        for s in post_fired:
            s_list[s[-1] - start_index].post_fired(s)
        for s in pre_fired:
            s_list[s[-1] - start_index].pre_fired(s)  # 3
        tmp_weight = []
        tmp_fired = []
        for idx, s in enumerate(s_list, start=start_index):
            if count >= log_begin:
                w = s.get_weight()
                if not w:
                    tmp_weight.append([idx, 0])
                else:
                    tmp_weight.append([idx, int(w * 100 + 0.5) / 100])
            if s.is_fired():
                pot = s.get_signal()
                fired_to_neurons[pot[-1] // N_NEURON].append(pot)
                if count >= log_begin:
                    tmp_fired.append(idx)
        if count >= log_begin:
            weight_log.append(tmp_weight)
            fired_log.append(tmp_fired)
        # Potential_Q.put(fired_to_neurons)

    with open(os.path.join(LOG_path, LOG_multi_synapse_name.format(num)), 'w') as logfile:
        rapidjson.dump(
            {
                str(MULTI_weight_log): weight_log,
                str(MULTI_fired_synapse_log): fired_log,
            },
            logfile, number_mode=rapidjson.NM_NATIVE)
def neuron_init(
    n_list: list,
    pre_Q_list: list,
    post_Q_list: list,
    Potential_Q_list: list,
    barrier: Barrier,
    N_SYNAPSE: int,
    num: int,
    ticks: int,
    log_begin=-1,
):
    potent_log = []  # [[id, potential], ...]
    fired_log = []   # [id, ...]
    start_index = n_list[0].get_id()
    N_S_THREAD = len(pre_Q_list)
    pre_fired = [[] for _ in range(N_S_THREAD)]
    post_fired = [[] for _ in range(N_S_THREAD)]
    if num < N_S_THREAD:
        put_order = np.arange(num, num + N_S_THREAD) % N_S_THREAD
        get_order = np.arange(num + N_S_THREAD, num, -1) % N_S_THREAD
        # Don't forget to get external potentials
        get_order = np.append(get_order, -1)
    else:
        put_order = np.arange(N_S_THREAD)
        get_order = np.arange(N_S_THREAD - 1, -1, -1)
        # Don't forget to get external potentials
        get_order = np.append(get_order, -1)

    for count in range(ticks):
        # Put to synapse thread Queues
        for idx in put_order:
            pre_Q_list[idx].put(pre_fired[idx])
            post_Q_list[idx].put(post_fired[idx])
        # Wait until synapse threads get all items
        barrier.wait()
        # Get all from neuron's Queues and start calculation
        fired_to_neurons = []
        for idx in get_order:
            fired_to_neurons.extend(Potential_Q_list[idx].get())
        pre_fired = [[] for _ in range(N_S_THREAD)]
        post_fired = [[] for _ in range(N_S_THREAD)]
        # if fired_to_neurons == MULTI_sentinel:
        #     Log_q.put({
        #         MULTI_potent_log: potent_log,
        #         MULTI_fired_neuron_log: fired_log,
        #     })
        #     break
        for n in n_list:
            n.tick()  # 4
        for n in fired_to_neurons:
            n_list[n[-1] - start_index].input_potential(n[0])  # 1
        tmp_potent = []
        tmp_fired = []
        for idx, n in enumerate(n_list, start=start_index):
            if count >= log_begin:
                p = n.get_potential()
                if not p:
                    tmp_potent.append([idx, 0])
                else:
                    tmp_potent.append([idx, int(p * 100 + 0.5) / 100])
            if n.is_fired():
                i, e = n.get_signal()
                for pre in e:
                    pre_fired[pre[-1] // N_SYNAPSE].append(pre)
                for post in i:
                    post_fired[post[-1] // N_SYNAPSE].append(post)
                if count >= log_begin:
                    tmp_fired.append(idx)
        if count >= log_begin:
            potent_log.append(tmp_potent)
            fired_log.append(tmp_fired)
        # pre_Q.put(pre_fired)
        # post_Q.put(post_fired)

    with open(os.path.join(LOG_path, LOG_multi_neuron_name.format(num)), 'w') as logfile:
        rapidjson.dump(
            {
                str(MULTI_potent_log): potent_log,
                str(MULTI_fired_neuron_log): fired_log
            },
            logfile, number_mode=rapidjson.NM_NATIVE)
def train(
    self,
    algo: RLAlgo,
    done_event: Event,
    queue_barrier: Barrier,
    training_steps: int,
    sample_queue: Queue,
    priority_queue: Queue,
    param_pipes: Tuple[Pipe, ...] = tuple(),
    param_send_interval: int = 0,
    save_path: str = None,
    save_interval: int = 10000
) -> None:
    """
    Trains the algorithm until all agent processes have ended.

    Args:
        algo: The algorithm to train.
        done_event: The event to set to allow the other processes to exit.
        queue_barrier: A barrier to use when all queue tasks are complete
            on all processes.
        training_steps: The number of steps to train for.
        sample_queue: The queue to receive buffer samples from.
        priority_queue: The queue to send updated values to.
        param_pipes: A tuple of pipes to send the model state to
            periodically.
        param_send_interval: The number of training steps in between
            parameter sends, 0 for never.
        save_path: The directory to save the model to.
        save_interval: The number of training steps in-between model saves.
    """
    if param_send_interval > 0:
        for pipe in param_pipes:
            pipe.send(algo.save_dict())

    training_step = 0
    train_start = 0

    while training_step < training_steps and not done_event.is_set():
        if algo.logger is not None:
            sample_start = time()

        sample = sample_queue.get()
        rollouts, ids, is_weights = sample

        if algo.logger is not None:
            if train_start == 0:
                train_start = time()
            train_step_start = time()
            if train_step_start - sample_start > 0:
                algo.logger["Train/Samples per Second"] = (
                    1 / (train_step_start - sample_start),
                    algo.training_steps
                )

        new_qs, new_q_targs = algo.train_batch(rollouts, is_weights)

        errors = self.experience_replay.get_error(new_qs, new_q_targs)
        priorities = self.experience_replay.get_priority(errors)
        priority_queue.put((ids, priorities))

        if algo.logger is not None:
            train_end = time()
            algo.logger["Train/Training Steps per Second"] = (
                1 / (train_end - train_step_start),
                algo.training_steps
            )
            algo.logger["Train/Training Steps + Samples per Second"] = (
                training_step / (train_end - train_start),
                algo.training_steps
            )

        training_step += 1

        if (save_path is not None
                and algo.training_steps % save_interval == 0):
            algo.save(save_path)

        if (param_send_interval > 0
                and training_step % param_send_interval == 0):
            for pipe in param_pipes:
                pipe.send(algo.save_dict())

    # Signal exit
    done_event.set()

    # Wait for all processes to finish using queues
    queue_barrier.wait()

    # Clear queues
    try:
        while not sample_queue.empty():
            sample_queue.get_nowait()
    except queue.Empty:
        pass
def run_parallel_impl(args: argparse.Namespace, processes: "typing.List[Process]",
                      mp_q: Queue, mp_barrier: Barrier, ssh_port_queue: Queue):
    timed_out = False
    starttime = datetime.datetime.now()
    # check that we don't have multiple parallel jobs trying to use the same port
    ssh_ports = []
    assert not mp_barrier.broken, mp_barrier
    # FIXME: without this sleep it fails in jenkins (is the python version there broken?)
    # Works just fine everywhere else where I test it...
    boot_cheribsd.info("Waiting 5 seconds before releasing barrier")
    time.sleep(5)
    mp_debug(args, "Waiting for SSH port barrier")
    mp_barrier.wait(timeout=10)  # wait for ssh ports to be assigned
    for i in range(len(processes)):
        try:
            ssh_port, index = ssh_port_queue.get(timeout=1)
            assert index <= len(processes)
            print("SSH port for", processes[index - 1].name, "is", ssh_port)
            processes[index - 1].ssh_port = ssh_port
            if ssh_port in ssh_ports:
                timed_out = True  # kill all child processes
                boot_cheribsd.failure(
                    "ERROR: reusing the same SSH port in multiple jobs: ",
                    ssh_port, exit=False)
            ssh_ports.append(ssh_port)
        except Empty:
            # This seems to be happening in jenkins? Barrier should ensure that we can read without blocking!
            timed_out = True  # kill all child processes
            boot_cheribsd.failure(
                "ERROR: Could not determine SSH port for one of the processes!",
                exit=False)

    # wait for the success/failure message from the process:
    # if the shard takes longer than 4 hours to run something went wrong
    start_time = datetime.datetime.utcnow()
    max_test_duration = datetime.timedelta(seconds=4 * 60 * 60)
    test_end_time = start_time + max_test_duration
    # If any shard has not yet booted CheriBSD after 10 minutes something went horribly wrong
    max_boot_time = datetime.timedelta(
        seconds=10 * 60) if not args.pretend else datetime.timedelta(seconds=5)
    boot_cheribsd.info("Waiting for all shards to boot...")
    boot_end_time = start_time + max_boot_time
    booted_shards = 0
    remaining_processes = processes.copy()
    not_booted_processes = processes.copy()
    retrying_queue_read = False
    while len(remaining_processes) > 0:
        if timed_out:
            for p in remaining_processes:
                p.stage = run_remote_lit_test.MultiprocessStages.TIMED_OUT
            break
        loop_start_time = datetime.datetime.utcnow()
        num_shards_not_booted = len(not_booted_processes)
        if num_shards_not_booted > 0:
            mp_debug(args, "Still waiting for ", num_shards_not_booted,
                     " shards to boot")
            if loop_start_time > boot_end_time:
                timed_out = True
                boot_cheribsd.failure("ERROR: ", num_shards_not_booted,
                                      " shards did not boot within ", max_boot_time,
                                      ". Shards remaining: ", remaining_processes,
                                      exit=False)
                dump_processes(processes)
                continue

        mp_debug(args, "Still waiting for ", remaining_processes, " to finish")
        if boot_end_time > test_end_time:
            timed_out = True
            boot_cheribsd.failure("Reached test timeout of", max_test_duration,
                                  " with ", len(remaining_processes),
                                  "shards remaining: ", remaining_processes,
                                  exit=False)
            dump_processes(processes)
            continue
        remaining_test_time = test_end_time - loop_start_time
        max_timeout = 120.0 if not args.pretend else 1.0
        try:
            shard_result = mp_q.get(timeout=min(
                max(1.0, remaining_test_time.total_seconds()), max_timeout))
            retrying_queue_read = False
            mp_debug(args, "Got message:", shard_result)
            target_process = processes[shard_result[1] - 1]
            if shard_result[0] == run_remote_lit_test.COMPLETED:
                boot_cheribsd.success("===> Shard ", shard_result[1],
                                      " completed successfully.")
                mp_debug(args, "Shard ", target_process, "exited!")
                if target_process in remaining_processes:
                    remaining_processes.remove(target_process)
                target_process.stage = run_remote_lit_test.MultiprocessStages.EXITED
            elif shard_result[0] == run_remote_lit_test.NEXT_STAGE:
                mp_debug(args, "===> Shard ", shard_result[1],
                         " reached next stage: ", shard_result[2])
                if target_process.stage == run_remote_lit_test.MultiprocessStages.BOOTING_CHERIBSD:
                    not_booted_processes.remove(target_process)
                    boot_cheribsd.success("Shard ", shard_result[1],
                                          " has booted successfully after ",
                                          loop_start_time - start_time)
                if len(not_booted_processes) == 0:
                    boot_cheribsd.success(
                        "All shards have booted successfully. Releasing barrier (num_waiting = ",
                        mp_barrier.n_waiting, ")")
                    assert mp_barrier.n_waiting == len(processes), "{} != {}".format(
                        mp_barrier.n_waiting, len(processes))
                    mp_barrier.wait(timeout=10)
                    boot_cheribsd.success(
                        "Barrier has been released, tests should run now.")
                # assert target_process.stage < shard_result[2], "STAGE WENT BACKWARDS?"
                target_process.stage = shard_result[2]
            elif shard_result[0] == run_remote_lit_test.FAILURE:
                previous_stage = target_process.stage
                target_process.stage = run_remote_lit_test.MultiprocessStages.FAILED
                target_process.error_message = shard_result[2]
                if target_process in remaining_processes:
                    remaining_processes.remove(target_process)
                if previous_stage != run_remote_lit_test.MultiprocessStages.RUNNING_TESTS:
                    boot_cheribsd.failure("===> FATAL: Shard ", target_process,
                                          " failed before running tests stage: ",
                                          previous_stage,
                                          " -> Aborting all other shards",
                                          exit=False)
                    timed_out = True
                    break
                else:
                    boot_cheribsd.failure("===> ERROR: Shard ", shard_result[1],
                                          " failed while running tests: ",
                                          shard_result[2], exit=True)
            else:
                boot_cheribsd.failure(
                    "===> FATAL: Received invalid shard result message: ",
                    shard_result, exit=True)
        except Empty:
            mp_debug(args, "Got Empty read from QUEUE. Checking ",
                     remaining_processes)
            for p in list(remaining_processes):
                if not p.is_alive():
                    mp_debug(args, "Found dead process", p)
                    if retrying_queue_read:
                        mp_debug(args, "Already retried read after finding dead process", p)
                        boot_cheribsd.failure("===> ERROR: shard ", p,
                                              " died without sending a message!",
                                              exit=False)
                        remaining_processes.remove(p)
                    else:
                        # Try to read from the queue one more time to see if we missed a message
                        retrying_queue_read = True
                        mp_debug(args, "Retrying read after finding dead process", p)
                        break
            continue
        except KeyboardInterrupt:
            dump_processes(processes)
            boot_cheribsd.failure("GOT KEYBOARD INTERRUPT! EXITING!", exit=False)
            return

    if not timed_out:
        if not_booted_processes:
            boot_cheribsd.failure(
                "FATAL: all processes exited but some still not booted? ",
                not_booted_processes)
        boot_cheribsd.success("All shards have terminated")
    # If we got an error we should not end up here -> all processes should be in stage exited
    dump_processes(processes)

    # All shards should have completed -> give them 60 seconds to shut down cleanly
    wait_or_terminate_all_shards(processes, max_time=60, timed_out=timed_out)
    if timed_out:
        time.sleep(0.2)
        boot_cheribsd.failure("Error running the test jobs!", exit=True)
    else:
        boot_cheribsd.success("All parallel jobs completed!")
    boot_cheribsd.success("Total execution time for parallel libcxx tests: ",
                          datetime.datetime.now() - starttime)
format="%(asctime)s - %(PID)s - %(message)s") logger = logging.getLogger(__name__) def increment(value, value2, barrier): with value.get_lock(): value.value += 1 + value2.value print(f"value increased: {value.value}") barrier.wait() if __name__ == '__main__': num = 10 val = Value("i", 0, lock=True) val2 = Value('i', 10) barrier = Barrier(num + 1) processes = [ Process(target=increment, args=(val, val2, barrier)) for _ in range(num) ] for p in processes: p.start() barrier.wait() print(val.value) for p in processes: p.join()
def work_out_row(row, matrix_a, matrix_b, result, work_start, work_complete):
    # The loop structure is reconstructed around the surviving inner statement:
    # each worker owns one row and synchronizes on the two barriers per pass.
    while True:
        work_start.wait()
        for col in range(matrix_size):
            for i in range(matrix_size):
                result[row * matrix_size + col] += \
                    matrix_a[row * matrix_size + i] * matrix_b[i * matrix_size + col]
        work_complete.wait()


if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    work_start = Barrier(process_count + 1)
    work_complete = Barrier(process_count + 1)
    matrix_a = multiprocessing.Array('i', [0] * (matrix_size * matrix_size), lock=False)
    matrix_b = multiprocessing.Array('i', [0] * (matrix_size * matrix_size), lock=False)
    result = multiprocessing.Array('i', [0] * (matrix_size * matrix_size), lock=False)
    for p in range(process_count):
        Process(target=work_out_row,
                args=(p, matrix_a, matrix_b, result, work_start,
                      work_complete)).start()
    start = time.time()
    for t in range(10):
        generate_random_matrix(matrix_a)
        generate_random_matrix(matrix_b)
        for i in range(matrix_size * matrix_size):
            result[i] = 0
        work_start.wait()
        work_complete.wait()
    end = time.time()
    print("Done, time taken", end - start)
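# generate_random_matrix is referenced above but not shown; a plausible sketch,
# assuming small ints so the 'i'-typed shared arrays cannot overflow.
import random

def generate_random_matrix(matrix):
    # Fill the flattened matrix_size x matrix_size shared array in place.
    for i in range(matrix_size * matrix_size):
        matrix[i] = random.randint(0, 9)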
def main(argv):
    """Command line interface.

    Args:
        argv: List of command-line arguments passed to the program.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model-path", required=True,
        help="Local file path to the Tensorflow model; example pre-trained models "
        "can be found at "
        "https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md")
    parser.add_argument(
        "--classes", default='/classes.json', type=str,
        help="File containing json mapping of object class IDs to class names")
    parser.add_argument(
        "--number-tensorflow-processes", default=1, type=int,
        help="Number of Tensorflow processes to run in parallel")
    parser.add_argument(
        "--detection-threshold", default=0.7, type=float,
        help="Detection threshold to use for Tensorflow detections")
    parser.add_argument(
        "--sleep-between-capture", default=1.0, type=float,
        help="Seconds to sleep between each image capture loop iteration, "
        "which captures an image from all cameras")
    parser.add_argument(
        "--detection-class", default=1, type=int,
        help="Detection class to use in the Tensorflow model; default is 1, "
        "which is a person in the Coco dataset")
    parser.add_argument(
        "--max-processing-delay", default=7.0, type=float,
        help="Maximum allowed delay for processing an image; any image older "
        "than this value will be skipped")
    bosdyn.client.util.add_common_arguments(parser)
    options = parser.parse_args(argv)

    try:
        # Make sure the model path is a valid file
        if not _check_model_path(options.model_path):
            return False

        # Check for classes json file, otherwise use the COCO class dictionary
        _check_and_load_json_classes(options.classes)

        global TENSORFLOW_PROCESS_BARRIER  # pylint: disable=global-statement
        TENSORFLOW_PROCESS_BARRIER = Barrier(
            options.number_tensorflow_processes + 1)

        # Start Tensorflow processes
        start_tensorflow_processes(options.number_tensorflow_processes,
                                   options.model_path, options.detection_class,
                                   options.detection_threshold,
                                   options.max_processing_delay)

        # Wait to give the Tensorflow processes time to initialize
        try:
            TENSORFLOW_PROCESS_BARRIER.wait()
        except BrokenBarrierError as exc:
            print(f'Error waiting for Tensorflow processes to initialize: {exc}')
            return False

        # Start the API related things
        # Create robot object with a world object client
        sdk = bosdyn.client.create_standard_sdk('SpotFollowClient')
        robot = sdk.create_robot(options.hostname)
        robot.authenticate(options.username, options.password)
        # Time sync is necessary so that time-based filter requests can be converted
        robot.time_sync.wait_for_sync()

        # Verify the robot is not estopped and that an external application has
        # registered and holds an estop endpoint.
        verify_estop(robot)

        # Create the sdk clients
        robot_state_client = robot.ensure_client(
            RobotStateClient.default_service_name)
        robot_command_client = robot.ensure_client(
            RobotCommandClient.default_service_name)
        lease_client = robot.ensure_client(LeaseClient.default_service_name)
        image_client = robot.ensure_client(ImageClient.default_service_name)
        source_list = get_source_list(image_client)
        image_task = AsyncImage(image_client, source_list)
        robot_state_task = AsyncRobotState(robot_state_client)
        task_list = [image_task, robot_state_task]
        _async_tasks = AsyncTasks(task_list)
        print('Detect and follow client connected.')

        lease = lease_client.take()
        lease_keep = LeaseKeepAlive(lease_client)
        # Power on the robot and stand it up
        resp = robot.power_on()
        try:
            blocking_stand(robot_command_client)
        except CommandFailedError as exc:
            print(f'Error ({exc}) occurred while trying to stand. Check robot surroundings.')
            return False
        except CommandTimedOutError as exc:
            print(f'Stand command timed out: {exc}')
            return False
        print('Robot powered on and standing.')
        params_set = get_mobility_params()

        # This thread starts the async tasks for image and robot state retrieval
        update_thread = Thread(target=_update_thread, args=[_async_tasks])
        update_thread.daemon = True
        update_thread.start()
        # Wait for the first responses.
        while any(task.proto is None for task in task_list):
            time.sleep(0.1)

        # Start image capture process
        image_capture_thread = Thread(
            target=capture_images,
            args=(image_task, options.sleep_between_capture,))
        image_capture_thread.start()

        while True:
            # This comes from the tensorflow processes and limits the rate of this loop
            entry = PROCESSED_BOXES_QUEUE.get()
            # Find the highest confidence bounding box
            highest_conf_source = _find_highest_conf_source(entry)
            if highest_conf_source is None:
                # no boxes or scores found
                continue
            capture_to_use = entry[highest_conf_source]
            raw_time = capture_to_use['raw_image_time']
            time_gap = time.time() - raw_time
            if time_gap > options.max_processing_delay:
                continue  # Skip image due to delay

            # Find the transform to the highest confidence object using the depth sensor
            world_tform_object = get_object_position(
                capture_to_use['world_tform_cam'],
                capture_to_use['visual_image'],
                capture_to_use['depth_image'],
                capture_to_use['boxes'][0],
                ROTATION_ANGLES[capture_to_use['source']])

            # get_object_position can fail if there is insufficient depth sensor information
            if not world_tform_object:
                continue

            scores = capture_to_use['scores']
            print(f'Transform for object with confidence {scores[0]}: {world_tform_object}')
            print(f'Process latency: {time.time() - capture_to_use["system_cap_time"]}')
            tag_cmd = get_go_to(world_tform_object, robot_state_task.proto, params_set)
            end_time = 15.0
            if tag_cmd is not None:
                robot_command_client.robot_command(lease=None, command=tag_cmd,
                                                   end_time_secs=time.time() + end_time)
        return True
    except Exception as exc:  # pylint: disable=broad-except
        LOGGER.error("Spot Tensorflow Detector threw an exception: %s", exc)
        return False