def _read_chunks_from_disk(self, fds, offsets, sizes):
    sum_read_nbytes = 0  # for prometheus counter
    st = time.time()
    for i in range(self.n_smd_files):
        offset = offsets[i]
        size = sizes[i]
        chunk = bytearray()
        # os.pread may return fewer bytes than requested; retry until the
        # full extent is read or max_retries is exhausted.
        for j in range(self.max_retries + 1):
            chunk.extend(os.pread(fds[i], size, offset))
            got = memoryview(chunk).nbytes
            if got == sizes[i]:
                break
            # Advance past what has been read so far and request the rest.
            offset = offsets[i] + got
            size = sizes[i] - got
        self.bigdata[i].extend(chunk)
        sum_read_nbytes += sizes[i]
    en = time.time()
    rate = 0
    if sum_read_nbytes > 0:
        rate = (sum_read_nbytes / 1e6) / (en - st)
    logging.info(
        f"event_manager: bd reads chunk {sum_read_nbytes/1e6:.5f} MB took {en-st:.2f} s (Rate: {rate:.2f} MB/s)"
    )
    self._inc_prometheus_counter('MB', sum_read_nbytes / 1e6)
    return
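# Illustrative sketch (not part of the original source): os.pread may return
# fewer bytes than requested, which is why the loop above retries. A minimal
# standalone version of the same pattern, assuming a hypothetical helper name
# pread_exact and a default retry count:
import os

def pread_exact(fd, size, offset, max_retries=5):
    """Read exactly `size` bytes at `offset`, retrying on short reads."""
    buf = bytearray()
    for _ in range(max_retries + 1):
        buf.extend(os.pread(fd, size - len(buf), offset + len(buf)))
        if len(buf) == size:
            break
    return buf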
def _request_rank(self, rankreq):
    st_req = time.time()
    self.comms.bd_comm.Recv(rankreq, source=MPI.ANY_SOURCE)
    en_req = time.time()
    self.c_sent.labels('seconds', rankreq[0]).inc(en_req - st_req)
    logging.info("node: eb%d got bd %d (request took %.5f seconds)" %
                 (self.comms.smd_rank, rankreq[0], (en_req - st_req)))
def _end_prometheus_client(self, mpi_rank=0):
    if not self.monitor:
        return
    logging.info('ds_base: END PROMETHEUS CLIENT (JOBID:%s RANK: %d)' %
                 (self.prom_man.jobid, mpi_rank))
    self.e.set()
def _close_opened_smd_files(self):
    # Make sure to close all smd files opened by the previous run
    if self.smd_fds is not None:
        txt = "ds_base: "
        for fd in self.smd_fds:
            os.close(fd)
            txt += f' close smd fd:{fd}'
        logging.info(txt)
def _request_data(self, smd_comm):
    # Announce this rank to smd0, then probe the reply so the receive buffer
    # can be allocated with exactly the right size.
    smd_comm.Send(np.array([self.comms.smd_rank], dtype='i'), dest=0)
    info = MPI.Status()
    smd_comm.Probe(source=0, status=info)
    count = info.Get_elements(MPI.BYTE)
    smd_chunk = bytearray(count)
    smd_comm.Recv(smd_chunk, source=0)
    logging.info(
        f"node: eb{self.comms.smd_rank} received {count/1e6:.5f} MB from smd0"
    )
    return smd_chunk
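# Illustrative sketch (not part of the original source): _request_data sizes
# its receive buffer by probing the pending message first. The same
# Probe/Get_elements/Recv pattern in isolation, with hypothetical names
# (recv_bytes_from, comm, source):
from mpi4py import MPI

def recv_bytes_from(comm, source):
    """Receive one raw-byte message of unknown size from `source`."""
    status = MPI.Status()
    comm.Probe(source=source, status=status)   # block until a message is pending
    nbytes = status.Get_elements(MPI.BYTE)     # size of that message in bytes
    buf = bytearray(nbytes)                    # allocate exactly enough space
    comm.Recv(buf, source=source)
    return buf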
def push_metrics(self, e, from_whom=''):
    while not e.isSet():
        push_to_gateway(PUSH_GATEWAY,
                        job='psana_pushgateway',
                        grouping_key={
                            'jobid': self.jobid,
                            'rank': from_whom
                        },
                        registry=registry)
        logging.info('TS: %s PUSHED JOBID: %s RANK: %s e.isSet():%s' %
                     (time.time(), self.jobid, from_whom, e.isSet()))
        time.sleep(PUSH_INTERVAL_SECS)
def _read_event_from_disk(self, offsets, sizes):
    sum_read_nbytes = np.sum(sizes)
    st = time.time()
    data = self.dm.jump(offsets, sizes)
    en = time.time()
    rate = 0
    if sum_read_nbytes > 0:
        rate = (sum_read_nbytes / 1e6) / (en - st)
    logging.info(
        f"event_manager: bd reads single {sum_read_nbytes/1e6:.5f} MB took {en-st:.2f} s (Rate: {rate:.2f} MB/s)"
    )
    return data
def _start_prometheus_client(self, mpi_rank=0):
    if not self.monitor:
        logging.info('ds_base: RUN W/O PROMETHEUS CLIENT')
    else:
        logging.info('ds_base: START PROMETHEUS CLIENT (JOBID:%s RANK: %d)' %
                     (self.prom_man.jobid, mpi_rank))
        self.e = threading.Event()
        self.t = threading.Thread(name='PrometheusThread%s' % (mpi_rank),
                                  target=self.prom_man.push_metrics,
                                  args=(self.e, mpi_rank),
                                  daemon=True)
        self.t.start()
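# Illustrative sketch (not part of the original source): _start_prometheus_client
# and _end_prometheus_client implement a standard stop-flag pattern -- a daemon
# thread loops until a threading.Event is set (see push_metrics above). A
# self-contained version with hypothetical names (periodic_worker, run_for):
import threading
import time

def periodic_worker(stop_event, interval=1.0):
    while not stop_event.is_set():
        # periodic work (e.g. pushing metrics) would go here
        time.sleep(interval)

def run_for(seconds=3):
    stop_event = threading.Event()
    t = threading.Thread(target=periodic_worker, args=(stop_event,), daemon=True)
    t.start()
    time.sleep(seconds)
    stop_event.set()   # signal the worker loop to exit
    t.join()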
def _setup_configs(self):
    """Creates and broadcasts configs; only called by _setup_run()."""
    g_ts = self.prom_man.get_metric("psana_timestamp")
    if nodetype == 'smd0':
        super()._close_opened_smd_files()
        self.smd_fds = np.array([
            os.open(smd_file, os.O_RDONLY) for smd_file in self.smd_files
        ], dtype=np.int32)
        logging.info(f'mpi_ds: smd0 opened smd_fds: {self.smd_fds}')
        self.smdr_man = SmdReaderManager(self.smd_fds, self.dsparms)
        self._configs = self.smdr_man.get_next_dgrams()
        super()._apply_detector_selection()
        super()._setup_det_class_table()
        super()._set_configinfo()
        g_ts.labels("first_event").set(time.time())
        nbytes = np.array([memoryview(config).shape[0] for config in self._configs],
                          dtype='i')
    else:
        self._configs = None
        nbytes = np.empty(len(self.smd_files), dtype='i')

    # The no. of bytes must be broadcast first (required for mpich) so that
    # receiving ranks can preallocate buffers of the right size.
    self.comms.psana_comm.Bcast(nbytes, root=0)

    if nodetype != 'smd0':
        self._configs = [np.empty(nbyte, dtype='b') for nbyte in nbytes]

    for i in range(len(self._configs)):
        self.comms.psana_comm.Bcast([self._configs[i], nbytes[i], MPI.BYTE],
                                    root=0)

    if nodetype != 'smd0':
        self._configs = [
            dgram.Dgram(view=config, offset=0) for config in self._configs
        ]
        g_ts.labels("first_event").set(time.time())
        self._setup_det_class_table()
        self._set_configinfo()
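# Illustrative sketch (not part of the original source): the config exchange in
# _setup_configs is a two-phase broadcast -- sizes first, then each payload into
# a preallocated buffer. A minimal standalone version, assuming every rank
# already knows how many buffers to expect (as _setup_configs knows
# len(self.smd_files)); bcast_buffers is a hypothetical name:
from mpi4py import MPI
import numpy as np

def bcast_buffers(comm, n_buffers, buffers=None, root=0):
    """On root, `buffers` is a list of writable bytes-like objects; elsewhere None."""
    rank = comm.Get_rank()
    if rank == root:
        nbytes = np.array([len(b) for b in buffers], dtype='i')
    else:
        nbytes = np.empty(n_buffers, dtype='i')
    comm.Bcast(nbytes, root=root)                        # phase 1: sizes
    if rank != root:
        buffers = [np.empty(n, dtype='b') for n in nbytes]
    for buf, n in zip(buffers, nbytes):
        comm.Bcast([buf, int(n), MPI.BYTE], root=root)   # phase 2: payloads
    return buffers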
def start(self):
    rankreq = np.empty(1, dtype='i')
    smd_comm = self.comms.smd_comm
    n_bd_nodes = self.comms.bd_comm.Get_size() - 1
    bd_comm = self.comms.bd_comm
    smd_rank = self.comms.smd_rank
    waiting_bds = []
    while True:
        smd_chunk = self._request_data(smd_comm)
        if not smd_chunk:
            break

        eb_man = EventBuilderManager(smd_chunk, self.configs, self.dsparms)

        # Build batches of events
        for smd_batch_dict, step_batch_dict in eb_man.batches():

            # If there is a single item with dest_rank=0, send it to any bigdata node.
            if 0 in smd_batch_dict.keys():
                smd_batch, _ = smd_batch_dict[0]
                step_batch, _ = step_batch_dict[0]
                if waiting_bds:
                    rankreq[0] = waiting_bds.pop()
                else:
                    self._request_rank(rankreq)

                missing_step_views = self.step_hist.get_buffer(rankreq[0])
                batch = repack_for_bd(smd_batch,
                                      missing_step_views,
                                      self.configs,
                                      client=rankreq[0])
                bd_comm.Send(batch, dest=rankreq[0])

                # sending data to prometheus
                logging.info(
                    f'node: eb{self.comms.smd_rank} sent {eb_man.eb.nevents} events ({memoryview(smd_batch).nbytes} bytes) to bd{rankreq[0]}'
                )
                self.c_sent.labels('evts', rankreq[0]).inc(eb_man.eb.nevents)
                self.c_sent.labels('batches', rankreq[0]).inc()
                self.c_sent.labels('MB', rankreq[0]).inc(
                    memoryview(batch).nbytes / 1e6)

                if eb_man.eb.nsteps > 0 and memoryview(step_batch).nbytes > 0:
                    step_pf = PacketFooter(view=step_batch)
                    self.step_hist.extend_buffers(step_pf.split_packets(),
                                                  rankreq[0],
                                                  as_event=True)

            # With > 1 dest_rank, keep looping until all dest_rank batches
            # have been sent.
            else:
                # Check that the destinations are valid
                destinations = np.asarray(list(smd_batch_dict.keys()))
                if any(destinations > n_bd_nodes):
                    print(
                        f"Found invalid destination ({destinations}). Must be <= {n_bd_nodes} (#big data nodes)"
                    )
                    break

                while smd_batch_dict:
                    # Check first if there are bd nodes waiting
                    if waiting_bds:
                        copied_waiting_bds = waiting_bds[:]
                        for dest_rank in copied_waiting_bds:
                            if dest_rank in smd_batch_dict:
                                self._send_to_dest(dest_rank, smd_batch_dict,
                                                   step_batch_dict, eb_man)
                                waiting_bds.remove(dest_rank)

                    if smd_batch_dict:
                        self._request_rank(rankreq)
                        dest_rank = rankreq[0]
                        if dest_rank in smd_batch_dict:
                            self._send_to_dest(dest_rank, smd_batch_dict,
                                               step_batch_dict, eb_man)
                        else:
                            waiting_bds.append(dest_rank)
            # end else -> if 0 in smd_batch_dict.keys()
        # end for smd_batch_dict in ...

        # Check if there are missing steps to be sent for this batch
        copied_waiting_bds = waiting_bds[:]
        for dest_rank in copied_waiting_bds:
            missing_step_views = self.step_hist.get_buffer(dest_rank)
            batch = repack_for_bd(bytearray(),
                                  missing_step_views,
                                  self.configs,
                                  client=dest_rank)
            if batch:
                bd_comm.Send(batch, dest=dest_rank)
                waiting_bds.remove(dest_rank)

        for i in range(n_bd_nodes - len(waiting_bds)):
            self._request_rank(rankreq)
            missing_step_views = self.step_hist.get_buffer(rankreq[0])
            batch = repack_for_bd(bytearray(),
                                  missing_step_views,
                                  self.configs,
                                  client=rankreq[0])
            if batch:
                bd_comm.Send(batch, dest=rankreq[0])
            else:
                waiting_bds.append(rankreq[0])
    # end while True

    # Done - kill idling nodes
    for dest_rank in waiting_bds:
        bd_comm.Send(bytearray(), dest=dest_rank)
        logging.info(
            f"node: eb{self.comms.smd_rank} sent null byte to bd{dest_rank}"
        )

    # - kill all other nodes
    for i in range(n_bd_nodes - len(waiting_bds)):
        self._request_rank(rankreq)
        bd_comm.Send(bytearray(), dest=rankreq[0])
        logging.info(
            f"node: eb{self.comms.smd_rank} sent null byte to bd{rankreq[0]}"
        )
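# Illustrative sketch (not part of the original source): the loop above uses a
# pull-based protocol -- a big-data node announces its rank, then receives either
# a batch or an empty message telling it to stop. The bd-side counterpart could
# look roughly like this (bd_receive_loop is hypothetical; the real client lives
# elsewhere in psana):
from mpi4py import MPI
import numpy as np

def bd_receive_loop(bd_comm, eb_rank=0):
    my_rank = np.array([bd_comm.Get_rank()], dtype='i')
    while True:
        bd_comm.Send(my_rank, dest=eb_rank)        # ask the eb node for work
        status = MPI.Status()
        bd_comm.Probe(source=eb_rank, status=status)
        batch = bytearray(status.Get_elements(MPI.BYTE))
        bd_comm.Recv(batch, source=eb_rank)
        if not batch:                              # empty message == no more work
            break
        yield batch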
def start(self):
    rankreq = np.empty(1, dtype='i')
    waiting_ebs = []
    for (smd_chunk, step_chunk) in self.smdr_man.chunks():
        # Creates a chunk from smd and epics data to send to SmdNode.
        # Anatomy of a chunk (pf=packet_footer):
        # [ [smd0][smd1][smd2][pf] ][ [epics0][epics1][epics2][pf] ][ pf ]
        #   ------ smd_chunk ------   -------- epics_chunk --------
        # ------------------------------ chunk --------------------------
        #
        # Read new epics data as available in the queue,
        # then send only the unseen portion of data to the evtbuilder rank.
        if not (smd_chunk or step_chunk):
            break

        st_req = time.time()
        self.comms.smd_comm.Recv(rankreq, source=MPI.ANY_SOURCE)
        en_req = time.time()

        # Check missing steps for the current client
        missing_step_views = self.step_hist.get_buffer(rankreq[0], smd0=True)

        # Update step buffers (after getting the missing steps)
        step_pf = PacketFooter(view=step_chunk)
        step_views = step_pf.split_packets()
        self.step_hist.extend_buffers(step_views, rankreq[0])

        smd_extended = repack_for_eb(smd_chunk, missing_step_views,
                                     self.configs)
        self.comms.smd_comm.Send(smd_extended, dest=rankreq[0])

        # sending data to prometheus
        self.c_sent.labels('evts', rankreq[0]).inc(self.smdr_man.got_events)
        self.c_sent.labels('batches', rankreq[0]).inc()
        self.c_sent.labels('MB', rankreq[0]).inc(
            memoryview(smd_extended).nbytes / 1e6)
        self.c_sent.labels('seconds', rankreq[0]).inc(en_req - st_req)
        logging.info(
            f'node: smd0 sent {self.smdr_man.got_events} events to {rankreq[0]} (waiting for this rank took {en_req-st_req:.5f} seconds)'
        )

        found_endrun = self.smdr_man.smdr.found_endrun()
        if found_endrun:
            logging.info("node: smd0 found_endrun")
            break
    # end for (smd_chunk, step_chunk)

    # Check if there are missing steps to be sent
    for i in range(self.comms.n_smd_nodes):
        self.comms.smd_comm.Recv(rankreq, source=MPI.ANY_SOURCE)
        missing_step_views = self.step_hist.get_buffer(rankreq[0], smd0=True)
        smd_extended = repack_for_eb(bytearray(), missing_step_views,
                                     self.configs)
        if smd_extended:
            self.comms.smd_comm.Send(smd_extended, dest=rankreq[0])
        else:
            waiting_ebs.append(rankreq[0])

    # Kill waiting eb nodes
    for dest_rank in waiting_ebs:
        self.comms.smd_comm.Send(bytearray(), dest=dest_rank)

    # Kill all remaining eb nodes as they check in
    for i in range(self.comms.n_smd_nodes - len(waiting_ebs)):
        self.comms.smd_comm.Recv(rankreq, source=MPI.ANY_SOURCE)
        self.comms.smd_comm.Send(bytearray(), dest=rankreq[0])
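# Illustrative sketch (not part of the original source): the "anatomy of a
# chunk" comment above describes a buffer whose sub-views are delimited by a
# packet footer. The real layout is defined by psana's PacketFooter; this
# simplified helper (split_chunk, hypothetical) only illustrates the idea of
# slicing one contiguous chunk into per-stream views once the sizes are known:
def split_chunk(chunk, sizes):
    """Split `chunk` into consecutive zero-copy views of the given sizes."""
    views, offset = [], 0
    mv = memoryview(chunk)
    for size in sizes:
        views.append(mv[offset:offset + size])
        offset += size
    return views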