def after_run(self, run_context, run_value): metrics.emit_store(name="loss", value=run_value.results['loss'], tags={}) metrics.emit_store(name="auc", value=run_value.results['auc'], tags={})
def update_stats(self, item, kind='joined'):
    """
    Update the stats dict and emit the join status and other fields of
    each item to ES.

    Args:
        item: RawDataIter.Item. Item from iterating RawDataVisitor.
        kind: str. One of 'joined', 'unjoined', 'negative'. Indicates
            which bucket the item counts towards.

    Returns:
        None
    """
    assert kind in ('joined', 'unjoined', 'negative')
    if kind == 'unjoined':
        self.sample_unjoined(item.example_id)
    item_stat = {
        'joined': int(kind == 'joined'),
        'original': int(kind != 'negative'),
        'negative': int(kind == 'negative')
    }
    tags = copy.deepcopy(self._tags)
    for field in self._stat_fields:
        value = self._convert_to_str(getattr(item, field, '#None#'))
        item_stat[field] = value
        self._stats[kind]['{}={}'.format(field, value)] += 1
    tags.update(item_stat)
    tags['example_id'] = self._convert_to_str(item.example_id)
    tags['event_time'] = self._convert_to_str(item.event_time)
    tags['event_time_iso'] = convert_to_iso_format(item.event_time)
    metrics.emit_store(name='datajoin', value=0, tags=tags)
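# A minimal sketch of the self._stats structure that update_stats above
# increments. The shape is an assumption inferred from the '{}={}' keys:
# one integer counter table per kind.
import collections

stats = {kind: collections.defaultdict(int)
         for kind in ('joined', 'unjoined', 'negative')}
stats['joined']['label=1'] += 1
assert stats['joined']['label=1'] == 1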
def emit_metric(self, item):
    # Sample roughly `self._sample_ratio` of the items so the metrics
    # backend is not flooded by per-item emits.
    if random.random() < self._sample_ratio:
        tags = copy.deepcopy(self._tags)
        for field in self._stat_fields:
            value = self.convert_to_str(getattr(item, field, '#None#'))
            tags[field] = value
        tags['example_id'] = self.convert_to_str(item.example_id)
        tags['event_time'] = convert_to_iso_format(item.event_time)
        metrics.emit_store(name='input_data', value=0, tags=tags)
def _emit_dumper_metrics(self, file_index, dumped_index):
    dump_duration = time.time() - self._latest_dump_timestamp
    metrics.emit_timer(name='example_id_dump_duration',
                       value=int(dump_duration),
                       tags=self._metrics_tags)
    metrics.emit_store(name='example_dump_file_index',
                       value=file_index,
                       tags=self._metrics_tags)
    metrics.emit_store(name='example_id_dumped_index',
                       value=dumped_index,
                       tags=self._metrics_tags)
def _stats_metric(self, global_step, results):
    with self._stats_client.pipeline() as pipe:
        pipe.gauge("trainer.metric_global_step", global_step)
        for key in self._metric_names:
            value = results[key]
            pipe.gauge("trainer.metric_value", value.sum(),
                       tags={"metric": key})
            # for compatibility, also write to metrics (ES)
            metrics.emit_store(name=key, value=value, tags={})
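# A minimal sketch of the statsd-style interface _stats_metric above relies
# on. DummyStatsClient is hypothetical and only illustrates the assumed
# semantics: pipeline() buffers gauges and flushes them as one batch on exit.
import contextlib

class DummyStatsClient:
    @contextlib.contextmanager
    def pipeline(self):
        buffered = []

        class _Pipe:
            def gauge(self, name, value, tags=None):
                # Buffer instead of sending immediately.
                buffered.append((name, value, tags or {}))

        yield _Pipe()
        # Leaving the `with` block flushes every buffered gauge at once.
        print("flushed %d gauges" % len(buffered))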
def after_run(self, run_context, run_value):
    self._iter += 1
    if self._iter % self._every_n_iter == 0:
        result = run_value.results
        tags = {}
        if 'event_time' in result:
            # `event_time` arrives as bytes; decode it exactly once
            # before parsing.
            event_time = result.pop('event_time').decode()
            tags['event_time'] = fcc.convert_to_datetime(
                event_time, True).isoformat(timespec='microseconds')
        for name, value in result.items():
            metrics.emit_store(name=name, value=value, tags=tags)
def after_run(self, run_context, run_value):
    self._iter += 1
    if self._iter % self._every_n_iter == 0:
        result = run_value.results
        tags = {}
        for tag in self._tag_names:
            if tag in result:
                tags[tag] = result[tag]
        for name in self._tensor_names:
            if name in result:
                metrics.emit_store(name=name, value=result[name],
                                   tags=tags)
def emit_metric(self, item):
    if random.random() < self._sample_ratio:
        tags = copy.deepcopy(self._tags)
        for field in self._stat_fields:
            value = convert_to_str(getattr(item, field, '#None#'))
            tags[field] = value
        tags['example_id'] = convert_to_str(item.example_id)
        tags['event_time'] = convert_to_datetime(item.event_time, True) \
            .isoformat(timespec='microseconds')
        tags['process_time'] = datetime.now(tz=pytz.utc) \
            .isoformat(timespec='microseconds')
        metrics.emit_store(name='input_data', value=0, tags=tags,
                           index_type='raw_data')
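# A quick check of the sampling gate shared by the emit_metric variants
# above: `random.random() < ratio` passes for roughly `ratio` of the calls,
# so emitted metric volume scales linearly with the configured sample ratio.
# The helper below is illustrative, not part of the codebase.
import random

def sampled_fraction(ratio, trials=100000):
    hits = sum(1 for _ in range(trials) if random.random() < ratio)
    return hits / trials

# sampled_fraction(0.01) comes out near 0.01 for a large trial count.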
def make_processor(self, next_index):
    input_finished = False
    with self._lock:
        if next_index is None:
            return
        if self._check_index_rollback(next_index):
            self._batch_queue = []
            self._flying_item_count = 0
        if len(self._batch_queue) > 0:
            # Resume from the end of the last queued batch.
            end_batch = self._batch_queue[-1]
            next_index = end_batch.begin_index + len(end_batch)
        input_finished = self._input_finished
    assert next_index >= 0, "the next index should be >= 0"
    end_batch = None
    batch_finished = False
    iter_round = 0
    processed_index = None
    start_tm = time.time()
    for batch, batch_finished in self._make_inner_generator(next_index):
        if batch is not None:
            if len(batch) > 0:
                # Report per-batch produce latency and the index of the
                # last item produced so far.
                latency_mn = '{}.produce.latency'.format(self.name())
                metrics.emit_timer(name=latency_mn,
                                   value=time.time() - start_tm,
                                   tags=self._get_metrics_tags())
                store_mn = '{}.produce.index'.format(self.name())
                metrics.emit_store(name=store_mn,
                                   value=batch.begin_index + len(batch) - 1,
                                   tags=self._get_metrics_tags())
                self._append_next_item_batch(batch)
                yield batch
                start_tm = time.time()
            self._update_last_index(batch.begin_index + len(batch) - 1)
            iter_round += 1
            processed_index = batch.begin_index + len(batch) - 1
            if iter_round % 16 == 0:
                logging.info("%s process to index %d",
                             self.name(), processed_index)
    if processed_index is not None:
        logging.info("%s process to index %d when round finished",
                     self.name(), processed_index)
    if input_finished and batch_finished:
        self._set_process_finished()
def _update_peer_index(self, impl_ctx, peer_next_index, peer_dumped_index):
    assert isinstance(impl_ctx, TransmitLeader.ImplContext)
    _, dumped_index = impl_ctx.get_peer_index()
    impl_ctx.set_peer_index(peer_next_index, peer_dumped_index)
    if dumped_index < peer_dumped_index:
        # The peer has dumped past our last recorded index; forward the
        # new dumped index to the master.
        req = dj_pb.RawDataRequest(
            data_source_meta=self._data_source.data_source_meta,
            rank_id=self._rank_id,
            partition_id=impl_ctx.partition_id,
            peer_dumped_index=dj_pb.PeerDumpedIndex(
                peer_dumped_index=peer_dumped_index))
        rsp = self._master_client.ForwardPeerDumpedIndex(req)
        if rsp.code != 0:
            raise RuntimeError("{} failed to forward peer dumped index "
                               "{} reason: {}".format(self._repr_str,
                                                      peer_dumped_index,
                                                      rsp.error_message))
        metrics.emit_store('peer_dumped_index', peer_dumped_index,
                           self._get_metrics_tag(impl_ctx))
def after_run(self, run_context, run_value):
    self._iter += 1
    if self._iter % self._every_n_iter == 0:
        for name, value in run_value.results.items():
            metrics.emit_store(name=name, value=value, tags={})
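# A minimal sketch of how the after_run hooks above plug into TensorFlow's
# hook machinery. The class name, the tensor_dict argument, and the wiring
# are assumptions for illustration, not code from this repo: before_run asks
# the session to also evaluate the watched tensors, and after_run then finds
# their values in run_values.results.
import tensorflow.compat.v1 as tf
from fedlearner.common import metrics  # assumed import path for the metrics module used above

class MetricsEmitHook(tf.train.SessionRunHook):
    def __init__(self, tensor_dict, every_n_iter=10):
        self._tensor_dict = tensor_dict  # e.g. {'loss': loss_op, 'auc': auc_op}
        self._every_n_iter = every_n_iter
        self._iter = 0

    def before_run(self, run_context):
        # Request the watched tensors alongside whatever the caller runs.
        return tf.train.SessionRunArgs(self._tensor_dict)

    def after_run(self, run_context, run_values):
        self._iter += 1
        if self._iter % self._every_n_iter == 0:
            for name, value in run_values.results.items():
                metrics.emit_store(name=name, value=value, tags={})

# Usage would look like:
#   with tf.train.MonitoredTrainingSession(
#           hooks=[MetricsEmitHook({'loss': loss_op})]) as sess:
#       ...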
def _emit_logger(self, metrics_tags):
    meta = self._data_block_meta
    nmetric_tags = self._metrics_tags
    if metrics_tags is not None and len(metrics_tags) > 0:
        nmetric_tags = copy.deepcopy(self._metrics_tags)
        nmetric_tags.update(metrics_tags)
    metrics.emit_store(name='data_block_index',
                       value=meta.data_block_index,
                       tags=nmetric_tags)
    metrics.emit_store(name='stats_cum_join_num',
                       value=meta.joiner_stats_info.stats_cum_join_num,
                       tags=nmetric_tags)
    metrics.emit_store(name='actual_cum_join_num',
                       value=meta.joiner_stats_info.actual_cum_join_num,
                       tags=nmetric_tags)
    metrics.emit_store(name='leader_stats_index',
                       value=meta.joiner_stats_info.leader_stats_index,
                       tags=nmetric_tags)
    metrics.emit_store(name='follower_stats_index',
                       value=meta.joiner_stats_info.follower_stats_index,
                       tags=nmetric_tags)
    leader_join_rate = 0.0
    if meta.joiner_stats_info.leader_stats_index > 0:
        leader_join_rate = meta.joiner_stats_info.actual_cum_join_num / \
                meta.joiner_stats_info.leader_stats_index
    follower_join_rate = 0.0
    if meta.joiner_stats_info.follower_stats_index > 0:
        follower_join_rate = meta.joiner_stats_info.actual_cum_join_num / \
                meta.joiner_stats_info.follower_stats_index
    metrics.emit_store(name='leader_join_rate_percent',
                       value=int(leader_join_rate * 100),
                       tags=nmetric_tags)
    metrics.emit_store(name='follower_join_rate_percent',
                       value=int(follower_join_rate * 100),
                       tags=nmetric_tags)
    logging.info("create new data block id: %s, data block index: %d, "
                 "stats:\n stats_cum_join_num: %d, actual_cum_join_num: "
                 "%d, leader_stats_index: %d, follower_stats_index: %d, "
                 "leader_join_rate: %f, follower_join_rate: %f",
                 meta.block_id, meta.data_block_index,
                 meta.joiner_stats_info.stats_cum_join_num,
                 meta.joiner_stats_info.actual_cum_join_num,
                 meta.joiner_stats_info.leader_stats_index,
                 meta.joiner_stats_info.follower_stats_index,
                 leader_join_rate, follower_join_rate)
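# Worked example of the join-rate arithmetic above: with
# actual_cum_join_num = 800 and leader_stats_index = 1000,
# leader_join_rate = 800 / 1000 = 0.8, so the emitted
# leader_join_rate_percent is int(0.8 * 100) = 80.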
def _client_daemon_fn(self):
    stop_event = threading.Event()
    generator = None
    channel = make_insecure_channel(self._remote_address,
                                    ChannelType.REMOTE,
                                    options=self._grpc_options,
                                    compression=self._compression)
    client = make_ready_client(channel, stop_event)
    lock = threading.Lock()
    resend_list = collections.deque()

    @metrics.timer(func_name="shutdown_fn", tags={})
    def shutdown_fn():
        with lock:
            # Block until every queued and unacknowledged message has
            # been flushed to the peer.
            while len(resend_list) > 0 or not self._transmit_queue.empty():
                logging.debug(
                    "Waiting for resend queue's being cleaned. "
                    "Resend queue size: %d", len(resend_list))
                lock.release()
                time.sleep(1)
                lock.acquire()
        stop_event.set()
        if generator is not None:
            generator.cancel()

    self._client_daemon_shutdown_fn = shutdown_fn
    while not stop_event.is_set():
        try:
            def iterator():
                # Replay unacknowledged messages first, then stream new
                # ones from the transmit queue.
                with lock:
                    resend_msgs = list(resend_list)
                for item in resend_msgs:
                    logging.warning("Streaming resend message seq_num=%d",
                                    item.seq_num)
                    metrics.emit_store(name="resend_msg_seq_num",
                                       value=int(item.seq_num), tags={})
                    yield item
                while True:
                    item = self._transmit_queue.get()
                    with lock:
                        resend_list.append(item)
                    logging.debug("Streaming send message seq_num=%d",
                                  item.seq_num)
                    metrics.emit_store(name="send_msg_seq_num",
                                       value=int(item.seq_num), tags={})
                    yield item

            time_start = time.time()
            generator = client.StreamTransmit(iterator())
            time_end = time.time()
            metrics.emit_timer(name="one_StreamTransmit_spend",
                               value=int(time_end - time_start), tags={})
            for response in generator:
                if response.status.code == common_pb.STATUS_SUCCESS:
                    logging.debug("Message with seq_num=%d is confirmed",
                                  response.next_seq_num - 1)
                elif response.status.code == \
                        common_pb.STATUS_MESSAGE_DUPLICATED:
                    logging.debug("Resent message with seq_num=%d is "
                                  "confirmed", response.next_seq_num - 1)
                elif response.status.code == \
                        common_pb.STATUS_MESSAGE_MISSING:
                    raise RuntimeError("Message with seq_num=%d is "
                                       "missing!"
                                       % (response.next_seq_num - 1))
                else:
                    raise RuntimeError("Transmit failed with %d"
                                       % response.status.code)
                with lock:
                    # Drop every message the peer has acknowledged.
                    while resend_list and \
                            resend_list[0].seq_num < response.next_seq_num:
                        resend_list.popleft()
                    min_seq_num_to_resend = resend_list[0].seq_num \
                        if resend_list else "NaN"
                    logging.debug(
                        "Resend queue size: %d, starting from seq_num=%s",
                        len(resend_list), min_seq_num_to_resend)
                    metrics.emit_store(name="sum_of_resend",
                                       value=len(resend_list), tags={})
        except Exception as e:  # pylint: disable=broad-except
            if not stop_event.is_set():
                logging.warning("Bridge streaming broken: %s.", repr(e))
        finally:
            if generator is not None:
                generator.cancel()
            channel.close()
            logging.warning(
                "Restarting streaming: resend queue size: %d, "
                "starting from seq_num=%s", len(resend_list),
                resend_list[0].seq_num if resend_list else "NaN")
            channel = make_insecure_channel(self._remote_address,
                                            ChannelType.REMOTE,
                                            options=self._grpc_options,
                                            compression=self._compression)
            client = make_ready_client(channel, stop_event)
            self._check_remote_heartbeat(client)
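# A self-contained sketch (no gRPC, no threads; simplified from
# _client_daemon_fn above) of the ack-driven resend protocol: every sent
# message stays in the resend queue until the peer's next_seq_num covers it,
# so a broken stream replays exactly the unacknowledged suffix.
import collections

resend_queue = collections.deque()

def send(seq_num):
    resend_queue.append(seq_num)      # retained until acknowledged

def on_ack(next_seq_num):
    # The peer has confirmed every message with seq_num < next_seq_num.
    while resend_queue and resend_queue[0] < next_seq_num:
        resend_queue.popleft()

send(1); send(2); send(3)
on_ack(3)                             # confirms seq_num 1 and 2
assert list(resend_queue) == [3]      # only 3 is replayed after a restart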