Example #1
    def __init__(self, dashboard_agent):
        """Initialize the reporter object."""
        super().__init__(dashboard_agent)
        self._cpu_counts = (psutil.cpu_count(),
                            psutil.cpu_count(logical=False))
        self._ip = ray._private.services.get_node_ip_address()
        self._hostname = socket.gethostname()
        self._workers = set()
        self._network_stats_hist = [(0, (0.0, 0.0))]  # time, (sent, recv)
        self._metrics_agent = MetricsAgent(dashboard_agent.metrics_export_port)
        self._key = f"{reporter_consts.REPORTER_PREFIX}" \
                    f"{self._dashboard_agent.node_id}"
        # A list of gauges to record and export metrics.
        self._gauges = {
            "node_cpu": Gauge("node_cpu", "Total CPU usage on a ray node",
                              "percentage", ["ip"]),
            "node_mem": Gauge("node_mem", "Total memory usage on a ray node",
                              "mb", ["ip"]),
            "raylet_cpu": Gauge("raylet_cpu",
                                "CPU usage of the raylet on a node.",
                                "percentage", ["ip", "pid"]),
            "raylet_mem": Gauge("raylet_mem",
                                "Memory usage of the raylet on a node", "mb",
                                ["ip", "pid"])
        }
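
The gauges above only declare metric metadata; a value is exported by wrapping it in a Record and handing it to the MetricsAgent, which is what Example #8's _record_stats() does. Below is a minimal, hypothetical sketch of that round trip; the import path and port are assumptions and may differ across Ray versions.

from ray._private.metrics_agent import Gauge, MetricsAgent, Record  # import path is an assumption

agent = MetricsAgent(8080)  # hypothetical export port
node_cpu = Gauge("node_cpu", "Total CPU usage on a ray node",
                 "percentage", ["ip"])

# One sample tagged with the node IP, exported through the agent's
# Prometheus endpoint (mirrors the recording pattern in Example #8).
agent.record_reporter_stats(
    [Record(gauge=node_cpu, value=37.5, tags={"ip": "127.0.0.1"})])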
Example #2
    def __init__(self, dashboard_agent):
        """Initialize the reporter object."""
        super().__init__(dashboard_agent)
        self._cpu_counts = (psutil.cpu_count(),
                            psutil.cpu_count(logical=False))
        self._ip = ray.services.get_node_ip_address()
        self._hostname = socket.gethostname()
        self._workers = set()
        self._network_stats_hist = [(0, (0.0, 0.0))]  # time, (sent, recv)
        self._metrics_agent = MetricsAgent(dashboard_agent.metrics_export_port)
Example #3
File: reporter.py  Project: zzmcdc/ray
    def __init__(self,
                 redis_address,
                 port,
                 metrics_export_port,
                 redis_password=None):
        """Initialize the reporter object."""
        self.cpu_counts = (psutil.cpu_count(), psutil.cpu_count(logical=False))
        self.ip = ray._private.services.get_node_ip_address()
        self.hostname = platform.node()
        self.port = port
        self.metrics_agent = MetricsAgent(metrics_export_port)
        self.reporter_grpc_server = ReporterServer(self.metrics_agent)

        _ = psutil.cpu_percent()  # For initialization

        self.redis_key = f"{ray.gcs_utils.REPORTER_CHANNEL}.{self.hostname}"
        self.redis_client = ray._private.services.create_redis_client(
            redis_address, password=redis_password)

        self.network_stats_hist = [(0, (0.0, 0.0))]  # time, (sent, recv)
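
This constructor only prepares the Redis channel; the publishing loop itself is not shown on this page. As a rough, hypothetical sketch modeled on the publish() calls in Examples #6 and #8, the per-iteration step might look like the helper below (illustrative only, not part of the project code shown here).

import json

def publish_stats(reporter, stats):
    # redis-py clients expose publish(channel, message); the channel is the
    # reporter.redis_key built in __init__ above.
    reporter.redis_client.publish(reporter.redis_key, json.dumps(stats))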
Example #4
    def __init__(self, redis_address, port, redis_password=None):
        """Initialize the reporter object."""
        self.cpu_counts = (psutil.cpu_count(), psutil.cpu_count(logical=False))
        self.ip = ray.services.get_node_ip_address()
        self.hostname = platform.node()
        self.port = port
        metrics_agent_port = os.getenv("METRICS_AGENT_PORT")
        if not metrics_agent_port:
            metrics_agent_port = get_unused_port()
        self.metrics_agent = MetricsAgent(metrics_agent_port)
        self.reporter_grpc_server = ReporterServer(self.metrics_agent)

        _ = psutil.cpu_percent()  # For initialization

        self.redis_key = "{}.{}".format(ray.gcs_utils.REPORTER_CHANNEL,
                                        self.hostname)
        self.redis_client = ray.services.create_redis_client(
            redis_address, password=redis_password)

        self.network_stats_hist = [(0, (0.0, 0.0))]  # time, (sent, recv)
Example #5
def cleanup_agent():
    global metrics_agent
    if not metrics_agent:
        metrics_agent = MetricsAgent(get_unused_port())
    yield
    metrics_agent._registry = defaultdict(lambda: None)
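
This snippet is presumably a yield-based pytest fixture (the decorator is not shown): before the test it lazily creates a MetricsAgent on an unused port, and after the test it wipes the agent's view registry. A hypothetical test depending on it could look like:

def test_registry_is_reset(cleanup_agent):
    # The module-level `metrics_agent` global is populated by the fixture's
    # setup phase; its registry is cleared again once this test returns.
    assert metrics_agent is not None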
Example #6
class ReporterAgent(dashboard_utils.DashboardAgentModule,
                    reporter_pb2_grpc.ReporterServiceServicer):
    """A monitor process for monitoring Ray nodes.

    Attributes:
        dashboard_agent: The DashboardAgent object contains global config
    """
    def __init__(self, dashboard_agent):
        """Initialize the reporter object."""
        super().__init__(dashboard_agent)
        self._cpu_counts = (psutil.cpu_count(),
                            psutil.cpu_count(logical=False))
        self._ip = ray.services.get_node_ip_address()
        self._hostname = socket.gethostname()
        self._workers = set()
        self._network_stats_hist = [(0, (0.0, 0.0))]  # time, (sent, recv)
        self._metrics_agent = MetricsAgent(dashboard_agent.metrics_export_port)

    async def GetProfilingStats(self, request, context):
        pid = request.pid
        duration = request.duration
        profiling_file_path = os.path.join(ray.utils.get_ray_temp_dir(),
                                           f"{pid}_profiling.txt")
        sudo = "sudo" if ray.utils.get_user() != "root" else ""
        process = await asyncio.create_subprocess_shell(
            f"{sudo} $(which py-spy) record "
            f"-o {profiling_file_path} -p {pid} -d {duration} -f speedscope",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True)
        stdout, stderr = await process.communicate()
        if process.returncode != 0:
            profiling_stats = ""
        else:
            with open(profiling_file_path, "r") as f:
                profiling_stats = f.read()
        return reporter_pb2.GetProfilingStatsReply(
            profiling_stats=profiling_stats, std_out=stdout, std_err=stderr)

    async def ReportMetrics(self, request, context):
        # NOTE: Exceptions are not propagated properly
        # when we don't catch them here.
        # Default to False so the reply below is well-defined even if
        # recording raises.
        metrcs_description_required = False
        try:
            metrcs_description_required = (
                self._metrics_agent.record_metrics_points(
                    request.metrics_points))
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())

        # If metrics description is missing, we should notify cpp processes
        # that we need them. Cpp processes will then report them to here.
        # We need it when (1) a new metric is reported (application metric)
        # (2) a reporter goes down and restarted (currently not implemented).
        return reporter_pb2.ReportMetricsReply(
            metrcs_description_required=metrcs_description_required)

    @staticmethod
    def _get_cpu_percent():
        return psutil.cpu_percent()

    @staticmethod
    def _get_gpu_usage():
        if gpustat is None:
            return []
        gpu_utilizations = []
        gpus = []
        try:
            gpus = gpustat.new_query().gpus
        except Exception as e:
            logger.debug(
                "gpustat failed to retrieve GPU information: {}".format(e))
        for gpu in gpus:
            # Note the keys in this dict have periods which throws
            # off javascript so we change .s to _s
            gpu_data = {
                "_".join(key.split(".")): val
                for key, val in gpu.entry.items()
            }
            gpu_utilizations.append(gpu_data)
        return gpu_utilizations

    @staticmethod
    def _get_boot_time():
        return psutil.boot_time()

    @staticmethod
    def _get_network_stats():
        ifaces = [
            v for k, v in psutil.net_io_counters(pernic=True).items()
            if k[0] == "e"
        ]

        sent = sum((iface.bytes_sent for iface in ifaces))
        recv = sum((iface.bytes_recv for iface in ifaces))
        return sent, recv

    @staticmethod
    def _get_mem_usage():
        vm = psutil.virtual_memory()
        return vm.total, vm.available, vm.percent

    @staticmethod
    def _get_disk_usage():
        dirs = [
            os.environ["USERPROFILE"] if sys.platform == "win32" else os.sep,
            ray.utils.get_user_temp_dir(),
        ]
        return {x: psutil.disk_usage(x) for x in dirs}

    def _get_workers(self):
        curr_proc = psutil.Process()
        parent = curr_proc.parent()
        if parent is None or parent.pid == 1:
            return []
        else:
            workers = set(parent.children())
            self._workers.intersection_update(workers)
            self._workers.update(workers)
            self._workers.discard(curr_proc)
            return [
                w.as_dict(attrs=[
                    "pid",
                    "create_time",
                    "cpu_percent",
                    "cpu_times",
                    "cmdline",
                    "memory_info",
                ]) for w in self._workers if w.status() != psutil.STATUS_ZOMBIE
            ]

    @staticmethod
    def _get_raylet_cmdline():
        curr_proc = psutil.Process()
        parent = curr_proc.parent()
        if parent.pid == 1:
            return ""
        else:
            return parent.cmdline()

    def _get_load_avg(self):
        if sys.platform == "win32":
            cpu_percent = psutil.cpu_percent()
            load = (cpu_percent, cpu_percent, cpu_percent)
        else:
            load = os.getloadavg()
        per_cpu_load = tuple((round(x / self._cpu_counts[0], 2) for x in load))
        return load, per_cpu_load

    def _get_all_stats(self):
        now = dashboard_utils.to_posix_time(datetime.datetime.utcnow())
        network_stats = self._get_network_stats()

        self._network_stats_hist.append((now, network_stats))
        self._network_stats_hist = self._network_stats_hist[-7:]
        then, prev_network_stats = self._network_stats_hist[0]
        netstats = ((network_stats[0] - prev_network_stats[0]) / (now - then),
                    (network_stats[1] - prev_network_stats[1]) / (now - then))

        return {
            "now": now,
            "hostname": self._hostname,
            "ip": self._ip,
            "cpu": self._get_cpu_percent(),
            "cpus": self._cpu_counts,
            "mem": self._get_mem_usage(),
            "workers": self._get_workers(),
            "bootTime": self._get_boot_time(),
            "loadAvg": self._get_load_avg(),
            "disk": self._get_disk_usage(),
            "gpus": self._get_gpu_usage(),
            "net": netstats,
            "cmdline": self._get_raylet_cmdline(),
        }

    async def _perform_iteration(self):
        """Get any changes to the log files and push updates to Redis."""
        aioredis_client = await aioredis.create_redis_pool(
            address=self._dashboard_agent.redis_address,
            password=self._dashboard_agent.redis_password)

        while True:
            try:
                stats = self._get_all_stats()
                await aioredis_client.publish(
                    "{}{}".format(reporter_consts.REPORTER_PREFIX,
                                  self._hostname), jsonify_asdict(stats))
            except Exception:
                logger.exception("Error publishing node physical stats.")
            await asyncio.sleep(reporter_consts.REPORTER_UPDATE_INTERVAL_MS /
                                1000)

    async def run(self, server):
        reporter_pb2_grpc.add_ReporterServiceServicer_to_server(self, server)
        await self._perform_iteration()
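
The throughput numbers in _get_all_stats() come from a small sliding window: the reporter keeps the last 7 (time, (sent, recv)) samples and divides the byte deltas against the oldest retained sample by the elapsed time. The same arithmetic in isolation (illustrative numbers only):

hist = [(0.0, (0.0, 0.0))]  # (timestamp, (bytes_sent, bytes_recv))

def net_speed(now, sent, recv):
    global hist
    hist.append((now, (sent, recv)))
    hist = hist[-7:]  # keep only the 7 most recent samples
    then, (prev_sent, prev_recv) = hist[0]
    return ((sent - prev_sent) / (now - then),
            (recv - prev_recv) / (now - then))

print(net_speed(10.0, 5_000_000.0, 2_000_000.0))  # (500000.0, 200000.0) bytes/sec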
Example #7
    def __init__(self, dashboard_agent):
        """Initialize the reporter object."""
        super().__init__(dashboard_agent)
        self._cpu_counts = (psutil.cpu_count(),
                            psutil.cpu_count(logical=False))
        self._ip = ray._private.services.get_node_ip_address()
        self._hostname = socket.gethostname()
        self._workers = set()
        self._network_stats_hist = [(0, (0.0, 0.0))]  # time, (sent, recv)
        self._metrics_agent = MetricsAgent(dashboard_agent.metrics_export_port)
        self._key = f"{reporter_consts.REPORTER_PREFIX}" \
                    f"{self._dashboard_agent.node_id}"
        # A list of gauges to record and export metrics.
        self._gauges = {
            "node_cpu_utilization":
            Gauge("node_cpu_utilization", "Total CPU usage on a ray node",
                  "percentage", ["ip"]),
            "node_cpu_count":
            Gauge("node_cpu_count", "Total CPUs available on a ray node",
                  "cores", ["ip"]),
            "node_mem_used":
            Gauge("node_mem_used", "Memory usage on a ray node", "bytes",
                  ["ip"]),
            "node_mem_available":
            Gauge("node_mem_available", "Memory available on a ray node",
                  "bytes", ["ip"]),
            "node_mem_total":
            Gauge("node_mem_total", "Total memory on a ray node", "bytes",
                  ["ip"]),
            "node_gpus_available":
            Gauge("node_gpus_available", "Total GPUs available on a ray node",
                  "percentage", ["ip"]),
            "node_gpus_utilization":
            Gauge("node_gpus_utilization", "Total GPUs usage on a ray node",
                  "percentage", ["ip"]),
            "node_gram_used":
            Gauge("node_gram_used", "Total GPU RAM usage on a ray node",
                  "bytes", ["ip"]),
            "node_gram_available":
            Gauge("node_gram_available",
                  "Total GPU RAM available on a ray node", "bytes", ["ip"]),
            "node_disk_usage":
            Gauge("node_disk_usage", "Total disk usage (bytes) on a ray node",
                  "bytes", ["ip"]),
            "node_disk_utilization_percentage":
            Gauge("node_disk_utilization_percentage",
                  "Total disk utilization (percentage) on a ray node",
                  "percentage", ["ip"]),
            "node_network_sent":
            Gauge("node_network_sent", "Total network sent", "bytes", ["ip"]),
            "node_network_received":
            Gauge("node_network_received", "Total network received", "bytes",
                  ["ip"]),
            "node_network_send_speed":
            Gauge("node_network_send_speed", "Network send speed", "bytes/sec",
                  ["ip"]),
            "node_network_receive_speed":
            Gauge("node_network_receive_speed", "Network receive speed",
                  "bytes/sec", ["ip"]),
            "raylet_cpu":
            Gauge("raylet_cpu", "CPU usage of the raylet on a node.",
                  "percentage", ["ip", "pid"]),
            "raylet_mem":
            Gauge("raylet_mem", "Memory usage of the raylet on a node", "mb",
                  ["ip", "pid"])
        }
Example #8
class ReporterAgent(dashboard_utils.DashboardAgentModule,
                    reporter_pb2_grpc.ReporterServiceServicer):
    """A monitor process for monitoring Ray nodes.

    Attributes:
        dashboard_agent: The DashboardAgent object contains global config
    """
    def __init__(self, dashboard_agent):
        """Initialize the reporter object."""
        super().__init__(dashboard_agent)
        self._cpu_counts = (psutil.cpu_count(),
                            psutil.cpu_count(logical=False))
        self._ip = ray._private.services.get_node_ip_address()
        self._hostname = socket.gethostname()
        self._workers = set()
        self._network_stats_hist = [(0, (0.0, 0.0))]  # time, (sent, recv)
        self._metrics_agent = MetricsAgent(dashboard_agent.metrics_export_port)
        self._key = f"{reporter_consts.REPORTER_PREFIX}" \
                    f"{self._dashboard_agent.node_id}"
        # A list of gauges to record and export metrics.
        self._gauges = {
            "node_cpu_utilization":
            Gauge("node_cpu_utilization", "Total CPU usage on a ray node",
                  "percentage", ["ip"]),
            "node_cpu_count":
            Gauge("node_cpu_count", "Total CPUs available on a ray node",
                  "cores", ["ip"]),
            "node_mem_used":
            Gauge("node_mem_used", "Memory usage on a ray node", "bytes",
                  ["ip"]),
            "node_mem_available":
            Gauge("node_mem_available", "Memory available on a ray node",
                  "bytes", ["ip"]),
            "node_mem_total":
            Gauge("node_mem_total", "Total memory on a ray node", "bytes",
                  ["ip"]),
            "node_gpus_available":
            Gauge("node_gpus_available", "Total GPUs available on a ray node",
                  "percentage", ["ip"]),
            "node_gpus_utilization":
            Gauge("node_gpus_utilization", "Total GPUs usage on a ray node",
                  "percentage", ["ip"]),
            "node_gram_used":
            Gauge("node_gram_used", "Total GPU RAM usage on a ray node",
                  "bytes", ["ip"]),
            "node_gram_available":
            Gauge("node_gram_available",
                  "Total GPU RAM available on a ray node", "bytes", ["ip"]),
            "node_disk_usage":
            Gauge("node_disk_usage", "Total disk usage (bytes) on a ray node",
                  "bytes", ["ip"]),
            "node_disk_utilization_percentage":
            Gauge("node_disk_utilization_percentage",
                  "Total disk utilization (percentage) on a ray node",
                  "percentage", ["ip"]),
            "node_network_sent":
            Gauge("node_network_sent", "Total network sent", "bytes", ["ip"]),
            "node_network_received":
            Gauge("node_network_received", "Total network received", "bytes",
                  ["ip"]),
            "node_network_send_speed":
            Gauge("node_network_send_speed", "Network send speed", "bytes/sec",
                  ["ip"]),
            "node_network_receive_speed":
            Gauge("node_network_receive_speed", "Network receive speed",
                  "bytes/sec", ["ip"]),
            "raylet_cpu":
            Gauge("raylet_cpu", "CPU usage of the raylet on a node.",
                  "percentage", ["ip", "pid"]),
            "raylet_mem":
            Gauge("raylet_mem", "Memory usage of the raylet on a node", "mb",
                  ["ip", "pid"])
        }

    async def GetProfilingStats(self, request, context):
        pid = request.pid
        duration = request.duration
        profiling_file_path = os.path.join(ray.utils.get_ray_temp_dir(),
                                           f"{pid}_profiling.txt")
        sudo = "sudo" if ray.utils.get_user() != "root" else ""
        process = await asyncio.create_subprocess_shell(
            f"{sudo} $(which py-spy) record "
            f"-o {profiling_file_path} -p {pid} -d {duration} -f speedscope",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True)
        stdout, stderr = await process.communicate()
        if process.returncode != 0:
            profiling_stats = ""
        else:
            with open(profiling_file_path, "r") as f:
                profiling_stats = f.read()
        return reporter_pb2.GetProfilingStatsReply(
            profiling_stats=profiling_stats, std_out=stdout, std_err=stderr)

    async def ReportOCMetrics(self, request, context):
        # This function receives a GRPC containing OpenCensus (OC) metrics
        # from a Ray process, then exposes those metrics to Prometheus.
        try:
            self._metrics_agent.record_metric_points_from_protobuf(
                request.metrics)
        except Exception:
            logger.error(traceback.format_exc())
        return reporter_pb2.ReportOCMetricsReply()

    @staticmethod
    def _get_cpu_percent():
        return psutil.cpu_percent()

    @staticmethod
    def _get_gpu_usage():
        if gpustat is None:
            return []
        gpu_utilizations = []
        gpus = []
        try:
            gpus = gpustat.new_query().gpus
        except Exception as e:
            logger.debug(f"gpustat failed to retrieve GPU information: {e}")
        for gpu in gpus:
            # Note the keys in this dict have periods which throws
            # off javascript so we change .s to _s
            gpu_data = {
                "_".join(key.split(".")): val
                for key, val in gpu.entry.items()
            }
            gpu_utilizations.append(gpu_data)
        return gpu_utilizations

    @staticmethod
    def _get_boot_time():
        return psutil.boot_time()

    @staticmethod
    def _get_network_stats():
        ifaces = [
            v for k, v in psutil.net_io_counters(pernic=True).items()
            if k[0] == "e"
        ]

        sent = sum((iface.bytes_sent for iface in ifaces))
        recv = sum((iface.bytes_recv for iface in ifaces))
        return sent, recv

    @staticmethod
    def _get_mem_usage():
        vm = psutil.virtual_memory()
        return vm.total, vm.available, vm.percent, vm.used

    @staticmethod
    def _get_disk_usage():
        dirs = [
            os.environ["USERPROFILE"] if sys.platform == "win32" else os.sep,
            ray.utils.get_user_temp_dir(),
        ]
        return {x: psutil.disk_usage(x) for x in dirs}

    def _get_workers(self):
        curr_proc = psutil.Process()
        parent = curr_proc.parent()
        if parent is None or parent.pid == 1:
            return []
        else:
            workers = set(parent.children())
            self._workers.intersection_update(workers)
            self._workers.update(workers)
            self._workers.discard(curr_proc)
            return [
                w.as_dict(attrs=[
                    "pid",
                    "create_time",
                    "cpu_percent",
                    "cpu_times",
                    "cmdline",
                    "memory_info",
                ]) for w in self._workers if w.status() != psutil.STATUS_ZOMBIE
            ]

    def _get_raylet_stats(self):
        curr_proc = psutil.Process()
        # Here, parent is always raylet because the
        # dashboard agent is a child of the raylet process.
        parent = curr_proc.parent()
        if parent is None or parent.pid == 1:
            return []
        if parent.status() == psutil.STATUS_ZOMBIE:
            return []

        return parent.as_dict(attrs=[
            "pid",
            "create_time",
            "cpu_percent",
            "cpu_times",
            "cmdline",
            "memory_info",
        ])

    @staticmethod
    def _get_raylet_cmdline():
        try:
            curr_proc = psutil.Process()
            parent = curr_proc.parent()
            if parent.pid == 1:
                return []
            else:
                return parent.cmdline()
        except (psutil.AccessDenied, ProcessLookupError):
            return []

    def _get_load_avg(self):
        if sys.platform == "win32":
            cpu_percent = psutil.cpu_percent()
            load = (cpu_percent, cpu_percent, cpu_percent)
        else:
            load = os.getloadavg()
        per_cpu_load = tuple((round(x / self._cpu_counts[0], 2) for x in load))
        return load, per_cpu_load

    def _get_all_stats(self):
        now = dashboard_utils.to_posix_time(datetime.datetime.utcnow())
        network_stats = self._get_network_stats()

        self._network_stats_hist.append((now, network_stats))
        self._network_stats_hist = self._network_stats_hist[-7:]
        then, prev_network_stats = self._network_stats_hist[0]
        prev_send, prev_recv = prev_network_stats
        now_send, now_recv = network_stats
        network_speed_stats = ((now_send - prev_send) / (now - then),
                               (now_recv - prev_recv) / (now - then))
        return {
            "now": now,
            "hostname": self._hostname,
            "ip": self._ip,
            "cpu": self._get_cpu_percent(),
            "cpus": self._cpu_counts,
            "mem": self._get_mem_usage(),
            "workers": self._get_workers(),
            "bootTime": self._get_boot_time(),
            "loadAvg": self._get_load_avg(),
            "disk": self._get_disk_usage(),
            "gpus": self._get_gpu_usage(),
            "network": network_stats,
            "network_speed": network_speed_stats,
            "cmdline": self._get_raylet_cmdline(),
        }

    def _record_stats(self, stats):
        ip = stats["ip"]
        # -- CPU per node --
        cpu_usage = float(stats["cpu"])
        cpu_record = Record(gauge=self._gauges["node_cpu_utilization"],
                            value=cpu_usage,
                            tags={"ip": ip})

        cpu_count, _ = stats["cpus"]
        cpu_count_record = Record(gauge=self._gauges["node_cpu_count"],
                                  value=cpu_count,
                                  tags={"ip": ip})

        # -- Mem per node --
        mem_total, mem_available, _, mem_used = stats["mem"]
        mem_used_record = Record(gauge=self._gauges["node_mem_used"],
                                 value=mem_used,
                                 tags={"ip": ip})
        mem_available_record = Record(gauge=self._gauges["node_mem_available"],
                                      value=mem_available,
                                      tags={"ip": ip})
        mem_total_record = Record(gauge=self._gauges["node_mem_total"],
                                  value=mem_total,
                                  tags={"ip": ip})

        # -- GPU per node --
        gpus = stats["gpus"]
        gpus_available = len(gpus)

        if gpus_available:
            gpus_utilization, gram_used, gram_total = 0, 0, 0
            for gpu in gpus:
                gpus_utilization += gpu["utilization_gpu"]
                gram_used += gpu["memory_used"]
                gram_total += gpu["memory_total"]

            gram_available = gram_total - gram_used

            gpus_available_record = Record(
                gauge=self._gauges["node_gpus_available"],
                value=gpus_available,
                tags={"ip": ip})
            gpus_utilization_record = Record(
                gauge=self._gauges["node_gpus_utilization"],
                value=gpus_utilization,
                tags={"ip": ip})
            gram_used_record = Record(gauge=self._gauges["node_gram_used"],
                                      value=gram_used,
                                      tags={"ip": ip})
            gram_available_record = Record(
                gauge=self._gauges["node_gram_available"],
                value=gram_available,
                tags={"ip": ip})

        # -- Disk per node --
        used, free = 0, 0
        for entry in stats["disk"].values():
            used += entry.used
            free += entry.free
        disk_utilization = float(used / (used + free)) * 100
        disk_usage_record = Record(gauge=self._gauges["node_disk_usage"],
                                   value=used,
                                   tags={"ip": ip})
        disk_utilization_percentage_record = Record(
            gauge=self._gauges["node_disk_utilization_percentage"],
            value=disk_utilization,
            tags={"ip": ip})

        # -- Network speed (send/receive) stats per node --
        network_stats = stats["network"]
        network_sent_record = Record(gauge=self._gauges["node_network_sent"],
                                     value=network_stats[0],
                                     tags={"ip": ip})
        network_received_record = Record(
            gauge=self._gauges["node_network_received"],
            value=network_stats[1],
            tags={"ip": ip})

        # -- Network speed (send/receive) per node --
        network_speed_stats = stats["network_speed"]
        network_send_speed_record = Record(
            gauge=self._gauges["node_network_send_speed"],
            value=network_speed_stats[0],
            tags={"ip": ip})
        network_receive_speed_record = Record(
            gauge=self._gauges["node_network_receive_speed"],
            value=network_speed_stats[1],
            tags={"ip": ip})

        raylet_stats = self._get_raylet_stats()
        raylet_pid = str(raylet_stats["pid"])
        # -- raylet CPU --
        raylet_cpu_usage = float(raylet_stats["cpu_percent"]) * 100
        raylet_cpu_record = Record(gauge=self._gauges["raylet_cpu"],
                                   value=raylet_cpu_usage,
                                   tags={
                                       "ip": ip,
                                       "pid": raylet_pid
                                   })

        # -- raylet mem --
        raylet_mem_usage = float(raylet_stats["memory_info"].rss) / 1e6
        raylet_mem_record = Record(gauge=self._gauges["raylet_mem"],
                                   value=raylet_mem_usage,
                                   tags={
                                       "ip": ip,
                                       "pid": raylet_pid
                                   })

        records_reported = [
            cpu_record, cpu_count_record, mem_used_record,
            mem_available_record, mem_total_record, disk_usage_record,
            disk_utilization_percentage_record, network_sent_record,
            network_received_record, network_send_speed_record,
            network_receive_speed_record
        ]

        if gpus_available:
            records_reported.extend([
                gpus_available_record, gpus_utilization_record,
                gram_used_record, gram_available_record
            ])

        raylet_records = [raylet_cpu_record, raylet_mem_record]
        records_reported.extend(raylet_records)

        self._metrics_agent.record_reporter_stats(records_reported)

    async def _perform_iteration(self, aioredis_client):
        """Get any changes to the log files and push updates to Redis."""
        while True:
            try:
                stats = self._get_all_stats()
                self._record_stats(stats)
                await aioredis_client.publish(self._key, jsonify_asdict(stats))
            except Exception:
                logger.exception("Error publishing node physical stats.")
            await asyncio.sleep(reporter_consts.REPORTER_UPDATE_INTERVAL_MS /
                                1000)

    async def run(self, server):
        aioredis_client = await aioredis.create_redis_pool(
            address=self._dashboard_agent.redis_address,
            password=self._dashboard_agent.redis_password)
        reporter_pb2_grpc.add_ReporterServiceServicer_to_server(self, server)
        await self._perform_iteration(aioredis_client)
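
Two unit conversions in _record_stats() above are worth spelling out: disk utilization is used / (used + free) expressed as a percentage, and the raylet's RSS is divided by 1e6 to match the "mb" unit declared on the raylet_mem gauge. A quick sanity check of that arithmetic with illustrative numbers only:

used, free = 120 * 10**9, 380 * 10**9        # 120 GB used, 380 GB free
print(round(used / (used + free) * 100, 1))  # 24.0 -> node_disk_utilization_percentage

rss_bytes = 256 * 10**6                      # raylet resident set size in bytes
print(rss_bytes / 1e6)                       # 256.0 -> raylet_mem value in MB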