def default(self, obj):
    if isinstance(obj, bytes):
        return binary_to_hex(obj)
    if isinstance(obj, Immutable):
        return obj.mutable()
    # Let the base class default method raise the TypeError.
    return json.JSONEncoder.default(self, obj)
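
# A minimal, runnable usage sketch (not from the source): an encoder with a
# `default` hook like the one above is wired into json.dumps via the `cls`
# argument. `CustomEncoder` and the inline `binary_to_hex` are illustrative
# stand-ins; the `Immutable` branch is omitted to keep the sketch
# self-contained.
import binascii
import json


def binary_to_hex(data: bytes) -> str:
    # Raw bytes -> hex string, matching how such helpers typically work.
    return binascii.hexlify(data).decode("ascii")


class CustomEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, bytes):
            return binary_to_hex(obj)
        # Let the base class default method raise the TypeError.
        return json.JSONEncoder.default(self, obj)


print(json.dumps({"worker_id": b"\x00\xff"}, cls=CustomEncoder))
# -> {"worker_id": "00ff"}
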
def _get_event(msg="empty message", job_id=None, source_type=None):
    return {
        "event_id": binary_to_hex(np.random.bytes(18)),
        "source_type": random.choice(event_pb2.Event.SourceType.keys())
        if source_type is None else source_type,
        "host_name": "po-dev.inc.alipay.net",
        "pid": random.randint(1, 65536),
        "label": "",
        "message": msg,
        "time_stamp": time.time(),
        "severity": "INFO",
        "custom_fields": {
            "job_id": ray.JobID.from_int(random.randint(1, 100)).hex()
            if job_id is None else job_id,
            "node_id": "",
            "task_id": "",
        },
    }
def workers(self):
    """Get a dictionary mapping worker ID to worker information."""
    self._check_connected()

    # Get all data in the worker table.
    worker_table = self.global_state_accessor.get_worker_table()
    workers_data = {}
    for i in range(len(worker_table)):
        worker_table_data = gcs_utils.WorkerTableData.FromString(
            worker_table[i])
        if worker_table_data.is_alive and \
                worker_table_data.worker_type == gcs_utils.WORKER:
            worker_id = binary_to_hex(
                worker_table_data.worker_address.worker_id)
            worker_info = worker_table_data.worker_info
            workers_data[worker_id] = {
                "node_ip_address": decode(worker_info[b"node_ip_address"]),
                "plasma_store_socket": decode(
                    worker_info[b"plasma_store_socket"])
            }
            if b"stderr_file" in worker_info:
                workers_data[worker_id]["stderr_file"] = decode(
                    worker_info[b"stderr_file"])
            if b"stdout_file" in worker_info:
                workers_data[worker_id]["stdout_file"] = decode(
                    worker_info[b"stdout_file"])
    return workers_data
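
# Usage sketch (illustrative only; `state` stands in for a connected
# instance of the class these accessor methods belong to):
#
#   for worker_id, info in state.workers().items():
#       print(worker_id, info["node_ip_address"])
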
def placement_group_table(self, placement_group_id=None):
    self._check_connected()
    if placement_group_id is not None:
        placement_group_id = ray.PlacementGroupID(
            hex_to_binary(placement_group_id.hex()))
        placement_group_info = (
            self.global_state_accessor.get_placement_group_info(
                placement_group_id))
        if placement_group_info is None:
            return {}
        else:
            placement_group_info = (gcs_utils.PlacementGroupTableData.
                                    FromString(placement_group_info))
            return self._gen_placement_group_info(placement_group_info)
    else:
        placement_group_table = self.global_state_accessor.\
            get_placement_group_table()
        results = {}
        for placement_group_info in placement_group_table:
            placement_group_table_data = gcs_utils.\
                PlacementGroupTableData.FromString(placement_group_info)
            placement_group_id = binary_to_hex(
                placement_group_table_data.placement_group_id)
            results[placement_group_id] = \
                self._gen_placement_group_info(placement_group_table_data)
        return results
def profile_table(self):
    self._check_connected()
    result = defaultdict(list)
    profile_table = self.global_state_accessor.get_profile_table()
    for i in range(len(profile_table)):
        profile = gcs_utils.ProfileTableData.FromString(profile_table[i])
        component_type = profile.component_type
        component_id = binary_to_hex(profile.component_id)
        node_ip_address = profile.node_ip_address
        for event in profile.profile_events:
            try:
                extra_data = json.loads(event.extra_data)
            except ValueError:
                extra_data = {}
            profile_event = {
                "event_type": event.event_type,
                "component_id": component_id,
                "node_ip_address": node_ip_address,
                "component_type": component_type,
                "start_time": event.start_time,
                "end_time": event.end_time,
                "extra_data": extra_data,
            }
            result[component_id].append(profile_event)
    return dict(result)
def actor_table(self, actor_id):
    """Fetch and parse the actor table information for a single actor ID.

    Args:
        actor_id: A hex string of the actor ID to fetch information about.
            If this is None, then the entire actor table is fetched.

    Returns:
        Information from the actor table.
    """
    self._check_connected()
    if actor_id is not None:
        actor_id = ray.ActorID(hex_to_binary(actor_id))
        actor_info = self.global_state_accessor.get_actor_info(actor_id)
        if actor_info is None:
            return {}
        else:
            actor_table_data = gcs_utils.ActorTableData.FromString(
                actor_info)
            return self._gen_actor_info(actor_table_data)
    else:
        actor_table = self.global_state_accessor.get_actor_table()
        results = {}
        for i in range(len(actor_table)):
            actor_table_data = gcs_utils.ActorTableData.FromString(
                actor_table[i])
            results[binary_to_hex(actor_table_data.actor_id)] = \
                self._gen_actor_info(actor_table_data)
        return results
def object_table(self, object_ref=None):
    """Fetch and parse the object table info for one or more object refs.

    Args:
        object_ref: An object ref to fetch information about. If this is
            None, then the entire object table is fetched.

    Returns:
        Information from the object table.
    """
    self._check_connected()
    if object_ref is not None:
        object_ref = ray.ObjectRef(hex_to_binary(object_ref))
        object_info = self.global_state_accessor.get_object_info(
            object_ref)
        if object_info is None:
            return {}
        else:
            object_location_info = gcs_utils.ObjectLocationInfo.FromString(
                object_info)
            return self._gen_object_info(object_location_info)
    else:
        object_table = self.global_state_accessor.get_object_table()
        results = {}
        for i in range(len(object_table)):
            object_location_info = gcs_utils.ObjectLocationInfo.FromString(
                object_table[i])
            results[binary_to_hex(object_location_info.object_id)] = \
                self._gen_object_info(object_location_info)
        return results
def _gen_placement_group_info(self, placement_group_info):
    # Imported here rather than at module level; importing it at the top
    # breaks the doc build.
    from ray.core.generated.common_pb2 import PlacementStrategy

    def get_state(state):
        if state == gcs_utils.PlacementGroupTableData.PENDING:
            return "PENDING"
        elif state == gcs_utils.PlacementGroupTableData.CREATED:
            return "CREATED"
        else:
            return "REMOVED"

    def get_strategy(strategy):
        if strategy == PlacementStrategy.PACK:
            return "PACK"
        elif strategy == PlacementStrategy.STRICT_PACK:
            return "STRICT_PACK"
        elif strategy == PlacementStrategy.STRICT_SPREAD:
            return "STRICT_SPREAD"
        elif strategy == PlacementStrategy.SPREAD:
            return "SPREAD"
        else:
            raise ValueError(f"Invalid strategy returned: {strategy}")

    assert placement_group_info is not None
    stats = placement_group_info.stats
    return {
        "placement_group_id": binary_to_hex(
            placement_group_info.placement_group_id),
        "name": placement_group_info.name,
        "bundles": {
            # The value needs to be converted to a dict; otherwise the
            # payload is not serializable.
            bundle.bundle_id.bundle_index: MessageToDict(bundle)[
                "unitResources"]
            for bundle in placement_group_info.bundles
        },
        "strategy": get_strategy(placement_group_info.strategy),
        "state": get_state(placement_group_info.state),
        "stats": {
            "end_to_end_creation_latency_ms": (
                stats.end_to_end_creation_latency_us / 1000.0),
            "scheduling_latency_ms": (
                stats.scheduling_latency_us / 1000.0),
            "scheduling_attempt": stats.scheduling_attempt,
            "highest_retry_delay_ms": stats.highest_retry_delay_ms,
            "scheduling_state": gcs_pb2.PlacementGroupStats.SchedulingState.
            DESCRIPTOR.values_by_number[stats.scheduling_state].name,
        },
    }
def _gen_actor_info(self, actor_table_data):
    """Parse actor table data.

    Returns:
        Information from the actor table.
    """
    actor_info = {
        "ActorID": binary_to_hex(actor_table_data.actor_id),
        "ActorClassName": actor_table_data.class_name,
        "IsDetached": actor_table_data.is_detached,
        "Name": actor_table_data.name,
        "JobID": binary_to_hex(actor_table_data.job_id),
        "Address": {
            "IPAddress": actor_table_data.address.ip_address,
            "Port": actor_table_data.address.port,
            "NodeID": binary_to_hex(actor_table_data.address.raylet_id),
        },
        "OwnerAddress": {
            "IPAddress": actor_table_data.owner_address.ip_address,
            "Port": actor_table_data.owner_address.port,
            "NodeID": binary_to_hex(
                actor_table_data.owner_address.raylet_id),
        },
        "State": gcs_pb2.ActorTableData.ActorState.DESCRIPTOR.
        values_by_number[actor_table_data.state].name,
        "NumRestarts": actor_table_data.num_restarts,
        "Timestamp": actor_table_data.timestamp,
        "StartTime": actor_table_data.start_time,
        "EndTime": actor_table_data.end_time,
        "DeathCause": actor_table_data.death_cause,
        "Pid": actor_table_data.pid,
    }
    return actor_info
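
# The `DESCRIPTOR.values_by_number[...].name` idiom used above converts a
# raw protobuf enum integer into its symbolic name. A self-contained demo
# using a descriptor bundled with the protobuf package (illustrative only,
# unrelated to the Ray-generated messages):
from google.protobuf import descriptor_pb2

enum = descriptor_pb2.FieldDescriptorProto.Type
print(enum.DESCRIPTOR.values_by_number[1].name)  # -> "TYPE_DOUBLE"
# The generated wrapper also offers the shorter form enum.Name(1).
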
def _decode_keys(d):
    for k, v in d.items():
        if isinstance(v, dict):
            d[k] = _decode_keys(v)
        if isinstance(v, list):
            new_list = []
            for i in v:
                if isinstance(i, dict):
                    new_list.append(_decode_keys(i))
                else:
                    new_list.append(i)
            d[k] = new_list
        else:
            if k in decode_keys:
                d[k] = binary_to_hex(b64decode(v))
            else:
                d[k] = v
    return d
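
# Sketch of the transformation `_decode_keys` applies to the fields listed
# in `decode_keys` (a set of field names whose values carry base64-encoded
# binary IDs). The names and sample value below are hypothetical:
from base64 import b64decode, b64encode
import binascii


def binary_to_hex(data: bytes) -> str:
    return binascii.hexlify(data).decode()


raw_id = b"\xde\xad\xbe\xef"
b64_value = b64encode(raw_id)                # what the message carries
print(binary_to_hex(b64decode(b64_value)))  # -> "deadbeef"
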
def _gen_placement_group_info(self, placement_group_info):
    # Imported here rather than at module level; importing it at the top
    # breaks the doc build.
    from ray.core.generated.common_pb2 import PlacementStrategy

    def get_state(state):
        if state == ray.gcs_utils.PlacementGroupTableData.PENDING:
            return "PENDING"
        elif state == ray.gcs_utils.PlacementGroupTableData.CREATED:
            return "CREATED"
        else:
            return "REMOVED"

    def get_strategy(strategy):
        if strategy == PlacementStrategy.PACK:
            return "PACK"
        elif strategy == PlacementStrategy.STRICT_PACK:
            return "STRICT_PACK"
        elif strategy == PlacementStrategy.STRICT_SPREAD:
            return "STRICT_SPREAD"
        elif strategy == PlacementStrategy.SPREAD:
            return "SPREAD"
        else:
            raise ValueError(f"Invalid strategy returned: {strategy}")

    assert placement_group_info is not None
    return {
        "placement_group_id": binary_to_hex(
            placement_group_info.placement_group_id),
        "name": placement_group_info.name,
        "bundles": {
            # The value needs to be converted to a dict; otherwise the
            # payload is not serializable.
            bundle.bundle_id.bundle_index: MessageToDict(bundle)[
                "unitResources"]
            for bundle in placement_group_info.bundles
        },
        "strategy": get_strategy(placement_group_info.strategy),
        "state": get_state(placement_group_info.state),
    }
def __getstate__(self):
    """Memento generator for Trial.

    Sets RUNNING trials to PENDING. Note this can only occur if the trial
    holds a PERSISTENT checkpoint.
    """
    state = self.__dict__.copy()

    for key in self._nonjson_fields:
        state[key] = binary_to_hex(cloudpickle.dumps(state.get(key)))

    state["runner"] = None
    state["location"] = Location()
    # Avoid waiting for events that will never occur on resume.
    state["restoring_from"] = None
    state["saving_to"] = None

    state["_state_json"] = None
    state["_state_valid"] = False
    state["_default_result_or_future"] = None

    return copy.deepcopy(state)
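
# A minimal sketch of the inverse direction (hypothetical; not the
# project's actual `__setstate__`): the hex-encoded cloudpickle payloads
# written above would be restored with the `hex_to_binary` helper (the
# inverse of `binary_to_hex`, used elsewhere in this code) plus
# cloudpickle.loads. The field list is read back out of the state dict
# itself, which `__getstate__` carried along.
def __setstate__(self, state):
    for key in state.get("_nonjson_fields", []):
        # Undo the binary_to_hex(cloudpickle.dumps(...)) encoding.
        state[key] = cloudpickle.loads(hex_to_binary(state[key]))
    self.__dict__.update(state)
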
async def list_tasks(self, *, option: ListApiOptions) -> dict:
    """List all task information from the cluster.

    Returns:
        {task_id -> task_data_in_dict}
        task_data_in_dict's schema is in TaskState
    """
    replies = await asyncio.gather(*[
        self._client.get_task_info(node_id, timeout=option.timeout)
        for node_id in self._client.get_all_registered_raylet_ids()
    ])

    running_task_id = set()
    for reply in replies:
        for task_id in reply.running_task_ids:
            running_task_id.add(binary_to_hex(task_id))

    result = []
    for reply in replies:
        logger.info(reply)
        tasks = reply.owned_task_info_entries
        for task in tasks:
            data = self._message_to_dict(
                message=task,
                fields_to_decode=["task_id"],
            )
            if data["task_id"] in running_task_id:
                data["scheduling_state"] = TaskStatus.DESCRIPTOR.\
                    values_by_number[TaskStatus.RUNNING].name
            data = filter_fields(data, TaskState)
            result.append(data)

    # Sort to make the output deterministic.
    result.sort(key=lambda entry: entry["task_id"])
    return {d["task_id"]: d for d in islice(result, option.limit)}
async def list_tasks(self, *, option: ListApiOptions) -> ListApiResponse:
    """List all task information from the cluster.

    Returns:
        {task_id -> task_data_in_dict}
        task_data_in_dict's schema is in TaskState
    """
    raylet_ids = self._client.get_all_registered_raylet_ids()
    replies = await asyncio.gather(
        *[
            self._client.get_task_info(node_id, timeout=option.timeout)
            for node_id in raylet_ids
        ],
        return_exceptions=True,
    )

    unresponsive_nodes = 0
    running_task_id = set()
    successful_replies = []
    for reply in replies:
        if isinstance(reply, DataSourceUnavailable):
            unresponsive_nodes += 1
            continue
        elif isinstance(reply, Exception):
            raise reply

        successful_replies.append(reply)
        for task_id in reply.running_task_ids:
            running_task_id.add(binary_to_hex(task_id))

    partial_failure_warning = None
    if len(raylet_ids) > 0 and unresponsive_nodes > 0:
        warning_msg = NODE_QUERY_FAILURE_WARNING.format(
            type="raylet",
            total=len(raylet_ids),
            network_failures=unresponsive_nodes,
            log_command="raylet.out",
        )
        if unresponsive_nodes == len(raylet_ids):
            raise DataSourceUnavailable(warning_msg)
        partial_failure_warning = (
            f"The returned data may contain incomplete result. {warning_msg}"
        )

    result = []
    for reply in successful_replies:
        assert not isinstance(reply, Exception)
        tasks = reply.owned_task_info_entries
        for task in tasks:
            data = self._message_to_dict(
                message=task,
                fields_to_decode=["task_id"],
            )
            if data["task_id"] in running_task_id:
                data["scheduling_state"] = TaskStatus.DESCRIPTOR.values_by_number[
                    TaskStatus.RUNNING
                ].name
            result.append(data)

    result = self._filter(result, option.filters, TaskState)
    # Sort to make the output deterministic.
    result.sort(key=lambda entry: entry["task_id"])
    return ListApiResponse(
        result=list(islice(result, option.limit)),
        partial_failure_warning=partial_failure_warning,
    )
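
# Self-contained illustration (not from the source) of the fault-tolerance
# pattern the second list_tasks relies on: with return_exceptions=True,
# asyncio.gather returns raised exceptions in place of results instead of
# aborting the whole gather, so per-node failures can be counted while the
# healthy replies are still used. Names below are hypothetical.
import asyncio


class DataSourceUnavailable(Exception):
    pass


async def query(node_id):
    if node_id == "dead-node":
        raise DataSourceUnavailable(node_id)
    return f"reply-from-{node_id}"


async def main():
    replies = await asyncio.gather(
        *[query(n) for n in ["node-a", "dead-node", "node-b"]],
        return_exceptions=True,
    )
    ok = [r for r in replies if not isinstance(r, Exception)]
    failed = sum(isinstance(r, DataSourceUnavailable) for r in replies)
    print(ok, failed)  # ['reply-from-node-a', 'reply-from-node-b'] 1


asyncio.run(main())
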
def _to_cloudpickle(self, obj):
    return {
        "_type": "CLOUDPICKLE_FALLBACK",
        "value": binary_to_hex(cloudpickle.dumps(obj)),
    }
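
# Hypothetical inverse of the fallback above (a sketch; the name
# `_from_cloudpickle` is an assumption, not the project's API): recognize
# the marker dict and restore the original object via hex_to_binary, the
# inverse of binary_to_hex used throughout this code.
def _from_cloudpickle(self, data):
    if isinstance(data, dict) and data.get("_type") == "CLOUDPICKLE_FALLBACK":
        return cloudpickle.loads(hex_to_binary(data["value"]))
    return data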