def _get_rank(cluster_info):
    """Return the global rank of the current barrier task in ``cluster_info``.

    As task placement may not be identical between two different jobs, we
    cannot simply index ``cluster_info`` using the partition id to get the
    current ip and port. The approach is:

    1. Get the addresses of all tasks in this job and compute a local rank by
       counting how many tasks run on the same host with a lower partition id.
    2. Walk ``cluster_info`` and take the index of the ``local_rank + 1``-th
       entry that lives on this host; that index is the global rank.

    Hosts are compared for equality on the text before the first ``":"``.
    A plain prefix test would treat ``10.0.0.1`` and ``10.0.0.10`` as the
    same machine and yield a wrong rank.

    Args:
        cluster_info: iterable of address strings, one per slot. Entries are
            expected to be of the form ``"ip:port"`` (a bare ``"ip"`` also
            matches).

    Returns:
        The 0-based index of the matching slot. If ``cluster_info`` holds
        fewer than ``local_rank + 1`` entries for this host, the index of the
        last entry is returned (``-1`` when ``cluster_info`` is empty).
    """
    tc = BarrierTaskContext().get()
    infos = tc.getTaskInfos()
    idx = tc.partitionId()
    local_ip = infos[idx].address.split(":")[0]

    # Number of tasks on this host that have a lower partition id.
    local_rank = sum(
        1 for i in range(idx) if infos[i].address.split(":")[0] == local_ip
    )

    global_rank = -1
    local_count = 0
    for global_rank, node in enumerate(cluster_info):
        if node.split(":")[0] == local_ip:
            local_count += 1
            if local_count == local_rank + 1:
                break
    return global_rank
def find_ip_and_port(pre_iter):
    """Reserve a free TCP port on this task's host and report ``"ip:port"``.

    The ``pre_iter`` argument is accepted but not read by this function.

    Returns:
        A single-element list holding ``"<ip>:<port>"``, where ``<ip>`` is the
        host part of this task's address as reported by the barrier task
        context and ``<port>`` is the port the OS assigned to a temporary
        socket.
    """
    context = BarrierTaskContext().get()
    own_info = context.getTaskInfos()[context.partitionId()]
    host = own_info.address.split(":")[0]

    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        # Port 0 asks the OS to pick any currently free port.
        sock.bind(("", 0))
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        # The socket is kept open across the barrier, so the port stays
        # bound until every task in the stage has reached this point.
        context.barrier()
        port = sock.getsockname()[1]

    return [f"{host}:{port}"]