def create_layer_of_buckets(
    nparts_lhs: int,
    nparts_rhs: int,
    layer_idx: int,
    *,
    generator: random.Random,
) -> List[Bucket]:
    """Return the given layer of the #LHS x #RHS matrix of buckets

    The i-th layer contains the buckets (lhs, rhs) such that min(lhs, rhs) == i.
    Buckets that are the transpose of one another will be consecutive. Other
    than that, the order is random.

    """
    layer_p = Partition(layer_idx)
    pairs = [[Bucket(layer_p, layer_p)]]
    for idx in range(layer_idx + 1, max(nparts_lhs, nparts_rhs)):
        p = Partition(idx)
        pair = []
        if p < nparts_lhs:
            pair.append(Bucket(p, layer_p))
        if p < nparts_rhs:
            pair.append(Bucket(layer_p, p))
        generator.shuffle(pair)
        pairs.append(pair)
    generator.shuffle(pairs)
    return [b for p in pairs for b in p]
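# Hedged usage sketch, not part of the original module: for layer 0 of a 3x3
# bucket grid, every returned bucket has min(lhs, rhs) == 0, and each
# off-diagonal bucket sits next to its transpose. The seed is arbitrary.
import random

layer = create_layer_of_buckets(3, 3, 0, generator=random.Random(42))
assert all(min(b.lhs, b.rhs) == 0 for b in layer)
# One possible result, as (lhs, rhs) pairs: (1, 0), (0, 1), (0, 0), (2, 0),
# (0, 2) -- the transposed pairs stay adjacent, the rest is shuffled.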
def create_buckets_ordered_lexicographically(
    nparts_lhs: int, nparts_rhs: int
) -> List[Bucket]:
    """Return buckets in increasing LHS and, for the same LHS, in increasing RHS"""
    buckets = [
        Bucket(Partition(lhs), Partition(rhs))
        for lhs in range(nparts_lhs)
        for rhs in range(nparts_rhs)
    ]
    return buckets
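# Sketch: the lexicographic order for a 2x3 grid. Unlike the shuffled layouts
# above and below, this enumeration is fully deterministic.
buckets = create_buckets_ordered_lexicographically(2, 3)
# As (lhs, rhs) pairs: (0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2).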
def acquire_bucket(
    self,
    rank: Rank,
    maybe_old_bucket: Optional[Bucket] = None,
) -> Tuple[Optional[Bucket], int]:
    """
    Finds a (lhs, rhs) partition pair that has not already been acquired
    this epoch, and where neither the lhs nor rhs partitions are currently
    locked. Locks this lhs and rhs until `release_pair` is called. Will try
    to find a pair that has the same lhs as `maybe_old_bucket` (or, failing
    that, the same rhs).

    If no pair is available, returns None.

    Returns:
        pair: a (lhs, rhs) partition pair. lhs and rhs are locked until
              `release_pair` is called.
              If no pair is available, None is returned.
        remaining: The number of pairs remaining. When this is 0 the epoch
                   is done.
    """
    remaining = self.nparts_lhs * self.nparts_rhs - len(self.done)
    locked_entities_parts: Dict[Tuple[EntityName, Partition], Rank] = {}
    for bucket, other_rank in self.active.items():
        locked_entities_parts.update(
            ((entity, bucket.lhs), other_rank)
            for entity in self.entities_lhs
        )
        locked_entities_parts.update(
            ((entity, bucket.rhs), other_rank)
            for entity in self.entities_rhs
        )

    acquirable_lhs_parts: List[Partition] = []
    for part in range(self.nparts_lhs):
        if self._can_acquire(rank, part, locked_entities_parts, Side.LHS):
            acquirable_lhs_parts.append(part)

    acquirable_rhs_parts: List[Partition] = []
    for part in range(self.nparts_rhs):
        if self._can_acquire(rank, part, locked_entities_parts, Side.RHS):
            acquirable_rhs_parts.append(part)

    acquirable_buckets: List[Bucket] = []
    for part_lhs in acquirable_lhs_parts:
        for part_rhs in acquirable_rhs_parts:
            bucket = Bucket(part_lhs, part_rhs)
            if bucket not in self.done and self._is_initialized(bucket):
                acquirable_buckets.append(bucket)

    if len(acquirable_buckets) == 0:
        return None, remaining

    new_bucket = self._pick_bucket(acquirable_buckets, maybe_old_bucket)
    self.active[new_bucket] = rank
    self.done.add(new_bucket)
    if self.initialized_entities_partitions is not None:
        self.initialized_entities_partitions.update(
            (entity, new_bucket.lhs) for entity in self.entities_lhs
        )
        self.initialized_entities_partitions.update(
            (entity, new_bucket.rhs) for entity in self.entities_rhs
        )
    logger.info(
        f"Bucket {new_bucket} acquired by trainer {rank}: active={self.active}"
    )
    return new_bucket, remaining
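# Hedged sketch of the trainer-side loop this method supports; it is not part
# of the module. `server`, `rank`, and `train_on` are illustrative
# placeholders, and the release call is named after the docstring's mention of
# `release_pair` (its exact signature is an assumption).
old_bucket = None
while True:
    bucket, remaining = server.acquire_bucket(rank, maybe_old_bucket=old_bucket)
    if bucket is None:
        if remaining == 0:
            break  # every bucket was trained on: the epoch is done
        continue  # all candidates are locked; a real trainer would wait here
    train_on(bucket)  # hypothetical per-bucket training step
    server.release_pair(bucket)  # unlocks the bucket's lhs and rhs partitions
    old_bucket = bucket  # bias the next pick toward shared partitions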
def create_buckets_ordered_by_affinity(
    nparts_lhs: int,
    nparts_rhs: int,
    *,
    generator: random.Random,
) -> List[Bucket]:
    """Try having consecutive buckets share as many partitions as possible.

    Start from a random bucket. While there are buckets left, try to choose
    the next one so that it has as many partitions in common as possible with
    the previous one. When multiple options are available, pick one randomly.

    """
    if nparts_lhs <= 0 or nparts_rhs <= 0:
        return []

    # TODO Change this function to use the same cost model as the LockServer
    # when computing affinity (based on the number of entities to save and
    # load) rather than just the number of partitions in common. Pay attention
    # to keep the complexity of this algorithm linear in the number of
    # buckets. This comment is too short to give a full description, but the
    # idea is that only a few transitions are possible between a bucket and
    # the next: the one that preserves all (ent, part) pairs, the one that
    # preserves only the lhs ones, only the rhs ones, only the intersection of
    # the two, or none at all. So we can keep a dict from sets of (ent, part)
    # to lists of buckets, and insert each bucket into four of those lists,
    # namely the ones for all its (ent, part), its lhs ones, its rhs ones and
    # the intersection of its lhs and rhs ones. Then, when looking for the
    # next bucket, we figure out the transition that is cheapest (among the
    # options defined above), determine the set of (ent, part) we need to move
    # to in order to achieve that transition type and we look up in the dict
    # to find a bucket containing those (ent, part).

    # This is our "source of truth" on what buckets we haven't outputted yet.
    # It can be queried in constant time.
    remaining: Set[Bucket] = set()
    # These are our random orders: we shuffle them once and then pop from the
    # end. Each bucket appears in several of them. They are updated lazily,
    # which means they may contain buckets that have already been outputted.
    all_buckets: List[Bucket] = []
    buckets_per_partition: List[List[Bucket]] = [
        [] for _ in range(max(nparts_lhs, nparts_rhs))
    ]
    for lhs in range(nparts_lhs):
        for rhs in range(nparts_rhs):
            b = Bucket(Partition(lhs), Partition(rhs))
            remaining.add(b)
            all_buckets.append(b)
            buckets_per_partition[lhs].append(b)
            buckets_per_partition[rhs].append(b)
    generator.shuffle(all_buckets)
    for buckets in buckets_per_partition:
        generator.shuffle(buckets)

    b = all_buckets.pop()
    remaining.remove(b)
    order = [b]

    while remaining:
        transposed_b = Bucket(b.rhs, b.lhs)
        if transposed_b in remaining:
            remaining.remove(transposed_b)
            order.append(transposed_b)
            if not remaining:
                break

        same_as_lhs = buckets_per_partition[b.lhs]
        same_as_rhs = buckets_per_partition[b.rhs]
        while len(same_as_lhs) > 0 or len(same_as_rhs) > 0:
            (chosen,) = generator.choices(
                [same_as_lhs, same_as_rhs],
                weights=[len(same_as_lhs), len(same_as_rhs)],
            )
            next_b = chosen.pop()
            if next_b in remaining:
                break
        else:
            while True:
                next_b = all_buckets.pop()
                if next_b in remaining:
                    break
        remaining.remove(next_b)
        order.append(next_b)
        b = next_b

    return order
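# Hedged sketch of the affinity property: every bucket appears exactly once,
# and consecutive buckets share at least one partition whenever the
# lazily-updated pools still hold a match; only the random fallback branch can
# break the chain. The seed is arbitrary.
order = create_buckets_ordered_by_affinity(3, 3, generator=random.Random(0))
assert set(order) == {
    Bucket(Partition(l), Partition(r)) for l in range(3) for r in range(3)
}
shared = [
    bool({a.lhs, a.rhs} & {b.lhs, b.rhs}) for a, b in zip(order, order[1:])
]  # mostly True; False only right after a fallback to a fresh random bucket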