def get_more_nodes(self, part):
    """
    Generator to get extra nodes for a partition for hinted handoff.

    :param part: partition to get handoff nodes for
    :returns: generator of node dicts

    See :func:`get_nodes` for a description of the node dicts.
    """
    if time() > self._rtime:
        self._reload()
    used_tiers = set()
    for part2dev_id in self._replica2part2dev_id:
        for tier in tiers_for_dev(self.devs[part2dev_id[part]]):
            used_tiers.add(tier)
    for level in self.tiers_by_length:
        tiers = list(level)
        while tiers:
            tier = tiers.pop(part % len(tiers))
            if tier in used_tiers:
                continue
            for i in xrange(len(self.tier2devs[tier])):
                dev = self.tier2devs[tier][(part + i) %
                                           len(self.tier2devs[tier])]
                if not dev.get('weight'):
                    continue
                yield dev
                used_tiers.update(tiers_for_dev(dev))
                break
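# --- Illustrative usage sketch (not part of the ring code above) ---
# A caller typically takes the primary nodes from get_nodes() and then
# pulls a few extra handoff nodes from get_more_nodes(). The ring path,
# the account/container/object names, and the "2x primaries" cutoff are
# assumptions for illustration only.
from swift.common.ring import Ring

ring = Ring('/etc/swift/object.ring.gz')
part, primaries = ring.get_nodes('AUTH_test', 'container', 'object')

nodes = list(primaries)
for handoff in ring.get_more_nodes(part):  # generator defined above
    nodes.append(handoff)
    if len(nodes) >= 2 * len(primaries):
        break  # the generator can yield many devices; take just a few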
def test_tiers_for_dev(self):
    self.assertEqual(
        tiers_for_dev(self.test_dev),
        ((1,),
         (1, 1),
         (1, 1, '192.168.1.1'),
         (1, 1, '192.168.1.1', 0)))
def _rebuild_tier_data(self):
    self.tier2devs = defaultdict(list)
    for dev in self._devs:
        if not dev:
            continue
        for tier in tiers_for_dev(dev):
            self.tier2devs[tier].append(dev)

    tiers_by_length = defaultdict(list)
    for tier in self.tier2devs:
        tiers_by_length[len(tier)].append(tier)
    self.tiers_by_length = sorted(tiers_by_length.values(),
                                  key=lambda x: len(x[0]))
    for tiers in self.tiers_by_length:
        tiers.sort()
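# --- Illustrative sketch of the tier tuples being grouped (assumption) ---
# The test_tiers_for_dev variants in this section pin the format down as
# (region,), (region, zone), (region, zone, ip), (region, zone, ip, id);
# some versions use an "ip:port" string for the third element. The helper
# below is a stand-in for tiers_for_dev, not the real implementation.
from collections import defaultdict

def tiers_for_dev_sketch(dev):
    return ((dev['region'],),
            (dev['region'], dev['zone']),
            (dev['region'], dev['zone'], dev['ip']),
            (dev['region'], dev['zone'], dev['ip'], dev['id']))

devs = [{'id': 0, 'region': 1, 'zone': 1, 'ip': '192.168.1.1', 'weight': 100},
        {'id': 1, 'region': 1, 'zone': 2, 'ip': '192.168.1.2', 'weight': 100}]
tier2devs = defaultdict(list)
for dev in devs:
    for tier in tiers_for_dev_sketch(dev):
        tier2devs[tier].append(dev)

assert len(tier2devs[(1,)]) == 2    # both devices share the region tier
assert len(tier2devs[(1, 2)]) == 1  # only device 1 sits in zone 2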
def test_normalized_device_tier_names(self):
    rb = ring.RingBuilder(8, 3, 0)
    rb.add_dev({
        'region': 1,
        'zone': 1,
        'ip': '127.0.0.1',
        'port': 6011,
        'device': 'd1',
        'weight': 0.0,
    })
    dev = rb.devs[0]
    expected = 'r1z1-127.0.0.1/d1'
    self.assertEqual(expected, get_tier_name(tiers_for_dev(dev)[-1], rb))
    self.assertEqual(expected, pretty_dev(dev))
def _reassign_parts(self, reassign_parts):
    """
    For an existing ring data set, partitions are reassigned similarly to
    the initial assignment. The devices are ordered by how many partitions
    they still want and kept in that order throughout the process. The
    gathered partitions are iterated through, assigning them to devices
    according to the "most wanted" while keeping the replicas as "far
    apart" as possible. Two different regions are considered the
    farthest-apart things, followed by zones, then different ip/port pairs
    within a zone; the least-far-apart things are different devices with
    the same ip/port pair in the same zone.

    If you want more replicas than devices, you won't get all your
    replicas.

    :param reassign_parts: An iterable of (part, replicas_to_replace)
                           pairs. replicas_to_replace is an iterable of the
                           replica (an int) to replace for that partition.
                           replicas_to_replace may be shared for multiple
                           partitions, so be sure you do not modify it.
    """
    for dev in self._iter_devs():
        dev['sort_key'] = self._sort_key_for(dev)
    available_devs = \
        sorted((d for d in self._iter_devs() if d['weight']),
               key=lambda x: x['sort_key'])

    tier2devs = defaultdict(list)
    tier2sort_key = defaultdict(list)
    max_tier_depth = 0
    for dev in available_devs:
        for tier in tiers_for_dev(dev):
            tier2devs[tier].append(dev)  # <-- starts out sorted!
            tier2sort_key[tier].append(dev['sort_key'])
            if len(tier) > max_tier_depth:
                max_tier_depth = len(tier)

    tier2children_sets = build_tier_tree(available_devs)
    tier2children = defaultdict(list)
    tier2children_sort_key = {}
    tiers_list = [()]
    depth = 1
    while depth <= max_tier_depth:
        new_tiers_list = []
        for tier in tiers_list:
            child_tiers = list(tier2children_sets[tier])
            child_tiers.sort(key=lambda t: tier2sort_key[t][-1])
            tier2children[tier] = child_tiers
            tier2children_sort_key[tier] = map(
                lambda t: tier2sort_key[t][-1], child_tiers)
            new_tiers_list.extend(child_tiers)
        tiers_list = new_tiers_list
        depth += 1

    for part, replace_replicas in reassign_parts:
        # Gather up what other tiers (regions, zones, ip/ports, and
        # devices) the replicas not-to-be-moved are in for this part.
        other_replicas = defaultdict(int)
        unique_tiers_by_tier_len = defaultdict(set)
        for replica in self._replicas_for_part(part):
            if replica not in replace_replicas:
                dev = self.devs[self._replica2part2dev[replica][part]]
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1
                    unique_tiers_by_tier_len[len(tier)].add(tier)

        for replica in replace_replicas:
            tier = ()
            depth = 1
            while depth <= max_tier_depth:
                # Order the tiers by how many replicas of this
                # partition they already have. Then, of the ones
                # with the smallest number of replicas, pick the
                # tier with the hungriest drive and then continue
                # searching in that subtree.
                #
                # There are other strategies we could use here,
                # such as hungriest-tier (i.e. biggest
                # sum-of-parts-wanted) or picking one at random.
                # However, hungriest-drive is what was used here
                # before, and it worked pretty well in practice.
                #
                # Note that this allocator will balance things as
                # evenly as possible at each level of the device
                # layout. If your layout is extremely unbalanced,
                # this may produce poor results.
                #
                # This used to be a cute, recursive function, but it's
                # been unrolled for performance.
                candidate_tiers = tier2children[tier]
                candidates_with_replicas = \
                    unique_tiers_by_tier_len[len(tier) + 1]
                if len(candidate_tiers) > len(candidates_with_replicas):
                    # There exists at least one tier with 0 other
                    # replicas, so work backward among the candidates,
                    # accepting the first which isn't in other_replicas.
                    #
                    # This optimization is to avoid calling the min()
                    # below, which is expensive if you've got thousands
                    # of drives.
                    for t in reversed(candidate_tiers):
                        if other_replicas[t] == 0:
                            tier = t
                            break
                else:
                    min_count = min(other_replicas[t]
                                    for t in candidate_tiers)
                    tier = (t for t in reversed(candidate_tiers)
                            if other_replicas[t] == min_count).next()
                depth += 1

            dev = tier2devs[tier][-1]
            dev['parts_wanted'] -= 1
            dev['parts'] += 1
            old_sort_key = dev['sort_key']
            new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
            for tier in tiers_for_dev(dev):
                other_replicas[tier] += 1
                unique_tiers_by_tier_len[len(tier)].add(tier)

                index = bisect.bisect_left(tier2sort_key[tier],
                                           old_sort_key)
                tier2devs[tier].pop(index)
                tier2sort_key[tier].pop(index)

                new_index = bisect.bisect_left(tier2sort_key[tier],
                                               new_sort_key)
                tier2devs[tier].insert(new_index, dev)
                tier2sort_key[tier].insert(new_index, new_sort_key)

                # Now jiggle tier2children values to keep them sorted
                new_last_sort_key = tier2sort_key[tier][-1]
                parent_tier = tier[0:-1]
                index = bisect.bisect_left(
                    tier2children_sort_key[parent_tier], old_sort_key)
                popped = tier2children[parent_tier].pop(index)
                tier2children_sort_key[parent_tier].pop(index)

                new_index = bisect.bisect_left(
                    tier2children_sort_key[parent_tier], new_last_sort_key)
                tier2children[parent_tier].insert(new_index, popped)
                tier2children_sort_key[parent_tier].insert(
                    new_index, new_last_sort_key)

            self._replica2part2dev[replica][part] = dev['id']

    # Just to save memory and keep from accidental reuse.
    for dev in self._iter_devs():
        del dev['sort_key']
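# --- Standalone sketch of the bisect "re-sort" pattern used above ---
# tier2devs and tier2sort_key are kept as parallel, ascending lists; after
# a device's sort key changes, it is popped from its old position and
# re-inserted at the new one. Device names and key values are made up, and
# this sketch assumes keys are unique within a tier.
import bisect

devs = ['d0', 'd1', 'd2']  # sorted by the parallel keys below
keys = [10, 20, 30]        # ascending sort keys; last entry = hungriest

def resort(dev, old_key, new_key):
    i = bisect.bisect_left(keys, old_key)  # where the device sits now
    devs.pop(i)
    keys.pop(i)
    j = bisect.bisect_left(keys, new_key)  # where it belongs afterwards
    devs.insert(j, dev)
    keys.insert(j, new_key)

resort('d2', 30, 5)  # d2 just received a partition and is now less hungry
assert devs == ['d2', 'd0', 'd1'] and keys == [5, 10, 20]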
def _gather_reassign_parts(self):
    """
    Returns a list of (partition, replicas) pairs to be reassigned by
    gathering from removed devices, insufficiently-far-apart replicas, and
    overweight drives.
    """
    # inline memoization of tiers_for_dev() results (profiling reveals it
    # as a hot-spot).
    tfd = {}

    # First we gather partitions from removed devices. Since removed
    # devices usually indicate device failures, we have no choice but to
    # reassign these partitions. However, we mark them as moved so later
    # choices will skip other replicas of the same partition if possible.
    removed_dev_parts = defaultdict(list)
    if self._remove_devs:
        dev_ids = [d['id'] for d in self._remove_devs if d['parts']]
        if dev_ids:
            for part, replica in self._each_part_replica():
                dev_id = self._replica2part2dev[replica][part]
                if dev_id in dev_ids:
                    self._last_part_moves[part] = 0
                    removed_dev_parts[part].append(replica)

    # Now we gather partitions that are "at risk" because they aren't
    # currently sufficiently spread out across the cluster.
    spread_out_parts = defaultdict(list)
    max_allowed_replicas = self._build_max_replicas_by_tier()
    for part in xrange(self.parts):
        # Only move one replica at a time if possible.
        if part in removed_dev_parts:
            continue

        # First, add up the count of replicas at each tier for each
        # partition.
        # replicas_at_tier was a "lambda: 0" defaultdict, but profiling
        # revealed the lambda invocation as a significant cost.
        replicas_at_tier = {}
        for dev in self._devs_for_part(part):
            if dev['id'] not in tfd:
                tfd[dev['id']] = tiers_for_dev(dev)
            for tier in tfd[dev['id']]:
                if tier not in replicas_at_tier:
                    replicas_at_tier[tier] = 1
                else:
                    replicas_at_tier[tier] += 1

        # Now, look for partitions not yet spread out enough and not
        # recently moved.
        for replica in self._replicas_for_part(part):
            dev = self.devs[self._replica2part2dev[replica][part]]
            removed_replica = False
            if dev['id'] not in tfd:
                tfd[dev['id']] = tiers_for_dev(dev)
            for tier in tfd[dev['id']]:
                rep_at_tier = 0
                if tier in replicas_at_tier:
                    rep_at_tier = replicas_at_tier[tier]
                if (rep_at_tier > max_allowed_replicas[tier] and
                        self._last_part_moves[part] >=
                        self.min_part_hours):
                    self._last_part_moves[part] = 0
                    spread_out_parts[part].append(replica)
                    dev['parts_wanted'] += 1
                    dev['parts'] -= 1
                    removed_replica = True
                    break
            if removed_replica:
                if dev['id'] not in tfd:
                    tfd[dev['id']] = tiers_for_dev(dev)
                for tier in tfd[dev['id']]:
                    replicas_at_tier[tier] -= 1

    # Last, we gather partitions from devices that are "overweight"
    # because they have more partitions than their parts_wanted.
    reassign_parts = defaultdict(list)

    # We randomly pick a new starting point in the "circular" ring of
    # partitions to try to get a better rebalance when called multiple
    # times.
    start = self._last_part_gather_start / 4
    start += random.randint(0, self.parts / 2)  # GRAH PEP8!!!
    self._last_part_gather_start = start
    for replica, part2dev in enumerate(self._replica2part2dev):
        # If we've got a partial replica, start may be out of range.
        # Scale it down so that we get a similar movement pattern (but
        # scaled down) on sequential runs.
        this_start = int(float(start) * len(part2dev) / self.parts)
        for part in itertools.chain(xrange(this_start, len(part2dev)),
                                    xrange(0, this_start)):
            if self._last_part_moves[part] < self.min_part_hours:
                continue
            if part in removed_dev_parts or part in spread_out_parts:
                continue
            dev = self.devs[part2dev[part]]
            if dev['parts_wanted'] < 0:
                self._last_part_moves[part] = 0
                dev['parts_wanted'] += 1
                dev['parts'] -= 1
                reassign_parts[part].append(replica)

    reassign_parts.update(spread_out_parts)
    reassign_parts.update(removed_dev_parts)

    reassign_parts_list = list(reassign_parts.iteritems())
    # We shuffle the partitions to reassign so we get a more even
    # distribution later. There has been discussion of trying to
    # distribute partitions more "regularly" because that would actually
    # reduce risk but 1) it is really difficult to do this with uneven
    # clusters and 2) it would concentrate load during failure recovery
    # scenarios (increasing risk). The "right" answer has yet to be
    # debated to conclusion, but working code wins for now.
    random.shuffle(reassign_parts_list)
    return reassign_parts_list
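# --- Worked illustration of the scaled gather start (numbers made up) ---
# A partial replica row is shorter than self.parts, so the randomized
# start is scaled down before sweeping the row circularly.
import itertools

parts = 256    # stand-in for self.parts
start = 200    # stand-in for the randomized gather start
row_len = 96   # a partial replica row covering only 96 partitions

this_start = int(float(start) * row_len / parts)  # == 75
sweep = list(itertools.chain(xrange(this_start, row_len),
                             xrange(0, this_start)))
assert sweep[0] == 75 and sweep[-1] == 74 and len(sweep) == row_len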
def _reassign_parts(self, reassign_parts):
    """
    For an existing ring data set, partitions are reassigned similarly to
    the initial assignment. The devices are ordered by how many partitions
    they still want and kept in that order throughout the process. The
    gathered partitions are iterated through, assigning them to devices
    according to the "most wanted" while keeping the replicas as "far
    apart" as possible. Two different regions are considered the
    farthest-apart things, followed by zones, then different ip/port pairs
    within a zone; the least-far-apart things are different devices with
    the same ip/port pair in the same zone.

    If you want more replicas than devices, you won't get all your
    replicas.

    :param reassign_parts: An iterable of (part, replicas_to_replace)
                           pairs. replicas_to_replace is an iterable of the
                           replica (an int) to replace for that partition.
                           replicas_to_replace may be shared for multiple
                           partitions, so be sure you do not modify it.
    """
    for dev in self._iter_devs():
        dev["sort_key"] = self._sort_key_for(dev)
    available_devs = sorted((d for d in self._iter_devs() if d["weight"]),
                            key=lambda x: x["sort_key"])

    tier2devs = defaultdict(list)
    tier2sort_key = defaultdict(tuple)
    tier2dev_sort_key = defaultdict(list)
    max_tier_depth = 0
    for dev in available_devs:
        dev["tiers"] = tiers_for_dev(dev)
        for tier in dev["tiers"]:
            tier2devs[tier].append(dev)  # <-- starts out sorted!
            tier2dev_sort_key[tier].append(dev["sort_key"])
            tier2sort_key[tier] = dev["sort_key"]
            if len(tier) > max_tier_depth:
                max_tier_depth = len(tier)

    tier2children_sets = build_tier_tree(available_devs)
    tier2children = defaultdict(list)
    tier2children_sort_key = {}
    tiers_list = [()]
    depth = 1
    while depth <= max_tier_depth:
        new_tiers_list = []
        for tier in tiers_list:
            child_tiers = list(tier2children_sets[tier])
            child_tiers.sort(key=tier2sort_key.__getitem__)
            tier2children[tier] = child_tiers
            tier2children_sort_key[tier] = map(tier2sort_key.__getitem__,
                                               child_tiers)
            new_tiers_list.extend(child_tiers)
        tiers_list = new_tiers_list
        depth += 1

    for part, replace_replicas in reassign_parts:
        # Gather up what other tiers (regions, zones, ip/ports, and
        # devices) the replicas not-to-be-moved are in for this part.
        other_replicas = defaultdict(int)
        unique_tiers_by_tier_len = defaultdict(set)
        for replica in self._replicas_for_part(part):
            if replica not in replace_replicas:
                dev = self.devs[self._replica2part2dev[replica][part]]
                for tier in dev["tiers"]:
                    other_replicas[tier] += 1
                    unique_tiers_by_tier_len[len(tier)].add(tier)

        for replica in replace_replicas:
            tier = ()
            depth = 1
            while depth <= max_tier_depth:
                # Order the tiers by how many replicas of this
                # partition they already have. Then, of the ones
                # with the smallest number of replicas, pick the
                # tier with the hungriest drive and then continue
                # searching in that subtree.
                #
                # There are other strategies we could use here,
                # such as hungriest-tier (i.e. biggest
                # sum-of-parts-wanted) or picking one at random.
                # However, hungriest-drive is what was used here
                # before, and it worked pretty well in practice.
                #
                # Note that this allocator will balance things as
                # evenly as possible at each level of the device
                # layout. If your layout is extremely unbalanced,
                # this may produce poor results.
                #
                # This used to be a cute, recursive function, but it's
                # been unrolled for performance.
                #
                # We sort the tiers here so that, when we look for a tier
                # with the lowest number of replicas, the first one we
                # find is the one with the hungriest drive (i.e. drive
                # with the largest sort_key value). This lets us
                # short-circuit the search while still ensuring we get the
                # right tier.
                candidates_with_replicas = \
                    unique_tiers_by_tier_len[len(tier) + 1]
                # Find a tier with the minimal replica count and the
                # hungriest drive among all the tiers with the minimal
                # replica count.
                if len(tier2children[tier]) > len(candidates_with_replicas):
                    # There exists at least one tier with 0 other replicas
                    tier = max((t for t in tier2children[tier]
                                if other_replicas[t] == 0),
                               key=tier2sort_key.__getitem__)
                else:
                    tier = max(tier2children[tier],
                               key=lambda t: (-other_replicas[t],
                                              tier2sort_key[t]))
                depth += 1

            dev = tier2devs[tier][-1]
            dev["parts_wanted"] -= 1
            dev["parts"] += 1
            old_sort_key = dev["sort_key"]
            new_sort_key = dev["sort_key"] = self._sort_key_for(dev)
            for tier in dev["tiers"]:
                other_replicas[tier] += 1
                unique_tiers_by_tier_len[len(tier)].add(tier)

                index = bisect.bisect_left(tier2dev_sort_key[tier],
                                           old_sort_key)
                tier2devs[tier].pop(index)
                tier2dev_sort_key[tier].pop(index)

                new_index = bisect.bisect_left(tier2dev_sort_key[tier],
                                               new_sort_key)
                tier2devs[tier].insert(new_index, dev)
                tier2dev_sort_key[tier].insert(new_index, new_sort_key)

                new_last_sort_key = tier2dev_sort_key[tier][-1]
                tier2sort_key[tier] = new_last_sort_key

                # Now jiggle tier2children values to keep them sorted
                parent_tier = tier[0:-1]
                index = bisect.bisect_left(
                    tier2children_sort_key[parent_tier], old_sort_key)
                popped = tier2children[parent_tier].pop(index)
                tier2children_sort_key[parent_tier].pop(index)

                new_index = bisect.bisect_left(
                    tier2children_sort_key[parent_tier], new_last_sort_key)
                tier2children[parent_tier].insert(new_index, popped)
                tier2children_sort_key[parent_tier].insert(
                    new_index, new_last_sort_key)

            self._replica2part2dev[replica][part] = dev["id"]

    # Just to save memory and keep from accidental reuse.
    for dev in self._iter_devs():
        del dev["sort_key"]
        dev.pop("tiers", None)  # May be absent for devices w/o weight
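# --- Tiny illustration of the tie-breaking key used above (made-up data) ---
# max() with (-other_replicas[t], tier2sort_key[t]) prefers the tier with
# the fewest replicas already placed, then the hungriest drive.
other_replicas = {('z1',): 1, ('z2',): 0, ('z3',): 0}
tier2sort_key = {('z1',): 90, ('z2',): 40, ('z3',): 70}

chosen = max(other_replicas,
             key=lambda t: (-other_replicas[t], tier2sort_key[t]))
assert chosen == ('z3',)  # zero replicas, and hungrier than ('z2',)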
def _reassign_parts(self, reassign_parts):
    """
    For an existing ring data set, partitions are reassigned similarly to
    the initial assignment. The devices are ordered by how many partitions
    they still want and kept in that order throughout the process. The
    gathered partitions are iterated through, assigning them to devices
    according to the "most wanted" while keeping the replicas as "far
    apart" as possible. Two different zones are considered the
    farthest-apart things, followed by different ip/port pairs within a
    zone; the least-far-apart things are different devices with the same
    ip/port pair in the same zone.

    If you want more replicas than devices, you won't get all your
    replicas.

    :param reassign_parts: An iterable of (part, replicas_to_replace)
                           pairs. replicas_to_replace is an iterable of the
                           replica (an int) to replace for that partition.
                           replicas_to_replace may be shared for multiple
                           partitions, so be sure you do not modify it.
    """
    for dev in self._iter_devs():
        dev['sort_key'] = self._sort_key_for(dev)
    available_devs = \
        sorted((d for d in self._iter_devs() if d['weight']),
               key=lambda x: x['sort_key'])

    tier2devs = defaultdict(list)
    tier2sort_key = defaultdict(list)
    max_tier_depth = 0
    for dev in available_devs:
        for tier in tiers_for_dev(dev):
            tier2devs[tier].append(dev)  # <-- starts out sorted!
            tier2sort_key[tier].append(dev['sort_key'])
            if len(tier) > max_tier_depth:
                max_tier_depth = len(tier)

    tier2children_sets = build_tier_tree(available_devs)
    tier2children = defaultdict(list)
    tier2children_sort_key = {}
    tiers_list = [()]
    depth = 1
    while depth <= max_tier_depth:
        new_tiers_list = []
        for tier in tiers_list:
            child_tiers = list(tier2children_sets[tier])
            child_tiers.sort(key=lambda t: tier2sort_key[t][-1])
            tier2children[tier] = child_tiers
            tier2children_sort_key[tier] = map(
                lambda t: tier2sort_key[t][-1], child_tiers)
            new_tiers_list.extend(child_tiers)
        tiers_list = new_tiers_list
        depth += 1

    for part, replace_replicas in reassign_parts:
        # Gather up what other tiers (zones, ip_ports, and devices) the
        # replicas not-to-be-moved are in for this part.
        other_replicas = defaultdict(int)
        unique_tiers_by_tier_len = defaultdict(set)
        for replica in self._replicas_for_part(part):
            if replica not in replace_replicas:
                dev = self.devs[self._replica2part2dev[replica][part]]
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1
                    unique_tiers_by_tier_len[len(tier)].add(tier)

        for replica in replace_replicas:
            tier = ()
            depth = 1
            while depth <= max_tier_depth:
                # Order the tiers by how many replicas of this
                # partition they already have. Then, of the ones
                # with the smallest number of replicas, pick the
                # tier with the hungriest drive and then continue
                # searching in that subtree.
                #
                # There are other strategies we could use here,
                # such as hungriest-tier (i.e. biggest
                # sum-of-parts-wanted) or picking one at random.
                # However, hungriest-drive is what was used here
                # before, and it worked pretty well in practice.
                #
                # Note that this allocator will balance things as
                # evenly as possible at each level of the device
                # layout. If your layout is extremely unbalanced,
                # this may produce poor results.
                #
                # This used to be a cute, recursive function, but it's
                # been unrolled for performance.
                candidate_tiers = tier2children[tier]
                candidates_with_replicas = \
                    unique_tiers_by_tier_len[len(tier) + 1]
                if len(candidate_tiers) > len(candidates_with_replicas):
                    # There exists at least one tier with 0 other
                    # replicas, so work backward among the candidates,
                    # accepting the first which isn't in other_replicas.
                    #
                    # This optimization is to avoid calling the min()
                    # below, which is expensive if you've got thousands
                    # of drives.
                    for t in reversed(candidate_tiers):
                        if other_replicas[t] == 0:
                            tier = t
                            break
                else:
                    min_count = min(other_replicas[t]
                                    for t in candidate_tiers)
                    tier = (t for t in reversed(candidate_tiers)
                            if other_replicas[t] == min_count).next()
                depth += 1

            dev = tier2devs[tier][-1]
            dev['parts_wanted'] -= 1
            dev['parts'] += 1
            old_sort_key = dev['sort_key']
            new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
            for tier in tiers_for_dev(dev):
                other_replicas[tier] += 1
                unique_tiers_by_tier_len[len(tier)].add(tier)

                index = bisect.bisect_left(tier2sort_key[tier],
                                           old_sort_key)
                tier2devs[tier].pop(index)
                tier2sort_key[tier].pop(index)

                new_index = bisect.bisect_left(tier2sort_key[tier],
                                               new_sort_key)
                tier2devs[tier].insert(new_index, dev)
                tier2sort_key[tier].insert(new_index, new_sort_key)

                # Now jiggle tier2children values to keep them sorted
                new_last_sort_key = tier2sort_key[tier][-1]
                parent_tier = tier[0:-1]
                index = bisect.bisect_left(
                    tier2children_sort_key[parent_tier], old_sort_key)
                popped = tier2children[parent_tier].pop(index)
                tier2children_sort_key[parent_tier].pop(index)

                new_index = bisect.bisect_left(
                    tier2children_sort_key[parent_tier], new_last_sort_key)
                tier2children[parent_tier].insert(new_index, popped)
                tier2children_sort_key[parent_tier].insert(
                    new_index, new_last_sort_key)

            self._replica2part2dev[replica][part] = dev['id']

    # Just to save memory and keep from accidental reuse.
    for dev in self._iter_devs():
        del dev['sort_key']
def _reassign_parts(self, reassign_parts):
    """
    For an existing ring data set, partitions are reassigned similarly to
    the initial assignment. The devices are ordered by how many partitions
    they still want and kept in that order throughout the process. The
    gathered partitions are iterated through, assigning them to devices
    according to the "most wanted" while keeping the replicas as "far
    apart" as possible. Two different regions are considered the
    farthest-apart things, followed by zones, then different ip/port pairs
    within a zone; the least-far-apart things are different devices with
    the same ip/port pair in the same zone.

    If you want more replicas than devices, you won't get all your
    replicas.

    :param reassign_parts: An iterable of (part, replicas_to_replace)
                           pairs. replicas_to_replace is an iterable of the
                           replica (an int) to replace for that partition.
                           replicas_to_replace may be shared for multiple
                           partitions, so be sure you do not modify it.
    """
    parts_available_in_tier = defaultdict(int)
    for dev in self._iter_devs():
        dev["sort_key"] = self._sort_key_for(dev)
        tiers = tiers_for_dev(dev)
        dev["tiers"] = tiers
        for tier in tiers:
            # Note: this represents how many partitions may be assigned to
            # a given tier (region/zone/server/disk). It does not take
            # into account how many partitions a given tier wants to shed.
            #
            # If we did not do this, we could have a zone where, at some
            # point during assignment, number-of-parts-to-gain equals
            # number-of-parts-to-shed. At that point, no further placement
            # into that zone would occur since its parts_available_in_tier
            # would be 0. This would happen any time a zone had any device
            # with partitions to shed, which is any time a device is being
            # removed, which is a pretty frequent operation.
            parts_available_in_tier[tier] += max(dev["parts_wanted"], 0)

    available_devs = sorted((d for d in self._iter_devs() if d["weight"]),
                            key=lambda x: x["sort_key"])

    tier2devs = defaultdict(list)
    tier2sort_key = defaultdict(tuple)
    tier2dev_sort_key = defaultdict(list)
    max_tier_depth = 0
    for dev in available_devs:
        for tier in dev["tiers"]:
            tier2devs[tier].append(dev)  # <-- starts out sorted!
            tier2dev_sort_key[tier].append(dev["sort_key"])
            tier2sort_key[tier] = dev["sort_key"]
            if len(tier) > max_tier_depth:
                max_tier_depth = len(tier)

    tier2children_sets = build_tier_tree(available_devs)
    tier2children = defaultdict(list)
    tier2children_sort_key = {}
    tiers_list = [()]
    depth = 1
    while depth <= max_tier_depth:
        new_tiers_list = []
        for tier in tiers_list:
            child_tiers = list(tier2children_sets[tier])
            child_tiers.sort(key=tier2sort_key.__getitem__)
            tier2children[tier] = child_tiers
            tier2children_sort_key[tier] = map(tier2sort_key.__getitem__,
                                               child_tiers)
            new_tiers_list.extend(child_tiers)
        tiers_list = new_tiers_list
        depth += 1

    for part, replace_replicas in reassign_parts:
        # Gather up what other tiers (regions, zones, ip/ports, and
        # devices) the replicas not-to-be-moved are in for this part.
        other_replicas = defaultdict(int)
        occupied_tiers_by_tier_len = defaultdict(set)
        for replica in self._replicas_for_part(part):
            if replica not in replace_replicas:
                dev = self.devs[self._replica2part2dev[replica][part]]
                for tier in dev["tiers"]:
                    other_replicas[tier] += 1
                    occupied_tiers_by_tier_len[len(tier)].add(tier)

        for replica in replace_replicas:
            # Find a new home for this replica
            tier = ()
            depth = 1
            while depth <= max_tier_depth:
                # Order the tiers by how many replicas of this
                # partition they already have. Then, of the ones
                # with the smallest number of replicas and that have
                # room to accept more partitions, pick the tier with
                # the hungriest drive and then continue searching in
                # that subtree.
                #
                # There are other strategies we could use here,
                # such as hungriest-tier (i.e. biggest
                # sum-of-parts-wanted) or picking one at random.
                # However, hungriest-drive is what was used here
                # before, and it worked pretty well in practice.
                #
                # Note that this allocator prioritizes even device
                # filling over dispersion, so if your layout is
                # extremely unbalanced, you may not get the replica
                # dispersion that you expect, and your durability
                # may be lessened.
                #
                # This used to be a cute, recursive function, but it's
                # been unrolled for performance.
                #
                # We sort the tiers here so that, when we look for a tier
                # with the lowest number of replicas, the first one we
                # find is the one with the hungriest drive (i.e. drive
                # with the largest sort_key value). This lets us
                # short-circuit the search while still ensuring we get the
                # right tier.
                candidates_with_replicas = \
                    occupied_tiers_by_tier_len[len(tier) + 1]

                # Among the tiers with room for more partitions,
                # find one with the smallest possible number of
                # replicas already in it, breaking ties by which one
                # has the hungriest drive.
                candidates_with_room = [
                    t for t in tier2children[tier]
                    if parts_available_in_tier[t] > 0]

                if len(candidates_with_room) > len(candidates_with_replicas):
                    # There exists at least one tier with room for
                    # another partition and 0 other replicas already
                    # in it, so we can use a faster search. The else
                    # branch's search would work here, but it's
                    # significantly slower.
                    tier = max((t for t in candidates_with_room
                                if other_replicas[t] == 0),
                               key=tier2sort_key.__getitem__)
                else:
                    tier = max(candidates_with_room,
                               key=lambda t: (-other_replicas[t],
                                              tier2sort_key[t]))
                depth += 1

            dev = tier2devs[tier][-1]
            dev["parts_wanted"] -= 1
            dev["parts"] += 1
            old_sort_key = dev["sort_key"]
            new_sort_key = dev["sort_key"] = self._sort_key_for(dev)
            for tier in dev["tiers"]:
                parts_available_in_tier[tier] -= 1
                other_replicas[tier] += 1
                occupied_tiers_by_tier_len[len(tier)].add(tier)

                index = bisect.bisect_left(tier2dev_sort_key[tier],
                                           old_sort_key)
                tier2devs[tier].pop(index)
                tier2dev_sort_key[tier].pop(index)

                new_index = bisect.bisect_left(tier2dev_sort_key[tier],
                                               new_sort_key)
                tier2devs[tier].insert(new_index, dev)
                tier2dev_sort_key[tier].insert(new_index, new_sort_key)

                new_last_sort_key = tier2dev_sort_key[tier][-1]
                tier2sort_key[tier] = new_last_sort_key

                # Now jiggle tier2children values to keep them sorted
                parent_tier = tier[0:-1]
                index = bisect.bisect_left(
                    tier2children_sort_key[parent_tier], old_sort_key)
                popped = tier2children[parent_tier].pop(index)
                tier2children_sort_key[parent_tier].pop(index)

                new_index = bisect.bisect_left(
                    tier2children_sort_key[parent_tier], new_last_sort_key)
                tier2children[parent_tier].insert(new_index, popped)
                tier2children_sort_key[parent_tier].insert(
                    new_index, new_last_sort_key)

            self._replica2part2dev[replica][part] = dev["id"]

    # Just to save memory and keep from accidental reuse.
    for dev in self._iter_devs():
        del dev["sort_key"]
        del dev["tiers"]
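# --- Numeric illustration of the max(parts_wanted, 0) clamp above ---
# Per-tier capacity counts only devices that still want partitions;
# devices shedding partitions (negative parts_wanted) contribute zero
# instead of cancelling out their neighbours. Device dicts are made up.
from collections import defaultdict

devs = [{'tier': (1, 1), 'parts_wanted': 5},
        {'tier': (1, 1), 'parts_wanted': -3},  # being drained
        {'tier': (1, 2), 'parts_wanted': -4}]

parts_available_in_tier = defaultdict(int)
for dev in devs:
    parts_available_in_tier[dev['tier']] += max(dev['parts_wanted'], 0)

assert parts_available_in_tier[(1, 1)] == 5  # not 5 - 3 == 2
assert parts_available_in_tier[(1, 2)] == 0  # no room to place here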
def _reassign_parts(self, reassign_parts):
    """
    For an existing ring data set, partitions are reassigned similarly to
    the initial assignment. The devices are ordered by how many partitions
    they still want and kept in that order throughout the process. The
    gathered partitions are iterated through, assigning them to devices
    according to the "most wanted" while keeping the replicas as "far
    apart" as possible. Two different zones are considered the
    farthest-apart things, followed by different ip/port pairs within a
    zone; the least-far-apart things are different devices with the same
    ip/port pair in the same zone.

    If you want more replicas than devices, you won't get all your
    replicas.

    :param reassign_parts: An iterable of (part, replicas_to_replace)
                           pairs. replicas_to_replace is an iterable of the
                           replica (an int) to replace for that partition.
                           replicas_to_replace may be shared for multiple
                           partitions, so be sure you do not modify it.
    """
    for dev in self._iter_devs():
        dev['sort_key'] = self._sort_key_for(dev)
    available_devs = \
        sorted((d for d in self._iter_devs() if d['weight']),
               key=lambda x: x['sort_key'])

    tier2children = build_tier_tree(available_devs)

    tier2devs = defaultdict(list)
    tier2sort_key = defaultdict(list)
    tiers_by_depth = defaultdict(set)
    for dev in available_devs:
        for tier in tiers_for_dev(dev):
            tier2devs[tier].append(dev)  # <-- starts out sorted!
            tier2sort_key[tier].append(dev['sort_key'])
            tiers_by_depth[len(tier)].add(tier)

    for part, replace_replicas in reassign_parts:
        # Gather up what other tiers (zones, ip_ports, and devices) the
        # replicas not-to-be-moved are in for this part.
        other_replicas = defaultdict(lambda: 0)
        for replica in xrange(self.replicas):
            if replica not in replace_replicas:
                dev = self.devs[self._replica2part2dev[replica][part]]
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1

        def find_home_for_replica(tier=(), depth=1):
            # Order the tiers by how many replicas of this
            # partition they already have. Then, of the ones
            # with the smallest number of replicas, pick the
            # tier with the hungriest drive and then continue
            # searching in that subtree.
            #
            # There are other strategies we could use here,
            # such as hungriest-tier (i.e. biggest
            # sum-of-parts-wanted) or picking one at random.
            # However, hungriest-drive is what was used here
            # before, and it worked pretty well in practice.
            #
            # Note that this allocator will balance things as
            # evenly as possible at each level of the device
            # layout. If your layout is extremely unbalanced,
            # this may produce poor results.
            candidate_tiers = tier2children[tier]
            min_count = min(other_replicas[t] for t in candidate_tiers)
            candidate_tiers = [t for t in candidate_tiers
                               if other_replicas[t] == min_count]
            candidate_tiers.sort(
                key=lambda t: tier2sort_key[t][-1])
            if depth == max(tiers_by_depth.keys()):
                return tier2devs[candidate_tiers[-1]][-1]
            return find_home_for_replica(tier=candidate_tiers[-1],
                                         depth=depth + 1)

        for replica in replace_replicas:
            dev = find_home_for_replica()
            dev['parts_wanted'] -= 1
            dev['parts'] += 1
            old_sort_key = dev['sort_key']
            new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
            for tier in tiers_for_dev(dev):
                other_replicas[tier] += 1

                index = bisect.bisect_left(tier2sort_key[tier],
                                           old_sort_key)
                tier2devs[tier].pop(index)
                tier2sort_key[tier].pop(index)

                new_index = bisect.bisect_left(tier2sort_key[tier],
                                               new_sort_key)
                tier2devs[tier].insert(new_index, dev)
                tier2sort_key[tier].insert(new_index, new_sort_key)

            self._replica2part2dev[replica][part] = dev['id']

    # Just to save memory and keep from accidental reuse.
    for dev in self._iter_devs():
        del dev['sort_key']
def _gather_reassign_parts(self):
    """
    Returns a list of (partition, replicas) pairs to be reassigned by
    gathering from removed devices, insufficiently-far-apart replicas, and
    overweight drives.
    """
    # First we gather partitions from removed devices. Since removed
    # devices usually indicate device failures, we have no choice but to
    # reassign these partitions. However, we mark them as moved so later
    # choices will skip other replicas of the same partition if possible.
    removed_dev_parts = defaultdict(list)
    if self._remove_devs:
        dev_ids = [d['id'] for d in self._remove_devs if d['parts']]
        if dev_ids:
            for replica in xrange(self.replicas):
                part2dev = self._replica2part2dev[replica]
                for part in xrange(self.parts):
                    if part2dev[part] in dev_ids:
                        self._last_part_moves[part] = 0
                        removed_dev_parts[part].append(replica)

    # Now we gather partitions that are "at risk" because they aren't
    # currently sufficiently spread out across the cluster.
    spread_out_parts = defaultdict(list)
    max_allowed_replicas = self._build_max_replicas_by_tier()
    for part in xrange(self.parts):
        # Only move one replica at a time if possible.
        if part in removed_dev_parts:
            continue

        # First, add up the count of replicas at each tier for each
        # partition.
        replicas_at_tier = defaultdict(lambda: 0)
        for replica in xrange(self.replicas):
            dev = self.devs[self._replica2part2dev[replica][part]]
            for tier in tiers_for_dev(dev):
                replicas_at_tier[tier] += 1

        # Now, look for partitions not yet spread out enough and not
        # recently moved.
        for replica in xrange(self.replicas):
            dev = self.devs[self._replica2part2dev[replica][part]]
            removed_replica = False
            for tier in tiers_for_dev(dev):
                if (replicas_at_tier[tier] > max_allowed_replicas[tier]
                        and self._last_part_moves[part] >=
                        self.min_part_hours):
                    self._last_part_moves[part] = 0
                    spread_out_parts[part].append(replica)
                    dev['parts_wanted'] += 1
                    dev['parts'] -= 1
                    removed_replica = True
                    break
            if removed_replica:
                for tier in tiers_for_dev(dev):
                    replicas_at_tier[tier] -= 1

    # Last, we gather partitions from devices that are "overweight"
    # because they have more partitions than their parts_wanted.
    reassign_parts = defaultdict(list)

    # We randomly pick a new starting point in the "circular" ring of
    # partitions to try to get a better rebalance when called multiple
    # times.
    start = self._last_part_gather_start / 4 + randint(0, self.parts / 2)
    self._last_part_gather_start = start
    for replica in xrange(self.replicas):
        part2dev = self._replica2part2dev[replica]
        for part in itertools.chain(xrange(start, self.parts),
                                    xrange(0, start)):
            if self._last_part_moves[part] < self.min_part_hours:
                continue
            if part in removed_dev_parts or part in spread_out_parts:
                continue
            dev = self.devs[part2dev[part]]
            if dev['parts_wanted'] < 0:
                self._last_part_moves[part] = 0
                dev['parts_wanted'] += 1
                dev['parts'] -= 1
                reassign_parts[part].append(replica)

    reassign_parts.update(spread_out_parts)
    reassign_parts.update(removed_dev_parts)

    reassign_parts_list = list(reassign_parts.iteritems())
    # We shuffle the partitions to reassign so we get a more even
    # distribution later. There has been discussion of trying to
    # distribute partitions more "regularly" because that would actually
    # reduce risk but 1) it is really difficult to do this with uneven
    # clusters and 2) it would concentrate load during failure recovery
    # scenarios (increasing risk). The "right" answer has yet to be
    # debated to conclusion, but working code wins for now.
    shuffle(reassign_parts_list)
    return reassign_parts_list
def test_tiers_for_dev(self):
    self.assertEqual(tiers_for_dev(self.test_dev),
                     ((1,),
                      (1, 1),
                      (1, 1, '192.168.1.1'),
                      (1, 1, '192.168.1.1', 0)))
def test_tiers_for_dev(self):
    self.assertEqual(
        tiers_for_dev(self.test_dev),
        ((1,),
         (1, 1),
         (1, 1, "192.168.1.1:6000"),
         (1, 1, "192.168.1.1:6000", 0)))
def _reassign_parts(self, reassign_parts):
    """
    For an existing ring data set, partitions are reassigned similarly to
    the initial assignment. The devices are ordered by how many partitions
    they still want and kept in that order throughout the process. The
    gathered partitions are iterated through, assigning them to devices
    according to the "most wanted" while keeping the replicas as "far
    apart" as possible. Two different zones are considered the
    farthest-apart things, followed by different ip/port pairs within a
    zone; the least-far-apart things are different devices with the same
    ip/port pair in the same zone.

    If you want more replicas than devices, you won't get all your
    replicas.

    :param reassign_parts: An iterable of (part, replicas_to_replace)
                           pairs. replicas_to_replace is an iterable of the
                           replica (an int) to replace for that partition.
                           replicas_to_replace may be shared for multiple
                           partitions, so be sure you do not modify it.
    """
    for dev in self._iter_devs():
        dev['sort_key'] = self._sort_key_for(dev)
    available_devs = \
        sorted((d for d in self._iter_devs() if d['weight']),
               key=lambda x: x['sort_key'])

    tier2children = build_tier_tree(available_devs)

    tier2devs = defaultdict(list)
    tier2sort_key = defaultdict(list)
    tiers_by_depth = defaultdict(set)
    for dev in available_devs:
        for tier in tiers_for_dev(dev):
            tier2devs[tier].append(dev)  # <-- starts out sorted!
            tier2sort_key[tier].append(dev['sort_key'])
            tiers_by_depth[len(tier)].add(tier)

    for part, replace_replicas in reassign_parts:
        # Gather up what other tiers (zones, ip_ports, and devices) the
        # replicas not-to-be-moved are in for this part.
        other_replicas = defaultdict(lambda: 0)
        for replica in xrange(self.replicas):
            if replica not in replace_replicas:
                dev = self.devs[self._replica2part2dev[replica][part]]
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1

        def find_home_for_replica(tier=(), depth=1):
            # Order the tiers by how many replicas of this
            # partition they already have. Then, of the ones
            # with the smallest number of replicas, pick the
            # tier with the hungriest drive and then continue
            # searching in that subtree.
            #
            # There are other strategies we could use here,
            # such as hungriest-tier (i.e. biggest
            # sum-of-parts-wanted) or picking one at random.
            # However, hungriest-drive is what was used here
            # before, and it worked pretty well in practice.
            #
            # Note that this allocator will balance things as
            # evenly as possible at each level of the device
            # layout. If your layout is extremely unbalanced,
            # this may produce poor results.
            candidate_tiers = tier2children[tier]
            min_count = min(other_replicas[t] for t in candidate_tiers)
            candidate_tiers = [
                t for t in candidate_tiers
                if other_replicas[t] == min_count
            ]
            candidate_tiers.sort(
                key=lambda t: tier2devs[t][-1]['parts_wanted'])
            if depth == max(tiers_by_depth.keys()):
                return tier2devs[candidate_tiers[-1]][-1]
            return find_home_for_replica(tier=candidate_tiers[-1],
                                         depth=depth + 1)

        for replica in replace_replicas:
            dev = find_home_for_replica()
            dev['parts_wanted'] -= 1
            dev['parts'] += 1
            old_sort_key = dev['sort_key']
            new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
            for tier in tiers_for_dev(dev):
                other_replicas[tier] += 1

                index = bisect.bisect_left(tier2sort_key[tier],
                                           old_sort_key)
                tier2devs[tier].pop(index)
                tier2sort_key[tier].pop(index)

                new_index = bisect.bisect_left(tier2sort_key[tier],
                                               new_sort_key)
                tier2devs[tier].insert(new_index, dev)
                tier2sort_key[tier].insert(new_index, new_sort_key)

            self._replica2part2dev[replica][part] = dev['id']

    # Just to save memory and keep from accidental reuse.
    for dev in self._iter_devs():
        del dev['sort_key']