def get_partition_sizes(self):
    # Get broker partition sizes
    FNULL = open(os.devnull, 'w')
    for broker_id, broker in self.cluster.brokers.items():
        if broker.hostname is None:
            raise UnknownBrokerException("Cannot get sizes for broker ID {0} which has no hostname. "
                                         "Remove the broker from the cluster before balancing".format(broker_id))

        if 'sshuser' in self.properties:
            connection_endpoint = self.properties['sshuser'] + '@' + broker.hostname
        else:
            connection_endpoint = broker.hostname
        key = self.properties.get('sshkey')

        if key is None:
            log.info("Getting partition sizes via SSH for {0}".format(broker.hostname))
            proc = subprocess.Popen(['ssh', connection_endpoint,
                                     'du -sk {0}/*'.format(self.properties['datadir'])],
                                    stdout=subprocess.PIPE, stderr=FNULL)
        else:
            log.info("Getting partition sizes via SSH using key: {0} for {1}".format(key, broker.hostname))
            proc = subprocess.Popen(['ssh', '-i', key, connection_endpoint,
                                     'du -sk {0}/*'.format(self.properties['datadir'])],
                                    stdout=subprocess.PIPE, stderr=FNULL)

        for line in proc.stdout:
            self.process_df_match(self.size_re.match(line.decode()), broker_id)
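# The sizer above leans on two helpers. A minimal sketch, assuming `du -sk` output of
# the form "123456<TAB>/data/kafka-logs/mytopic-0"; the regex and the model update here
# are assumptions for illustration, not the project's actual definitions:
import re

size_re = re.compile(r'^([0-9]+)\s+.*/([a-zA-Z0-9._-]+)-([0-9]+)/?$')

def process_df_match(self, match_obj, broker_id):
    if match_obj is None:
        return
    size_kb = int(match_obj.group(1))
    topic, pnum = match_obj.group(2), int(match_obj.group(3))
    if topic in self.cluster.topics:
        partition = self.cluster.topics[topic].partitions[pnum]
        # Keep the largest size reported by any replica (hypothetical policy)
        partition.size = max(partition.size, size_kb)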
def log_broker_summary(self):
    for broker_id in sorted(self.brokers.keys()):
        broker = self.brokers[broker_id]
        log.info("Broker {0}: partitions={1}/{2} ({3:.2f}%), size={4}".format(broker_id,
                                                                              broker.num_leaders(),
                                                                              broker.num_partitions(),
                                                                              broker.percent_leaders(),
                                                                              broker.total_size()))
def _execute(self, num, total, zookeeper, tools_path):
    with NamedTemporaryFile(mode='w') as assignfile:
        json.dump(self.dict_for_reassignment(), assignfile)
        assignfile.flush()
        FNULL = open(os.devnull, 'w')
        proc = subprocess.Popen(['{0}/kafka-reassign-partitions.sh'.format(tools_path),
                                 '--execute',
                                 '--zookeeper', zookeeper,
                                 '--reassignment-json-file', assignfile.name],
                                stdout=FNULL, stderr=FNULL)
        proc.wait()

        # Wait until finished
        while True:
            remaining_partitions = self.check_completion(zookeeper, tools_path, assignfile.name)
            if remaining_partitions == 0:
                break
            log.info('Partition reassignment {0}/{1} in progress [ {2}/{3} partitions remain ]. '
                     'Sleeping {4} seconds'.format(num, total, remaining_partitions,
                                                   len(self.partitions), self.pause_time))
            time.sleep(self.pause_time)
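# check_completion (polled in the wait loop above) is assumed to shell out to the same
# tool with --verify and count the partitions not yet finished. The exact output
# strings matched here are assumptions:
def check_completion(self, zookeeper, tools_path, assign_filename):
    proc = subprocess.Popen(['{0}/kafka-reassign-partitions.sh'.format(tools_path),
                             '--verify',
                             '--zookeeper', zookeeper,
                             '--reassignment-json-file', assign_filename],
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    remaining = 0
    for line in proc.stdout:
        # --verify prints one status line per partition; count those still in progress
        if b'still in progress' in line:
            remaining += 1
    proc.wait()
    return remaining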
def get_partition_sizes(self):
    self._validate_properties()
    for broker_id, broker in self.cluster.brokers.items():
        if broker.hostname is None:
            raise UnknownBrokerException("Cannot get sizes for broker ID {0} which has no hostname. "
                                         "Remove the broker from the cluster before balancing".format(broker_id))

        log.info("Getting partition sizes via Prometheus exporter for {0}".format(broker.hostname))
        self._query_prometheus(broker.hostname)
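# _query_prometheus is assumed to scrape the broker's exporter endpoint over HTTP and
# feed log-size samples back into the cluster model. The port property, metric name,
# and label layout below are assumptions for illustration only, not a real exporter's
# contract:
import re
import requests

_PROM_SIZE_RE = re.compile(r'^kafka_log_size\{topic="([^"]+)",partition="([0-9]+)"\}\s+([0-9.e+]+)$')

def _query_prometheus(self, hostname):
    resp = requests.get('http://{0}:{1}/metrics'.format(hostname, self.properties['prometheus_port']))
    resp.raise_for_status()
    for line in resp.text.splitlines():
        match = _PROM_SIZE_RE.match(line)
        if match:
            # Record the sampled size against the matching partition (hypothetical update)
            topic, pnum, size = match.group(1), int(match.group(2)), float(match.group(3))
            self.cluster.topics[topic].partitions[pnum].size = int(size)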
def run_preferred_replica_elections(batches, args, tools_path, plugins, dry_run):
    for i, batch in enumerate(batches):
        # Sleep between PLEs
        if i > 0 and not dry_run:
            log.info("Waiting {0} seconds for replica election to complete".format(args.ple_wait))
            time.sleep(args.ple_wait)

        log.info("Executing preferred replica election {0}/{1}".format(i + 1, len(batches)))
        batch.execute(i + 1, len(batches), args.zookeeper, tools_path, plugins, dry_run)
def check_and_get_sizes(action_cls, args, cluster, sizer_map):
    if action_cls.needs_sizes:
        sizer_to_run = sizer_map[args.sizer](args, cluster)
        sizer_to_run.get_partition_sizes()

        if args.size:
            log.info("Partition Sizes:")
            for topic in cluster.topics:
                for partition in cluster.topics[topic].partitions:
                    log.info("{0} {1}:{2}".format(partition.size, topic, partition.num))
def connect(self):
    protocol = 'SSL' if self._configuration.ssl_context is not None else 'PLAINTEXT'
    endpoint = self.get_endpoint(protocol)
    log.info("Connecting to {0} on port {1} using {2}".format(self.hostname, self.port, protocol))
    try:
        self._sock = self._sock or self._get_socket(self._configuration.ssl_context)
        self._sock.connect((endpoint.hostname, endpoint.port))
    except socket.error as e:
        log.error("Cannot connect to broker {0}:{1}: {2}".format(endpoint.hostname, endpoint.port, e))
        raise ConnectionError("Cannot connect to broker {0}:{1}: {2}".format(endpoint.hostname, endpoint.port, e))
def close(self):
    log.info("Disconnecting from {0}".format(self.hostname))

    # Shutdown throws an error if the socket is not connected, but that's OK
    try:
        self._sock.shutdown(socket.SHUT_RDWR)
    except OSError:
        pass

    self._sock.close()
    self._sock = None
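# _get_socket (used by connect above) is assumed to return a fresh TCP socket, wrapped
# for TLS when an ssl_context is configured. A minimal sketch using only the standard
# library; the method's name and role are taken from the call site above:
import socket

def _get_socket(self, ssl_context):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    if ssl_context is not None:
        # server_hostname enables SNI and hostname verification during the TLS handshake
        sock = ssl_context.wrap_socket(sock, server_hostname=self.hostname)
    return sock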
def process_cluster(self): log.info("Starting partition balance by rack") # Check if rack information is set for the cluster broker_racks = [broker.rack for broker in self.cluster.brokers.values()] if len(set(broker_racks)) == 1: raise BalanceException("Cannot balance cluster by rack as it has no rack information") # Figure out the max RF for the cluster max_rf = self.cluster.max_replication_factor() # Balance partitions at each position separately for pos in range(max_rf): self._process_partitions_at_pos(pos)
def main():
    # Start by loading all the modules
    action_map = get_module_map(kafka.tools.assigner.actions, kafka.tools.assigner.actions.ActionModule)
    sizer_map = get_module_map(kafka.tools.assigner.sizers, kafka.tools.assigner.sizers.SizerModule)
    plugins = get_all_plugins()

    # Set up and parse all CLI arguments
    args = set_up_arguments(action_map, sizer_map, plugins)
    run_plugins_at_step(plugins, 'set_arguments', args)

    tools_path = get_tools_path(args.tools_path)
    check_java_home()

    cluster = Cluster.create_from_zookeeper(args.zookeeper, getattr(args, 'default_retention', 1))
    run_plugins_at_step(plugins, 'set_cluster', cluster)

    # If the module needs the partition sizes, call a size module to get the information
    check_and_get_sizes(action_map[args.action], args, cluster, sizer_map)
    run_plugins_at_step(plugins, 'after_sizes')
    print_leadership("before", cluster, args.leadership)

    # Clone the cluster, and run the action to generate a new cluster state
    newcluster = cluster.clone()
    action_to_run = action_map[args.action](args, newcluster)
    action_to_run.process_cluster()
    run_plugins_at_step(plugins, 'set_new_cluster', action_to_run.cluster)
    print_leadership("after", newcluster, args.leadership)

    move_partitions = cluster.changed_partitions(action_to_run.cluster)
    batches = split_partitions_into_batches(move_partitions, batch_size=args.moves, use_class=Reassignment)
    run_plugins_at_step(plugins, 'set_batches', batches)

    log.info("Partition moves required: {0}".format(len(move_partitions)))
    log.info("Number of batches: {0}".format(len(batches)))
    dry_run = is_dry_run(args)

    for i, batch in enumerate(batches):
        log.info("Executing partition reassignment {0}/{1}: {2}".format(i + 1, len(batches), repr(batch)))
        batch.execute(i + 1, len(batches), args.zookeeper, tools_path, plugins, dry_run)

    run_plugins_at_step(plugins, 'before_ple')

    if not args.skip_ple:
        all_cluster_partitions = [p for p in action_to_run.cluster.partitions(args.exclude_topics)]
        batches = split_partitions_into_batches(all_cluster_partitions, batch_size=args.ple_size, use_class=ReplicaElection)
        log.info("Number of replica elections: {0}".format(len(batches)))
        run_preferred_replica_elections(batches, args, tools_path, plugins, dry_run)

    run_plugins_at_step(plugins, 'finished')

    if args.output_json:
        data = {'before': cluster.to_dict(), 'after': action_to_run.cluster.to_dict()}
        sys.stdout.write(json.dumps(data, indent=4, sort_keys=True))

    return os.EX_OK
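# split_partitions_into_batches (used twice in main above) is assumed to slice the
# partition list into fixed-size chunks and wrap each chunk in the supplied batch
# class (Reassignment or ReplicaElection). A minimal sketch under that assumption:
def split_partitions_into_batches(partitions, batch_size=10, use_class=None):
    # One batch object per batch_size-sized slice of the partition list
    return [use_class(partitions[i:i + batch_size])
            for i in range(0, len(partitions), batch_size)]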
def get_partition_sizes(self):
    # Get broker partition sizes
    for broker_id, broker in self.cluster.brokers.items():
        _validate_broker(broker)

        log.info("Getting partition sizes via JMX for {0}".format(broker.hostname))
        jmxurl = self._java_provider.javax.management.remote.JMXServiceURL(
            "service:jmx:rmi:///jndi/rmi://{0}:{1}/jmxrmi".format(broker.hostname, broker.jmx_port))
        jmxsoc = self._java_provider.javax.management.remote.JMXConnectorFactory.connect(jmxurl, self._envhash)

        connection = jmxsoc.getMBeanServerConnection()
        beans = connection.queryNames(self._java_provider.javax.management.ObjectName("kafka.log:name=Size,*"), None)
        for bean in beans:
            self._fetch_bean(connection, bean)

        jmxsoc.close()
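# _fetch_bean is assumed to pull the topic and partition out of each MBean's name and
# read its Size attribute. getKeyProperty and getAttribute are standard JMX calls; the
# final bookkeeping step is an assumption:
def _fetch_bean(self, connection, bean):
    topic = bean.getKeyProperty("topic")
    pnum = int(bean.getKeyProperty("partition"))
    size_bytes = connection.getAttribute(bean, "Value")
    # Record the size against the matching partition in the cluster model,
    # converting bytes to kibibytes (hypothetical update)
    self.cluster.topics[topic].partitions[pnum].size = size_bytes // 1024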
def process_cluster(self): log.info("Starting partition balance by rack") # Check if rack information is set for the cluster broker_racks = [ broker.rack for broker in self.cluster.brokers.values() ] if len(set(broker_racks)) == 1: raise BalanceException( "Cannot balance cluster by rack as it has no rack information") # Figure out the max RF for the cluster max_rf = self.cluster.max_replication_factor() # Balance partitions at each position separately for pos in range(max_rf): self._process_partitions_at_pos(pos)
def create_from_zookeeper(cls, zkconnect, default_retention=1, fetch_topics=True):
    log.info("Connecting to zookeeper {0}".format(zkconnect))
    try:
        zk = KazooClient(zkconnect)
        zk.start()
    except Exception as e:
        raise ZookeeperException("Cannot connect to Zookeeper: {0}".format(e))

    # Get broker list
    cluster = cls(retention=default_retention)
    add_brokers_from_zk(cluster, zk)

    # Get current partition state
    if fetch_topics:
        log.info("Getting partition list from Zookeeper")
        for topic in zk.get_children("/brokers/topics"):
            zdata, zstat = zk.get("/brokers/topics/{0}".format(topic))
            add_topic_with_replicas(cluster, topic, json_loads(zdata))
            set_topic_retention(cluster.topics[topic], zk)
        if cluster.num_topics() == 0:
            raise ZookeeperException("The cluster specified does not have any topics")

    log.info("Closing connection to zookeeper")
    zk.stop()
    zk.close()

    return cluster
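# For reference, the /brokers/topics/<topic> znode consumed by add_topic_with_replicas
# holds the replica assignment in Kafka's standard JSON layout, e.g.:
#
#   {"version": 1, "partitions": {"0": [1, 2], "1": [2, 3]}}
#
# where each key is a partition number and each list is that partition's ordered
# replica set (broker IDs).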
def connect(self):
    log.info("Connecting to {0} on port {1} using PLAINTEXT".format(self.hostname, self.port))
    self._sock.connect((self.hostname, self.port))
def is_dry_run(args):
    if args.generate or not args.execute:
        log.info("--execute flag NOT specified. DRY RUN ONLY")
        return True
    return False
def print_leadership(type_str, cluster, dont_skip):
    if dont_skip:
        log.info("Cluster Leadership Balance ({0}):".format(type_str))
        cluster.log_broker_summary()
def process_cluster(self): log.info("Starting partition balance by {0}".format(self._size_attr)) # Figure out the max RF for the cluster max_rf = self.cluster.max_replication_factor() # Calculate cluster information and sorted partition lists first partitions = {} sizes = {} targets = {} margins = {} for pos in range(max_rf): sizes[pos] = {} targets[pos] = {} margins[pos] = {} # Create a sorted list of partitions to use at this position (descending size) # Throw out partitions that are 4K or less in size, as they are effectively empty partitions[pos] = [p for p in self.cluster.partitions(self.args.exclude_topics) if (len(p.replicas) > pos) and (getattr(p, self._size_attr) > 4)] if len(partitions[pos]) == 0: continue partitions[pos].sort(key=attrgetter(self._size_attr), reverse=True) # Calculate broker size at this position for broker in self.cluster.brokers: if pos in self.cluster.brokers[broker].partitions: sizes[pos][broker] = sum([getattr(p, self._size_attr) for p in self.cluster.brokers[broker].partitions[pos]], 0) else: sizes[pos][broker] = 0 # Calculate the median size of partitions (margin is median/2) and the average size per broker to target # Yes, I know the median calculation is slightly broken (it keeps integers). This is OK targets[pos] = sum([getattr(p, self._size_attr) for p in partitions[pos]], 0) // len(self.cluster.brokers) sizelen = len(partitions[pos]) if not sizelen % 2: margins[pos] = (getattr(partitions[pos][sizelen // 2], self._size_attr) + getattr(partitions[pos][sizelen // 2 - 1], self._size_attr)) // 4 else: margins[pos] = getattr(partitions[pos][sizelen // 2], self._size_attr) // 2 # Balance partitions for each replica position separately for pos in range(max_rf): if len(sizes[pos]) == 0: continue log.info("Calculating ideal state for replica position {0}".format(pos)) log.debug("Target average size per-broker is {0} kibibytes (+/- {1})".format(targets[pos], margins[pos])) for broker_id in self.cluster.brokers: broker = self.cluster.brokers[broker_id] # Skip brokers that are larger than our minimum target size min_move = targets[pos] - margins[pos] - sizes[pos][broker_id] max_move = min_move + (margins[pos] * 2) if min_move <= 0: continue log.debug("Moving between {0} and {1} kibibytes to broker {2}".format(min_move, max_move, broker_id)) # Find partitions to move to this broker for partition in partitions[pos]: partition_size = getattr(partition, self._size_attr) # We can use this partition if all of the following are true: the partition has a replica at this position, # it's size is less than or equal to the max move size, the broker at this replica position would not go out # of range, and it doesn't already exist on this broker at this position if ((len(partition.replicas) <= pos) or (partition_size > max_move) or ((sizes[pos][partition.replicas[pos].id] - partition_size) < (targets[pos] - margins[pos])) or (partition.replicas[pos] == broker)): continue # We can only use a partition that this replica exists on if swapping positions wouldn't hurt balance of the other position or broker source = partition.replicas[pos] if broker in partition.replicas: other_pos = partition.replicas.index(broker) if ((sizes[other_pos][broker_id] - partition_size < targets[other_pos] - margins[other_pos]) or (sizes[other_pos][source.id] + partition_size > targets[pos] + margins[pos]) or (sizes[pos][broker_id] + partition_size > targets[pos] + margins[pos]) or (sizes[pos][source.id] - partition_size < targets[pos] - margins[pos])): continue 
partition.swap_replica_positions(source, broker) sizes[other_pos][broker_id] -= partition_size sizes[other_pos][source.id] += partition_size else: # Move the partition and adjust sizes partition.swap_replicas(source, broker) sizes[pos][broker_id] += partition_size sizes[pos][source.id] -= partition_size min_move -= partition_size max_move -= partition_size # If we have moved enough partitions, stop for this broker if min_move <= 0: break
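# A quick worked example of the target/margin arithmetic above: suppose position 0
# has partitions of size (in KiB) [100, 80, 60, 40] across 2 brokers. Then
# targets[0] = (100+80+60+40) // 2 = 140, and with an even-length list the margin is
# the two middle sizes summed and quartered: (60 + 80) // 4 = 35. A broker holding
# 90 KiB at position 0 therefore needs between 140-35-90 = 15 and 15+70 = 85 KiB
# moved onto it before it is considered balanced.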
def process_cluster(self): log.info("Starting partition balance by count") # Figure out the max RF for the cluster and sort all partition lists by size (ascending) max_pos = self.cluster.max_replication_factor() for broker in self.cluster.brokers: for pos in self.cluster.brokers[broker].partitions: self.cluster.brokers[broker].partitions[pos].sort(key=attrgetter('size')) # Calculate partition counts for each position first max_count = {} for pos in range(max_pos): # Calculate the maximum number of partitions each broker should have (floor(average)) # We'll also track a remainder and make sure they only go 1 per broker pcount = 0 for broker in self.cluster.brokers: if pos in self.cluster.brokers[broker].partitions: pcount += self.cluster.brokers[broker].num_partitions_at_position(pos) max_count[pos] = [pcount / len(self.cluster.brokers), pcount % len(self.cluster.brokers)] log.info("Calculating ideal state for replica position {0} - max {1} partitions".format(pos, max_count[pos][0] + 1)) # Balance partition counts for each replica position separately for pos in range(max_pos): for broker_id in self.cluster.brokers: broker = self.cluster.brokers[broker_id] # Figure out how many more partitions this broker needs diff = max_count[pos][0] if max_count[pos][1]: diff += 1 max_count[pos][1] -= 1 if pos in broker.partitions: diff -= broker.num_partitions_at_position(pos) if diff > 0: log.debug("Moving {0} partitions to broker {1}".format(diff, broker_id)) # Iterate through the largest brokers to find diff partitions to move to this broker for source_id in self.cluster.brokers: source = self.cluster.brokers[source_id] if diff == 0: break if pos not in source.partitions: continue iterlist = list(source.partitions[pos]) for partition in iterlist: # If we have moved enough partitions from this broker, exit out of the inner loop if (source.num_partitions_at_position(pos) < max_count[pos][0]) or (diff == 0): break # Skip topics that are being excluded if partition.topic.name in self.args.exclude_topics: continue # If the partition is already on the target, swap positions only if it makes the balance better if broker in partition.replicas: other_pos = partition.replicas.index(broker) if (other_pos in source.partitions) and (source.num_partitions_at_position(other_pos) < max_count[other_pos][0]): partition.swap_replica_positions(source, broker) else: partition.swap_replicas(source, broker) diff -= 1 log.debug("Finish broker {0} with {1} partitions".format(broker_id, broker.num_partitions_at_position(pos))) elif diff < 0: log.debug("Moving {0} partitions off broker {1}".format(-diff, broker_id)) # Iterate through the smallest brokers to find diff partitions to move off this broker for target_id in self.cluster.brokers: target = self.cluster.brokers[target_id] if diff == 0: break if (pos in target.partitions) and (target.num_partitions_at_position(pos) > (max_count[pos][0] + 1)): continue iterlist = list(broker.partitions[pos]) for partition in iterlist: # If we have moved enough partitions to this broker, exit out of the inner loop if ((pos in target.partitions) and (target.num_partitions_at_position(pos) >= max_count[pos][0])) or (diff == 0): break # Skip partitions that are already on the target broker or are being excluded if (target in partition.replicas) or (partition.topic.name in self.args.exclude_topics): continue partition.swap_replicas(broker, target) diff += 1 log.debug("Finish broker {0} with {1} partitions".format(broker, broker.num_partitions_at_position(pos))) else: log.debug("Skipping broker 
{0} which has {1} partitions".format(broker, broker.num_partitions_at_position(pos))) continue
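# A quick worked example of the count bookkeeping above: with 10 replicas at a
# position spread over 4 brokers, max_count[pos] = [10 // 4, 10 % 4] = [2, 2], so
# every broker targets 2 partitions and the first 2 brokers visited absorb one
# extra each (the remainder), giving final counts of 3, 3, 2, 2.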
def process_cluster(self): log.info("Starting even partition balance") # Initialize broker deques for each position for remainder assignment ordered_brokers = sorted(self.cluster.brokers.keys()) max_rf = self.cluster.max_replication_factor() remainder_brokers = [deque(ordered_brokers) for pos in range(max_rf)] for pos in range(max_rf): # Advance the deque by max_rf places so that we don't collide replicas remainder_brokers[pos].rotate(-pos) for topic_name in sorted(self.cluster.topics): topic = self.cluster.topics[topic_name] if not self.check_topic_ok(topic): continue # How many partitions per broker, and what's the last one that can be evenly balanced target = len(topic.partitions) // len(self.cluster.brokers) last_even_partition = len(topic.partitions) - (len(topic.partitions) % len(self.cluster.brokers)) - 1 # Initialize broker map for this topic. pmap = [dict.fromkeys(self.cluster.brokers.keys(), 0) for pos in range(len(topic.partitions[0].replicas))] for pnum in range(0, last_even_partition + 1): partition = topic.partitions[pnum] for i, replica in enumerate(partition.replicas): pmap[i][replica.id] += 1 # Balance all but the last remainder partitions while not pmap_matches_target(pmap, target): for pnum in range(0, last_even_partition + 1): partition = topic.partitions[pnum] for pos in range(len(partition.replicas)): # Current placement is fine (or low). Leave the replica where it is if pmap[pos][partition.replicas[pos].id] <= target: continue # Find a new replica for the partition at this position for bid in pmap[pos]: if pmap[pos][bid] >= target: continue broker = self.cluster.brokers[bid] source = partition.replicas[pos] if broker in partition.replicas: other_pos = partition.replicas.index(broker) partition.swap_replica_positions(source, broker) pmap[other_pos][broker.id] -= 1 pmap[other_pos][source.id] += 1 else: partition.swap_replicas(source, broker) pmap[pos][broker.id] += 1 pmap[pos][source.id] -= 1 break # Distribute the remainder partitions evenly among the brokers # This is a pretty dumb round robin distribution, but it will be stable for pnum in range(last_even_partition + 1, len(topic.partitions)): partition = topic.partitions[pnum] for pos in range(len(partition.replicas)): # Find a new replica for this partition proposed = remainder_brokers[pos].popleft() remainder_brokers[pos].append(proposed) partition.swap_replicas(partition.replicas[pos], self.cluster.brokers[proposed])
def process_cluster(self): log.info("Starting partition balance by count") # Figure out the max RF for the cluster and sort all partition lists by size (ascending) max_pos = self.cluster.max_replication_factor() for broker in self.cluster.brokers: for pos in self.cluster.brokers[broker].partitions: self.cluster.brokers[broker].partitions[pos].sort( key=attrgetter('size')) # Calculate partition counts for each position first max_count = {} for pos in range(max_pos): # Calculate the maximum number of partitions each broker should have (floor(average)) # We'll also track a remainder and make sure they only go 1 per broker pcount = 0 for broker in self.cluster.brokers: if pos in self.cluster.brokers[broker].partitions: pcount += self.cluster.brokers[ broker].num_partitions_at_position(pos) max_count[pos] = [ pcount / len(self.cluster.brokers), pcount % len(self.cluster.brokers) ] log.info( "Calculating ideal state for replica position {0} - max {1} partitions" .format(pos, max_count[pos][0] + 1)) # Balance partition counts for each replica position separately for pos in range(max_pos): for broker_id in self.cluster.brokers: broker = self.cluster.brokers[broker_id] # Figure out how many more partitions this broker needs diff = max_count[pos][0] if max_count[pos][1]: diff += 1 max_count[pos][1] -= 1 if pos in broker.partitions: diff -= broker.num_partitions_at_position(pos) if diff > 0: log.debug("Moving {0} partitions to broker {1}".format( diff, broker_id)) # Iterate through the largest brokers to find diff partitions to move to this broker for source_id in self.cluster.brokers: source = self.cluster.brokers[source_id] if diff == 0: break if pos not in source.partitions: continue iterlist = list(source.partitions[pos]) for partition in iterlist: # If we have moved enough partitions from this broker, exit out of the inner loop if (source.num_partitions_at_position(pos) < max_count[pos][0]) or (diff == 0): break # Skip topics that are being excluded if partition.topic.name in self.args.exclude_topics: continue # If the partition is already on the target, swap positions only if it makes the balance better if broker in partition.replicas: other_pos = partition.replicas.index(broker) if (other_pos in source.partitions ) and (source.num_partitions_at_position( other_pos) < max_count[other_pos][0]): partition.swap_replica_positions( source, broker) else: partition.swap_replicas(source, broker) diff -= 1 log.debug("Finish broker {0} with {1} partitions".format( broker_id, broker.num_partitions_at_position(pos))) elif diff < 0: log.debug("Moving {0} partitions off broker {1}".format( -diff, broker_id)) # Iterate through the smallest brokers to find diff partitions to move off this broker for target_id in self.cluster.brokers: target = self.cluster.brokers[target_id] if diff == 0: break if (pos in target.partitions) and ( target.num_partitions_at_position(pos) > (max_count[pos][0] + 1)): continue iterlist = list(broker.partitions[pos]) for partition in iterlist: # If we have moved enough partitions to this broker, exit out of the inner loop if ((pos in target.partitions) and (target.num_partitions_at_position(pos) >= max_count[pos][0])) or (diff == 0): break # Skip partitions that are already on the target broker or are being excluded if (target in partition.replicas) or ( partition.topic.name in self.args.exclude_topics): continue partition.swap_replicas(broker, target) diff += 1 log.debug("Finish broker {0} with {1} partitions".format( broker, broker.num_partitions_at_position(pos))) else: log.debug( 
"Skipping broker {0} which has {1} partitions".format( broker, broker.num_partitions_at_position(pos))) continue