def _validate_ring(self, instance):
    """Run ``nodetool ring`` against *instance* and verify the ring is healthy.

    Some nodes can be down while nodetool still exits cleanly, so a zero
    exit status is not sufficient: every node row must also report a status
    of "Up" and a state of "Normal".

    Returns:
        The nodetool return code if the command itself failed, 200 if at
        least one node is not both Up and Normal, and 0 otherwise.
    """
    ring_output = exec_command("nodetool --host %s ring" % instance.private_dns_name)

    if ring_output.failed:
        return ring_output.return_code

    retcode = 0
    # The first three output lines are skipped as header. Each remaining row
    # is whitespace-split; field 0 is the node, field 3 its status and
    # field 4 its state.
    for node in ring_output.splitlines()[3:]:
        nodesplit = node.split()
        if len(nodesplit) < 5:
            # Blank or otherwise non-node line: nothing to validate.
            continue

        address, status, state = nodesplit[0], nodesplit[3], nodesplit[4]
        self.logger.debug("Node %s is %s and %s" % (address, status, state))

        # A node is unhealthy if EITHER condition fails (the node must be
        # both Up and Normal), hence `or` rather than `and`.
        if status.lower() != "up" or state.lower() != "normal":
            self.logger.debug("Node %s ring is not healthy" % address)
            retcode = 200

    if retcode != 0:
        # Dump the full ring once so the failing rows can be seen in context.
        self.logger.debug("Ring status:")
        self.logger.debug(ring_output)

    return retcode
def start_cassandra(self, instances=None, print_ring=True, retry=False):
    """Start Cassandra services on instances.

    To validate that Cassandra is running, this will check the output of
    nodetool ring, make sure that gossip and thrift are running, and check
    that nodetool info reports Normal mode. If these tests do not pass
    within the timeout threshold, it will retry up to
    self.MAX_RESTART_ATTEMPTS times to restart. If after meeting the max
    allowed, it will raise a TimeoutException.

    Args:
        instances: instances to start; defaults to self.get_instances().
        print_ring: not referenced in the start loop below.
        retry: when true, log and print a "starting again" notice first.

    Raises:
        RuntimeError: if Cassandra is still not reported as running after
            the status check has failed repeatedly on an instance.
    """
    if retry:
        self.logger.info("Attempting to start again (%s of %s)" % (self.current_attempt-1, self.MAX_RESTART_ATTEMPTS))
        print("Cassandra failed to start - attempting to start again (%s of %s)" % (self.current_attempt-1, self.MAX_RESTART_ATTEMPTS))

    if instances is None:
        instances = self.get_instances()

    for instance in instances:
        with settings(host_string=instance.public_dns_name, warn_only=True):
            # errors == -1 means "start has not been attempted yet"; it is
            # bumped to 0 once the start command has been issued.
            errors = -1
            self.logger.info("Starting Cassandra service on %s..." % instance.id)

            while True:
                try:
                    # check to see if cassandra is running
                    if self.is_running(instance):
                        self.logger.info("Cassandra is running.")
                        break

                    # start it if this is the first time
                    if errors < 0:
                        self.logger.info("Cassandra is not running. Attempting to start now...")
                        print("Cassandra is not running. Attempting to start now...")
                        exec_command("service cassandra start", pty=False)
                    elif errors >= 5:
                        raise RuntimeError("Unable to start cassandra. Check the logs for more information.")

                    self.logger.info("Error detecting Cassandra status...will try again in 3 seconds.")
                    errors += 1
                    time.sleep(3)

                # `as` form is valid on Python 2.6+ and Python 3 alike.
                # NOTE(review): a SystemExit is only logged; the loop then
                # retries immediately without sleeping or counting an error.
                except SystemExit as e:
                    self.logger.error(str(e))

    # NOTE(review): only the service-start loop is present in this block; the
    # ring/gossip/thrift validation described in the docstring presumably
    # follows it -- confirm.
def _run_nodetool(self, ntcommand, instance=None):
    """Run ``nodetool -h <private dns> <ntcommand>`` and return its output.

    When *instance* is omitted, the first entry of self.get_instances() is
    used. The command is executed with warn_only enabled and all output
    hidden.
    """
    target = self.get_instances()[0] if instance is None else instance
    self.logger.debug("running nodetool on instance %s", target.id)

    with settings(host_string=target.public_dns_name, warn_only=True), hide("everything"):
        command = "nodetool -h %s %s" % (target.private_dns_name, ntcommand)
        return exec_command(command)
def stop_cassandra(self, instances=None):
    """Issue ``service cassandra stop`` on each instance and log the result.

    When *instances* is omitted, self.get_instances() supplies the targets.
    """
    targets = self.get_instances() if instances is None else instances

    for node in targets:
        self.logger.info("Stopping Cassandra on %s" % node.id)
        with settings(host_string=node.public_dns_name, warn_only=True), hide("everything"):
            stop_output = exec_command("service cassandra stop")
            self.logger.info(stop_output)

    self.logger.debug("Shutdown complete.")
def _discover_ring(self, instance=None):
    """Return the parsed rows of ``nodetool ring`` as seen from *instance*.

    When *instance* is omitted, the first entry of self.get_instances() is
    used.

    Returns:
        A list holding parse_nodeline(line) for every output line after the
        first two.

    Raises:
        RuntimeError: if ``service cassandra status`` reports failure.
        AssertionError: if the ring output has no lines beyond the first two.
    """
    if instance is None:
        instance = self.get_instances()[0]

    with settings(host_string=instance.public_dns_name, warn_only=True), hide("everything"):
        status = exec_command("service cassandra status")

        if status.failed:
            raise RuntimeError("Cassandra does not appear to be running.")

        self.logger.debug("Discovering ring...")
        # _run_nodetool returns the command output alone, not a
        # (retcode, output) pair, so it must not be tuple-unpacked.
        output = self._run_nodetool("ring", instance)
        self.logger.debug("node tool output:\n%s" % output)

        # Drop the first two lines of the output before parsing node rows.
        lines = output.split("\n")[2:]
        assert len(lines) > 0, "Ring output must have more than two lines."

        self.logger.debug("Found %d nodes" % len(lines))
        return [parse_nodeline(line) for line in lines]
def _configure_cassandra_instance(self, instance, seed_ips, token, set_tokens=True, auto_bootstrap=False):
    """Prepare data/log directories and rewrite cassandra.yaml on *instance*.

    The remote ``conf/cassandra.yaml`` under the instance's Cassandra home
    is downloaded to /tmp, edited locally and uploaded back.

    Args:
        instance: target instance; its public DNS name is used to connect
            and as rpc_address, its private DNS name as listen_address.
        seed_ips: iterable of seed addresses, joined with "," into the
            seed provider's ``seeds`` parameter.
        token: value written to ``initial_token`` when set_tokens is True.
        set_tokens: only the literal True enables writing ``initial_token``.
        auto_bootstrap: when truthy, ``auto_bootstrap`` is set to 'true'.
    """
    self.logger.debug("Configuring %s..." % instance.id)
    yaml_file = os.path.join("/tmp", "cassandra.yaml")
    cassandra_home = self.get_cassandra_home(instance)

    self.logger.debug("Local cassandra.yaml file: %s" % yaml_file)
    with settings(host_string=instance.public_dns_name, warn_only=True):

        cassandra_data = os.path.join("/mnt", "cassandra-data")
        cassandra_logs = os.path.join("/mnt", "cassandra-logs")

        # create directories and log files
        exec_command("mkdir -p %s" % cassandra_data)
        exec_command("mkdir -p %s" % cassandra_logs)

        # set permissions
        exec_command("chown -R cassandra:cassandra %s %s" % (cassandra_data, cassandra_logs))

        # No SystemExit handler here: the previous one only re-raised, so
        # letting the exception propagate directly is equivalent.
        remote_yaml = os.path.join(cassandra_home, "conf", "cassandra.yaml")

        # get yaml file
        get(remote_yaml, "/tmp")

        # modify it; `with` guarantees the handle is closed even if parsing
        # raises
        with open(yaml_file) as f:
            config = parse_yaml(f)

        config['seed_provider'][0]['parameters'][0]['seeds'] = ",".join(seed_ips)
        # Identity check kept on purpose: truthy non-bool values do not
        # enable token assignment.
        if set_tokens is True:
            config['initial_token'] = token
        if auto_bootstrap:
            config['auto_bootstrap'] = 'true'
        config['data_file_directories'] = [cassandra_data]
        config['commitlog_directory'] = cassandra_logs
        config['listen_address'] = str(instance.private_dns_name)
        config['rpc_address'] = str(instance.public_dns_name)

        with open(yaml_file, "w") as f:
            f.write(dump_yaml(config))

        # put modified yaml file
        put(yaml_file, remote_yaml, use_sudo=use_sudo())
def get_cassandra_home(self, instance):
    """Return the result of running ``echo $CASSANDRA_HOME`` on *instance*.

    The command runs against the instance's public DNS name with warn_only
    enabled.
    """
    connection = settings(host_string=instance.public_dns_name, warn_only=True)
    with connection:
        cassandra_home = exec_command("echo $CASSANDRA_HOME")
        return cassandra_home
def is_running(self, instance):
    """Report whether ``service cassandra status`` says Cassandra is running.

    Returns True when the status output contains the text "is running".
    """
    with settings(host_string=instance.public_dns_name), hide("everything"):
        status_output = exec_command("service cassandra status")
        return "is running" in status_output
def get_cassandra_pid(self, instance):
    """Read /var/run/cassandra.pid on *instance*.

    Returns the command result, or None when the ``cat`` command failed.
    """
    with settings(host_string=instance.public_dns_name, warn_only=True):
        pid_result = exec_command("cat /var/run/cassandra.pid")
        return None if pid_result.failed else pid_result
except SystemExit, e: self.logger.error(str(e)) # test connection self.logger.debug("Testing connection to each Cassandra instance...") temp_instances = instances[:] while len(temp_instances) > 0: instance = temp_instances[-1] with settings(host_string=instance.public_dns_name, warn_only=True), hide("everything"): # does the ring look ok? ring_retcode = self._validate_ring(instance) # is gossip running? gossip_retcode = exec_command("nodetool -h %s info | grep Gossip | grep true" % instance.private_dns_name).return_code # are the netstats looking ok? netstats_retcode = exec_command("nodetool -h %s netstats | grep 'Mode: NORMAL'" % instance.private_dns_name).return_code # is thrift running? thrift_retcode = exec_command("/bin/netstat -an | grep 9160").return_code if ring_retcode == 0 and gossip_retcode == 0 and netstats_retcode == 0 and thrift_retcode == 0: temp_instances.pop() else: if ring_retcode != 0: self.logger.warn("Return code for 'nodetool ring' on '%s': %d" % (temp_instances[-1].id, ring_retcode)) if gossip_retcode != 0: self.logger.warn("Return code for 'nodetool info | grep Gossip' on '%s': %d" % (temp_instances[-1].id, gossip_retcode)) if netstats_retcode != 0: