def cluster_create(root, params):
    """Create a cluster record, rejecting duplicate display names.

    ``root.clusters`` is searched for an entry whose ``display_name`` equals
    ``params.display_name``. When none is found, ``params`` is updated in
    place with the CREATING status and the current ``unix_timestamp()``,
    handed to ``root.clusters.create`` and the serialized result is returned.

    Raises:
        G.Invalid: a cluster with the same display name already exists.
        ApiException: ``RecordValidationError`` was raised while stamping or
            creating the record; its message is carried over.
    """
    duplicate = root.clusters.find(display_name=params.display_name)
    if duplicate is not None:
        raise G.Invalid(
            "Cluster name must be unique",
            "unique name",
            params.display_name,
            ["display_name"],
        )

    try:
        params["status"] = ClusterStatus.CREATING
        params["created"] = unix_timestamp()
        return root.clusters.create(params).serialize()
    except RecordValidationError as err:
        raise ApiException(str(err))
def work(self):
    """Run one pass over ``self.root.clusters`` and act on each cluster.

    Per cluster:
      * not yet successfully started and ``created + CLUSTER_TIMEOUT`` is in
        the past -> log a warning and delete it (rollback);
      * status DELETING -> delete it;
      * status WAITING_FOR_AGENTS -> deploy MemSQL;
      * status WAITING_FOR_MEMSQL -> check MemSQL;
      * anything else -> left untouched.
    """
    for entry in self.root.clusters:
        info = entry.data

        # Only clusters that have not started successfully can time out.
        expired = (
            not info.successfully_started
            and info.created + CLUSTER_TIMEOUT < unix_timestamp()
        )
        if expired:
            logger.warning(
                "Cluster %s has taken more than %d seconds to start up, "
                "so we are rolling it back" % (entry.name, CLUSTER_TIMEOUT))
            self._delete_cluster(entry)
            continue

        state = info.status
        if state == const.ClusterStatus.DELETING:
            self._delete_cluster(entry)
        elif state == const.ClusterStatus.WAITING_FOR_AGENTS:
            self._deploy_memsql(entry)
        elif state == const.ClusterStatus.WAITING_FOR_MEMSQL:
            self._check_memsql(entry)
def run(scheduler_pid):
    """Run the resiliency test against the scheduler process.

    Records the number of entries in ``/proc/<scheduler_pid>/fd`` and the
    number of lines printed by ``netstat -nap tcp``, then repeatedly creates
    clusters through the scheduler API for ``TEST_RUN_TIME`` seconds (or
    until interrupted with Ctrl-C), deleting the oldest one whenever
    ``MAX_CLUSTERS`` is reached. Afterwards every remaining cluster is
    deleted, the function sleeps 120 seconds, samples both counts again and
    prints a before/after report.
    """

    def sample_counts():
        # Order matters for parity with the report: fd listing first, then
        # netstat. The socket figure is the line count of netstat's output.
        fd_total = len(os.listdir("/proc/%d/fd" % scheduler_pid))
        netstat_text = run_shell_command(["netstat", "-nap", "tcp"])
        return fd_total, len(netstat_text.split("\n"))

    log.setup()
    logger.info("Running resiliency test for MemSQL Mesos framework")

    fds_before, sockets_before = sample_counts()

    live_clusters = []
    cluster_seq = 1
    last_create_at = 0
    began_at = unix_timestamp()

    try:
        tick = unix_timestamp()
        while tick < began_at + TEST_RUN_TIME:
            tick = unix_timestamp()
            time.sleep(5)

            # Attempt a creation once CLUSTER_CREATE_INTERVAL has elapsed
            # since the previous attempt and there is room for another one.
            has_room = len(live_clusters) < MAX_CLUSTERS
            if has_room and last_create_at + CLUSTER_CREATE_INTERVAL < tick:
                # Stamped before the API call, so a failed attempt also
                # waits a full interval before the next try.
                last_create_at = tick
                logger.info("Creating new cluster")
                spec = {
                    "display_name": "resiliency-test-cluster-%d" % cluster_seq,
                    "num_leaves": 1,
                    "num_aggs": 1,
                    "flavor": "small",
                    "install_demo": True,
                    "high_availability": False,
                }
                try:
                    fresh = call_scheduler_api("cluster/create", spec)
                except Exception as exc:
                    logger.warning("Exception when calling cluster/create: %s" % str(exc))
                    continue
                cluster_seq += 1
                live_clusters.append(fresh)

            # Once the cap is hit, retire the oldest cluster to make room.
            if len(live_clusters) == MAX_CLUSTERS:
                delete_cluster(live_clusters.pop(0))
    except KeyboardInterrupt:
        # Ctrl-C ends the test loop early; cleanup and reporting still run.
        pass

    for leftover in live_clusters:
        delete_cluster(leftover)

    print("Sleeping for 120 seconds to let TCP sockets close themselves")
    try:
        time.sleep(120)
    except KeyboardInterrupt:
        pass

    fds_after, sockets_after = sample_counts()
    finished_at = unix_timestamp()

    report = (
        "MemSQL Mesos framework resiliency test ran for %s seconds" % (finished_at - began_at),
        "We initially had:",
        "%d open file descriptors" % fds_before,
        "%d open TCP sockets" % sockets_before,
        "We finished with:",
        "%d open file descriptors" % fds_after,
        "%d open TCP sockets" % sockets_after,
    )
    for line in report:
        print(line)