def test_client_failure_isolate_two_datanodes():
    """
    Isolate two of the three datanodes from every other node in the cluster.

    A key is written before the partition is created.

    Expectation : The freon write issued after the partition should fail
    (its output must contain "Status: Failed"). The key written before the
    partition was created can still be fetched, and its checksum must match
    ORIG_CHECKSUM.
    """
    key = "testkey1"
    ClusterUtils.put_key(FILE, TEST_BUCKET_NAME, TEST_VOLUME_NAME,
                         "/etc/passwd", key_name=key,
                         replication_factor='THREE')

    # OM, SCM, the client and the first datanode stay together; each of the
    # remaining datanodes is placed in a partition of its own.
    partitions = (
        [OM[0], SCM[0], DATANODES[0], CLIENT[0]],
        [DATANODES[1]],
        [DATANODES[2]],
    )
    Blockade.blockade_create_partition(*partitions)
    Blockade.blockade_status()

    _, freon_output = ClusterUtils.run_freon(
        FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    assert re.search("Status: Failed", freon_output) is not None

    ClusterUtils.get_key(FILE, TEST_BUCKET_NAME, TEST_VOLUME_NAME,
                         key, "/tmp/")
    fetched_checksum = ClusterUtils.find_checksum(FILE, "/tmp/%s" % key)
    assert fetched_checksum == ORIG_CHECKSUM
def setup_module():
    """
    Module-level setup.

    Destroys any existing blockade, sets up the cluster at SCALE (storing the
    result in the CONTAINER_LIST global) and checks that the blockade status
    command exits with code 0.
    """
    global CONTAINER_LIST
    Blockade.blockade_destroy()
    CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
    status_code, status_output = Blockade.blockade_status()
    assert status_code == 0, \
        "blockade status command failed with output=[%s]" % status_output
def test_client_failure_isolate_one_datanode():
    """
    Isolate a single datanode from all other nodes in the cluster.

    A key is written before the partition is created.

    Expectation : The freon write issued after the partition should pass;
    its output must contain both "3 way commit failed" and
    "Status: Success". The key written before the partition was created can
    still be fetched, and its checksum must match ORIG_CHECKSUM.
    """
    key = "testkey2"
    ClusterUtils.put_key(FILE, TEST_BUCKET_NAME, TEST_VOLUME_NAME,
                         "/etc/passwd", key_name=key,
                         replication_factor='THREE')

    connected = [OM[0], SCM[0], DATANODES[0], DATANODES[1], CLIENT[0]]
    isolated = [DATANODES[2]]
    Blockade.blockade_create_partition(connected, isolated)
    Blockade.blockade_status()

    _, freon_output = ClusterUtils.run_freon(
        FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    # Checked in this order: the degraded commit first, then overall success.
    for expected in ("3 way commit failed", "Status: Success"):
        assert re.search(expected, freon_output) is not None

    ClusterUtils.get_key(FILE, TEST_BUCKET_NAME, TEST_VOLUME_NAME,
                         key, "/tmp/")
    fetched_checksum = ClusterUtils.find_checksum(FILE, "/tmp/%s" % key)
    assert fetched_checksum == ORIG_CHECKSUM
def stop(self):
    """
    Stop the Ozone cluster.

    Runs `docker-compose down` against this cluster's compose file and then
    destroys the blockade.
    """
    Cluster.__logger__.info("Stopping Ozone Cluster")
    compose_down = [
        Command.docker_compose, "-f", self.docker_compose_file, "down"
    ]
    call(compose_down)
    Blockade.blockade_destroy()
def test_scm_isolation_two_node(run_second_phase):
    """
    In this test, two datanodes cannot communicate with SCM.

    Expectation : The container should eventually have three closed
    replicas, or two open replicas and one quasi-closed replica.

    When run_second_phase is "true" (compared case-insensitively on its
    string form), the cluster is set up again with INCREASED_SCALE and at
    least three closed or three quasi-closed replicas are expected. After
    the partition is joined there must be at least three closed replicas
    and a freon run must report "Status: Success".
    """
    Blockade.blockade_create_partition(
        [OM[0], DATANODES[0], DATANODES[1], DATANODES[2]],
        [OM[0], SCM[0], DATANODES[1]])
    Blockade.blockade_status()
    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")

    wait_seconds = os.environ["CONTAINER_STATUS_SLEEP"]
    logger.info("Waiting for %s seconds before checking container status",
                wait_seconds)
    time.sleep(int(wait_seconds))

    statuses = ClusterUtils.findall_container_status(FILE, SCALE)
    closed_count = sum(1 for state in statuses if state == 'CLOSED')
    quasi_closed_count = sum(
        1 for state in statuses if state == 'QUASI_CLOSED')
    open_count = sum(1 for state in statuses if state == 'OPEN')
    assert closed_count == 3 or \
        (open_count == 2 and quasi_closed_count == 1), \
        "The container should have three closed replicas or two open " \
        "replicas and one quasi_closed replica."

    if str(run_second_phase).lower() != "true":
        return

    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
    Blockade.blockade_status()
    wait_seconds = os.environ["CONTAINER_STATUS_SLEEP"]
    logger.info("Waiting for %s seconds before checking container status",
                wait_seconds)
    time.sleep(int(wait_seconds))

    statuses = ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
    closed_count = sum(1 for state in statuses if state == 'CLOSED')
    quasi_closed_count = sum(
        1 for state in statuses if state == 'QUASI_CLOSED')
    assert closed_count >= 3 or quasi_closed_count >= 3

    Blockade.blockade_join()
    Blockade.blockade_status()
    if closed_count < 3:
        # Only quasi-closed replicas so far: wait once more after the join
        # and re-check the closed replica count.
        time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
        statuses = ClusterUtils.findall_container_status(
            FILE, INCREASED_SCALE)
        closed_count = sum(1 for state in statuses if state == 'CLOSED')
        assert closed_count >= 3

    _, freon_output = ClusterUtils.run_freon(
        FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    assert re.search("Status: Success", freon_output) is not None
def test_two_dns_isolate_scm_different_partition(run_second_phase):
    """
    In this test, there are three datanodes, DN1, DN2, DN3.
    DN1 is on a network partition and DN2, DN3 are on a different network
    partition. DN1 and DN2 cannot communicate with SCM.

    Expectation : The container replica state in datanode DN1 should be
    open. The container replica states can be either 'closed' in DN2 and
    DN3, or 'open' in DN2 and 'quasi-closed' in DN3.

    When run_second_phase is "true" (compared case-insensitively on its
    string form), the cluster is set up again with INCREASED_SCALE and at
    least three closed or three quasi-closed replicas are expected. After
    the partition is joined there must be at least three closed replicas
    and a freon run must report "Status: Success".
    """
    first_set = [OM[0], DATANODES[0]]
    second_set = [OM[0], DATANODES[1], DATANODES[2]]
    third_set = [SCM[0], DATANODES[2]]
    Blockade.blockade_create_partition(first_set, second_set, third_set)
    Blockade.blockade_status()
    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    logger.info("Waiting for %s seconds before checking container status",
                os.environ["CONTAINER_STATUS_SLEEP"])
    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
    all_datanodes_container_status = \
        ClusterUtils.findall_container_status(FILE, SCALE)
    first_datanode_status = all_datanodes_container_status[0]
    second_datanode_status = all_datanodes_container_status[1]
    third_datanode_status = all_datanodes_container_status[2]
    assert first_datanode_status == 'OPEN'
    assert (second_datanode_status == 'CLOSED' and
            third_datanode_status == 'CLOSED') or \
           (second_datanode_status == 'OPEN' and
            third_datanode_status == 'QUASI_CLOSED')

    if str(run_second_phase).lower() == "true":
        ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
        Blockade.blockade_status()
        logger.info(
            "Waiting for %s seconds before checking container status",
            os.environ["CONTAINER_STATUS_SLEEP"])
        time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
        all_datanodes_container_status = \
            ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
        # List comprehensions rather than filter(): on Python 3 filter()
        # returns an iterator, which has no len().
        closed_container_datanodes = [
            x for x in all_datanodes_container_status if x == 'CLOSED'
        ]
        quasi_closed_container_datanodes = [
            x for x in all_datanodes_container_status
            if x == 'QUASI_CLOSED'
        ]
        assert len(closed_container_datanodes) >= 3 or \
            len(quasi_closed_container_datanodes) >= 3
        Blockade.blockade_join()
        Blockade.blockade_status()
        if len(closed_container_datanodes) < 3:
            # Only quasi-closed replicas so far: wait once more after the
            # join and re-check the closed replica count.
            time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
            all_datanodes_container_status = \
                ClusterUtils.findall_container_status(
                    FILE, INCREASED_SCALE)
            closed_container_datanodes = [
                x for x in all_datanodes_container_status if x == 'CLOSED'
            ]
            assert len(closed_container_datanodes) >= 3
        _, output = \
            ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
        assert re.search("Status: Success", output) is not None
def test_three_dns_isolate_threescmfailure(run_second_phase):
    """
    In this test, all datanodes are isolated from each other and also cannot
    communicate with SCM.

    Expectation : The container replica state in all three datanodes should
    be open.

    When run_second_phase is "true" (compared case-insensitively on its
    string form), the cluster is set up again with INCREASED_SCALE. Any
    pipeline output returned must not contain "Factor:THREE", and exactly
    three datanodes may report a container status other than 'None'. After
    the partition is joined, exactly three closed replicas are expected.
    """
    first_set = [OM[0], DATANODES[0]]
    second_set = [OM[0], DATANODES[1]]
    third_set = [OM[0], DATANODES[2]]
    Blockade.blockade_create_partition(first_set, second_set, third_set)
    Blockade.blockade_status()
    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    logger.info("Waiting for %s seconds before checking container status",
                os.environ["CONTAINER_STATUS_SLEEP"])
    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
    all_datanodes_container_status = \
        ClusterUtils.findall_container_status(FILE, SCALE)
    first_datanode_status = all_datanodes_container_status[0]
    second_datanode_status = all_datanodes_container_status[1]
    third_datanode_status = all_datanodes_container_status[2]
    assert first_datanode_status == 'OPEN'
    assert second_datanode_status == 'OPEN'
    assert third_datanode_status == 'OPEN'

    if str(run_second_phase).lower() == "true":
        ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
        Blockade.blockade_status()
        logger.info(
            "Waiting for %s seconds before checking container status",
            os.environ["CONTAINER_STATUS_SLEEP"])
        time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
        output = ClusterUtils.get_pipelines(FILE)
        if output:
            assert re.search("Factor:THREE", output) is None
        all_datanodes_container_status = \
            ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
        # List comprehensions rather than filter(): on Python 3 filter()
        # returns an iterator, which has no len().
        # The status is compared against the string 'None', not the None
        # object.
        datanodes_having_container_status = [
            x for x in all_datanodes_container_status if x != 'None'
        ]
        assert len(datanodes_having_container_status) == 3, \
            "Containers should not be replicated on addition of new nodes."
        Blockade.blockade_join()
        Blockade.blockade_status()
        logger.info(
            "Waiting for %s seconds before checking container status",
            os.environ["CONTAINER_STATUS_SLEEP"])
        time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
        all_datanodes_container_status = \
            ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
        closed_container_datanodes = [
            x for x in all_datanodes_container_status if x == 'CLOSED'
        ]
        assert len(closed_container_datanodes) == 3, \
            "The container should have three closed replicas."
def test_three_dns_isolate_twoscmfailure(run_second_phase):
    """
    In this test, all datanodes are isolated from each other.
    Two datanodes cannot communicate with SCM (second datanode and third
    datanode).

    Expectation : The container replica state in the first datanode should
    be quasi-closed. The container replica state in the second and third
    datanodes should be open.

    When run_second_phase is "true" (compared case-insensitively on its
    string form), the cluster is set up again with INCREASED_SCALE and at
    least three quasi-closed replicas are expected. After the partition is
    joined, exactly three closed replicas are expected.
    """
    first_set = [OM[0], SCM[0], DATANODES[0]]
    second_set = [OM[0], DATANODES[1]]
    third_set = [OM[0], DATANODES[2]]
    Blockade.blockade_create_partition(first_set, second_set, third_set)
    Blockade.blockade_status()
    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    logger.info("Waiting for %s seconds before checking container status",
                os.environ["CONTAINER_STATUS_SLEEP"])
    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
    all_datanodes_container_status = \
        ClusterUtils.findall_container_status(FILE, SCALE)
    first_datanode_status = all_datanodes_container_status[0]
    second_datanode_status = all_datanodes_container_status[1]
    third_datanode_status = all_datanodes_container_status[2]
    assert first_datanode_status == 'QUASI_CLOSED'
    assert second_datanode_status == 'OPEN'
    assert third_datanode_status == 'OPEN'

    if str(run_second_phase).lower() == "true":
        ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
        Blockade.blockade_status()
        logger.info(
            "Waiting for %s seconds before checking container status",
            os.environ["CONTAINER_STATUS_SLEEP"])
        time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
        all_datanodes_container_status = \
            ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
        # List comprehensions rather than filter(): on Python 3 filter()
        # returns an iterator, which has no len().
        quasi_closed_container_datanodes = [
            x for x in all_datanodes_container_status
            if x == 'QUASI_CLOSED'
        ]
        assert len(quasi_closed_container_datanodes) >= 3, \
            "The container should have at least three quasi-closed replicas."
        Blockade.blockade_join()
        Blockade.blockade_status()
        logger.info(
            "Waiting for %s seconds before checking container status",
            os.environ["CONTAINER_STATUS_SLEEP"])
        time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
        all_datanodes_container_status = \
            ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
        closed_container_datanodes = [
            x for x in all_datanodes_container_status if x == 'CLOSED'
        ]
        assert len(closed_container_datanodes) == 3, \
            "The container should have three closed replicas."
def setup():
    """
    Per-test setup.

    Destroys any existing blockade, sets up the cluster at SCALE, checks
    that the blockade status command succeeds, populates the OM, SCM and
    DATANODES globals from the container list, and finally checks that a
    freon run exits with code 0.
    """
    global CONTAINER_LIST, OM, SCM, DATANODES
    Blockade.blockade_destroy()
    CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)

    status_code, status_output = Blockade.blockade_status()
    assert status_code == 0, \
        "blockade status command failed with output=[%s]" % status_output

    # The third element (the client containers) is not needed here.
    OM, SCM, _, DATANODES = \
        ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST)

    freon_code, freon_output = ClusterUtils.run_freon(
        FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    assert freon_code == 0, \
        "freon run failed with output=[%s]" % freon_output
def test_isolatedatanode_singlenode(run_second_phase):
    """
    In this test, one of the datanodes (first datanode) cannot communicate
    with the other two datanodes. All datanodes can communicate with SCM.

    Expectation : The container replica state in the first datanode should
    be quasi-closed. The container replica state in the other datanodes
    should be closed.

    When run_second_phase is "true" (compared case-insensitively on its
    string form), the cluster is set up again with INCREASED_SCALE and at
    least three closed replicas are expected. The partition is then joined
    and a freon run must report "Status: Success".
    """
    Blockade.blockade_create_partition(
        [OM[0], SCM[0], DATANODES[0]],
        [OM[0], SCM[0], DATANODES[1], DATANODES[2]])
    Blockade.blockade_status()
    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")

    wait_seconds = os.environ["CONTAINER_STATUS_SLEEP"]
    logger.info("Waiting for %s seconds before checking container status",
                wait_seconds)
    time.sleep(int(wait_seconds))

    statuses = ClusterUtils.findall_container_status(FILE, SCALE)
    first_datanode_status = statuses[0]
    closed_count = sum(1 for state in statuses if state == 'CLOSED')
    assert first_datanode_status == 'QUASI_CLOSED'
    assert closed_count == 2, \
        "The container should have two closed replicas."

    if str(run_second_phase).lower() != "true":
        return

    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
    Blockade.blockade_status()
    wait_seconds = os.environ["CONTAINER_STATUS_SLEEP"]
    logger.info("Waiting for %s seconds before checking container status",
                wait_seconds)
    time.sleep(int(wait_seconds))

    statuses = ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
    closed_count = sum(1 for state in statuses if state == 'CLOSED')
    assert closed_count >= 3, \
        "The container should have at least three closed replicas."

    Blockade.blockade_join()
    Blockade.blockade_status()
    _, freon_output = ClusterUtils.run_freon(
        FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    assert re.search("Status: Success", freon_output) is not None
def setup():
    """
    Per-test setup for the client failure tests.

    Derives volume and bucket names from the current epoch time, destroys
    any existing blockade, sets up the cluster at SCALE and checks that the
    blockade status command succeeds. It then populates the OM, SCM, CLIENT
    and DATANODES globals, checks that a freon run (invoked with
    "ozone_client") exits with code 0, creates the volume and bucket, and
    records the checksum of /etc/passwd in ORIG_CHECKSUM.
    """
    global CONTAINER_LIST, OM, SCM, DATANODES, CLIENT, ORIG_CHECKSUM, \
        TEST_VOLUME_NAME, TEST_BUCKET_NAME

    # Whole seconds since the epoch, used as a suffix for both names.
    timestamp = int(time.time())
    TEST_VOLUME_NAME = "%s%s" % ("volume", timestamp)
    TEST_BUCKET_NAME = "%s%s" % ("bucket", timestamp)

    Blockade.blockade_destroy()
    CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)

    status_code, status_output = Blockade.blockade_status()
    assert status_code == 0, \
        "blockade status command failed with output=[%s]" % status_output

    OM, SCM, CLIENT, DATANODES = \
        ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST)

    freon_code, freon_output = ClusterUtils.run_freon(
        FILE, 1, 1, 1, 10240, "RATIS", "THREE", "ozone_client")
    assert freon_code == 0, \
        "freon run failed with output=[%s]" % freon_output

    ClusterUtils.create_volume(FILE, TEST_VOLUME_NAME)
    ClusterUtils.create_bucket(FILE, TEST_BUCKET_NAME, TEST_VOLUME_NAME)
    ORIG_CHECKSUM = ClusterUtils.find_checksum(FILE, "/etc/passwd")
def start(self):
    """
    Start the Ozone cluster in docker containers.

    Destroys any existing blockade, runs `docker-compose up -d` scaled to
    the configured datanode count, then reads `docker-compose ps` and adds
    each listed container to blockade. The discovered names are stored on
    the instance:

    - self.om / self.scm: first node whose name contains 'om' / 'scm'
    - self.datanodes: sorted list of nodes whose name contains 'datanode'
    - self.clients: list of nodes whose name contains 'ozone_client'
    - self.scm_uuid, self.datanode_dir: looked up from the running cluster

    Raises AssertionError when no node is found, and IndexError when nodes
    are found but none matches 'om' or 'scm'.
    """
    Cluster.__logger__.info("Starting Ozone Cluster")
    Blockade.blockade_destroy()
    call([
        Command.docker_compose, "-f", self.docker_compose_file,
        "up", "-d", "--scale",
        "datanode=" + str(self.conf.datanode_count)
    ])
    Cluster.__logger__.info("Waiting 10s for cluster start up...")
    # NOTE: no sleep is actually performed here despite the log message.
    # TODO: wait only until the cluster is out of safemode.

    # universal_newlines=True makes check_output return text on both
    # Python 2 and Python 3, so the output can be split on "\n".
    output = subprocess.check_output(
        [Command.docker_compose, "-f", self.docker_compose_file, "ps"],
        universal_newlines=True)
    node_list = []
    # The first two lines and the last split element are skipped
    # (presumably the table header and the trailing empty string).
    for out in output.split("\n")[2:-1]:
        node = out.split(" ")[0]
        node_list.append(node)
        Blockade.blockade_add(node)

    Blockade.blockade_status()
    # Check before indexing so an empty cluster yields this message rather
    # than an IndexError from the lookups below.
    assert node_list, "no node found in the cluster!"

    # List comprehensions rather than filter(): on Python 3 filter()
    # returns an iterator that cannot be indexed or iterated twice.
    self.om = [x for x in node_list if 'om' in x][0]
    self.scm = [x for x in node_list if 'scm' in x][0]
    self.datanodes = sorted(x for x in node_list if 'datanode' in x)
    self.clients = [x for x in node_list if 'ozone_client' in x]
    self.scm_uuid = self.__get_scm_uuid__()
    self.datanode_dir = self.get_conf_value("hdds.datanode.dir")

    Cluster.__logger__.info("blockade created with nodes %s",
                            ' '.join(node_list))
def test_flaky(flaky_node):
    """
    Make the network of the selected node(s) flaky using blockade and check
    that a freon run still exits with code 0.

    flaky_node selects the target:
      "datanode" - one datanode chosen at random
      "scm"      - the scm node
      "om"       - the om node
      "all"      - every node ("--all")
    """
    # The mapping is built eagerly, so a random datanode is picked on every
    # call regardless of which entry ends up being used.
    targets = {
        "scm": cluster.scm,
        "om": cluster.om,
        "datanode": random.choice(cluster.datanodes),
        "all": "--all"
    }
    container = targets[flaky_node]

    Blockade.make_flaky(container)
    Blockade.blockade_status()
    freon_code, freon_output = cluster.run_freon(1, 1, 1, 10240)
    assert freon_code == 0, \
        "freon run failed with output=[%s]" % freon_output
def test_one_dn_isolate_other_dn(run_second_phase):
    """
    In this test, one of the datanodes (first datanode) cannot communicate
    with the other datanodes but can communicate with SCM.
    One of the other two datanodes (second datanode) cannot communicate with
    SCM.

    Expectation : The container replica state in the first datanode can be
    either closed or quasi-closed. The container replica state in the second
    datanode can be either closed or open. The container should eventually
    have at least one closed replica.

    When run_second_phase is "true" (compared case-insensitively on its
    string form), the cluster is set up again with INCREASED_SCALE, at least
    three closed replicas are expected, and a freon run must report
    "Status: Success".
    """
    first_set = [OM[0], SCM[0], DATANODES[0]]
    second_set = [OM[0], DATANODES[1], DATANODES[2]]
    third_set = [SCM[0], DATANODES[2]]
    Blockade.blockade_create_partition(first_set, second_set, third_set)
    Blockade.blockade_status()
    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    logger.info("Waiting for %s seconds before checking container status",
                os.environ["CONTAINER_STATUS_SLEEP"])
    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
    all_datanodes_container_status = \
        ClusterUtils.findall_container_status(FILE, SCALE)
    # List comprehensions rather than filter(): on Python 3 filter()
    # returns an iterator, which has no len().
    closed_container_datanodes = [
        x for x in all_datanodes_container_status if x == 'CLOSED'
    ]
    first_datanode_status = all_datanodes_container_status[0]
    second_datanode_status = all_datanodes_container_status[1]
    assert first_datanode_status in ('CLOSED', 'QUASI_CLOSED')
    assert second_datanode_status in ('CLOSED', 'OPEN')
    assert len(closed_container_datanodes) >= 1, \
        "The container should have at least one closed replica"

    if str(run_second_phase).lower() == "true":
        ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
        Blockade.blockade_status()
        logger.info(
            "Waiting for %s seconds before checking container status",
            os.environ["CONTAINER_STATUS_SLEEP"])
        time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
        all_datanodes_container_status = \
            ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
        closed_container_datanodes = [
            x for x in all_datanodes_container_status if x == 'CLOSED'
        ]
        assert len(closed_container_datanodes) >= 3, \
            "The container should have at least three closed replicas."
        _, output = \
            ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
        assert re.search("Status: Success", output) is not None
def test_one_dn_isolate_scm_other_dn(run_second_phase):
    """
    In this test, one of the datanodes cannot communicate with SCM and the
    other datanodes. The other datanodes can communicate with each other
    and with SCM.

    Expectation : The container should eventually have two closed replicas.

    When run_second_phase is "true" (compared case-insensitively on its
    string form), the cluster is set up again with INCREASED_SCALE, at least
    three closed replicas are expected, and a freon run must report
    "Status: Success".
    """
    first_set = [OM[0], SCM[0], DATANODES[1], DATANODES[2]]
    second_set = [OM[0], DATANODES[0]]
    Blockade.blockade_create_partition(first_set, second_set)
    Blockade.blockade_status()
    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    logger.info("Waiting for %s seconds before checking container status",
                os.environ["CONTAINER_STATUS_SLEEP"])
    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
    all_datanodes_container_status = \
        ClusterUtils.findall_container_status(FILE, SCALE)
    # List comprehensions rather than filter(): on Python 3 filter()
    # returns an iterator, which has no len().
    closed_container_datanodes = [
        x for x in all_datanodes_container_status if x == 'CLOSED'
    ]
    assert len(closed_container_datanodes) == 2, \
        "The container should have two closed replicas."

    if str(run_second_phase).lower() == "true":
        ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
        Blockade.blockade_status()
        logger.info(
            "Waiting for %s seconds before checking container status",
            os.environ["CONTAINER_STATUS_SLEEP"])
        time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
        all_datanodes_container_status = \
            ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
        closed_container_datanodes = [
            x for x in all_datanodes_container_status if x == 'CLOSED'
        ]
        assert len(closed_container_datanodes) >= 3, \
            "The container should have at least three closed replicas."
        _, output = \
            ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
        assert re.search("Status: Success", output) is not None
def teardown():
    """Per-test teardown: log entry and destroy the blockade."""
    logger.info("Inside teardown")
    Blockade.blockade_destroy()
def partition_network(self, *args):
    """
    Partition the network used by the cluster.

    All positional arguments are forwarded unchanged to
    Blockade.blockade_create_partition.
    """
    Blockade.blockade_create_partition(*args)
def restore_network(self):
    """
    Restore the cluster network after a partition by calling
    Blockade.blockade_join.
    """
    Blockade.blockade_join()
def teardown_module():
    """Module-level teardown: destroy the blockade, then the cluster."""
    Blockade.blockade_destroy()
    ClusterUtils.cluster_destroy(FILE)
def teardown():
    """
    Per-test teardown: log entry, call Blockade.blockade_fast_all and then
    pause for five seconds.
    """
    logger.info("Inside teardown")
    Blockade.blockade_fast_all()
    time.sleep(5)
def test_flaky(flaky_nodes):
    """
    Make the given nodes flaky via blockade (passing CONTAINER_LIST along)
    and check that a freon run still exits with code 0.
    """
    Blockade.make_flaky(flaky_nodes, CONTAINER_LIST)
    Blockade.blockade_status()
    freon_code, freon_output = ClusterUtils.run_freon(
        FILE, 1, 1, 1, 10240, "RATIS", "THREE")
    assert freon_code == 0, \
        "freon run failed with output=[%s]" % freon_output