Exemplo n.º 1
0
    def test_book_flight(self):
        num_tickets_to_book = 3
        flight_ids = self.__get_flight_ids("American Airlines")
        endpoint = "http://%s/%s" % (self.profile_endpoint, "confirmBooking")
        table = TableView(self.log.info)
        table.set_headers(["Username", "Booking ID", "Status",
                           "Class", "Num Seats"])
        for index in range(1, num_tickets_to_book):
            username = self.username_format % index
            booking_data = {
                "username": username,
                "password": self.password,
                "flightId": choice(flight_ids),
                "flightSeats": choice(range(1, 3)),
                "bookingClass": "economy",
                "bankAccount": hashlib.md5(
                    username.encode('utf-8')).hexdigest()
            }

            response = RestHelper.post_request(endpoint, booking_data)
            if response.status_code != 200:
                self.log.error("Request returned code %s: %s"
                               % (response.status_code, response.json()))
                self.fail("Booking failed")
            response = response.json()["Msg"]
            table.add_row([username, response["id"], response["status"],
                           response["bookingClass"], response["flightSeats"]])
        table.display("Booking details:")
Exemplo n.º 2
0
    def __get_purged_tombstone_from_last_run(self, nodes=None):
        """
        :return last_purged_tombstones: Dict of format,
            { node_ip: {'count': N, 'keys': [k1, k2, ..] }, ...}
        """
        tail_cmd = "cat %s/var/lib/couchbase/logs/debug.log " \
                   % self.couchbase_base_dir \
                   + "| sed -n '/%s/,$p'"
        purged_ts_count_pattern = ".*Purged ([0-9]+) ns_config tombstone"
        meta_kv_keys_pattern = ".*{metakv,[ ]*<<\"/([0-9a-zA-Z_\-\.]+)\">>"
        start_of_line = "^\[ns_server:"

        start_of_line = re.compile(start_of_line)
        meta_kv_keys_pattern = re.compile(meta_kv_keys_pattern)
        purged_ts_count_pattern = re.compile(purged_ts_count_pattern)
        tbl_view = TableView(self.log.info)
        tbl_view.set_headers(["Node", "Purged Keys"])

        last_purged_tombstones = dict()
        if nodes is None:
            nodes = self.cluster_util.get_nodes_in_cluster(self.cluster)
        for node in nodes:
            self.log.info("Processing debug logs from %s" % node.ip)
            shell = RemoteMachineShellConnection(node)
            output, _ = shell.execute_command(
                tail_cmd % self.ts_during_start[node.ip])
            if not output:
                output, _ = shell.execute_command(
                    " ".join(tail_cmd.split(' ')[:2]))
            self.log.debug("Tail stdout:\n%s" % output)
            o_len = len(output)
            target_buffer = ""
            total_ts_purged = 0
            for index in range(o_len-1, -1, -1):
                line = output[index]
                if not start_of_line.match(line):
                    target_buffer = line + target_buffer
                elif "tombstone_agent:purge:" in line:
                    total_ts_purged = \
                        purged_ts_count_pattern.match(line).group(1)
                    break
                else:
                    target_buffer = ""

            last_purged_tombstones[node.ip] = dict()
            last_purged_tombstones[node.ip]["count"] = int(total_ts_purged)
            last_purged_tombstones[node.ip]["keys"] = \
                meta_kv_keys_pattern.findall(target_buffer)
            tbl_view.add_row([node.ip, total_ts_purged])
            shell.disconnect()

        tbl_view.display("Purged_keys:")
        self.log.debug("Purged keys :: %s" % last_purged_tombstones)
        return last_purged_tombstones
Exemplo n.º 3
0
    def test_cancel_booking(self):
        endpoint = "http://%s/%s" % (self.profile_endpoint, "allBookings")
        booked_tickets = list()
        for index in range(1, 11):
            username = self.username_format % index
            auth_data = {"username": username,
                         "password": self.password}
            response = RestHelper.post_request(endpoint, auth_data)
            if response.status_code != 200:
                self.log.error("Request returned code %s: %s"
                               % (response.status_code, response.json()))
                self.fail("Fetching booking history failed")
            bookings = response.json()["Msg"][0]["bookings"]
            if len(bookings) > 0:
                booked_tickets.append([username, bookings])

        target_user = choice(booked_tickets)
        booking_id = choice(target_user[1])
        self.log.info("Cancel %s for user %s"
                      % (booking_id, target_user[0]))
        endpoint = "http://%s/%s" % (self.profile_endpoint, "cancelBooking")
        data = {"username": target_user[0], "password": self.password,
                "id": booking_id}

        response = RestHelper.post_request(endpoint, data)
        if response.status_code != 200:
            self.log.error("Request returned code %s: %s"
                           % (response.status_code, response.json()))
            self.fail("Fetching booking history failed")

        # Fetch booking status to confirm cancellation
        endpoint = "http://%s/%s" % (self.profile_endpoint, "getBooking")
        data = {"username": target_user[0], "password": self.password,
                "id": booking_id}
        response = RestHelper.post_request(endpoint, data)
        if response.status_code != 200:
            self.log.error("Request returned code %s: %s"
                           % (response.status_code, response.json()))
            self.fail("Fetching booking history failed")

        response = response.json()["Msg"]
        table = TableView(self.log.info)
        table.add_row(["Booking ID", response["id"]])
        table.add_row(["Flight", response["flightId"]])
        table.add_row(["Status", response["status"]])
        table.add_row(["Seats",
                       "%s (%s)" % (response["flightSeats"],
                                    ", ".join(response["TicketsBooked"]))])
        table.add_row(["Class", response["bookingClass"]])
        table.display("Ticket status:")

        self.assertEqual(response["status"], "Booking Cancelled")
Exemplo n.º 4
0
    def __get_deleted_key_count(self, check_if_zero=False):
        deleted_keys = self.cluster_util.get_ns_config_deleted_keys_count()
        tbl = TableView(self.log.info)
        tbl.set_headers(["Node", "Deleted_key_count"])
        for t_ip, k_count in deleted_keys.items():
            tbl.add_row(["%s" % t_ip, "%s" % k_count])
        tbl.display("Tombstone count on cluster nodes:")

        if not check_if_zero:
            return
        for t_ip, k_count in deleted_keys.items():
            if k_count != 0:
                self.fail("%s Deleted key count %s != 0" % (t_ip, k_count))
Exemplo n.º 5
0
 def print_spec_details(self, spec, cycles, elapsed_time):
     table = TableView(self.log.info)
     table.set_headers(["Operation", "Value"])
     table.add_row([
         "Collections dropped and recreated",
         str(spec[MetaCrudParams.COLLECTIONS_TO_RECREATE])
     ])
     table.add_row([
         "Scopes dropped and recreated",
         str(spec[MetaCrudParams.SCOPES_TO_RECREATE])
     ])
     table.add_row(["Cycles of data load", str(cycles)])
     table.add_row(["Time Elapsed in secs", str(elapsed_time)])
     table.display("Data load details")
Exemplo n.º 6
0
 def test_list_booking_history(self):
     table = TableView(self.log.info)
     table.set_headers(["Username", "Num tickets", "IDs"])
     num_tickets_to_book = 3
     endpoint = "http://%s/%s" % (self.profile_endpoint, "allBookings")
     for index in range(1, num_tickets_to_book):
         username = self.username_format % index
         auth_data = {"username": username,
                      "password": self.password}
         response = RestHelper.post_request(endpoint, auth_data)
         if response.status_code != 200:
             self.log.error("Request returned code %s: %s"
                            % (response.status_code, response.json()))
             self.fail("Fetching booking history failed")
         bookings = response.json()["Msg"][0]["bookings"]
         table.add_row([username, len(bookings), "\n".join(bookings)])
     table.display("Booking history:")
Exemplo n.º 7
0
        def check_replica_eviction():
            tbl = TableView(self.log.info)
            tbl.set_headers([
                "Node", "Memory", "WM_Threshold", "Itm_mem", "Meta_mem",
                "Evictable_mem", "A_rr", "R_rr"
            ])
            while self.test_failure is None and run_eviction_check:
                tbl.rows = []
                for kv_node in node_data.keys():
                    all_stats = \
                        node_data[kv_node]["cbstat"].all_stats(bucket.name)
                    bucket_mem = int(all_stats["ep_max_size"])
                    wm_threshold = \
                        (float(all_stats["ep_mem_high_wat_percent"])
                         - float(all_stats["ep_mem_low_wat_percent"]))*100
                    evictable_mem = \
                        int(all_stats["vb_replica_itm_memory"]) \
                        - int(all_stats["vb_replica_meta_data_memory"])
                    active_rr = int(all_stats["vb_active_perc_mem_resident"])
                    replica_rr = int(all_stats["vb_replica_perc_mem_resident"])

                    tbl.add_row([
                        kv_node.ip,
                        str(bucket_mem),
                        str(wm_threshold), all_stats["vb_replica_itm_memory"],
                        all_stats["vb_replica_meta_data_memory"],
                        str(evictable_mem),
                        str(active_rr),
                        str(replica_rr)
                    ])

                    if active_rr != 100 \
                            and evictable_mem > (bucket_mem/wm_threshold):
                        tbl.display("Node memory stats")
                        self.log_failure("%s - Active keys evicted before "
                                         "meeting the threshold: %s" %
                                         (kv_node.ip, all_stats))

                    if replica_rr > active_rr:
                        tbl.display("Node memory stats")
                        self.log_failure(
                            "%s: (active_rr) %s < %s (replica_rr)" %
                            (kv_node.ip, active_rr, replica_rr))
Exemplo n.º 8
0
 def print_cluster_stats(self):
     table = TableView(self.log.info)
     table.set_headers([
         "Node", "Services", "CPU_utilization", "Mem_total", "Mem_free",
         "Swap_mem_total", "Swap_mem_used"
     ])
     rest = RestConnection(self.cluster.master)
     cluster_stat = rest.get_cluster_stats()
     for cluster_node, node_stats in cluster_stat.items():
         row = list()
         row.append(cluster_node.split(':')[0])
         row.append(str(node_stats["services"]))
         row.append(str(node_stats["cpu_utilization"]))
         row.append(str(node_stats["mem_total"]))
         row.append(str(node_stats["mem_free"]))
         row.append(str(node_stats["swap_mem_total"]))
         row.append(str(node_stats["swap_mem_used"]))
         table.add_row(row)
     table.display("Cluster statistics")
Exemplo n.º 9
0
    def test_get_flights_for_airline(self):
        target_airline = "American Airlines"
        rest_url = "http://%s/%s/%s" % (self.inventory_endpoint, "flights",
                                        urllib.parse.quote(target_airline))
        response = RestHelper.get_request(rest_url)
        if response.status_code != 200:
            raise Exception("Requests status content:{0}".format(
                response.content))
        self.log.info("Flights for airline: %s" % target_airline)
        table = TableView(self.log.info)
        table.set_headers([
            "Flight Id", "Model", "Departure", "Arrival", "Departure Time",
            "Status"
        ])
        for f_data in response.json():
            f_data = f_data["flights"]
            table.add_row([
                f_data["flight_id"], f_data["model"],
                f_data["departing_airport"], f_data["arriving_airport"],
                f_data["departure_date"], f_data["status"]
            ])

        table.display("Flights for airline: %s" % target_airline)
Exemplo n.º 10
0
    def test_maxttl_with_sync_writes(self):
        """
        1. Load few docs without TTL
        2. Load few docs with TTL set in parallel to #1
        3. Validate docs get expiry after the TTL time
        :return:
        """

        def_bucket = self.cluster.buckets[0]
        self.maxttl = self.input.param("doc_ttl", self.maxttl)
        doc_ops_type = self.input.param("doc_ops_type", "sync;sync").split(";")

        # Create default doc_load options for TTL and non-TTL tasks
        non_ttl_task_property = dict()
        ttl_task_property = dict()

        # Create generators for TTL and non_TTL loading
        self.log.info("Creating doc_generators")
        ttl_gen_create = doc_generator(self.key,
                                       0,
                                       self.num_items,
                                       doc_size=self.doc_size,
                                       doc_type=self.doc_type,
                                       target_vbucket=self.target_vbucket,
                                       vbuckets=self.cluster.vbuckets)
        non_ttl_gen_create = doc_generator(self.key,
                                           self.num_items,
                                           self.num_items * 2,
                                           doc_size=self.doc_size,
                                           doc_type=self.doc_type,
                                           target_vbucket=self.target_vbucket,
                                           vbuckets=self.cluster.vbuckets)

        # Set durability levels based on doc_ops_type
        non_ttl_task_property["op_type"] = "create"
        ttl_task_property["op_type"] = "create"

        if doc_ops_type[0] == "sync":
            non_ttl_task_property["replicate_to"] = 0
            non_ttl_task_property["persist_to"] = 0
            non_ttl_task_property["durability"] = self.durability_level
        else:
            non_ttl_task_property["replicate_to"] = self.replicate_to
            non_ttl_task_property["persist_to"] = self.persist_to
            non_ttl_task_property["durability"] = "None"

        if doc_ops_type[1] == "sync":
            ttl_task_property["replicate_to"] = 0
            ttl_task_property["persist_to"] = 0
            ttl_task_property["durability"] = self.durability_level
        else:
            ttl_task_property["replicate_to"] = self.replicate_to
            ttl_task_property["persist_to"] = self.persist_to
            ttl_task_property["durability"] = "None"

        self.load_docs_in_parallel(def_bucket, non_ttl_gen_create,
                                   ttl_gen_create, non_ttl_task_property,
                                   ttl_task_property)
        # Validate doc_count before expiry of docs
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)
        self.bucket_util.verify_stats_all_buckets(self.cluster,
                                                  self.num_items * 2)

        self.sleep(self.maxttl, "Sleep for maxTTL time")
        self.bucket_util._expiry_pager(self.cluster)
        self.sleep(25, "Waiting for items to be purged")

        # Read all expired docs to validate EONENT status
        ttl_task = self.task.async_load_gen_docs(
            self.cluster,
            def_bucket,
            ttl_gen_create,
            "read",
            self.maxttl,
            batch_size=10,
            process_concurrency=8,
            timeout_secs=self.sdk_timeout,
            compression=self.sdk_compression,
            sdk_client_pool=self.sdk_client_pool)
        self.task.jython_task_manager.get_task_result(ttl_task)

        # Max-TTL doc expiry validation
        self.log.info("Validating expiry of docs")
        if len(ttl_task.success.keys()) != 0:
            self.fail("Items present after MaxTTL time: %s" %
                      ttl_task.success.keys())

        invalid_exception_tbl = TableView(self.log.info)
        invalid_exception_tbl.set_headers(["Doc_Key", "CAS"])
        for doc_key, result in ttl_task.fail.items():
            if result["cas"] != 0 and result["error"] is not None:
                invalid_exception_tbl.add_row([doc_key, result["cas"]])
        invalid_exception_tbl.display("Invalid exceptions for following keys")

        if len(invalid_exception_tbl.rows) != 0:
            self.fail("Seen invalid document exception")

        # Validate doc_count after doc_expiry
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)
        self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)

        # Document mutations after doc_expiry
        non_ttl_task_property["op_type"] = "update"
        self.load_docs_in_parallel(def_bucket, non_ttl_gen_create,
                                   ttl_gen_create, non_ttl_task_property,
                                   ttl_task_property)
        # Validate doc_count before expiry of docs
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)
        self.bucket_util.verify_stats_all_buckets(self.cluster,
                                                  self.num_items * 2)
Exemplo n.º 11
0
    def test_timeout_with_crud_failures(self):
        """
        Test to make sure timeout is handled in durability calls
        and no documents are loaded when durability cannot be met using
        error simulation in server node side

        This will validate failure in majority of nodes, where durability will
        surely fail for all CRUDs

        1. Select a node from the cluster to simulate the specified error
        2. Perform CRUD on the target bucket with given timeout
        3. Using cbstats to verify no operations succeeds
        4. Revert the error scenario from the cluster to resume durability
        5. Validate all mutations are succeeded after reverting
           the error condition

        Note: self.sdk_timeout values is considered as 'seconds'
        """

        # Local method to validate vb_seqno
        def validate_vb_seqno_stats():
            """
            :return retry_validation: Boolean denoting to retry validation
            """
            retry_validation = False
            vb_info["post_timeout"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            for vb_num in range(self.vbuckets):
                vb_num = str(vb_num)
                if vb_num not in affected_vbs:
                    if vb_info["init"][node.ip][vb_num] \
                            != vb_info["post_timeout"][node.ip][vb_num]:
                        self.log_failure(
                            "Unaffected vb-%s stat updated: %s != %s" %
                            (vb_num, vb_info["init"][node.ip][vb_num],
                             vb_info["post_timeout"][node.ip][vb_num]))
                elif int(vb_num) in target_nodes_vbuckets["active"]:
                    if vb_info["init"][node.ip][vb_num] \
                            != vb_info["post_timeout"][node.ip][vb_num]:
                        self.log_failure(
                            err_msg %
                            (node.ip, "active", vb_num,
                             vb_info["init"][node.ip][vb_num],
                             vb_info["post_timeout"][node.ip][vb_num]))
                elif int(vb_num) in target_nodes_vbuckets["replica"]:
                    if vb_info["init"][node.ip][vb_num] \
                            == vb_info["post_timeout"][node.ip][vb_num]:
                        retry_validation = True
                        self.log.warning(
                            err_msg %
                            (node.ip, "replica", vb_num,
                             vb_info["init"][node.ip][vb_num],
                             vb_info["post_timeout"][node.ip][vb_num]))
            return retry_validation

        shell_conn = dict()
        cbstat_obj = dict()
        error_sim = dict()
        target_nodes_vbuckets = dict()
        vb_info = dict()
        tasks = dict()
        doc_gen = dict()
        affected_vbs = list()

        target_nodes_vbuckets["active"] = []
        target_nodes_vbuckets["replica"] = []
        vb_info["init"] = dict()
        vb_info["post_timeout"] = dict()
        vb_info["afterCrud"] = dict()

        # Override crud_batch_size to minimum value for testing
        self.crud_batch_size = 5
        timeout_err_str = self.durability_helper.EXCEPTIONS["request_timeout"]
        ambiguous_err_str = self.durability_helper.EXCEPTIONS["ambiguous"]

        # Create required doc_generators
        doc_gen["insert"] = sub_doc_generator(
            self.key, self.num_items / 2,
            self.num_items / 2 + self.crud_batch_size)
        doc_gen["remove"] = sub_doc_generator_for_edit(self.key,
                                                       0,
                                                       self.crud_batch_size,
                                                       template_index=2)
        doc_gen["read"] = sub_doc_generator_for_edit(self.key,
                                                     0,
                                                     self.crud_batch_size,
                                                     template_index=0)
        doc_gen["upsert"] = sub_doc_generator_for_edit(
            self.key,
            int(self.num_items / 4),
            int(self.num_items / 4) + self.crud_batch_size,
            template_index=1)

        target_nodes = self.getTargetNodes()
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            target_nodes_vbuckets["active"] += \
                cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                                 vbucket_type="active")
            target_nodes_vbuckets["replica"] += \
                cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                                 vbucket_type="replica")
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])

        curr_time = int(time.time())
        expected_timeout = curr_time + self.sdk_timeout

        for op_type in doc_gen.keys():
            tasks[op_type] = self.task.async_load_gen_sub_docs(
                self.cluster,
                self.bucket,
                doc_gen[op_type],
                op_type,
                0,
                path_create=True,
                batch_size=1,
                process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                start_task=False)

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        for op_type in doc_gen.keys():
            self.task_manager.add_new_task(tasks[op_type])

        # Wait for document_loader tasks to complete
        for op_type in doc_gen.keys():
            self.task.jython_task_manager.get_task_result(tasks[op_type])

            # Validate task failures
            if op_type == "read":
                # Validation for read task
                for doc_id, crud_result in tasks[op_type].success.items():
                    vb_num = self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.vbuckets)
                    if vb_num in target_nodes_vbuckets["active"]:
                        self.log_failure("Read succeeded for %s present in "
                                         "stopped active vbucket: %s" %
                                         (doc_id, vb_num))
                self.durability_helper.validate_durability_exception(
                    tasks[op_type].fail,
                    self.durability_helper.EXCEPTIONS["request_timeout"])
            else:
                # Validation of CRUDs - Update / Create / Delete
                if len(tasks[op_type].success.keys()) != 0:
                    self.log_failure("Few keys succeeded for %s: %s" %
                                     (op_type, tasks[op_type].success.keys()))
                for doc_id, crud_result in tasks[op_type].fail.items():
                    vb_num = self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.vbuckets)
                    if vb_num in target_nodes_vbuckets["active"]:
                        if timeout_err_str not in str(crud_result["error"]):
                            self.log_failure(
                                "Invalid exception for doc %s, vb %s: %s" %
                                (doc_id, vb_num, crud_result))
                    else:
                        if ambiguous_err_str not in str(crud_result["error"]):
                            self.log_failure(
                                "Invalid exception for doc %s, vb %s: %s" %
                                (doc_id, vb_num, crud_result))

        # Revert the specified error scenario
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # Check whether the timeout triggered properly
        if int(time.time()) < expected_timeout:
            self.log_failure("Timed-out before expected time")

        for op_type in doc_gen.keys():
            if op_type == "read":
                continue
            while doc_gen[op_type].has_next():
                doc_id, _ = doc_gen[op_type].next()
                affected_vbs.append(
                    str(
                        self.bucket_util.get_vbucket_num_for_key(
                            doc_id, self.vbuckets)))

        affected_vbs = list(set(affected_vbs))
        err_msg = "%s - mismatch in %s vb-%s seq_no: %s != %s"
        # Fetch latest stats and validate the seq_nos are not updated
        for node in target_nodes:
            retry_count = 0
            max_retry = 3
            while retry_count < max_retry:
                self.log.info("Trying to validate vbseq_no stats: %d" %
                              (retry_count + 1))
                retry_count += 1
                retry_required = validate_vb_seqno_stats()
                if not retry_required:
                    break
                self.sleep(5, "Sleep for vbseq_no stats to update")
            else:
                # This will be exited only if `break` condition is not met
                self.log_failure("validate_vb_seqno_stats verification failed")

        self.validate_test_failure()

        # If replicas+1 == total nodes, verify no mutation should have
        # succeeded with durability
        if self.nodes_init == self.num_replicas + 1:
            read_gen = doc_generator(self.key, 0, self.num_items)
            read_task = self.task.async_load_gen_docs(
                self.cluster,
                self.bucket,
                read_gen,
                "read",
                0,
                batch_size=500,
                process_concurrency=1,
                timeout_secs=self.sdk_timeout)
            self.task_manager.get_task_result(read_task)

            failed_keys = TableView(self.log.error)
            failed_keys.set_headers(["Key", "Error"])
            for doc_key, doc_info in read_task.success.items():
                mutated = json.loads(str(doc_info["value"]))["mutated"]
                if mutated != 0:
                    failed_keys.add_row([doc_key, doc_info])

            failed_keys.display("Affected mutations:")
            self.log.error(read_task.fail)

        # SDK client for retrying AMBIGUOUS for unexpected keys
        sdk_client = SDKClient(RestConnection(self.cluster.master),
                               self.bucket)

        # Doc error validation
        for op_type in doc_gen.keys():
            task = tasks[op_type]

            if self.nodes_init == 1 \
                    and len(task.fail.keys()) != (doc_gen[op_type].end
                                                  - doc_gen[op_type].start):
                self.log_failure(
                    "Failed keys %d are less than expected %d" %
                    (len(task.fail.keys()),
                     (doc_gen[op_type].end - doc_gen[op_type].start)))

            # Create table objects for display
            table_view = TableView(self.log.error)
            ambiguous_table_view = TableView(self.log.error)
            table_view.set_headers(["Key", "Exception"])
            ambiguous_table_view.set_headers(["Key", "vBucket"])

            # Iterate failed keys for validation
            for doc_key, doc_info in task.fail.items():
                vb_for_key = self.bucket_util.get_vbucket_num_for_key(doc_key)

                if vb_for_key in target_nodes_vbuckets["active"]:
                    expected_exception = \
                        self.durability_helper.EXCEPTIONS["request_timeout"]
                elif vb_for_key in target_nodes_vbuckets["replica"]:
                    expected_exception = \
                        self.durability_helper.EXCEPTIONS["ambiguous"]
                else:
                    expected_exception = \
                        self.durability_helper.EXCEPTIONS["ambiguous"]
                    ambiguous_table_view.add_row([doc_key, vb_for_key])
                    retry_success = \
                        self.durability_helper.retry_for_ambiguous_exception(
                            sdk_client, op_type, doc_key, doc_info)
                    if not retry_success:
                        self.log_failure("%s failed in retry for %s" %
                                         (op_type, doc_key))

                if expected_exception not in str(doc_info["error"]):
                    table_view.add_row([doc_key, doc_info["error"]])

            # Display the tables (if any errors)
            table_view.display("Unexpected exception during %s" % op_type)
            ambiguous_table_view.display("Ambiguous exception during %s" %
                                         op_type)

        # Close the SDK connection
        sdk_client.close()

        # Verify doc count after expected CRUD failure
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.verify_stats_all_buckets(self.num_items)

        # Retry the same CRUDs after reverting the failure environment
        tasks = list()
        for op_type in doc_gen.keys():
            tasks.append(
                self.task.async_load_gen_docs(self.cluster,
                                              self.bucket,
                                              doc_gen[op_type],
                                              op_type,
                                              0,
                                              batch_size=10,
                                              process_concurrency=1,
                                              replicate_to=self.replicate_to,
                                              persist_to=self.persist_to,
                                              durability=self.durability_level,
                                              timeout_secs=self.sdk_timeout))

        # Wait for document_loader tasks to complete
        for task in tasks:
            self.task.jython_task_manager.get_task_result(task)
            if len(task.fail.keys()) != 0:
                self.log_failure(
                    "Failures with no error condition: {0}, {1}".format(
                        task.fail, task.fail.keys()))

        # Verify initial doc load count
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.verify_stats_all_buckets(self.num_items)

        # Fetch latest stats and validate the values are updated
        for node in target_nodes:
            vb_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
                self.log_failure("vBucket seq_no stats not updated")

        # Disconnect the shell connection
        for node in target_nodes:
            shell_conn[node.ip].disconnect()

        self.validate_test_failure()
Exemplo n.º 12
0
class volume(BaseTestCase):
    def setUp(self):
        self.input = TestInputSingleton.input
        self.input.test_params.update({"default_bucket": False})
        BaseTestCase.setUp(self)
        self.rest = RestConnection(self.servers[0])
        self.op_type = self.input.param("op_type", "create")
        self.available_servers = list()
        self.available_servers = self.cluster.servers[self.nodes_init:]
        self.num_buckets = self.input.param("num_buckets", 1)
        self.mutate = 0
        self.doc_ops = self.input.param("doc_ops", None)
        if self.doc_ops:
            self.doc_ops = self.doc_ops.split(';')
        self.iterations = self.input.param("iterations", 2)
        self.vbucket_check = self.input.param("vbucket_check", True)
        self.new_num_writer_threads = self.input.param(
            "new_num_writer_threads", 6)
        self.new_num_reader_threads = self.input.param(
            "new_num_reader_threads", 8)
        self.create_perc = 100
        self.update_perc = self.input.param("update_perc", 50)
        self.delete_perc = self.input.param("delete_perc", 50)
        self.expiry_perc = self.input.param("expiry_perc", 0)
        self.start = 0
        self.end = 0
        self.initial_items = self.start
        self.final_items = self.end
        self.create_end = 0
        self.create_start = 0
        self.update_end = 0
        self.update_start = 0
        self.delete_end = 0
        self.delete_start = 0
        self.expire_end = 0
        self.expire_start = 0
        self.num_collections = self.input.param("num_collections", 10)

    def create_required_buckets(self):
        self.log.info("Get the available memory quota")
        self.info = self.rest.get_nodes_self()
        threshold_memory = 100
        # threshold_memory_vagrant = 100
        total_memory_in_mb = self.info.mcdMemoryReserved
        total_available_memory_in_mb = total_memory_in_mb

        # If the mentioned service is already present,
        # we remove that much memory from available memory quota
        if "index" in self.info.services:
            total_available_memory_in_mb -= self.info.indexMemoryQuota
        if "fts" in self.info.services:
            total_available_memory_in_mb -= self.info.ftsMemoryQuota
        if "cbas" in self.info.services:
            total_available_memory_in_mb -= self.info.cbasMemoryQuota
        if "eventing" in self.info.services:
            total_available_memory_in_mb -= self.info.eventingMemoryQuota

        available_memory = total_available_memory_in_mb - threshold_memory

        self.rest.set_service_memoryQuota(service='memoryQuota',
                                          memoryQuota=available_memory)

        # Creating buckets for data loading purpose
        self.log.info("Create CB buckets")
        self.bucket_expiry = self.input.param("bucket_expiry", 0)
        ramQuota = self.input.param("ramQuota", available_memory)
        buckets = self.input.param("bucket_names", "GleamBookUsers").split(';')
        self.bucket_type = self.bucket_type.split(';')
        self.compression_mode = self.compression_mode.split(';')
        self.bucket_eviction_policy = self.bucket_eviction_policy
        for i in range(self.num_buckets):
            bucket = Bucket({
                Bucket.name: buckets[i],
                Bucket.ramQuotaMB: ramQuota / self.num_buckets,
                Bucket.maxTTL: self.bucket_expiry,
                Bucket.replicaNumber: self.num_replicas,
                Bucket.storageBackend: self.bucket_storage,
                Bucket.evictionPolicy: self.bucket_eviction_policy,
                Bucket.bucketType: self.bucket_type[i],
                Bucket.compressionMode: self.compression_mode[i]
            })
            self.bucket_util.create_bucket(bucket)

        # rebalance the new buckets across all nodes.
        self.log.info("Rebalance Starts")
        self.nodes = self.rest.node_statuses()
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[])
        self.rest.monitorRebalance()
        return bucket

    def set_num_writer_and_reader_threads(self,
                                          num_writer_threads="default",
                                          num_reader_threads="default"):
        for node in self.cluster_util.get_kv_nodes():
            bucket_helper = BucketHelper(node)
            bucket_helper.update_memcached_settings(
                num_writer_threads=num_writer_threads,
                num_reader_threads=num_reader_threads)

    def generate_docs(self, doc_ops=None):
        self.gen_delete = None
        self.gen_create = None
        self.gen_update = None
        self.gen_expiry = None
        self.create_end = 0
        self.create_start = 0
        self.update_end = 0
        self.update_start = 0
        self.delete_end = 0
        self.delete_start = 0
        self.expire_end = 0
        self.expire_start = 0
        self.initial_items = self.final_items

        if doc_ops is None:
            doc_ops = self.doc_ops

        if "update" in doc_ops:
            self.update_start = 0
            self.update_end = self.num_items * self.update_perc / 100
            self.mutate += 1
            self.gen_update = doc_generator(
                "Users",
                self.update_start,
                self.update_end,
                doc_size=self.doc_size,
                doc_type=self.doc_type,
                target_vbucket=self.target_vbucket,
                vbuckets=self.cluster_util.vbuckets,
                key_size=self.key_size,
                randomize_doc_size=self.randomize_doc_size,
                randomize_value=self.randomize_value,
                mix_key_size=self.mix_key_size,
                mutate=self.mutate)

        if "delete" in doc_ops:
            self.delete_start = self.start
            self.delete_end = self.start + (self.num_items *
                                            self.delete_perc) / 100
            self.gen_delete = doc_generator(
                "Users",
                self.delete_start,
                self.delete_end,
                doc_size=self.doc_size,
                doc_type=self.doc_type,
                target_vbucket=self.target_vbucket,
                vbuckets=self.cluster_util.vbuckets,
                key_size=self.key_size,
                randomize_doc_size=self.randomize_doc_size,
                randomize_value=self.randomize_value,
                mix_key_size=self.mix_key_size)
            self.final_items -= (self.delete_end -
                                 self.delete_start) * self.num_collections

        if "expiry" in doc_ops and self.maxttl:
            self.expire_start = self.start + (self.num_items *
                                              self.delete_perc) / 100
            self.expire_end = self.start + self.num_items * (
                self.delete_perc + self.expiry_perc) / 100
            self.gen_expiry = doc_generator(
                "Users",
                self.expire_start,
                self.expire_end,
                doc_size=self.doc_size,
                doc_type=self.doc_type,
                target_vbucket=self.target_vbucket,
                vbuckets=self.cluster_util.vbuckets,
                key_size=self.key_size,
                randomize_doc_size=self.randomize_doc_size,
                randomize_value=self.randomize_value,
                mix_key_size=self.mix_key_size)
            self.final_items -= (self.expire_end -
                                 self.expire_start) * self.num_collections

        if "create" in doc_ops:
            self.start = self.end
            self.end += self.num_items * self.create_perc / 100
            self.create_start = self.start
            self.create_end = self.end
            self.gen_create = doc_generator(
                "Users",
                self.start,
                self.end,
                doc_size=self.doc_size,
                doc_type=self.doc_type,
                target_vbucket=self.target_vbucket,
                vbuckets=self.cluster_util.vbuckets,
                key_size=self.key_size,
                randomize_doc_size=self.randomize_doc_size,
                randomize_value=self.randomize_value,
                mix_key_size=self.mix_key_size)
            self.final_items += (self.end - self.start) * self.num_collections

    def doc_loader(self, op_type, kv_gen, exp=0, scope=None, collection=None):
        if scope is None:
            scope = CbServer.default_scope
        if collection is None:
            collection = CbServer.default_collection
        retry_exceptions = [
            SDKException.AmbiguousTimeoutException,
            SDKException.RequestCanceledException
        ]
        tasks_info = self.bucket_util._async_load_all_buckets(
            self.cluster,
            kv_gen,
            op_type,
            exp,
            batch_size=self.batch_size,
            process_concurrency=self.process_concurrency,
            persist_to=self.persist_to,
            replicate_to=self.replicate_to,
            durability=self.durability_level,
            pause_secs=5,
            timeout_secs=self.sdk_timeout,
            retries=self.sdk_retries,
            retry_exceptions=retry_exceptions,
            scope=scope,
            collection=collection)
        return tasks_info

    def data_load(self,
                  scope=CbServer.default_scope,
                  collections=[CbServer.default_scope]):
        tasks_info = dict()
        for collection in collections:
            if self.gen_update is not None:
                task_info = self.doc_loader("update",
                                            self.gen_update,
                                            scope=scope,
                                            collection=collection)
                tasks_info.update(task_info.items())
            if self.gen_create is not None:
                task_info = self.doc_loader("create",
                                            self.gen_create,
                                            scope=scope,
                                            collection=collection)
                tasks_info.update(task_info.items())
            if self.gen_delete is not None:
                task_info = self.doc_loader("delete",
                                            self.gen_delete,
                                            scope=scope,
                                            collection=collection)
                tasks_info.update(task_info.items())
            if self.gen_expiry is not None and self.maxttl:
                task_info = self.doc_loader("update",
                                            self.gen_expiry,
                                            self.maxttl,
                                            scope=scope,
                                            collection=collection)
                tasks_info.update(task_info.items())
        return tasks_info

    def data_validation(self,
                        tasks_info,
                        scope=CbServer.default_scope,
                        collections=[CbServer.default_scope],
                        check_docs=True):
        for task in tasks_info:
            self.task_manager.get_task_result(task)
        self.bucket_util.verify_doc_op_task_exceptions(tasks_info,
                                                       self.cluster)
        self.bucket_util.log_doc_ops_task_failures(tasks_info)
        for task, task_info in tasks_info.items():
            self.assertFalse(
                task_info["ops_failed"],
                "Doc ops failed for task: {}".format(task.thread_name))

        if check_docs:
            self.log.info("Validating Active/Replica Docs")
            self.check_replica = False
            for bucket in self.bucket_util.buckets:
                tasks = list()
                for collection in collections:
                    if self.gen_update is not None:
                        tasks.append(
                            self.task.async_validate_docs(
                                self.cluster,
                                bucket,
                                self.gen_update,
                                "update",
                                0,
                                batch_size=self.batch_size,
                                process_concurrency=self.process_concurrency,
                                pause_secs=5,
                                timeout_secs=self.sdk_timeout,
                                check_replica=self.check_replica,
                                scope=scope,
                                collection=collection))
                    if self.gen_create is not None:
                        tasks.append(
                            self.task.async_validate_docs(
                                self.cluster,
                                bucket,
                                self.gen_create,
                                "create",
                                0,
                                batch_size=self.batch_size,
                                process_concurrency=self.process_concurrency,
                                pause_secs=5,
                                timeout_secs=self.sdk_timeout,
                                check_replica=self.check_replica,
                                scope=scope,
                                collection=collection))
                    if self.gen_delete is not None:
                        tasks.append(
                            self.task.async_validate_docs(
                                self.cluster,
                                bucket,
                                self.gen_delete,
                                "delete",
                                0,
                                batch_size=self.batch_size,
                                process_concurrency=self.process_concurrency,
                                pause_secs=5,
                                timeout_secs=self.sdk_timeout,
                                check_replica=self.check_replica,
                                scope=scope,
                                collection=collection))
                    if self.gen_expiry is not None:
                        self.sleep(
                            self.maxttl,
                            "Wait for docs to expire until expiry time..")
                        tasks.append(
                            self.task.async_validate_docs(
                                self.cluster,
                                bucket,
                                self.gen_expiry,
                                "delete",
                                0,
                                batch_size=self.batch_size,
                                process_concurrency=self.process_concurrency,
                                pause_secs=5,
                                timeout_secs=self.sdk_timeout,
                                check_replica=self.check_replica,
                                scope=scope,
                                collection=collection))
                for task in tasks:
                    self.task.jython_task_manager.get_task_result(task)
        self.bucket_util._wait_for_stats_all_buckets()
#         self.bucket_util.verify_stats_all_buckets(self.final_items)

    def get_bucket_dgm(self, bucket):
        self.rest_client = BucketHelper(self.cluster.master)
        dgm = self.rest_client.fetch_bucket_stats(
            bucket.name)["op"]["samples"]["vb_active_resident_items_ratio"][-1]
        self.log.info("Active Resident Threshold of {0} is {1}".format(
            bucket.name, dgm))

    # Stopping and restarting the memcached process

    def stop_process(self):
        target_node = self.servers[2]
        remote = RemoteMachineShellConnection(target_node)
        error_sim = CouchbaseError(self.log, remote)
        error_to_simulate = "stop_memcached"
        # Induce the error condition
        error_sim.create(error_to_simulate)
        self.sleep(20, "Wait before reverting the error condition")
        # Revert the simulated error condition and close the ssh session
        error_sim.revert(error_to_simulate)
        remote.disconnect()

    def rebalance(self, nodes_in=0, nodes_out=0):
        servs_in = random.sample(self.available_servers, nodes_in)

        self.nodes_cluster = self.cluster.nodes_in_cluster[:]
        self.nodes_cluster.remove(self.cluster.master)
        servs_out = random.sample(self.nodes_cluster, nodes_out)

        if nodes_in == nodes_out:
            self.vbucket_check = False

        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init],
            servs_in,
            servs_out,
            check_vbucket_shuffling=self.vbucket_check,
            retry_get_process_num=150)

        self.available_servers = [
            servs for servs in self.available_servers if servs not in servs_in
        ]
        self.available_servers += servs_out

        self.cluster.nodes_in_cluster.extend(servs_in)
        self.cluster.nodes_in_cluster = list(
            set(self.cluster.nodes_in_cluster) - set(servs_out))
        return rebalance_task

    def print_crud_stats(self):
        self.table = TableView(self.log.info)
        self.table.set_headers([
            "Initial Items", "Current Items", "Items Updated", "Items Created",
            "Items Deleted", "Items Expired"
        ])
        self.table.add_row([
            str(self.initial_items),
            str(self.final_items),
            str(self.update_start) + "-" + str(self.update_end),
            str(self.create_start) + "-" + str(self.create_end),
            str(self.delete_start) + "-" + str(self.delete_end),
            str(self.expire_start) + "-" + str(self.expire_end)
        ])
        self.table.display("Docs statistics")

    def Volume(self):
        #######################################################################
        self.log.info("Step1: Create a n node cluster")
        if self.nodes_init > 1:
            nodes_init = self.cluster.servers[1:self.nodes_init]
            self.task.rebalance([self.cluster.master], nodes_init, [])
            self.cluster.nodes_in_cluster.extend([self.cluster.master] +
                                                 nodes_init)

        #######################################################################
        self.log.info("Step 2 & 3: Create required buckets.")
        self.bucket = self.create_required_buckets()
        self.loop = 0
        scope_name = "VolumeScope"
        collection_prefix = "VolumeCollection"
        self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                      {"name": scope_name})
        for i in range(self.num_collections):
            collection_name = collection_prefix + str(i)
            self.log.info("Creating scope::collection '%s::%s'" %
                          (scope_name, collection_name))
            self.bucket_util.create_collection(self.cluster.master,
                                               self.bucket, scope_name,
                                               {"name": collection_name})
            self.sleep(2)
        #######################################################################
        while self.loop < self.iterations:
            self.log.info("Step 4: Pre-Requisites for Loading of docs")
            self.bucket_util.add_rbac_user()
            self.generate_docs(doc_ops="create")
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            for task in tasks_info:
                self.task.jython_task_manager.get_task_result(task)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            self.create_perc = self.input.param("create_perc", 100)
            ###################################################################
            self.log.info("Step 5: Rebalance in with Loading of docs")
            self.generate_docs(doc_ops="create")
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)

            ###################################################################
            self.log.info("Step 6: Rebalance Out with Loading of docs")
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=0, nodes_out=1)
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)

            ###################################################################
            self.log.info("Step 7: Rebalance In_Out with Loading of docs")
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=2, nodes_out=1)
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)

            ###################################################################
            self.log.info("Step 8: Swap with Loading of docs")
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=1)
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)

            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)

            ###################################################################
            self.log.info("Step 9: Updating the bucket replica to 2")
            bucket_helper = BucketHelper(self.cluster.master)
            for i in range(len(self.bucket_util.buckets)):
                bucket_helper.change_bucket_props(self.bucket_util.buckets[i],
                                                  replicaNumber=2)
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)

            ###################################################################
            self.log.info("Step 10: Stopping and restarting memcached process")
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            rebalance_task = self.task.async_rebalance(self.cluster.servers,
                                                       [], [])
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")

            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.stop_process()
            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)

            ###################################################################
            self.log.info(
                "Step 11: Failover a node and RebalanceOut that node \
            with loading in parallel")
            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0

            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)

            disk_replica_dataset, disk_active_dataset = self.bucket_util.\
                get_and_compare_active_replica_data_set_all(
                    self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                    path=None)

            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                       howmany=1)

            # Mark Node for failover
            self.generate_docs()
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                           graceful=True)
            self.sleep(10)
            self.rest.monitorRebalance()
            self.nodes = self.rest.node_statuses()
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[self.chosen[0].id])
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                            msg="Rebalance failed")

            servs_out = [
                node for node in self.cluster.servers
                if node.ip == self.chosen[0].ip
            ]
            self.cluster.nodes_in_cluster = list(
                set(self.cluster.nodes_in_cluster) - set(servs_out))
            self.available_servers += servs_out

            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())

            self.bucket_util.compare_failovers_logs(
                prev_failover_stats, self.cluster.nodes_in_cluster,
                self.bucket_util.buckets)

            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset,
                disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets,
                path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes,
                buckets=self.bucket_util.buckets,
                num_replicas=2,
                std=std,
                total_vbuckets=self.cluster_util.vbuckets)
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)

            ###################################################################
            self.log.info("Step 12: Failover a node and FullRecovery\
             that node")

            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0

            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)

            disk_replica_dataset, disk_active_dataset = self.bucket_util.\
                get_and_compare_active_replica_data_set_all(
                    self.cluster.nodes_in_cluster,
                    self.bucket_util.buckets,
                    path=None)

            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                       howmany=1)

            self.generate_docs()
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            # Mark Node for failover
            self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                           graceful=True)
            self.sleep(10)
            self.rest.monitorRebalance()
            # Mark Node for full recovery
            if self.success_failed_over:
                self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                            recoveryType="full")

            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)

            rebalance_task = self.task.async_rebalance(
                self.cluster.servers[:self.nodes_init], [], [])

            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")

            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")

            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())

            self.bucket_util.compare_failovers_logs(
                prev_failover_stats, self.cluster.nodes_in_cluster,
                self.bucket_util.buckets)

            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset,
                disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets,
                path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes,
                buckets=self.bucket_util.buckets,
                num_replicas=2,
                std=std,
                total_vbuckets=self.cluster_util.vbuckets)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)

            ###################################################################
            self.log.info("Step 13: Failover a node and DeltaRecovery that \
            node with loading in parallel")

            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0

            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)

            disk_replica_dataset, disk_active_dataset = self.bucket_util.\
                get_and_compare_active_replica_data_set_all(
                    self.cluster.nodes_in_cluster,
                    self.bucket_util.buckets,
                    path=None)

            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                       howmany=1)

            self.generate_docs()
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            # Mark Node for failover
            self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                           graceful=True)
            self.sleep(10)
            self.rest.monitorRebalance()
            if self.success_failed_over:
                self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                            recoveryType="delta")
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)

            rebalance_task = self.task.async_rebalance(
                self.cluster.servers[:self.nodes_init], [], [])
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")

            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())

            self.bucket_util.compare_failovers_logs(
                prev_failover_stats, self.cluster.nodes_in_cluster,
                self.bucket_util.buckets)

            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset,
                disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets,
                path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes,
                buckets=self.bucket_util.buckets,
                num_replicas=2,
                std=std,
                total_vbuckets=self.cluster_util.vbuckets)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)

            #######################################################################
            self.log.info("Step 14: Updating the bucket replica to 1")
            bucket_helper = BucketHelper(self.cluster.master)
            for i in range(len(self.bucket_util.buckets)):
                bucket_helper.change_bucket_props(self.bucket_util.buckets[i],
                                                  replicaNumber=1)
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            rebalance_task = self.task.async_rebalance(self.cluster.servers,
                                                       [], [])
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")

            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)

            #######################################################################
            self.log.info("Step 15: Flush the bucket and \
            start the entire process again")
            self.loop += 1
            if self.loop < self.iterations:
                # Flush the bucket
                self.bucket_util.flush_all_buckets(self.cluster.master)
                self.sleep(10)
                if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                    nodes_cluster = self.cluster.nodes_in_cluster[:]
                    nodes_cluster.remove(self.cluster.master)
                    servs_out = random.sample(
                        nodes_cluster,
                        int(
                            len(self.cluster.nodes_in_cluster) -
                            self.nodes_init))
                    rebalance_task = self.task.async_rebalance(
                        self.cluster.servers[:self.nodes_init], [], servs_out)

                    self.task.jython_task_manager.get_task_result(
                        rebalance_task)
                    self.available_servers += servs_out
                    self.cluster.nodes_in_cluster = list(
                        set(self.cluster.nodes_in_cluster) - set(servs_out))
                    self.get_bucket_dgm(self.bucket)
            else:
                self.log.info("Volume Test Run Complete")
                self.get_bucket_dgm(self.bucket)

    def SteadyStateVolume(self):
        #######################################################################
        self.log.info("Step 1: Create a n node cluster")
        if self.nodes_init > 1:
            nodes_init = self.cluster.servers[1:self.nodes_init]
            self.task.rebalance([self.cluster.master], nodes_init, [])
            self.cluster.nodes_in_cluster.extend([self.cluster.master] +
                                                 nodes_init)

        #######################################################################
        self.log.info("Step 2: Create required buckets.")
        self.bucket = self.create_required_buckets()
        self.loop = 0
        scope_name = "VolumeScope"
        collection_prefix = "VolumeCollection"
        self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                      {"name": scope_name})
        for i in range(self.num_collections):
            collection_name = collection_prefix + str(i)
            self.log.info("Creating scope::collection '%s::%s'" %
                          (scope_name, collection_name))
            self.bucket_util.create_collection(self.cluster.master,
                                               self.bucket, scope_name,
                                               {"name": collection_name})
            self.sleep(2)
        #######################################################################
        self.log.info("Step 3: Per-Requisites for Loading of docs")

        self.create_perc = 100
        _iter = 0
        while _iter < 2:
            self.generate_docs(doc_ops="create")
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.data_validation(tasks_info, check_docs=False)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            _iter += 1

        _iter = 0
        self.update_perc = 100
        while _iter < 10:
            self.generate_docs(doc_ops="update")
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            _iter += 1

        for i in range(1, self.num_collections, 2):
            collection_name = collection_prefix + str(i)
            self.bucket_util.drop_collection(self.cluster.master, self.bucket,
                                             scope_name, collection_name)
            self.bucket.scopes[scope_name].collections.pop(collection_name)

        self.update_perc = self.input.param("update_perc", 100)
        self.create_perc = self.input.param("create_perc", 100)
        _iter = 0
        while _iter < 10:
            self.generate_docs()
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.data_validation(
                tasks_info,
                scope=scope_name,
                collections=self.bucket.scopes[scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            _iter += 1
Exemplo n.º 13
0
    def test_basic_ops(self):
        """
        Basic test for Sub-doc CRUD operations
        """
        doc_op = self.input.param("op_type", None)
        def_bucket = self.bucket_util.buckets[0]
        supported_d_levels = self.bucket_util.get_supported_durability_levels()

        # Stat validation reference variables
        verification_dict = dict()
        verification_dict["ops_create"] = self.num_items
        verification_dict["ops_update"] = 0
        verification_dict["ops_delete"] = 0
        verification_dict["rollback_item_count"] = 0
        verification_dict["sync_write_aborted_count"] = 0
        verification_dict["sync_write_committed_count"] = 0

        if self.durability_level in supported_d_levels:
            verification_dict["sync_write_committed_count"] += self.num_items

        # Initial validation
        failed = self.durability_helper.verify_vbucket_details_stats(
            def_bucket,
            self.cluster_util.get_kv_nodes(),
            vbuckets=self.cluster_util.vbuckets,
            expected_val=verification_dict)
        if failed:
            self.fail("Cbstat vbucket-details verification failed")

        if self.target_vbucket and type(self.target_vbucket) is not list:
            self.target_vbucket = [self.target_vbucket]

        self.log.info("Creating doc_generator..")
        # Load basic docs into bucket
        doc_create = sub_doc_generator(self.key,
                                       0,
                                       self.num_items,
                                       key_size=self.key_size,
                                       doc_size=self.sub_doc_size,
                                       target_vbucket=self.target_vbucket,
                                       vbuckets=self.cluster_util.vbuckets)
        self.log.info("Loading {0} docs into the bucket: {1}".format(
            self.num_items, def_bucket))
        task = self.task.async_load_gen_sub_docs(
            self.cluster,
            def_bucket,
            doc_create,
            DocLoading.Bucket.SubDocOps.INSERT,
            self.maxttl,
            path_create=True,
            batch_size=10,
            process_concurrency=8,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(task)

        self.log.info("Wait for ep_all_items_remaining to become '0'")
        self.bucket_util._wait_for_stats_all_buckets()

        # Update verification_dict and validate
        verification_dict["ops_update"] += self.num_items
        if self.durability_level in supported_d_levels:
            verification_dict["sync_write_committed_count"] += self.num_items

        failed = self.durability_helper.verify_vbucket_details_stats(
            def_bucket,
            self.cluster_util.get_kv_nodes(),
            vbuckets=self.cluster_util.vbuckets,
            expected_val=verification_dict)
        if failed:
            self.fail("Cbstat vbucket-details verification failed")

        # Verify initial doc load count
        self.log.info("Validating doc_count in buckets")
        self.bucket_util.verify_stats_all_buckets(self.num_items)

        self.log.info("Creating doc_generator for doc_op")
        num_item_start_for_crud = int(self.num_items / 2)

        template_index = 0
        if doc_op == DocLoading.Bucket.SubDocOps.REMOVE:
            template_index = 2

        sub_doc_gen = sub_doc_generator_for_edit(self.key,
                                                 start=0,
                                                 end=num_item_start_for_crud,
                                                 key_size=self.key_size,
                                                 template_index=template_index)

        if doc_op == DocLoading.Bucket.SubDocOps.UPSERT:
            self.log.info("Performing 'upsert' mutation over the sub-docs")
            task = self.task.async_load_gen_sub_docs(
                self.cluster,
                def_bucket,
                sub_doc_gen,
                doc_op,
                self.maxttl,
                path_create=True,
                batch_size=10,
                process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)
            verification_dict["ops_update"] += \
                (sub_doc_gen.end - sub_doc_gen.start
                 + len(task.fail.keys()))
            if self.durability_level in supported_d_levels:
                verification_dict["sync_write_committed_count"] += \
                    num_item_start_for_crud

            # Edit doc_gen template to read the mutated value as well
            sub_doc_gen.template = \
                sub_doc_gen.template.replace(" }}", ", \"mutated\": \"\" }}")
            # Read all the values to validate update operation
            task = self.task.async_load_gen_sub_docs(
                self.cluster,
                def_bucket,
                sub_doc_gen,
                "read",
                0,
                batch_size=100,
                process_concurrency=8,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            op_failed_tbl = TableView(self.log.error)
            op_failed_tbl.set_headers(["Update failed key", "Value"])
            for key, value in task.success.items():
                doc_value = value["value"]
                failed_row = [key, doc_value]
                if doc_value[0] != 2:
                    op_failed_tbl.add_row(failed_row)
                elif doc_value[1] != "LastNameUpdate":
                    op_failed_tbl.add_row(failed_row)
                elif doc_value[2] != "TypeChange":
                    op_failed_tbl.add_row(failed_row)
                elif doc_value[3] != "CityUpdate":
                    op_failed_tbl.add_row(failed_row)
                elif json.loads(str(doc_value[4])) != ["get", "up"]:
                    op_failed_tbl.add_row(failed_row)

            op_failed_tbl.display("Update failed for keys:")
            if len(op_failed_tbl.rows) != 0:
                self.fail("Update failed for few keys")
        elif doc_op == DocLoading.Bucket.SubDocOps.REMOVE:
            self.log.info("Performing 'remove' mutation over the sub-docs")
            task = self.task.async_load_gen_sub_docs(
                self.cluster,
                def_bucket,
                sub_doc_gen,
                doc_op,
                0,
                batch_size=10,
                process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            verification_dict["ops_update"] += \
                (sub_doc_gen.end - sub_doc_gen.start
                 + len(task.fail.keys()))
            if self.durability_level in supported_d_levels:
                verification_dict["sync_write_committed_count"] += \
                    num_item_start_for_crud

            # Edit doc_gen template to read the mutated value as well
            sub_doc_gen.template = sub_doc_gen.template \
                .replace(" }}", ", \"mutated\": \"\" }}")
            # Read all the values to validate update operation
            task = self.task.async_load_gen_sub_docs(
                self.cluster,
                def_bucket,
                sub_doc_gen,
                "read",
                0,
                batch_size=100,
                process_concurrency=8,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            op_failed_tbl = TableView(self.log.error)
            op_failed_tbl.set_headers(["Delete failed key", "Value"])

            for key, value in task.success.items():
                doc_value = value["value"]
                failed_row = [key, doc_value]
                if doc_value[0] != 2:
                    op_failed_tbl.add_row(failed_row)
                for index in range(1, len(doc_value)):
                    if doc_value[index] != "PATH_NOT_FOUND":
                        op_failed_tbl.add_row(failed_row)

            for key, value in task.fail.items():
                op_failed_tbl.add_row([key, value["value"]])

            op_failed_tbl.display("Delete failed for keys:")
            if len(op_failed_tbl.rows) != 0:
                self.fail("Delete failed for few keys")
        else:
            self.log.warning("Unsupported doc_operation")

        self.log.info("Wait for ep_all_items_remaining to become '0'")
        self.bucket_util._wait_for_stats_all_buckets()

        # Validate verification_dict and validate
        failed = self.durability_helper.verify_vbucket_details_stats(
            def_bucket,
            self.cluster_util.get_kv_nodes(),
            vbuckets=self.cluster_util.vbuckets,
            expected_val=verification_dict)
        if failed:
            self.fail("Cbstat vbucket-details verification failed")

        self.log.info("Validating doc_count")
        self.bucket_util.verify_stats_all_buckets(self.num_items)
Exemplo n.º 14
0
    def test_basic_ops(self):
        """
        Basic test for Sub-doc CRUD operations

        A test in which `self.num_items` documents are created. Half of the
        documents are updated or deleted depending on the supplied `op_type`.
        """
        doc_op = self.input.param("op_type", None)
        def_bucket = self.cluster.buckets[0]

        # Stat validation reference variables
        verification_dict = dict()
        verification_dict["ops_create"] = self.num_items
        verification_dict["ops_update"] = 0
        verification_dict["ops_delete"] = 0
        verification_dict["rollback_item_count"] = 0
        verification_dict["sync_write_aborted_count"] = 0
        verification_dict["sync_write_committed_count"] = 0

        if self.is_sync_write_enabled:
            verification_dict["sync_write_committed_count"] += self.num_items

        # Initial validation
        failed = self.durability_helper.verify_vbucket_details_stats(
            def_bucket, self.cluster_util.get_kv_nodes(self.cluster),
            vbuckets=self.cluster.vbuckets,
            expected_val=verification_dict)
        if failed:
            self.fail("Cbstat vbucket-details verification failed")

        if self.target_vbucket and type(self.target_vbucket) is not list:
            self.target_vbucket = [self.target_vbucket]

        self.log.info("Creating doc_generator..")
        # Insert `self.num_items` documents
        doc_create = sub_doc_generator(
            self.key, 0, self.num_items,
            key_size=self.key_size,
            doc_size=self.sub_doc_size,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster.vbuckets)
        self.log.info("Loading {0} docs into the bucket: {1}"
                      .format(self.num_items, def_bucket))
        task = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, doc_create,
            DocLoading.Bucket.SubDocOps.INSERT, self.maxttl,
            path_create=True,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(task)

        self.log.info("Wait for ep_all_items_remaining to become '0'")
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)

        # The documents that could not be inserted
        insert_failures = len(task.fail.keys())

        # Update verification_dict and validate
        verification_dict["ops_update"] += self.num_items - insert_failures
        if self.is_sync_write_enabled:
            verification_dict["sync_write_committed_count"] += self.num_items - insert_failures
            verification_dict["sync_write_aborted_count"] += insert_failures

        failed = self.durability_helper.verify_vbucket_details_stats(
            def_bucket, self.cluster_util.get_kv_nodes(self.cluster),
            vbuckets=self.cluster.vbuckets,
            expected_val=verification_dict)
        if failed:
            self.fail("Cbstat vbucket-details verification failed")

        # Verify initial doc load count
        self.log.info("Validating doc_count in buckets")
        self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)

        self.log.info("Creating doc_generator for doc_op")
        num_item_start_for_crud = int(self.num_items / 2)

        template_index = 0
        if doc_op == DocLoading.Bucket.SubDocOps.REMOVE:
            template_index = 2

        sub_doc_gen = sub_doc_generator_for_edit(
            self.key,
            start=0,
            end=num_item_start_for_crud,
            key_size=self.key_size,
            template_index=template_index)

        if doc_op == DocLoading.Bucket.SubDocOps.UPSERT:
            self.log.info("Performing 'upsert' mutation over the sub-docs")
            task = self.task.async_load_gen_sub_docs(
                self.cluster, def_bucket, sub_doc_gen, doc_op, self.maxttl,
                path_create=True,
                batch_size=10, process_concurrency=8,
                replicate_to=self.replicate_to, persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            # The documents keys for which the update failed
            update_failures = len(task.fail.keys())

            verification_dict["ops_update"] += \
                num_item_start_for_crud - update_failures

            if self.is_sync_write_enabled:
                verification_dict["sync_write_committed_count"] += \
                    num_item_start_for_crud - update_failures

            # Edit doc_gen template to read the mutated value as well
            sub_doc_gen.template = \
                sub_doc_gen.template.replace(" }}", ", \"mutated\": \"\" }}")
            # Read all the values to validate update operation
            task = self.task.async_load_gen_sub_docs(
                self.cluster, def_bucket, sub_doc_gen, "read", 0,
                batch_size=100, process_concurrency=8,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            # A set of expected values following a read operation
            expected_values = {'StateUpdate', 2, 'LastNameUpdate',
                               'TypeChange', 'CityUpdate', 'FirstNameUpdate'}

            op_failed_tbl = TableView(self.log.error)
            op_failed_tbl.set_headers(["Update failed key", "Value"])

            # If the values of attributes does not match the
            # expected value, append op to list of failed ops.
            for key, value in task.success.items():
                if expected_values != set(value["value"]):
                    op_failed_tbl.add_row([key, value["value"]])

            op_failed_tbl.display("Update failed for keys:")
            # Expect the non-updated values to match the update failures
            self.assertEqual(len(op_failed_tbl.rows), update_failures, "")
        elif doc_op == DocLoading.Bucket.SubDocOps.REMOVE:
            self.log.info("Performing 'remove' mutation over the sub-docs")
            task = self.task.async_load_gen_sub_docs(
                self.cluster, def_bucket, sub_doc_gen, doc_op, 0,
                batch_size=10, process_concurrency=8,
                replicate_to=self.replicate_to, persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            # The number of documents that could not be removed
            remove_failures = len(task.fail.keys())

            verification_dict["ops_update"] += \
                num_item_start_for_crud - remove_failures

            if self.is_sync_write_enabled:
                verification_dict["sync_write_committed_count"] += \
                    num_item_start_for_crud - remove_failures

            # Edit doc_gen template to read the mutated value as well
            sub_doc_gen.template = sub_doc_gen.template \
                .replace(" }}", ", \"mutated\": \"\" }}")
            # Read all the values to validate update operation
            task = self.task.async_load_gen_sub_docs(
                self.cluster, def_bucket, sub_doc_gen, "read", 0,
                batch_size=100, process_concurrency=8,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            op_failed_tbl = TableView(self.log.error)
            op_failed_tbl.set_headers(["Delete failed key", "Value"])

            # Collect read operations that failed
            for key, value in task.fail.items():
                op_failed_tbl.add_row([key, value["error"]])

            op_failed_tbl.display("Delete succeeded for keys:")

            # Expect the reads to have failed indicating the sub-documents are
            # no longer accessible.
            self.assertEqual(len(op_failed_tbl.rows),
                             num_item_start_for_crud, "Delete failed for few keys")
        else:
            self.log.warning("Unsupported doc_operation")

        self.log.info("Wait for ep_all_items_remaining to become '0'")
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)

        # Validate verification_dict and validate
        failed = self.durability_helper.verify_vbucket_details_stats(
            def_bucket, self.cluster_util.get_kv_nodes(self.cluster),
            vbuckets=self.cluster.vbuckets,
            expected_val=verification_dict)
        if failed:
            self.fail("Cbstat vbucket-details verification failed")

        self.log.info("Validating doc_count")
        self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)
    def test_timeout_with_crud_failures(self):
        """
        Test to make sure timeout is handled in durability calls
        and no documents are loaded when durability cannot be met using
        error simulation in server node side

        This will validate failure in majority of nodes, where durability will
        surely fail for all CRUDs

        1. Select a node from the cluster to simulate the specified error
        2. Perform CRUD on the target bucket with given timeout
        3. Using cbstats to verify no operations succeeds
        4. Revert the error scenario from the cluster to resume durability
        5. Validate all mutations are succeeded after reverting
           the error condition

        Note: self.sdk_timeout values is considered as 'seconds'
        """

        # Local methods to validate vb_seqno

        def compare_vb_stat(stat_1, stat_2, vb, comparison="!="):
            keys_to_check = ["high_seqno", "high_completed_seqno"]
            result = True
            for key in keys_to_check:
                if vb in stat_1.keys():
                    if stat_1[vb]["uuid"] != stat_2[vb]["uuid"]:
                        self.log_failure(
                            "Mismatch in vb-%s UUID. %s != %s" %
                            (vb, stat_1[vb]["uuid"], stat_2[vb]["uuid"]))
                    if comparison == "!=":
                        if stat_1[vb][key] != stat_2[vb][key]:
                            result = False
                            self.log.warning(
                                "Mismatch in vb-%s stat %s. %s != %s" %
                                (vb, key, stat_1[vb][key], stat_2[vb][key]))
                    elif stat_1[vb][key] == stat_2[vb][key]:
                        result = False
                        self.log.warning(
                            "Stat not updated for vb-%s stat %s. "
                            "%s == %s" %
                            (vb, key, stat_1[vb][key], stat_2[vb][key]))
            return result

        def validate_vb_seqno_stats():
            """
            :return retry_validation: Boolean denoting to retry validation
            """
            retry_validation = False
            vb_info["post_timeout"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            for tem_vb_num in range(self.cluster_util.vbuckets):
                tem_vb_num = str(tem_vb_num)
                if tem_vb_num not in affected_vbs:
                    if compare_vb_stat(vb_info["init"][node.ip],
                                       vb_info["post_timeout"][node.ip],
                                       tem_vb_num) is False:
                        self.log_failure("Unaffected vb-%s stat" % tem_vb_num)
                elif int(tem_vb_num) in target_nodes_vbuckets["active"]:
                    if compare_vb_stat(vb_info["init"][node.ip],
                                       vb_info["post_timeout"][node.ip],
                                       tem_vb_num) is False:
                        self.log.warning("%s - mismatch in %s vb-%s seq_no" %
                                         (node.ip, "active", tem_vb_num))
                elif int(tem_vb_num) in target_nodes_vbuckets["replica"]:
                    if compare_vb_stat(vb_info["init"][node.ip],
                                       vb_info["post_timeout"][node.ip],
                                       tem_vb_num,
                                       comparison="==") is False:
                        retry_validation = True
                        self.log.warning("%s - mismatch in %s vb-%s seq_no" %
                                         (node.ip, "replica", tem_vb_num))
            return retry_validation

        shell_conn = dict()
        cbstat_obj = dict()
        error_sim = dict()
        target_nodes_vbuckets = dict()
        vb_info = dict()
        tasks = dict()
        doc_gen = dict()
        affected_vbs = list()

        target_nodes_vbuckets["active"] = []
        target_nodes_vbuckets["replica"] = []
        vb_info["init"] = dict()
        vb_info["post_timeout"] = dict()
        vb_info["afterCrud"] = dict()

        # Override crud_batch_size to minimum value for testing
        self.crud_batch_size = 5
        self.key = "test_collections"
        self.sdk_timeout = 3

        # Select target vbucket type to load_docs
        target_vb_type = "replica"
        if self.simulate_error == CouchbaseError.STOP_PERSISTENCE \
                and self.durability_level \
                == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
            target_vb_type = "active"

        # Create required scope/collection for successful CRUD operation
        if self.scope_name != CbServer.default_scope:
            self.scope_name = self.bucket_util.get_random_name()
        self.collection_name = self.bucket_util.get_random_name()
        self.log.info("Creating scope::collection %s::%s" %
                      (self.scope_name, self.collection_name))
        self.create_scope_collection()

        # Load docs into created collection
        self.log.info("Loading data into created collection")
        load_gen = doc_generator(self.key, 0, self.num_items)
        task = self.task.async_load_gen_docs(
            self.cluster,
            self.bucket,
            load_gen,
            "create",
            0,
            scope=self.scope_name,
            collection=self.collection_name,
            sdk_client_pool=self.sdk_client_pool,
            batch_size=200,
            process_concurrency=8,
            timeout_secs=60)
        self.task_manager.get_task_result(task)
        if self.subdoc_test:
            load_gen = sub_doc_generator(self.key, 0, self.num_items / 2)
            task = self.task.async_load_gen_sub_docs(
                self.cluster,
                self.bucket,
                load_gen,
                Bucket_Op.SubDocOps.INSERT,
                timeout_secs=self.sdk_timeout,
                compression=self.sdk_compression,
                path_create=True,
                batch_size=100,
                process_concurrency=8,
                durability=self.durability_level,
                scope=self.scope_name,
                collection=self.collection_name,
                sdk_client_pool=self.sdk_client_pool)
            self.task_manager.get_task_result(task)

        self.bucket.scopes[self.scope_name].collections[
            self.collection_name].num_items = self.num_items

        target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                       self.nodes_init,
                                                       self.num_nodes_affected)
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            target_nodes_vbuckets["active"] += \
                cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                                 vbucket_type="active")
            target_nodes_vbuckets["replica"] += \
                cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                                 vbucket_type="replica")
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])

        curr_time = int(time.time())
        expected_timeout = curr_time + self.sdk_timeout

        if target_vb_type == "active":
            target_vbs = list(
                set(target_nodes_vbuckets[target_vb_type]).difference(
                    set(target_nodes_vbuckets["replica"])))
        else:
            target_vbs = list(
                set(target_nodes_vbuckets[target_vb_type]).difference(
                    set(target_nodes_vbuckets["active"])))

        # Create required doc_generators
        doc_gen["create"] = doc_generator(self.key,
                                          self.num_items,
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)
        doc_gen["delete"] = doc_generator(self.key,
                                          0,
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)
        doc_gen["read"] = doc_generator(self.key,
                                        int(self.num_items / 3),
                                        self.crud_batch_size,
                                        target_vbucket=target_vbs)
        doc_gen["update"] = doc_generator(self.key,
                                          int(self.num_items / 2),
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)

        # Create required subdoc generators
        doc_gen["insert"] = sub_doc_generator(self.key,
                                              int(self.num_items / 2),
                                              self.crud_batch_size,
                                              target_vbucket=target_vbs)
        doc_gen["upsert"] = sub_doc_generator_for_edit(
            self.key,
            0,
            self.crud_batch_size,
            template_index=1,
            target_vbucket=target_vbs)
        doc_gen["remove"] = sub_doc_generator(self.key,
                                              0,
                                              self.crud_batch_size,
                                              target_vbucket=target_vbs)

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)
        self.sleep(5, "Wait for error_simulation to take effect")

        ops_to_perform = [
            Bucket_Op.DocOps.CREATE, Bucket_Op.DocOps.UPDATE,
            Bucket_Op.DocOps.READ, Bucket_Op.DocOps.DELETE
        ]
        if self.subdoc_test:
            ops_to_perform = [
                Bucket_Op.SubDocOps.INSERT, Bucket_Op.SubDocOps.UPSERT,
                Bucket_Op.SubDocOps.REMOVE
            ]

        for op_type in ops_to_perform:
            self.log.info("Starting doc op %s" % op_type)
            if op_type in Bucket_Op.DOC_OPS:
                tasks[op_type] = self.task.async_load_gen_docs(
                    self.cluster,
                    self.bucket,
                    doc_gen[op_type],
                    op_type,
                    0,
                    scope=self.scope_name,
                    collection=self.collection_name,
                    sdk_client_pool=self.sdk_client_pool,
                    batch_size=1,
                    process_concurrency=8,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    suppress_error_table=True,
                    print_ops_rate=False,
                    skip_read_on_error=True)
            else:
                tasks[op_type] = self.task.async_load_gen_sub_docs(
                    self.cluster,
                    self.bucket,
                    doc_gen[op_type],
                    op_type,
                    0,
                    scope=self.scope_name,
                    collection=self.collection_name,
                    sdk_client_pool=self.sdk_client_pool,
                    path_create=True,
                    batch_size=1,
                    process_concurrency=8,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    print_ops_rate=False)

            self.task.jython_task_manager.get_task_result(tasks[op_type])

            # Validate task failures
            if op_type == Bucket_Op.DocOps.READ:
                # Validation for read task
                if len(tasks[op_type].fail.keys()) != 0:
                    self.log_failure("Read failed for few docs: %s" %
                                     tasks[op_type].fail.keys())
            else:
                # Validation of CRUDs - Update / Create / Delete
                for doc_id, crud_result in tasks[op_type].fail.items():
                    vb_num = self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster_util.vbuckets)
                    if SDKException.DurabilityAmbiguousException \
                            not in str(crud_result["error"]):
                        self.log_failure(
                            "Invalid exception for doc %s, vb %s: %s" %
                            (doc_id, vb_num, crud_result))

        # Revert the specified error scenario
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # Check whether the timeout triggered properly
        if int(time.time()) < expected_timeout:
            self.log_failure("Timed-out before expected time")

        for op_type in ops_to_perform:
            if op_type == Bucket_Op.DocOps.READ:
                continue
            while doc_gen[op_type].has_next():
                doc_id, _ = doc_gen[op_type].next()
                affected_vbs.append(
                    str(
                        self.bucket_util.get_vbucket_num_for_key(
                            doc_id, self.cluster_util.vbuckets)))

        affected_vbs = list(set(affected_vbs))
        # Fetch latest stats and validate the seq_nos are not updated
        for node in target_nodes:
            retry_count = 0
            max_retry = 3
            while retry_count < max_retry:
                self.log.info("Trying to validate vbseq_no stats: %d" %
                              (retry_count + 1))
                retry_count += 1
                retry_required = validate_vb_seqno_stats()
                if not retry_required:
                    break
                self.sleep(5, "Sleep for vbseq_no stats to update")
            else:
                # This will be exited only if `break` condition is not met
                self.log_failure("validate_vb_seqno_stats verification failed")

        self.validate_test_failure()

        # Get SDK Client from client_pool
        sdk_client = self.sdk_client_pool.get_client_for_bucket(
            self.bucket, self.scope_name, self.collection_name)

        # Doc error validation
        for op_type in ops_to_perform:
            task = tasks[op_type]

            if self.nodes_init == 1 \
                    and op_type != Bucket_Op.DocOps.READ \
                    and len(task.fail.keys()) != (doc_gen[op_type].end
                                                  - doc_gen[op_type].start):
                self.log_failure(
                    "Failed keys %d are less than expected %d" %
                    (len(task.fail.keys()),
                     (doc_gen[op_type].end - doc_gen[op_type].start)))

            # Create table objects for display
            table_view = TableView(self.log.error)
            ambiguous_table_view = TableView(self.log.info)
            table_view.set_headers(["Key", "vBucket", "Exception"])
            ambiguous_table_view.set_headers(["Key", "vBucket"])

            # Iterate failed keys for validation
            for doc_key, doc_info in task.fail.items():
                vb_for_key = self.bucket_util.get_vbucket_num_for_key(doc_key)

                if SDKException.DurabilityAmbiguousException \
                        not in str(doc_info["error"]):
                    table_view.add_row(
                        [doc_key, vb_for_key, doc_info["error"]])

                ambiguous_table_view.add_row([doc_key, str(vb_for_key)])
                if op_type not in Bucket_Op.SUB_DOC_OPS:
                    retry_success = \
                        self.durability_helper.retry_for_ambiguous_exception(
                            sdk_client, op_type, doc_key, doc_info)
                    if not retry_success:
                        self.log_failure("%s failed in retry for %s" %
                                         (op_type, doc_key))

            # Display the tables (if any errors)
            table_view.display("Unexpected exception during %s" % op_type)
            ambiguous_table_view.display("D_Ambiguous exception during %s" %
                                         op_type)

        # Release the acquired client
        self.sdk_client_pool.release_client(sdk_client)

        # Verify doc count after expected CRUD failure
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()

        # Fetch latest stats and validate the values are updated
        for node in target_nodes:
            vb_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
                self.log_failure("vBucket seq_no stats not updated")

        # Disconnect the shell connection
        for node in target_nodes:
            shell_conn[node.ip].disconnect()

        self.validate_test_failure()
Exemplo n.º 16
0
class ConcurrentFailoverTests(AutoFailoverBaseTest):
    def setUp(self):
        super(ConcurrentFailoverTests, self).setUp()

        self.log_setup_status(self.__class__.__name__, "started",
                              self.setUp.__name__)

        #######################################################################
        # List of params to be used for failover
        # self.timeout from AutoFailoverBaseTest
        # self.max_count from AutoFailoverBaseTest

        # To track the triggered failover events
        self.fo_events = 0

        # failover_order to be used for failover_order_tests
        # Format:
        #   kv:kv-kv:index_query
        #   * Iteration marked by '-'
        #   * Nodes marked by ':'
        #   * Service within a node denoted by '_' (underscore)
        # In the above case,
        # - Loop #0 :: 2 KV nodes will be failed
        # - Loop #1 :: 1 KV + 1 node running n1ql+index will be failed
        self.failover_order = \
            self.input.param("failover_order", "kv").split("-")
        self.failover_method = \
            self.input.param("failover_method", CouchbaseError.STOP_MEMCACHED)
        # Failover type determines the order of FO (Auto/Graceful/Hard).
        # Length of this should match the len(self.failover_order)
        # Example -> auto-graceful-auto
        # This expects first set of nodes from failover_order to undergo
        # AUTO FO followed by GRACEFUL FO of nodes through API and then
        # followed by AUTO FO of 3rd set of nodes as defined by failover_order
        self.failover_type = \
            self.input.param("failover_type",
                             CbServer.Failover.Type.AUTO).split("-")
        # End of params to be used for failover
        #######################################################################

        self.load_during_fo = self.input.param("load_during_fo", False)

        self.log.info("Updating Auto-failover settings")
        self.rest.update_autofailover_settings(enabled=True,
                                               timeout=self.timeout,
                                               maxCount=self.max_count)

        # Find the bucket with least replica to check the Auto-FO possibility
        self.min_bucket_replica = Bucket.ReplicaNum.THREE
        for bucket in self.cluster.buckets:
            if bucket.replicaNumber < self.min_bucket_replica:
                self.min_bucket_replica = bucket.replicaNumber

        # Hold the dict of {node_obj_to_fail: failover_type, ...}
        self.nodes_to_fail = None

        # To display test execution status
        self.test_status_tbl = TableView(self.log.critical)
        self.auto_fo_settings_tbl = TableView(self.log.critical)
        self.test_status_tbl.set_headers(
            ["Node", "Services", "Node status", "Failover type"])
        self.auto_fo_settings_tbl.set_headers([
            "Enabled", "Auto FO count", "Max Events configured",
            "Auto FO timeout", "Disk Auto FO", "Disk Auto FO timeout"
        ])

        self.validate_failover_settings(True, self.timeout, 0, self.max_count)

        # Init sdk_client_pool if not initialized before
        if self.sdk_client_pool is None:
            self.init_sdk_pool_object()
            CollectionBase.create_sdk_clients(
                self.task_manager.number_of_threads, self.cluster.master,
                self.cluster.buckets, self.sdk_client_pool,
                self.sdk_compression)

        # Perform initial collection load
        self.__load_initial_collection_data()

        self.log_setup_status(self.__class__.__name__, "complete",
                              self.setUp.__name__)

    def tearDown(self):
        self.log_setup_status(self.__class__.__name__, "started",
                              self.tearDown.__name__)
        # Select KV node as a cluster master to perform tearDown rebalance out
        self.cluster_util.update_cluster_nodes_service_list(self.cluster)
        self.cluster.master = self.cluster.kv_nodes[0]

        self.log.info("Resetting auto-failover settings to default")
        self.rest.update_autofailover_settings(enabled=True,
                                               timeout=120,
                                               maxCount=1)
        self.log_setup_status(self.__class__.__name__, "complete",
                              self.tearDown.__name__)

        super(ConcurrentFailoverTests, self).tearDown()

    def __get_collection_load_spec(self, doc_ttl=0):
        """
        Set doc_ttl for loading doc during failover operations
        """
        d_level = Bucket.DurabilityLevel.NONE
        if self.num_replicas != Bucket.ReplicaNum.THREE:
            random.seed(round(time() * 1000))
            # Since durability is not supported with replicas=3
            d_level = choice([
                Bucket.DurabilityLevel.NONE, Bucket.DurabilityLevel.MAJORITY,
                Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
                Bucket.DurabilityLevel.PERSIST_TO_MAJORITY
            ])
        return {
            # Scope/Collection ops params
            MetaCrudParams.COLLECTIONS_TO_DROP:
            3,
            MetaCrudParams.SCOPES_TO_DROP:
            1,
            MetaCrudParams.SCOPES_TO_ADD_PER_BUCKET:
            3,
            MetaCrudParams.COLLECTIONS_TO_ADD_FOR_NEW_SCOPES:
            5,
            MetaCrudParams.COLLECTIONS_TO_ADD_PER_BUCKET:
            10,
            MetaCrudParams.BUCKET_CONSIDERED_FOR_OPS:
            "all",
            MetaCrudParams.SCOPES_CONSIDERED_FOR_OPS:
            "all",
            MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_OPS:
            "all",

            # Doc loading params
            "doc_crud": {
                MetaCrudParams.DocCrud.COMMON_DOC_KEY: "test_collections",
                MetaCrudParams.DocCrud.NUM_ITEMS_FOR_NEW_COLLECTIONS: 5000,
                MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION: 20,
                MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION: 10,
                MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION: 10,
                MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION: 10,
            },

            # Doc_loading task options
            MetaCrudParams.DOC_TTL:
            doc_ttl,
            MetaCrudParams.DURABILITY_LEVEL:
            d_level,
            MetaCrudParams.SKIP_READ_ON_ERROR:
            True,
            MetaCrudParams.SUPPRESS_ERROR_TABLE:
            False,
            # The below is to skip populating success dictionary for reads
            MetaCrudParams.SKIP_READ_SUCCESS_RESULTS:
            True,
            MetaCrudParams.RETRY_EXCEPTIONS: [],
            MetaCrudParams.IGNORE_EXCEPTIONS: [],
            MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD:
            "all",
            MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD:
            "all",
            MetaCrudParams.BUCKETS_CONSIDERED_FOR_CRUD:
            "all"
        }

    @property
    def num_nodes_to_be_failover(self):
        def is_safe_to_fo(service):
            is_safe = False

            # Reference doc:
            # https://docs.couchbase.com/server/7.0/learn/clusters-and-availability/automatic-failover.html#failover-policy
            # Service / Data loss check
            if service == CbServer.Services.KV:
                if self.min_bucket_replica > 0 \
                        and node_count[CbServer.Services.KV] > 2:
                    is_safe = True
            # elif service == CbServer.Services.INDEX:
            #     if node_count[CbServer.Services.INDEX] > 1:
            #         is_safe = True
            else:
                # All other services require at least 2 nodes to FO
                if node_count[service] > 1:
                    is_safe = True
            return is_safe

        def decr_node_count(service):
            node_count[service] -= 1
            if service == CbServer.Services.KV:
                self.min_bucket_replica -= 1

        fo_nodes = set()
        num_unreachable_nodes = 0
        active_cluster_nodes = len(self.rest.get_nodes(inactive=False))
        total_nodes = active_cluster_nodes + self.fo_events + self.nodes_in
        min_nodes_for_quorum = int(total_nodes / 2) + 1
        max_allowed_unreachable_nodes = total_nodes - min_nodes_for_quorum

        # Quorum check before checking individual services
        for _, failure_type in self.nodes_to_fail.items():
            if failure_type in ["stop_couchbase", "network_split"]:
                num_unreachable_nodes += 1
        if num_unreachable_nodes > max_allowed_unreachable_nodes:
            return 0
        # End of quorum check

        node_count = dict()
        node_count[CbServer.Services.KV] = len(self.cluster.kv_nodes)
        node_count[CbServer.Services.INDEX] = len(self.cluster.index_nodes)
        node_count[CbServer.Services.N1QL] = len(self.cluster.query_nodes)
        node_count[CbServer.Services.EVENTING] = len(
            self.cluster.eventing_nodes)
        node_count[CbServer.Services.BACKUP] = len(self.cluster.backup_nodes)

        kv_nodes = dict()
        non_kv_nodes = dict()
        for node, failure_type in self.nodes_to_fail.items():
            if CbServer.Services.KV in node.services:
                kv_nodes[node] = failure_type
            else:
                non_kv_nodes[node] = failure_type

        kv_service = CbServer.Services.KV
        for node, failure_type in kv_nodes.items():
            if kv_service in node.services:
                # KV takes priority over other nodes in deciding the Auto-FO
                if self.max_count > (len(fo_nodes) + self.fo_events) \
                        and is_safe_to_fo(kv_service):
                    fo_nodes.add(node)
                    for service_type in node.services:
                        # Decrement the node count for the service
                        decr_node_count(service_type)
                else:
                    self.log.warning("KV failover not possible")
                    # No nodes should be FO'ed if KV FO is not possible
                    fo_nodes = set()
                    # Break to make sure no other service failover
                    # will be expected
                    break
        else:
            nodes_not_failed = set()
            for node, failure_type in non_kv_nodes.items():
                # For other nodes, we need to check if the node running
                # other services are also safe to failover
                for service_type in node.services:
                    if self.max_count == (len(fo_nodes) + self.fo_events):
                        # Check to see whether the max_fo count is reached
                        self.log.info("Max auto-fo count already reached")
                        break
                    if not is_safe_to_fo(service_type):
                        self.log.warning("Service '%s' not safe to failover" %
                                         service_type)
                        for t_node in fo_nodes:
                            if service_type in t_node.services \
                                    and kv_service not in t_node.services:
                                nodes_not_failed.add(t_node)
                        break
                else:
                    fo_nodes.add(node)
                    for service_type in node.services:
                        # Decrement the node count for the service
                        decr_node_count(service_type)
            fo_nodes = fo_nodes.difference(nodes_not_failed)
        expected_num_nodes = len(fo_nodes)
        self.log.info("Expected nodes to be failed over: %d" %
                      expected_num_nodes)
        return expected_num_nodes

    def __get_server_obj(self, node):
        for server in self.cluster.servers:
            if server.ip == node.ip:
                return server

    def __update_server_obj(self):
        temp_data = self.nodes_to_fail
        self.nodes_to_fail = dict()
        for node_obj, fo_type in temp_data.items():
            self.nodes_to_fail[self.__get_server_obj(node_obj)] = fo_type

    def __load_initial_collection_data(self):
        load_spec = self.__get_collection_load_spec()
        load_spec[MetaCrudParams.SCOPES_TO_DROP] = 0
        load_spec[MetaCrudParams.COLLECTIONS_TO_DROP] = 0
        load_spec[MetaCrudParams.SCOPES_TO_ADD_PER_BUCKET] = 2
        load_spec[MetaCrudParams.COLLECTIONS_TO_ADD_FOR_NEW_SCOPES] = 5
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.NUM_ITEMS_FOR_NEW_COLLECTIONS] = 10000

    def __perform_doc_ops(self, durability=None, validate_num_items=True):
        load_spec = self.__get_collection_load_spec()
        if durability and self.num_replicas != Bucket.ReplicaNum.THREE:
            load_spec[MetaCrudParams.DURABILITY_LEVEL] = durability

        self.log.info("Performing doc_ops with durability level=%s" %
                      load_spec[MetaCrudParams.DURABILITY_LEVEL])
        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.cluster.buckets,
                load_spec,
                mutation_num=0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency)

        if doc_loading_task.result is False:
            self.fail("Collection CRUDs failure")

        if validate_num_items:
            # Verify initial doc load count
            self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                         self.cluster.buckets,
                                                         timeout=1200)
            self.bucket_util.validate_docs_per_collections_all_buckets(
                self.cluster)

    def get_nodes_to_fail(self, services_to_fail, dynamic_fo_method=False):
        nodes = dict()
        # Update the list of service-nodes mapping in the cluster object
        self.cluster_util.update_cluster_nodes_service_list(self.cluster)
        nodes_in_cluster = self.rest.get_nodes()
        for services in services_to_fail:
            node_services = set(services.split("_"))
            for index, node in enumerate(nodes_in_cluster):
                if node_services == set(node.services):
                    fo_type = self.failover_method
                    if dynamic_fo_method:
                        fo_type = "stop_couchbase"
                        if CbServer.Services.KV in node_services:
                            fo_type = CouchbaseError.STOP_MEMCACHED
                    nodes[node] = fo_type
                    # Remove the node to be failed to avoid double insertion
                    nodes_in_cluster.pop(index)
                    break
        return nodes

    def validate_failover_settings(self, enabled, timeout, count, max_count):
        settings = self.rest.get_autofailover_settings()
        self.auto_fo_settings_tbl.rows = list()
        self.auto_fo_settings_tbl.rows.append([
            str(settings.enabled),
            str(settings.count),
            str(settings.maxCount),
            str(settings.timeout),
            str(settings.failoverOnDataDiskIssuesEnabled),
            str(settings.failoverOnDataDiskIssuesTimeout)
        ])
        self.auto_fo_settings_tbl.display("Auto failover status:")

        err_msg = "Mismatch in '%s' field. " \
                  "Cluster FO data: " + str(settings.__dict__)
        self.assertEqual(settings.enabled, enabled, err_msg % "enabled")
        self.assertEqual(settings.timeout, timeout, err_msg % "timeout")
        self.assertEqual(settings.count, count, err_msg % "count")
        self.assertEqual(settings.maxCount, max_count, err_msg % "maxCount")

    def __display_failure_node_status(self, message):
        self.test_status_tbl.rows = list()
        cluster_nodes = self.rest.get_nodes(inactive=True)
        for node, fo_type in self.nodes_to_fail.items():
            node = [
                t_node for t_node in cluster_nodes if t_node.ip == node.ip
            ][0]
            self.test_status_tbl.add_row([
                node.ip, ",".join(node.services), node.clusterMembership,
                fo_type
            ])
        self.test_status_tbl.display(message)

    def __update_unaffected_node(self):
        cluster_nodes = self.rest.get_nodes()
        for cluster_node in cluster_nodes:
            for failure_node in self.nodes_to_fail:
                if cluster_node.ip == failure_node.ip:
                    break
            else:
                self.orchestrator = cluster_node
                self.rest = RestConnection(self.orchestrator)
                self.cluster.master = cluster_node
                self.log.info("Node for REST APIs: %s" % cluster_node.ip)
                break

    def test_max_events_range(self):
        """
        - Try setting max_events 1 to 100 (Valid)
        - Try setting 0 > max_events > 100 (Invalid - negative)
        - Current timeout_range (5-120seconds) should work"
        """

        self.log.info("Testing max_event counts")
        enable_failover = True
        timeout_val = 10
        max_plus_1 = CbServer.Failover.MAX_EVENTS + 1

        # Set max_events between (min, max)
        for num_events in range(CbServer.Failover.MIN_EVENTS, max_plus_1):
            status = self.rest.update_autofailover_settings(
                enable_failover, timeout_val, maxCount=num_events)
            self.assertTrue(status, "Failed to set max events=%s" % num_events)
            self.validate_failover_settings(enable_failover, timeout_val, 0,
                                            num_events)

        for num_events in [0, max_plus_1]:
            self.log.info("Testing max_event_count=%s" % num_events)
            status = self.rest.update_autofailover_settings(
                enable_failover, timeout_val, maxCount=max_plus_1)
            self.assertFalse(status, "Able to set max events=%s" % num_events)
            self.validate_failover_settings(enable_failover, timeout_val, 0,
                                            CbServer.Failover.MAX_EVENTS)

    def __run_test(self):
        # Validate count before the start of failover procedure
        self.validate_failover_settings(True, self.timeout, self.fo_events,
                                        self.max_count)

        # Before failure - nodes' information
        self.__display_failure_node_status("Nodes to be failed")

        try:
            rest_nodes = self.rest.get_nodes()
            if self.current_fo_strategy == CbServer.Failover.Type.AUTO:
                expected_fo_nodes = self.num_nodes_to_be_failover
                self.fo_events += expected_fo_nodes
                self.__update_server_obj()
                failover_task = ConcurrentFailoverTask(
                    task_manager=self.task_manager,
                    master=self.orchestrator,
                    servers_to_fail=self.nodes_to_fail,
                    expected_fo_nodes=self.fo_events,
                    task_type="induce_failure")
                self.task_manager.add_new_task(failover_task)
                self.task_manager.get_task_result(failover_task)
                if failover_task.result is False:
                    self.fail("Failure during concurrent failover procedure")
            elif self.current_fo_strategy == CbServer.Failover.Type.GRACEFUL:
                for node in self.nodes_to_fail:
                    node = [
                        t_node for t_node in rest_nodes if t_node.ip == node.ip
                    ][0]
                    status = self.rest.fail_over(node.id, graceful=True)
                    if status is False:
                        self.fail("Graceful failover failed for %s" % node)
                    self.sleep(5, "Wait for failover to start")
                    reb_result = self.rest.monitorRebalance()
                    self.assertTrue(reb_result, "Graceful failover failed")
            elif self.current_fo_strategy == CbServer.Failover.Type.FORCEFUL:
                for node in self.nodes_to_fail:
                    node = [
                        t_node for t_node in rest_nodes if t_node.ip == node.ip
                    ][0]
                    status = self.rest.fail_over(node.id, graceful=False)
                    if status is False:
                        self.fail("Hard failover failed for %s" % node)
                    self.sleep(5, "Wait for failover to start")
                    reb_result = self.rest.monitorRebalance()
                    self.assertTrue(reb_result, "Hard failover failed")
        except Exception as e:
            self.log.error("Exception occurred: %s" % str(e))
        finally:
            # Disable auto-fo after the expected time limit
            self.rest.update_autofailover_settings(enabled=False,
                                                   timeout=self.timeout,
                                                   maxCount=self.max_count)

            if self.current_fo_strategy == CbServer.Failover.Type.AUTO:
                failover_task = ConcurrentFailoverTask(
                    task_manager=self.task_manager,
                    master=self.orchestrator,
                    servers_to_fail=self.nodes_to_fail,
                    task_type="revert_failure")
                self.task_manager.add_new_task(failover_task)
                self.task_manager.get_task_result(failover_task)
                if failover_task.result is False:
                    self.fail("Failure during failover operation")

            # Enable back prev auto_fo settings
            self.sleep(15, "Wait before enabling back auto-fo")
            self.rest.update_autofailover_settings(enabled=True,
                                                   timeout=self.timeout,
                                                   maxCount=self.max_count)

        # After failure - failed nodes' information
        self.__display_failure_node_status("Nodes status failure")

        self.bucket_util.print_bucket_stats(self.cluster)
        # Validate count at the end of failover procedure
        self.validate_failover_settings(True, self.timeout, self.fo_events,
                                        self.max_count)

    def test_concurrent_failover(self):
        """
        Common code to run failover tests
        """
        self.current_fo_strategy = None
        load_data_after_fo = self.input.param("post_failover_data_load", True)
        exception = None
        for index, services_to_fo in enumerate(self.failover_order):
            self.current_fo_strategy = self.failover_type[index]
            # servers_to_fail -> kv:index / kv:index_kv / index:n1ql
            services_to_fo = services_to_fo.split(":")
            # servers_to_fail -> [kv, index] / [kv, index_kv]
            self.nodes_to_fail = self.get_nodes_to_fail(services_to_fo)
            self.__update_unaffected_node()
            try:
                self.__run_test()
            except Exception as e:
                # Making sure to remove failed nodes before failing the test
                self.cluster_util.rebalance(self.cluster)
                self.fail("Exception occurred: %s" % str(e))

            # Perform collection crud + doc_ops before rebalance operation
            if load_data_after_fo:
                try:
                    self.__perform_doc_ops(durability="NONE",
                                           validate_num_items=False)
                except Exception as e:
                    exception = e
                    break

        self.sleep(20, "Wait for failed nodes to recover completely")
        if choice([True, False]):
            # Add back all nodes and rebalance
            self.log.info("Performing node add back operation")
            rest_nodes = self.rest.get_nodes(inactive=True)
            for node in rest_nodes:
                if node.clusterMembership == "inactiveFailed":
                    self.rest.add_back_node(node.id)
                    if CbServer.Services.KV in node.services:
                        self.rest.set_recovery_type(node.id, "delta")
            result = self.cluster_util.rebalance(self.cluster)
        else:
            # Eject nodes and rebalance
            self.log.info("Ejecting all failed nodes from the cluster")
            result = self.cluster_util.rebalance(self.cluster)

        if exception:
            self.fail(exception)
        self.assertTrue(result, "Final rebalance failed")

        # Validate count is reset back to 0 after rebalance operation
        self.validate_failover_settings(True, self.timeout, 0, self.max_count)

        # Perform collection crud + doc_ops
        if load_data_after_fo:
            durability_val = None
            for bucket in self.cluster.buckets:
                # If we have bucket_replica=3, force use level=NONE
                if bucket.replicaNumber == Bucket.ReplicaNum.THREE:
                    durability_val = Bucket.DurabilityLevel.NONE
                    break
                # If we have ephemeral bucket, force use level=MAJORITY
                if bucket.bucketType == Bucket.Type.EPHEMERAL:
                    durability_val = Bucket.DurabilityLevel.MAJORITY
            self.__perform_doc_ops(durability=durability_val)

    def test_split_brain(self):
        """
        Test params:
        split_nodes - Accepts string of pattern 'a_b:c-b_a:d'
                      This creates a barriers like,
                      Node running services a_b & c to ignore anything from
                      nodes running services b_a & d and vice versa
        """
        def get_nodes_based_on_services(services):
            nodes = list()
            services = services.split(":")
            for t_service in services:
                t_service = t_service.split("_")
                t_service.sort()
                for c_node in cluster_nodes:
                    if c_node.services == t_service:
                        nodes.append(self.__get_server_obj(c_node))
                        # Remove nodes from cluster_nodes once picked
                        # to avoid picking same node again
                        cluster_nodes.remove(c_node)
                        break
            return nodes

        def create_split_between_nodes(dest_nodes, src_nodes):
            for ssh_node in dest_nodes:
                shell_conn = RemoteMachineShellConnection(ssh_node)
                for src_node in src_nodes:
                    shell_conn.execute_command(
                        "iptables -A INPUT -s %s -j DROP" % src_node.ip)
                shell_conn.disconnect()

        def get_num_nodes_to_fo(num_nodes_affected,
                                service_count_affected_nodes,
                                service_count_unaffected_nodes):
            nodes_to_fo = num_nodes_affected
            for t_server, count in service_count_affected_nodes.items():
                if t_server not in service_count_unaffected_nodes \
                        or service_count_unaffected_nodes[t_server] < 1:
                    nodes_to_fo -= service_count_affected_nodes[t_server]
            return nodes_to_fo

        def recover_from_split(node_list):
            self.log.info("Flushing iptables rules from all nodes")
            for ssh_node in node_list:
                ssh_shell = RemoteMachineShellConnection(ssh_node)
                ssh_shell.execute_command("iptables -F")
                ssh_shell.disconnect()
            self.sleep(5, "Wait for nodes to be reachable")

        def post_failover_procedure():
            self.rest.monitorRebalance()
            self.validate_failover_settings(True, self.timeout,
                                            num_nodes_to_fo, self.max_count)
            recover_from_split(node_split_1 + node_split_2)
            self.log.info("Rebalance out failed nodes")
            rebalance_res = self.cluster_util.rebalance(self.cluster)
            self.assertTrue(rebalance_res, "Post failover rebalance failed")

            # Validate failover count reset post rebalance
            self.validate_failover_settings(True, self.timeout, 0,
                                            self.max_count)

        fo_happens = self.input.param("fo_happens", True)
        nodes_to_split = self.input.param("split_nodes", None).split('-')

        if nodes_to_split is None:
            self.fail("Nothing to test. split_nodes is None")

        # Validate count before the start of failover procedure
        self.validate_failover_settings(True, self.timeout, self.fo_events,
                                        self.max_count)

        self.log.info("Fetching current cluster_nodes")
        self.cluster_util.find_orchestrator(self.cluster)
        # cluster_nodes holds servers which are not yet selected for nw split
        cluster_nodes = self.rest.get_nodes()
        for node in cluster_nodes:
            node.services.sort()

        # Fetch actual nodes from given service list to create a split
        node_split_1 = get_nodes_based_on_services(nodes_to_split[0])
        node_split_2 = get_nodes_based_on_services(nodes_to_split[1])

        service_count = [dict(), dict()]
        for index, split_services in enumerate(nodes_to_split):
            for node_services in nodes_to_split[index].split(':'):
                for service in node_services.split("_"):
                    if service not in service_count[index]:
                        service_count[index][service] = 0
                    service_count[index][service] += 1

        if len(node_split_1) > len(node_split_2):
            num_nodes_to_fo = get_num_nodes_to_fo(len(node_split_2),
                                                  service_count[1],
                                                  service_count[0])
        else:
            num_nodes_to_fo = get_num_nodes_to_fo(len(node_split_1),
                                                  service_count[0],
                                                  service_count[1])

        self.log.info(
            "N/w split between -> [%s] || [%s]. Expect %s fo_events" %
            ([n.ip for n in node_split_1], [n.ip for n in node_split_2
                                            ], num_nodes_to_fo))
        try:
            create_split_between_nodes(node_split_1, node_split_2)
            create_split_between_nodes(node_split_2, node_split_1)

            self.sleep(self.timeout, "Wait for configured fo_timeout")
            self.sleep(15, "Extra sleep to avoid fail results")

            if fo_happens:
                self.log.info("Expecting failover to be triggered")
                post_failover_procedure()
            elif len([
                    t_serv for t_serv in self.services_init.split("-")
                    if CbServer.Services.KV in t_serv
            ]) > 2:
                self.log.info("Expecting no failover will be triggered")
                self.validate_failover_settings(True, self.timeout, 0,
                                                self.max_count)
                if (self.nodes_init % 2) == 1:
                    # Pick new master based on split network
                    new_master = node_split_1[0]
                    if len(node_split_2) > len(node_split_1):
                        new_master = node_split_2[0]

                    self.sleep(10, "FO expected wrt node %s" % new_master.ip)
                    self.rest = RestConnection(new_master)
                    self.cluster.master = new_master

                    post_failover_procedure()
        finally:
            recover_from_split(node_split_1 + node_split_2)
            self.sleep(5, "Wait for n/w split to heal")
            rest_nodes = self.rest.get_nodes(inactive=True)
            for t_node in rest_nodes:
                if t_node.clusterMembership == "active":
                    for node in self.cluster.servers:
                        if node.ip == t_node.ip:
                            self.cluster.master = node
                            break
                    break
            reb_result = self.cluster_util.rebalance(self.cluster)
            self.assertTrue(reb_result, "Final rebalance failed")

    def test_concurrent_failover_timer_reset(self):
        """
        1. Trigger failure on destined nodes
        2. Wait for little less time than failover_timeout
        3. Bring back few nodes back online for few seconds
        4. Make sure no auto failover triggered till next failover timeout
        5. Validate auto failovers after new timeout
        """

        services_to_fo = self.failover_order[0].split(":")
        self.nodes_to_fail = self.get_nodes_to_fail(services_to_fo,
                                                    dynamic_fo_method=True)
        expected_fo_nodes = self.num_nodes_to_be_failover
        self.__update_server_obj()
        rand_node = choice(self.nodes_to_fail.keys())
        self.__update_unaffected_node()
        self.__display_failure_node_status("Nodes to be failed")
        try:
            self.log.info("Starting auto-failover procedure")
            failover_task = ConcurrentFailoverTask(
                task_manager=self.task_manager,
                master=self.orchestrator,
                servers_to_fail=self.nodes_to_fail,
                expected_fo_nodes=expected_fo_nodes,
                task_type="induce_failure")
            self.task_manager.add_new_task(failover_task)
            self.sleep(int(self.timeout * 0.7),
                       "Wait before bringing back the failed nodes")

            self.log.info("Bringing back '%s' for some time" % rand_node.ip)
            new_timer = None
            shell = RemoteMachineShellConnection(rand_node)
            cb_err = CouchbaseError(self.log, shell)
            if self.nodes_to_fail[rand_node] == CouchbaseError.STOP_MEMCACHED:
                cb_err.revert(CouchbaseError.STOP_MEMCACHED)
                self.sleep(10, "Wait before creating failure again")
                cb_err.create(CouchbaseError.STOP_MEMCACHED)
                new_timer = time()
            elif self.nodes_to_fail[rand_node] == "stop_couchbase":
                cb_err.revert(CouchbaseError.STOP_SERVER)
                self.sleep(10, "Wait before creating failure again")
                cb_err.create(CouchbaseError.STOP_SERVER)
                new_timer = time()
            shell.disconnect()

            # Validate the previous auto-failover task failed
            # due to the random_node coming back online
            self.task_manager.get_task_result(failover_task)
            self.assertFalse(failover_task.result,
                             "Nodes failed over though nodes became active")

            # Validate auto_failover_settings
            self.validate_failover_settings(True, self.timeout, 0,
                                            self.max_count)

            # Make sure the new auto-failover timing is honoured
            new_timer = new_timer + self.timeout
            while int(time()) < new_timer:
                settings = self.rest.get_autofailover_settings()
                if settings.count != 0:
                    self.fail("Nodes failed over before new failover time")

            self.sleep(10, "Wait for failover rebalance to trigger")
            self.rest.monitorRebalance()

            # Validate auto_failover_settings after actual auto failover
            self.validate_failover_settings(True, self.timeout,
                                            expected_fo_nodes, self.max_count)
        finally:
            # Recover all nodes from induced failures
            failover_task = ConcurrentFailoverTask(
                task_manager=self.task_manager,
                master=self.orchestrator,
                servers_to_fail=self.nodes_to_fail,
                expected_fo_nodes=expected_fo_nodes,
                task_type="revert_failure")
            self.task_manager.add_new_task(failover_task)
            self.task_manager.get_task_result(failover_task)

        self.log.info("Rebalance out the failed nodes")
        result = self.cluster_util.rebalance(self.cluster)
        self.assertTrue(result, "Final rebalance failed")

        # Perform collection crud + doc_ops after rebalance operation
        self.__perform_doc_ops()

    def test_failover_during_rebalance(self):
        """
        1. Start rebalance operation on the active cluster
        2. Introduce failures on target nodes to trigger auto-failover
        3. Validate rebalance succeeds after auto-fo trigger
        """
        def get_reb_out_nodes():
            nodes = list()
            nodes_with_services = dict()
            cluster_nodes = self.rest.get_nodes()
            for node in cluster_nodes:
                node.services.sort()
                d_key = '_'.join(node.services)
                if d_key not in nodes_with_services:
                    nodes_with_services[d_key] = list()
                nodes_with_services[d_key].append(node)

            for services in out_nodes:
                services = services.split("_")
                services.sort()
                services = "_".join(services)
                rand_node = choice(nodes_with_services[services])
                nodes_with_services[services].remove(rand_node)
                nodes.append(rand_node)
            return nodes

        self.nodes_in = self.input.param("nodes_in", 0)

        add_nodes = list()
        remove_nodes = list()
        # Format - kv:kv_index -> 2 nodes with services [kv, kv:index]
        out_nodes = self.input.param("out_nodes", "kv").split(":")
        # Can take any of (in/out/swap)
        rebalance_type = self.input.param("rebalance_type", "in")
        services_to_fo = self.failover_order[0].split(":")
        self.nodes_to_fail = self.get_nodes_to_fail(services_to_fo,
                                                    dynamic_fo_method=True)
        loader_task = None
        reader_task = None

        if rebalance_type == "in":
            add_nodes = self.cluster.servers[self.nodes_init:self.nodes_init +
                                             self.nodes_in]
            self.cluster.kv_nodes.extend(add_nodes)
        elif rebalance_type == "out":
            remove_nodes = get_reb_out_nodes()
        elif rebalance_type == "swap":
            remove_nodes = get_reb_out_nodes()
            add_nodes = self.cluster.servers[self.nodes_init:self.nodes_init +
                                             self.nodes_in]
            self.cluster.kv_nodes.extend(add_nodes)

        expected_fo_nodes = self.num_nodes_to_be_failover
        self.__update_server_obj()

        # Start doc_ops in background
        if self.load_during_fo:
            doc_gen = doc_generator("fo_docs", 0, 200000)
            loader_task = self.task.async_continuous_doc_ops(
                self.cluster,
                self.cluster.buckets[0],
                doc_gen,
                DocLoading.Bucket.DocOps.UPDATE,
                exp=5,
                process_concurrency=1)
            reader_task = self.task.async_continuous_doc_ops(
                self.cluster,
                self.cluster.buckets[0],
                doc_gen,
                DocLoading.Bucket.DocOps.READ,
                process_concurrency=1)

        self.__update_unaffected_node()
        self.__display_failure_node_status("Nodes to be failed")

        # Create Auto-failover task but won't start it
        failover_task = ConcurrentFailoverTask(
            task_manager=self.task_manager,
            master=self.orchestrator,
            servers_to_fail=self.nodes_to_fail,
            expected_fo_nodes=expected_fo_nodes,
            task_type="induce_failure")

        # Start rebalance operation
        self.log.info("Starting rebalance operation")
        rebalance_task = self.task.async_rebalance(self.cluster,
                                                   to_add=add_nodes,
                                                   to_remove=remove_nodes)

        self.sleep(max(10, 4 * self.nodes_in),
                   "Wait for rebalance to start before failover")
        self.task_manager.add_new_task(failover_task)

        try:
            self.log.info("Wait for failover task to complete")
            self.task_manager.get_task_result(failover_task)

            failure_msg = "Auto-failover task failed"
            if expected_fo_nodes == 0:
                # Task is expected to fail since no failover is triggered
                self.assertFalse(failover_task.result, failure_msg)
            else:
                self.assertTrue(failover_task.result, failure_msg)
        finally:
            # Disable auto-fo after the expected time limit
            self.rest.update_autofailover_settings(enabled=False,
                                                   timeout=self.timeout,
                                                   maxCount=self.max_count)

            # Recover all nodes from induced failures
            recovery_task = ConcurrentFailoverTask(
                task_manager=self.task_manager,
                master=self.orchestrator,
                servers_to_fail=self.nodes_to_fail,
                expected_fo_nodes=expected_fo_nodes,
                task_type="revert_failure")
            self.task_manager.add_new_task(recovery_task)
            self.task_manager.get_task_result(recovery_task)
            self.task_manager.stop_task(rebalance_task)

            # Enable back prev auto_fo settings
            self.sleep(5, "Wait before enabling back auto-fo")
            self.rest.update_autofailover_settings(enabled=True,
                                                   timeout=self.timeout,
                                                   maxCount=self.max_count)

        # Validate auto_failover_settings after failover
        self.validate_failover_settings(True, self.timeout, expected_fo_nodes,
                                        self.max_count)

        # Stop background doc_ops
        if self.load_during_fo:
            for task in [loader_task, reader_task]:
                task.end_task()
                self.task_manager.get_task_result(task)

        # Perform collection crud + doc_ops before rebalance operation
        self.__perform_doc_ops(durability="NONE", validate_num_items=False)

        # Rebalance the cluster to remove failed nodes
        result = self.cluster_util.rebalance(self.cluster)
        self.assertTrue(result, "Rebalance failed")

        # Validate auto_failover_settings after rebalance operation
        self.validate_failover_settings(True, self.timeout, 0, self.max_count)

        # Perform collection crud + doc_ops after rebalance operation
        self.__perform_doc_ops()
Exemplo n.º 17
0
class OPD:
    def __init__(self):
        pass

    def threads_calculation(self):
        self.process_concurrency = self.input.param("pc",
                                                    self.process_concurrency)
        self.doc_loading_tm = TaskManager(self.process_concurrency)

    def get_memory_footprint(self):
        out = subprocess.Popen(
            ['ps', 'v', '-p', str(os.getpid())],
            stdout=subprocess.PIPE).communicate()[0].split(b'\n')
        vsz_index = out[0].split().index(b'RSS')
        mem = float(out[1].split()[vsz_index]) / 1024
        self.PrintStep("RAM FootPrint: %s" % str(mem))
        return mem

    def create_required_buckets(self, cluster):
        if self.cluster.cloud_cluster:
            return
        self.log.info("Get the available memory quota")
        rest = RestConnection(cluster.master)
        self.info = rest.get_nodes_self()

        # threshold_memory_vagrant = 100
        kv_memory = self.info.memoryQuota - 100

        # Creating buckets for data loading purpose
        self.log.info("Create CB buckets")
        self.bucket_expiry = self.input.param("bucket_expiry", 0)
        ramQuota = self.input.param("ramQuota", kv_memory)
        buckets = ["GleamBookUsers"] * self.num_buckets
        bucket_type = self.bucket_type.split(';') * self.num_buckets
        compression_mode = self.compression_mode.split(';') * self.num_buckets
        self.bucket_eviction_policy = self.bucket_eviction_policy
        for i in range(self.num_buckets):
            bucket = Bucket({
                Bucket.name: buckets[i] + str(i),
                Bucket.ramQuotaMB: ramQuota / self.num_buckets,
                Bucket.maxTTL: self.bucket_expiry,
                Bucket.replicaNumber: self.num_replicas,
                Bucket.storageBackend: self.bucket_storage,
                Bucket.evictionPolicy: self.bucket_eviction_policy,
                Bucket.bucketType: bucket_type[i],
                Bucket.flushEnabled: Bucket.FlushBucket.ENABLED,
                Bucket.compressionMode: compression_mode[i],
                Bucket.fragmentationPercentage: self.fragmentation
            })
            self.bucket_util.create_bucket(cluster, bucket)

        # rebalance the new buckets across all nodes.
        self.log.info("Rebalance Starts")
        self.nodes = rest.node_statuses()
        rest.rebalance(otpNodes=[node.id for node in self.nodes],
                       ejectedNodes=[])
        rest.monitorRebalance()

    def create_required_collections(self, cluster, num_scopes,
                                    num_collections):
        self.scope_name = self.input.param("scope_name", "_default")
        if self.scope_name != "_default":
            self.bucket_util.create_scope(cluster, self.bucket,
                                          {"name": self.scope_name})
        if num_scopes > 1:
            self.scope_prefix = self.input.param("scope_prefix", "VolumeScope")
            for bucket in cluster.buckets:
                for i in range(num_scopes):
                    scope_name = self.scope_prefix + str(i)
                    self.log.info("Creating scope: %s" % (scope_name))
                    self.bucket_util.create_scope(cluster.master, bucket,
                                                  {"name": scope_name})
                    self.sleep(0.5)
            self.num_scopes += 1
        for bucket in cluster.buckets:
            for scope in bucket.scopes.keys():
                if num_collections > 0:
                    self.collection_prefix = self.input.param(
                        "collection_prefix", "VolumeCollection")

                    for i in range(num_collections):
                        collection_name = self.collection_prefix + str(i)
                        self.bucket_util.create_collection(
                            cluster.master, bucket, scope,
                            {"name": collection_name})
                        self.sleep(0.5)

        self.collections = cluster.buckets[0].scopes[
            self.scope_name].collections.keys()
        self.log.debug("Collections list == {}".format(self.collections))

    def stop_purger(self, tombstone_purge_age=60):
        """
        1. Disable ts purger
        2. Create fts indexes (to create metakv, ns_config entries)
        3. Delete fts indexes
        4. Grep ns_config for '_deleted' to get total deleted keys count
        5. enable ts purger and age = 1 mins
        6. Sleep for 2 minutes
        7. Grep for debug.log and check for latest tombstones purged count
        8. Validate step4 count matches step 7 count for all nodes
        """
        self.rest.update_tombstone_purge_age_for_removal(tombstone_purge_age)
        self.rest.disable_tombstone_purger()

    def get_bucket_dgm(self, bucket):
        self.rest_client = BucketHelper(self.cluster.master)
        dgm = self.rest_client.fetch_bucket_stats(
            bucket.name)["op"]["samples"]["vb_active_resident_items_ratio"][-1]
        self.log.info("Active Resident Threshold of {0} is {1}".format(
            bucket.name, dgm))
        return dgm

    def _induce_error(self, error_condition, nodes=[]):
        nodes = nodes or [self.cluster.master]
        for node in nodes:
            if error_condition == "stop_server":
                self.cluster_util.stop_server(node)
            elif error_condition == "enable_firewall":
                self.cluster_util.start_firewall_on_node(node)
            elif error_condition == "kill_memcached":
                shell = RemoteMachineShellConnection(node)
                shell.kill_memcached()
                shell.disconnect()
            elif error_condition == "reboot_server":
                shell = RemoteMachineShellConnection(node)
                shell.reboot_node()
            elif error_condition == "kill_erlang":
                shell = RemoteMachineShellConnection(node)
                shell.kill_erlang()
                shell.disconnect()
            else:
                self.fail("Invalid error induce option")

    def _recover_from_error(self, error_condition):
        for node in self.cluster.nodes_in_cluster:
            if error_condition == "stop_server" or error_condition == "kill_erlang":
                self.cluster_util.start_server(node)
            elif error_condition == "enable_firewall":
                self.cluster_util.stop_firewall_on_node(node)

        for node in self.cluster.kv_nodes + [self.cluster.master]:
            self.check_warmup_complete(node)
            result = self.cluster_util.wait_for_ns_servers_or_assert(
                [node], wait_time=1200)
            self.assertTrue(result, "Server warmup failed")

    def rebalance(self,
                  nodes_in=0,
                  nodes_out=0,
                  services=[],
                  retry_get_process_num=3000):
        self.servs_in = list()
        self.nodes_cluster = self.cluster.nodes_in_cluster[:]
        self.nodes_cluster.remove(self.cluster.master)
        self.servs_out = list()
        services = services or ["kv"]
        print "KV nodes in cluster: %s" % [
            server.ip for server in self.cluster.kv_nodes
        ]
        print "CBAS nodes in cluster: %s" % [
            server.ip for server in self.cluster.cbas_nodes
        ]
        print "INDEX nodes in cluster: %s" % [
            server.ip for server in self.cluster.index_nodes
        ]
        print "FTS nodes in cluster: %s" % [
            server.ip for server in self.cluster.fts_nodes
        ]
        print "QUERY nodes in cluster: %s" % [
            server.ip for server in self.cluster.query_nodes
        ]
        print "EVENTING nodes in cluster: %s" % [
            server.ip for server in self.cluster.eventing_nodes
        ]
        print "AVAILABLE nodes for cluster: %s" % [
            server.ip for server in self.available_servers
        ]
        if nodes_out:
            if "cbas" in services:
                servers = random.sample(self.cluster.cbas_nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.cbas_nodes.remove(server)
            if "index" in services:
                servers = random.sample(self.cluster.index_nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.index_nodes.remove(server)
            if "fts" in services:
                servers = random.sample(self.cluster.fts_nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.fts_nodes.remove(server)
            if "query" in services:
                servers = random.sample(self.cluster.query_nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.query_nodes.remove(server)
            if "eventing" in services:
                servers = random.sample(self.cluster.eventing_nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.eventing_nodes.remove(server)
            if "kv" in services:
                nodes = [
                    node for node in self.cluster.kv_nodes
                    if node.ip != self.cluster.master.ip
                ]
                servers = random.sample(nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.kv_nodes.remove(server)

        if nodes_in:
            if "cbas" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.cbas_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers
                ]
            if "index" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.index_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers
                ]
            if "fts" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.fts_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers
                ]
            if "query" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.query_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers
                ]
            if "eventing" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.eventing_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers
                ]
            if "kv" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.kv_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers
                ]

        print "Servers coming in : %s with services: %s" % (
            [server.ip for server in self.servs_in], services)
        print "Servers going out : %s" % (
            [server.ip for server in self.servs_out])
        self.available_servers.extend(self.servs_out)
        print "NEW AVAILABLE nodes for cluster: %s" % (
            [server.ip for server in self.available_servers])
        if nodes_in == nodes_out:
            self.vbucket_check = False

        rebalance_task = self.task.async_rebalance(
            self.cluster,
            self.servs_in,
            self.servs_out,
            services=services,
            check_vbucket_shuffling=self.vbucket_check,
            retry_get_process_num=retry_get_process_num)

        return rebalance_task

    def generate_docs(self,
                      doc_ops=None,
                      create_end=None,
                      create_start=None,
                      update_end=None,
                      update_start=None,
                      delete_end=None,
                      delete_start=None,
                      expire_end=None,
                      expire_start=None,
                      read_end=None,
                      read_start=None):
        self.get_memory_footprint()
        self.create_end = 0
        self.create_start = 0
        self.read_end = 0
        self.read_start = 0
        self.update_end = 0
        self.update_start = 0
        self.delete_end = 0
        self.delete_start = 0
        self.expire_end = 0
        self.expire_start = 0
        self.initial_items = self.final_items

        doc_ops = doc_ops or self.doc_ops
        self.mutations_to_validate = doc_ops

        if "read" in doc_ops:
            if read_start is not None:
                self.read_start = read_start
            else:
                self.read_start = 0
            if read_end is not None:
                self.read_end = read_end
            else:
                self.read_end = self.num_items * self.mutation_perc / 100

        if "update" in doc_ops:
            if update_start is not None:
                self.update_start = update_start
            else:
                self.update_start = 0
            if update_end is not None:
                self.update_end = update_end
            else:
                self.update_end = self.num_items * self.mutation_perc / 100
            self.mutate += 1

        if "delete" in doc_ops:
            if delete_start is not None:
                self.delete_start = delete_start
            else:
                self.delete_start = self.start
            if delete_end is not None:
                self.delete_end = delete_end
            else:
                self.delete_end = self.start + self.num_items * self.mutation_perc / 100
            self.final_items -= (self.delete_end - self.delete_start
                                 ) * self.num_collections * self.num_scopes

        if "expiry" in doc_ops:
            if self.maxttl == 0:
                self.maxttl = self.input.param("maxttl", 10)
            if expire_start is not None:
                self.expire_start = expire_start
            else:
                self.expire_start = self.delete_end
            if expire_end is not None:
                self.expire_end = expire_end
            else:
                self.expire_end = self.expire_start + self.num_items * self.mutation_perc / 100
            self.final_items -= (self.expire_end - self.expire_start
                                 ) * self.num_collections * self.num_scopes

        if "create" in doc_ops:
            if create_start is not None:
                self.create_start = create_start
            else:
                self.create_start = self.end
            self.start = self.create_start

            if create_end is not None:
                self.create_end = create_end
            else:
                self.create_end = self.end + (
                    self.expire_end - self.expire_start) + (self.delete_end -
                                                            self.delete_start)
            self.end = self.create_end

            self.final_items += (abs(self.create_end - self.create_start)
                                 ) * self.num_collections * self.num_scopes

        print "Read Start: %s" % self.read_start
        print "Read End: %s" % self.read_end
        print "Update Start: %s" % self.update_start
        print "Update End: %s" % self.update_end
        print "Expiry Start: %s" % self.expire_start
        print "Expiry End: %s" % self.expire_end
        print "Delete Start: %s" % self.delete_start
        print "Delete End: %s" % self.delete_end
        print "Create Start: %s" % self.create_start
        print "Create End: %s" % self.create_end
        print "Final Start: %s" % self.start
        print "Final End: %s" % self.end

    def _loader_dict(self, cmd={}):
        self.loader_map = dict()
        for bucket in self.cluster.buckets:
            for scope in bucket.scopes.keys():
                for collection in bucket.scopes[scope].collections.keys():
                    if collection == "_default" and scope == "_default":
                        continue
                    ws = WorkLoadSettings(
                        cmd.get("keyPrefix", self.key),
                        cmd.get("keySize", self.key_size),
                        cmd.get("docSize", self.doc_size),
                        cmd.get("cr", self.create_perc),
                        cmd.get("rd", self.read_perc),
                        cmd.get("up", self.update_perc),
                        cmd.get("dl", self.delete_perc),
                        cmd.get("ex", self.expiry_perc),
                        cmd.get("workers", self.process_concurrency),
                        cmd.get("ops", self.ops_rate),
                        cmd.get("loadType", None), cmd.get("keyType", None),
                        cmd.get("valueType", None), cmd.get("validate", False),
                        cmd.get("gtm", False), cmd.get("deleted", False),
                        cmd.get("mutated", 0))
                    hm = HashMap()
                    hm.putAll({
                        DRConstants.create_s: self.create_start,
                        DRConstants.create_e: self.create_end,
                        DRConstants.update_s: self.update_start,
                        DRConstants.update_e: self.update_end,
                        DRConstants.expiry_s: self.expire_start,
                        DRConstants.expiry_e: self.expire_end,
                        DRConstants.delete_s: self.delete_start,
                        DRConstants.delete_e: self.delete_end,
                        DRConstants.read_s: self.read_start,
                        DRConstants.read_e: self.read_end
                    })
                    dr = DocRange(hm)
                    ws.dr = dr
                    dg = DocumentGenerator(ws, self.key_type, self.val_type)
                    self.loader_map.update(
                        {bucket.name + scope + collection: dg})

    def wait_for_doc_load_completion(self, tasks, wait_for_stats=True):
        self.doc_loading_tm.getAllTaskResult()
        self.get_memory_footprint()
        for task in tasks:
            task.result = True
            unique_str = "{}:{}:{}:".format(task.sdk.bucket, task.sdk.scope,
                                            task.sdk.collection)
            for optype, failures in task.failedMutations.items():
                for failure in failures:
                    if failure is not None:
                        print("Test Retrying {}: {}{} -> {}".format(
                            optype, unique_str, failure.id(),
                            failure.err().getClass().getSimpleName()))
                        try:
                            if optype == "create":
                                task.docops.insert(failure.id(),
                                                   failure.document(),
                                                   task.sdk.connection,
                                                   task.setOptions)
                            if optype == "update":
                                task.docops.upsert(failure.id(),
                                                   failure.document(),
                                                   task.sdk.connection,
                                                   task.upsertOptions)
                            if optype == "delete":
                                task.docops.delete(failure.id(),
                                                   task.sdk.connection,
                                                   task.removeOptions)
                        except (ServerOutOfMemoryException,
                                TimeoutException) as e:
                            print("Retry {} failed for key: {} - {}".format(
                                optype, failure.id(), e))
                            task.result = False
                        except (DocumentNotFoundException,
                                DocumentExistsException) as e:
                            pass
            try:
                task.sdk.disconnectCluster()
            except Exception as e:
                print(e)
            self.assertTrue(task.result,
                            "Task Failed: {}".format(task.taskName))
        if wait_for_stats:
            try:
                self.bucket_util._wait_for_stats_all_buckets(
                    self.cluster, self.cluster.buckets, timeout=14400)
                if self.track_failures:
                    self.bucket_util.verify_stats_all_buckets(self.cluster,
                                                              self.final_items,
                                                              timeout=14400)
            except Exception as e:
                if not self.cluster.cloud_cluster:
                    self.get_gdb()
                raise e

    def get_gdb(self):
        for node in self.cluster.nodes_in_cluster:
            gdb_shell = RemoteMachineShellConnection(node)
            gdb_out = gdb_shell.execute_command(
                'gdb -p `(pidof memcached)` -ex "thread apply all bt" -ex detach -ex quit'
            )[0]
            print node.ip
            print gdb_out
            gdb_shell.disconnect()

    def data_validation(self):
        self.get_memory_footprint()
        doc_ops = self.mutations_to_validate
        pc = min(self.process_concurrency, 20)
        if self._data_validation:
            self.log.info("Validating Active/Replica Docs")
            cmd = dict()
            self.ops_rate = self.input.param("ops_rate", 2000)
            master = Server(self.cluster.master.ip, self.cluster.master.port,
                            self.cluster.master.rest_username,
                            self.cluster.master.rest_password,
                            str(self.cluster.master.memcached_port))
            self.loader_map = dict()
            for bucket in self.cluster.buckets:
                for scope in bucket.scopes.keys():
                    for collection in bucket.scopes[scope].collections.keys():
                        if collection == "_default" and scope == "_default":
                            continue
                        for op_type in doc_ops:
                            cmd.update({"deleted": False})
                            hm = HashMap()
                            if op_type == "create":
                                hm.putAll({
                                    DRConstants.read_s: self.create_start,
                                    DRConstants.read_e: self.create_end
                                })
                            elif op_type == "update":
                                hm.putAll({
                                    DRConstants.read_s: self.update_start,
                                    DRConstants.read_e: self.update_end
                                })
                            elif op_type == "delete":
                                hm.putAll({
                                    DRConstants.read_s: self.delete_start,
                                    DRConstants.read_e: self.delete_end
                                })
                                cmd.update({"deleted": True})
                            else:
                                continue
                            dr = DocRange(hm)
                            ws = WorkLoadSettings(
                                cmd.get("keyPrefix", self.key),
                                cmd.get("keySize", self.key_size),
                                cmd.get("docSize", self.doc_size),
                                cmd.get("cr", 0), cmd.get("rd", 100),
                                cmd.get("up", 0), cmd.get("dl", 0),
                                cmd.get("ex", 0), cmd.get("workers", pc),
                                cmd.get("ops", self.ops_rate),
                                cmd.get("loadType", None),
                                cmd.get("keyType", None),
                                cmd.get("valueType", None),
                                cmd.get("validate",
                                        True), cmd.get("gtm", False),
                                cmd.get("deleted", False),
                                cmd.get("mutated", 0))
                            ws.dr = dr
                            dg = DocumentGenerator(ws, self.key_type,
                                                   self.val_type)
                            self.loader_map.update({
                                bucket.name + scope + collection + op_type:
                                dg
                            })

            tasks = list()
            i = pc
            while i > 0:
                for bucket in self.cluster.buckets:
                    for scope in bucket.scopes.keys():
                        for collection in bucket.scopes[
                                scope].collections.keys():
                            if collection == "_default" and scope == "_default":
                                continue
                            for op_type in doc_ops:
                                if op_type not in [
                                        "create", "update", "delete"
                                ]:
                                    continue
                                client = NewSDKClient(master, bucket.name,
                                                      scope, collection)
                                client.initialiseSDK()
                                self.sleep(1)
                                taskName = "Validate_%s_%s_%s_%s_%s_%s" % (
                                    bucket.name, scope, collection, op_type,
                                    str(i), time.time())
                                task = WorkLoadGenerate(
                                    taskName,
                                    self.loader_map[bucket.name + scope +
                                                    collection + op_type],
                                    client, "NONE", self.maxttl,
                                    self.time_unit, self.track_failures, 0)
                                tasks.append(task)
                                self.doc_loading_tm.submit(task)
                                i -= 1
        self.doc_loading_tm.getAllTaskResult()
        for task in tasks:
            try:
                task.sdk.disconnectCluster()
            except Exception as e:
                print(e)
        for task in tasks:
            self.assertTrue(task.result,
                            "Validation Failed for: %s" % task.taskName)
        self.get_memory_footprint()

    def print_crud_stats(self):
        self.table = TableView(self.log.info)
        self.table.set_headers([
            "Initial Items", "Current Items", "Items Updated", "Items Created",
            "Items Deleted", "Items Expired"
        ])
        self.table.add_row([
            str(self.initial_items),
            str(self.final_items),
            str(abs(self.update_start)) + "-" + str(abs(self.update_end)),
            str(abs(self.create_start)) + "-" + str(abs(self.create_end)),
            str(abs(self.delete_start)) + "-" + str(abs(self.delete_end)),
            str(abs(self.expire_start)) + "-" + str(abs(self.expire_end))
        ])
        self.table.display("Docs statistics")

    def perform_load(self,
                     crash=False,
                     num_kills=1,
                     wait_for_load=True,
                     validate_data=True):
        self.get_memory_footprint()
        self._loader_dict()
        master = Server(self.cluster.master.ip, self.cluster.master.port,
                        self.cluster.master.rest_username,
                        self.cluster.master.rest_password,
                        str(self.cluster.master.memcached_port))
        tasks = list()
        i = self.process_concurrency
        while i > 0:
            for bucket in self.cluster.buckets:
                for scope in bucket.scopes.keys():
                    for collection in bucket.scopes[scope].collections.keys():
                        if collection == "_default" and scope == "_default":
                            continue
                        client = NewSDKClient(master, bucket.name, scope,
                                              collection)
                        client.initialiseSDK()
                        self.sleep(1)
                        self.get_memory_footprint()
                        taskName = "Loader_%s_%s_%s_%s_%s" % (
                            bucket.name, scope, collection, str(i),
                            time.time())
                        task = WorkLoadGenerate(
                            taskName,
                            self.loader_map[bucket.name + scope + collection],
                            client, self.durability_level, self.maxttl,
                            self.time_unit, self.track_failures, 0)
                        tasks.append(task)
                        self.doc_loading_tm.submit(task)
                        i -= 1

        if wait_for_load:
            self.wait_for_doc_load_completion(tasks)
            self.get_memory_footprint()
        else:
            return tasks

        if crash:
            self.kill_memcached(num_kills=num_kills)

        if validate_data:
            self.data_validation()

        self.print_stats()

        if self.cluster.cloud_cluster:
            return

        result = self.check_coredump_exist(self.cluster.nodes_in_cluster)
        if result:
            self.PrintStep("CRASH | CRITICAL | WARN messages found in cb_logs")
            if self.assert_crashes_on_load:
                self.task_manager.abort_all_tasks()
                self.doc_loading_tm.abortAllTasks()
                self.assertFalse(result)

    def get_magma_disk_usage(self, bucket=None):
        if bucket is None:
            bucket = self.bucket
        servers = self.cluster.nodes_in_cluster
        kvstore = 0
        wal = 0
        keyTree = 0
        seqTree = 0
        data_files = 0

        for server in servers:
            shell = RemoteMachineShellConnection(server)
            bucket_path = os.path.join(
                RestConnection(server).get_data_path(), bucket.name)
            kvstore += int(
                shell.execute_command("du -cm %s | tail -1 | awk '{print $1}'\
            " % os.path.join(bucket_path, "magma.*/kv*"))[0][0].split('\n')[0])
            wal += int(
                shell.execute_command("du -cm %s | tail -1 | awk '{print $1}'\
            " % os.path.join(bucket_path, "magma.*/wal"))[0][0].split('\n')[0])
            keyTree += int(
                shell.execute_command("du -cm %s | tail -1 | awk '{print $1}'\
            " % os.path.join(bucket_path,
                             "magma.*/kv*/rev*/key*"))[0][0].split('\n')[0])
            seqTree += int(
                shell.execute_command("du -cm %s | tail -1 | awk '{print $1}'\
            " % os.path.join(bucket_path,
                             "magma.*/kv*/rev*/seq*"))[0][0].split('\n')[0])

            cmd = 'find ' + bucket_path + '/magma*/ -maxdepth 1 -type d \
            -print0 | while read -d "" -r dir; do files=("$dir"/*/*/*); \
            printf "%d,%s\n" "${#files[@]}" "$dir"; done'

            data_files = shell.execute_command(cmd)[0]
            for files in data_files:
                if "kvstore" in files and int(files.split(",")[0]) >= 300:
                    self.log.warn("Number of files in {}--{} is {}".format(
                        server.ip,
                        files.split(",")[1].rstrip(),
                        files.split(",")[0]))
            shell.disconnect()
        self.log.debug("Total Disk usage for kvstore is {}MB".format(kvstore))
        self.log.debug("Total Disk usage for wal is {}MB".format(wal))
        self.log.debug("Total Disk usage for keyTree is {}MB".format(keyTree))
        self.log.debug("Total Disk usage for seqTree is {}MB".format(seqTree))
        return kvstore, wal, keyTree, seqTree

    def print_stats(self):
        self.bucket_util.print_bucket_stats(self.cluster)
        self.cluster_util.print_cluster_stats(self.cluster)
        self.print_crud_stats()
        for bucket in self.cluster.buckets:
            self.get_bucket_dgm(bucket)
            if bucket.storageBackend == Bucket.StorageBackend.magma and not self.cluster.cloud_cluster:
                self.get_magma_disk_usage(bucket)
                self.check_fragmentation_using_magma_stats(bucket)
                self.check_fragmentation_using_kv_stats(bucket)

    def PrintStep(self, msg=None):
        print "\n"
        print "\t", "#" * 60
        print "\t", "#"
        print "\t", "#  %s" % msg
        print "\t", "#"
        print "\t", "#" * 60
        print "\n"

    def check_fragmentation_using_kv_stats(self, bucket, servers=None):
        result = dict()
        if servers is None:
            servers = self.cluster.kv_nodes + [self.cluster.master]
        if type(servers) is not list:
            servers = [servers]
        for server in servers:
            frag_val = self.bucket_util.get_fragmentation_kv(
                self.cluster, bucket, server)
            self.log.debug("Current Fragmentation for node {} is {} \
            ".format(server.ip, frag_val))
            result.update({server.ip: frag_val})
        self.log.info("KV stats fragmentation values {}".format(result))

    def dump_magma_stats(self, server, bucket, shard, kvstore):
        if bucket.storageBackend != Bucket.StorageBackend.magma or self.cluster.cloud_cluster:
            return
        shell = RemoteMachineShellConnection(server)
        data_path = RestConnection(server).get_data_path()
        while not self.stop_stats:
            for bucket in self.cluster.buckets:
                self.log.info(
                    self.get_magma_stats(bucket, server, "rw_0:magma"))
                self.dump_seq_index(shell, data_path, bucket.name, shard,
                                    kvstore)
            self.sleep(600)
        shell.disconnect()

    def dump_seq_index(self, shell, data_path, bucket, shard, kvstore):
        magma_path = os.path.join(data_path, bucket, "magma.{}")
        magma = magma_path.format(shard)
        cmd = '/opt/couchbase/bin/magma_dump {}'.format(magma)
        cmd += ' --kvstore {} --tree seq'.format(kvstore)
        result = shell.execute_command(cmd)[0]
        self.log.info("Seq Tree for {}:{}:{}:{}: \n{}".format(
            shell.ip, bucket, shard, kvstore, result))

    def check_fragmentation_using_magma_stats(self, bucket, servers=None):
        result = dict()
        stats = list()
        if servers is None:
            servers = self.cluster.kv_nodes + [self.cluster.master]
        if type(servers) is not list:
            servers = [servers]
        for server in servers:
            fragmentation_values = list()
            shell = RemoteMachineShellConnection(server)
            output = shell.execute_command(
                "lscpu | grep 'CPU(s)' | head -1 | awk '{print $2}'"
            )[0][0].split('\n')[0]
            shell.disconnect()
            self.log.debug("machine: {} - core(s): {}".format(
                server.ip, output))
            for i in range(min(int(output), 64)):
                grep_field = "rw_{}:magma".format(i)
                _res = self.get_magma_stats(bucket, server)
                fragmentation_values.append(
                    json.loads(_res[server.ip][grep_field])["Fragmentation"])
                stats.append(_res)
            result.update({server.ip: fragmentation_values})
        self.log.info(stats[0])
        res = list()
        for value in result.values():
            res.append(max(value))
        if max(res) < float(self.fragmentation) / 100:
            self.log.info("magma stats fragmentation result {} \
            ".format(result))
            return True
        self.log.info("magma stats fragmentation result {} \
        ".format(result))
        return False

    def get_magma_stats(self, bucket, server=None):
        magma_stats_for_all_servers = dict()
        cbstat_obj = Cbstats(server)
        result = cbstat_obj.magma_stats(bucket.name)
        magma_stats_for_all_servers[server.ip] = result
        return magma_stats_for_all_servers

    def pause_rebalance(self):
        rest = RestConnection(self.cluster.master)
        i = 1
        self.sleep(10, "Let the rebalance begin!")
        expected_progress = 20
        while expected_progress < 100:
            expected_progress = 20 * i
            reached = self.cluster_util.rebalance_reached(
                rest, expected_progress)
            self.assertTrue(
                reached, "Rebalance failed or did not reach {0}%".format(
                    expected_progress))
            if not self.cluster_util.is_cluster_rebalanced(rest):
                self.log.info("Stop the rebalance")
                stopped = rest.stop_rebalance(wait_timeout=self.wait_timeout /
                                              3)
                self.assertTrue(stopped, msg="Unable to stop rebalance")
                rebalance_task = self.task.async_rebalance(
                    self.cluster, [], [], retry_get_process_num=3000)
                self.sleep(
                    10, "Rebalance % ={}. Let the rebalance begin!".format(
                        expected_progress))
            i += 1
        return rebalance_task

    def abort_rebalance(self, rebalance, error_type="kill_memcached"):
        self.sleep(30, "Let the rebalance begin!")
        rest = RestConnection(self.cluster.master)
        i = 1
        expected_progress = 20
        rebalance_task = rebalance
        while expected_progress < 80:
            expected_progress = 20 * i
            reached = self.cluster_util.rebalance_reached(rest,
                                                          expected_progress,
                                                          wait_step=10,
                                                          num_retry=3600)
            self.assertTrue(
                reached, "Rebalance failed or did not reach {0}%".format(
                    expected_progress))

            if not self.cluster_util.is_cluster_rebalanced(rest):
                self.log.info("Abort rebalance")
                self._induce_error(error_type, self.cluster.nodes_in_cluster)
                result = self.check_coredump_exist(
                    self.cluster.nodes_in_cluster)
                if result:
                    self.task_manager.abort_all_tasks()
                    self.doc_loading_tm.abortAllTasks()
                    self.assertFalse(
                        result,
                        "CRASH | CRITICAL | WARN messages found in cb_logs")
                self.sleep(60, "Sleep after error introduction")
                self._recover_from_error(error_type)
                result = self.check_coredump_exist(
                    self.cluster.nodes_in_cluster)
                if result:
                    self.task_manager.abort_all_tasks()
                    self.doc_loading_tm.abortAllTasks()
                    self.assertFalse(
                        result,
                        "CRASH | CRITICAL | WARN messages found in cb_logs")
                try:
                    self.task_manager.get_task_result(rebalance_task)
                except RebalanceFailedException:
                    pass
                if rebalance.result:
                    self.log.error(
                        "Rebalance passed/finished which is not expected")
                    self.log.info(
                        "Rebalance % after rebalance finished = {}".format(
                            expected_progress))
                    return None
                else:
                    self.log.info(
                        "Restarting Rebalance after killing at {}".format(
                            expected_progress))
                    rebalance_task = self.task.async_rebalance(
                        self.cluster, [],
                        self.servs_out,
                        retry_get_process_num=3000)
                    self.sleep(120, "Let the rebalance begin after abort")
                    self.log.info("Rebalance % = {}".format(
                        self.rest._rebalance_progress()))
            i += 1
        return rebalance_task

    def crash_memcached(self, nodes=None, num_kills=1, graceful=False):
        self.stop_crash = False
        self.crash_count = 0
        if not nodes:
            nodes = self.cluster.kv_nodes + [self.cluster.master]

        while not self.stop_crash:
            self.get_memory_footprint()
            sleep = random.randint(60, 120)
            self.sleep(
                sleep, "Iteration:{} waiting to kill memc on all nodes".format(
                    self.crash_count))
            self.kill_memcached(nodes,
                                num_kills=num_kills,
                                graceful=graceful,
                                wait=True)
            self.crash_count += 1
            if self.crash_count > self.crashes:
                self.stop_crash = True
        self.sleep(300)

    def kill_memcached(self,
                       servers=None,
                       num_kills=1,
                       graceful=False,
                       wait=True):
        if not servers:
            servers = self.cluster.kv_nodes + [self.cluster.master]

        for server in servers:
            for _ in xrange(num_kills):
                if num_kills > 1:
                    self.sleep(
                        2,
                        "Sleep for 2 seconds b/w cont memc kill on same node.")
                shell = RemoteMachineShellConnection(server)
                if graceful:
                    shell.restart_couchbase()
                else:
                    shell.kill_memcached()
                shell.disconnect()
            self.sleep(
                5, "Sleep for 5 seconds before killing memc on next node.")

        result = self.check_coredump_exist(self.cluster.nodes_in_cluster)
        if result:
            self.stop_crash = True
            self.task_manager.abort_all_tasks()
            self.doc_loading_tm.abortAllTasks()
            self.assertFalse(
                result, "CRASH | CRITICAL | WARN messages found in cb_logs")

        if wait:
            for server in servers:
                self.check_warmup_complete(server)

    def check_warmup_complete(self, server):
        for bucket in self.cluster.buckets:
            start_time = time.time()
            result = self.bucket_util._wait_warmup_completed(
                [server],
                self.cluster.buckets[0],
                wait_time=self.wait_timeout * 20)
            if not result:
                self.stop_crash = True
                self.task_manager.abort_all_tasks()
                self.doc_loading_tm.abortAllTasks()
                self.assertTrue(
                    result,
                    "Warm-up failed in %s seconds" % (self.wait_timeout * 20))
            else:
                self.log.info("Bucket:%s warm-up completed in %s." %
                              (bucket.name, str(time.time() - start_time)))

    def set_num_writer_and_reader_threads(self,
                                          num_writer_threads="default",
                                          num_reader_threads="default",
                                          num_storage_threads="default"):
        bucket_helper = BucketHelper(self.cluster.master)
        bucket_helper.update_memcached_settings(
            num_writer_threads=num_writer_threads,
            num_reader_threads=num_reader_threads,
            num_storage_threads=num_storage_threads)
Exemplo n.º 18
0
    def test_doc_size(self):
        def check_durability_failures():
            self.log.error(task.sdk_acked_curd_failed.keys())
            self.log.error(task.sdk_exception_crud_succeed.keys())
            self.assertTrue(
                len(task.sdk_acked_curd_failed) == 0,
                "Durability failed for docs: %s" %
                task.sdk_acked_curd_failed.keys())
            self.assertTrue(
                len(task.sdk_exception_crud_succeed) == 0,
                "Durability failed for docs: %s" %
                task.sdk_acked_curd_failed.keys())

        """
        Basic tests for document CRUD operations using JSON docs
        """
        doc_op = self.input.param("doc_op", None)
        def_bucket = self.bucket_util.buckets[0]
        ignore_exceptions = list()
        retry_exceptions = list()

        # Stat validation reference variables
        verification_dict = dict()
        ref_val = dict()
        ref_val["ops_create"] = 0
        ref_val["ops_update"] = 0
        ref_val["ops_delete"] = 0
        ref_val["rollback_item_count"] = 0
        ref_val["sync_write_aborted_count"] = 0
        ref_val["sync_write_committed_count"] = 0

        one_less_node = self.nodes_init == self.num_replicas

        if self.durability_level:
            pass
            #ignore_exceptions.append(
            #    "com.couchbase.client.core.error.RequestTimeoutException")

        if self.target_vbucket and type(self.target_vbucket) is not list:
            self.target_vbucket = [self.target_vbucket]

        self.log.info("Creating doc_generator..")
        # Load basic docs into bucket
        doc_create = doc_generator(self.key,
                                   0,
                                   self.num_items,
                                   doc_size=self.doc_size,
                                   doc_type=self.doc_type,
                                   target_vbucket=self.target_vbucket,
                                   vbuckets=self.vbuckets)
        self.log.info("Loading {0} docs into the bucket: {1}".format(
            self.num_items, def_bucket))
        task = self.task.async_load_gen_docs(
            self.cluster,
            def_bucket,
            doc_create,
            "create",
            0,
            batch_size=self.batch_size,
            process_concurrency=self.process_concurrency,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            ryow=self.ryow,
            check_persistence=self.check_persistence)
        self.task.jython_task_manager.get_task_result(task)

        if self.ryow:
            check_durability_failures()

        # Retry doc_exception code
        self.log.info("Validating failed doc's (if any) exceptions")
        doc_op_info_dict = dict()
        doc_op_info_dict[task] = self.bucket_util.get_doc_op_info_dict(
            def_bucket,
            "create",
            exp=0,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout=self.sdk_timeout,
            time_unit="seconds",
            ignore_exceptions=ignore_exceptions,
            retry_exceptions=retry_exceptions)
        self.bucket_util.verify_doc_op_task_exceptions(doc_op_info_dict,
                                                       self.cluster)

        if len(doc_op_info_dict[task]["unwanted"]["fail"].keys()) != 0:
            self.fail("Failures in retry doc CRUDs: {0}".format(
                doc_op_info_dict[task]["unwanted"]["fail"]))

        self.log.info("Wait for ep_all_items_remaining to become '0'")
        self.bucket_util._wait_for_stats_all_buckets()

        # Update ref_val
        ref_val["ops_create"] = self.num_items + len(task.fail.keys())
        ref_val["sync_write_committed_count"] = self.num_items
        # Validate vbucket stats
        verification_dict["ops_create"] = ref_val["ops_create"]
        verification_dict["rollback_item_count"] = \
            ref_val["rollback_item_count"]
        if self.durability_level:
            verification_dict["sync_write_aborted_count"] = \
                ref_val["sync_write_aborted_count"]
            verification_dict["sync_write_committed_count"] = \
                ref_val["sync_write_committed_count"]

        failed = self.durability_helper.verify_vbucket_details_stats(
            def_bucket,
            self.cluster_util.get_kv_nodes(),
            vbuckets=self.vbuckets,
            expected_val=verification_dict,
            one_less_node=one_less_node)
        if failed:
            self.fail("Cbstat vbucket-details verification failed")

        # Verify initial doc load count
        self.log.info("Validating doc_count in buckets")
        self.bucket_util.verify_stats_all_buckets(self.num_items)

        self.log.info("Creating doc_generator for doc_op")
        num_item_start_for_crud = int(self.num_items / 2)
        doc_update = doc_generator(self.key,
                                   0,
                                   num_item_start_for_crud,
                                   doc_size=self.doc_size,
                                   doc_type=self.doc_type,
                                   target_vbucket=self.target_vbucket,
                                   vbuckets=self.vbuckets)

        expected_num_items = self.num_items
        num_of_mutations = 1

        if doc_op == "update":
            self.log.info("Performing 'update' mutation over the docs")
            task = self.task.async_load_gen_docs(
                self.cluster,
                def_bucket,
                doc_update,
                "update",
                0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                ryow=self.ryow,
                check_persistence=self.check_persistence)
            self.task.jython_task_manager.get_task_result(task)
            ref_val["ops_update"] = (doc_update.end - doc_update.start +
                                     len(task.fail.keys()))
            if self.durability_level:
                ref_val["sync_write_committed_count"] += \
                    (doc_update.end - doc_update.start)
            if self.ryow:
                check_durability_failures()

            # Read all the values to validate update operation
            task = self.task.async_load_gen_docs(
                self.cluster,
                def_bucket,
                doc_update,
                "read",
                0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            op_failed_tbl = TableView(self.log.error)
            op_failed_tbl.set_headers(["Update failed key", "CAS", "Value"])
            for key, value in task.success.items():
                if json.loads(str(value["value"]))["mutated"] != 1:
                    op_failed_tbl.add_row([key, value["cas"], value["value"]])

            op_failed_tbl.display("Update failed for keys:")
            if len(op_failed_tbl.rows) != 0:
                self.fail("Update failed for few keys")
        elif doc_op == "delete":
            self.log.info("Performing 'delete' mutation over the docs")
            task = self.task.async_load_gen_docs(
                self.cluster,
                def_bucket,
                doc_update,
                "delete",
                0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                ryow=self.ryow,
                check_persistence=self.check_persistence)
            self.task.jython_task_manager.get_task_result(task)
            expected_num_items = self.num_items \
                                 - (self.num_items - num_item_start_for_crud)
            ref_val["ops_delete"] = (doc_update.end - doc_update.start +
                                     len(task.fail.keys()))
            if self.durability_level:
                ref_val["sync_write_committed_count"] += \
                    (doc_update.end - doc_update.start)
            if self.ryow:
                check_durability_failures()

            # Read all the values to validate update operation
            task = self.task.async_load_gen_docs(self.cluster,
                                                 def_bucket,
                                                 doc_update,
                                                 "read",
                                                 0,
                                                 batch_size=10,
                                                 process_concurrency=8,
                                                 timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            op_failed_tbl = TableView(self.log.error)
            op_failed_tbl.set_headers(["Delete failed key", "CAS", "Value"])
            for key, value in task.success.items():
                op_failed_tbl.add_row([key, value["cas"], value["value"]])

            op_failed_tbl.display("Delete failed for keys:")
            if len(op_failed_tbl.rows) != 0:
                self.fail("Delete failed for few keys")
        else:
            self.log.warning("Unsupported doc_operation")

        self.log.info("Wait for ep_all_items_remaining to become '0'")
        self.bucket_util._wait_for_stats_all_buckets()

        # Validate vbucket stats
        verification_dict["ops_create"] = ref_val["ops_create"]
        verification_dict["ops_update"] = ref_val["ops_update"]
        verification_dict["ops_delete"] = ref_val["ops_delete"]

        verification_dict["rollback_item_count"] = \
            ref_val["rollback_item_count"]
        if self.durability_level:
            verification_dict["sync_write_aborted_count"] = \
                ref_val["sync_write_aborted_count"]
            verification_dict["sync_write_committed_count"] = \
                ref_val["sync_write_committed_count"]

        failed = self.durability_helper.verify_vbucket_details_stats(
            def_bucket,
            self.cluster_util.get_kv_nodes(),
            vbuckets=self.vbuckets,
            expected_val=verification_dict,
            one_less_node=one_less_node)
        if failed:
            self.fail("Cbstat vbucket-details verification failed")

        self.log.info("Validating doc_count")
        self.bucket_util.verify_stats_all_buckets(expected_num_items)
Exemplo n.º 19
0
    def test_timeout_with_crud_failures(self):
        """
        Test to make sure timeout is handled in durability calls
        and no documents are loaded when durability cannot be met using
        error simulation in server node side

        This will validate failure in majority of nodes, where durability will
        surely fail for all CRUDs

        1. Select a node from the cluster to simulate the specified error
        2. Perform CRUD on the target bucket with given timeout
        3. Using cbstats to verify no operations succeeds
        4. Revert the error scenario from the cluster to resume durability
        5. Validate all mutations are succeeded after reverting
           the error condition

        Note: self.sdk_timeout values is considered as 'seconds'
        """

        # Local method to validate vb_seqno
        def validate_vb_seqno_stats():
            """
            :return retry_validation: Boolean denoting to retry validation
            """
            retry_validation = False
            vb_info["post_timeout"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            for vb_id in range(self.cluster.vbuckets):
                vb_id = str(vb_id)
                if vb_id not in affected_vbs:
                    if vb_id in vb_info["init"][node.ip].keys() \
                            and vb_info["init"][node.ip][vb_id] \
                            != vb_info["post_timeout"][node.ip][vb_id]:
                        self.log_failure(
                            "Unaffected vb-%s stat updated: %s != %s" %
                            (vb_id, vb_info["init"][node.ip][vb_id],
                             vb_info["post_timeout"][node.ip][vb_id]))
                elif int(vb_id) \
                        in target_nodes_vbuckets[Bucket.vBucket.ACTIVE]:
                    if vb_id in vb_info["init"][node.ip].keys() \
                            and vb_info["init"][node.ip][vb_id] \
                            != vb_info["post_timeout"][node.ip][vb_id]:
                        self.log.warning(
                            err_msg %
                            (node.ip, Bucket.vBucket.ACTIVE, vb_id,
                             vb_info["init"][node.ip][vb_id],
                             vb_info["post_timeout"][node.ip][vb_id]))
                elif int(vb_id) \
                        in target_nodes_vbuckets[Bucket.vBucket.REPLICA]:
                    if vb_id in vb_info["init"][node.ip].keys() \
                            and vb_info["init"][node.ip][vb_id] \
                            == vb_info["post_timeout"][node.ip][vb_id]:
                        retry_validation = True
                        self.log.warning(
                            err_msg %
                            (node.ip, Bucket.vBucket.REPLICA, vb_id,
                             vb_info["init"][node.ip][vb_id],
                             vb_info["post_timeout"][node.ip][vb_id]))
            return retry_validation

        shell_conn = dict()
        cbstat_obj = dict()
        error_sim = dict()
        target_nodes_vbuckets = dict()
        vb_info = dict()
        tasks = dict()
        doc_gen = dict()
        affected_vbs = list()

        target_nodes_vbuckets[Bucket.vBucket.ACTIVE] = list()
        target_nodes_vbuckets[Bucket.vBucket.REPLICA] = list()
        vb_info["init"] = dict()
        vb_info["post_timeout"] = dict()
        vb_info["afterCrud"] = dict()

        # Override crud_batch_size to minimum value for testing
        self.crud_batch_size = 5

        target_nodes = self.getTargetNodes()
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(node)
            target_nodes_vbuckets[Bucket.vBucket.ACTIVE] += \
                cbstat_obj[node.ip].vbucket_list(
                    self.bucket.name, vbucket_type=Bucket.vBucket.ACTIVE)
            target_nodes_vbuckets[Bucket.vBucket.REPLICA] += \
                cbstat_obj[node.ip].vbucket_list(
                    self.bucket.name, vbucket_type=Bucket.vBucket.REPLICA)
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])

        curr_time = int(time.time())
        expected_timeout = curr_time + self.sdk_timeout

        target_vbs = target_nodes_vbuckets[Bucket.vBucket.ACTIVE]
        if self.nodes_init == 1:
            pass
        elif self.durability_level \
                == Bucket.DurabilityLevel.PERSIST_TO_MAJORITY:
            target_vbs = target_nodes_vbuckets[Bucket.vBucket.REPLICA]

        # Create required doc_generators
        doc_gen["insert"] = sub_doc_generator(self.key,
                                              self.num_items / 2,
                                              self.crud_batch_size,
                                              target_vbucket=target_vbs,
                                              key_size=self.key_size)
        doc_gen["remove"] = sub_doc_generator_for_edit(
            self.key,
            0,
            self.crud_batch_size,
            key_size=self.key_size,
            template_index=2,
            target_vbucket=target_vbs)
        doc_gen["read"] = sub_doc_generator_for_edit(self.key,
                                                     0,
                                                     self.crud_batch_size,
                                                     key_size=self.key_size,
                                                     template_index=0,
                                                     target_vbucket=target_vbs)
        doc_gen["upsert"] = sub_doc_generator_for_edit(
            self.key,
            int(self.num_items / 4),
            self.crud_batch_size,
            key_size=self.key_size,
            template_index=1,
            target_vbucket=target_vbs)

        for op_type in doc_gen.keys():
            tasks[op_type] = self.task.async_load_gen_sub_docs(
                self.cluster,
                self.bucket,
                doc_gen[op_type],
                op_type,
                0,
                path_create=True,
                batch_size=1,
                process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                start_task=False)

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        for op_type in doc_gen.keys():
            self.task_manager.add_new_task(tasks[op_type])

        # Wait for document_loader tasks to complete
        for op_type in doc_gen.keys():
            self.task.jython_task_manager.get_task_result(tasks[op_type])

            # Validate task failures
            if op_type == DocLoading.Bucket.DocOps.READ:
                # Validation for read task
                if len(tasks[op_type].fail.keys()) != 0:
                    self.log_failure("Read failed for few docs: %s" %
                                     tasks[op_type].fail.keys())
            else:
                # Validation of CRUDs - Update / Create / Delete
                for doc_id, crud_result in tasks[op_type].fail.items():
                    vb_num = self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster.vbuckets)
                    if SDKException.DurabilityAmbiguousException \
                            not in str(crud_result["error"]):
                        self.log_failure(
                            "Invalid exception for doc %s, vb %s: %s" %
                            (doc_id, vb_num, crud_result))

        # Revert the specified error scenario
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # Check whether the timeout triggered properly
        if int(time.time()) < expected_timeout:
            self.log_failure("Timed-out before expected time")

        for op_type in doc_gen.keys():
            if op_type == DocLoading.Bucket.DocOps.READ:
                continue
            while doc_gen[op_type].has_next():
                doc_id, _ = doc_gen[op_type].next()
                affected_vbs.append(
                    str(
                        self.bucket_util.get_vbucket_num_for_key(
                            doc_id, self.cluster.vbuckets)))

        affected_vbs = list(set(affected_vbs))
        err_msg = "%s - mismatch in %s vb-%s seq_no: %s != %s"
        # Fetch latest stats and validate the seq_nos are not updated
        for node in target_nodes:
            retry_count = 0
            max_retry = 3
            while retry_count < max_retry:
                self.log.info("Trying to validate vbseq_no stats: %d" %
                              (retry_count + 1))
                retry_count += 1
                retry_required = validate_vb_seqno_stats()
                if not retry_required:
                    break
                self.sleep(5, "Sleep for vbseq_no stats to update")
            else:
                # This will be exited only if `break` condition is not met
                self.log_failure("validate_vb_seqno_stats verification failed")

        self.validate_test_failure()

        # If replicas+1 == total nodes, verify no mutation should have
        # succeeded with durability
        if self.nodes_init == self.num_replicas + 1:
            read_gen = doc_generator(self.key, 0, self.num_items)
            read_task = self.task.async_load_gen_docs(
                self.cluster,
                self.bucket,
                read_gen,
                DocLoading.Bucket.DocOps.READ,
                0,
                batch_size=500,
                process_concurrency=1,
                timeout_secs=self.sdk_timeout)
            self.task_manager.get_task_result(read_task)

            failed_keys = TableView(self.log.error)
            failed_keys.set_headers(["Key", "Error"])
            half_of_num_items = self.num_items / 2
            for doc_key, doc_info in read_task.success.items():
                key_index = int(doc_key.split("-")[1])
                expected_mutated_val = 0
                if key_index < half_of_num_items:
                    expected_mutated_val = 1
                mutated = json.loads(str(doc_info["value"]))["mutated"]
                if mutated != expected_mutated_val:
                    failed_keys.add_row([doc_key, doc_info])

            failed_keys.display("Affected mutations:")
            self.log.error(read_task.fail)

        # Doc error validation
        for op_type in doc_gen.keys():
            task = tasks[op_type]

            retry_task = self.task.async_load_gen_sub_docs(
                self.cluster,
                self.bucket,
                doc_gen[op_type],
                op_type,
                0,
                path_create=True,
                batch_size=1,
                process_concurrency=8,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task_manager.get_task_result(retry_task)
            retry_failures = set(retry_task.fail.keys())
            initial_failures = set(task.fail.keys())

            if len(list(retry_failures.difference(initial_failures))) != 0:
                self.log_failure("Docs failed during retry task for %s: %s" %
                                 (op_type, retry_task.fail))

        # Verify doc count after expected CRUD failure
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)
        self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)

        # Fetch latest stats and validate the values are updated
        for node in target_nodes:
            vb_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
                self.log_failure("vBucket seq_no stats not updated")

        # Disconnect the shell connection
        for node in target_nodes:
            shell_conn[node.ip].disconnect()

        self.validate_test_failure()
Exemplo n.º 20
0
class volume(BaseTestCase):
    # will add the __init__ functions after the test has been stabilised
    def setUp(self):
        self.input = TestInputSingleton.input
        self.input.test_params.update({"default_bucket":False})
        BaseTestCase.setUp(self)
        self.rest = RestConnection(self.servers[0])
        self.op_type = self.input.param("op_type", "create")
        self.tasks = []         # To have all tasks running in parallel.
        self._iter_count = 0    # To keep a check of how many items are deleted
        self.available_servers = list()
        self.available_servers = self.cluster.servers[self.nodes_init:]
        self.num_buckets = self.input.param("num_buckets", 1)
        self.mutate = 0
        self.doc_ops = self.input.param("doc_ops", None)
        if self.doc_ops:
            self.doc_ops = self.doc_ops.split(';')
        self.iterations = self.input.param("iterations", 2)
        self.vbucket_check = self.input.param("vbucket_check", True)
        self.new_num_writer_threads = self.input.param(
            "new_num_writer_threads", 6)
        self.new_num_reader_threads = self.input.param(
            "new_num_reader_threads", 8)

    def create_required_buckets(self):
        self.log.info("Get the available memory quota")
        self.info = self.rest.get_nodes_self()
        threshold_memory = 100
        # threshold_memory_vagrant = 100
        total_memory_in_mb = self.info.mcdMemoryReserved
        total_available_memory_in_mb = total_memory_in_mb
        active_service = self.info.services

        # If the mentioned service is already present,
        # we remove that much memory from available memory quota
        if "index" in active_service:
            total_available_memory_in_mb -= self.info.indexMemoryQuota
        if "fts" in active_service:
            total_available_memory_in_mb -= self.info.ftsMemoryQuota
        if "cbas" in active_service:
            total_available_memory_in_mb -= self.info.cbasMemoryQuota
        if "eventing" in active_service:
            total_available_memory_in_mb -= self.info.eventingMemoryQuota

        available_memory = total_available_memory_in_mb - threshold_memory
        # available_memory =  total_available_memory_in_mb - threshold_memory_vagrant
        self.rest.set_service_memoryQuota(service='memoryQuota',
                                          memoryQuota=available_memory)

        # Creating buckets for data loading purpose
        self.log.info("Create CB buckets")
        duration = self.input.param("bucket_expiry", 0)
        eviction_policy = self.input.param("eviction_policy", Bucket.EvictionPolicy.VALUE_ONLY)
        self.bucket_type = self.input.param("bucket_type", Bucket.Type.MEMBASE) # Bucket.bucket_type.EPHEMERAL
        compression_mode = self.input.param("compression_mode", Bucket.CompressionMode.PASSIVE)  # Bucket.bucket_compression_mode.ACTIVE
        ramQuota = self.input.param("ramQuota", available_memory)
        bucket_names = self.input.param("bucket_names", "GleamBookUsers")
        if bucket_names:
            bucket_names = bucket_names.split(';')
        if self.bucket_type:
            self.bucket_type = self.bucket_type.split(';')
        if compression_mode:
            compression_mode = compression_mode.split(';')
        if eviction_policy:
            eviction_policy = eviction_policy.split(';')
        if self.num_buckets == 1:
            bucket = Bucket({"name": "GleamBookUsers", "ramQuotaMB": ramQuota, "maxTTL": duration, "replicaNumber":self.num_replicas,
                            "evictionPolicy": eviction_policy[0], "bucketType":self.bucket_type[0], "compressionMode":compression_mode[0]})
            self.bucket_util.create_bucket(bucket)
        elif 1 < self.num_buckets == len(bucket_names):
            for i in range(self.num_buckets):
                bucket = Bucket({"name": bucket_names[i], "ramQuotaMB": ramQuota/self.num_buckets, "maxTTL": duration, "replicaNumber":self.num_replicas,
                             "evictionPolicy": eviction_policy[i], "bucketType":self.bucket_type[i], "compressionMode":compression_mode[i]})
                self.bucket_util.create_bucket(bucket)
        else:
            self.fail("Number of bucket/Names not sufficient")

        # rebalance the new buckets across all nodes.
        self.log.info("Rebalance Starts")
        self.nodes = self.rest.node_statuses()
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[])
        self.rest.monitorRebalance()
        return bucket

    def set_num_writer_and_reader_threads(self, num_writer_threads="default", num_reader_threads="default"):
        for node in self.cluster_util.get_kv_nodes():
            bucket_helper = BucketHelper(node)
            bucket_helper.update_memcached_settings(num_writer_threads=num_writer_threads,
                                                    num_reader_threads=num_reader_threads)

    def volume_doc_generator_users(self, key, start, end):
        template = '{{ "id":"{0}", "alias":"{1}", "name":"{2}", "user_since":"{3}", "employment":{4} }}'
        return GleamBookUsersDocumentGenerator(key, template,
                                               start=start, end=end)

    def volume_doc_generator_messages(self, key, start, end):
        template = '{{ "message_id": "{0}", "author_id": "{1}", "send_time": "{2}" }}'
        return GleamBookMessagesDocumentGenerator(key, template,
                                                  start=start, end=end)

    def initial_data_load(self, initial_load):
        if self.atomicity:
            task = self.task.async_load_gen_docs_atomicity(self.cluster, self.bucket_util.buckets,
                                                            initial_load, "create" , exp=0,
                                                            batch_size=10,
                                                            process_concurrency=self.process_concurrency,
                                                            replicate_to=self.replicate_to,
                                                            persist_to=self.persist_to, timeout_secs=self.sdk_timeout,
                                                            retries=self.sdk_retries,update_count=self.mutate, transaction_timeout=self.transaction_timeout,
                                                            commit=self.transaction_commit,durability=self.durability_level,sync=self.sync)
            self.task.jython_task_manager.get_task_result(task)
        else:
            tasks_info = self.bucket_util._async_load_all_buckets(self.cluster, initial_load,
                                                            "create", exp=0,
                                                            persist_to = self.persist_to,
                                                            replicate_to=self.replicate_to,
                                                            batch_size= 10,
                                                            pause_secs = 5,
                                                            timeout_secs=30,
                                                            durability=self.durability_level,
                                                            process_concurrency = self.process_concurrency,
                                                            retries=self.sdk_retries)

            for task, task_info in tasks_info.items():
                self.task_manager.get_task_result(task)
        self.sleep(10)

    # Loading documents in 2 buckets in parallel through transactions
    def doc_load_using_txns(self):
        if "update" in self.doc_ops and self.gen_update_users is not None:
            self.tasks.append(self.doc_loader_txn("update", self.gen_update_users))
        if "create" in self.doc_ops and self.gen_create_users is not None:
            self.tasks.append(self.doc_loader_txn("create", self.gen_create_users))
        if "delete" in self.doc_ops and self.gen_delete_users is not  None:
            self.tasks.append(self.doc_loader_txn("delete", self.gen_delete_users))
        self.sleep(20)
        for task in self.tasks:
            self.task.jython_task_manager.get_task_result(task)

    def doc_loader_txn(self, op_type, kv_gen):
        if op_type == "update":
            print("Value of Mutated is", self.mutate)
            self.sleep(5)
        process_concurrency = self.process_concurrency
        # if op_type == "update":
        #     if "create" not in self.doc_ops:
        #         self.create_perc = 0
        #     if "delete" not in self.doc_ops:
        #         self.delete_perc = 0
        #     process_concurrency = (self.update_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        # if op_type == "create":
        #     if "update" not in self.doc_ops:
        #         self.update_perc = 0
        #     if "delete" not in self.doc_ops:
        #         self.delete_perc = 0
        #     process_concurrency = (self.create_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        # if op_type == "delete":
        #     if "create" not in self.doc_ops:
        #         self.create_perc = 0
        #     if "update" not in self.doc_ops:
        #         self.update_perc = 0
        #     process_concurrency = (self.delete_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        task = self.task.async_load_gen_docs_atomicity(self.cluster, self.bucket_util.buckets,
                                                       kv_gen, op_type, exp=0,
                                                       batch_size=10,
                                                       process_concurrency=process_concurrency,
                                                       replicate_to=self.replicate_to,
                                                       persist_to=self.persist_to, timeout_secs=self.sdk_timeout,
                                                       retries=self.sdk_retries, update_count=self.mutate,
                                                       transaction_timeout=self.transaction_timeout,
                                                       commit=self.transaction_commit, durability=self.durability_level,
                                                       sync=self.sync, defer=self.defer)
        return task

    # Loading documents through normal doc loader
    def normal_doc_loader(self):
        tasks_info = dict()
        if "update" in self.doc_ops and self.gen_update_users is not None:
            task_info = self.doc_loader("update", self.gen_update_users)
            tasks_info.update(task_info.items())
        if "create" in self.doc_ops and self.gen_create_users is not None:
            task_info = self.doc_loader("create", self.gen_create_users)
            tasks_info.update(task_info.items())
        if "delete" in self.doc_ops and self.gen_delete_users is not None:
            task_info = self.doc_loader("delete", self.gen_delete_users)
            tasks_info.update(task_info.items())
        return tasks_info

    def doc_loader(self, op_type, kv_gen):
        process_concurrency = self.process_concurrency
        if op_type == "update":
            if "create" not in self.doc_ops:
                self.create_perc = 0
            if "delete" not in self.doc_ops:
                self.delete_perc = 0
            process_concurrency = (self.update_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        if op_type == "create":
            if "update" not in self.doc_ops:
                self.update_perc = 0
            if "delete" not in self.doc_ops:
                self.delete_perc = 0
            process_concurrency = (self.create_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        if op_type == "delete":
            if "create" not in self.doc_ops:
                self.create_perc = 0
            if "update" not in self.doc_ops:
                self.update_perc = 0
            process_concurrency = (self.delete_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        retry_exceptions = [
            SDKException.AmbiguousTimeoutException,
            SDKException.RequestCanceledException,
            SDKException.DurabilityAmbiguousException,
            SDKException.DurabilityImpossibleException,
        ]
        tasks_info = self.bucket_util._async_load_all_buckets(self.cluster, kv_gen,
                                                              op_type, 0, batch_size=20,
                                                              persist_to=self.persist_to, replicate_to=self.replicate_to,
                                                              durability=self.durability_level, pause_secs=5,
                                                              timeout_secs=30, process_concurrency=process_concurrency,
                                                              retries=self.sdk_retries,
                                                              retry_exceptions=retry_exceptions)
        return tasks_info

    # Stopping and restarting the memcached process
    def stop_process(self):
        target_node = self.servers[2]
        remote = RemoteMachineShellConnection(target_node)
        error_sim = CouchbaseError(self.log, remote)
        error_to_simulate = "stop_memcached"
        # Induce the error condition
        error_sim.create(error_to_simulate)
        self.sleep(20, "Wait before reverting the error condition")
        # Revert the simulated error condition and close the ssh session
        error_sim.revert(error_to_simulate)
        remote.disconnect()

    def rebalance(self, nodes_in=0, nodes_out=0):
        servs_in = random.sample(self.available_servers, nodes_in)

        self.nodes_cluster = self.cluster.nodes_in_cluster[:]
        self.nodes_cluster.remove(self.cluster.master)
        servs_out = random.sample(self.nodes_cluster, nodes_out)

        if nodes_in == nodes_out:
            self.vbucket_check = False

        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], servs_in, servs_out, check_vbucket_shuffling=self.vbucket_check)

        self.available_servers = [servs for servs in self.available_servers if servs not in servs_in]
        self.available_servers += servs_out

        self.cluster.nodes_in_cluster.extend(servs_in)
        self.cluster.nodes_in_cluster = list(set(self.cluster.nodes_in_cluster) - set(servs_out))
        return rebalance_task

    def rebalance_validation(self, tasks_info, rebalance_task):
        if not rebalance_task.result:
            for task, _ in tasks_info.items():
                self.task.jython_task_manager.get_task_result(task)
            self.fail("Rebalance Failed")

    def data_validation(self, tasks_info):
        if not self.atomicity:
            for task in tasks_info:
                self.task_manager.get_task_result(task)
            self.bucket_util.verify_doc_op_task_exceptions(tasks_info,
                                                           self.cluster)
            self.bucket_util.log_doc_ops_task_failures(tasks_info)

            self.sleep(10)

            for task, task_info in tasks_info.items():
                self.assertFalse(
                    task_info["ops_failed"],
                    "Doc ops failed for task: {}".format(task.thread_name))

        self.log.info("Validating Active/Replica Docs")
        if self.atomicity:
            self.check_replica = False
        else:
            self.check_replica = True

        for bucket in self.bucket_util.buckets:
            tasks = list()
            if self.gen_update_users is not None:
                tasks.append(self.task.async_validate_docs(
                    self.cluster, bucket, self.gen_update_users, "update", 0,
                    batch_size=10, check_replica=self.check_replica))
            if self.gen_create_users is not None:
                tasks.append(self.task.async_validate_docs(
                    self.cluster, bucket, self.gen_create_users, "create", 0,
                    batch_size=10, check_replica=self.check_replica))
            if self.gen_delete_users is not  None:
                tasks.append(self.task.async_validate_docs(
                    self.cluster, bucket, self.gen_delete_users, "delete", 0,
                    batch_size=10, check_replica=self.check_replica))
            for task in tasks:
                self.task.jython_task_manager.get_task_result(task)
            self.sleep(20)

        if not self.atomicity:
            self.bucket_util._wait_for_stats_all_buckets()
            self.bucket_util.verify_stats_all_buckets(self.end - self.initial_load_count*self.delete_perc/100*self._iter_count)

    def data_load(self):
        tasks_info = dict()
        if self.atomicity:
            self.doc_load_using_txns()
            self.sleep(10)
        else:
            tasks_info = self.normal_doc_loader()
            self.sleep(10)
        return tasks_info

    def generate_docs(self):
        self.create_perc = self.input.param("create_perc",100)
        self.update_perc = self.input.param("update_perc", 10)
        self.delete_perc = self.input.param("delete_perc", 10)

        self.gen_delete_users = None
        self.gen_create_users = None
        self.gen_update_users = None

        if "update" in self.doc_ops:
            self.mutate += 1
            self.gen_update_users = doc_generator("Users", 0, self.initial_load_count*self.update_perc/100,
                                                doc_size = self.doc_size, mutate = self.mutate)
        if "delete" in self.doc_ops:
            self.gen_delete_users = doc_generator("Users", self.start,
                                              self.start + (self.initial_load_count*self.delete_perc)/100, doc_size = self.doc_size)
            self._iter_count += 1

        if "create" in self.doc_ops:
            self.start = self.end
            self.end += self.initial_load_count*self.create_perc/100
            self.gen_create_users = doc_generator("Users", self.start, self.end, doc_size = self.doc_size)

    def data_validation_mode(self, tasks_info):
        # if not self.atomicity:
        self.data_validation(tasks_info)
        '''
        else:
            for task in self.tasks:
                self.task.jython_task_manager.get_task_result(task)
            self.sleep(10)
        '''
    def get_bucket_dgm(self, bucket):
        self.rest_client = BucketHelper(self.cluster.master)
        dgm = self.rest_client.fetch_bucket_stats(
            bucket.name)["op"]["samples"]["vb_active_resident_items_ratio"][-1]
        self.log.info("Active Resident Threshold of {0} is {1}".format(bucket.name, dgm))

    def print_crud_stats(self):
        self.table = TableView(self.log.info)
        self.table.set_headers(["Initial Items", "Current Items", "Items Updated", "Items Created", "Items Deleted"])
        if self._iter_count != 0:
            self.table.add_row([str(self.start - self.initial_load_count*self.delete_perc/100*(self._iter_count-1)),
                                str(self.end- self.initial_load_count*self.delete_perc/100*self._iter_count),
                                str(self.update_perc - self.update_perc) + "---" +
                                str(self.initial_load_count*self.update_perc/100),
                                str(self.start) + "---" + str(self.end),
                                str(self.start - self.initial_load_count*self.create_perc/100) + "---" +
                                str(self.start + (self.initial_load_count*self.delete_perc/100) - self.initial_load_count*self.create_perc/100)])
        self.table.display("Docs statistics")

    def test_volume_taf(self):
        ########################################################################################################################
        self.log.info("Step1: Create a n node cluster")
        nodes_init = self.cluster.servers[1:self.nodes_init] if self.nodes_init != 1 else []
        self.task.rebalance([self.cluster.master], nodes_init, [])
        self.cluster.nodes_in_cluster.extend([self.cluster.master] + nodes_init)
        self.query_node = self.cluster.master
        ########################################################################################################################
        self.log.info("Step 2 & 3: Create required buckets.")
        bucket = self.create_required_buckets()
        self.loop = 0
        #######################################################################################################################
        while self.loop<self.iterations:
            self.log.info("Step 4: Pre-Requisites for Loading of docs")
            self.start = 0
            self.bucket_util.add_rbac_user()
            self.end = self.initial_load_count = self.input.param("initial_load", 1000)
            initial_load = doc_generator("Users", self.start, self.start + self.initial_load_count, doc_size=self.doc_size)
            self.initial_data_load(initial_load)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.get_bucket_dgm(bucket)
            ########################################################################################################################
            self.log.info("Step 5: Rebalance in with Loading of docs")
            self.generate_docs()
            self.gen_delete_users=None
            self._iter_count = 0
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads="disk_io_optimized",
                                                       num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in = 1, nodes_out = 0)
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads=self.new_num_writer_threads,
                                                       num_reader_threads=self.new_num_reader_threads)
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached, "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            #########################################################################################################################
            self.log.info("Step 6: Rebalance Out with Loading of docs")
            self.generate_docs()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads="disk_io_optimized",
                                                       num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in = 0, nodes_out = 1)
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads=self.new_num_writer_threads,
                                                       num_reader_threads=self.new_num_reader_threads)
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached, "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            #######################################################################################################################
            self.log.info("Step 7: Rebalance In_Out with Loading of docs")
            self.generate_docs()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads="disk_io_optimized",
                                                       num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in = 2, nodes_out = 1)
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads=self.new_num_writer_threads,
                                                       num_reader_threads=self.new_num_reader_threads)
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached, "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ########################################################################################################################
            self.log.info("Step 8: Swap with Loading of docs")
            self.generate_docs()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads="disk_io_optimized",
                                                       num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=1)
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads=self.new_num_writer_threads,
                                                       num_reader_threads=self.new_num_reader_threads)
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached, "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ########################################################################################################################
            self.log.info("Step 9: Updating the bucket replica to 2")
            bucket_helper = BucketHelper(self.cluster.master)
            for i in range(len(self.bucket_util.buckets)):
                bucket_helper.change_bucket_props(
                    self.bucket_util.buckets[i], replicaNumber=2)
            self.generate_docs()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads="disk_io_optimized",
                                                       num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in =1, nodes_out= 0)
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads=self.new_num_writer_threads,
                                                       num_reader_threads=self.new_num_reader_threads)
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached, "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ########################################################################################################################
            if "ephemeral" in self.bucket_type:
                self.log.info("No Memcached kill for epehemral bucket")
            else:
                self.log.info("Step 10: Stopping and restarting memcached process")
                self.generate_docs()
                if not self.atomicity:
                    self.set_num_writer_and_reader_threads(num_writer_threads=self.new_num_writer_threads,
                                                           num_reader_threads=self.new_num_reader_threads)
                rebalance_task = self.task.async_rebalance(self.cluster.servers, [], [])
                tasks_info = self.data_load()
                if not self.atomicity:
                    self.set_num_writer_and_reader_threads(num_writer_threads="disk_io_optimized",
                                                           num_reader_threads="disk_io_optimized")
                # self.sleep(600, "Wait for Rebalance to start")
                self.task.jython_task_manager.get_task_result(rebalance_task)
                reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
                self.assertTrue(reached, "rebalance failed, stuck or did not complete")
                self.stop_process()
                self.data_validation_mode(tasks_info)
                self.tasks = []
                self.bucket_util.print_bucket_stats()
                self.print_crud_stats()
                self.get_bucket_dgm(bucket)
            ########################################################################################################################
            self.log.info("Step 11: Failover a node and RebalanceOut that node with loading in parallel")
            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0

            prev_failover_stats = self.bucket_util.get_failovers_logs(self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            self.sleep(10)

            disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets, path=None)

            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master, howmany=1)

            # Mark Node for failover
            self.generate_docs()
            tasks_info = self.data_load()
            self.success_failed_over = self.rest.fail_over(self.chosen[0].id, graceful=False)

            self.sleep(300)
            self.nodes = self.rest.node_statuses()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads=self.new_num_writer_threads,
                                                       num_reader_threads=self.new_num_reader_threads)
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[self.chosen[0].id])
            # self.sleep(600)
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance failed")

            servs_out = [node for node in self.cluster.servers if node.ip == self.chosen[0].ip]
            self.cluster.nodes_in_cluster = list(set(self.cluster.nodes_in_cluster) - set(servs_out))
            self.available_servers += servs_out
            self.sleep(10)

            self.data_validation_mode(tasks_info)

            self.bucket_util.compare_failovers_logs(prev_failover_stats, self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            self.sleep(10)

            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset, disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets, path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes, buckets=self.bucket_util.buckets,
                num_replicas=2,
                std=std, total_vbuckets=self.cluster_util.vbuckets)
            self.sleep(10)
            self.tasks = []
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            # self.sleep(600)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached, "rebalance failed, stuck or did not complete")
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ########################################################################################################################
            self.log.info("Step 12: Failover a node and FullRecovery that node")

            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0

            prev_failover_stats = self.bucket_util.get_failovers_logs(self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            self.sleep(10)

            disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets, path=None)

            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master, howmany=1)

            self.generate_docs()
            tasks_info = self.data_load()
            # Mark Node for failover
            self.success_failed_over = self.rest.fail_over(self.chosen[0].id, graceful=False)

            self.sleep(300)

            # Mark Node for full recovery
            if self.success_failed_over:
                self.rest.set_recovery_type(otpNode=self.chosen[0].id, recoveryType="full")

            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads=self.new_num_writer_threads,
                                                       num_reader_threads=self.new_num_reader_threads)

            rebalance_task = self.task.async_rebalance(
                self.cluster.servers[:self.nodes_init], [], [])
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads="disk_io_optimized",
                                                       num_reader_threads="disk_io_optimized")
            # self.sleep(600)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached, "rebalance failed, stuck or did not complete")
            self.sleep(10)

            self.data_validation_mode(tasks_info)

            self.bucket_util.compare_failovers_logs(prev_failover_stats, self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            self.sleep(10)

            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset, disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets, path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes, buckets=self.bucket_util.buckets,
                num_replicas=2,
                std=std, total_vbuckets=self.cluster_util.vbuckets)
            self.sleep(10)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ########################################################################################################################
            self.log.info("Step 13: Failover a node and DeltaRecovery that node with loading in parallel")

            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0

            prev_failover_stats = self.bucket_util.get_failovers_logs(self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            self.sleep(10)

            disk_replica_dataset, disk_active_dataset = self.bucket_util.get_and_compare_active_replica_data_set_all(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets, path=None)

            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master, howmany=1)

            self.generate_docs()
            tasks_info = self.data_load()
            # Mark Node for failover
            self.success_failed_over = self.rest.fail_over(self.chosen[0].id, graceful=False)

            self.sleep(300)
            if self.success_failed_over:
                self.rest.set_recovery_type(otpNode=self.chosen[0].id, recoveryType="delta")
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads=self.new_num_writer_threads,
                                                       num_reader_threads=self.new_num_reader_threads)

            rebalance_task = self.task.async_rebalance(
                self.cluster.servers[:self.nodes_init], [], [])
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads="disk_io_optimized",
                                                       num_reader_threads="disk_io_optimized")
            # self.sleep(600)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached, "rebalance failed, stuck or did not complete")
            self.sleep(10)

            self.data_validation_mode(tasks_info)

            self.bucket_util.compare_failovers_logs(prev_failover_stats, self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            self.sleep(10)

            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset, disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets, path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes, buckets=self.bucket_util.buckets,
                num_replicas=2,
                std=std, total_vbuckets=self.cluster_util.vbuckets)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
        ########################################################################################################################
            self.log.info("Step 14: Updating the bucket replica to 1")
            bucket_helper = BucketHelper(self.cluster.master)
            for i in range(len(self.bucket_util.buckets)):
                bucket_helper.change_bucket_props(
                    self.bucket_util.buckets[i], replicaNumber=1)
            self.generate_docs()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads=self.new_num_writer_threads,
                                                       num_reader_threads=self.new_num_reader_threads)
            rebalance_task = self.task.async_rebalance(self.cluster.servers, [], [])
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(num_writer_threads="disk_io_optimized",
                                                       num_reader_threads="disk_io_optimized")
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached, "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
        ########################################################################################################################
            self.log.info("Step 15: Flush the bucket and start the entire process again")
            self.loop += 1
            if self.loop < self.iterations:
                # Flush the bucket
                self.bucket_util.flush_all_buckets(self.cluster.master)
                self.sleep(10)
                if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                    self.nodes_cluster = self.cluster.nodes_in_cluster[:]
                    self.nodes_cluster.remove(self.cluster.master)
                    servs_out = random.sample(self.nodes_cluster, int(len(self.cluster.nodes_in_cluster) - self.nodes_init))
                    rebalance_task = self.task.async_rebalance(
                        self.cluster.servers[:self.nodes_init], [], servs_out)
                    # self.sleep(600)
                    self.task.jython_task_manager.get_task_result(rebalance_task)
                    self.available_servers += servs_out
                    self.cluster.nodes_in_cluster = list(set(self.cluster.nodes_in_cluster) - set(servs_out))
                    reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
                    self.assertTrue(reached, "rebalance failed, stuck or did not complete")
                    self.get_bucket_dgm(bucket)
                self._iter_count = 0
            else:
                self.log.info("Volume Test Run Complete")
                self.get_bucket_dgm(bucket)