示例#1
0
 def test_set_custom_setting(self, averecmd_params):  # noqa: F811
     """INTERNAL USE ONLY

     Pass the value of the INTERNAL_CUSTOM_SETTING environment variable to
     the "support.setCustomSetting" averecmd method. Nothing is done when
     the variable is unset or empty.
     """
     setting = os.environ.get('INTERNAL_CUSTOM_SETTING')
     if not setting:
         return  # nothing requested

     run_averecmd(**averecmd_params,
                  method="support.setCustomSetting",
                  args=setting)
示例#2
0
    def test_node_health(self, averecmd_params, node_names,
                         test_vars):  # noqa: F811
        """Wait for every node to be "up" and record the node IPs.

        Each node in node_names is polled with "node.get" for roughly 60
        seconds, sleeping 10 seconds between polls. Once a node reports "up",
        its cluster IPs (plus its first vserver client-facing IP, when there
        is one) are collected. The resulting name -> IP list map is stored in
        test_vars["node_ips"] when it is not empty.
        """
        logger = logging.getLogger("test_node_health")
        wait_secs = 60
        poll_secs = 10

        ips_by_node = {}  # node name -> list of IPs
        for node in node_names:
            deadline = time() + wait_secs
            while time() <= deadline:
                node_info = run_averecmd(**averecmd_params,
                                         method="node.get",
                                         args=node)[node]
                node_state = node_info["state"]
                logger.info(
                    'Node {0} has state "{1}"'.format(node, node_state))
                if node_state != "up":
                    sleep(poll_secs)
                    continue

                # The node is up: gather its IPs while the data is at hand.
                addresses = [entry["IP"] for entry in node_info["clusterIPs"]]
                vserver_ips = node_info["clientFacingIPs"]["vserver"]
                if vserver_ips:
                    addresses.append(vserver_ips[0]["IP"])
                ips_by_node[node] = addresses
                break
            assert node_state == "up"

        if ips_by_node:
            test_vars["node_ips"] = ips_by_node
示例#3
0
 def test_node_health(self, averecmd_params):  # noqa: F811
     """Verify that every node returned by "node.list" reports state "up".

     Each node is polled with "node.get" for roughly 60 seconds, sleeping
     10 seconds between polls; the test fails if the last observed state
     is not "up".
     """
     logger = logging.getLogger("test_node_health")
     wait_secs = 60
     poll_secs = 10

     node_list = run_averecmd(**averecmd_params, method="node.list")
     for node in node_list:
         deadline = time() + wait_secs
         while time() <= deadline:
             node_info = run_averecmd(**averecmd_params,
                                      method="node.get",
                                      args=node)
             node_state = node_info[node]["state"]
             logger.info(
                 'Node {0} has state "{1}"'.format(node, node_state))
             if node_state == "up":
                 break
             sleep(poll_secs)
         assert node_state == "up"
示例#4
0
    def test_for_cores(self, averecmd_params):  # noqa: F811
        """
        Fail when "support.listCores" reports any cores for the cluster.
        When cores are present, they are logged and a GSI is collected/sent
        before the test fails.
        """
        logger = logging.getLogger("test_for_cores")
        core_report = run_averecmd(**averecmd_params,
                                   method="support.listCores",
                                   args="cluster")

        # Any non-empty entry in the report means at least one core exists.
        has_cores = any(len(entry) for entry in core_report.values())

        if has_cores:
            logger.error("Cores found: {}".format(core_report))
            # collect/upload a "normal" GSI bundle
            upload_gsi(averecmd_params)

        assert not has_cores
示例#5
0
def node_names(cluster_ips, ssh_con, test_vars):
    """Queries the cluster to get a list of node names.

    Each IP in cluster_ips is tried in order until a "node.list" averecmd
    call succeeds. On success the working IP is stored in
    test_vars["averecmd_ip"] and the result in test_vars["nodes"].

    Args:
        cluster_ips: iterable of candidate IPs to issue averecmd against.
        ssh_con: SSH client handed to run_averecmd.
        test_vars: dict updated with "averecmd_ip" and "nodes".

    Returns:
        The value returned by the "node.list" call.

    Raises:
        AssertionError: if cluster_ips is empty, or if the call failed for
            every IP (the last failure is attached as the cause).
    """
    log = logging.getLogger("node_names")
    last_ex = None
    for _ip in cluster_ips:
        # For resiliency, attempt to issue averecmd calls to known cluster IPs.
        try:
            nodes = run_averecmd(ssh_client=ssh_con,
                                 password=os.environ["AVERE_ADMIN_PW"],
                                 node_ip=_ip,
                                 method="node.list")
            test_vars["averecmd_ip"] = _ip  # this IP worked for averecmd
            test_vars["nodes"] = nodes
            return test_vars["nodes"]
        except Exception as e:
            log.error("node.list failed for IP {}".format(_ip))
            log.error(e)
            last_ex = e

    # Reaching this point means no IP produced a result. Fail loudly instead
    # of implicitly returning None (which is what an empty cluster_ips used
    # to do).
    if last_ex is None:
        raise AssertionError(
            "node.list could not be issued: no cluster IPs were provided")
    raise AssertionError(
        "node.list failed for every cluster IP; last error: {!r}".format(
            last_ex)) from last_ex
示例#6
0
def node_ips(cluster_ips, ssh_con, test_vars):
    """Queries the cluster to get a list of IPs for each node.

    Each IP in cluster_ips is tried in order until a "cluster.get" averecmd
    call succeeds and its result can be processed. The first entry of the
    result's "clusterIPs" supplies a "firstIP-lastIP" range, which is
    expanded with split_ip_range. On success the working IP is stored in
    test_vars["averecmd_ip"] and the expanded range in
    test_vars["cluster_node_ips"].

    Args:
        cluster_ips: iterable of candidate IPs to issue averecmd against.
        ssh_con: SSH client handed to run_averecmd.
        test_vars: dict updated with "averecmd_ip" and "cluster_node_ips".

    Returns:
        The value produced by split_ip_range for the cluster IP range.

    Raises:
        AssertionError: if cluster_ips is empty, or if the attempt failed for
            every IP (the last failure is attached as the cause).
    """
    log = logging.getLogger("node_ips")
    last_ex = None
    for _ip in cluster_ips:
        # For resiliency, attempt to issue averecmd calls to known cluster IPs.
        try:
            result = run_averecmd(ssh_client=ssh_con,
                                  password=os.environ["AVERE_ADMIN_PW"],
                                  node_ip=_ip,
                                  method="cluster.get")
            test_vars["averecmd_ip"] = _ip  # this IP worked for averecmd
            c_ips = result["clusterIPs"][0]
            node_ip_range = "{0}-{1}".format(c_ips["firstIP"], c_ips["lastIP"])
            test_vars["cluster_node_ips"] = split_ip_range(node_ip_range)
            return test_vars["cluster_node_ips"]
        except Exception as e:
            log.error("cluster.get failed for IP {}".format(_ip))
            log.error(e)
            last_ex = e

    # Reaching this point means no IP produced a result. Fail loudly instead
    # of implicitly returning None (which is what an empty cluster_ips used
    # to do).
    if last_ex is None:
        raise AssertionError(
            "cluster.get could not be issued: no cluster IPs were provided")
    raise AssertionError(
        "cluster.get failed for every cluster IP; last error: {!r}".format(
            last_ex)) from last_ex
示例#7
0
 def test_ha_enabled(self, averecmd_params):  # noqa: F811
     """Verify that "cluster.get" reports high-availability (HA) as enabled."""
     cluster_info = run_averecmd(**averecmd_params, method="cluster.get")
     ha_state = cluster_info["ha"]
     assert ha_state == "enabled"
示例#8
0
    def test_artifacts_collect(self, averecmd_params, scp_con,
                               test_vars):  # noqa: F811, E501
        """
        Collect test artifacts (node logs, rolling trace) from each node.
        Artifacts are stored to local directories.

        Controller logs are copied first with scp_con and the SSH key pair
        named in test_vars is copied to the controller. Then every node
        returned by "node.list" is reached through an SSH tunnel via the
        controller and its files are downloaded with SCP.

        A failure on one node does not stop collection from the remaining
        nodes. The most recent failure (a node that could not be reached on
        any port attempt, or a file that could not be copied) is re-raised
        after every node has been visited.

        Args:
            averecmd_params: keyword arguments forwarded to run_averecmd.
            scp_con: object used to get/put files on the controller.
            test_vars: dict read for "atd_obj", "ssh_priv_key", "ssh_pub_key",
                "public_ip" and "controller_user".

        Raises:
            NoValidConnectionsError: immediately, when connecting to a node
                fails for a reason other than the forwarded local port not
                being reachable.
            Exception: the last error recorded during collection, if any.
        """
        log = logging.getLogger("test_collect_artifacts")
        artifacts_dir = "vfxt_artifacts_" + test_vars["atd_obj"].deploy_id
        os.makedirs(artifacts_dir, exist_ok=True)

        log.debug("Copying logs from controller to {}".format(artifacts_dir))
        for lf in [
                "vfxt.log", "enablecloudtrace.log",
                "create_cluster_command.log"
        ]:
            scp_con.get("~/" + lf, artifacts_dir)

        log.debug("Copying SSH keys to the controller")
        scp_con.put(test_vars["ssh_priv_key"], "~/.ssh/.")
        scp_con.put(test_vars["ssh_pub_key"], "~/.ssh/.")

        # list of files and directories to download from every node
        to_collect = [
            "/var/log/messages",
            "/var/log/xmlrpc.log",

            # assumes rolling trace was enabled during deploy
            "/support/trace/rolling",

            # TODO: 2019-0219: turned off for now
            # "/support/gsi",
            # "/support/cores",
        ]

        nodes = run_averecmd(**averecmd_params, method="node.list")
        log.debug("Nodes found: {}".format(nodes))
        last_error = None
        for node in nodes:
            node_dir = artifacts_dir + "/" + node
            node_dir_log = node_dir + "/log"
            node_dir_trace = node_dir + "/trace"
            log.debug("node_dir_log = {}, node_dir_trace = {}".format(
                node_dir_log, node_dir_trace))

            # make local directories to store downloaded artifacts
            os.makedirs(node_dir_trace, exist_ok=True)
            os.makedirs(node_dir_log, exist_ok=True)

            # get this node's primary cluster IP address
            node_ip = run_averecmd(**averecmd_params,
                                   method="node.get",
                                   args=node)[node]["primaryClusterIP"]["IP"]

            log.debug("Tunneling to node {} using IP {}".format(node, node_ip))

            # Connection failure for THIS node only. It is tracked separately
            # from last_error so that a successful connection to one node
            # cannot erase an error recorded for an earlier node.
            connect_error = None

            # get_unused_local_port actually uses the port to know it's
            # available before making it available again and returning the
            # port number. Rarely, there is a race where the open() call
            # below fails because the port is not yet fully available
            # again. In those cases, try getting a new port.
            for port_attempt in range(1, 11):
                tunnel_local_port = get_unused_local_port()
                controller_c = Connection(
                    test_vars["public_ip"],
                    user=test_vars["controller_user"],
                    connect_kwargs={
                        "key_filename": test_vars["ssh_priv_key"],
                    })
                # Entering controller_c as a context manager guarantees the
                # controller connection is closed after the tunnel is torn
                # down, on success, retry and error paths alike.
                with controller_c, controller_c.forward_local(
                        local_port=tunnel_local_port,
                        remote_port=22,
                        remote_host=node_ip):
                    node_c = Connection("127.0.0.1",
                                        user="******",
                                        port=tunnel_local_port,
                                        connect_kwargs={
                                            "password":
                                            os.environ["AVERE_ADMIN_PW"]
                                        })
                    try:
                        node_c.open()
                    except NoValidConnectionsError as ex:
                        exp_err = "Unable to connect to port {} on 127.0.0.1".format(
                            tunnel_local_port)
                        if exp_err not in str(ex):
                            raise
                        connect_error = ex
                        log.warning("{0} (attempt #{1}, retrying)".format(
                            exp_err, str(port_attempt)))
                        continue  # iterate

                    # Connected: failures from earlier port attempts for this
                    # node are no longer relevant.
                    connect_error = None

                    try:
                        scp_client = SCPClient(node_c.transport)
                        try:
                            # Calls below catch exceptions and report them to
                            # the error log, but then continue. This is
                            # because a failure to collect artifacts on one
                            # node should not prevent collection from other
                            # nodes. After collection has completed, the last
                            # exception will be raised.
                            #
                            # NOTE(review): everything, including the rolling
                            # trace, is downloaded into node_dir_log;
                            # node_dir_trace is created but never used as a
                            # destination. Confirm whether that is intended.
                            for tc in to_collect:
                                log.debug(
                                    "SCP'ing {} from node {} to {}".format(
                                        tc, node, node_dir_log))
                                try:
                                    scp_client.get(tc,
                                                   node_dir_log,
                                                   recursive=True)
                                except Exception as ex:
                                    log.error(
                                        "({}) Exception caught: {}".format(
                                            node, ex))
                                    last_error = ex
                        finally:
                            scp_client.close()
                    finally:
                        # Closing the SCP client does not close the SSH
                        # connection it rides on; release it explicitly.
                        node_c.close()
                log.debug("Connections to node {} closed".format(node))
                break  # no need to iterate again

            if connect_error is not None:
                # Every port attempt failed for this node; remember it so it
                # is reported after the remaining nodes have been processed.
                log.error("({}) Unable to connect on any port attempt".format(
                    node))
                last_error = connect_error

        if last_error:
            log.error("See previous error(s) above. Raising last exception.")
            raise last_error