def test_hadoopjmx(version, nodeType):
    """
    Any new versions of hadoop should be manually built, tagged, and pushed to quay.io, i.e.
    docker build \
        -t quay.io/signalfx/hadoop-test:<version> \
        --build-arg HADOOP_VER=<version> \
        <repo_root>/test-services/hadoop
    docker push quay.io/signalfx/hadoop-test:<version>
    """
    with run_container("quay.io/signalfx/hadoop-test:%s" % version, hostname="hadoop-master") as hadoop_master:
        with run_container("quay.io/signalfx/hadoop-test:%s" % version, hostname="hadoop-worker1") as hadoop_worker1:
            # nameNode and resourceManager run on the master; the other node
            # types (e.g. dataNode/nodeManager) run on the worker container.
            if nodeType in ["nameNode", "resourceManager"]:
                container = hadoop_master
            else:
                container = hadoop_worker1
            host = container_ip(container)
            # JMX port for this node type comes from the module-level lookup table.
            port = NODETYPE_PORT[nodeType]
            # YARN daemons need their JMX options appended to yarn-env before
            # hadoop is started, or the port will never open.
            if nodeType in ["resourceManager", "nodeManager"]:
                yarn_var = YARN_VAR[nodeType]
                yarn_opts = YARN_OPTS % (yarn_var, port, yarn_var)
                cmd = ["/bin/bash", "-c", "echo 'export %s' >> %s" % (yarn_opts, YARN_ENV_PATH)]
                container.exec_run(cmd)
            start_hadoop(hadoop_master, hadoop_worker1)

            # wait for jmx to be available
            assert wait_for(p(tcp_socket_open, host, port), 60), "jmx service not listening on port %d" % port

            # start the agent with hadoopjmx config
            config = HADOOPJMX_CONFIG.substitute(host=host, port=port, nodeType=nodeType)
            with run_agent(config) as [backend, _, _]:
                assert wait_for(p(has_datapoint_with_dim, backend, "nodeType", nodeType)), (
                    "Didn't get hadoopjmx datapoints for nodeType %s" % nodeType
                )
def test_kong(kong_image):  # pylint: disable=redefined-outer-name
    """Bring up postgres + kong, run kong's DB migrations, then verify the
    collectd/kong monitor reports datapoints through the agent."""
    # Environment shared by the migrations container and the real kong container.
    kong_env = dict(KONG_ADMIN_LISTEN="0.0.0.0:8001", KONG_LOG_LEVEL="warn", KONG_DATABASE="postgres", KONG_PG_DATABASE="kong")

    with run_container("postgres:9.5", environment=dict(POSTGRES_USER="******", POSTGRES_DB="kong")) as db:
        db_ip = container_ip(db)
        kong_env["KONG_PG_HOST"] = db_ip

        def db_is_ready():
            return db.exec_run("pg_isready -U kong").exit_code == 0

        assert wait_for(db_is_ready)

        # Run the schema migrations from a throwaway kong container (kept alive
        # with `sleep inf`) before starting the real instance.
        with run_container(kong_image, environment=kong_env, command="sleep inf") as migrations:

            def db_is_reachable():
                return migrations.exec_run("psql -h {} -U kong".format(db_ip)).exit_code == 0

            assert wait_for(db_is_reachable)
            assert migrations.exec_run("kong migrations up --v").exit_code == 0

        with run_container(kong_image, environment=kong_env) as kong:
            kong_ip = container_ip(kong)

            # The signalfx kong plugin exposes metrics on the admin port.
            def kong_is_listening():
                try:
                    return get("http://{}:8001/signalfx".format(kong_ip)).status_code == 200
                except RequestException:
                    return False

            assert wait_for(kong_is_listening)

            config = string.Template(
                dedent("""
            monitors:
              - type: collectd/kong
                host: $host
                port: 8001
                metrics:
                  - metric: connections_handled
                    report: true
            """)).substitute(host=container_ip(kong))

            with run_agent(config) as [backend, _, _]:
                assert wait_for(
                    p(has_datapoint_with_dim, backend, "plugin", "kong")), "Didn't get Kong data point"
def test_python_runner_with_redis():
    """Verify the Python runner subprocess emits redis datapoints and that the
    agent restarts it (and resumes reporting) after the process is killed."""
    with run_container("redis:4-alpine") as test_container:
        host = container_ip(test_container)
        config = MONITOR_CONFIG.substitute(host=host, bundle_root=BUNDLE_DIR)
        assert wait_for(p(tcp_socket_open, host, 6379), 60), "redis is not listening on port"
        redis_client = redis.StrictRedis(host=host, port=6379, db=0)
        assert wait_for(redis_client.ping, 60), "service didn't start"

        with run_agent(config) as [backend, get_output, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "redis_info")), "didn't get datapoints"

            # Scrape the runner's PID out of the agent log output so we can kill it.
            assert wait_for(
                p(regex_search_matches_output, get_output, PID_RE.search))
            pid = int(PID_RE.search(get_output()).groups()[0])

            os.kill(pid, signal.SIGTERM)

            time.sleep(3)
            # Clear old datapoints so the next wait proves the restarted runner
            # is producing fresh data, not replaying buffered points.
            backend.datapoints.clear()
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "redis_info")
            ), "didn't get datapoints after Python process was killed"

            assert wait_for(
                p(has_datapoint, backend, metric_name="counter.lru_clock", metric_type=sf_pbuf.CUMULATIVE_COUNTER),
                timeout_seconds=3,
            ), "metric type was wrong"
def run_kafka(version):
    """
    Runs a kafka container with zookeeper
    """
    with run_container("zookeeper:3.5") as zookeeper:
        zkhost = container_ip(zookeeper)
        assert wait_for(p(tcp_socket_open, zkhost, 2181), 60), "zookeeper didn't start"
        with run_service(
            "kafka",
            environment={
                "JMX_PORT": "7099",
                "KAFKA_ZOOKEEPER_CONNECT": "%s:2181" % (zkhost, ),
                "START_AS": "broker"
            },
            buildargs={"KAFKA_VERSION": version},
        ) as kafka_container:
            # One-shot container that creates the test topic against the broker.
            # NOTE(review): the returned context manager is never entered here,
            # so cleanup semantics depend on run_service's implementation — confirm.
            run_service(
                "kafka",
                environment={
                    "START_AS": "create-topic",
                    "KAFKA_ZOOKEEPER_CONNECT": "%s:2181" % (zkhost, )
                },
                buildargs={"KAFKA_VERSION": version},
            )
            yield kafka_container
def test_etcd_monitor():
    """Start an etcd v2 container and check that the agent's etcd monitor
    reports datapoints tagged with plugin=etcd."""
    with run_container("quay.io/coreos/etcd:v2.3.8", command=ETCD_COMMAND) as container:
        etcd_host = container_ip(container)
        assert wait_for(p(tcp_socket_open, etcd_host, 2379), 60), "service didn't start"
        agent_config = ETCD_CONFIG.substitute(host=etcd_host)
        with run_agent(agent_config) as [backend, _, _]:
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "etcd")), "Didn't get etcd datapoints"
def test_bad_globbing():
    """A glob form that zookeeper cannot serve should surface a clear error
    message in the agent's output."""
    with run_container("zookeeper:3.4") as container:
        zk_ip = container_ip(container)
        assert wait_for(p(tcp_socket_open, zk_ip, 2181), 30)
        create_znode(container, "/env", "prod")
        agent_config = BAD_GLOB_CONFIG.substitute(zk_endpoint="%s:2181" % zk_ip)
        with run_agent(agent_config) as [_, get_output, _]:
            assert wait_for(lambda: "Zookeeper only supports globs" in get_output())
def run_redis(image="redis:4-alpine"):
    """Start a redis container and yield [ip, client] once it answers PING."""
    with run_container(image) as container:
        redis_host = container_ip(container)
        assert wait_for(p(tcp_socket_open, redis_host, 6379), 60), "service not listening on port"
        client = redis.StrictRedis(host=redis_host, port=6379, db=0)
        assert wait_for(client.ping, 60), "service didn't start"
        yield [redis_host, client]
def test_postgresql():
    """The postgresql monitor should report plugin datapoints, including the
    pg_blks.toast_hit metric."""
    with run_container("postgres:10", environment=ENV) as container:
        pg_host = container_ip(container)
        assert wait_for(p(tcp_socket_open, pg_host, 5432), 60), "service didn't start"
        agent_config = CONFIG_TEMP.substitute(host=pg_host)
        with run_agent(agent_config) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "postgresql")
            ), "Didn't get postgresql datapoints"
            assert wait_for(p(has_datapoint_with_metric_name, backend, "pg_blks.toast_hit"))
def test_bad_globbing():
    """Unsupported glob syntax in the zookeeper config source must be called
    out in the agent log."""
    with run_container("zookeeper:3.4") as container:
        assert wait_for(p(container_cmd_exit_0, container, "nc -z localhost 2181"), 5)
        create_znode(container, "/env", "prod")
        endpoint = "%s:2181" % container_ip(container)
        agent_config = BAD_GLOB_CONFIG.substitute(zk_endpoint=endpoint)
        with run_agent(agent_config) as [_, get_output, _]:
            assert wait_for(lambda: "Zookeeper only supports globs" in get_output())
def test_redis(image):
    """Run redis from the given image and check that redis_info datapoints
    make it to the backend."""
    with run_container(image) as container:
        redis_host = container_ip(container)
        assert wait_for(p(tcp_socket_open, redis_host, 6379), 60), "service not listening on port"
        client = redis.StrictRedis(host=redis_host, port=6379, db=0)
        assert wait_for(client.ping, 60), "service didn't start"
        agent_config = MONITOR_CONFIG.substitute(host=redis_host)
        with run_agent(agent_config) as [backend, _, _]:
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "redis_info")), "didn't get datapoints"
def test_basic_etcd2_config():
    """Monitor definitions and dimension values stored in etcd2 should be
    picked up by the agent's etcd2 config source."""
    with run_container(ETCD2_IMAGE, command=ETCD_COMMAND) as etcd:
        assert wait_for(p(container_cmd_exit_0, etcd, "/etcdctl ls"), 5), "etcd didn't start"
        # Seed the keys the agent config references.
        for path, value in (
            ("/env", "prod"),
            ("/monitors/cpu", "- type: collectd/cpu"),
            ("/monitors/signalfx-metadata", "- type: collectd/signalfx-metadata"),
        ):
            create_path(etcd, path, value)
        final_conf = CONFIG.substitute(endpoint="%s:2379" % container_ip(etcd))
        with run_agent(final_conf) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "signalfx-metadata")
            ), "Datapoints didn't come through"
            assert wait_for(p(has_datapoint_with_dim, backend, "env", "prod")), "dimension wasn't set"
def devstack():
    """Boot a systemd-entrypoint devstack container, run its start script, and
    yield the container once the script exits successfully."""
    options = {
        "entrypoint": "/lib/systemd/systemd",
        "privileged": True,
        "volumes": {
            "/lib/modules": {"bind": "/lib/modules", "mode": "ro"},
            "/sys/fs/cgroup": {"bind": "/sys/fs/cgroup", "mode": "ro"},
        },
        "environment": {"container": "docker"},
    }
    with run_container("quay.io/signalfx/devstack:latest", **options) as container:
        exit_code, output = container.exec_run("start-devstack.sh")
        assert exit_code == 0, "devstack failed to start:\n%s" % output.decode("utf-8")
        yield container
def run_vault():
    """Start a vault dev-mode container, enable a raw stdout audit device, and
    yield ``[client, get_audit_events]`` where the second element is a callable
    that parses audit events out of the container logs.

    Fix: the original called ``re.search(...).group(1)`` directly, so a missing
    "Root Token:" line raised AttributeError on ``.group`` before the
    ``assert token`` guard could ever fire.  Assert on the match object first.
    """
    with run_container("vault:1.0.2") as vault_cont:
        vault_ip = container_ip(vault_cont)
        assert wait_for(p(tcp_socket_open, vault_ip, 8200), 30)
        # Dev-mode vault prints the root token to its logs on startup.
        assert wait_for(lambda: "Root Token:" in vault_cont.logs().decode("utf-8"), 10)
        logs = vault_cont.logs()
        match = re.search(r"Root Token: (.*)$", logs.decode("utf-8"), re.MULTILINE)
        assert match, "Could not get root token of vault server"
        token = match.group(1)
        client = hvac.Client(url=f"http://{vault_ip}:8200", token=token)
        # log_raw so secret values appear unhashed in audit output, and
        # file_path=stdout so events land in the container logs for parsing.
        client.sys.enable_audit_device(
            device_type="file", options={"log_raw": True, "prefix": AUDIT_PREFIX, "file_path": "stdout"}
        )
        yield [client, lambda: parse_audit_events_from_logs(vault_cont)]
def test_marathon(marathon_image):
    """Run marathon against zookeeper and verify the collectd/marathon monitor
    emits datapoints."""
    with run_container("zookeeper:3.5") as zookeeper:
        zkhost = container_ip(zookeeper)
        assert wait_for(p(tcp_socket_open, zkhost, 2181), 60), "zookeeper didn't start"
        # Marathon stores its state in zookeeper; mesos master is not actually
        # required for the monitor's API endpoints to come up.
        with run_container(
            marathon_image, command=["--master", "localhost:5050", "--zk", "zk://{0}:2181/marathon".format(zkhost)]
        ) as service_container:
            host = container_ip(service_container)
            config = dedent(
                f"""
                monitors:
                - type: collectd/marathon
                  host: {host}
                  port: 8080
                """
            )
            assert wait_for(p(tcp_socket_open, host, 8080), 120), "marathon not listening on port"
            # Port being open isn't enough — wait for the REST API to answer.
            assert wait_for(
                p(http_status, url="http://{0}:8080/v2/info".format(host), status=[200]), 120
            ), "service didn't start"
            with run_agent(config) as [backend, _, _]:
                assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "marathon")), "didn't get datapoints"
def test_interior_globbing():
    """Globs in an interior path segment (/services/*/monitor) should match
    existing keys and dynamically pick up keys added later."""
    with run_container(ETCD2_IMAGE, command=ETCD_COMMAND) as etcd:
        assert wait_for(p(container_cmd_exit_0, etcd, "/etcdctl ls"), 5), "etcd didn't start"
        create_path(etcd, "/env", "prod")
        # Monitors live at /services/<name>/monitor so the glob sits in the
        # middle of the path, not at the end.
        create_path(etcd, "/services/cpu/monitor", "- type: collectd/cpu")
        create_path(etcd, "/services/signalfx/monitor", "- type: collectd/signalfx-metadata")

        final_conf = INTERNAL_GLOB_CONFIG.substitute(endpoint="%s:2379" % container_ip(etcd))
        with run_agent(final_conf) as [backend, _, _]:
            assert wait_for(
                p(has_event_with_dim, backend, "plugin", "signalfx-metadata")
            ), "Datapoints didn't come through"

            # A key created after the agent starts should also be matched.
            create_path(etcd, "/services/uptime/monitor", "- type: collectd/uptime")
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "uptime")), "didn't get uptime datapoints"
def test_basic_zk_config():
    """Monitors and dimension values stored in zookeeper should flow through
    the agent's zookeeper config source."""
    with run_container("zookeeper:3.4") as container:
        zk_ip = container_ip(container)
        assert wait_for(p(tcp_socket_open, zk_ip, 2181), 30)
        # Seed every znode the agent config refers to.
        for node, value in (
            ("/env", "prod"),
            ("/monitors", ""),
            ("/monitors/cpu", "- type: collectd/cpu"),
            ("/monitors/signalfx-metadata", "- type: collectd/signalfx-metadata"),
        ):
            create_znode(container, node, value)
        final_conf = CONFIG.substitute(zk_endpoint="%s:2181" % zk_ip)
        with run_agent(final_conf) as [backend, _, _]:
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "signalfx-metadata"))
            assert wait_for(p(has_datapoint_with_dim, backend, "env", "prod"))
def run_init_system_image(base_image, with_socat=True):
    """Build and run a privileged container based on *base_image* with a fake
    SignalFx backend wired up, yielding [container, backend].

    With with_socat=True the backend listens on localhost and HTTPS traffic to
    ingest/api.signalfx.com is proxied into it from inside the container;
    otherwise the backend listens on the host IP directly.
    """
    image_id = build_base_image(base_image)
    print("Image ID: %s" % image_id)
    if with_socat:
        backend_ip = "127.0.0.1"
    else:
        backend_ip = get_host_ip()
    with fake_backend.start(ip_addr=backend_ip) as backend:
        container_options = {
            # Init systems running in the container want permissions
            "privileged": True,
            "volumes": {
                "/sys/fs/cgroup": {
                    "bind": "/sys/fs/cgroup",
                    "mode": "ro"
                },
                "/tmp/scratch": {
                    "bind": "/tmp/scratch",
                    "mode": "rw"
                },
            },
            "extra_hosts": {
                # Socat will be running on localhost to forward requests to
                # these hosts to the fake backend
                "ingest.signalfx.com": backend.ingest_host,
                "api.signalfx.com": backend.api_host,
            },
        }

        with run_container(image_id, wait_for_ip=True, **container_options) as cont:
            if with_socat:
                # Proxy the backend calls through a fake HTTPS endpoint so that we
                # don't have to change the default configuration included by the
                # package. The base_image used should trust the self-signed certs
                # included in the images dir so that the agent doesn't throw TLS
                # verification errors.
                with socat_https_proxy(cont, backend.ingest_host, backend.ingest_port,
                                       "ingest.signalfx.com", "127.0.0.1"), socat_https_proxy(
                                           cont, backend.api_host, backend.api_port,
                                           "api.signalfx.com", "127.0.0.2"):
                    yield [cont, backend]
            else:
                yield [cont, backend]
def test_basic_zk_config():
    """Agent config sourced from zookeeper should yield datapoints carrying
    both the configured monitor and the stored env dimension."""
    with run_container("zookeeper:3.4") as container:
        assert wait_for(p(container_cmd_exit_0, container, "nc -z localhost 2181"), 5)
        # Populate the znodes referenced by the agent configuration template.
        for node, value in (
            ("/env", "prod"),
            ("/monitors", ""),
            ("/monitors/cpu", "- type: collectd/cpu"),
            ("/monitors/signalfx-metadata", "- type: collectd/signalfx-metadata"),
        ):
            create_znode(container, node, value)
        final_conf = CONFIG.substitute(zk_endpoint="%s:2181" % container_ip(container))
        with run_agent(final_conf) as [backend, _, _]:
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "signalfx-metadata"))
            assert wait_for(p(has_datapoint_with_dim, backend, "env", "prod"))
def test_mongo():
    """The collectd/mongodb monitor should emit datapoints with plugin=mongo."""
    with run_container("mongo:3.6") as container:
        mongo_host = container_ip(container)
        assert wait_for(p(tcp_socket_open, mongo_host, 27017), 60), "service didn't start"
        agent_config = dedent(f"""
            monitors:
              - type: collectd/mongodb
                host: {mongo_host}
                port: 27017
                databases: [admin]
            """)
        with run_agent(agent_config) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "mongo")), "Didn't get mongo datapoints"
def test_hadoop(version):
    """Bring up a two-node hadoop cluster and verify the apache_hadoop monitor
    reports datapoints, including an active-node count of 1."""
    with run_service("hadoop", buildargs={"HADOOP_VER": version}, hostname="hadoop-master") as hadoop_master:
        with run_container(hadoop_master.image, hostname="hadoop-worker1") as hadoop_worker1:
            containers = {
                "hadoop-master": hadoop_master,
                "hadoop-worker1": hadoop_worker1
            }
            # distribute the ip and hostnames for each container
            distribute_hostnames(containers)

            # format hdfs
            print_lines(
                hadoop_master.exec_run(
                    ["/usr/local/hadoop/bin/hdfs", "namenode", "-format"])[1])

            # start hadoop and yarn
            print_lines(hadoop_master.exec_run("start-dfs.sh")[1])
            print_lines(hadoop_master.exec_run("start-yarn.sh")[1])

            # wait for yarn api to be available
            host = container_ip(hadoop_master)
            assert wait_for(p(tcp_socket_open, host, 8088), 60), "service not listening on port"
            assert wait_for(
                p(http_status, url="http://{0}:8088".format(host), status=[200]),
                120), "service didn't start"

            # start the agent with hadoop config
            config = HADOOP_CONFIG.substitute(host=host, port=8088)
            with run_agent(config) as [backend, _, _]:
                assert wait_for(
                    p(has_datapoint_with_dim, backend, "plugin",
                      "apache_hadoop")), "Didn't get hadoop datapoints"
                # only one worker joined, so the cluster reports exactly 1 active node
                assert wait_for(
                    p(has_datapoint, backend,
                      "gauge.hadoop.cluster.metrics.active_nodes", {},
                      1)), "expected 1 hadoop worker node"
def test_cadvisor():
    """Run cadvisor with the host filesystem mounts it needs and check that at
    least one of its self-described metrics shows up."""
    host_mounts = {
        "/": {"bind": "/rootfs", "mode": "ro"},
        "/var/run": {"bind": "/var/run", "mode": "ro"},
        "/sys": {"bind": "/sys", "mode": "ro"},
        "/var/lib/docker": {"bind": "/var/lib/docker", "mode": "ro"},
        "/dev/disk": {"bind": "/dev/disk", "mode": "ro"},
    }
    with run_container("google/cadvisor:latest", volumes=host_mounts) as container:
        cadvisor_host = container_ip(container)
        assert wait_for(p(tcp_socket_open, cadvisor_host, 8080), 60), "service didn't start"
        agent_config = dedent(f"""
            monitors:
              - type: cadvisor
                cadvisorURL: http://{cadvisor_host}:8080
            """)
        with run_agent(agent_config) as [backend, _, _]:
            expected_metrics = get_monitor_metrics_from_selfdescribe("cadvisor")
            assert wait_for(
                p(any_metric_found, backend, expected_metrics)), "Didn't get cadvisor datapoints"
def test_vault_renewable_secret_refresh():
    """
    Use the Mongo database secret engine to get renewable Mongo credentials
    to use in the Mongo collectd plugin. Make sure the secret gets renewed as
    expected.
    """
    with run_container("mongo:3.6") as mongo_cont, run_vault() as [vault_client, get_audit_events]:
        assert wait_for(p(tcp_socket_open, container_ip(mongo_cont), 27017), 30), "mongo service didn't start"

        # Point vault's database secret engine at the mongo container.
        vault_client.sys.enable_secrets_engine(backend_type="database")
        vault_client.write(
            "database/config/my-mongodb-database",
            plugin_name="mongodb-database-plugin",
            allowed_roles="my-role",
            connection_url=f"mongodb://{container_ip(mongo_cont)}:27017/admin",
            username="******",
            password="",
        )
        # Short default TTL forces the agent's renewer to act during the test;
        # max_ttl is long so renewal (not re-read) is the expected behavior.
        vault_client.write(
            "database/roles/my-role",
            db_name="my-mongodb-database",
            creation_statements='{ "db": "admin", "roles": [{ "role": "readWrite" }, {"role": "read", "db": "foo"}] }',
            default_ttl="13s",
            max_ttl="24h",
        )
        with run_agent(
            dedent(
                f"""
                intervalSeconds: 1
                configSources:
                  vault:
                    vaultToken: {vault_client.token}
                    vaultAddr: {vault_client.url}
                monitors:
                 - type: collectd/mongodb
                   host: {container_ip(mongo_cont)}
                   port: 27017
                   databases:
                    - admin
                   username: {{"#from": "vault:database/creds/my-role[username]"}}
                   password: {{"#from": "vault:database/creds/my-role[password]"}}
                metricsToExclude:
                 - metricName: "!gauge.objects"
                """
            )
        ) as [backend, _, _]:
            assert wait_for(p(has_datapoint, backend, dimensions={"plugin": "mongo"}))
            # The credential should be read from vault exactly once and then
            # kept alive via lease renewal, not re-read.
            assert audit_read_paths(get_audit_events()) == ["database/creds/my-role"], "expected one read"

            time.sleep(10)
            assert audit_read_paths(get_audit_events()) == ["database/creds/my-role"], "expected still one read"

            renewals = audit_secret_renewals(get_audit_events())
            # The secret gets renewed immediately by the renewer and then again
            # within its lease duration period.
            assert len(renewals) == 2, "expected two renewal ops"
            for ren in renewals:
                assert "database/creds/my-role" in ren, "expected renewal of right secret"

            # After renewal the plugin should still be able to authenticate.
            backend.datapoints.clear()
            assert wait_for(p(has_datapoint, backend, dimensions={"plugin": "mongo"})), "plugin lost access to mongo"