def test_basic_vault_config():
    with run_vault() as [vault_client, get_audit_events]:
        vault_client.sys.enable_secrets_engine(backend_type="kv", options={"version": "1"})
        vault_client.write("secret/data/appinfo", data={"env": "prod"})
        vault_client.write("kv/usernames", app="me")
        with run_agent(
            dedent(
                f"""
                intervalSeconds: 2
                globalDimensions:
                  env: {{"#from": "vault:secret/data/appinfo[data.env]"}}
                  user: {{"#from": "vault:kv/usernames[app]"}}
                configSources:
                  vault:
                    vaultToken: {vault_client.token}
                    vaultAddr: {vault_client.url}
                monitors:
                  - type: collectd/uptime
                """
            )
        ) as [backend, _, _]:
            assert wait_for(p(has_datapoint, backend, dimensions={"env": "prod"}))
            assert wait_for(p(has_datapoint, backend, dimensions={"user": "me"}))
            assert audit_read_paths(get_audit_events()) == ["secret/data/appinfo", "kv/usernames"], "expected two reads"


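# NOTE: the tests in this module lean on a small set of shared helpers: `p` is
# assumed to be an alias for `functools.partial`, and `wait_for` polls a
# zero-argument predicate until it passes or a timeout expires. The sketch
# below is illustrative only (the real helpers live in the shared test
# utilities and may differ in detail); it is named `_wait_for_sketch` so it
# does not shadow the imported helper.
def _wait_for_sketch(predicate, timeout_seconds=30, interval_seconds=0.5):
    # Poll the predicate until it returns something truthy or time runs out.
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval_seconds)
    return False

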
def test_vault_kv_poll_refetch():
    """
    Test the KV v2 secret refetch polling operation
    """
    with run_vault() as [vault_client, get_audit_events]:
        vault_client.write("secret/data/app", data={"env": "dev"})
        with run_agent(
            dedent(
                f"""
                intervalSeconds: 2
                globalDimensions:
                  env: {{"#from": "vault:secret/data/app[data.env]"}}
                configSources:
                  vault:
                    vaultToken: {vault_client.token}
                    vaultAddr: {vault_client.url}
                    kvV2PollInterval: 10s
                monitors:
                  - type: collectd/uptime
                """
            )
        ) as [backend, _, _]:
            assert wait_for(p(has_datapoint, backend, dimensions={"env": "dev"}))
            assert audit_read_paths(get_audit_events()) == ["secret/data/app"], "expected one read"

            vault_client.write("secret/data/app", data={"env": "prod"})
            assert wait_for(p(has_datapoint, backend, dimensions={"env": "prod"}))
            assert "secret/metadata/app" in audit_read_paths(get_audit_events())


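# NOTE: a hedged aside on why the audit log above ends up containing
# "secret/metadata/app": in Vault's KV v2 engine, secret values are read from
# "<mount>/data/<key>" while version metadata lives at "<mount>/metadata/<key>",
# so a poller can check the metadata path for a newer version before refetching
# the value itself. This helper is hypothetical and not part of the agent.
def _kv2_metadata_path(data_path):
    # e.g. "secret/data/app" -> "secret/metadata/app"
    mount, _, key = data_path.partition("/data/")
    return f"{mount}/metadata/{key}" if key else data_path

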
def test_elasticsearch_without_cluster():
    # start the ES container without the service
    with run_service(
        "elasticsearch/6.4.2", environment={"cluster.name": "testCluster"}, entrypoint="sleep inf"
    ) as es_container:
        host = container_ip(es_container)
        config = dedent(
            f"""
            monitors:
              - type: collectd/elasticsearch
                host: {host}
                port: 9200
                username: elastic
                password: testing123
            """
        )
        with run_agent(config) as [backend, _, _]:
            assert not wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            ), "datapoints found without service"
            # start the ES service and make sure it gets discovered
            es_container.exec_run("/usr/local/bin/docker-entrypoint.sh eswrapper", detach=True)
            assert wait_for(
                p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200]), 180
            ), "service didn't start"
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            ), "Didn't get elasticsearch datapoints"


def test_python_runner_with_redis():
    with run_container("redis:4-alpine") as test_container:
        host = container_ip(test_container)
        config = MONITOR_CONFIG.substitute(host=host, bundle_root=BUNDLE_DIR)
        assert wait_for(p(tcp_socket_open, host, 6379), 60), "redis is not listening on port"
        redis_client = redis.StrictRedis(host=host, port=6379, db=0)
        assert wait_for(redis_client.ping, 60), "service didn't start"

        with run_agent(config) as [backend, get_output, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "redis_info")
            ), "didn't get datapoints"
            assert wait_for(p(regex_search_matches_output, get_output, PID_RE.search))
            pid = int(PID_RE.search(get_output()).groups()[0])
            os.kill(pid, signal.SIGTERM)
            time.sleep(3)
            backend.datapoints.clear()
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "redis_info")
            ), "didn't get datapoints after Python process was killed"
            assert wait_for(
                p(
                    has_datapoint,
                    backend,
                    metric_name="counter.lru_clock",
                    metric_type=sf_pbuf.CUMULATIVE_COUNTER,
                ),
                timeout_seconds=3,
            ), "metric type was wrong"


def test_negated_filter_with_monitor_type():
    """
    Having monitorType in a filter should make that filter only apply to a
    specific monitor type and not to other metrics.
    """
    with run_agent(
        """
        monitors:
          - type: collectd/signalfx-metadata
          - type: collectd/memory
          - type: collectd/df
          - type: collectd/uptime
        metricsToExclude:
          - metricNames:
              - memory.used
              - memory.free
            monitorType: collectd/memory
            negated: true
          - metricName: uptime
        """
    ) as [backend, _, _]:
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "memory.used"))
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "memory.free"))
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "df_complex.free"))
        assert ensure_always(lambda: not has_datapoint_with_metric_name(backend, "memory.cached"), 10)
        assert ensure_always(lambda: not has_datapoint_with_metric_name(backend, "uptime"), 5)


def test_hadoopjmx(version, nodeType):
    """
    Any new versions of hadoop should be manually built, tagged, and pushed to quay.io, i.e.

    docker build \
        -t quay.io/signalfx/hadoop-test:<version> \
        --build-arg HADOOP_VER=<version> \
        <repo_root>/test-services/hadoop

    docker push quay.io/signalfx/hadoop-test:<version>
    """
    with run_container("quay.io/signalfx/hadoop-test:%s" % version, hostname="hadoop-master") as hadoop_master:
        with run_container("quay.io/signalfx/hadoop-test:%s" % version, hostname="hadoop-worker1") as hadoop_worker1:
            if nodeType in ["nameNode", "resourceManager"]:
                container = hadoop_master
            else:
                container = hadoop_worker1
            host = container_ip(container)
            port = NODETYPE_PORT[nodeType]
            if nodeType in ["resourceManager", "nodeManager"]:
                yarn_var = YARN_VAR[nodeType]
                yarn_opts = YARN_OPTS % (yarn_var, port, yarn_var)
                cmd = ["/bin/bash", "-c", "echo 'export %s' >> %s" % (yarn_opts, YARN_ENV_PATH)]
                container.exec_run(cmd)

            start_hadoop(hadoop_master, hadoop_worker1)

            # wait for JMX to be available
            assert wait_for(p(tcp_socket_open, host, port), 60), "jmx service not listening on port %d" % port

            # start the agent with the hadoopjmx config
            config = HADOOPJMX_CONFIG.substitute(host=host, port=port, nodeType=nodeType)
            with run_agent(config) as [backend, _, _]:
                assert wait_for(
                    p(has_datapoint_with_dim, backend, "nodeType", nodeType)
                ), "Didn't get hadoopjmx datapoints for nodeType %s" % nodeType


def test_monitor_filter():
    """
    Ensure the filters on monitors get applied
    """
    with run_agent(
        """
        monitors:
          - type: collectd/signalfx-metadata
          - type: collectd/df
          - type: collectd/memory
            metricsToExclude:
              - metricName: memory.used
          - type: collectd/uptime
        """
    ) as [backend, _, update_config]:
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "df_complex.free"))
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "memory.free"))
        assert ensure_always(lambda: not has_datapoint_with_metric_name(backend, "memory.used"))

        update_config(
            """
            monitors:
              - type: collectd/signalfx-metadata
              - type: collectd/df
              - type: collectd/memory
              - type: collectd/uptime
            """
        )
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "memory.used"))
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "memory.free"))


def test_vault_nonrenewable_secret_refresh():
    with run_vault() as [vault_client, get_audit_events]:
        vault_client.sys.enable_secrets_engine(backend_type="kv", options={"version": "1"})
        vault_client.write("kv/passwords", app="s3cr3t", ttl="10s")
        with run_agent(
            dedent(
                f"""
                intervalSeconds: 1
                globalDimensions:
                  password: {{"#from": "vault:kv/passwords[app]"}}
                configSources:
                  vault:
                    vaultToken: {vault_client.token}
                    vaultAddr: {vault_client.url}
                monitors:
                  - type: internal-metrics
                    metricsToExclude:
                      - metricName: "!sfxagent.go_num_goroutine"
                """
            )
        ) as [backend, _, _]:
            assert wait_for(p(has_datapoint, backend, dimensions={"password": "s3cr3t"}))
            assert audit_read_paths(get_audit_events()) == ["kv/passwords"], "expected one read"

            # The refetch time is 1/2 of the lease time of 10s
            time.sleep(5)
            assert audit_read_paths(get_audit_events()) == ["kv/passwords", "kv/passwords"], "expected two reads"


def test_docker_observer_labels_partial():
    """
    Test that the docker observer picks up a partially configured endpoint
    from container labels
    """
    with run_agent(
        dedent(
            """
            observers:
              - type: docker
            monitors:
              - type: collectd/nginx
                discoveryRule: container_name =~ "nginx-disco-partial" && port == 80
            """
        )
    ) as [backend, _, _]:
        with run_service(
            "nginx",
            name="nginx-disco-partial",
            labels={"agent.signalfx.com.config.80.extraDimensions": "{mydim: myvalue}"},
        ):
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "nginx")), "Didn't get nginx datapoints"
            assert wait_for(p(has_datapoint_with_dim, backend, "mydim", "myvalue")), "Didn't get extra dimension"

        # Let nginx be removed by the docker observer and collectd restart
        time.sleep(5)
        backend.datapoints.clear()
        assert ensure_always(lambda: not has_datapoint_with_dim(backend, "container_name", "nginx-disco-partial"), 10)


def test_filter_with_restart():
    with run_agent(
        """
        monitors:
          - type: collectd/signalfx-metadata
          - type: collectd/df
          - type: collectd/memory
          - type: collectd/uptime
        metricsToExclude:
          - metricNames:
              - memory.*
            monitorType: collectd/memory
        """
    ) as [backend, _, update_config]:
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "df_complex.free"))
        assert ensure_always(lambda: not has_datapoint_with_metric_name(backend, "memory.used"))
        assert ensure_always(lambda: not has_datapoint_with_metric_name(backend, "memory.free"))

        update_config(
            """
            monitors:
              - type: collectd/signalfx-metadata
              - type: collectd/df
              - type: collectd/memory
              - type: collectd/uptime
            metricsToExclude:
              - metricNames:
                  - memory.used
                monitorType: collectd/memory
            """
        )
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "memory.free"))


def test_jenkins(version):
    with run_service(
        "jenkins", buildargs={"JENKINS_VERSION": version, "JENKINS_PORT": "8080"}
    ) as jenkins_container:
        host = container_ip(jenkins_container)
        config = dedent(
            f"""
            monitors:
              - type: collectd/jenkins
                host: {host}
                port: 8080
                metricsKey: {METRICS_KEY}
            """
        )
        assert wait_for(p(tcp_socket_open, host, 8080), 60), "service not listening on port"
        assert wait_for(
            p(http_status, url=f"http://{host}:8080/metrics/{METRICS_KEY}/ping/", status=[200]), 120
        ), "service didn't start"
        with run_agent(config) as [backend, _, _]:
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "jenkins")), "Didn't get jenkins datapoints"


def test_redis_key_lengths():
    with run_redis() as [hostname, redis_client]:
        redis_client.lpush("queue-1", *["a", "b", "c"])
        redis_client.lpush("queue-2", *["x", "y"])
        config = dedent(
            f"""
            monitors:
              - type: collectd/redis
                host: {hostname}
                port: 6379
                sendListLengths:
                  - databaseIndex: 0
                    keyPattern: queue-*
            """
        )
        with run_agent(config) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint, backend, metric_name="gauge.key_llen", dimensions={"key_name": "queue-1"}, value=3)
            ), "didn't get datapoints"
            assert wait_for(
                p(has_datapoint, backend, metric_name="gauge.key_llen", dimensions={"key_name": "queue-2"}, value=2)
            ), "didn't get datapoints"


def test_basic_filtering():
    with run_agent(BASIC_CONFIG) as [backend, _, _]:
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "uptime"))
        assert ensure_always(lambda: not has_datapoint_with_metric_name(backend, "cpu.utilization"), 10)


def test_docker_observer_labels():
    """
    Test that the docker observer picks up a fully configured endpoint from
    container labels
    """
    with run_agent(
        dedent(
            """
            observers:
              - type: docker
            """
        )
    ) as [backend, _, _]:
        with run_service(
            "nginx",
            name="nginx-disco-full",
            labels={
                "agent.signalfx.com.monitorType.80": "collectd/nginx",
                "agent.signalfx.com.config.80.intervalSeconds": "1",
            },
        ):
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "nginx")), "Didn't get nginx datapoints"

        # Let nginx be removed by the docker observer and collectd restart
        time.sleep(5)
        backend.datapoints.clear()
        assert ensure_always(lambda: not has_datapoint_with_dim(backend, "container_name", "nginx-disco-full"), 10)


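# NOTE: a hedged summary of the docker observer label scheme exercised by the
# two docker observer tests above. The port number embedded in the label key
# selects which exposed container port the setting applies to; the values here
# are purely illustrative.
EXAMPLE_DISCO_LABELS = {
    # fully configures a monitor for port 80
    "agent.signalfx.com.monitorType.80": "collectd/nginx",
    # agent.signalfx.com.config.<port>.<option> sets any monitor config option
    "agent.signalfx.com.config.80.intervalSeconds": "1",
    "agent.signalfx.com.config.80.extraDimensions": "{mydim: myvalue}",
}

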
def test_elasticsearch_with_threadpool():
    with run_service("elasticsearch/6.2.0", environment={"cluster.name": "testCluster"}) as es_container:
        host = container_ip(es_container)
        assert wait_for(
            p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200]), 180
        ), "service didn't start"
        config = dedent(
            f"""
            monitors:
              - type: collectd/elasticsearch
                host: {host}
                port: 9200
                username: elastic
                password: testing123
                threadPools:
                  - bulk
                  - index
                  - search
            """
        )
        with run_agent(config) as [backend, get_output, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            ), "Didn't get elasticsearch datapoints"
            assert wait_for(
                p(has_datapoint_with_dim, backend, "thread_pool", "bulk")
            ), "Didn't get bulk thread pool metrics"
            assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"


def test_elasticsearch_with_cluster_option():
    with run_service("elasticsearch/6.4.2", environment={"cluster.name": "testCluster"}) as es_container:
        host = container_ip(es_container)
        assert wait_for(
            p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200]), 180
        ), "service didn't start"
        config = dedent(
            f"""
            monitors:
              - type: collectd/elasticsearch
                host: {host}
                port: 9200
                username: elastic
                password: testing123
                cluster: testCluster1
            """
        )
        with run_agent(config) as [backend, get_output, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            ), "Didn't get elasticsearch datapoints"
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin_instance", "testCluster1")
            ), "Cluster name not picked up from read callback"
            # make sure all plugin_instance dimensions were overridden by the cluster option
            assert not wait_for(
                p(has_datapoint_with_dim, backend, "plugin_instance", "testCluster"), 10
            ), "plugin_instance dimension not overridden by cluster option"
            assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"


def test_include_filter_with_monitor_type():
    """
    Test that include filters will override exclude filters
    """
    with run_agent(
        """
        monitors:
          - type: collectd/disk
          - type: collectd/uptime
        metricsToExclude:
          - metricNames:
              - disk_time.read
            monitorType: collectd/disk
          - metricNames:
              - disk_ops.read
              - disk_ops.write
            monitorType: collectd/disk
            negated: true
        metricsToInclude:
          - metricNames:
              - disk_time.read
        """
    ) as [backend, _, _]:
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "disk_ops.read"))
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "disk_ops.write"), 5)
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "disk_time.read"), 5)
        assert ensure_always(lambda: not has_datapoint_with_metric_name(backend, "disk_time.write"), 5)
        assert wait_for(lambda: has_datapoint_with_metric_name(backend, "uptime"), 5)


def test_elasticsearch_with_additional_metrics():
    with run_service("elasticsearch/6.2.0", environment={"cluster.name": "testCluster"}) as es_container:
        host = container_ip(es_container)
        assert wait_for(
            p(http_status, url=f"http://{host}:9200/_nodes/_local", status=[200]), 180
        ), "service didn't start"
        config = dedent(
            f"""
            monitors:
              - type: collectd/elasticsearch
                host: {host}
                port: 9200
                username: elastic
                password: testing123
                additionalMetrics:
                  - cluster.initializing-shards
                  - thread_pool.threads
            """
        )
        with run_agent(config) as [backend, get_output, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "elasticsearch")
            ), "Didn't get elasticsearch datapoints"
            assert wait_for(
                p(has_datapoint_with_metric_name, backend, "gauge.cluster.initializing-shards")
            ), "Didn't get gauge.cluster.initializing-shards metric"
            assert wait_for(
                p(has_datapoint_with_metric_name, backend, "gauge.thread_pool.threads")
            ), "Didn't get gauge.thread_pool.threads metric"
            assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"


def test_basic():
    """
    See if we get datapoints from a very standard set of monitors
    """
    with run_agent(BASIC_CONFIG) as [backend, get_output, _]:
        assert wait_for(lambda: backend.datapoints), "Didn't get any datapoints"
        assert has_log_message(get_output(), "info")


def test_etcd_monitor():
    with run_container("quay.io/coreos/etcd:v2.3.8", command=ETCD_COMMAND) as etcd_cont:
        host = container_ip(etcd_cont)
        config = ETCD_CONFIG.substitute(host=host)
        assert wait_for(p(tcp_socket_open, host, 2379), 60), "service didn't start"
        with run_agent(config) as [backend, _, _]:
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "etcd")), "Didn't get etcd datapoints"


def test_redis(image):
    with run_redis(image) as [hostname, _]:
        config = MONITOR_CONFIG.substitute(host=hostname)
        with run_agent(config) as [backend, _, _]:
            assert wait_for(p(has_datapoint, backend, dimensions={"plugin": "redis_info"})), "didn't get datapoints"


def test_tracing_load():
    """
    Test that all of the traces sent through the agent get the proper service
    correlation datapoint.
    """
    port = random.randint(5001, 20000)
    with run_agent(
        dedent(
            f"""
            hostname: "testhost"
            writer:
              sendTraceHostCorrelationMetrics: true
              traceHostCorrelationMetricsInterval: 1s
              staleServiceTimeout: 7s
            monitors:
              - type: trace-forwarder
                listenAddress: localhost:{port}
            """
        )
    ) as [backend, _, _]:
        assert wait_for(p(tcp_port_open_locally, port)), "trace forwarder port never opened!"
        for i in range(0, 100):
            spans = _test_trace()
            spans[0]["localEndpoint"]["serviceName"] += f"-{i}"
            spans[1]["localEndpoint"]["serviceName"] += f"-{i}"
            resp = requests.post(
                f"http://localhost:{port}/v1/trace",
                headers={"Content-Type": "application/json"},
                data=json.dumps(spans),
            )
            assert resp.status_code == 200

        for i in range(0, 100):
            assert wait_for(
                p(
                    has_datapoint,
                    backend,
                    metric_name="sf.int.service.heartbeat",
                    dimensions={"sf_hasService": f"myapp-{i}", "host": "testhost"},
                )
            ), "Didn't get host correlation datapoint"
            assert wait_for(
                p(
                    has_datapoint,
                    backend,
                    metric_name="sf.int.service.heartbeat",
                    dimensions={"sf_hasService": f"file-server-{i}", "host": "testhost"},
                )
            ), "Didn't get host correlation datapoint"

        time.sleep(10)
        backend.datapoints.clear()
        assert ensure_never(
            p(has_datapoint, backend, metric_name="sf.int.service.heartbeat"), timeout_seconds=5
        ), "Got infra correlation metric when it should have been expired"


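# NOTE: `_test_trace` is defined elsewhere in this module; judging from the
# assertions above, it must return at least two Zipkin v2 JSON spans whose
# localEndpoint serviceNames are "myapp" and "file-server". A minimal sketch
# of such a payload (ids, names, and timestamps are arbitrary):
def _test_trace_sketch():
    return [
        {
            "traceId": "0123456789abcdef",
            "id": "0123456789abcdef",
            "name": "get",
            "kind": "SERVER",
            "timestamp": 1538406065536000,
            "duration": 10000,
            "localEndpoint": {"serviceName": "myapp"},
        },
        {
            "traceId": "0123456789abcdef",
            "id": "fedcba9876543210",
            "parentId": "0123456789abcdef",
            "name": "fetch",
            "kind": "CLIENT",
            "timestamp": 1538406065536000,
            "duration": 5000,
            "localEndpoint": {"serviceName": "file-server"},
        },
    ]

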
def test_all_kafka_monitors(version):
    with run_kafka(version) as kafka:
        kafkahost = container_ip(kafka)
        with run_service(
            "kafka",
            environment={"JMX_PORT": "8099", "START_AS": "producer", "KAFKA_BROKER": "%s:9092" % (kafkahost,)},
            buildargs={"KAFKA_VERSION": version},
        ) as kafka_producer:
            kafkaproducerhost = container_ip(kafka_producer)
            with run_service(
                "kafka",
                environment={"JMX_PORT": "9099", "START_AS": "consumer", "KAFKA_BROKER": "%s:9092" % (kafkahost,)},
                buildargs={"KAFKA_VERSION": version},
            ) as kafka_consumer:
                kafkaconsumerhost = container_ip(kafka_consumer)
                with run_agent(
                    textwrap.dedent(
                        """
                        monitors:
                          - type: collectd/kafka
                            host: {0}
                            port: 7099
                            clusterName: testCluster
                          - type: collectd/kafka_producer
                            host: {1}
                            port: 8099
                          - type: collectd/kafka_consumer
                            host: {2}
                            port: 9099
                        """.format(kafkahost, kafkaproducerhost, kafkaconsumerhost)
                    )
                ) as [backend, _, _]:
                    assert wait_for(
                        p(has_datapoint_with_metric_name, backend, "gauge.kafka-active-controllers"),
                        timeout_seconds=60,
                    ), "Didn't get kafka datapoints"
                    assert wait_for(
                        p(has_datapoint_with_dim, backend, "cluster", "testCluster"), timeout_seconds=60
                    ), "Didn't get cluster dimension from kafka datapoints"
                    assert wait_for(
                        p(has_datapoint_with_dim, backend, "client-id", "console-producer"), timeout_seconds=60
                    ), "Didn't get client-id dimension from kafka_producer datapoints"
                    assert wait_for(
                        p(has_datapoint_with_dim, backend, "client-id", "consumer-1"), timeout_seconds=60
                    ), "Didn't get client-id dimension from kafka_consumer datapoints"


def test_cpufreq():
    with run_agent(
        """
        monitors:
          - type: collectd/cpufreq
        """
    ) as [_, get_output, _]:
        time.sleep(10)
        assert not has_log_message(get_output().lower(), "error"), "error found in agent output!"


def test_bad_globbing():
    with run_container("zookeeper:3.4") as zk_cont:
        zkhost = container_ip(zk_cont)
        assert wait_for(p(tcp_socket_open, zkhost, 2181), 30)
        create_znode(zk_cont, "/env", "prod")

        final_conf = BAD_GLOB_CONFIG.substitute(zk_endpoint="%s:2181" % zkhost)
        with run_agent(final_conf) as [_, get_output, _]:
            assert wait_for(lambda: "Zookeeper only supports globs" in get_output())


def test_health_checker_tcp():
    with run_service("nginx") as nginx_container:
        host = container_ip(nginx_container)
        assert wait_for(p(tcp_socket_open, host, 80), 60), "service didn't start"
        with run_agent(CONFIG.substitute(host=host)) as [backend, _, _]:
            assert wait_for(
                p(has_datapoint_with_dim, backend, "plugin", "health_checker")
            ), "Didn't get health_checker datapoints"


def test_kong(kong_image):  # pylint: disable=redefined-outer-name
    kong_env = dict(
        KONG_ADMIN_LISTEN="0.0.0.0:8001",
        KONG_LOG_LEVEL="warn",
        KONG_DATABASE="postgres",
        KONG_PG_DATABASE="kong",
    )

    with run_container("postgres:9.5", environment=dict(POSTGRES_USER="kong", POSTGRES_DB="kong")) as db:
        db_ip = container_ip(db)
        kong_env["KONG_PG_HOST"] = db_ip

        def db_is_ready():
            return db.exec_run("pg_isready -U kong").exit_code == 0

        assert wait_for(db_is_ready)

        with run_container(kong_image, environment=kong_env, command="sleep inf") as migrations:

            def db_is_reachable():
                return migrations.exec_run("psql -h {} -U kong".format(db_ip)).exit_code == 0

            assert wait_for(db_is_reachable)
            assert migrations.exec_run("kong migrations up --v").exit_code == 0

        with run_container(kong_image, environment=kong_env) as kong:
            kong_ip = container_ip(kong)

            def kong_is_listening():
                try:
                    return get("http://{}:8001/signalfx".format(kong_ip)).status_code == 200
                except RequestException:
                    return False

            assert wait_for(kong_is_listening)

            config = string.Template(
                dedent(
                    """
                    monitors:
                      - type: collectd/kong
                        host: $host
                        port: 8001
                        metrics:
                          - metric: connections_handled
                            report: true
                    """
                )
            ).substitute(host=container_ip(kong))

            with run_agent(config) as [backend, _, _]:
                assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "kong")), "Didn't get Kong data point"


def test_nginx():
    with run_service("nginx") as nginx_container:
        host = container_ip(nginx_container)
        config = NGINX_CONFIG.substitute(host=host)
        assert wait_for(p(tcp_socket_open, host, 80), 60), "service didn't start"
        with run_agent(config) as [backend, _, _]:
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "nginx")), "Didn't get nginx datapoints"


def test_apache():
    with run_service("apache") as apache_container:
        host = container_ip(apache_container)
        config = APACHE_CONFIG.substitute(host=host)
        assert wait_for(p(tcp_socket_open, host, 80), 60), "service didn't start"
        with run_agent(config) as [backend, _, _]:
            assert wait_for(p(has_datapoint_with_dim, backend, "plugin", "apache")), "Didn't get apache datapoints"