def random_fault():
    """Pick one of the enabled fault injectors at random (weighted) and fire it.

    Only the short kill faults are currently enabled; the long-kill and
    pause variants are kept commented out below for easy re-enabling.
    NOTE: the source had this collapsed onto one physical line, which made
    the trailing `#(...)` entries comment out the closing `])()` — restored
    to its intended multi-line form so the fault actually executes.
    """
    metafaults.pick_fault([
        (1, kill_short_zk),
        (1, kill_short_server),
        (1, kill_short_client),
        #(1, kill_long_zk),
        #(1, kill_long_server),
        #(1, kill_long_client),
        #(1, pause_zk),
        #(1, pause_server),
        #(1, pause_client),
        ])()
# Transient network failure for an Accumulo node.
# XXX make sure this is less than ZK heartbeats
# (Collapsed-line defect fixed: in the one-line form, this leading comment
# swallowed the assignment and the whole profile below.)
fail_node_transient = faults.fail_network(bastion_host=bastion, seconds=10,
    restart_daemons=["Accumulo-All"], use_flush=True)

# Trigger schedule: every 60s, with probability 0.33, run one of the
# weighted network-failure faults (transient failures twice as likely).
profile = [
  triggers.Periodic(
    # How often do you want a failure? for master nodes, you should probably
    # give enough time for recovery ~5-15 minutes
    60,
    metafaults.maybe_fault(
      # How likely do you want a failure? decreasing this will make failures
      # line up across nodes less often.
      0.33,
      metafaults.pick_fault([
        # You can change the weights here to see different kinds of flaky nodes
        (1, fail_node_long),
        (1, fail_node_short),
        (2, fail_node_transient),
      ]))),
]

########NEW FILE########
__FILENAME__ = hbase
#!/usr/bin/env python
#
# Licensed to Cloudera, Inc. under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. Cloudera, Inc. licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
# HBase fault definitions (collapsed-line defect fixed: inline comments in the
# one-line form commented out every definition after the first `#`).
dn_kill_short = faults.kill_daemons(["DataNode"], signal.SIGKILL, 3)

# Suspend daemons to simulate long GC pauses; 62s for the region server
# (presumably chosen to outlast the ZK session timeout — TODO confirm).
rs_pause = faults.pause_daemons(["HRegionServer"], 62)
dn_pause = faults.pause_daemons(["DataNode"], 20)

# This fault isn't that useful yet, since it only drops inbound packets
# but outbound packets (eg, the ZK pings) keep going.
rs_drop_inbound_packets = faults.drop_packets_to_daemons(["HRegionServer"], 64)

# Every 45s pick one weighted fault; region-server faults dominate.
profile = [
  triggers.Periodic(
    45,
    metafaults.pick_fault([
      # kill -9s
      (5, rs_kill_long),
      (1, dn_kill_long),
      # fast kill -9s
      (5, rs_kill_short),
      (1, dn_kill_short),
      # pauses (simulate GC?)
      (10, rs_pause),
      (1, dn_pause),
      # drop packets (simulate network outage)
      #(1, faults.drop_packets_to_daemons(["DataNode"], 20)),
      #(1, rs_drop_inbound_packets),
      ])),
#  triggers.WebServerTrigger(12321)
]
# Pick the bastion host used to drive network failures: explicit env var wins,
# otherwise guess the remote host we were ssh'd in from.
# (Collapsed-line defect fixed: restored the original multi-line structure so
# the guard, logging and fault definitions execute again.)
bastion = os.getenv("GREMLINS_BASTION_HOST", hostutils.guess_remote_host())
if not bastion:
  raise Exception("GREMLINS_BASTION_HOST not set, and I couldn't guess your remote host.")

logging.info("Using %s as bastion host for network failures. You should be able to ssh from that host at all times." % bastion)

fail_node_long = faults.fail_network(bastion_host=bastion, seconds=300,
    restart_daemons=["Accumulo-All"], use_flush=True)

# XXX make sure this is greater than ZK heartbeats
fail_node_short = faults.fail_network(bastion_host=bastion, seconds=45,
    restart_daemons=["Accumulo-All"], use_flush=True)

# XXX make sure this is less than ZK heartbeats
fail_node_transient = faults.fail_network(bastion_host=bastion, seconds=10,
    restart_daemons=["Accumulo-All"], use_flush=True)

profile = [
  triggers.Periodic(
    # How often do you want a failure? for master nodes, you should probably
    # give enough time for recovery ~5-15 minutes
    60,
    metafaults.maybe_fault(
      # How likely do you want a failure? decreasing this will make failures
      # line up across nodes less often.
      0.33,
      metafaults.pick_fault([
        # You can change the weights here to see different kinds of flaky nodes
        (1, fail_node_long),
        (1, fail_node_short),
        (2, fail_node_transient),
      ]))),
]
# HBase pause/drop faults (collapsed-line defect fixed: the inline comments in
# the one-line form commented out everything after them, including `profile`).
rs_pause = faults.pause_daemons(["HRegionServer"], 62)
dn_pause = faults.pause_daemons(["DataNode"], 20)

# This fault isn't that useful yet, since it only drops inbound packets
# but outbound packets (eg, the ZK pings) keep going.
rs_drop_inbound_packets = faults.drop_packets_to_daemons(["HRegionServer"], 64)

# Every 45s pick one weighted fault; pauses are the most likely (weight 10).
profile = [
  triggers.Periodic(
    45,
    metafaults.pick_fault([
      # kill -9s
      (5, rs_kill_long),
      (1, dn_kill_long),
      # fast kill -9s
      (5, rs_kill_short),
      (1, dn_kill_short),
      # pauses (simulate GC?)
      (10, rs_pause),
      (1, dn_pause),
      # drop packets (simulate network outage)
      #(1, faults.drop_packets_to_daemons(["DataNode"], 20)),
      #(1, rs_drop_inbound_packets),
      ])),
#  triggers.WebServerTrigger(12321)
]
# Accumulo network-failure faults of three durations (collapsed-line defect
# fixed: the `# XXX` comments in the one-line form swallowed the code after them).
fail_node_long = faults.fail_network(bastion_host=bastion, seconds=300,
    restart_daemons=["Accumulo-All"], use_flush=True)

# XXX make sure this is greater than ZK heartbeats
fail_node_short = faults.fail_network(bastion_host=bastion, seconds=45,
    restart_daemons=["Accumulo-All"], use_flush=True)

# XXX make sure this is less than ZK heartbeats
fail_node_transient = faults.fail_network(bastion_host=bastion, seconds=10,
    restart_daemons=["Accumulo-All"], use_flush=True)

profile = [
  triggers.Periodic(
    # How often do you want a failure? for master nodes, you should probably
    # give enough time for recovery ~5-15 minutes
    60,
    metafaults.maybe_fault(
      # How likely do you want a failure? decreasing this will make failures
      # line up across nodes less often.
      0.33,
      metafaults.pick_fault([
        # You can change the weights here to see different kinds of flaky nodes
        (1, fail_node_long),
        (1, fail_node_short),
        (2, fail_node_transient),
      ]))),
]

########NEW FILE########
__FILENAME__ = hbase
#!/usr/bin/env python
#
# Licensed to Cloudera, Inc. under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. Cloudera, Inc. licenses this file
# to you under the Apache License, Version 2.0 (the
# Accumulo network-failure faults (collapsed-line defect fixed: restored the
# multi-line form so the `# XXX` notes no longer comment out live code).
fail_node_long = faults.fail_network(bastion_host=bastion, seconds=300,
    restart_daemons=["Accumulo-All"], use_flush=True)

# XXX make sure this is greater than ZK heartbeats
fail_node_short = faults.fail_network(bastion_host=bastion, seconds=45,
    restart_daemons=["Accumulo-All"], use_flush=True)

# XXX make sure this is less than ZK heartbeats
fail_node_transient = faults.fail_network(bastion_host=bastion, seconds=10,
    restart_daemons=["Accumulo-All"], use_flush=True)

profile = [
  triggers.Periodic(
    # How often do you want a failure? for master nodes, you should probably
    # give enough time for recovery ~5-15 minutes
    60,
    metafaults.maybe_fault(
      # How likely do you want a failure? decreasing this will make failures
      # line up across nodes less often.
      0.33,
      metafaults.pick_fault([
        # You can change the weights here to see different kinds of flaky nodes
        (1, fail_node_long),
        (1, fail_node_short),
        (2, fail_node_transient),
      ]))),
]
from gremlins import faults, metafaults, triggers, tc

# Simple network-chaos profile (collapsed-line defect fixed: the first inline
# comment in the one-line form commented out the entire `profile` list).
clear_network_faults = faults.clear_network_faults()
introduce_partition = faults.introduce_network_partition()
introduce_latency = faults.introduce_network_latency()

# Seconds between periodic fault picks.
INTERVAL=30

profile = [
  # clear any existing configurations
  triggers.OneShot(clear_network_faults),
  # every 5 seconds, either clear faults, introduce a latency or a partition
  # other faults are available, but let's start-simply
  triggers.Periodic(
    INTERVAL, metafaults.pick_fault([
      (10, clear_network_faults),
      (10, introduce_partition),
    ])),
]
# Network-chaos profile targeting an `nc` test server on port 4242
# (collapsed-line defect fixed: the commented-out Periodic trigger in the
# one-line form swallowed the live trigger that followed it).
clear_network_faults = faults.clear_network_faults()
introduce_packet_loss = faults.introduce_network_packet_loss()
introduce_partition = faults.introduce_network_partition()
introduce_latency = faults.introduce_network_latency()
introduce_packet_reordering = faults.introduce_packet_reordering()

# Regex matching the netcat server process command line.
server_cmd = "nc.*4242"
nc_kill = faults.kill_processes([server_cmd], signal.SIGKILL)
nc_pause = faults.pause_processes([server_cmd], 5)

profile = [
  triggers.OneShot(clear_network_faults),
#  triggers.Periodic(
#    10, metafaults.pick_fault([
#      # kill -9s
#      # (5, nc_kill),
#      # pauses (simulate GC)
#      (10, nc_pause),
#    ])),
  triggers.Periodic(
    10, metafaults.pick_fault([
      (10, clear_network_faults),
#      (10, introduce_packet_loss),
      (10, introduce_partition),
      (10, introduce_latency),
#      (10, introduce_packet_reordering),
    ])),
#  triggers.WebServerTrigger(12321)
]