Example #1
def main():
    log_directory = read_conf_file("cluster.conf", "cluster", "log_directory")
    log_directory = os.path.expanduser(log_directory)
    log_directory = os.path.join(log_directory, "networkbench")

    parser = argparse.ArgumentParser(
        description="Harness for network benchmark application")
    parser.add_argument(
        "--config", "-c", help="config file to use for the benchmark "
        "(default: %(default)s)",
        default=os.path.join(BENCHMARK_DIR, "config.yaml"), type=str)
    parser.add_argument(
        "--log_directory", "-l",
        help="directory containing logs for an experiment "
        "(default: %(default)s)",
        default=log_directory)
    parser.add_argument(
        "--profiler", help="path to the binary of a profiling tool to use, for "
        "example valgrind or operf")
    parser.add_argument(
        "--profiler_options", help="options surrounded by quotes to pass to "
        "the profiler", type=str, default="")
    parser.add_argument(
        "--iterations", "-i", help="run the benchmark this many times "
        "(default: %(default)s)", type=int, default=1)
    parser.add_argument(
        "--sleep", "-s", help="sleep this many seconds between iterations "
        "(default: %(default)s)", type=int, default=0)
    parser.add_argument(
        "--per_peer_config", help="use separate config files for each peer, by "
        "appending the peer's IP address to the config file name: .A.B.C.D",
        action="store_true", default=False)
    parser.add_argument(
        "--dump_core_directory", "-d", help="dump core file to this directory "
        "if the benchmark crashes", default=None)
    parser.add_argument(
        "peer_ips", help="comma delimited list of host IPs to use for "
        "benchmarking")
    parser.add_argument(
        "--remote_connections_only", "-r", help="Only send to remote peers, "
        "instead of sending all-to-all, which includes localhost",
        action="store_true", default=False)

    utils.add_interfaces_params(parser)

    args = parser.parse_args()
    binary = os.path.join(BENCHMARK_DIR, "networkbench")
    delete_output = False
    solo_mode = False
    stage_stats = "sender,receiver"

    params = "-REMOTE_CONNECTIONS_ONLY %d" % (args.remote_connections_only)

    run_benchmark_iterations(
        binary, args.log_directory, args.config, args.peer_ips, args.profiler,
        args.profiler_options, args.iterations, args.sleep, delete_output,
        args.per_peer_config, args.dump_core_directory, solo_mode,
        stage_stats, args.interfaces, params)
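
A side note on the params string above: --remote_connections_only is a store_true flag, and because bool is an int subclass in Python, %d renders the flag as 0 or 1. A minimal illustration:

print("-REMOTE_CONNECTIONS_ONLY %d" % True)   # -> -REMOTE_CONNECTIONS_ONLY 1
print("-REMOTE_CONNECTIONS_ONLY %d" % False)  # -> -REMOTE_CONNECTIONS_ONLY 0
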
Example #2
def main():
    parser = argparse.ArgumentParser(description="Run a benchmark application on a collection of nodes.")
    parser.add_argument("binary", help="benchmark application binary")
    parser.add_argument("log_directory", help="directory containing logs for an experiment")
    parser.add_argument("config", help="config file to use for the benchmark.")
    parser.add_argument("peer_ips", help="comma delimited list of host IPs to use for " "benchmarking")
    parser.add_argument(
        "--profiler", help="path to the binary of a profiling tool to use, for " "example valgrind or operf"
    )
    parser.add_argument(
        "--profiler_options", help="options surrounded by quotes to pass to " "the profiler", type=str, default=""
    )
    parser.add_argument(
        "--iterations", "-i", help="run the benchmark this many times " "(default: %(default)s)", type=int, default=1
    )
    parser.add_argument(
        "--sleep",
        "-s",
        help="sleep this many seconds between iterations " "(default: %(default)s)",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--delete_output", help="delete output files after run completes", action="store_true", default=False
    )
    parser.add_argument(
        "--per_peer_config",
        help="use separate config files for each peer, by "
        "appending the peer's IP address to the config file name: .A.B.C.D",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--dump_core_directory", "-d", help="dump core file to this directory " "if the benchmark crashes", default=None
    )
    parser.add_argument(
        "--solo_mode",
        help="run the benchmark on all peers, but run each peer " "as if it's its own cluster of size 1.",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--stage_stats", help="comma delimited list of stage names to show " "runtime stats for upon completion"
    )
    parser.add_argument("--params", help="params that will override the config file", type=str, default="")

    add_interfaces_params(parser)

    args = parser.parse_args()
    run_benchmark_iterations(**vars(args))
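
This example forwards the parsed arguments with run_benchmark_iterations(**vars(args)): vars() turns the Namespace into a plain dict, so every argparse dest has to match a keyword parameter of run_benchmark_iterations. A minimal sketch of the pattern:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("binary")
parser.add_argument("--iterations", "-i", type=int, default=1)
args = parser.parse_args(["./networkbench", "-i", "3"])
print(vars(args))  # {'binary': './networkbench', 'iterations': 3} (key order may vary)
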
Example #3
def main():
    parser = argparse.ArgumentParser(
        description="Run a benchmark application on a collection of nodes.")
    parser.add_argument(
        "binary", help="benchmark application binary")
    parser.add_argument(
        "log_directory", help="directory containing logs for an experiment")
    parser.add_argument(
        "config", help="config file to use for the benchmark.")
    parser.add_argument(
        "peer_ips", help="comma delimited list of host IPs to use for "
        "benchmarking")
    parser.add_argument(
        "--profiler", help="path to the binary of a profiling tool to use, for "
        "example valgrind or operf")
    parser.add_argument(
        "--profiler_options", help="options surrounded by quotes to pass to "
        "the profiler", type=str, default="")
    parser.add_argument(
        "--iterations", "-i", help="run the benchmark this many times "
        "(default: %(default)s)", type=int, default=1)
    parser.add_argument(
        "--sleep", "-s", help="sleep this many seconds between iterations "
        "(default: %(default)s)", type=int, default=0)
    parser.add_argument(
        "--delete_output", help="delete output files after run completes",
        action="store_true", default=False)
    parser.add_argument(
        "--per_peer_config", help="use separate config files for each peer, by "
        "appending the peer's IP address to the config file name: .A.B.C.D",
        action="store_true", default=False)
    parser.add_argument(
        "--dump_core_directory", "-d", help="dump core file to this directory "
        "if the benchmark crashes", default=None)
    parser.add_argument(
        "--solo_mode", help="run the benchmark on all peers, but run each peer "
        "as if it's its own cluster of size 1.",
        action="store_true", default=False)
    parser.add_argument(
        "--stage_stats", help="comma delimited list of stage names to show "
        "runtime stats for upon completion")
    parser.add_argument(
        "--params", help="params that will override the config file",
        type=str, default="")

    add_interfaces_params(parser)

    args = parser.parse_args()
    run_benchmark_iterations(**vars(args))
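
add_interfaces_params is a project helper that is not shown in these examples; Example #1 reads args.interfaces after calling it, which implies it registers at least an interfaces dest. A hypothetical stand-in, for illustration only:

def add_interfaces_params(parser):
    # Hypothetical sketch -- the real helper lives in utils and may take
    # different options; Example #1 only implies an "interfaces" dest exists.
    parser.add_argument(
        "--interfaces", help="comma delimited list of network interfaces "
        "to use", default=None)
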
Example #4
def main():
    # Load cluster.conf
    parser = ConfigParser.SafeConfigParser()
    parser.read(CLUSTER_CONF)

    # Get default log directory
    log_directory = parser.get("cluster", "log_directory")

    parser = argparse.ArgumentParser(
        description="coordinates the execution of Themis jobs")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config", help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("--log_directory", "-l",
                        help="the directory in which to store coordinator logs "
                        "(default: %(default)s)", default=log_directory)
    parser.add_argument("--keepalive_refresh", help="the length of time node "
                        "coordinators should wait between refreshing keepalive "
                        "information (default: %(default)s seconds)", type=int,
                        default=2)
    parser.add_argument("--keepalive_timeout", help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "a node coordinator before the cluster coordinator "
                        "considers that node to be dead (default: %(default)s "
                        "seconds)", type=int, default=10)
    parser.add_argument("--profiler", help="path to the binary of a profiling"
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options", help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload", help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.config = os.path.abspath(args.config)

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    job_status_gui = None
    job_status_gui_out_fp = None

    resource_monitor_gui = None
    resource_monitor_gui_out_fp = None

    coordinator = None

    try:
        # To make the status GUI port distinct for each user but deterministic
        # for a single user, use 2000 + (the md5 hash of the user's username
        # mod 1000) as the web GUI's port number
        username_md5sum = hashlib.md5()
        username_md5sum.update(getpass.getuser())

        job_status_gui_port = (
            (int(username_md5sum.hexdigest(), 16) % 1000 + 2000) / 10) * 10
        resource_monitor_gui_port = (
            (int(username_md5sum.hexdigest(), 16) % 1000 + 3200) / 10) * 10

        print ""

        # Start the resource monitor web GUI
        resource_monitor_gui, resource_monitor_gui_out_fp = \
            start_resource_monitor_gui(args, resource_monitor_gui_port)

        # Start the job status web GUI
        job_status_gui, job_status_gui_out_fp = start_job_status_gui(
            args, job_status_gui_port)

        print ""

        coordinator = ClusterCoordinator(**vars(args))
        coordinator.run()
    finally:
        if job_status_gui is not None:
            log.info("Stopping job status GUI (PID %d)" % (job_status_gui.pid))
            os.killpg(job_status_gui.pid, signal.SIGTERM)
            job_status_gui.wait()

        if job_status_gui_out_fp is not None:
            job_status_gui_out_fp.flush()
            job_status_gui_out_fp.close()

        if resource_monitor_gui is not None:
            log.info("Stopping resource monitor GUI (PID %d)" % (
                    resource_monitor_gui.pid))
            os.killpg(resource_monitor_gui.pid, signal.SIGTERM)
            resource_monitor_gui.wait()

        if resource_monitor_gui_out_fp is not None:
            resource_monitor_gui_out_fp.flush()
            resource_monitor_gui_out_fp.close()

        if coordinator is not None:
            log.info("Stopping node coordinators")
            coordinator.stop_node_coordinators()
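
The GUI port arithmetic in the try block leans on Python 2 integer division: with r = int(md5(username)) % 1000, the expression ((r + 2000) / 10) * 10 rounds r + 2000 down to a multiple of 10, so the job status GUI always gets a port in [2000, 2990] and the resource monitor GUI one in [3200, 4190]. A worked sketch (// keeps the arithmetic the same on Python 3):

import hashlib

r = int(hashlib.md5(b"alice").hexdigest(), 16) % 1000
job_status_gui_port = ((r + 2000) // 10) * 10        # multiple of 10 in [2000, 2990]
resource_monitor_gui_port = ((r + 3200) // 10) * 10  # multiple of 10 in [3200, 4190]
print("ports: %d %d" % (job_status_gui_port, resource_monitor_gui_port))  # deterministic per username
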
Example #5
def main():
    # Load cluster.conf
    parser = ConfigParser.SafeConfigParser()
    parser.read(CLUSTER_CONF)

    # Get default log directory
    log_directory = parser.get("cluster", "log_directory")

    parser = argparse.ArgumentParser(
        description="coordinates the execution of Themis jobs")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config",
                        help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument(
        "--log_directory",
        "-l",
        help="the directory in which to store coordinator logs "
        "(default: %(default)s)",
        default=log_directory)
    parser.add_argument(
        "--keepalive_refresh",
        help="the length of time node "
        "coordinators should wait between refreshing keepalive "
        "information (default: %(default)s seconds)",
        type=int,
        default=2)
    parser.add_argument("--keepalive_timeout",
                        help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "a node coordinator before the cluster coordinator "
                        "considers that node to be dead (default: %(default)s "
                        "seconds)",
                        type=int,
                        default=10)
    parser.add_argument("--profiler",
                        help="path to the binary of a profiling"
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options",
                        help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload",
                        help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.config = os.path.abspath(args.config)

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    job_status_gui = None
    job_status_gui_out_fp = None

    resource_monitor_gui = None
    resource_monitor_gui_out_fp = None

    coordinator = None

    try:
        # To make the status GUI port distinct for each user but deterministic
        # for a single user, use 2000 + (the md5 hash of the user's username
        # mod 1000) as the web GUI's port number
        username_md5sum = hashlib.md5()
        username_md5sum.update(getpass.getuser())

        job_status_gui_port = (
            (int(username_md5sum.hexdigest(), 16) % 1000 + 2000) / 10) * 10
        resource_monitor_gui_port = (
            (int(username_md5sum.hexdigest(), 16) % 1000 + 3200) / 10) * 10

        print ""

        # Start the resource monitor web GUI
        resource_monitor_gui, resource_monitor_gui_out_fp = \
            start_resource_monitor_gui(args, resource_monitor_gui_port)

        # Start the job status web GUI
        job_status_gui, job_status_gui_out_fp = start_job_status_gui(
            args, job_status_gui_port)

        print ""

        coordinator = ClusterCoordinator(**vars(args))
        coordinator.run()
    finally:
        if job_status_gui is not None:
            log.info("Stopping job status GUI (PID %d)" % (job_status_gui.pid))
            os.killpg(job_status_gui.pid, signal.SIGTERM)
            job_status_gui.wait()

        if job_status_gui_out_fp is not None:
            job_status_gui_out_fp.flush()
            job_status_gui_out_fp.close()

        if resource_monitor_gui is not None:
            log.info("Stopping resource monitor GUI (PID %d)" %
                     (resource_monitor_gui.pid))
            os.killpg(resource_monitor_gui.pid, signal.SIGTERM)
            resource_monitor_gui.wait()

        if resource_monitor_gui_out_fp is not None:
            resource_monitor_gui_out_fp.flush()
            resource_monitor_gui_out_fp.close()

        if coordinator is not None:
            log.info("Stopping node coordinators")
            coordinator.stop_node_coordinators()
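
The cleanup in the finally block calls os.killpg(gui.pid, ...), which assumes each GUI child was started as the leader of its own process group (pgid == pid). The start_job_status_gui and start_resource_monitor_gui helpers are not shown; a sketch of how such a child would typically be launched so that killpg works:

import os
import signal
import subprocess

# preexec_fn=os.setsid puts the child in a fresh session/process group whose
# id equals the child's pid, so killpg(child.pid, ...) signals the whole group.
gui = subprocess.Popen(["sleep", "60"], preexec_fn=os.setsid)
os.killpg(gui.pid, signal.SIGTERM)
gui.wait()
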
Example #6
def main():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config",
                        help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("log_directory",
                        help="the base log directory where "
                        "the job runner stores its logs")
    parser.add_argument("batch_nonce",
                        help="the nonce for all batches "
                        "executed by this node coordinator",
                        type=int)
    parser.add_argument(
        "--keepalive_refresh",
        help="the interval, in seconds, "
        "between refreshes of the key that this node "
        "coordinator uses to tell the cluster coordinator that "
        "it's still alive",
        type=int)
    parser.add_argument("--keepalive_timeout",
                        help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "this node coordinator before the cluster coordinator "
                        "considers it to be dead (default: %(default)s "
                        "seconds)",
                        type=int,
                        default=10)
    parser.add_argument("--profiler",
                        help="path to the binary of a profiling"
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options",
                        help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload",
                        help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    node_coordinator_log = os.path.join(args.log_directory,
                                        "node_coordinators",
                                        "%s.log" % (socket.getfqdn()))

    utils.backup_if_exists(node_coordinator_log)

    logging.basicConfig(
        format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
        datefmt="%m-%d %H:%M:%S",
        filename=node_coordinator_log)

    coordinator = None

    def signal_handler(signal_id, frame):
        log.error("Caught signal %s" % (str(signal_id)))
        os.killpg(0, signal.SIGKILL)

        sys.exit(1)

    signal.signal(signal.SIGUSR1, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        coordinator = NodeCoordinator(**vars(args))
        coordinator.run()
    except:
        # Log and print the exception that was just caught
        exception_info = sys.exc_info()

        exception = exception_info[1]

        log.exception(exception)

        traceback.print_exception(*exception_info)

        if (not isinstance(exception, SystemExit)) and coordinator is not None:
            log.error("Marking current batch as failed")
            coordinator.fail_current_batch("Node coordinator error: " +
                                           str(exception_info[1]))

    finally:
        if coordinator is not None:
            coordinator.stop_keepalive()
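
The isinstance(exception, SystemExit) check matters because a bare except: clause also catches SystemExit (it derives from BaseException), for instance the sys.exit(1) raised by the signal handler. A minimal demonstration:

import sys

try:
    sys.exit(1)
except:
    caught = sys.exc_info()[1]
    print(isinstance(caught, SystemExit))  # True
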
Example #7
def main():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config", help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("log_directory", help="the base log directory where "
                        "the job runner stores its logs")
    parser.add_argument("batch_nonce", help="the nonce for all batches "
                        "executed by this node coordinator", type=int)
    parser.add_argument("--keepalive_refresh", help="the interval, in seconds, "
                        "between refreshes of the key that this node "
                        "coordinator uses to tell the cluster coordinator that "
                        "it's still alive", type=int)
    parser.add_argument("--keepalive_timeout", help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "this node coordinator before the cluster coordinator "
                        "considers it to be dead (default: %(default)s "
                        "seconds)", type=int, default=10)
    parser.add_argument("--profiler", help="path to the binary of a profiling"
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options", help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload", help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    node_coordinator_log = os.path.join(
        args.log_directory, "node_coordinators",
        "%s.log" % (socket.getfqdn()))

    utils.backup_if_exists(node_coordinator_log)

    logging.basicConfig(
        format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
        datefmt="%m-%d %H:%M:%S",
        filename=node_coordinator_log)

    coordinator = None

    def signal_handler(signal_id, frame):
        log.error("Caught signal %s" % (str(signal_id)))
        os.killpg(0, signal.SIGKILL)

        sys.exit(1)

    signal.signal(signal.SIGUSR1, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        coordinator = NodeCoordinator(**vars(args))
        coordinator.run()
    except:
        # Log and print the exception that was just caught
        exception_info = sys.exc_info()

        exception = exception_info[1]

        log.exception(exception)

        traceback.print_exception(*exception_info)

        if (not isinstance(exception, SystemExit)) and coordinator is not None:
            log.error("Marking current batch as failed")
            coordinator.fail_current_batch(
                "Node coordinator error: " + str(exception_info[1]))

    finally:
        if coordinator is not None:
            coordinator.stop_keepalive()
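
For reference, logging.basicConfig with filename= sends records to the file rather than stderr. A standalone sketch of the format used above (the log path here is a throwaway stand-in):

import logging

logging.basicConfig(
    format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
    datefmt="%m-%d %H:%M:%S", filename="/tmp/node_coordinator_demo.log")
# Produces lines like: "ERROR    05-12 14:03:07 coordinator     demo message"
logging.getLogger("coordinator").error("demo message")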