예제 #1
0
def supervisor():
    """
    Start ``pyfarm-agent`` and keep restarting it whenever it stops.

    Command line arguments that appear before a literal ``--`` configure
    the supervisor itself.  Everything after the first ``--`` is forwarded
    unchanged to every ``pyfarm-agent ... start`` / ``pyfarm-agent ... stop``
    invocation made by the supervisor.

    After optionally daemonizing, the supervisor writes its process id to
    ``config["supervisor_lock_file"]``, installs signal handlers and then
    polls ``pyfarm-agent status`` every ``config["supervisor_interval"]``
    seconds.  When the agent is not running and the run control file
    exists, a pending update archive (``pyfarm-agent.zip`` inside
    ``config["agent_updates_dir"]``) is unpacked into
    ``--agent-package-dir`` and the agent is started again.  When the run
    control file does not exist the agent is left stopped.

    :returns:
        ``1`` if the pid file could not be written, or the value returned
        by ``start_daemon_posix`` when that value is an integer.  Otherwise
        the function loops until the process exits: ``sys.exit(1)`` when the
        agent cannot be started, ``sys.exit(0)`` from the SIGTERM/SIGINT
        handler.
    """
    logger.debug("Supervisor called with: %r", sys.argv)

    # Split the command line on the first "--".  Later occurrences of "--"
    # are part of the agent's own arguments and are forwarded as-is.
    cli_args = sys.argv[1:]
    if "--" in cli_args:
        separator = cli_args.index("--")
        supervisor_args = cli_args[:separator]
        agent_args = cli_args[separator + 1:]
    else:
        supervisor_args = cli_args
        agent_args = []

    logger.debug("supervisor_args: %s", supervisor_args)

    parser = AgentArgumentParser(
        description="Start and monitor the agent process")
    parser.add_argument("--updates-drop-dir",
                        config="agent_updates_dir",
                        type=isdir, type_kwargs=dict(create=True),
                        help="Where to look for agent updates")
    parser.add_argument("--agent-package-dir",
                        type=isdir, type_kwargs=dict(create=True),
                        help="Path to the actual agent code")
    parser.add_argument("--pidfile", config="supervisor_lock_file",
                        help="The file to store the process id in. "
                             "[default: %(default)s]")
    parser.add_argument("-n", "--no-daemon", default=False, action="store_true",
                        config=False,
                        help="If provided then do not run the process in the "
                             "background.")
    parser.add_argument("--chdir", config="agent_chdir", type=isdir,
                        help="The directory to chdir to upon launch.")
    parser.add_argument("--uid", type=int,
                        help="The user id to run the supervisor as.  "
                             "*This setting is ignored on Windows.*")
    parser.add_argument("--gid", type=int,
                        help="The group id to run the supervisor as.  "
                             "*This setting is ignored on Windows.*")
    args = parser.parse_args(supervisor_args)

    if args.no_daemon:
        logger.debug("Not forking to background")
    elif fork is NotImplemented:
        logger.warning(
            "`fork` is not implemented on %s, starting in "
            "foreground", OS.title())
    else:
        logger.info("sending supervisor log output to %s",
                    config["supervisor_log"])
        # This parser defines no ``--log`` flag, so ``args.log`` does not
        # exist.  Pass the same configured path that was just reported in
        # the log message above.
        daemon_start_return_code = start_daemon_posix(
            config["supervisor_log"], args.chdir, args.uid, args.gid)

        # An integer result is handed straight back to the caller.
        # NOTE(review): presumably this is the parent side of the fork;
        # confirm against start_daemon_posix.
        if isinstance(daemon_start_return_code, INTEGER_TYPES):
            return daemon_start_return_code

    pid = os.getpid()
    # Write the PID file
    try:
        with open(config["supervisor_lock_file"], "w") as pidfile:
            pidfile.write(str(pid))
    # On Python 2 ``open`` raises IOError, which is not an OSError subclass
    # there.  On Python 3 the two names refer to the same class.
    except (OSError, IOError) as e:
        logger.error(
            "Failed to write PID file %s: %s",
            config["supervisor_lock_file"], e)
        return 1
    else:
        logger.debug("Wrote PID to %s", config["supervisor_lock_file"])

    logger.info("supervisor pid: %s", pid)

    if getuid is not NotImplemented:
        logger.info("uid: %s", getuid())

    if getgid is not NotImplemented:
        logger.info("gid: %s", getgid())

    stop_command = ["pyfarm-agent"] + agent_args + ["stop"]
    start_command = ["pyfarm-agent"] + agent_args + ["start"]

    def terminate_handler(*_):
        # Stop the agent, then take the supervisor down with it.
        subprocess.call(stop_command)
        sys.exit(0)

    def restart_handler(*_):
        # Only stop the agent here; the polling loop below notices that it
        # is no longer running and starts it again.
        subprocess.call(stop_command)

    def apply_pending_update():
        # Replace the agent package with the contents of the update
        # archive, if a valid one has been dropped.  Failures are logged
        # and otherwise ignored so the agent is still started afterwards.
        if not (os.path.isfile(update_file_path) and
                zipfile.is_zipfile(update_file_path)):
            return

        logger.info("Found an upgrade to pyfarm-agent")
        try:
            remove_directory(args.agent_package_dir, raise_=True)
            os.makedirs(args.agent_package_dir)
            with zipfile.ZipFile(update_file_path, "r") as archive:
                archive.extractall(args.agent_package_dir)

            remove_file(
                update_file_path, retry_on_exit=True, raise_=False)
        except Exception as e:
            logger.error(
                "Caught exception trying to update agent: %r", e)

    logger.debug("Setting signal handlers")

    signal.signal(signal.SIGTERM, terminate_handler)
    signal.signal(signal.SIGINT, terminate_handler)
    # SIGHUP is not defined by the ``signal`` module on Windows.
    if hasattr(signal, "SIGHUP"):
        signal.signal(signal.SIGHUP, restart_handler)

    update_file_path = join(config["agent_updates_dir"], "pyfarm-agent.zip")
    run_control_file = config["run_control_file_by_platform"]\
        [operating_system()]
    loop_interval = config["supervisor_interval"]

    while True:
        if subprocess.call(["pyfarm-agent", "status"]) != 0:
            if not isfile(run_control_file):
                # Honor the message: without the run control file the agent
                # is left stopped.  Keep polling so the agent is supervised
                # again once the file reappears.
                logger.info("pyfarm_agent is not running, but run control file "
                            "%s does not exist. Not restarting the agent",
                            run_control_file)
            else:
                logger.info("pyfarm-agent is not running")
                apply_pending_update()

                logger.info("starting pyfarm-agent now")
                if subprocess.call(start_command) != 0:
                    logger.error("Could not start pyfarm-agent")
                    sys.exit(1)

        time.sleep(loop_interval)
예제 #2
0
    def __init__(self):
        self.args = None
        self.parser = AgentArgumentParser(
            usage="%(prog)s [status|start|stop]",
            epilog="%(prog)s is a command line client for working with a "
                   "local agent.  You can use it to stop, start, and report "
                   "the general status of a running agent process.")

        # main subparser for start/stop/status/etc
        subparsers = self.parser.add_subparsers(
            help="individual operations %(prog)s can run")
        start = subparsers.add_parser(
            "start", help="starts the agent")
        stop = subparsers.add_parser(
            "stop", help="stops the agent")
        status = subparsers.add_parser(
            "status", help="query the 'running' state of the agent")

        # relate a name and function to each subparser
        start.set_defaults(target_name="start", target_func=self.start)
        stop.set_defaults(target_name="stop", target_func=self.stop)
        status.set_defaults(target_name="status", target_func=self.status)

        # command line flags which configure the agent's network service
        global_network = self.parser.add_argument_group(
            "Agent Network Service",
            description="Main flags which control the network services running "
                        "on the agent.")
        global_network.add_argument(
            "--port", config="agent_api_port", type=port,
            type_kwargs=dict(get_uid=lambda: self.args.uid == 0),
            help="The port number which the agent is either running on or "
                 "will run on when started.  This port is also reported the "
                 "master when an agent starts. [default: %(default)s]")
        global_network.add_argument(
            "--host", config="agent_hostname",
            help="The host to communicate with or hostname to present to the "
                 "master when starting.  Defaults to the fully qualified "
                 "hostname.")
        global_network.add_argument(
            "--agent-api-username", default="agent", config=False,
            help="The username required to access or manipulate the agent "
                 "using REST. [default: %(default)s]")
        global_network.add_argument(
            "--agent-api-password", default="agent", config=False,
            help="The password required to access manipulate the agent "
                 "using REST. [default: %(default)s]")
        global_network.add_argument(
            "--agent-id", config="agent_id", type=uuid_type,
            default=None,
            help="The UUID used to identify this agent to the master.  By "
                 "default the agent will attempt to load a cached value "
                 "however a specific UUID could be provided with this flag.")
        global_network.add_argument(
            "--agent-id-file", config="agent_id_file",
            default=expanduser(expandvars(
                config["agent_id_file_platform_defaults"][operating_system()])),
            help="The location to store the agent's id.  By default the path "
                 "is platform specific and defined by the "
                 "`agent_id_file_platform_defaults` key in the configuration.  "
                 "[default: %(default)s]")

        # command line flags for the connecting the master apis
        global_apis = self.parser.add_argument_group(
            "Network Resources",
            description="Resources which the agent will be communicating with.")
        global_apis.add_argument(
            "--master", config="master",
            help="This is a convenience flag which will allow you to set the "
                 "hostname for the master.  By default this value will be "
                 "substituted in --master-api")
        global_apis.add_argument(
            "--master-api", config="master_api",
            help="The location where the master's REST api is located. "
                 "[default: %(default)s]")
        global_apis.add_argument(
            "--master-api-version", config="master_api_version",
            help="Sets the version of the master's REST api the agent should"
                 "use [default: %(default)s]")

        # global command line flags which apply to top level
        # process control
        global_process = self.parser.add_argument_group(
            "Process Control",
            description="These settings apply to the parent process of the "
                        "agent and contribute to allowing the process to run "
                        "as other users or remain isolated in an environment. "
                        "They also assist in maintaining the 'running state' "
                        "via a process id file.")
        global_process.add_argument(
            "--pidfile", config="agent_lock_file",
            help="The file to store the process id in. [default: %(default)s]")
        global_process.add_argument(
            "-n", "--no-daemon", default=False, action="store_true",
            config=False,
            help="If provided then do not run the process in the background.")
        global_process.add_argument(
            "--chdir", config="agent_chdir", type=isdir,
            help="The working directory to change the agent into upon launch")
        global_process.add_argument(
            "--uid", type=uidgid, config=False,
            type_kwargs=dict(get_id=getuid, check_id=getpwuid, set_id=setuid),
            help="The user id to run the agent as.  *This setting is "
                 "ignored on Windows.*")
        global_process.add_argument(
            "--gid", type=uidgid, config=False,
            type_kwargs=dict(get_id=getgid, check_id=getgrgid, set_id=setgid),
            help="The group id to run the agent as.  *This setting is "
                 "ignored on Windows.*")
        global_process.add_argument(
            "--pdb-on-unhandled", action="store_true",
            help="When set pdb.set_trace() will be called if an unhandled "
                 "error is caught in the logger")

        # start general group
        start_general_group = start.add_argument_group(
            "General Configuration",
            description="These flags configure parts of the agent related to "
                        "hardware, state, and certain timing and scheduling "
                        "attributes.")
        start_general_group.add_argument(
            "--state", default=AgentState.ONLINE, config=False,
            type=enum, type_kwargs=dict(enum=AgentState),
            help="The current agent state, valid values are "
                 "" + str(list(AgentState)) + ". [default: %(default)s]")
        start_general_group.add_argument(
            "--time-offset", config="agent_time_offset",
            type=int, type_kwargs=dict(min_=0),
            help="If provided then don't talk to the NTP server at all to "
                 "calculate the time offset.  If you know for a fact that this "
                 "host's time is always up to date then setting this to 0 is "
                 "probably a safe bet.")
        start_general_group.add_argument(
            "--ntp-server", config="agent_ntp_server",
            help="The default network time server this agent should query to "
                 "retrieve the real time.  This will be used to help determine "
                 "the agent's clock skew if any.  Setting this value to '' "
                 "will effectively disable this query. [default: %(default)s]")
        start_general_group.add_argument(
            "--ntp-server-version", config="agent_ntp_server_version",
            type=int,
            help="The version of the NTP server in case it's running an older"
                 "or newer version. [default: %(default)s]")
        start_general_group.add_argument(
            "--no-pretty-json", config="agent_pretty_json",
            action="store_false",
            help="If provided do not dump human readable json via the agent's "
                 "REST api")
        start_general_group.add_argument(
            "--shutdown-timeout", config="agent_shutdown_timeout",
            type=int, type_kwargs=dict(min_=0),
            help="How many seconds the agent should spend attempting to inform "
                 "the master that it's shutting down.")
        start_general_group.add_argument(
            "--updates-drop-dir", config="agent_updates_dir",
            help="The directory to drop downloaded updates in. This should be "
            "the same directory pyfarm-supervisor will look for updates in. "
            "[default: %(default)s]")
        start_general_group.add_argument(
            "--run-control-file", config="run_control_file",
            default=expanduser(expandvars(
                config["run_control_file_by_platform"][operating_system()])),
            help="The path to a file that will signal to the supervisor that "
                 "agent is supposed to be restarted if it stops for whatever "
                 "reason."
                 "[default: %(default)s]")
        start_general_group.add_argument(
            "--farm-name", config="farm_name",
            default=None,
            help="The name of the farm the agent should join.  If unset, the "
                 "agent will join any farm.")

        # start hardware group
        start_hardware_group = start.add_argument_group(
            "Physical Hardware",
            description="Command line flags which describe the hardware of "
                        "the agent.")
        start_hardware_group.add_argument(
            "--cpus", default=cpu.total_cpus(),
            config="agent_cpus", type=int,
            help="The total amount of cpus installed on the "
                 "system.  Defaults to the number of cpus installed "
                 "on the system.")
        start_hardware_group.add_argument(
            "--ram", default=memory.total_ram(),
            config="agent_ram", type=int,
            help="The total amount of ram installed on the system in "
                 "megabytes.  Defaults to the amount of ram the "
                 "system has installed.")

        # start interval controls
        start_interval_group = start.add_argument_group(
            "Interval Controls",
            description="Controls which dictate when certain internal "
                        "intervals should occur.")
        start_interval_group.add_argument(
            "--ram-check-interval",
            config="agent_ram_check_interval", type=int,
            help="How often ram resources should be checked for changes. "
                 "The amount of memory currently being consumed on the system "
                 "is checked after certain events occur such as a process but "
                 "this flag specifically controls how often we should check "
                 "when no such events are occurring. [default: %(default)s]")
        start_interval_group.add_argument(
            "--ram-max-report-frequency",
            config="agent_ram_max_report_frequency", type=int,
            help="This is a limiter that prevents the agent from reporting "
                 "memory changes to the master more often than a specific "
                 "time interval.  This is done in order to ensure that when "
                 "100s of events fire in a short period of time cause changes "
                 "in ram usage only one or two will be reported to the "
                 "master. [default: %(default)s]")
        start_interval_group.add_argument(
            "--ram-report-delta", config="agent_ram_report_delta", type=int,
            help="Only report a change in ram if the value has changed "
                 "at least this many megabytes. [default: %(default)s]")
        start_interval_group.add_argument(
            "--master-reannounce", config="agent_master_reannounce", type=int,
            help="Controls how often the agent should reannounce itself "
                 "to the master.  The agent may be in contact with the master "
                 "more often than this however during long period of "
                 "inactivity this is how often the agent will 'inform' the "
                 "master the agent is still online.")

        # start logging options
        logging_group = start.add_argument_group(
            "Logging Options",
            description="Settings which control logging of the agent's parent "
                        "process and/or any subprocess it runs.")
        logging_group.add_argument(
            "--log", config="agent_log",
            help="If provided log all output from the agent to this path.  "
                 "This will append to any existing log data.  [default: "
                 "%(default)s]")
        logging_group.add_argument(
            "--capture-process-output", config="jobtype_capture_process_output",
            action="store_true",
            help="If provided then all log output from each process launched "
                 "by the agent will be sent through agent's loggers.")
        logging_group.add_argument(
            "--task-log-dir", config="jobtype_task_logs",
            type=isdir, type_kwargs=dict(create=True),
            help="The directory tasks should log to.")

        # network options for the agent when start is called
        start_network = start.add_argument_group(
            "Network Service",
            description="Controls how the agent is seen or interacted with "
                        "by external services such as the master.")
        start_network.add_argument(
            "--ip-remote", type=ip, config=False,
            help="The remote IPv4 address to report.  In situation where the "
                 "agent is behind a firewall this value will typically be "
                 "different.")

        start_manhole = start.add_argument_group(
            "Manhole Service",
            description="Controls the manhole service which allows a telnet "
                        "connection to be made directly into the agent as "
                        "it's running.")
        start_manhole.add_argument(
            "--enable-manhole", config="agent_manhole",
            action="store_true",
            help="When provided the manhole service will be started once the "
                 "reactor is running.")
        start_manhole.add_argument(
            "--manhole-port", config="agent_manhole_port", type=port,
            type_kwargs=dict(get_uid=lambda: self.args.uid == 0),
            help="The port the manhole service should run on if enabled.")
        start_manhole.add_argument(
            "--manhole-username", config="agent_manhole_username",
            help="The telnet username that's allowed to connect to the "
                 "manhole service running on the agent.")
        start_manhole.add_argument(
            "--manhole-password", config="agent_manhole_password",
            help="The telnet password to use when connecting to the "
                 "manhole service running on the agent.")

        # various options for how the agent will interact with the
        # master server
        start_http_group = start.add_argument_group(
            "HTTP Configuration",
            description="Options for how the agent will interact with the "
                        "master's REST api and how it should run it's own "
                        "REST api.")
        start_http_group.add_argument(
            "--html-templates-reload", config="agent_html_template_reload",
            action="store_true",
            help="If provided then force Jinja2, the html template system, "
                 "to check the file system for changes with every request. "
                 "This flag should not be used in production but is useful "
                 "for development and debugging purposes.")
        start_http_group.add_argument(
            "--static-files", config="agent_static_root", type=isdir,
            help="The default location where the agent's http server should "
                 "find static files to serve.")
        start_http_group.add_argument(
            "--http-retry-delay-offset",
            config="agent_http_retry_delay_offset", type=number,
            help="If a http request to the master has failed, wait at least "
                 "this amount of time before resending the request.")
        start_http_group.add_argument(
            "--http-retry-delay-factor",
            config="agent_http_retry_delay_factor", type=number,
            help="The value provided here is used in combination with "
                 "--http-retry-delay-offset to calculate the retry delay.  "
                 "This is used as a multiplier against random() before being "
                 "added to the offset.")

        jobtype_group = start.add_argument_group("Job Types")
        jobtype_group.add_argument(
            "--jobtype-no-cache", config="jobtype_enable_cache",
            action="store_true",
            help="If provided then do not cache job types, always directly "
                 "retrieve them.  This is beneficial if you're testing the "
                 "agent or a new job type class.")

        # options when stopping the agent
        stop_group = stop.add_argument_group(
            "optional flags",
            description="Flags that control how the agent is stopped")
        stop_group.add_argument(
            "--no-wait", default=False, action="store_true", config=False,
            help="If provided then don't wait on the agent to shut itself "
                 "down.  By default we would want to wait on each task to stop "
                 "so we can catch any errors and then finally wait on the "
                 "agent to shutdown too.  If you're in a hurry or stopping a "
                 "bunch of agents at once then setting this flag will let the "
                 "agent continue to stop itself without waiting for each agent")