def test_replace_uid_tag():
    """replace_uid_tag substitutes every "{UID}" occurrence with the command's uid."""
    # No tag: the command passes through unchanged.
    plain = "command without uid tag"
    assert_array_equal(smartdispatch.replace_uid_tag([plain]), [plain])

    # A single {UID} tag is replaced by the uid derived from the command text.
    single = "command with one {UID} tag"
    expected_uid = utils.generate_uid_from_string(single)
    assert_array_equal(smartdispatch.replace_uid_tag([single]),
                       [single.replace("{UID}", expected_uid)])

    # Multiple tags inside one command all receive the same uid.
    double = "command with two {UID} tag {UID}"
    expected_uid = utils.generate_uid_from_string(double)
    assert_array_equal(smartdispatch.replace_uid_tag([double]),
                       [double.replace("{UID}", expected_uid)])

    # Every command of a list is processed.
    repeated = ["a command with a {UID} tag"] * 10
    expected_uid = utils.generate_uid_from_string(repeated[0])
    assert_array_equal(smartdispatch.replace_uid_tag(repeated),
                       [repeated[0].replace("{UID}", expected_uid)] * len(repeated))
Exemplo n.º 2
0
def main():
    """Worker loop: pull pending commands from the manager and run them one at a time."""
    # Necessary if we want 'logging.info' to appear in stderr.
    logging.root.setLevel(logging.INFO)

    args = parse_arguments()
    command_manager = CommandManager(args.commands_filename)

    while True:
        command = command_manager.get_command_to_run()
        if command is None:
            break  # No pending command left.

        # Per-command log files, named after a uid derived from the command text
        # so a re-run command appends to the same files.
        uid = utils.generate_uid_from_string(command)
        stdout_filename = os.path.join(args.logs_dir, uid + ".out")
        stderr_filename = os.path.join(args.logs_dir, uid + ".err")

        # Get job and node ID
        job_id = os.environ.get('PBS_JOBID', 'undefined')
        node_name = os.environ.get('HOSTNAME', 'undefined')

        with open(stdout_filename, 'a') as stdout_file, open(stderr_filename, 'a') as stderr_file:
            # A non-empty stdout log means this command was started before.
            if stdout_file.tell() > 0:
                header_template = "\n## SMART-DISPATCH - Resumed on: %Y-%m-%d %H:%M:%S - In job: {job_id} - On nodes: {node_name} ##\n"
            else:
                header_template = "## SMART-DISPATCH - Started on: %Y-%m-%d %H:%M:%S - In job: {job_id} - On nodes: {node_name} ##\n"
            log_datetime = t.strftime(header_template.format(job_id=job_id, node_name=node_name))

            log_command = "## SMART-DISPATCH - Command: " + command + '\n'

            # Write the header to both logs before the subprocess takes over the streams.
            for log_file in (stdout_file, stderr_file):
                log_file.write(log_datetime + log_command)
                log_file.flush()

            error_code = subprocess.call(command,
                                         stdout=stdout_file,
                                         stderr=stderr_file,
                                         shell=True)

        command_manager.set_running_command_as_finished(command, error_code)
def replace_uid_tag(commands):
    """Return a copy of `commands` where each "{UID}" tag is replaced
    by a uid generated from that command's own text."""
    updated = []
    for command in commands:
        uid = utils.generate_uid_from_string(command)
        updated.append(command.replace("{UID}", uid))
    return updated
Exemplo n.º 4
0
def main():
    """Worker loop that runs pending commands; with --assumeResumable it hands a
    running command back to the pending pool when a TERM signal arrives."""
    # Necessary if we want 'logging.info' to appear in stderr.
    logging.root.setLevel(logging.INFO)

    args = parse_arguments()

    command_manager = CommandManager(args.commands_filename)

    if args.assumeResumable:
        # Handle TERM signal gracefully by sending running commands back to
        # the list of pending commands.
        # NOTE: There are several cases when the handler will not have
        #       up-to-date information on running the command and/or process,
        #       but chances of that happening are VERY slim and the
        #       consequences are not fatal.
        def sigterm_handler(signal, frame):
            # Run-once guard: ignore a second TERM received while cleaning up.
            if sigterm_handler.triggered:
                return
            else:
                sigterm_handler.triggered = True

            # Let the current subprocess finish before re-queueing its command.
            if sigterm_handler.proc is not None:
                sigterm_handler.proc.wait()
            if sigterm_handler.command is not None:
                command_manager.set_running_command_as_pending(
                    sigterm_handler.command)
            sys.exit(0)

        # State shared with the handler via function attributes.
        sigterm_handler.triggered = False
        sigterm_handler.command = None
        sigterm_handler.proc = None
        signal.signal(signal.SIGTERM, sigterm_handler)

    while True:
        command = command_manager.get_command_to_run()
        if args.assumeResumable:
            # Publish the command before it starts, so the handler can
            # re-queue it even if TERM lands before Popen below.
            sigterm_handler.proc = None
            sigterm_handler.command = command

        if command is None:
            break

        # Log file names derive from a uid of the command text, so a resumed
        # command appends to the same files.
        uid = utils.generate_uid_from_string(command)
        stdout_filename = os.path.join(args.logs_dir, uid + ".out")
        stderr_filename = os.path.join(args.logs_dir, uid + ".err")

        # Get job and node ID
        job_id = os.environ.get('PBS_JOBID', 'undefined')
        node_name = os.environ.get('HOSTNAME', 'undefined')

        with open(stdout_filename, 'a') as stdout_file:
            with open(stderr_filename, 'a') as stderr_file:
                log_datetime = t.strftime(
                    "## SMART-DISPATCH - Started on: %Y-%m-%d %H:%M:%S - In job: {job_id} - On nodes: {node_name} ##\n"
                    .format(job_id=job_id, node_name=node_name))
                if stdout_file.tell(
                ) > 0:  # Not the first line in the log file.
                    log_datetime = t.strftime(
                        "\n## SMART-DISPATCH - Resumed on: %Y-%m-%d %H:%M:%S - In job: {job_id} - On nodes: {node_name} ##\n"
                        .format(job_id=job_id, node_name=node_name))

                log_command = "## SMART-DISPATCH - Command: " + command + '\n'

                # Header goes to both logs before the subprocess owns the streams.
                stdout_file.write(log_datetime + log_command)
                stdout_file.flush()
                stderr_file.write(log_datetime + log_command)
                stderr_file.flush()

                proc = subprocess.Popen(command,
                                        stdout=stdout_file,
                                        stderr=stderr_file,
                                        shell=True)
                if args.assumeResumable:
                    # Expose the process so the TERM handler can wait on it.
                    sigterm_handler.proc = proc
                error_code = proc.wait()

        command_manager.set_running_command_as_finished(command, error_code)
Exemplo n.º 5
0
def replace_uid_tag(commands):
    """Replace every "{UID}" tag in each command with a uid generated from that command."""
    substitute = lambda cmd: cmd.replace("{UID}", utils.generate_uid_from_string(cmd))
    return list(map(substitute, commands))
Exemplo n.º 6
0
def test_generate_uid_from_string():
    """The uid is deterministic for identical input and sensitive to whitespace."""
    first = utils.generate_uid_from_string("same text")
    second = utils.generate_uid_from_string("same text")
    assert_equal(first, second)
    # Dropping the space changes the input, hence the uid must differ.
    assert_true(first != utils.generate_uid_from_string("sametext"))
Exemplo n.º 7
0
def main(argv=None):
    """Entry point for smart-dispatch: build, record and (optionally) launch PBS jobs.

    Two sub-modes:
      * launch -- read commands (from a file or the command line), unfold them
        and create a fresh batch.
      * resume -- re-submit the pending/running commands of an existing batch.
    """
    # Necessary if we want 'logging.info' to appear in stderr.
    logging.root.setLevel(logging.INFO)

    args = parse_arguments(argv)
    path_smartdispatch_logs = pjoin(os.getcwd(), LOGS_FOLDERNAME)

    # Check if RESUME or LAUNCH mode
    if args.mode == "launch":
        if args.commandsFile is not None:
            # Commands are listed in a file.
            jobname = smartdispatch.generate_logfolder_name(os.path.basename(args.commandsFile.name), max_length=235)
            commands = smartdispatch.get_commands_from_file(args.commandsFile)
        else:
            # Command that needs to be parsed and unfolded.
            command = " ".join(args.commandAndOptions)
            jobname = smartdispatch.generate_name_from_command(command, max_length=235)
            commands = smartdispatch.unfold_command(command)

        # Substitute {UID} tags so each command gets stable per-command names.
        commands = smartdispatch.replace_uid_tag(commands)
        nb_commands = len(commands)  # For print at the end

        # An explicit batch name overrides the auto-generated log folder name.
        if args.batchName:
            jobname = smartdispatch.generate_logfolder_name(utils.slugify(args.batchName), max_length=235)

    elif args.mode == "resume":
        jobname = args.batch_uid
        if os.path.isdir(jobname):
            # We assume `jobname` is `path_job` repo, we extract the real `jobname`.
            jobname = os.path.basename(os.path.abspath(jobname))

        if not os.path.isdir(pjoin(path_smartdispatch_logs, jobname)):
            raise LookupError("Batch UID ({0}) does not exist! Cannot resume.".format(jobname))
    else:
        raise ValueError("Unknown subcommand!")

    # Create (or reuse) the per-batch folder layout: job root, logs, commands.
    job_folders_paths = smartdispatch.get_job_folders(path_smartdispatch_logs, jobname)
    path_job, path_job_logs, path_job_commands = job_folders_paths

    # Keep a log of the command line in the job folder.
    command_line = " ".join(sys.argv)
    smartdispatch.log_command_line(path_job, command_line)

    command_manager = CommandManager(pjoin(path_job_commands, "commands.txt"))

    # If resume mode, reset running jobs
    if args.mode == "launch":
        command_manager.set_commands_to_run(commands)
    elif args.mode == "resume":
        # Verifying if there are failed commands
        failed_commands = command_manager.get_failed_commands()
        if len(failed_commands) > 0:
            FAILED_COMMAND_MESSAGE = dedent("""\
            {nb_failed} command(s) are in a failed state. They won't be resumed.
            Failed commands:
            {failed_commands}
            The actual errors can be found in the log folder under:
            {failed_commands_err_file}""")
            utils.print_boxed(FAILED_COMMAND_MESSAGE.format(
                nb_failed=len(failed_commands),
                failed_commands=''.join(failed_commands),
                failed_commands_err_file='\n'.join([utils.generate_uid_from_string(c[:-1]) + '.err' for c in failed_commands])
            ))

            if not utils.yes_no_prompt("Do you want to continue?", 'n'):
                exit()

        # Without --expandPool, commands marked running go back to pending.
        if args.expandPool is None:
            command_manager.reset_running_commands()

        nb_commands = command_manager.get_nb_commands_to_run()

        # --expandPool caps the worker pool at the number of remaining commands.
        if args.expandPool is not None:
            args.pool = min(nb_commands, args.expandPool)

    # If no pool size is specified the number of commands is taken
    if args.pool is None:
        args.pool = command_manager.get_nb_commands_to_run()

    # Generating all the worker commands
    worker_script = pjoin(os.path.dirname(smartdispatch.__file__), 'workers', 'base_worker.py')
    worker_script_flags = ''
    if args.autoresume:
        worker_script_flags = '-r'

    worker_call_prefix = ''
    worker_call_suffix = ''
    if args.autoresume:
        worker_call_prefix = AUTORESUME_WORKER_CALL_PREFIX
        worker_call_suffix = AUTORESUME_WORKER_CALL_SUFFIX

    # Each worker redirects its own stdout/stderr into per-worker files
    # under the job's log folder and runs in the background (trailing '&').
    COMMAND_STRING = 'cd "{cwd}"; {worker_call_prefix}python2 {worker_script} {worker_script_flags} "{commands_file}" "{log_folder}" '\
                     '1>> "{log_folder}/worker/$PBS_JOBID\"\"_worker_{{ID}}.o" '\
                     '2>> "{log_folder}/worker/$PBS_JOBID\"\"_worker_{{ID}}.e" &'\
                     '{worker_call_suffix}'
    COMMAND_STRING = COMMAND_STRING.format(cwd=os.getcwd(), worker_call_prefix=worker_call_prefix, worker_script=worker_script,
                                           worker_script_flags=worker_script_flags, commands_file=command_manager._commands_filename,
                                           log_folder=path_job_logs, worker_call_suffix=worker_call_suffix)
    commands = [COMMAND_STRING.format(ID=i) for i in range(args.pool)]

    # TODO: pass a real memory limit instead of float('inf') below — the
    #       original note here was self-referential; confirm whether
    #       args.memPerNode was the intended source.
    queue = Queue(args.queueName, CLUSTER_NAME, args.walltime, args.coresPerNode, args.gpusPerNode, float('inf'), args.modules)

    # Check that requested core number does not exceed node total
    if args.coresPerCommand > queue.nb_cores_per_node:
        sys.stderr.write("smart-dispatch: error: coresPerCommand exceeds nodes total: asked {req_cores} cores, nodes have {node_cores}\n"
                         .format(req_cores=args.coresPerCommand, node_cores=queue.nb_cores_per_node))
        sys.exit(2)

    # Check that requested gpu number does not exceed node total
    if args.gpusPerCommand > queue.nb_gpus_per_node:
        sys.stderr.write("smart-dispatch: error: gpusPerCommand exceeds nodes total: asked {req_gpus} gpus, nodes have {node_gpus}\n"
                         .format(req_gpus=args.gpusPerCommand, node_gpus=queue.nb_gpus_per_node))
        sys.exit(2)


    command_params = {'nb_cores_per_command': args.coresPerCommand,
                      'nb_gpus_per_command': args.gpusPerCommand,
                      'mem_per_command': None  # args.memPerCommand
                      }

    # With autoresume, the epilog re-submits the batch instead of just waiting.
    prolog = []
    epilog = ['wait']
    if args.autoresume:
        prolog = [AUTORESUME_PROLOG]
        epilog = [AUTORESUME_EPILOG.format(launcher=LAUNCHER if args.launcher is None else args.launcher, path_job=path_job)]

    job_generator = job_generator_factory(queue, commands, prolog, epilog, command_params, CLUSTER_NAME, path_job)

    # generating default names per each jobs in each batch
    for pbs_id, pbs in enumerate(job_generator.pbs_list):
        proper_size_name = utils.jobname_generator(jobname, pbs_id)
        pbs.add_options(N=proper_size_name)

    if args.pbsFlags is not None:
        job_generator.add_pbs_flags(args.pbsFlags.split(' '))
    pbs_filenames = job_generator.write_pbs_files(path_job_commands)

    # Launch the jobs
    print "## {nb_commands} command(s) will be executed in {nb_jobs} job(s) ##".format(nb_commands=nb_commands, nb_jobs=len(pbs_filenames))
    print "Batch UID:\n{batch_uid}".format(batch_uid=jobname)
    if not args.doNotLaunch:

        try:
            launch_jobs(LAUNCHER if args.launcher is None else args.launcher, pbs_filenames, CLUSTER_NAME, path_job)
        except subprocess.CalledProcessError as e:
            sys.stderr.write("smart-dispatch: error: The launcher wasn't able the launch the job(s) properly. Maybe the pbs file(s) generated were invalid: \n{}".format(e.output))
            sys.exit(2)

    print "\nLogs, command, and jobs id related to this batch will be in:\n {smartdispatch_folder}".format(smartdispatch_folder=path_job)
Exemplo n.º 8
0
def test_generate_uid_from_string():
    """Same input yields the same uid; a whitespace difference yields a different one."""
    uid_a = utils.generate_uid_from_string("same text")
    uid_b = utils.generate_uid_from_string("same text")
    uid_c = utils.generate_uid_from_string("sametext")
    assert_equal(uid_a, uid_b)
    assert_true(uid_a != uid_c)
Exemplo n.º 9
0
def main():
    """Worker loop that runs pending commands; with --assumeResumable it hands a
    running command back to the pending pool when a TERM signal arrives."""
    # Necessary if we want 'logging.info' to appear in stderr.
    logging.root.setLevel(logging.INFO)

    args = parse_arguments()

    command_manager = CommandManager(args.commands_filename)

    if args.assumeResumable:
        # Handle TERM signal gracefully by sending running commands back to
        # the list of pending commands.
        # NOTE: There are several cases when the handler will not have
        #       up-to-date information on running the command and/or process,
        #       but chances of that happening are VERY slim and the
        #       consequences are not fatal.
        def sigterm_handler(signal, frame):
            # Run-once guard: ignore a second TERM received while cleaning up.
            if sigterm_handler.triggered:
                return
            else:
                sigterm_handler.triggered = True

            # Let the current subprocess finish before re-queueing its command.
            if sigterm_handler.proc is not None:
                sigterm_handler.proc.wait()
            if sigterm_handler.command is not None:
                command_manager.set_running_command_as_pending(sigterm_handler.command)
            sys.exit(0)
        # State shared with the handler via function attributes.
        sigterm_handler.triggered = False
        sigterm_handler.command = None
        sigterm_handler.proc = None
        signal.signal(signal.SIGTERM, sigterm_handler)

    while True:
        command = command_manager.get_command_to_run()
        if args.assumeResumable:
            # Publish the command before it starts, so the handler can
            # re-queue it even if TERM lands before Popen below.
            sigterm_handler.proc = None
            sigterm_handler.command = command

        if command is None:
            break

        # Log file names derive from a uid of the command text, so a resumed
        # command appends to the same files.
        uid = utils.generate_uid_from_string(command)
        stdout_filename = os.path.join(args.logs_dir, uid + ".out")
        stderr_filename = os.path.join(args.logs_dir, uid + ".err")

        # Get job and node ID
        job_id = os.environ.get('PBS_JOBID', 'undefined')
        node_name = os.environ.get('HOSTNAME', 'undefined')

        with open(stdout_filename, 'a') as stdout_file:
            with open(stderr_filename, 'a') as stderr_file:
                log_datetime = t.strftime("## SMART-DISPATCH - Started on: %Y-%m-%d %H:%M:%S - In job: {job_id} - On nodes: {node_name} ##\n".format(job_id=job_id, node_name=node_name))
                if stdout_file.tell() > 0:  # Not the first line in the log file.
                    log_datetime = t.strftime("\n## SMART-DISPATCH - Resumed on: %Y-%m-%d %H:%M:%S - In job: {job_id} - On nodes: {node_name} ##\n".format(job_id=job_id, node_name=node_name))

                log_command = "## SMART-DISPATCH - Command: " + command + '\n'

                # Header goes to both logs before the subprocess owns the streams.
                stdout_file.write(log_datetime + log_command)
                stdout_file.flush()
                stderr_file.write(log_datetime + log_command)
                stderr_file.flush()

                proc = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file, shell=True)
                if args.assumeResumable:
                    # Expose the process so the TERM handler can wait on it.
                    sigterm_handler.proc = proc
                error_code = proc.wait()

        command_manager.set_running_command_as_finished(command, error_code)