Exemplo n.º 1
1
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): of seconds after which to stop the rapidfire process
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    while num_loops != max_loops and (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
        skip_check = False  # this is used to speed operation
        while (skip_check or launchpad.run_exists(fworker)) and \
                (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...'.format(sleep_time))
    os.chdir(curdir)
Exemplo n.º 2
0
def launch_rocket(launchpad, fworker=None, fw_id=None, strm_lvl='INFO',
                  pdb_on_exception=False):
    """
    Run a single rocket in the current directory.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        fw_id (int): if set, a particular Firework to run
        strm_lvl (str): level at which to output logs to stdout
        pdb_on_exception (bool): if set to True, python will start
            the debugger on a firework exception

    Returns:
        bool
    """
    fworker = get_fworker(fworker)
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=strm_lvl)

    log_multi(l_logger, 'Launching Rocket')
    rocket = Rocket(launchpad, fworker, fw_id)
    rocket_ran = rocket.run(pdb_on_exception=pdb_on_exception)
    log_multi(l_logger, 'Rocket finished')
    return rocket_ran
Exemplo n.º 3
0
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False,
                        local_redirect=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
        local_redirect (bool): redirect standard input and output to local file
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        if host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    manager = Manager()
    running_ids_dict = manager.dict()
    firing_state_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict,
                              local_redirect=local_redirect, firing_state_dict=firing_state_dict)
    FWData().Running_IDs = running_ids_dict
    FWData().FiringState = firing_state_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
Exemplo n.º 4
0
def rapidfire(launchpad,
              fworker=None,
              m_dir=None,
              nlaunches=0,
              max_loops=-1,
              sleep_time=None,
              strm_lvl='INFO'):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    num_loops = 0

    while num_loops != max_loops:
        while launchpad.run_exists(fworker):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir,
                                                l_logger,
                                                prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            time.sleep(
                0.15
            )  # add a small amount of buffer breathing time for DB to refresh, etc.
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...'.format(sleep_time))
Exemplo n.º 5
0
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1,
              sleep_time=None, strm_lvl='INFO', timeout=None):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop until max_loops
    :param max_loops: (int) maximum number of loops (default -1 is infinite)
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    :param timeout: (int) # of seconds after which to stop the rapidfire process
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    while num_loops != max_loops and (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
        skip_check = False  # this is used to speed operation
        while (skip_check or launchpad.run_exists(fworker)) and \
                (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                time.sleep(0.15)  # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...'.format(sleep_time))
    os.chdir(curdir)
Exemplo n.º 6
0
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        if host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    manager = Manager()
    running_ids_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict)
    FWData().Running_IDs = running_ids_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
Exemplo n.º 7
0
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None, strm_lvl='INFO'):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    num_loops = 0

    while num_loops != max_loops:
        skip_check = False  # this is used to speed operation
        while skip_check or launchpad.run_exists(fworker):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                time.sleep(0.15)  # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...'.format(sleep_time))
    os.chdir(curdir)
Exemplo n.º 8
0
def launch_rocket(launchpad, fworker=None, fw_id=None, strm_lvl='INFO'):
    """
    Run a single rocket in the current directory
    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param fw_id: (int) if set, a particular Firework to run
    :param strm_lvl: (str) level at which to output logs to stdout
    """
    fworker = fworker if fworker else FWorker()
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=strm_lvl)

    log_multi(l_logger, 'Launching Rocket')
    rocket = Rocket(launchpad, fworker, fw_id)
    rocket_ran = rocket.run()
    log_multi(l_logger, 'Rocket finished')
    return rocket_ran
Exemplo n.º 9
0
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list,
                      sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        password (str): security password to access the server
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=l_dir,
                             stream_level=loglvl)
    rapidfire(launchpad,
              fworker=fworker,
              m_dir=None,
              nlaunches=nlaunches,
              max_loops=-1,
              sleep_time=sleep,
              strm_lvl=loglvl,
              timeout=timeout)
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        launch_ids = FWData().Running_IDs.values()
        live_ids = list(set(launch_ids) - {None})
        if len(live_ids) > 0:
            # Some other sub jobs are still running
            log_multi(
                l_logger,
                'Sleeping for {} secs before resubmit sub job'.format(
                    sleep_time))
            time.sleep(sleep_time)
            log_multi(l_logger, 'Resubmit sub job'.format(sleep_time))
            rapidfire(launchpad,
                      fworker=fworker,
                      m_dir=None,
                      nlaunches=nlaunches,
                      max_loops=-1,
                      sleep_time=sleep,
                      strm_lvl=loglvl,
                      timeout=timeout)
        else:
            break
    log_multi(l_logger, 'Sub job finished')
Exemplo n.º 10
0
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        password (str): security password to access the server
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
    """
    ds = DataServer(address=("127.0.0.1", port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger("rocket.launcher", l_dir=l_dir, stream_level=loglvl)
    rapidfire(
        launchpad,
        fworker=fworker,
        m_dir=None,
        nlaunches=nlaunches,
        max_loops=-1,
        sleep_time=sleep,
        strm_lvl=loglvl,
        timeout=timeout,
    )
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        launch_ids = FWData().Running_IDs.values()
        live_ids = list(set(launch_ids) - {None})
        if len(live_ids) > 0:
            # Some other sub jobs are still running
            log_multi(l_logger, "Sleeping for {} secs before resubmit sub job".format(sleep_time))
            time.sleep(sleep_time)
            log_multi(l_logger, "Resubmit sub job".format(sleep_time))
            rapidfire(
                launchpad,
                fworker=fworker,
                m_dir=None,
                nlaunches=nlaunches,
                max_loops=-1,
                sleep_time=sleep,
                strm_lvl=loglvl,
                timeout=timeout,
            )
        else:
            break
    log_multi(l_logger, "Sub job finished")
Exemplo n.º 11
0
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        password (str): security password to access the server
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        macro_sleep_time (int): secs to sleep between sub job resubmit
        local_redirect (bool): redirect standard input and output to local file
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    fw_data.FiringState[os.getpid()] = True
    rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
              max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
              local_redirect=local_redirect)
    fw_data.FiringState[os.getpid()] = False
    while nlaunches == 0:
        time.sleep(1.5) # wait for LaunchPad to be initialized
        firing_pids = [pid for pid, is_firing in fw_data.FiringState.items() if is_firing]
        if len(firing_pids) > 0:
            # Some other sub jobs are still running
            macro_sleep_time = macro_sleep_time if macro_sleep_time \
                else sleep_time * len(fw_data.FiringState)
            log_multi(l_logger, 'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
            time.sleep(macro_sleep_time)
            log_multi(l_logger, 'Resubmit sub job'.format(macro_sleep_time))
            fw_data.FiringState[os.getpid()] = True
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
                      local_redirect=local_redirect)
            fw_data.FiringState[os.getpid()] = False
        else:
            break
    log_multi(l_logger, 'Sub job finished')
Exemplo n.º 12
0
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None, local_redirect=False, pdb_on_exception=False):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): of seconds after which to stop the rapidfire process
        local_redirect (bool): redirect standard input and output to local file
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def time_ok():
        # has the rapidfire run timed out?
        return (timeout is None or
                (datetime.now() - start_time).total_seconds() < timeout)

    while num_loops != max_loops and time_ok():
        skip_check = False  # this is used to speed operation
        while (skip_check or launchpad.run_exists(fworker)) and time_ok():
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            if local_redirect:
                with redirect_local():
                    rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl,
                                               pdb_on_exception=pdb_on_exception)
            else:
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl,
                                           pdb_on_exception=pdb_on_exception)

            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if nlaunches > 0 and num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False
        if nlaunches == 0:
            if not launchpad.future_run_exists(fworker):
                break
        elif num_launched == nlaunches:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...')
    os.chdir(curdir)
Exemplo n.º 13
0
def rapidfire_process(fworker,
                      nlaunches,
                      sleep,
                      loglvl,
                      port,
                      node_list,
                      sub_nproc,
                      timeout,
                      running_ids_dict,
                      local_redirect,
                      firing_state_dict,
                      macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        password (str): security password to access the server
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        macro_sleep_time (int): secs to sleep between sub job resubmit
        local_redirect (bool): redirect standard input and output to local file
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=l_dir,
                             stream_level=loglvl)
    fw_data.FiringState[os.getpid()] = True
    rapidfire(launchpad,
              fworker=fworker,
              m_dir=None,
              nlaunches=nlaunches,
              max_loops=-1,
              sleep_time=sleep,
              strm_lvl=loglvl,
              timeout=timeout,
              local_redirect=local_redirect)
    fw_data.FiringState[os.getpid()] = False
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        firing_pids = [
            pid for pid, is_firing in fw_data.FiringState.items() if is_firing
        ]
        if len(firing_pids) > 0:
            # Some other sub jobs are still running
            macro_sleep_time = macro_sleep_time if macro_sleep_time \
                else sleep_time * len(fw_data.FiringState)
            log_multi(
                l_logger,
                'Sleeping for {} secs before resubmit sub job'.format(
                    macro_sleep_time))
            time.sleep(macro_sleep_time)
            log_multi(l_logger, 'Resubmit sub job'.format(macro_sleep_time))
            fw_data.FiringState[os.getpid()] = True
            rapidfire(launchpad,
                      fworker=fworker,
                      m_dir=None,
                      nlaunches=nlaunches,
                      max_loops=-1,
                      sleep_time=sleep,
                      strm_lvl=loglvl,
                      timeout=timeout,
                      local_redirect=local_redirect)
            fw_data.FiringState[os.getpid()] = False
        else:
            break
    log_multi(l_logger, 'Sub job finished')