Example No. 1
def write_checkpoint(checkpoint_tag, checkpoint_tag_next_list,
                     checkpoint_route_tag_next_list, output_files):
    # checkpoint_tag
    #   common, route_A (mutation route) or route_B (sv route)
    # checkpoint_tag_next_list, checkpoint_route_tag_next_list
    #   next task(s) and route(s)
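    #
    # a hypothetical call, for illustration only (tag, task and file names are
    # assumptions, not taken from the pipeline):
    #   write_checkpoint("common", ["task_B"], ["route_A"], ["sample.bam"])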

    elapsed_time = time.time()

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    verbose = int(get_vge_conf("vge", "verbose", 0))
    if verbose == 0:
        logger.setLevel(CRITICAL)
    elif verbose == 1 or verbose == 2:
        logger.setLevel(INFO)
    elif verbose == 3:
        logger.setLevel(DEBUG)
    basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')

    pname = "Pipeline(CHECKPOINT WRITE)"
    process_id = os.getpid()
    logger.info("%s pid[%i]: collecting files in target output directory...." %
                (pname, process_id))

    # exclude tasks that did not write checkpoints
    exclude_checkpoint_list = list(
        get_vge_conf("restart", "exclude_checkpoint_list", []))

    for exclude_checkpoint_tag in exclude_checkpoint_list:
        if checkpoint_tag == exclude_checkpoint_tag:
            logger.warning("%s pid[%i]: exclude task for %s" %
                           (pname, process_id, exclude_checkpoint_tag))
            elapsed_time = time.time() - elapsed_time
            logger.info("%s pid[%i]: write_checkpoint elapsed_time %f[sec]" %
                        (pname, process_id, elapsed_time))
            return 0

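    # chk_target_dir and chk_checkpoint_dir are module-level globals set in
    # checkpoint_restart() (Example No. 2 below)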
    target_dir = chk_target_dir
    checkpoint_dir = chk_checkpoint_dir

    # create checkpoint data directory
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    target_list = []
    for route_tag_index in range(len(checkpoint_route_tag_next_list)):
        elapsed_time_mkchkpoint1 = time.time()

        # set target directory
        target_absdir = os.path.abspath(target_dir)
        target_absdir = os.path.normpath(target_absdir)
        # build targetdir_dict
        targetdir_dict = {}
        targetdir_dict["filename_list"] = []
        targetdir_dict["size_list"] = []

        flag = 0
        for root, dirs, files in os.walk(target_absdir):
            #dirs[:] = [d for d in dirs if os.path.basename(checkpoint_dir) not in os.path.join(root, d)]
            dirs[:] = [
                d for d in dirs
                if os.path.abspath(checkpoint_dir) not in os.path.join(
                    root, d)
            ]
            if flag == 0:
                for d in dirs:
                    target_list.append(os.path.join(root, d))
                for filename in files:
                    target_list.append(os.path.join(root, filename))
                flag = 1

            target_dirname = root[len(target_absdir) + 1:]
            if target_dirname == "":
                target_dirname = "."
            #    for filename in files:
            #        targetdir_dict["filename_list"].append(target_dirname + "/" + filename)
            #        targetdir_dict["size_list"].append(os.stat(root + "/" + filename).st_size)
            for filename in files:
                if os.path.isfile(root + "/" + filename):
                    targetdir_dict["filename_list"].append(target_dirname +
                                                           "/" + filename)
                    targetdir_dict["size_list"].append(
                        os.stat(root + "/" + filename).st_size)

        checkpointdata = {}
        checkpointdata["outputfiles"] = output_files
        checkpointdata["targetdir"] = target_dir
        checkpointdata["targetdirdict"] = targetdir_dict

        checkpoint_absdir = os.path.abspath(checkpoint_dir)
        checkpoint_absdir = os.path.normpath(checkpoint_absdir)

        # build the checkpoint data filename
        temp_checkpoint_tag = checkpoint_tag.replace(" ", "")
        checkpoint_filename = "checkpoint_" + temp_checkpoint_tag + ".chk"
        writefilenamepath = os.path.join(checkpoint_absdir,
                                         checkpoint_filename)
        writefilenamepath = os.path.normpath(writefilenamepath)

        err_flag = False
        # serialize the checkpoint data
        if not err_flag:
            try:
                package = pickle.dumps(checkpointdata)
            except Exception as e:
                logger.info(
                    "%s pid[%i]: error was occured when serializing... [%s]. "
                    % (pname, process_id, str(e)))
                err_flag = True

        elapsed_time_mkchkpoint2 = time.time()
        # write data
        if not err_flag:
            try:
                if os.path.isfile(writefilenamepath):
                    logger.warning(
                        "%s pid[%i]: (WRITE WARNING) checkpoint file already exists in [%s] so it will be removed and written."
                        % (pname, process_id, writefilenamepath))
                    os.remove(writefilenamepath)
                checkpoint_file = open(writefilenamepath, "wb")
                pickle.dump(package, checkpoint_file)
                checkpoint_file.close()

            except Exception as e:
                logger.info(
                    "%s pid[%i]: checkpoint file write error was occured [%s]. "
                    % (pname, process_id, str(e)))
                err_flag = True
                return 1

        elapsed_time_mkchkpoint3 = time.time()
        # target_dir backup
        restartdata_dir = os.path.join(checkpoint_dir, "backupdata")
        restartdata_dir = os.path.abspath(restartdata_dir)
        restartdata_dir = os.path.normpath(
            restartdata_dir
        ) + "_" + checkpoint_route_tag_next_list[route_tag_index]
        restartdata_dir = os.path.normpath(restartdata_dir)

        if not os.path.isdir(restartdata_dir):
            os.mkdir(restartdata_dir)

        elapsed_time_mkchkpoint4 = time.time()
        # back up to restartdata_dir via hard links, excluding the checkpoint directory
        backuplist = target_list
        for backupdir in backuplist:
            if backupdir == os.path.basename(checkpoint_dir):
                continue
            else:
                #src_target_dir = target_dir + "/" + backupdir
                src_target_dir = backupdir
                #if not os.path.isdir(src_target_dir):
                #    continue

                args = ["cp", "-lr", src_target_dir, restartdata_dir]
                ret = subprocess.call(args)
                logger.info(
                    "%s pid[%i]: target_dir backup %s to %s" %
                    (pname, process_id, src_target_dir, restartdata_dir))
                logger.warning(
                    "%s pid[%i]: (WRITE WARNING) target_dir backup subprocess.call returncode %d"
                    % (pname, process_id, ret))

        if not err_flag:

            tag_next_len = len(checkpoint_tag_next_list)
            route_tag_next_len = len(checkpoint_route_tag_next_list)
            if tag_next_len != route_tag_next_len:
                logger.warning(
                    "%s pid[%i]: (WRITE WARNING) difference in the length of checkpoint_tag_next_list(%d) and checkpoint_route_tag_next_list(%d)"
                    % (pname, process_id, tag_next_len, route_tag_next_len))
            elif tag_next_len > 2 or route_tag_next_len > 2:
                logger.warning(
                    "%s pid[%i]: (WRITE WARNING) checkpoint_tag_next_list(%d) and checkpoint_route_tag_next_list(%d) greater than 2 "
                    % (pname, process_id, tag_next_len, route_tag_next_len))

            route_tag_next_str = str(
                checkpoint_route_tag_next_list[route_tag_index])
            tag_next_str = str(checkpoint_tag_next_list[route_tag_index])
            # build the checkpoint master (restart info) filename
            checkpointmaster_filename = "checkpoint_" + route_tag_next_str + ".mst"
            restartinfo_filename = os.path.join(checkpoint_absdir,
                                                checkpointmaster_filename)
            restartinfo_filename = os.path.normpath(restartinfo_filename)

            # append or write restart info
            restart_info_list = [tag_next_str, writefilenamepath]

            restart_info_string = ",".join(restart_info_list)
            restart_info_string += "\n"
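            # e.g. a master-file line would read (hypothetical path):
            #   task_B,/work/target/checkpoint/checkpoint_common.chk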

            try:
                if os.path.isfile(restartinfo_filename):
                    logger.debug(
                        "%s pid[%i]: restart master file was already found in [%s]. "
                        % (pname, process_id, restartinfo_filename))
                    write_masterfile = open(restartinfo_filename, "a")
                else:
                    logger.info(
                        "%s pid[%i]: restart master file was newly created in [%s]. "
                        % (pname, process_id, restartinfo_filename))
                    write_masterfile = open(restartinfo_filename, "w")
                write_masterfile.write(restart_info_string)
                write_masterfile.close()

            except (IOError, OSError) as e:
                logger.error(
                    "%s pid[%i]: ERROR! ... write error was occured [%s]. " %
                    (pname, process_id, str(e)))
                err_flag = True
                return 1

        elapsed_time_mkchkpoint5 = time.time()
        logger.info(
            "%s pid[%i]: write_checkpoint tag %s elapsed_time_mkchkpoint %f[sec]"
            % (pname, process_id, route_tag_index,
               elapsed_time_mkchkpoint2 - elapsed_time_mkchkpoint1))
        logger.info(
            "%s pid[%i]: write_checkpoint tag %s elapsed_time_write_chkpoint %f[sec]"
            % (pname, process_id, route_tag_index,
               elapsed_time_mkchkpoint3 - elapsed_time_mkchkpoint2))
        logger.info(
            "%s pid[%i]: write_checkpoint tag %s elapsed_time_mk_targetdir %f[sec]"
            % (pname, process_id, route_tag_index,
               elapsed_time_mkchkpoint4 - elapsed_time_mkchkpoint3))
        logger.info(
            "%s pid[%i]: write_checkpoint tag %s elapsed_time_hardlink %f[sec]"
            % (pname, process_id, route_tag_index,
               elapsed_time_mkchkpoint5 - elapsed_time_mkchkpoint4))

    elapsed_time = time.time() - elapsed_time
    logger.info("%s pid[%i]: write_checkpoint elapsed_time %f[sec]" %
                (pname, process_id, elapsed_time))

    return 0
Example No. 2
def checkpoint_restart(target_dir):

    elapsed_time = time.time()
    # init_restart_conf()

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    verbose = int(get_vge_conf("vge", "verbose", 0))
    if verbose == 0:
        logger.setLevel(CRITICAL)
    elif verbose == 1 or verbose == 2:
        logger.setLevel(INFO)
    elif verbose == 3:
        logger.setLevel(DEBUG)
    basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
    pname = "Pipeline(CHECKPOINT RESTART)"
    process_id = os.getpid()
    logger.info("%s pid[%i]: checkpoint judgment and reading start task " %
                (pname, process_id))

    chk_file = int(get_vge_conf("restart", "chk_file", 2))

    # common parameters
    global chk_work_dir
    global chk_target_dir
    global chk_checkpoint_dir

    work_dir_path = os.path.dirname(target_dir)
    work_dir_path = os.path.abspath(work_dir_path)
    work_dir_path = os.path.normpath(work_dir_path)
    checkpoint_dir_path = os.path.join(target_dir, "checkpoint")
    checkpoint_dir_path = os.path.abspath(checkpoint_dir_path)
    checkpoint_dir_path = os.path.normpath(checkpoint_dir_path)
    target_dir_path = os.path.abspath(target_dir)
    target_dir_path = os.path.normpath(target_dir_path)
    chk_work_dir = work_dir_path
    chk_target_dir = target_dir_path
    chk_checkpoint_dir = checkpoint_dir_path

    restartdata_dir = os.path.join(chk_checkpoint_dir, "backupdata")
    restartdata_dir = os.path.abspath(restartdata_dir)
    restartdata_dir = os.path.normpath(restartdata_dir)

    # reconst_checkpoint
    flag_skiptask = {}
    flag_starttask = {}

    if not os.path.isdir(chk_checkpoint_dir):
        logger.info("%s pid[%i]: normal start" % (pname, process_id))
        elapsed_time = time.time() - elapsed_time
        logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]" %
                    (pname, process_id, elapsed_time))
        return flag_skiptask, flag_starttask, {}

    # checkpoint master files
    masterfile_list = []
    # create a list of checkpoint master files
    for filename in os.listdir(chk_checkpoint_dir):
        basename, ext = os.path.splitext(filename)
        if ".mst" == ext:
            masterfile_list.append(filename)

    # do not restart if there is no checkpoint master file
    if not masterfile_list:
        logger.info("%s pid[%i]: not restart, checkpoint start" %
                    (pname, process_id))
        elapsed_time = time.time() - elapsed_time
        logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]" %
                    (pname, process_id, elapsed_time))
        return flag_skiptask, flag_starttask, {}

    set_task_list = []
    set_checkpointfiles = {}
    dict_route = {}
    for mstfile in masterfile_list:
        chk_masterfile = os.path.join(chk_checkpoint_dir, mstfile)
        logger.debug("%s pid[%i]: checkpoint master file path: %s" %
                     (pname, process_id, chk_masterfile))

        if os.path.isfile(chk_masterfile):
            try:
                checkpoint_master = open(chk_masterfile, "rb")
                logger.info(
                    "%s pid[%i]: restart start, open checkpoint master" %
                    (pname, process_id))
                tmplist = checkpoint_master.readlines()
                checkpoint_master.close()
            except IOError as e:
                logger.info("%s pid[%i]: error was masterfile open [%s]. " %
                            (pname, process_id, str(e)))
                exit()
        else:
            logger.info("%s pid[%i]: not restart, checkpoint start" %
                        (pname, process_id))
            exit()

        # check the contents of checkpoint master file
        # the first column of master file is a task that created the checkpoint
        # the second column is the path of the checkpoint
        # do nothing if the first column is "final"
        for line in tmplist:
            line_list = line.split(",")
            task = line_list[0]
            set_task_list.append(task)
            if len(set_task_list) == 1:
                set_checkpointfiles[task] = [line_list[1].strip()]
            # compare the previous task name with the current task name
            elif set_task_list[-2] == set_task_list[-1]:
                set_checkpointfiles[task].append(line_list[1].strip())
            else:
                set_checkpointfiles[task] = [line_list[1].strip()]
        if set_task_list[-1] != "final":
            logger.debug("%s pid[%i]: set_task_list: %s" %
                         (pname, process_id, set_task_list))
            flag_starttask[set_task_list[-1]] = True

            # create a dictionary that links task names with route tags
            # dict_route[task] contains the route tag taken from the master file name
            # basename[0:11] is the "checkpoint_" prefix
            basename, ext = os.path.splitext(mstfile)
            dict_route[task] = basename[11:]
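            # e.g. "checkpoint_route_A.mst" gives dict_route[task] = "route_A"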

    # set flag from master file
    for task in set_task_list:
        # skip if the checkpoint is already written
        # checkpointtag is the task name embedded in the checkpoint file name
        # basename[0:11] is the "checkpoint_" prefix
        for checkpointpath in set_checkpointfiles[task]:
            basename, ext = os.path.splitext(os.path.basename(checkpointpath))
            checkpointtag = basename[11:]
            flag_skiptask[checkpointtag] = True

    # read checkpointdata
    # verify_checkpoint
    restart_files_list_dict = {}
    for task, startflag in flag_starttask.items():
        new_outputfiles_list = []

        for checkpointpath in set_checkpointfiles[task]:
            # modify checkpoint data path
            chkpoint_filename = os.path.basename(checkpointpath)
            checkpoint_absdir = os.path.abspath(chk_checkpoint_dir)
            checkpoint_absdir = os.path.normpath(checkpoint_absdir)
            new_checkpoint_filepath = os.path.join(checkpoint_absdir,
                                                   chkpoint_filename)
            new_checkpoint_filepath = os.path.normpath(new_checkpoint_filepath)
            readfilename = new_checkpoint_filepath

            checkpointdata = None
            binary_file = None

            # open checkpoint data
            try:
                binary_file = open(readfilename, "rb")
                loaded_package = pickle.load(binary_file)
                binary_file.close()
                checkpointdata = pickle.loads(loaded_package)
            except IOError:
                exit()

            # check file integrity
            if chk_file == 0:
                pass

            elif chk_file == 1:
                restart_input_files_set = set()
                for root, dirs, files in os.walk(restartdata_dir + "_" +
                                                 dict_route[task]):
                    target_dirname = root[len(restartdata_dir + "_" +
                                              dict_route[task]) + 1:]
                    if target_dirname == "":
                        target_dirname = "."
                    for filename in files:
                        restart_input_files_set.add(target_dirname + "/" +
                                                    filename)

                if not (set(checkpointdata["targetdirdict"]["filename_list"])
                        <= restart_input_files_set):
                    logger.warning(
                        "%s pid[%i]: (VERIFY WARNING) can not restart because the backed-up files do not cover the checkpointed file list...."
                        % (pname, process_id))
                    logger.debug(
                        "%s pid[%i]: (VERIFY WARNING) restart_input_files_set: %s"
                        % (pname, process_id, restart_input_files_set))
                    logger.debug(
                        "%s pid[%i]: (VERIFY WARNING) set(checkpointdata[\"targetdirdict\"][\"filename_list\"]): %s"
                        %
                        (pname, process_id,
                         set(checkpointdata["targetdirdict"]["filename_list"]))
                    )
                    exit()

            elif chk_file == 2:
                for i in range(
                        len(checkpointdata["targetdirdict"]["filename_list"])):
                    restart_data_file = os.path.join(
                        restartdata_dir + "_" + dict_route[task],
                        checkpointdata["targetdirdict"]["filename_list"][i])
                    if os.path.islink(
                            restart_data_file
                    ) and not os.path.isfile(restart_data_file):
                        # fix the target of a dangling symbolic link
                        current_dir = os.getcwd()
                        current_dir_len = len(current_dir.split("/"))
                        realpath = os.path.realpath(restart_data_file)
                        # rebuild the link target under the current directory,
                        # because the K computer uses a staging system
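                        # e.g. (hypothetical) /stage/job123/work/a/b.bam becomes
                        # <current_dir>/a/b.bam, keeping only the path components
                        # beyond the current directory depth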
                        realpath_split = realpath.split("/")
                        dircount = 0
                        new_path = ""
                        for dirname in realpath_split:
                            dircount = dircount + 1
                            if dircount > current_dir_len:
                                new_path = new_path + "/" + dirname
                        new_src_file = os.path.normpath(current_dir + new_path)
                        os.unlink(restart_data_file)
                        os.symlink(new_src_file, restart_data_file)
                        logger.debug(
                            "%s pid[%i]: re-created symbolic link %s -> %s" %
                            (pname, process_id, restart_data_file, new_src_file))
                    if not os.path.isfile(restart_data_file):
                        logger.warning(
                            "%s pid[%i]: (VERIFY WARNING) can not restart because 'restart_data_file' does not exist...."
                            % (pname, process_id))
                        logger.debug(
                            "%s pid[%i]: (VERIFY WARNING) restart_data_file: %s"
                            % (pname, process_id, restart_data_file))
                        exit()
                    if os.stat(restart_data_file).st_size != checkpointdata[
                            "targetdirdict"]["size_list"][i]:
                        logger.warning(
                            "%s pid[%i]: (VERIFY WARNING) can not restart because file size does not match"
                            % (pname, process_id))
                        logger.debug(
                            "%s pid[%i]: (VERIFY WARNING) restart_data_file: %s"
                            % (pname, process_id, restart_data_file))
                        exit()

            #
            # create outputfiles to pass to the input of the next task
            # (modify checkpoint data path)
            #

            ret = 0
            chkpointd_outputfiles_list = checkpointdata["outputfiles"]
            chkpointd_target_dir = checkpointdata["targetdir"]

            # get each element of the output from the last run.
            # element is list (multiple files) or string (single file)
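            # e.g. (hypothetical): ["sample.bam", ["chr1.vcf", "chr2.vcf"]]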
            for x in chkpointd_outputfiles_list:
                if type(x) is list:
                    new_outputfiles_list2 = []
                    # rebase each path in x under chk_target_dir and add the new list
                    for output_file_path in x:
                        new_file_name = os.path.join(
                            chk_target_dir,
                            output_file_path[len(chkpointd_target_dir) + 1:])
                        new_file_name = os.path.abspath(new_file_name)
                        new_file_name = os.path.normpath(new_file_name)
                        new_outputfiles_list2.append(new_file_name)
                    new_outputfiles_list.append(new_outputfiles_list2)

                else:
                    # x is a single file path; rebase it under chk_target_dir and add it
                    new_file_name = os.path.join(
                        chk_target_dir, x[len(chkpointd_target_dir) + 1:])
                    new_file_name = os.path.abspath(new_file_name)
                    new_file_name = os.path.normpath(new_file_name)
                    new_outputfiles_list.append(new_file_name)

            restart_files_list_dict[dict_route[task]] = new_outputfiles_list
            logger.debug("%s pid[%i]: new_outputfiles_list: %s" %
                         (pname, process_id, new_outputfiles_list))
            logger.debug("%s pid[%i]: task: %s" % (pname, process_id, task))
            logger.debug("%s pid[%i]: dict_route: %s" %
                         (pname, process_id, dict_route[task]))

        # relocate files for restart
        backup_dirs = os.listdir(restartdata_dir + "_" + dict_route[task])
        for backup_dir in backup_dirs:
            src_backup_dir = os.path.join(
                restartdata_dir + "_" + dict_route[task], backup_dir)
            args = ["cp", "-lr", src_backup_dir, target_dir]
            ret = subprocess.call(args)
            logger.warning(
                "%s pid[%i]: (RESTART) restored backup data, subprocess.call returncode %d"
                % (pname, process_id, ret))
            logger.debug("%s pid[%i]: src_backup_dir: %s" %
                         (pname, process_id, src_backup_dir))

    elapsed_time = time.time() - elapsed_time
    logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]" %
                (pname, process_id, elapsed_time))

    return flag_skiptask, flag_starttask, restart_files_list_dict
Example No. 3
def vge_task(arg1,arg2,arg3,arg4,arg5=None):
  # arg1: command to be performed
  # arg2: max_task
  # arg3: basefilename
  # arg4: parameter dict
  #   involved keys:               arguments
  #      verbose                   --> verbose
  #      host                      --> host
  #      port                      --> port
  #      bufsize                   --> bufsize
  #      nsleep_request            --> socket_interval_after
  #      nsleep_interval=0.5       --> socket_interval_request
  #      nsleep_connection_refused --> socket_interval_error
  #      nsleep_aftersend          --> socket_interval_send
  #      nsleep_updatelist         --> socket_interval_update
  #      nsleep_controller=0.01    --> socket_interval_close
  #      timeout_client=600.0      --> socket_timeout1
  #      timeout_ask=600.0         --> socket_timeout2
  #
  # arg5: for future use
  #       "multi" --> multi-sample mode
  #       multiple Pipeline samples or processes will be handled in vge_task;
  #       this needs more CPU resources than single-sample mode.
  #       "single" or None --> single-sample mode
  #      
  #           
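  # a hypothetical call, for illustration only (command, file names and the
  # argparse.Namespace "args" are assumptions):
  #   vge_task("bwa mem ref.fa reads.fq > out.sam", 1, "bwa_job", args, "single")
  #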
  total_time = time.time()
  # 

  #
  # return value
  #
  # it will be the latest return code of the jobs for a new job
  # and a response message from VGE for the other request types.
  result = None

  #
  # socket
  #
  host = socket.gethostbyname(socket.gethostname())
  port = 8000
  bufsize = 16384

  #
  # get current process id
  #  
  process_id = os.getpid()
  #

  #
  # assign a header name 
  # 
  temp_processname = get_process_name(process_id)
  temp_processnamelist= temp_processname.split(" ")
  #
  pname="VGETASK(Client)" 
  #
  for temp0 in temp_processnamelist:
      temp1=os.path.basename(temp0)
      if "pipeline" in temp1:
         pname="VGETASK(Pipeline)"
         break
      elif temp1 == "vge_connect" :
         pname="VGETASK(CONNECT)"
         break
      elif temp1 == "vge" :
         pname="VGETASK(VGE)"
         break
  #
  del temp_processname
  del temp_processnamelist
  #
  #pname="VGETASK(Client)"
  #if "pipeline" in temp_processname: 
  #  pname="VGETASK(Pipeline)"
  #elif "vge_connect" in temp_processnamelist:
  #  pname="VGETASK(CONNECT)"
  #elif "vge" in temp_processnamelist and not "vge_connect" in temp_processnamelist:
  #  pname="VGETASK(VGE)"
  #print temp_processname, temp_processnamelist, pname
  #

  #
  # wait time 
  #
  nsleep_connection_refused=0.0
  nsleep_aftersend =0.00 # pause after send
  nsleep_updatelist=0.00 # 
  nsleep_request=20.0     # pause after asked job status to VGE
  nsleep_interval=20.0    # pause after job has been submitted.
  nsleep_controller=1.00 # wait time after connection timed out
  timeout_client=600.0
  timeout_ask=600.0

  # 
  # Config
  # 
  time_config = time.time()
  current_dir=os.getcwd()
  port = int(get_vge_conf("socket","port",port))
  bufsize = int(get_vge_conf("socket","bufsize",bufsize))
  nsleep_interval = get_vge_conf("pipeline","socket_interval_after",nsleep_interval)
  nsleep_request = get_vge_conf("pipeline","socket_interval_request", nsleep_request)
  nsleep_connection_refused = get_vge_conf("pipeline","socket_interval_error", nsleep_connection_refused)
  nsleep_aftersend = get_vge_conf("pipeline","socket_interval_send", nsleep_aftersend)
  nsleep_updatelist = get_vge_conf("pipeline","socket_interval_update",nsleep_updatelist)
  nsleep_controller = get_vge_conf("pipeline","socket_interval_close",nsleep_controller)
  timeout_client = get_vge_conf("pipeline","socket_timeout1", timeout_client)
  timeout_ask = get_vge_conf("pipeline","socket_timeout2", timeout_ask)
  time_config = time.time() - time_config
  # 
  if isinstance(arg4, argparse.Namespace):
     if arg4.socket_interval_after > 0:
         nsleep_request =  arg4.socket_interval_after
     if arg4.socket_interval_request > 0:
         nsleep_interval =  arg4.socket_interval_request
     if arg4.socket_interval_error > 0:
         nsleep_connection_refused = arg4.socket_interval_error
     if arg4.socket_interval_send > 0:
         nsleep_aftersend = arg4.socket_interval_send
     if arg4.socket_interval_update > 0:
         nsleep_updatelist = arg4.socket_interval_update
     if arg4.socket_interval_close > 0:
         nsleep_controller = arg4.socket_interval_close
     if arg4.socket_timeout1 > 0:
         timeout_client = arg4.socket_timeout1
     if arg4.socket_timeout2 > 0:
         timeout_ask = arg4.socket_timeout2


  #
  # verbose level
  # 
  verbose=0
  verbose = int(get_vge_conf("pipeline","verbose",verbose))
  from logging import getLogger,StreamHandler,basicConfig,DEBUG,INFO,WARNING,ERROR,CRITICAL
  logger=getLogger(__name__)
  if verbose == 0:
     logger.setLevel(CRITICAL)
  elif verbose == 1 or verbose == 2:
     logger.setLevel(INFO)
  elif verbose == 3:
     logger.setLevel(DEBUG)
  basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')

  logger.debug("%s pid[%i]: socket host=%s" %(pname,process_id,host))
  logger.debug("%s pid[%i]: socket port=%i" %(pname,process_id,port))
  logger.debug("%s pid[%i]: verbose level=%i" %(pname,process_id,verbose))
  logger.debug("%s pid[%i]: socket buffer size=%i" %(pname,process_id,bufsize))
  logger.debug("%s pid[%i]: socket timeout1 (job sending)=%12.5e" %(pname,process_id,timeout_client))
  logger.debug("%s pid[%i]: socket timeout2 (job waiting)=%12.5e" %(pname,process_id,timeout_ask))
  logger.debug("%s pid[%i]: socket_interval_after=%12.5e" %(pname,process_id, nsleep_interval))
  logger.debug("%s pid[%i]: socket_interval_send=%12.5e" %(pname,process_id,nsleep_aftersend))
  logger.debug("%s pid[%i]: socket_interval_request=%12.5e" %(pname,process_id, nsleep_request))
  logger.debug("%s pid[%i]: socket_interval_error=%12.5e" %(pname,process_id,nsleep_connection_refused))
  logger.debug("%s pid[%i]: socket_interval_update=%12.5e" %(pname,process_id, nsleep_updatelist))
  logger.debug("%s pid[%i]: socket_interval_close=%12.5e" %(pname,process_id, nsleep_controller))
  logger.debug("%s pid[%i]: config: elapsed time=%12.5e" %(pname,process_id, time_config))
  #

  # 
  # set a sampling mode
  #
  multi_sampling=True
  if arg5 is "single" :
     multi_sampling=False
     #logger.debug( "Pipeline pid[%i]: Pipeline sampling mode .... single" %(process_id))
  elif arg5 is "multi":
     multi_sampling=True
     #logger.debug( "Pipeline pid[%i]: Pipeline sampling mode .... multiple" %(process_id))
  elif arg5 is "response":
     multi_sampling=True
     logger.debug( "%s pid[%i]: Pipeline sampling mode .... none (response)" %(pname,process_id))

  #
  # get a pipeline_parent_pid for multi-sampling or response mode
  #
  pipeline_parent_pid=-1
  if multi_sampling:
     temp_pipeline_parent_pid=os.getppid()
     logger.debug( "%s pid[%i]: parent pid from os.getppid [%i]" %(pname,process_id,temp_pipeline_parent_pid))
     try: 
       #
       from VGE.pipeline_parent_pid import pipeline_parent_pid
       if temp_pipeline_parent_pid == pipeline_parent_pid:
          logger.debug("%s pid[%i]: VGE.pipeline_parent_pid [%i] is the same as that [%i] of os.getppid() in this process." %(pname,process_id,pipeline_parent_pid,temp_pipeline_parent_pid))
          if process_id == pipeline_parent_pid:
             logger.info("%s pid[%i]: this is a parent process [id %i]" %(pname,process_id, pipeline_parent_pid))
             pass
          elif pipeline_parent_pid == -1:
             # parent pid info was not used.
             pass
          elif process_id != pipeline_parent_pid:
             logger.info( "%s pid[%i]: parent process id [%i] (current process is a child)" %(pname,process_id, pipeline_parent_pid))
       else:
           logger.debug( "%s pid[%i]: VGE.pipeline_parent_pid [%i] is not the same as that [%i] of os.getppid() in this process." %(pname,process_id,pipeline_parent_pid,temp_pipeline_parent_pid))
           logger.info( "%s pid[%i]: current process [%i] is a main (parent) " %(pname,process_id,pipeline_parent_pid))
     except: 
         logger.debug( "%s pid[%i]: VGE.pipeline_parent_pid module was not be loaded beforehand." %(pname,process_id))
         #logger.debug( "Pipeline pid[%i]: parent pid has been taken from os.getppid() in the current process." %(process_id))
         logger.info( "%s pid[%i]: try to get a Pipeline parent id ...." %(pname,process_id))
         logger.info( "%s pid[%i]: get a parent process id from os.getppid()..... [%i] " %(pname,process_id,temp_pipeline_parent_pid))
         logger.info( "%s pid[%i]: search Pipeline related process in the master rank..." %(pname,process_id))
         #
         parent_process_name = ""
         current_process_name = ""
         parent_process_name = get_process_name(temp_pipeline_parent_pid)
         current_process_name = get_process_name(process_id)
         #
         logger.debug( "%s pid[%i]: check os.getppid() ..... [%i] " %(pname,process_id,temp_pipeline_parent_pid))
         logger.debug( "%s pid[%i]: check.... process_name from os.getppid() [%s] " %(pname,process_id,parent_process_name))
         logger.debug( "%s pid[%i]: check.... process_name from os.getpid() [%s] " %(pname,process_id,current_process_name))
         #
         pipeline_process_list=[]
         pipeline_process_list=get_pipeline_process_name_list("master")
         logger.info("%s pid[%i]: check .... pipeline_process_list [%s]." %(pname,process_id,pipeline_process_list))
         # 
         flag_parent=False
         flag_current=False
         for pipeline_process_name in pipeline_process_list:
            #logger.info( "Pipeline pid[%i]: check ...parent_process_name[%s], pipeline_process_name[%s]" %(process_id,parent_process_name,pipeline_process_name))
            if pipeline_process_name in parent_process_name:
               logger.debug("%s pid[%i]: found Pipeline-related name [%s] in a parent process [%s]." %(pname,process_id,pipeline_process_name,parent_process_name))
               flag_parent=True
               break
         for pipeline_process_name in pipeline_process_list:
             #logger.info( "Pipeline pid[%i]: check ...currnet_process_name[%s], pipeline_process_name[%s]" %(process_id,current_process_name,pipeline_process_name))
             if pipeline_process_name in current_process_name:
                 logger.debug("%s pid[%i]: found Pipeline-related name [%s] in a current process [%s]." %(pname,process_id,pipeline_process_name,current_process_name))
                 flag_current=True
                 break
         if flag_parent and flag_current:
             logger.info( "%s pid[%i]: current process is a child" %(pname,process_id))
             logger.info( "%s pid[%i]: parent Pipeline process id is set to be [%i] from os.getppid()." %(pname,process_id,temp_pipeline_parent_pid))
             pipeline_parent_pid = temp_pipeline_parent_pid 
         elif not flag_parent and flag_current:
             logger.info( "%s pid[%i]: this is on a main process." %(pname,process_id))
             logger.info( "%s pid[%i]: Pipeline parent process id is set to be [%i] of the current pid." %(pname,process_id,process_id))
             pipeline_parent_pid = process_id
         else:
             logger.info( "%s pid[%i]: this state should be unknown." %(pname,process_id))
             logger.info( "%s pid[%i]: Pipeline process id will not be considered." %(pname,process_id))
             pipeline_parent_pid = -1

  #
  # initialize parameters
  # 
  jobid =0
  messeage=""
  flag_pipeline_task_loop=True
  finished="False"
  current_unique_jobid=0
  current_vge_jobid = 0
  current_max_task = 0
  joblist = dict()
  #
  current_jobid=None
  Flag=True
  retry=1
  #
  current_unique_jobid = -1
  current_vge_jobid = -1
  current_max_task = -1
  #
  time_jobsend = 0.0
  time_jobrecv = 0.0
  time_waitsend = 0.0
  time_waitrecv = 0.0
  time_jobdispatch=0.0
  wait_time = 0.0

  #
  # create a new job 
  #
  joblist[jobid] = make_pipeline_jobstatuslist(0)
  joblist[jobid]["pipeline_pid"] = process_id
  joblist[jobid]["pipeline_parent_pid"] = pipeline_parent_pid
 
  #
  # args
  #
  # get command
  command = ""
  if isinstance(arg1,str):
     if arg1 != "":
         joblist[jobid]["command"] = arg1

  # get max_task
  max_task=0
  if not isinstance(arg2, bool) and isinstance(arg2, int):
     if arg2 > -1:
         joblist[jobid]["max_task"] = int(arg2) 

  # get basefilename
  if isinstance(arg3,str):
     if arg3 != "":
         joblist[jobid]["basefilename"] = arg3


  #
  # check a request...
  #
  flag_newjob=True
  #
  # 

  #
  # shutdown VGE
  #
  if arg1 == "finished":
      logger.info("%s pid[%i]: shutdown VGE..." %(pname,process_id))
      joblist = "finished" 
      flag_newjob=False
      joblist_shipped = pickle.dumps(joblist)
  #
  # shutdown VGE forcibly
  #
  elif arg1 == "forcedstop":
      logger.info("%s pid[%i]: forced to shutdown VGE..." %(pname,process_id))
      joblist = "forcedstop"
      flag_newjob=False
      joblist_shipped = pickle.dumps(joblist)
  #
  # wake VGE up
  #
  elif arg1 == "hello":
      logger.info("%s pid[%i]: start up VGE..." %(pname,process_id))
      joblist = "hello" 
      flag_newjob=False
  #
  # request the list of Pipeline parent process ids which VGE knows.
  #
  elif arg1 == "pipeline_ppid_list":
      logger.info("%s pid[%i]: request Pipeline parent process id list..." %(pname,process_id))
      joblist = "pipeline_ppid_list"
      flag_newjob=False

  #
  # request the rest of jobs to be performed by VGE
  #
  elif arg1 == "restofjob":
      logger.info("%s pid[%i]: request the number of queued jobs which VGE has..." %(pname,process_id))
      joblist = "restofjob"
      flag_newjob=False

  #
  # send a wakeup message to VGE.
  #
  elif arg1 == "hello_from_pipeline":
      logger.info("%s pid[%i]: send hello from a Pipeline to VGE..." %(pname,process_id))
      joblist = dict()
      joblist["hello_from_pipeline"] = True
      joblist["pipeline_parent_pid"] = pipeline_parent_pid
      flag_newjob=False
  #
  # mark a pipeline byebye flag because a Pipeline has finished.
  #
  elif arg1 == "byebye_from_pipeline":
      logger.info("%s pid[%i]: send byebye from a Pipeline to VGE..." %(pname,process_id))
      joblist = dict()
      joblist["byebye_from_pipeline"] = True
      joblist["pipeline_parent_pid"] = pipeline_parent_pid
      flag_newjob=False

  #
  # package a joblist
  #
  if not flag_newjob:
      joblist_shipped = pickle.dumps(joblist)

  #
  # send the job to VGE
  #
  time_connection = time.time()
  while Flag:
          try:
                time_jobdispatch=time.time()
                try:
                   sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                except Exception as error:
                   logger.info("%s pid[%i]: socket error [%s]." %(pname,process_id,error))

                if arg1 == "finished" or retry > 1:
                    sock.settimeout(timeout_client)
                #
                #
                mesg = ""
                messeage=""
                #
                # connect with VGE
                #
                if retry==1:
                    pass
                    #logger.info("Pipeline pid(%i): try to connect with VGE." %process_id)
                #
                sock.connect((host, port))
                logger.info("%s pid[%i]: connection with VGE was established. [access times:%i] " %(pname,process_id,retry))
                time_connection = time.time() -time_connection

                if isinstance(joblist, dict) and flag_newjob:
                    joblist_shipped=""
                    joblist[jobid]["sendvgetime"] = datetime.datetime.now()
                    # delete keys which are not used in VGE
                    temp=joblist[jobid].copy()
                    del temp["vge_jobid"]
                    #
                    joblist_shipped = pickle.dumps(temp)

                #
                # send the message (order) to VGE
                #
                #logger.info("Pipeline pid[%i]: new job has been submitted to VGE. [%s]" %(process_id, pickle.loads(joblist_shipped)))
                try: 
                    time_jobsend = time.time()
                    #logger.info("check...")
                    #logger.info(joblist_shipped)
                    sock.sendall(joblist_shipped)
                    time_jobsend = time.time() - time_jobsend
                    time.sleep(nsleep_aftersend)
                except socket.error:
                    logger.critical("%s   Error!! socket sending was failed. [data size->(%i) byte :(%s)]" %(pname,len(joblist_shipped), pickle.loads(joblist_shipped)))
                    pass
                    #sys.exit()
                except Exception as error:
                    logger.debug("%s pid[%i]: data send error (%s)"  %(pname,process_id, error))
                    pass
                #
                # receive an incoming message (the jobid for this order) from VGE
                #
                #mesg = sock.recv(bufsize)
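                # keep receiving until the sentinel terminator ("\n." or "\ns.")
                # appears at the end of the message, or the peer closes the socket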
                temp=None
                mesg=""
                icount=0
                time_jobrecv = time.time()
                while True:
                    temp = sock.recv(bufsize)
                    icount += 1
                    mesg += temp
                    if temp == "" or temp is None:
                        break
                    elif temp[-2:] == "\n." or temp[-3:] == "\ns.":
                        break
                    elif temp[-1:] == ".":
                        if mesg[-2:] == "\n." or mesg[-3:] == "\ns." :
                          break
                time_jobrecv = time.time() - time_jobrecv

                if isinstance(mesg,str):
                    messeage=None
                    if len(mesg) > 0:
                        logger.info("%s pid[%i]: received mesg size=%i [%i recv times]" %(pname,process_id,len(mesg),icount))
                        logger.debug("%s pid[%i]: received mesg [%s]" %(pname,process_id,mesg))
                    try:
                        messeage=pickle.loads(mesg)
                        logger.info("%s pid[%i]: response from VGE: %s"  %(pname,process_id, messeage))
                    except EOFError:
                        pass
                    except ValueError:
                        logger.error("%s pid[%i]: Value error. mesg is not recieved correctly. [bufsize:%i] [mesg:%s]" %(pname,process_id,bufsize,mesg))
                    except Exception as error:
                        logger.error("%s pid[%i]: exception error.... check [%s]" %(pname,process_id,error))

                    #try: 
                    #      messeage=pickle.loads(mesg)
                    #      logger.info("Pipeline pid[%i]: response from VGE: %s"  %(process_id, messeage))
                    #except EOFError: 
                    #      pass
                    #

                    #
                    # package a message from VGE as a return value if not flag_newjob.
                    #
                    if not flag_newjob: 
                       if isinstance(messeage, dict) and joblist == "pipeline_ppid_list":
                          result=messeage
                       elif isinstance(messeage, str) and joblist == "restofjob":
                          result=messeage

                    #
                    # check the response 
                    #
                    if isinstance(messeage, dict) and flag_newjob:
                       current_unique_jobid = int(messeage["unique_jobid"])
                       current_vge_jobid = int(messeage["vge_jobid"])
                       current_max_task = int(joblist[jobid]["max_task"])
                       check_pipeline_pid = int(messeage["pipeline_pid"])
                       joblist[jobid]["unique_jobid"] = current_unique_jobid
                       joblist[jobid]["vge_jobid"] = current_vge_jobid
                       #
                       logger.info("%s pid[%i]: assigned: jobid [%i] --> VGE jobid [%i] "  \
                              %(pname,process_id, current_unique_jobid, current_vge_jobid))
                       logger.debug("Pipeline: job contents: %s" %joblist[jobid])
                       logger.debug("Pipeline: CHECK : pid for jobid[%i] : pid (%i) <--> pid(%i) replied from VGE" \
                                     %(process_id, current_unique_jobid, check_pipeline_pid))
                       Flag=False
Example No. 4
def pipeline_jobcontroll(cl_args, job_args, total_joblist, new_joblist,
                         task_check, pipeline_parent_pid_list, command_list):
    #
    from logging import getLogger, StreamHandler, basicConfig, DEBUG, INFO, WARNING, ERROR, CRITICAL
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    #
    # set verbose level
    #
    verbose = 0
    verbose = int(get_vge_conf("vge", "verbose", verbose))
    if verbose == 0:
        logger.setLevel(CRITICAL)
    elif verbose == 1 or verbose == 2:
        logger.setLevel(INFO)
    elif verbose == 3:
        logger.setLevel(DEBUG)
    basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')

    #
    # initialize...
    #
    flag_byebye = False
    flag_forcedbyebye = False
    task_check["exit"] = False
    task_check["forced"] = False
    task_check["restofjob"] = 0
    server_connected = True
    #
    command_id = 0
    jobid = 0  # an individual job id used in VGE
    unique_jobid = 0  # a unique job id shared by VGE and Pipeline
    pipeline_pid_for_job = -1  # in case of the job that is not listed yet.
    #
    #
    mypid = os.getpid()
    logger.info("VGE(JOB): Pipeline job controller [pid:%i] is running. " %
                mypid)
    #
    #pipeline_parent_pid_list={}  # pipeline_parent_pid_list[25434] = True if VGE got a final signal from Pipeline)
    #
    num_of_pipeline_sample = 0
    unique_pipeline_parent_id = []  # unique_pipeline_parent_id[1] = 25434
    #

    #
    # communicate with Pipeline(s)
    #
    host = socket.gethostbyname(socket.gethostname())
    port = 8000
    backlog = 128
    bufsize = 16384
    default_bufsize = bufsize
    flag_socket_error = False
    count_socket_error = 0
    time_onesocket = 0.0
    pipeline_connect_established = 1
    flag_dispatch = False

    #
    # Config
    #
    server_timeout = 600.0  # sec
    nsleep_aftersend = 0.0
    nsleep_updatelist = 0.0
    nsleep_afterconnectionerror = 1.0
    nsleep_controller = 1.0

    #
    # read from conf
    #
    if job_args["flag_vge_conf"]:
        port = int(get_vge_conf("socket", "port", port))
        bufsize = int(get_vge_conf("socket", "bufsize", bufsize))
        server_timeout = get_vge_conf("vge", "socket_timeout", server_timeout)
        nsleep_aftersend = get_vge_conf("vge", "socket_interval_send",
                                        nsleep_aftersend)
        nsleep_updatelist = get_vge_conf("vge", "socket_interval_update",
                                         nsleep_updatelist)
        nsleep_afterconnectionerror = get_vge_conf(
            "vge", "socket_interval_error", nsleep_afterconnectionerror)
        nsleep_controller = get_vge_conf("vge", "socket_interval_close",
                                         nsleep_controller)
    #
    logger.info("VGE(JOB): socket host = %s" % host)
    logger.info("VGE(JOB): socket port = %i" % port)
    logger.info("VGE(JOB): socket buffer size = %i" % bufsize)
    logger.info("VGE(JOB): socket_interval_send=%12.5e" % nsleep_aftersend)
    logger.info("VGE(JOB): socket_interval_update=%12.5e" % nsleep_updatelist)
    logger.info("VGE(JOB): socket_interval_close=%12.5e" %
                nsleep_afterconnectionerror)
    logger.info("VGE(JOB): socket_timeout=%12.5e" % server_timeout)
    logger.info("VGE(JOB): socket_interval_close=%12.5e" % nsleep_controller)

    #
    while True:
        server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

        inputs = set([server_sock])
        if not server_connected:
            break
        try:
            server_sock.bind((host, port))
            server_sock.listen(backlog)
            #
            if flag_socket_error == True:
                logger.info("VGE(JOB): ....recovered (%i)" %
                            count_socket_error)
                flag_socket_error = False
                count_socket_error = 0
            #
            while True:
                #
                # wait for connection with Pipeline(s)
                #
                #logger.info("VGE(JOB): try to connect with Pipeline")

                try:
                    inputready, outputready, exceptready = select.select(
                        inputs, [], [], server_timeout)
                except Exception as error:
                    logger.debug("VGE(JOB): check. connection error. [%s] " %
                                 error)

                if not inputready:
                    logger.info("VGE(JOB): connection timed out. [%i sec] " %
                                server_timeout)
                else:
                    pass

                sum_send_time = 0.0
                sum_recv_time = 0.0
                socket_count = 0
                for sock in inputready:
                    time_onesocket = time.time()
                    flag_dispatch = False
                    if sock is server_sock:
                        client, address = server_sock.accept()
                        inputs.add(client)
                    else:
                        #
                        # get a message from Pipeline
                        #
                        socket_count += 1
                        messeage = ""
                        mesg = ""
                        recv_time = time.time()
                        temp = None
                        #
                        #mesg=sock.recv(bufsize)
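                        # same framing as the client side: read until the message
                        # ends with "\n." / "\ns." or the peer closes the socket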
                        icount = 0
                        while True:
                            temp = sock.recv(bufsize)
                            icount += 1
                            mesg += temp
                            if temp == "" or temp is None:
                                break
                            elif temp[-2:] == "\n." or temp[-3:] == "\ns.":
                                break
                            elif temp[-1:] == ".":
                                if mesg[-2:] == "\n." or mesg[-3:] == "\ns.":
                                    break
                        #
                        #logger.info("VGE(JOB): recv mesg size = %i [%s] " %(len(mesg),mesg))
                        #logger.info("VGE(JOB): recv mesg size=%i" %(len(mesg)))
                        recv_time = time.time() - recv_time
                        sum_recv_time += recv_time

                        if len(mesg) > 0:
                            logger.info(
                                "VGE(JOB): received mesg size=%i [%i recv times]"
                                % (len(mesg), icount))
                        try:
                            messeage = pickle.loads(mesg)
                        except EOFError:
                            pass
                        except ValueError:
                            logger.error(
                                "VGE(JOB): Value error. mesg is not received correctly. [bufsize:%i] [mesg:%s]"
                                % (bufsize, mesg))
                        except Exception as error:
                            logger.error(
                                "VGE(JOB): exception error.... check [%s]" %
                                (error))

                        #
                        logger.info(
                            "VGE(JOB): connection with Pipeline was established. [%i] "
                            % pipeline_connect_established)
                        pipeline_connect_established += 1

                        if isinstance(messeage, str):
                            if "finished" in messeage:  # the VGE controller will stop.
                                logger.info(
                                    "VGE(JOB): got a shutdown signal from Pipeline/VGE_CONNECT."
                                )
                                mesg = "byebye"
                                flag_byebye = True
                                send_time = time.time()
                                sock.sendall(pickle.dumps(mesg))
                                send_time = time.time() - send_time
                                sum_send_time += send_time
                                time.sleep(nsleep_aftersend)
                                server_connected = False
                                #server_sock.close()

                            elif "forcedstop" in messeage:  # the VGE controller will be force to stop.
                                logger.info(
                                    "VGE(JOB): got a forced shutdown signal from Pipeline/VGE_CONNECT."
                                )
                                mesg = "byebye!!!"
                                flag_byebye = True
                                flag_forcedbyebye = True
                                #
                                task_check[
                                    "forced"] = True  # this is a forced shutdown signal for the MPI job controller
                                #
                                send_time = time.time()
                                sock.sendall(pickle.dumps(mesg))
                                send_time = time.time() - send_time
                                sum_send_time += send_time
                                time.sleep(nsleep_aftersend)
                                server_connected = False
                                #server_sock.close()

                            elif "hello" in messeage:  # echo hello
                                logger.info(
                                    "VGE(JOB): got a start signal from Pipeline."
                                )
                                mesg = "hello"
                                send_time = time.time()
                                sock.sendall(pickle.dumps(mesg))
                                send_time = time.time() - send_time
                                sum_send_time += send_time
                                time.sleep(nsleep_aftersend)

                            elif "pipeline_ppid_list" in messeage:  #  request Pipeline parent process id list
                                logger.info(
                                    "VGE(JOB): request Pipeline parent process id list from vge_connect."
                                )
                                mesg = ""
                                temp = pipeline_parent_pid_list.copy(
                                )  # this copy is a must because pipeline_parent_pid_list is a multiprocessing module object.
                                send_time = time.time()
                                sock.sendall(pickle.dumps(temp))
                                send_time = time.time() - send_time
                                del temp
                                sum_send_time += send_time
                                time.sleep(nsleep_aftersend)

                            elif "restofjob" in messeage:  #  request Pipeline parent process id list
                                logger.info(
                                    "VGE(JOB): request the number of rest of jobs to be carried out from VGE_CONNECT"
                                )
                                mesg = ""
                                send_time = time.time()
                                sock.sendall(
                                    pickle.dumps(task_check["restofjob"]))
                                send_time = time.time() - send_time
                                sum_send_time += send_time
                                time.sleep(nsleep_aftersend)

                        elif isinstance(messeage, dict):
                            logger.debug(
                                "VGE(JOB): messeage from Pipeline (%s)" %
                                messeage)

                            #
                            # mark pipeline byebye flag
                            #
                            if messeage.has_key("byebye_from_pipeline"):
                                temp = int(messeage["pipeline_parent_pid"])
                                if pipeline_parent_pid_list.has_key(temp):
                                    logger.info(
                                        "VGE(JOB): Pipeline [pid: %i] was completed."
                                        % temp)
                                    pipeline_parent_pid_list[temp] = True
                                else:
                                    logger.info(
                                        "VGE(JOB): Pipeline [pid: %i] was not found in pipeline_parent_pid_list [%s]."
                                        % (temp, pipeline_parent_pid_list))
                            #
                            # mark a new pipeline
                            #
                            elif messeage.has_key("hello_from_pipeline"):
                                temp = int(messeage["pipeline_parent_pid"])
                                if pipeline_parent_pid_list.has_key(temp):
                                    logger.info(
                                        "VGE(JOB): we know this Pipeline [pid: %i].... hello again?"
                                        % (temp))
                                    logger.debug(
                                        "VGE(JOB): pipeline_parent_pid_list ...%s"
                                        % pipeline_parent_pid_list)
                                else:
                                    logger.info(
                                        "VGE(JOB): got a new Pipeline parent pid (got a hello pipeline) [%i]."
                                        % temp)
                                    #
                                    # update Pipeline parent list
                                    pipeline_parent_pid_list[temp] = False
                                    num_of_pipeline_sample += 1
                                    unique_pipeline_parent_id.append(temp)
                                    logger.info(
                                        "VGE(JOB): [update] current number of parent pids of running Pipelines is %i."
                                        % len(pipeline_parent_pid_list.keys()))

                            #
                            # job status inquiry from Pipeline
                            #
                            elif messeage.has_key("request"):
                                pipeline_pid_for_job = -1  # default, in case the job is not listed yet
                                check_unique_jobid = int(
                                    messeage["request"]
                                )  # this should be unique_jobid
                                check_vge_jobid = int(messeage["vge_jobid"])
                                check_max_task = int(messeage["max_task"])
                                check_pipeline_pid = int(
                                    messeage["pipeline_pid"])
                                check_status = ""
                                output = ""
                                output += "VGE(JOB): jobid (%i) max_task(%i) job_check: " \
                                           %(check_unique_jobid, check_max_task)

                                logger.info(
                                    "VGE(JOB): request job status [id:%i] from Pipeline [pid:%i]"
                                    % (check_unique_jobid, check_pipeline_pid))
                                #
                                # check whether the job(s) including bulk jobs for this unique jobid were done or not
                                #
                                check_flag = True
                                #output_running ={}
                                #check_return_code = 0 # # for bulk job, the code is the largest one.
                                check_return_code = 0  # for a bulk job, the last non-zero return code found is reported
                                #
                                check_jobid = int(check_vge_jobid)
                                if check_max_task == 0:
                                    check_count = -1
                                    if total_joblist.has_key(check_jobid):
                                        flag = True
                                        while flag:
                                            try:
                                                check_job = total_joblist[
                                                    check_jobid]
                                                flag = False
                                            except KeyError:
                                                logger.debug(
                                                    "VGE(JOB): (Key) retry to read data."
                                                )  # a little bit too early to check
                                                time.sleep(nsleep_updatelist)
                                                pass
                                            except EOFError:
                                                #logger.debug("VGE(JOB): (EOF) retry to read data.") # a little bit too early to check
                                                time.sleep(nsleep_updatelist)
                                                pass
                                        #
                                        # check Pipeline process id
                                        #
                                        pipeline_pid_for_job = check_job[
                                            "pipeline_pid"]
                                        logger.debug("VGE(JOB): pid(%i) for job(%i) <--> pid(%i) requested"  \
                                                     %(pipeline_pid_for_job, check_jobid,check_pipeline_pid) )
                                        #if pipeline_pid_for_job != check_pipeline_pid:
                                        #    logger.critical("VGE(JOB): Error!!! Error!!! mismatch pid!!! something wrong!!!!")
                                        #    exit()

                                        if check_job[
                                                "status"] == "done" or check_job[
                                                    "status"] == "aborted":
                                            check_count = 0
                                            temp_check_return_code = int(
                                                check_job["return_code"])
                                            if temp_check_return_code != 0:
                                                check_return_code = temp_check_return_code
                                                logger.warning(
                                                    "VGE(JOB): job id[%i] has a non-zero return code[%i] "
                                                    % (check_unique_jobid,
                                                       check_return_code))
                                            del temp_check_return_code
                                            #
                                        else:
                                            #output_running[check_jobid] = check_job
                                            pass  # the job is not carried out yet.
                                else:
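                                    # bulk (array) job: its tasks occupy consecutive
                                    # vge_jobids starting at check_vge_jobid, so walk
                                    # max_task entries of total_joblist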
                                    check_count = 0
                                    count = 0
                                    if total_joblist.has_key(check_jobid):
                                        while check_max_task > count:
                                            check_job = dict()
                                            flag = True
                                            while flag:
                                                try:
                                                    check_job = total_joblist[
                                                        check_jobid]
                                                    flag = False
                                                except KeyError:
                                                    logger.debug(
                                                        "VGE(JOB): (Key) retry to read data."
                                                    )
                                                    time.sleep(
                                                        nsleep_updatelist)
                                                    pass
                                                except EOFError:
                                                    #logger.debug("VGE(JOB): (EOF) retry to read data.")
                                                    time.sleep(
                                                        nsleep_updatelist)
                                                    pass

                                            bulkjob_id = int(
                                                check_job["bulkjob_id"])
                                            #
                                            # check Pipeline process id
                                            #
                                            if count == 0:
                                                pipeline_pid_for_job = check_job[
                                                    "pipeline_pid"]
                                                logger.debug("VGE(JOB): pid(%i) for job(%i) <-->  pid(%i) requested"  \
                                                                   %(pipeline_pid_for_job, check_jobid,check_pipeline_pid) )
                                                #if pipeline_pid_for_job != check_pipeline_pid:
                                                #    logger.critical("VGE(JOB): Error!!! mismatch pid 2!!!")
                                                #    exit()

                                            if check_job[
                                                    "status"] == "done" or check_job[
                                                        "status"] == "aborted":
                                                if check_job[
                                                        "unique_jobid"] == check_unique_jobid:
                                                    #
                                                    check_count += 1
                                                    output += "%03i " % bulkjob_id
                                                    #
                                                    temp_return_code = int(
                                                        check_job[
                                                            "return_code"])
                                                    if temp_return_code != 0:
                                                        logger.warning(
                                                            "VGE(JOB): job id[%i] (bulkjob id[%i]) has a non-zero return code[%s] "
                                                            %
                                                            (check_unique_jobid,
                                                             bulkjob_id,
                                                             temp_return_code))
                                                        check_return_code = temp_return_code
                                                    #
                                                else:
                                                    # this branch should never be reached
                                                    logger.critical(
                                                        "VGE(JOB) something wrong!! check_jobid   (%i)"
                                                        % check_jobid)
                                                    logger.critical(
                                                        "VGE(JOB) something wrong!! check_maxtask (%i)"
                                                        % check_max_task)
                                                    logger.critical(
                                                        "VGE(JOB) something wrong!! check_job     (%s)"
                                                        % check_job)
                                                    for okey, ojob in total_joblist.items(
                                                    ):
                                                        print "total_list: jobid(%04i) unique_jobid(%04i) bulk_id(%04i)" \
                                                               %(okey, ojob["unique_jobid"], ojob["bulkjob_id"])
                                                    time.sleep(30)
                                            else:
                                                # some jobs are still running.
                                                pass
                                            #
                                            check_jobid += 1
                                            count += 1

                                if check_count == check_max_task:  # all jobs for this unique id
                                    output += "\n"
                                    logger.debug(output)
                                    output = "VGE(JOB): CHECK: job [id:%i] has been completed. [total %i]" % (
                                        check_unique_jobid, check_max_task)
                                    logger.info(output)
                                    check_status = "done," + str(
                                        check_return_code
                                    )  # done,0 for normally finished job
                                else:
                                    output += "\n"
                                    logger.debug(output)
                                    output = "VGE(JOB): CHECK: job [id:%i] is not done yet. [total:%i  completed:%i  queued:%i]" \
                                                %(check_unique_jobid, check_max_task, check_count, (check_max_task - check_count))
                                    logger.info(output)
                                    check_status = "wait,0"
                                #
                                # add Pipeline PID for this job
                                check_status += "," + str(pipeline_pid_for_job)
                                #
                                # send jobstatus to Pipeline
                                #
                                logger.info(
                                    "VGE(JOB): send message to Pipeline (%s)"
                                    % check_status)
                                send_time = time.time()
                                sock.sendall(pickle.dumps(check_status))
                                send_time = time.time() - send_time
                                sum_send_time += send_time
                                flag_dispatch = True
                                time.sleep(nsleep_aftersend)

                            #
                            # new job(s) from Pipeline
                            #
                            else:
                                logger.info(
                                    "VGE(JOB): incoming message from Pipeline: %s"
                                    % messeage)
                                if messeage is not None and messeage != "":
                                    messeage[
                                        "unique_jobid"] = unique_jobid  # VGE will reply to Pipeline via unique_jobid
                                    messeage.update(make_vge_jobstatuslist())

                                    #
                                    # update command list
                                    #
                                    current_unique_jobid = unique_jobid
                                    unique_jobid += 1
                                    command_list[command_id] = dict()
                                    temp = dict()
                                    temp["command"] = messeage["command"]
                                    temp["basefilename"] = messeage[
                                        "basefilename"]
                                    temp["max_task"] = messeage["max_task"]
                                    temp["unique_jobid"] = current_unique_jobid
                                    temp["pipeline_pid"] = messeage[
                                        "pipeline_pid"]
                                    temp["pipeline_parent_pid"] = messeage[
                                        "pipeline_parent_pid"]

                                    #
                                    basefilename = messeage["basefilename"]
                                    #

                                    command_list[command_id] = temp
                                    messeage["command_id"] = command_id
                                    command_id += 1
                                    del temp
                                    del messeage["command"]
                                    del messeage["basefilename"]

                                    #logger.debug("VGE(JOB): update command_list: %s" %command_list)

                                    #
                                    # check if pipeline parent pid is new
                                    #
                                    temp = int(messeage["pipeline_parent_pid"])
                                    if not pipeline_parent_pid_list.has_key(
                                            temp):
                                        #
                                        # update Pipeline parent list
                                        #
                                        logger.info(
                                            "VGE(JOB): a job from new Pipeline [pid %i] has come."
                                            % temp)
                                        pipeline_parent_pid_list[temp] = False
                                        num_of_pipeline_sample += 1
                                        unique_pipeline_parent_id.append(temp)
                                        logger.info(
                                            "VGE(JOB): [update] number of parent pids of running Pipelines is %i."
                                            % len(pipeline_parent_pid_list.
                                                  keys()))
                                    else:
                                        logger.debug(
                                            "VGE(JOB): we already know this Pipeline process id [%i]."
                                            % temp)

                                    #
                                    # update new_joblist
                                    #
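                                    # max_task == 0: a single job (script basefilename.sh.0)
                                    # max_task  > 0: a bulk (array) job with one script per
                                    #                task, basefilename.sh.0 .. .sh.(max_task-1)
                                    #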
                                    start_jobid = jobid
                                    if messeage["max_task"] == 0:
                                        filename = basefilename + ".sh.0"
                                        messeage["filename"] = filename
                                        messeage["bulkjob_id"] = 0  #
                                        new_joblist[jobid] = dict()
                                        new_joblist[jobid] = messeage
                                        #
                                        total_joblist[jobid] = dict()
                                        total_joblist[jobid] = messeage
                                        #
                                        #time.sleep(nsleep_updatelist)
                                        jobid += 1
                                    else:  # bulk job
                                        number_of_bulkjob = int(
                                            messeage["max_task"])
                                        for bulkjob_id in range(
                                                number_of_bulkjob):
                                            filename = basefilename + ".sh.%i" % bulkjob_id
                                            messeage["filename"] = filename
                                            messeage["bulkjob_id"] = bulkjob_id
                                            new_joblist[jobid] = dict()
                                            new_joblist[jobid] = messeage
                                            #
                                            total_joblist[jobid] = dict()
                                            total_joblist[jobid] = messeage
                                            #
                                            jobid += 1

                                    #
                                    answer = dict()
                                    answer[
                                        "unique_jobid"] = current_unique_jobid
                                    answer["pipeline_pid"] = messeage[
                                        "pipeline_pid"]
                                    answer["pipeline_parent_pid"] = messeage[
                                        "pipeline_parent_pid"]
                                    answer[
                                        "vge_jobid"] = start_jobid  # this is vge_jobid which Pipeline wants to know
                                    #
                                    # send the answer (unique jobid and vge jobid) back to Pipeline
                                    #
                                    send_time = time.time()
                                    sock.sendall(pickle.dumps(answer))
                                    send_time = time.time() - send_time
                                    flag_dispatch = True
                                    sum_send_time += send_time
                                    time.sleep(nsleep_aftersend)
                                    logger.info(
                                        "VGE(JOB): reply to Pipeline: [%s] " %
                                        answer)
                                    #
                                    #
                                else:
                                    #
                                    # send the unique jobid to Pipeline
                                    #
                                    send_time = time.time()
                                    sock.sendall(pickle.dumps(unique_jobid))
                                    send_time = time.time() - send_time
                                    sum_send_time += send_time
                                    time.sleep(nsleep_aftersend)

                        #
                        #  close connection with Pipeline
                        #
                        sock.close()
                        inputs.remove(sock)
                        if flag_byebye:
                            break
                #
                #
                time_onesocket = time.time() - time_onesocket
                if flag_dispatch:
                    if socket_count == 1:
                        logger.info("VGE(JOB): communication with Pipeline: elapsed times -> mesg-recv[%8.3e]/send[%8.3e] 1-socket cycle[%8.3e] sec" \
                                    %(recv_time, send_time, time_onesocket))
                    #
                    elif socket_count > 1:
                        logger.info("VGE(JOB): communication with %i Pipeline(s) : averaged elapsed times -> mesg-recv[%8.3e]/send[%8.3e] 1-socket cycle[%8.3e] sec" \
                                    %(socket_count, sum_recv_time/socket_count, sum_send_time/socket_count, time_onesocket))
                    #
                if flag_byebye:
                    break
Exemplo n.º 5
0
def mpi_jobcontroll_worker(cl_args, mpi_args, comm):

    #
    # master rank
    #
    master_rank = 0

    #
    # MPI message tag
    #
    message_tag = 10  # job message from the master
    forcedstop_tag = 99  # forced stop message from the master

    #
    flag_forcedstop = False

    #
    # set verbose level
    #
    #
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    #
    verbose = 0
    verbose = int(get_vge_conf("vge", "verbose", verbose))
    if verbose == 0:
        logger.setLevel(CRITICAL)
    elif verbose == 1:
        logger.setLevel(INFO)
    elif verbose == 2:
        logger.setLevel(CRITICAL)
    elif verbose == 3:
        logger.setLevel(DEBUG)
    basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
    #
    myrank = comm.Get_rank()
    nproc = comm.Get_size()
    number_of_nproc_digit = int(math.log10(nproc) + 1)
    myrank_str = str(myrank).zfill(number_of_nproc_digit)
    status = MPI.Status()
    #
    max_sizecommand = 131071  # for generic Linux. for the K computer, the max size is 262143 bytes (less than 256KB).
    max_sizecommand = int(
        get_vge_conf("vge", "mpi_command_size", max_sizecommand))
    logger.info("VGE(MPI): worker[%s]: max. command size = %i" %
                (myrank_str, max_sizecommand))

    # job status
    jobstatus = None
    logger.info("VGE(MPI): MPI worker[%s] is running." % myrank_str)
    command = ""
    ret = 0  # return code from subprocess

    # probe parameters ; not used in workers
    #number_of_probe = 1000
    #nsleep_probe=0.01

    # interval between irecv test() checks on the worker for killing a running job. effective only if the monitor_worker_runjob option is on.
    time_interval_irecv_test = 30.0
    time_interval_irecv_test = float(
        get_vge_conf("vge", "worker_interval_irecv_test",
                     time_interval_irecv_test))

    # pause after each check of a running job [sec]. effective only if the monitor_worker_runjob option is on.
    time_interval_runjobcheck = 10.0
    time_interval_runjobcheck = float(
        get_vge_conf("vge", "worker_interval_runjobcheck",
                     time_interval_runjobcheck))

    # monitor the running job on the worker for an emergency call from the master?
    flag_monitor_worker_runjob = False
    flag_killthejob = False
    if cl_args.monitor_workerjob:
        flag_monitor_worker_runjob = True
        logger.info(
            "VGE(MPI): MPI worker[%s] will monitor a job when it is running." %
            myrank_str)
        logger.info(
            "VGE(MPI): MPI worker[%s] [job monitoring] worker_interval_irecv_test : [%0.2f sec]."
            % (myrank_str, time_interval_irecv_test))
        logger.info(
            "VGE(MPI): MPI worker[%s] [job monitoring] worker_interval_runjobcheck: [%0.2f sec]."
            % (myrank_str, time_interval_runjobcheck))
        #
        # monitor a forced-stop signal of a running job ...
        #
        emergency_signal_recv = comm.irecv(source=master_rank,
                                           tag=forcedstop_tag)
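        # the non-blocking receive is posted once here and polled with
        # .test() inside the job loop, so a running job can be killed
        # without blocking the worker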
        #

    #//////////////////////////
    # job loop for worker nodes
    #//////////////////////////
    while True:

        #
        # request a job and send the previous job status to the master
        #
        start_job = time.time()

        start_time = time.time()
        #comm.send(jobstatus, dest=0, tag=myrank)
        comm.send(jobstatus, dest=0, tag=message_tag)
        end_time = time.time()
        mpi_send_time = end_time - start_time
        if jobstatus is not None:
            logger.info(
                "VGE(MPI): worker[%s]: send job status to the master: [%s]" %
                (myrank_str, jobstatus))

        #
        # init
        #
        flag_forcedstop = False

        #
        # get an order from the master
        #
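        # the order is either a job package (a dict with "command", "filename",
        # "jobid" and "bulkjob_id") or the string "terminate_all_workers"
        #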
        start_time = time.time()
        #package = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
        package = comm.recv(source=0, tag=message_tag, status=status)
        end_time = time.time()
        mpi_recv_time = end_time - start_time

        command = None
        filename = None
        jobid = None
        bulkjob_id = None
        if isinstance(package, dict):
            command = package["command"]
            filename = package["filename"]
            jobid = package["jobid"]
            bulkjob_id = package["bulkjob_id"]
            if command is not None:
                logger.info(
                    "VGE(MPI): worker[%s]: got job[id %i] from the master: [%s]"
                    % (myrank_str, jobid, package))
        elif isinstance(package, str):
            if package == "terminate_all_workers":
                break  # terminate worker loop
        #
        # execute job command
        #
        ret = 0
        start_time = None
        finish_time = None
        jobstatus = None
        if command is not None:
            logger.info("VGE(MPI): worker[%s]: job [id %i] has been started." %
                        (myrank_str, jobid))
            start_time = datetime.datetime.now()
            time_command = time.time()

            # check command size
            sizecommand = len(command)
            flag_execfromscript = False

            #
            # check command size
            #
            if sizecommand > max_sizecommand:
                flag_execfromscript = True

                logger.info("VGE(MPI): worker[%s]: command size [%i bytes] of job [id %i] is large. [thresh=%i]" \
                          %(myrank_str,sizecommand, jobid, max_sizecommand))
                logger.info(
                    "VGE(MPI): worker[%s]: job [id %i] will be executed from script file directly."
                    % (myrank_str, jobid))
                logger.info("VGE(MPI): worker[%s]: .... write file -> [%s]" %
                            (myrank_str, filename))
                #
                if cl_args.nowrite_jobscript:
                    logger.info(
                        "VGE(MPI): worker[%s]: but the file will be deleted later because the '--nowrite_jobscript' option is on"
                        % myrank_str)
                #
                vge_script_dir = mpi_args["vge_script_dir"]
                filename_script = vge_script_dir + "/" + filename
                #
                time_write2 = time.time()
                #
                writefile = open(filename_script, "w")
                writefile.write(command)
                writefile.flush()
                writefile.close()
                time_write2 = time.time() - time_write2
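                # wait until the script file is visible and mostly flushed
                # (e.g. on a shared filesystem) before handing it to bash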
                while True:
                    if os.path.isfile(filename_script):
                        statinfo = os.stat(filename_script)
                        if int(statinfo.st_size) > sizecommand / 2:
                            break
                    else:
                        time.sleep(0.00005)  # 50 us wait
                filename_stdout = vge_script_dir + "/" + filename + ".o" + str(
                    jobid)
                filename_stderr = vge_script_dir + "/" + filename + ".e" + str(
                    jobid)
                command = "bash %s > %s 2> %s" % (
                    filename_script, filename_stdout, filename_stderr)

            env = os.environ.copy()
            if bulkjob_id is not None:
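                # emulate a Grid Engine array job: SGE_TASK_ID is 1-based,
                # so bulkjob_id 0 becomes task id 1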
                env["SGE_TASK_ID"] = str(bulkjob_id + 1)

            #
            # popen
            #
            time_popen = time.time()
            p = subprocess.Popen(command, shell=True, bufsize=-1, stdin=None, stdout=subprocess.PIPE, \
                                    stderr=subprocess.PIPE, env=env)
            time_popen = time.time() - time_popen
            #
            jobpid = p.pid
            logger.info("VGE(MPI): worker[%s]: pid of job [id %i] ==> [%i]." %
                        (myrank_str, jobid, jobpid))

            #
            # wait until the job is completed.
            #
            wait_count = 0
            stdout = ""
            stderr = ""
            time_pcomm = time.time()
            #
            if flag_monitor_worker_runjob:
                #
                time_check = time.time()
                #
                flag_running_job = True
                while flag_running_job:
                    current_interval = time.time() - time_check
                    if current_interval > time_interval_irecv_test:
                        time_check = time.time()
                        #
                        check_recv = emergency_signal_recv.test(
                        )  # req.test() returns a (flag, message) tuple, while req.Test() returns only a boolean.
                        logger.debug(
                            "VGE(MPI): worker[%s]: check_recv   ... %s " %
                            (myrank_str, check_recv))
                        #
                        if check_recv[
                                0] is True:  # master is calling me! kill the job!
                            #
                            mesg = check_recv[1]  # get a message
                            logger.debug(
                                "VGE(MPI): worker[%s]: got a forced stop message [%s] from master when job [%i] is running"
                                % (myrank_str, mesg, jobid))
                            logger.info(
                                "VGE(MPI): worker[%s]: job [%i] (pid [%i]) was aborted. "
                                % (myrank_str, jobid, jobpid))
                            #
                            try:
                                os.kill(
                                    jobpid, signal.SIGTERM
                                )  # note: OS dependent. Should be checked on the K computer.
                            except Exception, error:
                                logger.error(
                                    "VGE(MPI): worker[%s]: os.kill raised an error... [%s] "
                                    % (myrank_str, error))
                                pass
                            #
                            flag_forcedstop = True
                            flag_running_job = False
                            break  # escape from poll() check loop
                            #

                    if p.poll() is not None:
                        flag_running_job = False
                        break  # escape from poll() check loop
                    else:
                        time.sleep(time_interval_runjobcheck)
                        wait_count += 1

                #
                # job was finished.
                #
                #(stdout, stderr) = p.communicate()
                #ret = p.returncode
                if flag_forcedstop:
                    ret = -99  # stop by emergency call from master
                    stdout = "forced stop \n"
                else:
                    (stdout, stderr) = p.communicate()
                    ret = p.returncode

                #
                #
                logger.debug(
                    "VGE(MPI): worker[%s]: result of job [%i] ... stdout:[%s] stderr:[%s] ret:[%s] "
                    % (myrank_str, jobid, stdout, stderr, ret))
                #
                logger.info(
                    "VGE(MPI): worker[%s]: wait count of job [id %i] ==> [%i]."
                    % (myrank_str, jobid, wait_count))
                #
                #
            else:
                #
                # get standard and error outputs of the job
                #
                (stdout, stderr) = p.communicate()
                #
                # get a return code
                #
                ret = p.returncode
                #
                logger.debug(
                    "VGE(MPI): worker[%s]: result of job [%i] ... stdout:[%s] stderr:[%s] ret:[%s] "
                    % (myrank_str, jobid, stdout, stderr, ret))
            #
            time_pcomm = time.time() - time_pcomm
            #

            if stdout is None:
                stdout = ""
            if stderr is None:
                stderr = ""

            # the overhead for os module was the same as that for subprocess...
            #os.system("sh ./%s" %filename)

            time_command = time.time() - time_command
            finish_time = datetime.datetime.now()
            logger.info(
                "VGE(MPI): worker[%s]: job [id %i] is completed with code [%i]."
                % (myrank_str, jobid, ret))
            #
            jobstatus = {}
            jobstatus["jobid"] = jobid
            jobstatus["start_time"] = start_time
            jobstatus["finish_time"] = finish_time
            jobstatus["elapsed_time"] = (finish_time -
                                         start_time).total_seconds()

            #
            # return code (shell)
            #    -15: aborted
            #    000: normally finished
            #    002: No such file or directory
            #    127: command not found
            #    139: segmentation fault (in shirokane)
            jobstatus["return_code"] = ret
            #
            #
            if ret < 0:
                jobstatus["status"] = "aborted"
            else:
                jobstatus["status"] = "done"

            #
            # write script, standard output, and standard error
            #
            time_write = time.time()
            if cl_args.nowrite_jobscript or flag_execfromscript:
                pass  # skip writing the files...
                #
                # delete the files written in the large-command path...
                #
                if cl_args.nowrite_jobscript and flag_execfromscript:
                    os.remove(filename_script)
                    os.remove(filename_stdout)
                    os.remove(filename_stderr)

            else:
                logger.info("VGE(MPI): worker[%s]: write files -> [%s]" %
                            (myrank_str, filename))
                vge_script_dir = mpi_args["vge_script_dir"]
                #
                filename_script = vge_script_dir + "/" + filename
                try:
                    writefile = open(filename_script, "w")
                    writefile.write(command)
                    writefile.close()
                except Exception, error:
                    logger.error(
                        "VGE(MPI): worker[%s]: write error [script] [%s] [%s] [%s]"
                        % (myrank_str, filename, error, command))
                    pass

                #
                filename_stdout = vge_script_dir + "/" + filename + ".o" + str(
                    jobid)
                try:
                    writefile = open(filename_stdout, "w")
                    writefile.write(stdout)
                    writefile.close()
                except Exception, error:
                    logger.error(
                        "VGE(MPI): worker[%s]: write error [stdout] [%s] [%s] [%s]"
                        % (myrank_str, filename, error, stdout))
                    pass
                #
                filename_stderr = vge_script_dir + "/" + filename + ".e" + str(
                    jobid)
                try:
                    writefile = open(filename_stderr, "w")
                    writefile.write(stderr)
                    writefile.close()
                except Exception, error:
                    logger.error(
                        "VGE(MPI): worker[%s]: write error [stderr] [%s] [%s] [%s]"
                        % (myrank_str, filename, error, stderr))
                    pass
Exemplo n.º 6
0
def checkpoint_restart(target_dir):

    elapsed_time = time.time()
    #    init_restart_conf()

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    #    verbose = int(get_restart_conf("verbose", 0))
    verbose = int(get_vge_conf("vge", "verbose", 0))
    if verbose == 0:
        logger.setLevel(CRITICAL)
    elif verbose == 1 or verbose == 2:
        logger.setLevel(INFO)
    elif verbose == 3:
        logger.setLevel(DEBUG)
    basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
    pname = "Pipeline(CHECKPOINT RESTART)"
    process_id = os.getpid()
    logger.info("%s pid[%i]: checkpoint judgment and reading start task " %
                (pname, process_id))

    #    chk_file = int(get_restart_conf("chk_file", 2))
    chk_file = int(get_vge_conf("restart", "chk_file", 2))
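    # chk_file: consistency-check level applied to the restored data
    #   0: no check
    #   1: every file recorded in the checkpoint must exist in the backup data
    #   2: each file must also match its recorded size (broken symlinks are repaired)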

    # common parameters
    global chk_work_dir
    global chk_target_dir
    global chk_checkpoint_dir

    work_dir_path = os.path.dirname(target_dir)
    work_dir_path = os.path.abspath(work_dir_path)
    work_dir_path = os.path.normpath(work_dir_path)
    checkpoint_dir_path = os.path.join(target_dir, "checkpoint")
    checkpoint_dir_path = os.path.abspath(checkpoint_dir_path)
    checkpoint_dir_path = os.path.normpath(checkpoint_dir_path)
    target_dir_path = os.path.abspath(target_dir)
    target_dir_path = os.path.normpath(target_dir_path)
    chk_work_dir = work_dir_path
    chk_target_dir = target_dir_path
    chk_checkpoint_dir = checkpoint_dir_path

    restartdata_dir = os.path.join(chk_checkpoint_dir, "backupdata")
    restartdata_dir = os.path.abspath(restartdata_dir)
    restartdata_dir = os.path.normpath(restartdata_dir)

    # reconst_checkpoint
    flag_skiptask = {}
    flag_starttask = {}
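    # flag_skiptask : tasks whose checkpoints were already written (skipped on restart)
    # flag_starttask: task(s) from which the pipeline should resume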

    if not os.path.isdir(chk_checkpoint_dir):
        logger.info("%s pid[%i]: normal start" % (pname, process_id))
        elapsed_time = time.time() - elapsed_time
        logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]" %
                    (pname, process_id, elapsed_time))
        return flag_skiptask, flag_starttask, {}

    # checkpoint master (.mst) files
    masterfile_list = []
    # build the list of checkpoint master files
    for filename in os.listdir(chk_checkpoint_dir):
        basename, ext = os.path.splitext(filename)
        if ".mst" == ext:
            masterfile_list.append(filename)

    # if there is no checkpoint master file at all,
    # do not restart
    if not masterfile_list:
        logger.info("%s pid[%i]: not restart, checkpoint start" %
                    (pname, process_id))
        elapsed_time = time.time() - elapsed_time
        logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]" %
                    (pname, process_id, elapsed_time))
        return flag_skiptask, flag_starttask, {}

    set_task_list = []
    set_checkpointfiles = {}
    dict_route = {}
    for mstfile in masterfile_list:
        chk_masterfile = os.path.join(chk_checkpoint_dir, mstfile)
        logger.debug("%s pid[%i]: checkpoint master file path: %s" %
                     (pname, process_id, chk_masterfile))

        if os.path.isfile(chk_masterfile):
            try:
                checkpoint_master = open(chk_masterfile, "rb")
                logger.info(
                    "%s pid[%i]: restart start, open checkpoint master" %
                    (pname, process_id))
                tmplist = checkpoint_master.readlines()
                checkpoint_master.close()
            except IOError as e:
                logger.info("%s pid[%i]: error was masterfile open [%s]. " %
                            (pname, process_id, str(e)))
                exit()
        else:
            logger.info("%s pid[%i]: not restart, checkpoint start" %
                        (pname, process_id))
            exit()

        # inspect the contents of the checkpoint master data:
        #   the first column is the task that wrote the checkpoint,
        #   the second column is the path to the checkpoint data,
        #   if the first column is "final", do nothing
        for line in tmplist:
            line_list = line.split(",")
            task = line_list[0]
            set_task_list.append(task)
            # on the first pass the list has length 1, so without this "if" the elif below would raise an error
            if (len(set_task_list) == 1):
                set_checkpointfiles[task] = [line_list[1].strip()]
            # true if the previous task name is the same as the current one
            elif (set_task_list[-2] == set_task_list[-1]):
                set_checkpointfiles[task].append(line_list[1].strip())
            else:
                set_checkpointfiles[task] = [line_list[1].strip()]
        if set_task_list[-1] != "final":
            logger.debug("%s pid[%i]: set_task_list: %s" %
                         (pname, process_id, set_task_list))
            flag_starttask[set_task_list[-1]] = True

            # build a dict that maps the task name to its route tag
            # dict_route[task] = the route-tag part of the master filename (characters 0-10 of the basename are "checkpoint_")
            basename, ext = os.path.splitext(mstfile)
            dict_route[task] = basename[11:]

    # set the flags based on the master file information
    for task in set_task_list:
        # skip a task whose checkpoint has already been written
        # checkpointtag = the task-name part of the checkpoint filename (characters 0-10 of the basename are "checkpoint_")
        for checkpointpath in set_checkpointfiles[task]:
            basename, ext = os.path.splitext(os.path.basename(checkpointpath))
            checkpointtag = basename[11:]
            flag_skiptask[checkpointtag] = True

    # read checkpointdata
    # verify_checkpoint
    restart_files_list_dict = {}
    for task, startflag in flag_starttask.items():
        new_outputfiles_list = []

        for checkpointpath in set_checkpointfiles[task]:
            # fix up the path to the checkpoint data
            chkpoint_filename = os.path.basename(checkpointpath)
            checkpoint_absdir = os.path.abspath(chk_checkpoint_dir)
            checkpoint_absdir = os.path.normpath(checkpoint_absdir)
            new_checkpoint_filepath = os.path.join(checkpoint_absdir,
                                                   chkpoint_filename)
            new_checkpoint_filepath = os.path.normpath(new_checkpoint_filepath)
            readfilename = new_checkpoint_filepath

            checkpointdata = None
            binalyfile = None

            # open the checkpoint data
            try:
                binalyfile = open(readfilename, "rb")
                loaded_package = pickle.load(binalyfile)
                binalyfile.close()
                checkpointdata = pickle.loads(loaded_package)
            except IOError as e:
                exit()

            # file consistency check
            if chk_file == 0:
                pass

            elif chk_file == 1:
                restart_input_files_set = set()
                for root, dirs, files in os.walk(restartdata_dir + "_" +
                                                 dict_route[task]):
                    target_dirname = root[len(restartdata_dir + "_" +
                                              dict_route[task]) + 1:]
                    #if target_dirname is not "":
                    if target_dirname is "":
                        target_dirname = "."

                        #for filename in files:
                        #    restart_input_files_set.add(target_dirname + "/" + filename)
                    for filename in files:
                        restart_input_files_set.add(target_dirname + "/" +
                                                    filename)

                if not (set(checkpointdata["targetdirdict"]["filename_list"])
                        <= restart_input_files_set):
                    logger.warning(
                        "%s pid[%i]: (VERIFY WARNING) can not restart because file size mismatch...."
                        % (pname, process_id))
                    logger.debug(
                        "%s pid[%i]: (VERIFY WARNING) restart_input_files_set: %s"
                        % (pname, process_id, restart_input_files_set))
                    logger.debug(
                        "%s pid[%i]: (VERIFY WARNING) set(checkpointdata[\"targetdirdict\"][\"filename_list\"]): %s"
                        %
                        (pname, process_id,
                         set(checkpointdata["targetdirdict"]["filename_list"]))
                    )
                    exit()

            elif chk_file == 2:
                for i in range(
                        len(checkpointdata["targetdirdict"]["filename_list"])):
                    restart_data_file = os.path.join(
                        restartdata_dir + "_" + dict_route[task],
                        checkpointdata["targetdirdict"]["filename_list"][i])
                    if os.path.islink(
                            restart_data_file
                    ) and not os.path.isfile(restart_data_file):
                        # repair the target of the broken symbolic link
                        current_dir = os.getcwd()
                        current_dir_len = len(current_dir.split("/"))
                        realpath = os.path.realpath(restart_data_file)
                        # replace the current-directory part of the path (workaround for the K computer)
                        realpath_split = realpath.split("/")
                        dircount = 0
                        new_path = ""
                        for dirname in realpath_split:
                            dircount = dircount + 1
                            if dircount > current_dir_len:
                                new_path = new_path + "/" + dirname
                        new_src_file = os.path.normpath(current_dir + new_path)
                        os.unlink(restart_data_file)
                        os.symlink(new_src_file, restart_data_file)
                        logger.debug(
                            "%s pid[%i]: re symbolic link %s to %s" %
                            (pname, process_id, restartdata_dir, new_src_file))
                    if not os.path.isfile(restart_data_file):
                        logger.warning(
                            "%s pid[%i]: (VERIFY WARNING) can not restart because 'restart_data_file' does not exist...."
                            % (pname, process_id))
                        logger.debug(
                            "%s pid[%i]: (VERIFY WARNING) restart_data_file: %s"
                            % (pname, process_id, restart_data_file))
                        exit()
                    if os.stat(restart_data_file).st_size != checkpointdata[
                            "targetdirdict"]["size_list"][i]:
                        logger.warning(
                            "%s pid[%i]: (VERIFY WARNING) can not restart because file size does not match"
                            % (pname, process_id))
                        logger.debug(
                            "%s pid[%i]: (VERIFY WARNING) restart_data_file: %s"
                            % (pname, process_id, restart_data_file))
                        exit()

            # build the outputfiles to pass as input to the next task (rewrite the paths stored in the checkpoint data)
            ret = 0
            chkpointd_outputfiles_list = checkpointdata["outputfiles"]
            chkpointd_target_dir = checkpointdata["targetdir"]

            # examine each element of the previous run's output; an element is either a list (multiple files) or a string (a single file)
            for x in chkpointd_outputfiles_list:
                if type(x) is list:
                    new_outputfiles_list2 = []
                    # each element of x is a single file path; convert them to new paths and append them to the list
                    for output_file_path in x:
                        new_file_name = os.path.join(
                            chk_target_dir,
                            output_file_path[len(chkpointd_target_dir) + 1:])
                        new_file_name = os.path.abspath(new_file_name)
                        new_file_name = os.path.normpath(new_file_name)
                        new_outputfiles_list2.append(new_file_name)
                    new_outputfiles_list.append(new_outputfiles_list2)

                else:
                    # x is a single file path; convert it to a new path and append it to the list
                    new_file_name = os.path.join(
                        chk_target_dir, x[len(chkpointd_target_dir) + 1:])
                    new_file_name = os.path.abspath(new_file_name)
                    new_file_name = os.path.normpath(new_file_name)
                    new_outputfiles_list.append(new_file_name)

            restart_files_list_dict[dict_route[task]] = new_outputfiles_list
            logger.debug("%s pid[%i]: new_outputfiles_list: %s" %
                         (pname, process_id, new_outputfiles_list))
            logger.debug("%s pid[%i]: task: %s" % (pname, process_id, task))
            logger.debug("%s pid[%i]: dict_route: %s" %
                         (pname, process_id, dict_route[task]))

        # relocate the backed-up files needed for restart
        backup_dirs = os.listdir(restartdata_dir + "_" + dict_route[task])
        for backup_dir in backup_dirs:
            src_backup_dir = os.path.join(
                restartdata_dir + "_" + dict_route[task], backup_dir)
            args = ["cp", "-lr", src_backup_dir, target_dir]
            ret = subprocess.call(args)
            logger.warning("%s pid[%i]: (WRITE WARNING) restored backup file, subprocess.call retcode %d" \
                       %(pname,process_id,ret))
            logger.debug("%s pid[%i]: src_backup_dir: %s" %
                         (pname, process_id, src_backup_dir))

    elapsed_time = time.time() - elapsed_time
    logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]" %
                (pname, process_id, elapsed_time))

    return flag_skiptask, flag_starttask, restart_files_list_dict