Example #1
def main():
    histogram = sys.argv[1]
    metadata_file = histogram + '.metadata'
    metadata_txt = get_histogram_metadata(histogram)
    run_cmd("echo '%s' > '%s'" % (metadata_txt, metadata_file))

    print("Created metadata file %s with contents: \n%s" %
          (metadata_file, metadata_txt))
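
Every example in this collection calls a run_cmd() helper (Examples #34 and #38 import it from tthAnalysis.HiggsToTauTau.jobTools). Its implementation is not part of this collection; the sketch below is only an approximation whose argument names (do_not_log, stdout_file, stderr_file, return_stderr) are inferred from the call sites and should be treated as assumptions.

import logging
import subprocess

def run_cmd(command, do_not_log=False, stdout_file=None, stderr_file=None, return_stderr=False):
    """Sketch: run a shell command and return its stdout (and optionally its stderr)."""
    if not do_not_log:
        logging.info("Executing: %s" % command)
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    stdout, stderr = process.communicate()
    # Assumption: stdout_file / stderr_file are open file objects used to mirror the output into log files
    if stdout_file:
        stdout_file.write("%s\n%s" % (command, stdout))
    if stderr_file:
        stderr_file.write(stderr)
    if return_stderr:
        return stdout, stderr
    return stdout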
Example #2
 def run(self):
     """Runs all Ntuple production jobs -- either locally or on the batch system.
     """
     record_software_state(self.sw_ver_file_cfg, self.sw_ver_file_out, DEPENDENCIES)
     run_cmd(
         "make -f %s -j %i 2>%s 1>%s" % \
         (self.makefile, self.num_parallel_jobs, self.stderr_file_path, self.stdout_file_path),
         False
     )
Example #3
 def get_scratch_dir(self):
     scratch_dir = "/scratch/%s" % getpass.getuser()
     if not os.path.exists(scratch_dir):
         print "Directory '%s' does not yet exist, creating it !!" % scratch_dir
         run_cmd(command_create_scratchDir)
     scratch_dir = os.path.join(
         scratch_dir,
         "tthAnalysis" + "_" + date.today().isoformat()
     )
     create_if_not_exists(scratch_dir)
     return scratch_dir
Example #4
def main():
    input_files = sys.argv[1:len(sys.argv)]

    print("<check_that_histograms_are_valid.py>: input files = '%s'" % " ".join(input_files))

    run_cmd('sleep 20')

    for input_file in input_files:
        check_that_histogram_is_valid(input_file)

    print("All input files are ok.")
    sys.exit(0)
Example #5
    def submit_job_version2(
        self,
        task_name=None,
        command=None,
        output_dir=None
    ):
        '''
            This method is similar to submitJob, but has fewer required parameters.
            It supports multiple lines of Bash commands instead of a fixed one-liner.
        '''

        print("SBatchManager#hadd_on_cluster_node(task_name=%s, command=%s, output_dir=%s)" % (
            task_name, command, command))

        if not self.workingDir:
            raise ValueError(
                "Please call 'setWorkingDir' before calling 'submitJob' !!")

        scratch_dir = self.get_scratch_dir()

        # Create script for executing jobs

        script_file = output_dir + "/cfgs/" + task_name + ".sh"
        wrapper_log_file = output_dir + "/logs/" + task_name + "_wrapper.log"
        executable_log_file = output_dir + "/logs/" + task_name + "_executable.log"

        run_cmd("mkdir -p '%s'" % (output_dir + "/cfgs/"))
        run_cmd("mkdir -p '%s'" % (output_dir + "/logs/"))

        sbatch_command = "%s --partition=%s --output=%s %s" % (
            self.command_submit,  # "sbatch"
            self.queue,
            wrapper_log_file,
            script_file
        )
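        # With illustrative values (self.command_submit being "sbatch" as noted above, queue "main"),
        # the rendered submission command reads roughly:
        #   sbatch --partition=main --output=<output_dir>/logs/<task_name>_wrapper.log <output_dir>/cfgs/<task_name>.sh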

        script = jinja2.Template(submit_job_version2_template).render(
            command=command,
            working_dir=self.workingDir,
            scratch_dir=scratch_dir,
            wrapper_log_file=wrapper_log_file,
            executable_log_file=executable_log_file,
            sbatch_command=sbatch_command
        )
        print "writing sbatch script file = '%s'" % script_file
        with codecs.open(script_file, "w", "utf-8") as f:
            f.write(script)

        # Run command

        sbatch_command_result = run_cmd(sbatch_command)
        job_id = sbatch_command_result.split()[-1]
        self.jobIds.append(job_id)
Example #6
 def get_job_dir(self):
     if self.use_home:
         prefix = os.path.join('/home', getpass.getuser(), 'jobs')
     else:
         prefix = os.path.join('/scratch', getpass.getuser())
         if not hdfs.isdir(prefix):
             run_cmd('/scratch/mkscratch')
     job_dir = os.path.join(
         prefix,
         "%s_%s" % (self.analysisName, datetime.date.today().isoformat()),
     )
     return job_dir
Example #7
 def get_scratch_dir(self):
     scratch_dir = "/scratch/%s" % getpass.getuser()
     if not os.path.exists(scratch_dir):
         logging.info("Directory '%s' does not yet exist, creating it !!" %
                      scratch_dir)
         run_cmd(command_create_scratchDir)
     scratch_dir = os.path.join(
         scratch_dir,
         "%s_%s" % (self.analysisName, datetime.date.today().isoformat()),
     )
     create_if_not_exists(scratch_dir)
     return scratch_dir
Example #8
def executable_hadd_in_cluster_spec():

    # Prepare

    run_cmd("rm -rf %(temp_dir)s/executable_hadd_in_cluster_spec/*" % config)
    run_cmd("mkdir -p %(temp_dir)s/executable_hadd_in_cluster_spec/" % config)
    run_cmd("""echo "%(fixtures_dir)s/histogram_1.root\n%(fixtures_dir)s/histogram_2.root\n" > """ \
            """%(temp_dir)s/executable_hadd_in_cluster_spec/input_histograms_list.txt""" % config)


    # Run task

    run_cmd('python %(scripts_dir)s/hadd_in_cluster.py ' \
            '%(temp_dir)s/executable_hadd_in_cluster_spec/output_histogram.root ' \
            '%(temp_dir)s/executable_hadd_in_cluster_spec/input_histograms_list.txt' % config)


    # Check the result

    root_result_file = '%(temp_dir)s/executable_hadd_in_cluster_spec/output_histogram.root' % config
    result_successful = os.path.isfile(root_result_file)


    # Output result

    if result_successful:
        print('PASSED: Executable for HADD in cluster is WORKING')
    else:
        print('FAILED: Executable for HADD in cluster is NOT WORKING')

    return result_successful
Example #9
def executable_hadd_in_cluster_spec():

    # Prepare

    run_cmd("rm -rf /home/%(user)s/tmp/executable_hadd_in_cluster_spec/*" % config)
    run_cmd("mkdir -p /home/%(user)s/tmp/executable_hadd_in_cluster_spec/" % config)
    fixtures_dir = '/home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/tthAnalysis/HiggsToTauTau/specification/fixtures/' % config
    run_cmd("""echo "%(fixtures_dir)s/histogram_1.root\n%(fixtures_dir)s/histogram_2.root\n" > /home/%(user)s/tmp/executable_hadd_in_cluster_spec/input_histograms_list.txt""" % { 'fixtures_dir': fixtures_dir, 'user': config['user'] })


    # Run task

    run_cmd('python /home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/tthAnalysis/HiggsToTauTau/scripts/hadd_in_cluster.py /home/%(user)s/tmp/executable_hadd_in_cluster_spec/output_histogram.root /home/%(user)s/tmp/executable_hadd_in_cluster_spec/input_histograms_list.txt' % config)


    # Check the result

    root_result_file = '/home/%(user)s/tmp/executable_hadd_in_cluster_spec/output_histogram.root' % config
    result_successful = os.path.isfile(root_result_file)


    # Output result

    if result_successful:
        print('Executable for HADD in cluster is WORKING')
    else:
        print('Executable for HADD in cluster is NOT WORKING')

    return result_successful
Example #10
def call_histogram_aggregation_on_cluster_node_spec():

    # Prepare

    run_cmd("rm -rf %(temp_dir)s/call_histogram_aggregation_on_cluster_node" % config)
    run_cmd("mkdir -p %(temp_dir)s/call_histogram_aggregation_on_cluster_node/" % config)


    # Add histograms and run task

    pool_id = uuid.uuid4()
    m = sbatchManager(pool_id)
    m.setWorkingDir('%(cmssw_base)s/src/analysis2mu1b1j/analysis2mu1b1j/test' % config)

    try:
        m.hadd_in_cluster(
            inputFiles=[
                '%(fixtures_dir)s/histogram_1.root' % config,
                '%(fixtures_dir)s/histogram_2.root' % config
            ],
            outputFile='%(temp_dir)s/call_histogram_aggregation_on_cluster_node/result.root' % config
        )

        m.waitForJobs()
    except:
        return False


    # Check result

    root_result_file = '%(temp_dir)s/call_histogram_aggregation_on_cluster_node/result.root' % config
    root_file_exists = os.path.isfile(root_result_file)

    if not root_file_exists:
        print('FAILED: HADD on cluster node failed - file is missing')
        return False

    histogram_metadata_file = root_result_file + '.metadata'
    root_file_metadata_txt = run_cmd('cat %s' % histogram_metadata_file)

    expected_metadata_txt = "events_count: 3629292.0"

    if root_file_metadata_txt.find(expected_metadata_txt) == -1:
        print('FAILED: Metadata "%s" is not correct, should be "%s"' % (root_file_metadata_txt, expected_metadata_txt))
        return False

    print('PASSED: HADD on cluster node worked')

    return True
Example #11
def check_that_histograms_are_valid_with_invalid_metadata():

    # Prepare

    histogram_with_invalid_metadata = "%(fixtures_dir)s/histogram_with_invalid_metadata.root" % config

    histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config,
        histogram_with_invalid_metadata
    ]

    # Run task

    command = 'python %(scripts_dir)s/check_that_histograms_are_valid.py' % config
    command_arguments = " ".join(histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"

    command_output = run_cmd(command_with_arguments)

    # Check result
    expected_error_message = 'ERROR: real metadata does not match expected metadata for histogram: %s' % histogram_with_invalid_metadata
    if command_output.find(expected_error_message) == -1:
        print('Output must contain information that metadata does not match')
        return False

    if command_output.find('EXIT_STATUS_WAS: 1') == -1:
        print('Exit status must be 1 if metadata does not match')
        return False

    return True
Example #12
 def waitForJobs(self):
     """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
     """
     numJobs = len(self.jobIds)
     # print "<waitForJobs>: numJobs = %i" % numJobs
     if numJobs > 0:
         jobIds_per_poll_group = 500
         num_poll_groups = numJobs / jobIds_per_poll_group
         if (numJobs % jobIds_per_poll_group) > 0:
             num_poll_groups = num_poll_groups + 1
         whoami = getpass.getuser()
         while True:
             numJobs_left = 0
             for idx_poll_group in range(num_poll_groups):
                 idx_first = idx_poll_group * jobIds_per_poll_group
                 idx_last = min((idx_poll_group + 1) *
                                jobIds_per_poll_group, numJobs)
                 jobIds_poll_group = self.jobIds[idx_first:idx_last]
                 command = "%s -u %s | grep \"%s\" | wc -l" % (
                     self.command_poll, whoami, "\\|".join(jobIds_poll_group))
                 # print "idx_poll_group = %i: command = %s" %
                 # (idx_poll_group, command)
                 poll_result = run_cmd(command, True).rstrip("\n")
                 # print " poll_result = %s" % poll_result
                 numJobs_left = numJobs_left + int(poll_result)
                 time.sleep(1)
             # print "numJobs_left = %i" % numJobs_left
              if numJobs_left > 0:
                  logging.info(
                      "Waiting for sbatch to finish (%d jobs still left) ..." % numJobs_left)
                  time.sleep(self.poll_interval)
              else:
                  break
Example #13
def check_that_histograms_are_valid_spec():

    # Prepare

    valid_histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config,
        "%(fixtures_dir)s/histogram_2.root" % config
    ]

    # Run task

    command = 'python %(scripts_dir)s/check_that_histograms_are_valid.py' % config
    command_arguments = " ".join(valid_histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"

    result = run_cmd(command_with_arguments)

    # Check result

    if result.find('All input files are ok.') == -1:
        print('Result must contain string "All input files are ok."')
        return False

    if result.find('EXIT_STATUS_WAS: 0') == -1:
        print('Exit status was not 0')
        return False

    return True
Example #14
def check_that_histograms_are_valid_with_too_small_root_file_spec():

    # Prepare

    too_small_histogram = "%(fixtures_dir)s/histogram_too_small.root" % config

    histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config, too_small_histogram
    ]

    # Run task

    command = 'python %(scripts_dir)s/check_that_histograms_are_valid.py' % config
    command_arguments = " ".join(histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"

    result = run_cmd(command_with_arguments)

    # Check result

    if result.find('ERROR: root input file is too small (2 bytes): %s' %
                   too_small_histogram) == -1:
        print('Output must contain error information about which file was too small')
        return False

    if result.find('EXIT_STATUS_WAS: 1') == -1:
        print('Exit status must be 1 if file was too small')
        return False

    return True
Example #15
def check_that_histograms_are_valid_with_missing_input_histogram_spec():

    # Prepare

    missing_histogram = "%(fixtures_dir)s/histogram_THIS_DOES_NOT_EXIST.root" % config

    histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config, missing_histogram
    ]

    # Run task

    command = 'python %(scripts_dir)s/check_that_histograms_are_valid.py' % config
    command_arguments = " ".join(histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"

    result = run_cmd(command_with_arguments)

    # Check result

    if result.find('ERROR: root input file is missing: %s' %
                   missing_histogram) == -1:
        print('Output must contain error information about which file was missing')
        return False

    if result.find('EXIT_STATUS_WAS: 1') == -1:
        print('Exit status must be 1 if file was missing')
        return False

    return True
Example #16
def check_that_histograms_are_equal_with_unequal_data():

    # Prepare

    output_histogram = "%(fixtures_dir)s/hadd_of_histogram_1_and_broken.root" % config

    input_histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config,
        "%(fixtures_dir)s/histogram_2.root" % config
    ]

    # Run task

    command = 'python %(scripts_dir)s/check_that_histograms_are_equal.py' % config
    command_arguments = output_histogram + " " + " ".join(input_histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"

    result = run_cmd(command_with_arguments)

    # Check result

    if result.find('ERROR: count(output_histogram.events) != count(input_histograms.events)') == -1:
        print('Result must contain string "ERROR: count(output_histogram.events) != count(input_histograms.events)"')
        return False

    if result.find('EXIT_STATUS_WAS: 1') == -1:
        print('Exit status was not 1')
        return False

    return True
Example #17
def check_that_histograms_are_equal_spec():

    # Prepare

    output_histogram = "%(fixtures_dir)s/hadd_of_histogram_1_and_2.root" % config

    input_histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config,
        "%(fixtures_dir)s/histogram_2.root" % config
    ]

    # Run task

    command = 'python %(scripts_dir)s/check_that_histograms_are_equal.py' % config
    command_arguments = output_histogram + " " + " ".join(input_histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"

    result = run_cmd(command_with_arguments)

    # Check result

    if result.find(
            'Output histogram event count is same as input histograms event counts sum'
    ) == -1:
        print(
            'Result must contain string "Output histogram event count is same as input histograms event counts sum"'
        )
        return False

    if result.find('EXIT_STATUS_WAS: 0') == -1:
        print('Exit status was not 0')
        return False

    return True
Example #18
    def submit(self, cmd_str):
        nof_max_retries = 10
        current_retry = 0
        while current_retry < nof_max_retries:
            # Run command
            cmd_outerr = run_cmd(cmd_str, return_stderr=True)
            try:
                job_id = cmd_outerr[0].split()[-1]
                break
            except IndexError:
                # This happens if sbatch returned an empty stdout, i.e. there is no job ID to parse
                logging.warning(
                    "Caught an error: '%s'; resubmitting (attempt #%i)" %
                    (cmd_outerr[1], current_retry))
                current_retry += 1
                logging.debug("sleeping for %i seconds." % 60)
                time.sleep(
                    60
                )  # Let's wait for 60 seconds until the next resubmission

        # The job ID must be a number, so we have to check that it really is one
        try:
            int(job_id)
        except ValueError:
            raise ValueError("job_id = '%s' NaN; sbatch stdout = '%s'; sbatch stderr = '%s'" % \
                              (job_id, cmd_outerr[0], cmd_outerr[1]))
        if job_id in self.submittedJobs:
            raise RuntimeError("Same job ID: %s" % job_id)
        # Is a valid job ID
        return job_id
Example #19
def check_that_histogram_is_ready_for_usage(input_file):
    print("<check_that_histogram_is_ready_for_usage>: input file = '%s'" % input_file)
    
    polling_delay    = 1 # in seconds
    polling_cmd      = "fuser %s" % input_file
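    # Assumed fuser behaviour that the checks below rely on (illustrative):
    #   file exists and is in use:    stdout lists the PIDs, stderr echoes the file name
    #   file exists but is not used:  both stdout and stderr are empty
    #   file does not exist:          stdout is empty, stderr carries an error message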

    is_file_ready = False
    while not is_file_ready:

        stdout, stderr = run_cmd(polling_cmd, return_stderr = True)

        print("Executed command '%s':" % polling_cmd)
        print("stdout = '%s'" % stdout)
        print("stderr = '%s'" % stderr)

        if not stdout and not stderr:
            # No one uses this file, it's free to use for everyone
            break
        if not stdout and stderr:
            # The file still doesn't exist?
            print(stderr.rstrip('\n'))
            sys.exit(1)
        else:
            # Both stdout and stderr contain text (PID and filename, respectively); wait ...
            time.sleep(polling_delay)
Example #20
def execute_command_on_cluster_node_spec():

    # Prepare

    run_cmd("rm -rf /%(temp_dir)s/execute_command_on_cluster_node_spec/*" %
            config)

    # Run task

    pool_id = uuid.uuid4()
    m = sbatchManager(pool_id)
    m.setWorkingDir('%(cmssw_base)s/src/analysis2mu1b1j/analysis2mu1b1j/test' %
                    config)

    m.submit_job_version2(
        task_name='creating_result.txt',  # BUG: Task name can't include space
        command='''
            export TEST_DIR=%(temp_dir)s/execute_command_on_cluster_node_spec/
            mkdir -p $TEST_DIR
            echo "Worked" > $TEST_DIR/result.txt
        ''' % config,
        output_dir='%(temp_dir)s/execute_command_on_cluster_node_spec/' %
        config)

    # Check the result

    try:
        m.waitForJobs()
    except:
        got_exception = True
    else:
        got_exception = False

    if got_exception:
        return False

    with open('%(temp_dir)s/execute_command_on_cluster_node_spec/result.txt' %
              config) as f:
        result = f.read().strip()

        if result != 'Worked':
            print(
                "$TEST_DIR/ did not contain result.txt with content 'Worked'.")
            print('FAILED: Execute on cluster node failed.')
            return False

    return True
Example #21
def is_file_ok(output_file_name, validate_outputs=True, min_file_size=20000):
    if not (output_file_name and os.path.exists(output_file_name)):
        return False

    logging.info("Output file %s already exists" % output_file_name)

    if not output_file_name.lower().endswith('.root'):
        return True

    command = "rm %s" % output_file_name
    ret_value = False
    if min_file_size > 0:
        output_file_size = os.stat(output_file_name).st_size
        if output_file_size > min_file_size:
            if not validate_outputs:
                ret_value = True
        else:
            logging.info(
                "Deleting output file and resubmitting job because it has size smaller than %d bytes"
                % min_file_size)

    if validate_outputs:
        root_tfile = ROOT.TFile(output_file_name, "read")
        if not root_tfile:
            logging.info("Not a valid ROOT file, deleting it")
        else:
            if root_tfile.IsZombie():
                logging.info(
                    "Output file is corrupted, deleting file and resubmitting job"
                )
            else:
                # Let's open the file via bash as well to see if ROOT tries to recover the file
                open_cmd = "root -b -l -q %s 2>&1 > /dev/null | grep 'trying to recover' | wc -l" % output_file_name
                open_out = run_cmd(open_cmd)
                if open_out.rstrip('\n') != '0':
                    logging.info(
                        "Output file is probably corrupted, deleting file and resubmitting job"
                    )
                else:
                    ret_value = True
            root_tfile.Close()

    if not ret_value:
        run_cmd(command)

    return ret_value
Example #22
  def run(self, clean):
    record_software_state(self.sw_ver_file_cfg, self.sw_ver_file_out, DEPENDENCIES)
    target = 'all'
    if clean:
      if not os.path.isfile(self.makefile_path):
        logging.error(
          "The makefile %s is missing and therefore it's not possible to clean anything; "
          "run sync Ntuple production first!" % self.makefile_path
        )
        sys.exit(1)
      target = 'clean'

    nof_parallel_jobs = len(self.channel_info)
    make_cmd          = "make -f %s -j %d %s 2>%s 1>%s" % \
      (self.makefile_path, nof_parallel_jobs, target, self.stderr_file_path, self.stdout_file_path)
    logging.info("Running the make command: %s" % make_cmd)
    run_cmd(make_cmd)
    logging.info("All done")
Example #23
def generate_sbatch_line(executable, cfg_file_name, input_file_names, output_file_name, log_file_name = None,
                         cvmfs_error_log = None):
    if os.path.exists(output_file_name):
        output_file_size = os.stat(output_file_name).st_size
        print "output file %s already exists, size = %i" % (output_file_name, output_file_size)
        if output_file_size > 20000:
            print "--> skipping job because it has size creater than 20000"
            return None
        else:
            print "--> deleting output file and resubmitting job because it has size smaller 20000"
            command = "%s %s" % (executable_rm, output_file_name)
            run_cmd(command)

        if log_file_name and os.path.exists(log_file_name):
            log_file = open(log_file_name)
            is_time = False
            time = None
            is_hostname = False
            hostname = None
            is_cvmfs_error = False
            for line in log_file:
                if line.find("Time") != -1:
                    time = line.split(':')[1].strip()
                if line.find("Hostname") != -1:
                    hostname = line.split(':')[1].strip()
                if line.find("Transport endpoint is not connected") != -1:
                    is_cvmfs_error = True
            log_file.close()
            if is_cvmfs_error:
                print "Problem with cvmfs access reported in log file = '%s':" % log_file_name
                print " host = '%s': time = %s" % (hostname, time)
                if cvmfs_error_log:
                    if not hostname in cvmfs_error_log.keys():
                        cvmfs_error_log[hostname] = []
                    cvmfs_error_log[hostname].append(time)

    return "m.submitJob(%s, '%s', '%s', '%s', %s, '%s', True)" % (
        input_file_names,
        executable,
        cfg_file_name,
        os.path.dirname(output_file_name),
        [ os.path.basename(output_file_name) ],
        log_file_name
    )
Example #24
def check_that_metadata_is_ok(input_file):
    print("<check_that_metadata_is_ok>: input file = '%s'" % input_file)

    metadata_file = input_file + '.metadata'
    expected_metadata_txt = run_cmd('cat %s' % metadata_file)
    real_metadata_txt = get_histogram_metadata(input_file)

    if real_metadata_txt.find(expected_metadata_txt) == -1:
        print("ERROR: Metadata for input file '%s' does not match expected value !!" % input_file)
        print("computed metadata = '%s'" % real_metadata_txt)
        print("expected metadata = '%s'" % expected_metadata_txt)
        sys.exit(1)
Example #25
    def run(self, clean):
        record_software_state(self.sw_ver_file_cfg, self.sw_ver_file_out,
                              DEPENDENCIES)
        target = 'all'
        if clean:
            if not os.path.isfile(self.makefile_path):
                logging.error(
                    "The makefile %s is missing and therefore it's not possible to clean anything; "
                    "run sync Ntuple production first!" % self.makefile_path)
                sys.exit(1)
            target = 'clean'

        nof_parallel_jobs = len(self.channel_info)
        make_cmd = "make -f %s -j %d %s 2>%s 1>%s" % \
          (self.makefile_path, nof_parallel_jobs, target, self.stderr_file_path, self.stdout_file_path)
        if self.running_method.lower() == "makefile":
            run_dir = re.sub('^/home', '/scratch', self.config_dir)
            create_if_not_exists(run_dir)
            make_cmd = re.sub('^make', 'make -C {}'.format(run_dir), make_cmd)
        logging.info("Running the make command: %s" % make_cmd)
        run_cmd(make_cmd)
        logging.info("All done")
Example #26
def call_histogram_aggregation_on_cluster_node_spec():

    # Prepare

    run_cmd("rm -rf /home/%(user)s/tmp/call_histogram_aggregation_on_cluster_node" % config)
    run_cmd("mkdir -p /home/%(user)s/tmp/call_histogram_aggregation_on_cluster_node/" % config)


    # Add histograms and run task

    m = sbatchManager()
    m.setWorkingDir('/home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/analysis2mu1b1j/analysis2mu1b1j/test' % config)

    m.hadd_in_cluster(
        inputFiles=[
            '/home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/tthAnalysis/HiggsToTauTau/specification/fixtures/histogram_1.root' % config,
            '/home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/tthAnalysis/HiggsToTauTau/specification/fixtures/histogram_2.root' % config
        ],
        outputFile='/home/%(user)s/tmp/call_histogram_aggregation_on_cluster_node/result.root' % config
    )

    m.waitForJobs()


    # Check result

    root_result_file = '/home/%(user)s/tmp/call_histogram_aggregation_on_cluster_node/result.root' % config
    result_successful = os.path.isfile(root_result_file)


    # Output result

    if result_successful:
        print('HADD on cluster node worked')
    else:
        print('HADD on cluster node failed')

    return result_successful
Example #27
def execute_command_on_cluster_node_spec():

    # Prepare

    run_cmd("rm -rf /home/%(user)s/tmp/execute_command_on_cluster_node_spec/*" % config)


    # Run task

    m = sbatchManager()
    m.setWorkingDir('/home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/analysis2mu1b1j/analysis2mu1b1j/test' % config)

    m.submit_job_version2(
        task_name = 'creating_result.txt', # BUG: Task name can't include space
        command = '''
            export TEST_DIR=/home/%(user)s/tmp/execute_command_on_cluster_node_spec/
            mkdir -p $TEST_DIR
            echo "Worked" > $TEST_DIR/result.txt
        '''  % config,
        output_dir = '/home/%(user)s/tmp/execute_command_on_cluster_node_spec/' % config
    )

    m.waitForJobs()

    # Check the result

    with open('/home/%(user)s/tmp/execute_command_on_cluster_node_spec/result.txt' % config) as f:
        result = f.read().strip()

        if result == 'Worked':
            print('Execute on cluster node passed.')
            return True

    print("$TEST_DIR/ did not contain result.txt with content 'Worked'.")
    print('Execute on cluster node failed.')

    return False
Example #28
def get_histogram_metadata(histogram):
    sha1sum = run_cmd('sha1sum %s' % histogram).split(' ')[0]
    events_count = get_events_count(histogram)

    metadata = """sha1sum: %s\nevents_count: %i\n""" % (sha1sum, events_count)
    return metadata
Example #29
            sh = jinja2.Template(sh_str).render(cmd=cmd)
            sh_file = os.path.join(args.generate_jobs, 'job_%i.sh' % path_idx)
            with open(sh_file, 'w') as f:
                f.write(sh)
            log_file = os.path.join(args.generate_jobs,
                                    'log_%i.txt' % path_idx)
            job_params.append((log_file, sh_file))

        # submit the jobs
        submit_cmds = list(
            map(
                lambda job_param: 'sbatch --partition=small --output=%s %s' %
                job_param, job_params))
        squeue_codes = []
        for submit_cmd in submit_cmds:
            squeue_code = run_cmd(submit_cmd).split()[-1]
            squeue_codes.append(squeue_code)
            logging.info(
                "Submitted sbatch job {jobId}".format(jobId=squeue_code))

        has_completed = not bool(squeue_codes)
        while not has_completed:
            squeue = run_cmd("squeue -j {jobIds} -h | wc -l".format(
                jobIds=','.join(squeue_codes))).rstrip('\n')
            if squeue == '0':
                has_completed = True
            logging.debug(
                "{nofJobs} job(s) still running...".format(nofJobs=squeue))
            time.sleep(5)
        logging.info("All jobs have been finished")
Example #30
 def run(self):
     """Runs all Ntuple production jobs -- either locally or on the batch system.
     """
     run_cmd("make -f %s -j %i " % (self.makefile, self.num_parallel_jobs),
             False, self.stdout_file, self.stderr_file)
Example #31
 def run(self):
     """Runs all Ntuple production jobs -- either locally or on the batch system.
     """
     run_cmd("make -f %s -j %i " % (self.makefile, self.num_parallel_jobs),
         False, self.stdout_file, self.stderr_file)
Example #32
    def poll(self, nonBlocking):
        """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
        """
        text_line = '-' * 120

        # Set a delimiter, which distinguishes entries b/w different jobs
        delimiter = ','
        # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
        # 1) squeue -h -u {{user}} -o '%i %256k'
        #      Collects the list of running jobs
        #        a) -h omits header
        #        b) -u {{user}} looks only for jobs submitted by {{user}}
        #        c) -o '%i %256k' specifies the output format
        #           i)  %i -- job ID (1st column)
        #           ii) %256k -- comment with width of 256 characters (2nd column)
        #               If the job has no comments, the entry simply reads (null)
        # 2) grep {{comment}}
        #       Filter the jobs by the comment which must be unique per sbatchManager instance at all times
        # 3) awk '{print $1}'
        #       Filter only the jobIds out
        # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
        #       Place all job IDs to one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
        command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                           "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
        command = jinja2.Template(command_template).render(
            user=self.user,
            pool_id_length=self.max_pool_id_length,
            comment=self.pool_id,
            delimiter=delimiter)
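        # With hypothetical values the rendered command would read, e.g.:
        #   squeue -h -u jdoe -o '%i %256k' | grep 5f2c1a7e | awk '{print $1}' | sed ':a;N;$!ba;s/\n/,/g'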

        # Initially, all jobs are marked as submitted so we have to go through all jobs and check their exit codes
        # even if some of them have already finished
        jobIds_set = set([
            job_id for job_id in self.submittedJobs
            if self.submittedJobs[job_id]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
        while nofJobs_left > 0:
            # Get the list of jobs submitted to batch system and convert their jobIds to a set
            poll_result, poll_result_err = '', ''
            while True:
                poll_result, poll_result_err = run_cmd(command,
                                                       do_not_log=False,
                                                       return_stderr=True)
                if not poll_result and poll_result_err:
                    logging.warning(
                        'squeue caught an error: {squeue_error}'.format(
                            squeue_error=poll_result_err))
                else:
                    break
                # sleep a minute and then try again
                # in principle we could limit the number of retries, but hopefully that's not necessary
                logging.debug("sleeping for %i seconds." % 60)
                time.sleep(60)
            polled_ids = set()
            if poll_result != '':
                polled_ids = set(poll_result.split(delimiter))

            # Check if number of jobs submitted to batch system is below maxSubmittedJobs;
            # if it is, take jobs from queuedJobs list and submit them,
            # until a total of maxSubmittedJobs is submitted to batch system
            nofJobs_toSubmit = min(len(self.queuedJobs),
                                   self.maxSubmittedJobs - len(polled_ids))
            if nofJobs_toSubmit > 0:
                logging.debug(
                    "Jobs: submitted = {}, in queue = {} --> submitting the next {} jobs."
                    .format(len(polled_ids), len(self.queuedJobs),
                            nofJobs_toSubmit))
            else:
                logging.debug(
                    "Jobs: submitted = {}, in queue = {} --> waiting for submitted jobs to finish processing."
                    .format(len(polled_ids), len(self.queuedJobs)))
            for i in range(0, nofJobs_toSubmit):
                # randomly submit a job from the queue
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_idx = len(self.queuedJobs) - 1
                random_idx = random.randint(0, max_idx)
                job = self.queuedJobs.pop(random_idx)
                job['status'] = Status.submitted
                job_id = self.submit(job['sbatch_command'])
                self.submittedJobs[job_id] = job

            # Now check status of jobs submitted to batch system:
            # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
            # jobs that have finished already
            finished_ids = list(jobIds_set - polled_ids)

            # Do not poll anything if currently there are no finished jobs
            if finished_ids:
                # Based on the job's exit code, determine whether the job has failed or completed successfully
                # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
                # Therefore, we want to restrict the output by grepping specific job IDs
                # There's another problem with that: the length of a bash command is limited by ARG_MAX kernel variable,
                # which is of order 2e6
                # This means that we have to split the job IDs into chunks each of which we have to check separately
                finished_ids_chunks = [
                    finished_ids[i:i + self.max_nof_greps]
                    for i in range(0, len(finished_ids), self.max_nof_greps)
                ]
                for finished_ids_chunk in finished_ids_chunks:
                    completion = self.check_job_completion(finished_ids_chunk)
                    completed_jobs, running_jobs, failed_jobs = [], [], []
                    for job_id, details in completion.iteritems():
                        if details.status == Status.completed:
                            completed_jobs.append(job_id)
                        elif details.status == Status.running:
                            running_jobs.append(job_id)
                        else:
                            failed_jobs.append(job_id)
                    # If there are any failed jobs, throw
                    if failed_jobs:

                        failed_jobs_str = ','.join(failed_jobs)
                        errors = [
                            completion[job_id].status for job_id in failed_jobs
                        ]
                        logging.error(
                            "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}"
                            .format(
                                jobIds=failed_jobs_str,
                                reasons=', '.join(map(Status.toString,
                                                      errors)),
                            ))

                        # Let's print a table where the first column corresponds to the job ID
                        # and the second column lists the exit code, the derived exit code, the status
                        # and the classification of the failed job
                        logging.error("Error table:")
                        for job_id in failed_jobs:
                            sys.stderr.write(
                                "{jobId} {exitCode} {derivedExitCode} {state} {status}\n"
                                .format(
                                    jobId=job_id,
                                    exitCode=completion[job_id].exit_code,
                                    derivedExitCode=completion[job_id].
                                    derived_exit_code,
                                    state=completion[job_id].state,
                                    status=Status.toString(
                                        completion[job_id].status),
                                ))

                        sys.stderr.write('%s\n' % text_line)
                        for failed_job in failed_jobs:
                            for log in zip(['wrapper', 'executable'],
                                           ['log_wrap', 'log_exec']):
                                logfile = self.submittedJobs[failed_job][
                                    log[1]]
                                if os.path.isfile(logfile):
                                    logfile_contents = open(logfile,
                                                            'r').read()
                                else:
                                    logfile_contents = '<file is missing>'
                                sys.stderr.write(
                                    'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'
                                    .format(
                                        id=failed_job,
                                        description=log[0],
                                        path=logfile,
                                        log=logfile_contents,
                                        line=text_line,
                                    ))

                            if self.submittedJobs[failed_job]['nof_submissions'] < self.max_resubmissions and \
                               completion[failed_job].status == Status.io_error:
                                # The job is eligible for resubmission if the job hasn't been resubmitted more
                                # than a preset limit of resubmissions AND if the job failed due to I/O errors
                                logging.warning(
                                    "Job w/ ID {id} and arguments {args} FAILED because: {reason} "
                                    "-> resubmission attempt #{attempt}".
                                    format(
                                        id=failed_job,
                                        args=self.submittedJobs[failed_job]
                                        ['args'],
                                        reason=Status.toString(
                                            completion[failed_job].status),
                                        attempt=self.submittedJobs[failed_job]
                                        ['nof_submissions'],
                                    ))
                                self.submitJob(
                                    *self.submittedJobs[failed_job]['args'])
                                # The old ID must be deleted, b/c otherwise it would be used to compare against
                                # squeue output and we would resubmit the failed job ad infinitum
                                del self.submittedJobs[failed_job]
                            else:
                                # We've exceeded the maximum number of resubmissions -> fail the workflow
                                raise Status.raiseError(
                                    completion[failed_job].status)
                    else:
                        logging.debug(
                            "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}"
                            .format(
                                completedIds=','.join(completed_jobs),
                                runningInfo='(%s still running)' %
                                ','.join(running_jobs) if running_jobs else '',
                            ))
                    # Mark successfully finished jobs as completed so that we won't request their status code again
                    # Otherwise they would still remain in the 'submitted' state
                    for job_id in completed_jobs:
                        if not all(
                                map(
                                    lambda outputFile: is_file_ok(
                                        outputFile,
                                        validate_outputs=True,
                                        min_file_size=self.min_file_size), self
                                    .submittedJobs[job_id]['outputFiles'])):
                            if self.submittedJobs[job_id][
                                    'nof_submissions'] < self.max_resubmissions:
                                logging.warning(
                                    "Job w/ ID {id} and arguments {args} FAILED to produce a valid output file "
                                    "-> resubmission attempt #{attempt}".
                                    format(
                                        id=job_id,
                                        args=self.submittedJobs[job_id]
                                        ['args'],
                                        attempt=self.submittedJobs[job_id]
                                        ['nof_submissions'],
                                    ))
                                self.submitJob(
                                    *self.submittedJobs[job_id]['args'])
                                del self.submittedJobs[job_id]
                            else:
                                raise ValueError(
                                    "Job w/ ID {id} FAILED because it repeatedly produces bogus output "
                                    "file {output} yet the job still exits w/o any errors"
                                    .format(
                                        id=job_id,
                                        output=', '.join(
                                            self.submittedJobs[job_id]
                                            ['outputFiles']),
                                    ))
                        else:
                            # Job completed just fine
                            self.submittedJobs[job_id][
                                'status'] = Status.completed

            jobIds_set = set([
                job_id for job_id in self.submittedJobs
                if self.submittedJobs[job_id]['status'] == Status.submitted
            ])
            nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
            logging.info(
                "Waiting for sbatch to finish (%d job(s) still left) ..." %
                nofJobs_left)
            if nofJobs_left > 0:
                if nonBlocking:
                    return False
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_delay = 300
                random_delay = random.randint(0, max_delay)
                logging.debug("sleeping for %i seconds." % random_delay)
                time.sleep(self.poll_interval + random_delay)
            else:
                break

        return True
Example #33
    def waitForJobs(self):
        """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
        """
        text_line = '-' * 120

        # Set a delimiter, which distinguishes entries b/w different jobs
        delimiter = ','
        # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
        # 1) squeue -h -u {{user}} -o '%i %256k'
        #      Collects the list of running jobs
        #        a) -h omits header
        #        b) -u {{user}} looks only for jobs submitted by {{user}}
        #        c) -o '%i %256k' specifies the output format
        #           i)  %i -- job ID (1st column)
        #           ii) %256k -- comment with width of 256 characters (2nd column)
        #               If the job has no comments, the entry simply reads (null)
        # 2) grep {{comment}}
        #       Filter the jobs by the comment which must be unique per sbatchManager instance at all times
        # 3) awk '{print $1}'
        #       Filter only the job IDs out
        # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
        #       Place all job IDs to one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
        command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                           "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
        command = jinja2.Template(command_template).render(
            user=self.user,
            pool_id_length=self.max_pool_id_length,
            comment=self.pool_id,
            delimiter=delimiter)

        # Initially, all jobs are marked as submitted so we have to go through all jobs and check their exit codes
        # even if some of them have already finished
        jobIds_set = set([
            id_ for id_ in self.jobIds
            if self.jobIds[id_]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set)
        while nofJobs_left > 0:
            # Get the list of running jobs and convert them to a set
            poll_result, poll_result_err = '', ''
            while True:
                poll_result, poll_result_err = run_cmd(command,
                                                       do_not_log=False,
                                                       return_stderr=True)
                if not poll_result and poll_result_err:
                    logging.warning(
                        'squeue caught an error: {squeue_error}'.format(
                            squeue_error=poll_result_err))
                else:
                    break
                # sleep a minute and then try again
                # in principle we could limit the number of retries, but hopefully that's not necessary
                time.sleep(60)
            polled_ids = set(poll_result.split(delimiter))
            # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
            # jobs that have finished already
            finished_ids = list(jobIds_set - polled_ids)

            # Do not poll anything if currently there are no finished jobs
            if finished_ids:
                # Based on the job's exit code, determine whether the job has failed or completed successfully
                # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
                # Therefore, we want to restrict the output by grepping specific job IDs
                # There's another problem with that: the length of a bash command is limited by ARG_MAX kernel variable,
                # which is of order 2e6
                # This means that we have to split the job IDs into chunks each of which we have to check separately
                finished_ids_chunks = [
                    finished_ids[i:i + self.max_nof_greps]
                    for i in range(0, len(finished_ids), self.max_nof_greps)
                ]
                for finished_ids_chunk in finished_ids_chunks:
                    completion = self.check_job_completion(finished_ids_chunk)
                    completed_jobs, running_jobs, failed_jobs = [], [], []
                    for id_, details in completion.iteritems():
                        if details.status == Status.completed:
                            completed_jobs.append(id_)
                        elif details.status == Status.running:
                            running_jobs.append(id_)
                        else:
                            failed_jobs.append(id_)
                    # If there are any failed jobs, throw
                    if failed_jobs:

                        failed_jobs_str = ','.join(failed_jobs)
                        errors = [
                            completion[id_].status for id_ in failed_jobs
                        ]
                        logging.error(
                            "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}"
                            .format(
                                jobIds=failed_jobs_str,
                                reasons=', '.join(map(Status.toString,
                                                      errors)),
                            ))

                        # Let's print a table where the first column corresponds to the job ID
                        # and the second column lists the exit code, the derived exit code, the status
                        # and the classification of the failed job
                        logging.error("Error table:")
                        for id_ in failed_jobs:
                            sys.stderr.write(
                                "{jobId} {exitCode} {derivedExitCode} {state} {status}\n"
                                .format(
                                    jobId=id_,
                                    exitCode=completion[id_].exit_code,
                                    derivedExitCode=completion[id_].
                                    derived_exit_code,
                                    state=completion[id_].state,
                                    status=Status.toString(
                                        completion[id_].status),
                                ))

                        sys.stderr.write('%s\n' % text_line)
                        for failed_job in failed_jobs:
                            for log in zip(['wrapper', 'executable'],
                                           ['log_wrap', 'log_exec']):
                                logfile = self.jobIds[failed_job][log[1]]
                                if os.path.isfile(logfile):
                                    logfile_contents = open(logfile,
                                                            'r').read()
                                else:
                                    logfile_contents = '<file is missing>'
                                sys.stderr.write(
                                    'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'
                                    .format(
                                        id=failed_job,
                                        description=log[0],
                                        path=logfile,
                                        log=logfile_contents,
                                        line=text_line,
                                    ))

                        # Raise the first error at hand
                        raise Status.raiseError(errors[0])
                    else:
                        logging.debug(
                            "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}"
                            .format(
                                completedIds=','.join(completed_jobs),
                                runningInfo='(%s still running)' %
                                ','.join(running_jobs) if running_jobs else '',
                            ))
                    # Mark successfully finished jobs as completed so that we won't request their status code again
                    # Otherwise they would still remain in the 'submitted' state
                    for id_ in completed_jobs:
                        self.jobIds[id_]['status'] = Status.completed

            jobIds_set = set([
                id_ for id_ in self.jobIds
                if self.jobIds[id_]['status'] == Status.submitted
            ])
            nofJobs_left = len(jobIds_set)
            if nofJobs_left > 0:
                logging.info(
                    "Waiting for sbatch to finish (%d job(s) still left) ..." %
                    nofJobs_left)
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_delay = 300
                random_delay = random.randint(0, max_delay)
                time.sleep(self.poll_interval + random_delay)
            else:
                break
Example #34
import os
from tthAnalysis.HiggsToTauTau.jobTools import run_cmd

# set tests to fastest priority queue
allowed_sbatch_priorities = ['prio', 'test']

if os.environ.get('SBATCH_PRIORITY') not in allowed_sbatch_priorities:
    print(
        'Will run tests in cluster using SBATCH_PRIORITY="prio". For faster execution on Quasar, use SBATCH_PRIORITY="test". ;)'
    )
    os.environ['SBATCH_PRIORITY'] = 'prio'

# initialize properties

user = run_cmd('whoami').strip()
cmssw_base = run_cmd('echo $CMSSW_BASE').strip()
temp_dir = '/home/%s/tmp/' % user
fixtures_dir = '%s/src/tthAnalysis/HiggsToTauTau/specification/fixtures/' % cmssw_base
sbatch_priority = run_cmd('echo $SBATCH_PRIORITY').strip()
scripts_dir = "%s/src/tthAnalysis/HiggsToTauTau/scripts/" % cmssw_base

# create config

config = {
    'user': user,
    'cmssw_base': cmssw_base,
    'temp_dir': temp_dir,
    'fixtures_dir': fixtures_dir,
    'sbatch_priority': sbatch_priority,
    'scripts_dir': scripts_dir
}
Example #35
def hadd(input_files, output_file):
    cmd_str = 'hadd -f %s %s' % (output_file, ' '.join(input_files))
    stdout, stderr = run_cmd(cmd_str, do_not_log=True, return_stderr=True)
    if not stdout or stderr:
        raise RuntimeError('Error: %s' % stderr)
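
A hypothetical call site for the helper above (the file names are placeholders):

hadd(
    input_files=['histogram_1.root', 'histogram_2.root'],
    output_file='hadd_of_histogram_1_and_2.root',
)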
Example #36
    )
    parser.add_argument(
        '-v',
        '--verbose',
        dest='verbose',
        action='store_true',
        default=False,
        required=False,
        help='R|Verbose output',
    )
    args = parser.parse_args()

    samples = load_dict(args.dictionary, args.sample_name)

    has_dasgoclient = run_cmd('which dasgoclient 2>/dev/null | wc -l',
                              do_not_log=True,
                              return_stderr=False)
    if has_dasgoclient.rstrip('\n') != "1":
        raise ValueError(
            "dasgoclient not available! Set up your 94x environment")
    has_voms_proxy = run_cmd('which voms-proxy-info 2>/dev/null | wc -l',
                             do_not_log=True,
                             return_stderr=False)
    if has_voms_proxy.rstrip('\n') != "1":
        raise ValueError(
            "voms-proxy-* not available! Set up your 94x environment")

    min_voms_proxy_timeleft_hours = 3
    voms_proxy_timeleft = int(
        run_cmd('voms-proxy-info --timeleft',
                do_not_log=True,
Example #37
 def run(self):
     """Runs the complete analysis workfow -- either locally or on the batch system.
     """
     run_cmd("make -f %s -j %i " % (self.makefile, self.num_parallel_jobs),
         False, self.stdout_file, self.stderr_file)
Example #38
from tthAnalysis.HiggsToTauTau.jobTools import run_cmd

config = {
    'user': run_cmd('whoami').strip()
}
Example #39
    def submitJob(self, inputFiles, executable, cfgFile, outputFilePath, outputFiles, logFile=None, skipIfOutputFileExists=False):
        """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
        """

        # raise if logfile missing
        if not logFile:
            if not self.logFileDir:
                raise ValueError(
                    "Please call 'setLogFileDir' before calling 'submitJob' !!")
            logFile = os.path.join(self.logFileDir, os.path.basename(
                script_file).replace(".sh", ".log"))

        # if any of the output files exists, returns (Margus: BUG? Because only
        # that file should be skipped, not all?)
        if skipIfOutputFileExists:
            for outputFile in outputFiles:
                if os.path.exists(os.path.join(outputFilePath, outputFile)):
                    print "output file = '%s' exists --> skipping !!" % os.path.join(outputFilePath, outputFile)
                    return

        if not self.workingDir:
            raise ValueError(
                "Please call 'setWorkingDir' before calling 'submitJob' !!")

        # create scratch dir
        scratchDir = "/scratch/%s" % getpass.getuser()
        if not os.path.exists(scratchDir):
            print "Directory '%s' does not yet exist, creating it !!" % scratchDir
            run_cmd(command_create_scratchDir)
        scratchDir = os.path.join(
            scratchDir, "tthAnalysis" + "_" + date.today().isoformat())
        create_if_not_exists(scratchDir)

        # create script for executing jobs
        script_file = cfgFile.replace(".py", ".sh")
        script_file = script_file.replace("_cfg", "")

        wrapper_log_file = logFile.replace('.log', '_wrapper.log')
        executable_log_file = logFile.replace('.log', '_executable.log')

        command = "%s --partition=%s --output=%s %s" % (
            self.command_submit, self.queue, wrapper_log_file, script_file)

        script = jinja2.Template(job_template).render(
            working_dir = self.workingDir,
            scratch_dir = scratchDir,
            exec_name = executable,
            cfg_file = cfgFile,
            inputFiles = " ".join(inputFiles),
            outputDir = outputFilePath,
            outputFiles = " ".join(outputFiles),
            wrapper_log_file = wrapper_log_file,
            executable_log_file = executable_log_file,
            RUNNING_COMMAND = command
        )
        print "writing sbatch script file = '%s'" % script_file
        with codecs.open(script_file, "w", "utf-8") as f:
            f.write(script)

        print "<submitJob>: command = %s" % command
        run_cmd_output = run_cmd(command)
        print "run_cmd_output: %s" % run_cmd_output
        ret_val = run_cmd_output.split()[-1]
        print "ret_val: %s" % ret_val
        job_id = ret_val.split()[-1]
        # print " jobId = %s" % jobId
        self.jobIds.append(job_id)
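
A hypothetical call to this method (the manager setup, paths and file names below are illustrative only; setWorkingDir and setLogFileDir are the setters referenced by the error messages above) could look like this:

# Hypothetical usage sketch for submitJob; all concrete values are made up.
manager.setWorkingDir('/home/user/CMSSW_BASE/src')
manager.setLogFileDir('/home/user/jobs/logs')
manager.submitJob(
    inputFiles=['input_1.root', 'input_2.root'],
    executable='analyze_tth',
    cfgFile='analyze_tth_job1_cfg.py',
    outputFilePath='/hdfs/local/user/output',
    outputFiles=['histograms_job1.root'],
    skipIfOutputFileExists=True,
)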
Example #41
0
    def check_job_completion(self,
                             jobsId_list,
                             default_completion=Status.completed):
        completion = {
            k: JobCompletion(status=default_completion)
            for k in jobsId_list
        }

        # If the input list is empty, just return here (we don't want to mess up the subprocess commands here)
        if not completion:
            return completion

        # Set a delimiter, which distinguishes entries b/w different jobs
        delimiter = ','

        # First, let's try with sacct; explanation:
        # 1) sacct -X -P -n -o JobID,ExitCode,DerivedExitCode,State
        #      Shows job IDs, exit codes and states of all submitted, running and finished jobs, one line per job
        #        a) -X -- shows cumulative statistics of each job (has no effect here, though)
        #        b) -P -- output will be '|' delimited without a '|' at the end
        #        c) -n -- omit header
        #        d) -o JobID,ExitCode,DerivedExitCode,State -- output format
        #        e) -S {datetime} -- look only for jobs submitted after {datetime}
        #        f) -j {jobs} -- filter out only the relevant jobs by their job ID (comma-separated list)
        # 2) sed ':a;N;$!ba;s/\\n/{delimiter}/g'
        #      Put all entries on one line, delimited by {delimiter} (otherwise the logs are hard to read)
        sacct_cmd = "sacct -X -P -n -o JobID,ExitCode,DerivedExitCode,State -S {datetime} -j {jobs} | " \
                    "sed ':a;N;$!ba;s/\\n/{delimiter}/g'".format(
          datetime  = self.datetime,
          jobs      = ','.join(jobsId_list),
          delimiter = delimiter,
        )
        sacct_out, sacct_err = run_cmd(sacct_cmd,
                                       do_not_log=not self.log_completion,
                                       return_stderr=True)
        if not sacct_err and sacct_out:
            # The output of sacct contains one line per job, each line has pipe-separated fields the order of which
            # is defined in the command that issued the output
            lines = sacct_out.split(delimiter)
            for line in lines:
                JobID, ExitCode, DerivedExitCode, State = line.split('|')
                if JobID in completion:
                    completion[JobID] = JobCompletion(
                        status=Status.classify_error(ExitCode, DerivedExitCode,
                                                     State),
                        exit_code=ExitCode,
                        derived_exit_code=DerivedExitCode,
                        state=State,
                    )
            return completion
        else:
            # Likely returned along the lines of (due to heavy load on the cluster since SQL DB is overloaded):
            # sacct: error: Problem talking to the database: Connection refused
            logging.info('sacct currently unavailable: %s' % sacct_err)

        # Let's try with scontrol if the sacct commands failed
        # scontrol doesn't have an option to take a list of Job IDs as an argument; thus, we have to grep the job IDs
        # Explanation:
        # 1) scontrol show -od job
        #      Prints out everything about running or recently finished jobs
        #        a) -o -- prints information one line per record
        #        b) -d -- includes more detailed information about the job
        #        c) job -- prints all jobs (it's possible to get information about other units like nodes and clusters)
        # 2) grep '{jobs}'
        #      Filter out jobs by their job ID (by concatenating the list with escaped regex OR operator '|')
        # 3) sed ':a;N;$!ba;s/\\n/{delimiter}/g'
        #      Put all the result on one line, where each record is delimited by {delimiter}
        scontrol_cmd = "scontrol show -od job | grep '{jobs}' | sed ':a;N;$!ba;s/\\n/{delimiter}/g'".format(
            jobs='\\|'.join(jobsId_list),
            delimiter=delimiter,
        )
        scontrol_out, scontrol_err = run_cmd(
            scontrol_cmd,
            do_not_log=not self.log_completion,
            return_stderr=True)
        if not scontrol_err and scontrol_out:
            # The output of scontrol contains one entry per line, each line contains a space-delimited key-value pairs,
            # whereas the keys and values are separated by an equation sign
            # Although the keys do not contain any spaces, the values might, so we have to take care of that
            lines = scontrol_out.split(delimiter)
            for line in lines:
                line_dict = {}
                line_split_eq_spaces = map(lambda x: x.split(),
                                           line.split('='))
                for i in range(len(line_split_eq_spaces) - 1):
                    k = line_split_eq_spaces[i]
                    v = line_split_eq_spaces[i + 1]
                    line_dict[k[-1]] = ' '.join(
                        v[:-1] if i != len(line_split_eq_spaces) - 2 else v)
                if 'JobId' not in line_dict:
                    print("Skipping line = '%s'" % line)
                    continue
                JobId = line_dict['JobId']
                if JobId in completion:
                    completion[JobId] = JobCompletion(
                        status=Status.classify_error(
                            line_dict['ExitCode'],
                            line_dict['DerivedExitCode'],
                            line_dict['JobState'],
                        ),
                        exit_code=line_dict['ExitCode'],
                        derived_exit_code=line_dict['DerivedExitCode'],
                        state=line_dict['JobState'])
            return completion
        else:
            # scontrol probably returned something like:
            # slurm_load_jobs error: Invalid job id specified
            # Probably because too much time has passed between the job's completion and this exit-status check
            logging.info('scontrol has errors: %s' % scontrol_err)

        # scontrol might still fail if too much time has passed since the jobs completed (the metadata about each
        # job is cached only for a certain period of time, the length of which I don't know at the moment)
        # None of the SLURM commands work; let's just say that the job completed successfully
        logging.error(
            "Cannot tell if the job has completed successfully or not!")
        return completion
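
check_job_completion only reports a per-job status; the caller still has to decide when to stop waiting. A minimal polling sketch around it (the poll interval, the manager/Status names and the rule "anything not completed counts as still running" are assumptions, not taken from the original class) could look like this:

import logging
import time

def wait_until_finished(manager, job_ids, poll_interval=60):
    # Hypothetical polling loop; poll_interval and the completion rule are assumed.
    remaining = list(job_ids)
    while remaining:
        completion = manager.check_job_completion(remaining)
        remaining = [
            job_id for job_id, job in completion.items()
            if job.status != Status.completed
        ]
        if remaining:
            logging.info(
                "Waiting for sbatch to finish (%d job(s) still left) ..." %
                len(remaining))
            time.sleep(poll_interval)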