Example #1
0
def main(iargs=None):
    """
    Increase the walltime of a job file that failed with a timeout error.

    Reads the current walltime from the given job file, scales it by a
    factor of 1.2, and writes the increased value back into the file.

    :param iargs: optional list of CLI arguments (defaults to sys.argv)
    """
    parser = argparse.ArgumentParser(description='CLI Parser')
    arg_group = parser.add_argument_group('General options:')
    arg_group.add_argument(
        'job_file_name',
        help='The job file that failed with a timeout error.\n')

    inps = parser.parse_args(args=iargs)

    current_walltime = putils.extract_walltime_from_job_file(
        inps.job_file_name)
    increased_walltime = putils.multiply_walltime(current_walltime, factor=1.2)
    putils.replace_walltime_in_job_file(inps.job_file_name, increased_walltime)
Example #2
0
def main(iargs=None):
    """
    Summarize job durations and service units billed for a processing run.

    Collects run_*.o stdout files from the run_files directory, queries
    SLURM's `sacct` for each job's node count and timing, prints a per-job
    summary plus totals, and saves the summary to $HOME/job_summaries.
    Returns early (None) when not running on a recognized TACC SLURM host.

    :param iargs: optional list of CLI arguments (defaults to sys.argv)
    :return: None
    """

    parser = argparse.ArgumentParser(
        description='Utility to summarize job times and service units billed',
        formatter_class=argparse.RawTextHelpFormatter,
        epilog=EXAMPLE)
    parser.add_argument(
        'custom_template_file',
        metavar="FILE",
        default='None',
        nargs='?',
        help='template file to use [default: working directory]')
    parser.add_argument('--local',
                        dest='local_flag',
                        action='store_true',
                        default=False,
                        help='for current (local) directory')

    inps = parser.parse_args(args=iargs)

    try:
        inps = putils.create_or_update_template(inps)
        run_files_dir = inps.work_dir + '/run_files'
    except Exception:
        # Best-effort fallback: infer work_dir and project_name from the cwd
        # when no usable template is available.
        cwd = os.getcwd()
        if 'run_files' in os.path.basename(cwd):
            inps.work_dir = os.path.dirname(cwd)
            run_files_dir = cwd
        else:
            inps.work_dir = cwd
            run_files_dir = cwd + '/run_files'
        inps.project_name = os.path.basename(inps.work_dir)

    # Stdout files carry the numeric job id just before the '.o' suffix.
    run_stdout_files = natsorted(
        glob.glob(run_files_dir + '/run_*_*_[0-9][0-9][0-9][0-9]*.o') +
        glob.glob(run_files_dir + '/*/run_*_*_[0-9][0-9][0-9][0-9]*.o'))

    if len(run_stdout_files) == 0:
        # Alternate layout: stdout files collected under stdout_run_* dirs.
        run_stdout_files = natsorted(
            glob.glob(run_files_dir + '/stdout_run_*/run_*.o'))

    job_id_list = []

    # Burst count normalizes elapsed time to time-per-burst; default to 1
    # when no geom_reference burst files are present.
    bursts = glob.glob(inps.work_dir + '/geom_reference/*/hgt*rdr')
    number_of_bursts = len(bursts) or 1

    out_lines = []
    string = 'run_files_dir:  ' + run_files_dir
    print(string)
    out_lines.append(string)
    text = 'Number of bursts: ' + str(number_of_bursts)
    string = '{:32} {:1}'.format(
        text, "  NNodes  Timelimit   Reserved    Elapsed  Time_per_burst")
    print(string)
    out_lines.append(string)

    num_nodes_list = []
    wall_time_list = []
    reserved_time_list = []
    elapsed_time_list = []

    hostname = subprocess.Popen(
        "hostname -f", shell=True,
        stdout=subprocess.PIPE).stdout.read().decode("utf-8")

    # sacct queries below only work on the recognized TACC SLURM systems.
    scheduler = None
    for platform in ['frontera', 'stampede2', 'comet']:
        if platform in hostname:
            scheduler = 'SLURM'
            break
    if scheduler != 'SLURM':
        print('Not on TACC system - return from summarize_job_run_times.py')
        return None

    for fname in run_stdout_files:
        job_id = os.path.basename(fname).split('.o')[0].split('_')[-1]

        command = 'sacct --format=NNodes,Timelimit,reserved,elapsed -j ' + job_id

        process = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   shell=True)
        stdout, stderr = process.communicate()
        try:
            # Lines 0-1 are sacct headers; line 2 is the job's data row.
            out = stdout.splitlines()[2]
        except IndexError:
            # sacct returned no data row for this job id; skip it.
            continue
        decoded = out.decode('utf-8')
        num_nodes, wall_time, reserved_time, elapsed_time = \
            decoded.split()[:4]

        time_per_burst = putils.multiply_walltime(elapsed_time,
                                                  factor=1 / number_of_bursts)

        string = '{:32} {:1}  {:1}'.format(
            '_'.join(os.path.basename(fname).split('_')[0:-1]),
            decoded, time_per_burst)
        print(string)
        out_lines.append(string)

        num_nodes_list.append(num_nodes)
        wall_time_list.append(wall_time)
        reserved_time_list.append(reserved_time)
        elapsed_time_list.append(elapsed_time)

    reserved_time_sum = putils.sum_time(reserved_time_list)
    elapsed_time_sum = putils.sum_time(elapsed_time_list)
    total_time = putils.sum_time([reserved_time_sum, elapsed_time_sum])

    service_units = calculate_service_units(num_nodes_list, elapsed_time_list)

    if os.path.exists('run_files/rerun.log'):
        # Append the rerun history (timeout re-submissions) to the summary.
        with open('run_files/rerun.log', mode='r') as file:
            rerun_log = file.read()
        print('\n' + rerun_log)
        out_lines.append('\n' + rerun_log)

    string = '\nTotal reserved (pending), elapsed time: ' + reserved_time_sum + ' ' + elapsed_time_sum
    print(string)
    out_lines.append(string)
    string = 'Total time:                             ' + total_time
    print(string)
    out_lines.append(string)
    string = 'Service units:                          ' + str(
        round(service_units, 1))
    print(string)
    out_lines.append(string)
    string = ' '
    print(string)
    out_lines.append(string)

    home_dir = os.getenv('HOME')
    save_job_run_times_summary(home_dir + '/job_summaries', out_lines,
                               inps.project_name)

    return None
Example #3
0
    def submit_and_check_job_status(self, job_files, work_dir=None):
        """
        Submit each job file and block until every job finishes, rerunning
        jobs that time out with a 20% longer walltime.

        On SLURM, each job's state is polled via `sacct` once per minute:
        COMPLETED jobs pass, TIMEOUT jobs get their walltime multiplied by
        1.2 and are resubmitted recursively, and any other terminal state
        raises. On other schedulers, waits for each job's .o output file to
        appear instead. Finally scans the jobs' .e error files for known
        error keywords and raises if any are found.

        :param job_files: list of job file names to submit (relative to
            work_dir).
        :param work_dir: the directory to check outputs and error files of job
        :raises RuntimeError: if a job terminates in error or its error
            file contains an error keyword.
        :return: None
        """

        job_numbers = []
        jobs_out = []  # expected stdout (.o) path per submitted job
        jobs_err = []  # expected stderr (.e) path per submitted job

        for job_file_name in job_files:
            # Job scripts must be executable before submission.
            os.system('chmod +x {}'.format(
                os.path.join(work_dir, job_file_name)))
            job_num = self.submit_single_job(job_file_name, work_dir)
            out = os.path.join(
                work_dir,
                "{}_{}.o".format(job_file_name.split('.')[0], job_num))
            err = os.path.join(
                work_dir,
                "{}_{}.e".format(job_file_name.split('.')[0], job_num))
            job_numbers.append(job_num)
            jobs_out.append(out)
            jobs_err.append(err)

        i = 0
        wait_time_sec = 60
        total_wait_time_min = 0
        # Give the scheduler a moment to register the new jobs.
        time.sleep(2)

        if self.scheduler == 'SLURM':
            rerun_job_files = []
            job_status_file = os.path.join(work_dir, 'job_status')
            for job_number, job_file_name in zip(job_numbers, job_files):
                if not job_number == 'None':
                    job_stat = 'wait'
                    while job_stat == 'wait':
                        # Dump the job's State column to a scratch file and
                        # read it back (line 2 holds the job's state row).
                        os.system('sacct --format="State" -j {} > {}'.format(
                            job_number, job_status_file))
                        time.sleep(2)
                        with open(job_status_file, 'r') as stat_file:
                            status = stat_file.readlines()
                            if len(status) < 3:
                                # sacct output not ready yet; poll again.
                                continue
                        if 'PENDING' in status[2] or 'RUNNING' in status[2]:
                            print(
                                "Waiting for job {} output file after {} minutes"
                                .format(job_file_name, total_wait_time_min))
                            total_wait_time_min += wait_time_sec / 60
                            # Sleep the remainder of the 1-minute poll cycle
                            # (2 s already spent after the sacct call above).
                            time.sleep(wait_time_sec - 2)
                            i += 1
                        elif 'COMPLETED' in status[2]:
                            job_stat = 'complete'
                        elif 'TIMEOUT' in status[2]:
                            # Collect for resubmission with a longer walltime.
                            job_stat = 'timeout'
                            rerun_job_files.append(job_file_name)
                        else:
                            job_stat = 'failed'
                            raise RuntimeError(
                                'Error: {} job was terminated with Error'.
                                format(job_file_name))

            if len(rerun_job_files) > 0:
                for job_file_name in rerun_job_files:
                    # Bump the walltime by 20% in the job file itself, then
                    # log the change before resubmitting.
                    wall_time = putils.extract_walltime_from_job_file(
                        job_file_name)
                    new_wall_time = putils.multiply_walltime(wall_time,
                                                             factor=1.2)
                    putils.replace_walltime_in_job_file(
                        job_file_name, new_wall_time)

                    dateStr = datetime.strftime(datetime.now(), '%Y%m%d:%H-%M')
                    string = dateStr + ': re-running: ' + os.path.basename(
                        job_file_name
                    ) + ': ' + wall_time + ' --> ' + new_wall_time

                    # NOTE(review): writelines() on a str writes it without a
                    # trailing newline — entries may run together; verify.
                    with open(self.work_dir + '/run_files/rerun.log',
                              'a') as rerun:
                        rerun.writelines(string)

                # Recurse to submit and monitor the rerun set.
                self.submit_and_check_job_status(rerun_job_files,
                                                 work_dir=self.work_dir)

        else:
            # Non-SLURM schedulers: no sacct, so wait for each job's stdout
            # file to appear on disk instead.
            for out, job_file_name in zip(jobs_out, job_files):
                if not 'None' in out:
                    while not os.path.exists(out):
                        print(
                            "Waiting for job {} output file after {} minutes".
                            format(job_file_name, total_wait_time_min))
                        total_wait_time_min += wait_time_sec / 60
                        time.sleep(wait_time_sec)
                        # i += 1

        # Scan every job's error files for known failure keywords; any hit
        # aborts the run.
        for job_file_name in job_files:
            error_files = glob.glob(job_file_name.split('.')[0] + '*.e')
            for errfile in error_files:
                job_exit = [
                    check_words_in_file(errfile, 'Segmentation fault'),
                    check_words_in_file(errfile, 'Aborted'),
                    check_words_in_file(errfile, 'ERROR'),
                    check_words_in_file(errfile, 'Error')
                ]
                if np.array(job_exit).any():
                    raise RuntimeError(
                        'Error terminating job: {}'.format(job_file_name))

        return