def main(iargs=None):
    parser = argparse.ArgumentParser(description='CLI Parser')
    arg_group = parser.add_argument_group('General options:')
    arg_group.add_argument('job_file_name',
                           help='The job file that failed with a timeout error.\n')
    inps = parser.parse_args(args=iargs)

    # Read the current wall time from the job file, increase it by 20%
    # and write it back so the job can be resubmitted.
    wall_time = putils.extract_walltime_from_job_file(inps.job_file_name)
    new_wall_time = putils.multiply_walltime(wall_time, factor=1.2)
    putils.replace_walltime_in_job_file(inps.job_file_name, new_wall_time)
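
# Illustrative sketch only (not the putils implementation): multiply_walltime is assumed
# to take an HH:MM:SS wall-time string, scale it by the given factor, and return the same
# format. A minimal stand-alone version, for reference, could look like this; the helper
# name and the rounding behavior are assumptions.
def _scale_walltime_sketch(wall_time, factor=1.2):
    """Hypothetical helper: scale an HH:MM:SS wall time by a factor."""
    hours, minutes, seconds = (int(x) for x in wall_time.split(':'))
    total_seconds = int((hours * 3600 + minutes * 60 + seconds) * factor)
    return '{:02d}:{:02d}:{:02d}'.format(total_seconds // 3600,
                                         (total_seconds % 3600) // 60,
                                         total_seconds % 60)
# Example: _scale_walltime_sketch('02:00:00', factor=1.2) -> '02:24:00'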
def main(iargs=None): """ summarize job durations """ parser = argparse.ArgumentParser() parser = argparse.ArgumentParser( description='Utility to summarize job times and service units billed', formatter_class=argparse.RawTextHelpFormatter, epilog=EXAMPLE) parser.add_argument( 'custom_template_file', metavar="FILE", default='None', nargs='?', help='template file to use [default: working directory]') parser.add_argument('--local', dest='local_flag', action='store_true', default=False, help='for current (local) directory') inps = parser.parse_args(args=iargs) try: inps = putils.create_or_update_template(inps) run_files_dir = inps.work_dir + '/run_files' except: cwd = os.getcwd() if 'run_files' in os.path.basename(cwd): inps.work_dir = os.path.dirname(cwd) inps.project_name = os.path.basename(inps.work_dir) run_files_dir = cwd else: inps.work_dir = cwd inps.project_name = os.path.basename(inps.work_dir) run_files_dir = cwd + '/run_files' run_stdout_files = glob.glob( run_files_dir + '/run_*_*_[0-9][0-9][0-9][0-9]*.o') + glob.glob( run_files_dir + '/*/run_*_*_[0-9][0-9][0-9][0-9]*.o') run_stdout_files = natsorted(run_stdout_files) #run_stdout_files2 = glob.glob(run_files_dir + '/stdout_run_*/run_*.o') #run_stdout_files2 = natsorted(run_stdout_files2) #run_stdout_files.extend(run_stdout_files2) if len(run_stdout_files) == 0: run_stdout_files = glob.glob(run_files_dir + '/stdout_run_*/run_*.o') run_stdout_files = natsorted(run_stdout_files) job_id_list = [] bursts = glob.glob(inps.work_dir + '/geom_reference/*/hgt*rdr') number_of_bursts = len(bursts) if len(bursts) == 0: number_of_bursts = 1 out_lines = [] string = 'run_files_dir: ' + run_files_dir print(string) out_lines.append(string) text = 'Number of bursts: ' + str(number_of_bursts) string = '{:32} {:1}'.format( text, " NNodes Timelimit Reserved Elapsed Time_per_burst") print(string) out_lines.append(string) num_nodes_list = [] wall_time_list = [] reserved_time_list = [] elapsed_time_list = [] hostname = subprocess.Popen( "hostname -f", shell=True, stdout=subprocess.PIPE).stdout.read().decode("utf-8") scheduler = None for platform in ['frontera', 'stampede2', 'comet']: if platform in hostname: scheduler = 'SLURM' break if not scheduler == 'SLURM': print('Not on TACC system - return from summarize_job_run_times.py') return None for fname in run_stdout_files: job_id = os.path.basename(fname).split('.o')[0].split('_')[-1] command = 'sacct --format=NNodes,Timelimit,reserved,elapsed -j ' + job_id process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) stdout, stderr = process.communicate() try: out = stdout.splitlines()[2] except: continue num_nodes = out.decode('utf-8').split()[0] wall_time = out.decode('utf-8').split()[1] reserved_time = out.decode('utf-8').split()[2] elapsed_time = out.decode('utf-8').split()[3] time_per_burst = putils.multiply_walltime(elapsed_time, factor=1 / number_of_bursts) string = '{:32} {:1} {:1}'.format( '_'.join(os.path.basename(fname).split('_')[0:-1]), out.decode('utf-8'), time_per_burst) print(string) out_lines.append(string) num_nodes_list.append(num_nodes) wall_time_list.append(wall_time) reserved_time_list.append(reserved_time) elapsed_time_list.append(elapsed_time) reserved_time_sum = putils.sum_time(reserved_time_list) elapsed_time_sum = putils.sum_time(elapsed_time_list) total_time = putils.sum_time([reserved_time_sum, elapsed_time_sum]) service_units = calculate_service_units(num_nodes_list, elapsed_time_list) if os.path.exists('run_files/rerun.log'): file = 
open('run_files/rerun.log', mode='r') rerun_log = file.read() print('\n' + rerun_log) out_lines.append('\n' + rerun_log) string = '\nTotal reserved (pending), elapsed time: ' + reserved_time_sum + ' ' + elapsed_time_sum print(string) out_lines.append(string) string = 'Total time: ' + total_time print(string) out_lines.append(string) string = 'Service units: ' + str( round(service_units, 1)) print(string) out_lines.append(string) string = ' ' print(string) out_lines.append(string) home_dir = os.getenv('HOME') save_job_run_times_summary(home_dir + '/job_summaries', out_lines, inps.project_name) return None
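
# The summary above relies on calculate_service_units(), which is defined elsewhere in
# this module. A minimal sketch is given below, under the assumption that one service
# unit corresponds to one node-hour (TACC-style accounting); the project's real function
# may apply system-specific charge rates and handle day-prefixed elapsed times
# ("D-HH:MM:SS"), so treat this as illustrative only.
def _calculate_service_units_sketch(num_nodes_list, elapsed_time_list):
    """Hypothetical: sum node-hours over all jobs (elapsed times given as HH:MM:SS)."""
    service_units = 0.0
    for num_nodes, elapsed in zip(num_nodes_list, elapsed_time_list):
        hours, minutes, seconds = (int(x) for x in elapsed.split(':'))
        service_units += int(num_nodes) * (hours + minutes / 60 + seconds / 3600)
    return service_units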
def submit_and_check_job_status(self, job_files, work_dir=None):
    """
    Writes a single job file for launcher to submit as an array. This is used to submit
    jobs in SLURM or SGE where launcher is available (compare to submit_jobs_individually,
    used on pegasus with LSF).
    :param job_files: list of job files containing the tasks that we are submitting.
    :param work_dir: the directory in which to check the jobs' output and error files.
    :return: None
    """
    job_numbers = []
    jobs_out = []
    jobs_err = []

    # Submit every job file and remember its job number and expected .o/.e file names.
    for job_file_name in job_files:
        os.system('chmod +x {}'.format(os.path.join(work_dir, job_file_name)))
        job_num = self.submit_single_job(job_file_name, work_dir)
        out = os.path.join(work_dir, "{}_{}.o".format(job_file_name.split('.')[0], job_num))
        err = os.path.join(work_dir, "{}_{}.e".format(job_file_name.split('.')[0], job_num))
        job_numbers.append(job_num)
        jobs_out.append(out)
        jobs_err.append(err)

    i = 0
    wait_time_sec = 60
    total_wait_time_min = 0

    time.sleep(2)

    if self.scheduler == 'SLURM':
        rerun_job_files = []
        job_status_file = os.path.join(work_dir, 'job_status')

        # Poll sacct until each job has completed, timed out, or failed.
        for job_number, job_file_name in zip(job_numbers, job_files):
            if not job_number == 'None':
                job_stat = 'wait'
                while job_stat == 'wait':
                    os.system('sacct --format="State" -j {} > {}'.format(job_number, job_status_file))
                    time.sleep(2)
                    with open(job_status_file, 'r') as stat_file:
                        status = stat_file.readlines()
                    if len(status) < 3:
                        continue
                    if 'PENDING' in status[2] or 'RUNNING' in status[2]:
                        print("Waiting for job {} output file after {} minutes".format(
                            job_file_name, total_wait_time_min))
                        total_wait_time_min += wait_time_sec / 60
                        time.sleep(wait_time_sec - 2)
                        i += 1
                    elif 'COMPLETED' in status[2]:
                        job_stat = 'complete'
                    elif 'TIMEOUT' in status[2]:
                        job_stat = 'timeout'
                        rerun_job_files.append(job_file_name)
                    else:
                        job_stat = 'failed'
                        raise RuntimeError('Error: {} job was terminated with Error'.format(job_file_name))

        # Jobs that hit the wall-time limit are resubmitted with a 20% longer wall time.
        if len(rerun_job_files) > 0:
            for job_file_name in rerun_job_files:
                wall_time = putils.extract_walltime_from_job_file(job_file_name)
                new_wall_time = putils.multiply_walltime(wall_time, factor=1.2)
                putils.replace_walltime_in_job_file(job_file_name, new_wall_time)
                dateStr = datetime.strftime(datetime.now(), '%Y%m%d:%H-%M')
                string = dateStr + ': re-running: ' + os.path.basename(job_file_name) + \
                         ': ' + wall_time + ' --> ' + new_wall_time
                with open(self.work_dir + '/run_files/rerun.log', 'a') as rerun:
                    rerun.writelines(string)

            self.submit_and_check_job_status(rerun_job_files, work_dir=self.work_dir)

    else:
        # Other schedulers: wait for the output files to appear on disk.
        for out, job_file_name in zip(jobs_out, job_files):
            if not 'None' in out:
                while not os.path.exists(out):
                    print("Waiting for job {} output file after {} minutes".format(
                        job_file_name, total_wait_time_min))
                    total_wait_time_min += wait_time_sec / 60
                    time.sleep(wait_time_sec)
                    # i += 1

    # Scan the error files for known failure signatures.
    for job_file_name in job_files:
        error_files = glob.glob(job_file_name.split('.')[0] + '*.e')
        for errfile in error_files:
            job_exit = [check_words_in_file(errfile, 'Segmentation fault'),
                        check_words_in_file(errfile, 'Aborted'),
                        check_words_in_file(errfile, 'ERROR'),
                        check_words_in_file(errfile, 'Error')]
            if np.array(job_exit).any():
                raise RuntimeError('Error terminating job: {}'.format(job_file_name))

    return
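
# The error-file scan above uses check_words_in_file(), which is defined elsewhere in the
# package. A minimal sketch, assuming it simply reports whether a word occurs anywhere in
# the file (the real helper may behave differently):
def _check_words_in_file_sketch(errfile, word):
    """Hypothetical: return True if 'word' appears in any line of 'errfile'."""
    with open(errfile, 'r') as f:
        return any(word in line for line in f)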