def check_ping_files(self, print_more_warnings=False, print_details=False,
                     fix_problems=False):
    run_problems = list()
    queue_problems = list()
    check_queue = True

    try:
        stat_output = subprocess.check_output(
            [self.cc('stat')], stderr=subprocess.STDOUT)
    except (KeyError, OSError, subprocess.CalledProcessError):
        # we don't have a stat tool here, if subprocess.CalledProcessError
        # is raised
        check_queue = False

    if print_more_warnings and not check_queue:
        print("Attention, we cannot check stale queued ping files because "
              "this host does not have %s." % self.cc('stat'))

    running_jids = set()

    if check_queue:
        for line in stat_output.split("\n"):
            try:
                jid = int(line.strip().split(' ')[0])
                running_jids.add(str(jid))
            except ValueError:
                # this is not a JID
                pass

    now = datetime.datetime.now()
    for task in self.all_tasks_topologically_sorted:
        exec_ping_file = task.get_run().get_executing_ping_file()
        queued_ping_file = task.get_run().get_queued_ping_file()
        if os.path.exists(exec_ping_file):
            info = yaml.load(open(exec_ping_file, 'r'))
            start_time = info['start_time']
            last_activity = datetime.datetime.fromtimestamp(
                abstract_step.AbstractStep.fsc.getmtime(exec_ping_file))
            last_activity_difference = now - last_activity
            if last_activity_difference.total_seconds() > \
                    abstract_step.AbstractStep.PING_TIMEOUT:
                run_problems.append((task, exec_ping_file,
                                     last_activity_difference,
                                     last_activity - start_time))
        if os.path.exists(queued_ping_file) and check_queue:
            info = yaml.load(open(queued_ping_file, 'r'))
            if not str(info['job_id']) in running_jids:
                queue_problems.append((task, queued_ping_file,
                                       info['submit_time']))

    show_hint = False

    if len(run_problems) > 0:
        show_hint = True
        label = "Warning: There are %d stale run ping files." \
                % len(run_problems)
        print(label)
        if print_details:
            print('-' * len(label))
            run_problems = sorted(run_problems, key=itemgetter(2, 3),
                                  reverse=True)
            for problem in run_problems:
                task = problem[0]
                path = problem[1]
                last_activity_difference = problem[2]
                ran_for = problem[3]
                print("dead since %13s, ran for %13s: %s" % (
                    misc.duration_to_str(last_activity_difference),
                    misc.duration_to_str(ran_for), task))
            print("")

    if len(queue_problems) > 0:
        show_hint = True
        label = "Warning: There are %d tasks marked as queued, but they " \
                "do not seem to be queued." % len(queue_problems)
        print(label)
        if print_details:
            print('-' * len(label))
            queue_problems = sorted(queue_problems, key=itemgetter(2),
                                    reverse=True)
            for problem in queue_problems:
                task = problem[0]
                path = problem[1]
                start_time = problem[2]
                print("submitted at %13s: %s" % (start_time, task))
            print("")

    if fix_problems:
        all_problems = run_problems
        all_problems.extend(queue_problems)
        for problem in all_problems:
            path = problem[1]
            print("Now deleting %s..." % path)
            os.unlink(path)

    if show_hint:
        if print_more_warnings and not print_details or not fix_problems:
            print("Hint: Run 'uap %s fix-problems --details' to see the "
                  "details." % self.config_file_name)
        if not fix_problems:
            print("Hint: Run 'uap %s fix-problems --srsly' to fix these "
                  "problems (that is, delete all problematic ping files)."
                  % self.config_file_name)
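
# NOTE: illustrative sketch, not part of uap. The staleness test above
# compares a ping file's mtime against a timeout; a self-contained version
# of that idea looks like this, with a hypothetical timeout of 300 seconds
# standing in for AbstractStep.PING_TIMEOUT and plain os.path.getmtime
# standing in for the cached fsc.getmtime.
import datetime
import os


def is_stale(ping_path, timeout_seconds=300):
    """Return the inactivity timedelta if ping_path was not touched within
    timeout_seconds, else None."""
    last_activity = datetime.datetime.fromtimestamp(
        os.path.getmtime(ping_path))
    inactivity = datetime.datetime.now() - last_activity
    if inactivity.total_seconds() > timeout_seconds:
        return inactivity
    return None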
def main(args):
    def exit(signum, frame):
        sys.exit(1)
    signal.signal(signal.SIGPIPE, exit)

    if args.sources:
        # print all sources (i. e. instances of AbstractSourceStep)
        args.no_tool_checks = True
        pipeline.Pipeline(arguments=args).print_source_runs()

    elif args.job_ids:
        args.no_tool_checks = True
        p = pipeline.Pipeline(arguments=args)
        ids = p.get_cluster_job_ids()
        print(' '.join(ids))
        return

    elif args.run and not args.details:
        # print run infos of one or more specific tasks
        args.no_tool_checks = True
        p = pipeline.Pipeline(arguments=args)
        for task in p.get_task_with_list():
            print('## %s\n' % task)
            report = task.get_run().as_dict(commands=True)
            print(yaml.dump(report, Dumper=misc.UAPDumper,
                            default_flow_style=False))

    elif args.graph:
        p = pipeline.Pipeline(arguments=args)
        step_order = p.topological_step_order
        indents = [0 for _ in step_order]
        for index, line in enumerate(step_order):
            step_name = step_order[index]
            child_count = len(p.steps[step_name].children_step_names)
            indent = child_count * 2
            for _ in range(index + 1, len(step_order)):
                indents[_] += indent

        lines = list()
        for index, step_name in enumerate(step_order):
            lines.append(list(' ' * indents[index]))

        # draw horizontal line parts
        for index, step_name in enumerate(step_order):
            child_order = [_ for _ in step_order
                           if _ in p.steps[step_name].children_step_names]
            for child_index, child in enumerate(child_order):
                x0 = indents[index] + 1 + child_index * 2
                y = step_order.index(child)
                x1 = indents[y]
                for x in range(x0, x1):
                    lines[y][x] = "─"
                lines[y][x0 - 1] = "└"

        # draw vertical line parts
        for index, step_name in enumerate(step_order):
            child_order = [_ for _ in step_order
                           if _ in p.steps[step_name].children_step_names]
            for child_index, child in enumerate(child_order):
                x = indents[index] + child_index * 2
                y0 = index + 1
                y1 = step_order.index(child)
                for y in range(y0, y1):
                    lines[y][x] = "│"

        # write step names and states
        for index, line in enumerate(lines):
            original_step_name_label = ''
            step = p.steps[step_order[index]]
            if step.get_step_name() != step.get_step_type():
                original_step_name_label = ' (%s)' % step.get_step_type()
            if args.no_tool_checks:
                run_info = '%d runs' % len(step.get_runs())
            else:
                run_info = step.get_run_info_str(progress=True,
                                                 do_hash=args.hash)
            line = "%s%s%s [%s]" % (''.join(line).replace("─└", "─┴"),
                                    step.get_step_name(),
                                    original_step_name_label,
                                    run_info)
            print(line)

    elif args.details:
        p = pipeline.Pipeline(arguments=args)
        n_per_state = dict()
        tasks = p.get_task_with_list()
        for i, task in enumerate(tasks):
            len_tag = '[%d/%d] ' % (i + 1, len(tasks))
            sys.stdout.write(len_tag)
            state = task.get_task_state(do_hash=args.hash)
            n_per_state.setdefault(state, 0)
            n_per_state[state] += 1
            if state == p.states.FINISHED:
                message = '%s is finished' % task
                if args.hash:
                    message += ' and sha256sum(s) correct'
                print(message)
            elif state == p.states.READY:
                print('%s has all inputs and is ready to start running'
                      % task)
            elif state == p.states.EXECUTING:
                exec_ping_file = task.get_run().get_executing_ping_file()
                try:
                    with open(exec_ping_file, 'r') as buff:
                        info = yaml.load(buff, Loader=yaml.FullLoader)
                except IOError as e:
                    if os.path.exists(exec_ping_file):
                        raise e
                    else:
                        print('%s is executing but seems to have stopped '
                              'just now' % task)
                else:
                    print('%s is executing since %s'
                          % (task, info['start_time']))
            elif state == p.states.QUEUED:
                queued_ping_file = task.get_run().get_queued_ping_file()
                try:
                    with open(queued_ping_file, 'r') as buff:
                        info = yaml.load(buff, Loader=yaml.FullLoader)
                except IOError as e:
                    if os.path.exists(queued_ping_file):
                        raise e
                    else:
                        print('%s is queued but seems to have stopped '
                              'just now' % task)
                else:
                    print('%s is queued with id %s since %s'
                          % (task, info['cluster job id'],
                             info['submit_time']))
            elif state == p.states.VOLATILIZED:
                print('%s was volatilized and must be re-run if the data is '
                      'needed' % task)
            elif state == p.states.WAITING:
                parents = [str(parent) for parent in task.get_parent_tasks()
                           if parent.get_task_state() != p.states.FINISHED]
                print('%s is waiting for %s' % (task, parents))
            elif state == p.states.CHANGED:
                heading = 'changes in task %s' % task
                print(heading)
                print('-' * len(len_tag + heading))
                run = task.get_run()
                anno_data = run.written_anno_data()
                has_date_change = False
                has_only_date_change = True
                if not anno_data:
                    has_only_date_change = False
                    print('No annotation file.')
                else:
                    changes = run.get_changes()
                    if changes:
                        has_only_date_change = False
                        print(yaml.dump(dict(changes), Dumper=misc.UAPDumper,
                                        default_flow_style=False))
                    for line in run.file_changes(do_hash=args.hash,
                                                 report_correct=True):
                        if ' modification date after' in line:
                            has_date_change = True
                        else:
                            has_only_date_change = False
                        print(line)
                if has_date_change and has_only_date_change:
                    print("\nThis may be fixed with "
                          "'uap %s fix-problems --file-modification-date "
                          "--srsly'." % args.config.name)
                print('')
            elif state == p.states.BAD:
                heading = 'errors of bad task %s' % task
                print(heading)
                print('-' * len(heading) + '\n')
                run = task.get_run()
                found_error = False
                anno_file = run.get_annotation_path()
                stale = run.is_stale()
                if stale:
                    print('Marked as executing but did not show activity '
                          'for %s.\n' % misc.duration_to_str(stale))
                anno_data = run.written_anno_data()
                if not anno_data:
                    print('No annotation file.')
                else:
                    failed = dict()
                    stderr = dict()
                    try:
                        procs = anno_data['pipeline_log'].get('processes', [])
                        for proc in procs:
                            if proc.get('exit_code', 0) == 0:
                                continue
                            failed[proc['name']] = {
                                'command': list2cmdline(proc['args']),
                                'exit code': proc['exit_code']
                            }
                            stderr[proc['name']] = proc['stderr_copy']['tail']
                            if isinstance(stderr[proc['name']], bytes):
                                stderr[proc['name']] = \
                                    stderr[proc['name']].decode('utf-8')
                    except KeyError as e:
                        print('The annotation file "%s" seems badly '
                              'formatted: %s\n' % (anno_file, e))
                    else:
                        time = anno_data.get('end_time', 'not available')
                        rd = anno_data.get('run', dict())
                        host = rd.get('hostname', 'not available')
                        job_id = rd.get('cluster job id', 'not available')
                        print('host: %s' % host)
                        print('time: %s' % time)
                        print('cluster job id: %s' % job_id)
                        print('')
                    if failed:
                        found_error = True
                        print('## FAILED COMMANDS ##')
                        for tool, failings in failed.items():
                            sys.stdout.write(yaml.dump(
                                {tool: failings}, Dumper=misc.UAPDumper,
                                default_flow_style=False))
                            err = stderr[tool].replace('\n', '\n ')
                            print(' stderr: %s' % err)
                    else:
                        print('No failed commands found in the annotation '
                              'file.')
                        print("Run 'uap %s fix-problems --first-error' to "
                              "investigate.\n" % args.config.name)
                    run_data = anno_data.get('run', [])
                    if 'error' in run_data:
                        found_error = True
                        print('## ERROR ##\n%s\n' % run_data['error'])
                    if not found_error:
                        print('No errors found.')
                print('')
            else:
                print('%s has unknown state "%s"' % (task, state.lower()))
            sys.stdout.flush()

        stat_counts = ["%d %s" % (n_per_state.get(state, 0), state.lower())
                       for state in p.states.order]
        print("\nruns: %d total, %s"
              % (sum(n_per_state.values()), ', '.join(stat_counts)))
        if p.states.CHANGED in n_per_state.keys():
            print("If you want to force overwrite of the changed tasks, "
                  "run\n'uap %s run-locally --force' or "
                  "'uap %s submit-to-cluster --force'."
                  % (args.config.name, args.config.name))
            print("If you want to ignore the changes and consider the tasks "
                  "finished, run\n'uap %s run-locally --ignore' or "
                  "'uap %s submit-to-cluster --ignore'."
                  % (args.config.name, args.config.name))

        # now check ping files and print some warnings and instructions if
        # something's fishy
        p.check_ping_files(
            print_more_warnings=True if args.verbose > 0 else False)

        # Now check whether we can volatilize files, but don't do it.
        p.check_volatile_files()

    else:
        # print all runs
        '''
        prints a summary of all runs, indicating whether each run is
        - ``[r]eady``
        - ``[w]aiting``
        - ``[q]ueued``
        - ``[e]xecuting``
        - ``[b]ad``
        - ``[f]inished``
        - ``[c]hanged``
        - ``[v]olatilized``
        '''
        p = pipeline.Pipeline(arguments=args)
        output = list()
        tasks_for_status = {}
        tasks = p.all_tasks_topologically_sorted
        task_iter = tqdm(
            tasks, desc='tasks',
            bar_format='{desc}:{percentage:3.0f}%|{bar:10}{r_bar}')
        try:
            for task in task_iter:
                state = task.get_task_state(do_hash=args.hash)
                tasks_for_status.setdefault(state, list())
                tasks_for_status[state].append(task)
        except BaseException:
            task_iter.close()
            raise
        for status in p.states.order:
            if status not in tasks_for_status:
                continue
            heading = "%s runs" % string.capwords(status)
            output.append(heading)
            output.append('-' * len(heading))
            if args.summarize:
                step_count = dict()
                step_order = list()
                for task in tasks_for_status[status]:
                    if not str(task.step) in step_count:
                        step_count[str(task.step)] = 0
                        step_order.append(str(task.step))
                    step_count[str(task.step)] += 1
                for step_name in step_order:
                    output.append("[%s]%4d %s"
                                  % (status.lower()[0],
                                     step_count[step_name],
                                     step_name))
            else:
                for task in tasks_for_status[status]:
                    output.append("[%s] %s" % (status[0].lower(), task))
            output.append('')
        output.append("runs: %d total, %s"
                      % (len(p.all_tasks_topologically_sorted),
                         ', '.join(["%d %s"
                                    % (len(tasks_for_status[_]), _.lower())
                                    for _ in p.states.order
                                    if _ in tasks_for_status])))
        pydoc.pager("\n".join(output))
        if p.states.CHANGED in tasks_for_status.keys() \
                or p.states.BAD in tasks_for_status.keys() \
                or p.states.WAITING in tasks_for_status.keys():
            print("\nRun 'uap %s status --details' to inspect states."
                  % args.config.name)

        # now check ping files and print some warnings and instructions if
        # something's fishy
        p.check_ping_files(
            print_more_warnings=True if args.verbose > 0 else False)

        # Now check whether we can volatilize files, but don't do it.
        p.check_volatile_files()
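
# NOTE: illustrative sketch, not part of uap. The '--summarize' branch above
# tallies tasks per step while preserving first-seen order, so the summary
# comes out in topological order; the step names below are made up.
def summarize(task_step_names):
    step_count = dict()
    step_order = list()
    for name in task_step_names:
        if name not in step_count:
            step_count[name] = 0
            step_order.append(name)
        step_count[name] += 1
    return [(name, step_count[name]) for name in step_order]


# summarize(['cutadapt', 'cutadapt', 'segemehl'])
# -> [('cutadapt', 2), ('segemehl', 1)]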
def render_pipeline_hash(log):

    def pid_hash(pid, suffix=''):
        hashtag = "%s/%s/%d/%s" % (log['step']['name'],
                                   log['run']['run_id'],
                                   pid, suffix)
        return misc.str_to_sha1(hashtag)

    def file_hash(path):
        if path in log['step']['known_paths']:
            if 'real_path' in log['step']['known_paths'][path]:
                path = log['step']['known_paths'][path]['real_path']
        return misc.str_to_sha1(path)

    hash = dict()
    hash['nodes'] = dict()
    hash['edges'] = dict()
    hash['clusters'] = dict()
    hash['graph_labels'] = dict()

    def add_file_node(path):
        if not path in log['step']['known_paths']:
            return
        if 'real_path' in log['step']['known_paths'][path]:
            path = log['step']['known_paths'][path]['real_path']
        label = log['step']['known_paths'][path]['label']
        color = '#ffffff'
        if log['step']['known_paths'][path]['type'] == 'fifo':
            color = '#c4f099'
        elif log['step']['known_paths'][path]['type'] == 'file':
            color = '#8ae234'
        elif log['step']['known_paths'][path]['type'] == 'step_file':
            color = '#97b7c8'
        if path in log['step']['known_paths']:
            if 'size' in log['step']['known_paths'][path]:
                label += "\\n%s" % misc.bytes_to_str(
                    log['step']['known_paths'][path]['size'])
        hash['nodes'][misc.str_to_sha1(path)] = {
            'label': label,
            'fillcolor': color
        }

    for proc_info in copy.deepcopy(log['pipeline_log']['processes']):
        pid = proc_info['pid']
        label = "PID %d" % pid
        name = '(unknown)'
        if 'name' in proc_info:
            name = proc_info['name']
            label = "%s" % (proc_info['name'])
        if 'writes' in proc_info['hints']:
            for path in proc_info['hints']['writes']:
                add_file_node(path)
        if 'args' in proc_info:
            stripped_args = []
            for arg in copy.deepcopy(proc_info['args']):
                if arg in log['step']['known_paths']:
                    add_file_node(arg)
                if arg in log['step']['known_paths']:
                    if log['step']['known_paths'][arg]['type'] != 'step_file':
                        arg = log['step']['known_paths'][arg]['label']
                    else:
                        arg = os.path.basename(arg)
                else:
                    if arg[0:4] != '/dev':
                        arg = os.path.basename(arg)
                    if (len(arg) > 16) and re.match('^[A-Z]+$', arg):
                        arg = "%s[...]" % arg[:16]
                stripped_args.append(arg.replace('\t', '\\t').replace(
                    '\\', '\\\\'))
            tw = textwrap.TextWrapper(
                width=50,
                break_long_words=False,
                break_on_hyphens=False)
            label = "%s" % ("\\n".join(tw.wrap(' '.join(stripped_args))))
        if 'args' in proc_info:
            output_fifo = False
            for arg in proc_info['args']:
                fifo_type = None
                if name == 'cat4m' and arg == '-o':
                    output_fifo = True
                elif name == 'dd' and arg.startswith('of='):
                    output_fifo = True
                for known_path in log['step']['known_paths'].keys():
                    if known_path in arg:
                        add_file_node(known_path)
                if name in ['cat4m', 'dd']:
                    if output_fifo:
                        fifo_type = 'output'
                    else:
                        fifo_type = 'input'
                else:
                    # we can't know whether the fifo is for input or
                    # output, first look at the hints, then use the
                    # designation (if any was given)
                    if 'reads' in proc_info['hints'] and \
                            arg in proc_info['hints']['reads']:
                        fifo_type = 'input'
                    if 'writes' in proc_info['hints'] and \
                            arg in proc_info['hints']['writes']:
                        fifo_type = 'output'
                    if fifo_type is None:
                        fifo_type = log['step']['known_paths'][arg][
                            'designation']
                if fifo_type == 'input':
                    # add edge from file to proc
                    hash['edges'][(file_hash(arg), pid_hash(pid))] = dict()
                elif fifo_type == 'output':
                    # add edge from proc to file
                    hash['edges'][(pid_hash(pid), file_hash(arg))] = dict()
        if 'writes' in proc_info['hints']:
            for path in proc_info['hints']['writes']:
                hash['edges'][(pid_hash(pid), file_hash(path))] = dict()

        # add proc
        something_went_wrong = False
        if 'signal' in proc_info:
            something_went_wrong = True
        elif 'exit_code' in proc_info:
            if proc_info['exit_code'] != 0:
                something_went_wrong = True
        else:
            something_went_wrong = True
        color = "#fce94f"
        if something_went_wrong:
            if not pid in log['pipeline_log']['ok_to_fail']:
                color = "#d5291a"
            if 'signal' in proc_info:
                label = "%s\\n(received %s%s)" % (
                    label,
                    'friendly '
                    if pid in log['pipeline_log']['ok_to_fail'] else '',
                    proc_info['signal_name'] if 'signal_name' in proc_info
                    else 'signal %d' % proc_info['signal'])
            elif 'exit_code' in proc_info:
                if proc_info['exit_code'] != 0:
                    label = "%s\\n(failed with exit code %d)" % (
                        label, proc_info['exit_code'])
            else:
                label = "%s\\n(no exit code)" % label

        if 'max' in log['pipeline_log']['process_watcher']:
            if pid in log['pipeline_log']['process_watcher']['max']:
                label += "\\n%1.1f%% CPU, %s RAM (%1.1f%%)" % (
                    log['pipeline_log']['process_watcher']['max'][pid]
                    ['cpu_percent'],
                    misc.bytes_to_str(
                        log['pipeline_log']['process_watcher']['max'][pid]
                        ['rss']),
                    log['pipeline_log']['process_watcher']['max'][pid]
                    ['memory_percent'])

        hash['nodes'][pid_hash(pid)] = {
            'label': label,
            'fillcolor': color
        }

        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                if ('exit_code' in proc_info[key]) and \
                        (proc_info[key]['exit_code'] == 0) and \
                        ('length' in proc_info[key]) and \
                        (proc_info[key]['length'] == 0) and \
                        (not 'sink_full_path' in proc_info[key]):
                    # skip this stdout/stderr box if it leads to nothing
                    continue
                size_label = '(empty)'
                if ('length' in proc_info[key]) and \
                        (proc_info[key]['length'] > 0):
                    speed = float(proc_info[key]['length']) / (
                        proc_info[key]['end_time'] -
                        proc_info[key]['start_time']).total_seconds()
                    speed_label = "%s/s" % misc.bytes_to_str(speed)
                    size_label = "%s / %s lines (%s)" % (
                        misc.bytes_to_str(proc_info[key]['length']),
                        "{:,}".format(proc_info[key]['lines']),
                        speed_label)
                label = "%s\\n%s" % (which, size_label)

                something_went_wrong = False
                if 'signal' in proc_info[key]:
                    something_went_wrong = True
                elif 'exit_code' in proc_info[key]:
                    if proc_info[key]['exit_code'] != 0:
                        something_went_wrong = True
                else:
                    something_went_wrong = True
                color = "#fdf3a7"
                if something_went_wrong:
                    if not pid in log['pipeline_log']['ok_to_fail']:
                        color = "#d5291a"
                    if 'signal' in proc_info[key]:
                        label = "%s\\n(received %s%s)" % (
                            label,
                            "friendly " if pid in
                            log['pipeline_log']['ok_to_fail'] else '',
                            proc_info[key]['signal_name']
                            if 'signal_name' in proc_info[key]
                            else 'signal %d' % proc_info[key]['signal'])
                    elif 'exit_code' in proc_info[key]:
                        if proc_info[key]['exit_code'] != 0:
                            label = "%s\\n(failed with exit code %d)" % (
                                label, proc_info[key]['exit_code'])
                    else:
                        label = "%s\\n(no exit code)" % label

                # add proc_which
                hash['nodes'][pid_hash(pid, which)] = {
                    'label': label,
                    'fillcolor': color
                }
                if 'sink_full_path' in proc_info[key]:
                    path = proc_info[key]['sink_full_path']
                    add_file_node(path)

    for proc_info in copy.deepcopy(log['pipeline_log']['processes']):
        pid = proc_info['pid']
        if 'use_stdin_of' in proc_info:
            other_pid = proc_info['use_stdin_of']
            hash['edges'][(pid_hash(other_pid, 'stdout'), pid_hash(pid))] \
                = dict()
        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                other_pid = proc_info[key]['pid']
                hash['edges'][(pid_hash(pid), pid_hash(pid, which))] = dict()
                if 'sink_full_path' in proc_info[key]:
                    hash['edges'][(
                        pid_hash(pid, which),
                        file_hash(proc_info[key]['sink_full_path']))] = dict()

    # define nodes which go into subgraph
    step_file_nodes = dict()
    for path, path_info in log['step']['known_paths'].items():
        if path_info['type'] == 'step_file':
            step_file_nodes[file_hash(path)] = path_info['designation']

    task_name = "%s/%s" % (log['step']['name'], log['run']['run_id'])
    cluster_hash = misc.str_to_sha1(task_name)
    hash['clusters'][cluster_hash] = dict()
    hash['clusters'][cluster_hash]['task_name'] = task_name
    hash['clusters'][cluster_hash]['group'] = list()
    for node in hash['nodes'].keys():
        if not node in step_file_nodes:
            hash['clusters'][cluster_hash]['group'].append(node)

    start_time = log['start_time']
    end_time = log['end_time']
    duration = end_time - start_time

    text = "Task: %s\\lHost: %s\\lDuration: %s\\l" % (
        task_name, socket.gethostname(),
        misc.duration_to_str(duration, long=True))
    hash['graph_labels'][task_name] = text
    if 'max' in log['pipeline_log']['process_watcher']:
        text = "CPU: %1.1f%%, %d CORES_Requested , RAM: %s (%1.1f%%)\\l" % (
            log['pipeline_log']['process_watcher']['max']['sum']
            ['cpu_percent'],
            log['step']['cores'],
            misc.bytes_to_str(log['pipeline_log']['process_watcher']['max']
                              ['sum']['rss']),
            log['pipeline_log']['process_watcher']['max']['sum']
            ['memory_percent'])
        hash['graph_labels'][task_name] += text
    if 'signal' in log:
        hash['graph_labels'][task_name] += "Caught signal: %s\\l" % (
            process_pool.ProcessPool.SIGNAL_NAMES[log['signal']])
    hash['graph_labels'][task_name] += "\\l"
    return hash
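
# NOTE: illustrative sketch, not part of uap. The structure built above is a
# plain dict of dicts: nodes and clusters are keyed by sha1 digests of stable
# identities, and edges are keyed by (source_digest, target_digest) tuples.
# Here hashlib is used directly where the original delegates to
# misc.str_to_sha1; the node identities are made up.
import hashlib


def sha1_of(s):
    return hashlib.sha1(s.encode('utf-8')).hexdigest()


graph = {'nodes': {}, 'edges': {}, 'clusters': {}, 'graph_labels': {}}
proc_node = sha1_of('my_step/run-01/4711/')   # hypothetical pid hash
file_node = sha1_of('/tmp/out.fastq')         # hypothetical file hash
graph['nodes'][proc_node] = {'label': 'cutadapt', 'fillcolor': '#fce94f'}
graph['nodes'][file_node] = {'label': 'out.fastq', 'fillcolor': '#8ae234'}
graph['edges'][(proc_node, file_node)] = dict()  # process writes the file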
def create_hash_from_annotation(log):

    def pid_hash(pid, suffix=''):
        hashtag = "%s/%s/%d/%s" % (log['step']['name'],
                                   log['run']['run_id'],
                                   pid, suffix)
        return misc.str_to_sha1(hashtag)

    def file_hash(path):
        if path in log['step']['known_paths']:
            if 'real_path' in log['step']['known_paths'][path]:
                path = log['step']['known_paths'][path]['real_path']
        return misc.str_to_sha1(path)

    pipe_hash = dict()
    pipe_hash['nodes'] = dict()
    pipe_hash['edges'] = dict()
    pipe_hash['clusters'] = dict()
    pipe_hash['graph_labels'] = dict()

    def add_file_node(path):
        if not path in log['step']['known_paths']:
            return
        if 'real_path' in log['step']['known_paths'][path]:
            path = log['step']['known_paths'][path]['real_path']
        label = os.path.basename(path)
        color = '#ffffff'
        if log['step']['known_paths'][path]['type'] in ['fifo', 'directory']:
            color = '#c4f099'
        elif log['step']['known_paths'][path]['type'] == 'file':
            color = '#8ae234'
        elif log['step']['known_paths'][path]['type'] == 'step_file':
            color = '#97b7c8'
            label = log['step']['known_paths'][path]['label']
        if path in log['step']['known_paths']:
            if 'size' in log['step']['known_paths'][path]:
                label += "\\n%s" % misc.bytes_to_str(
                    log['step']['known_paths'][path]['size'])
        pipe_hash['nodes'][misc.str_to_sha1(path)] = {
            'label': label,
            'fillcolor': color
        }

    for proc_info in log['pipeline_log']['processes']:
        pid = proc_info['pid']
        # Set name and label variable
        try:
            name = proc_info['name']
            label = "%s" % (proc_info['name'])
        except KeyError:
            name = '(unknown)'
            label = "PID %d" % pid
        try:
            # Add file nodes for every file in hints
            for path in proc_info['hints']['writes']:
                add_file_node(path)
                pipe_hash['edges'][(pid_hash(pid), file_hash(path))] = dict()
        except KeyError:
            pass
        try:
            # Add all the info for each process to pipe_hash
            stripped_args = []
            is_output_file = False
            for arg in proc_info['args']:
                # Try to investigate how fifos are integrated in the data
                # stream; this is where we have to decide whether a file is
                # used for input or for output
                io_type = None
                if name == 'cat':
                    is_output_file = False
                elif name == 'dd' and arg.startswith('of='):
                    is_output_file = True
                elif name == 'dd' and arg.startswith('if='):
                    is_output_file = False
                elif name in ['mkdir', 'mkfifo']:
                    is_output_file = True
                for known_path in log['step']['known_paths'].keys():
                    # Check if arg contains a known path ...
                    if known_path in arg:
                        # ... if so add this file to the graph
                        add_file_node(known_path)
                        # Is the process able to in-/output files?
                        if name in ['cat', 'dd', 'mkdir', 'mkfifo']:
                            if is_output_file:
                                io_type = 'output'
                            else:
                                io_type = 'input'
                        elif name == 'fix_cutadapt.py':
                            if arg == proc_info['args'][-2]:
                                io_type = 'input'
                            elif arg == proc_info['args'][-1]:
                                io_type = 'output'
                            elif proc_info['args'][
                                    proc_info['args'].index(arg) - 1] \
                                    == '--R2-in':
                                io_type = 'input'
                            elif proc_info['args'][
                                    proc_info['args'].index(arg) - 1] \
                                    == '--R2-out':
                                io_type = 'output'
                        else:
                            # we can't know whether the fifo is for input or
                            # output, first look at the hints, then use the
                            # designation (if any was given)
                            if 'reads' in proc_info['hints'] and \
                                    arg in proc_info['hints']['reads']:
                                io_type = 'input'
                            if 'writes' in proc_info['hints'] and \
                                    arg in proc_info['hints']['writes']:
                                io_type = 'output'
                            if io_type is None:
                                io_type = log['step']['known_paths'][
                                    known_path]['designation']
                            if io_type is None:
                                io_type = 'input'
                                print('io_type: %s\nknown_path: %s'
                                      % (io_type, known_path))
                        if io_type == 'input':
                            # add edge from file to proc
                            pipe_hash['edges'][(file_hash(known_path),
                                                pid_hash(pid))] = dict()
                        elif io_type == 'output':
                            # add edge from proc to file
                            pipe_hash['edges'][(pid_hash(pid),
                                                file_hash(known_path))] \
                                = dict()
                        basename = os.path.basename(known_path)
                        # if log['step']['known_paths'][known_path]['type'] \
                        #         != 'step_file':
                        arg = arg.replace(known_path, basename)
                        #     break
                        # else:
                        #     arg = basename
                        #     break
                    else:
                        # if arg[0:4] != '/dev':
                        #     arg = os.path.basename(arg)
                        if (len(arg) > 16) and re.match('^[A-Z]+$', arg):
                            arg = "%s[...]" % arg[:16]
                stripped_args.append(arg.replace('\t', '\\t').replace(
                    '\\', '\\\\'))
            tw = textwrap.TextWrapper(
                width=50,
                break_long_words=False,
                break_on_hyphens=False)
            label = "%s" % ("\\n".join(tw.wrap(' '.join(stripped_args))))
        # If any key wasn't around let's go on
        except KeyError:
            pass

        # add proc
        something_went_wrong = False
        if 'signal' in proc_info:
            something_went_wrong = True
        elif 'exit_code' in proc_info:
            if proc_info['exit_code'] != 0:
                something_went_wrong = True
        else:
            something_went_wrong = True
        color = "#fce94f"
        if something_went_wrong:
            if not pid in log['pipeline_log']['ok_to_fail']:
                color = "#d5291a"
            if 'signal' in proc_info:
                label = "%s\\n(received %s%s)" % (
                    label,
                    'friendly '
                    if pid in log['pipeline_log']['ok_to_fail'] else '',
                    proc_info['signal_name'] if 'signal_name' in proc_info
                    else 'signal %d' % proc_info['signal'])
            elif 'exit_code' in proc_info:
                if proc_info['exit_code'] != 0:
                    label = "%s\\n(failed with exit code %d)" % (
                        label, proc_info['exit_code'])
            else:
                label = "%s\\n(no exit code)" % label

        if 'max' in log['pipeline_log']['process_watcher']:
            if pid in log['pipeline_log']['process_watcher']['max']:
                label += "\\n%1.1f%% CPU, %s RAM (%1.1f%%)" % (
                    log['pipeline_log']['process_watcher']['max'][pid]
                    ['cpu_percent'],
                    misc.bytes_to_str(
                        log['pipeline_log']['process_watcher']['max'][pid]
                        ['rss']),
                    log['pipeline_log']['process_watcher']['max'][pid]
                    ['memory_percent'])

        pipe_hash['nodes'][pid_hash(pid)] = {
            'label': label,
            'fillcolor': color,
            'start_time': proc_info['start_time']
        }

        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                if ('exit_code' in proc_info[key]) and \
                        (proc_info[key]['exit_code'] == 0) and \
                        ('length' in proc_info[key]) and \
                        (proc_info[key]['length'] == 0) and \
                        (not 'sink_full_path' in proc_info[key]):
                    # skip this stdout/stderr box if it leads to nothing
                    continue
                size_label = '(empty)'
                if ('length' in proc_info[key]) and \
                        (proc_info[key]['length'] > 0):
                    speed = float(proc_info[key]['length']) / (
                        proc_info[key]['end_time'] -
                        proc_info[key]['start_time']).total_seconds()
                    speed_label = "%s/s" % misc.bytes_to_str(speed)
                    size_label = "%s / %s lines (%s)" % (
                        misc.bytes_to_str(proc_info[key]['length']),
                        "{:,}".format(proc_info[key]['lines']),
                        speed_label)
                label = "%s\\n%s" % (which, size_label)

                something_went_wrong = False
                if 'signal' in proc_info[key]:
                    something_went_wrong = True
                elif 'exit_code' in proc_info[key]:
                    if proc_info[key]['exit_code'] != 0:
                        something_went_wrong = True
                else:
                    something_went_wrong = True
                color = "#fdf3a7"
                if something_went_wrong:
                    if not pid in log['pipeline_log']['ok_to_fail']:
                        color = "#d5291a"
                    if 'signal' in proc_info[key]:
                        label = "%s\\n(received %s%s)" % (
                            label,
                            "friendly " if pid in
                            log['pipeline_log']['ok_to_fail'] else '',
                            proc_info[key]['signal_name']
                            if 'signal_name' in proc_info[key]
                            else 'signal %d' % proc_info[key]['signal'])
                    elif 'exit_code' in proc_info[key]:
                        if proc_info[key]['exit_code'] != 0:
                            label = "%s\\n(failed with exit code %d)" % (
                                label, proc_info[key]['exit_code'])
                    else:
                        label = "%s\\n(no exit code)" % label

                # add proc_which
                pipe_hash['nodes'][pid_hash(pid, which)] = {
                    'label': label,
                    'fillcolor': color
                }
                if 'sink_full_path' in proc_info[key]:
                    path = proc_info[key]['sink_full_path']
                    add_file_node(path)

    for proc_info in copy.deepcopy(log['pipeline_log']['processes']):
        pid = proc_info['pid']
        if 'use_stdin_of' in proc_info:
            other_pid = proc_info['use_stdin_of']
            pipe_hash['edges'][(pid_hash(other_pid, 'stdout'),
                                pid_hash(pid))] = dict()
        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                other_pid = proc_info[key]['pid']
                pipe_hash['edges'][(pid_hash(pid), pid_hash(pid, which))] \
                    = dict()
                if 'sink_full_path' in proc_info[key]:
                    pipe_hash['edges'][(
                        pid_hash(pid, which),
                        file_hash(proc_info[key]['sink_full_path']))] = dict()

    # define nodes which go into subgraph
    step_file_nodes = dict()
    for path, path_info in log['step']['known_paths'].items():
        if path_info['type'] == 'step_file':
            step_file_nodes[file_hash(path)] = path_info['designation']

    task_name = "%s/%s" % (log['step']['name'], log['run']['run_id'])
    cluster_hash = misc.str_to_sha1(task_name)
    pipe_hash['clusters'][cluster_hash] = dict()
    pipe_hash['clusters'][cluster_hash]['task_name'] = task_name
    pipe_hash['clusters'][cluster_hash]['group'] = list()
    for node in pipe_hash['nodes'].keys():
        if not node in step_file_nodes:
            pipe_hash['clusters'][cluster_hash]['group'].append(node)

    start_time = log['start_time']
    end_time = log['end_time']
    duration = end_time - start_time

    text = "Task: %s\\lHost: %s\\lDuration: %s\\l" % (
        task_name, socket.gethostname(),
        misc.duration_to_str(duration, long=True))
    pipe_hash['graph_labels'][task_name] = text
    if 'max' in log['pipeline_log']['process_watcher']:
        text = "CPU: %1.1f%%, %d CORES_Requested , RAM: %s (%1.1f%%)\\l" % (
            log['pipeline_log']['process_watcher']['max']['sum']
            ['cpu_percent'],
            log['step']['cores'],
            misc.bytes_to_str(log['pipeline_log']['process_watcher']['max']
                              ['sum']['rss']),
            log['pipeline_log']['process_watcher']['max']['sum']
            ['memory_percent'])
        pipe_hash['graph_labels'][task_name] += text
    if 'signal' in log:
        pipe_hash['graph_labels'][task_name] += "Caught signal: %s\\l" % (
            process_pool.ProcessPool.SIGNAL_NAMES[log['signal']])
    pipe_hash['graph_labels'][task_name] += "\\l"
    return pipe_hash
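
# NOTE: illustrative sketch, not part of uap. The io_type resolution above
# applies per-tool heuristics first, then the process hints, then the path's
# recorded designation, and finally falls back to 'input'. The dict shapes
# mirror the annotation log; the values are made up.
def resolve_io_type(tool_is_writer, hints, arg, designation):
    if tool_is_writer is not None:        # e.g. dd with an 'of=...' argument
        return 'output' if tool_is_writer else 'input'
    if arg in hints.get('reads', []):
        return 'input'
    if arg in hints.get('writes', []):
        return 'output'
    if designation is not None:
        return designation
    return 'input'


# resolve_io_type(None, {'writes': ['/tmp/fifo']}, '/tmp/fifo', None)
# -> 'output'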
def create_hash_from_annotation(log):

    def pid_hash(pid, suffix=''):
        hashtag = "%s/%s/%d/%s" % (log['step']['name'],
                                   log['run']['run_id'],
                                   pid, suffix)
        return misc.str_to_sha256(hashtag)

    def file_hash(path):
        if path in log['step']['known_paths']:
            if 'real_path' in log['step']['known_paths'][path]:
                path = log['step']['known_paths'][path]['real_path']
        return misc.str_to_sha256(path)

    pipe_hash = dict()
    pipe_hash['nodes'] = dict()
    pipe_hash['edges'] = dict()
    pipe_hash['clusters'] = dict()
    pipe_hash['graph_labels'] = dict()

    def add_file_node(path):
        if path not in log['step']['known_paths']:
            return
        if 'real_path' in log['step']['known_paths'][path]:
            path = log['step']['known_paths'][path]['real_path']
        label = os.path.basename(path)
        color = '#ffffff'
        if log['step']['known_paths'][path]['type'] in ['fifo', 'directory']:
            color = '#c4f099'
        elif log['step']['known_paths'][path]['type'] == 'file':
            color = '#8ae234'
        elif log['step']['known_paths'][path]['type'] == 'step_file':
            color = '#97b7c8'
            label = log['step']['known_paths'][path]['label']
        if path in log['step']['known_paths']:
            if 'size' in log['step']['known_paths'][path]:
                label += "\\nFilesize: %s" % misc.bytes_to_str(
                    log['step']['known_paths'][path]['size'])
        pipe_hash['nodes'][misc.str_to_sha256(path)] = {
            'label': label,
            'fillcolor': color
        }

    for proc_info in log['pipeline_log']['processes']:
        pid = proc_info['pid']
        # Set name and label variable
        try:
            name = proc_info['name']
            label = "%s" % (proc_info['name'])
        except KeyError:
            name = '(unknown)'
            label = "PID %d" % pid
        try:
            # Add file nodes for every file in hints
            for path in proc_info['hints']['writes']:
                add_file_node(path)
                pipe_hash['edges'][(pid_hash(pid), file_hash(path))] = dict()
        except KeyError:
            pass
        try:
            # Add all the info for each process to pipe_hash
            stripped_args = []
            is_output_file = False
            for arg in proc_info['args']:
                # Try to investigate how fifos are integrated in the data
                # stream; this is where we have to decide whether a file is
                # used for input or for output
                io_type = None
                if name == 'cat':
                    is_output_file = False
                elif name == 'dd' and arg.startswith('of='):
                    is_output_file = True
                elif name == 'dd' and arg.startswith('if='):
                    is_output_file = False
                elif name in ['mkdir', 'mkfifo']:
                    is_output_file = True
                for known_path in log['step']['known_paths'].keys():
                    # Check if arg contains a known path ...
                    if known_path in arg:
                        # ... if so add this file to the graph
                        add_file_node(known_path)
                        # Is the process able to in-/output files?
                        if name in ['cat', 'dd', 'mkdir', 'mkfifo']:
                            if is_output_file:
                                io_type = 'output'
                            else:
                                io_type = 'input'
                        elif name == 'fix_cutadapt.py':
                            if arg == proc_info['args'][-2]:
                                io_type = 'input'
                            elif arg == proc_info['args'][-1]:
                                io_type = 'output'
                            elif proc_info['args'][
                                    proc_info['args'].index(arg) - 1] \
                                    == '--R2-in':
                                io_type = 'input'
                            elif proc_info['args'][
                                    proc_info['args'].index(arg) - 1] \
                                    == '--R2-out':
                                io_type = 'output'
                        else:
                            # we can't know whether the fifo is for input or
                            # output, first look at the hints, then use the
                            # designation (if any was given)
                            if 'reads' in proc_info['hints'] and \
                                    arg in proc_info['hints']['reads']:
                                io_type = 'input'
                            if 'writes' in proc_info['hints'] and \
                                    arg in proc_info['hints']['writes']:
                                io_type = 'output'
                            if io_type is None:
                                io_type = log['step']['known_paths'][
                                    known_path]['designation']
                            if io_type is None:
                                io_type = 'input'
                                msg = 'io_type: %s\nknown_path: %s'
                                print(msg % (io_type, known_path))
                        if io_type == 'input':
                            # add edge from file to proc
                            key = (file_hash(known_path), pid_hash(pid))
                            pipe_hash['edges'][key] = dict()
                        elif io_type == 'output':
                            # add edge from proc to file
                            key = (pid_hash(pid), file_hash(known_path))
                            pipe_hash['edges'][key] = dict()
                        basename = os.path.basename(known_path)
                        arg = arg.replace(known_path, basename)
                    else:
                        if (len(arg) > 16) and re.match('^[A-Z]+$', arg):
                            arg = "%s[...]" % arg[:16]
                stripped_args.append(arg.replace('\t', '\\t').replace(
                    '\\', '\\\\'))
            tw = textwrap.TextWrapper(
                width=50, break_long_words=False, break_on_hyphens=False)
            label = "%s" % ("\\n".join(tw.wrap(' '.join(stripped_args))))
            start_time = proc_info['start_time']
            end_time = proc_info['end_time']
            duration = end_time - start_time
            runtime = misc.duration_to_str(duration, long=True)
            label += '\\nRuntime: %s' % runtime
        # If any key wasn't around let's go on
        except KeyError:
            pass

        # add proc
        something_went_wrong = False
        if 'signal' in proc_info:
            something_went_wrong = True
        elif 'exit_code' in proc_info:
            if proc_info['exit_code'] != 0:
                something_went_wrong = True
        else:
            # TODO: in this case False? so pass?
            something_went_wrong = True
        color = "#fce94f"
        if something_went_wrong:
            if pid not in log['pipeline_log']['ok_to_fail']:
                color = "#d5291a"
            if 'signal' in proc_info:
                label = "%s\\n(received %s%s)" % (
                    label,
                    'friendly '
                    if pid in log['pipeline_log']['ok_to_fail'] else '',
                    proc_info['signal_name'] if 'signal_name' in proc_info
                    else 'signal %d' % proc_info['signal'])
            elif 'exit_code' in proc_info:
                if proc_info['exit_code'] != 0:
                    label = "%s\\n(failed with exit code %d)" % (
                        label, proc_info['exit_code'])
            else:
                label = "%s\\n(no exit code)" % label

        if 'max' in log['pipeline_log']['process_watcher']:
            if pid in log['pipeline_log']['process_watcher']['max']:
                label += "\\n%1.1f%% CPU, %s RAM (%1.1f%%)" % (
                    log['pipeline_log']['process_watcher']['max'][pid]
                    ['cpu_percent'],
                    misc.bytes_to_str(
                        log['pipeline_log']['process_watcher']['max'][pid]
                        ['rss']),
                    log['pipeline_log']['process_watcher']['max'][pid]
                    ['memory_percent'])

        pipe_hash['nodes'][pid_hash(pid)] = {
            'label': label,
            'fillcolor': color,
            'start_time': proc_info['start_time']
        }

        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                if ('exit_code' in proc_info[key]) and \
                        (proc_info[key]['exit_code'] == 0) and \
                        ('length' in proc_info[key]) and \
                        (proc_info[key]['length'] == 0) and \
                        ('sink_full_path' not in proc_info[key]):
                    # skip this stdout/stderr box if it leads to nothing
                    continue
                size_label = '(empty)'
                if ('length' in proc_info[key]) and \
                        (proc_info[key]['length'] > 0):
                    speed = float(proc_info[key]['length']) / (
                        proc_info[key]['end_time'] -
                        proc_info[key]['start_time']).total_seconds()
                    speed_label = "%s/s" % misc.bytes_to_str(speed)
                    size_label = "%s / %s lines (%s)" % (
                        misc.bytes_to_str(proc_info[key]['length']),
                        "{:,}".format(proc_info[key]['lines']),
                        speed_label)
                label = "%s\\n%s" % (which, size_label)

                something_went_wrong = False
                if 'signal' in proc_info[key]:
                    something_went_wrong = True
                elif 'exit_code' in proc_info[key]:
                    if proc_info[key]['exit_code'] != 0:
                        something_went_wrong = True
                else:
                    something_went_wrong = True
                color = "#fdf3a7"
                if something_went_wrong:
                    if pid not in log['pipeline_log']['ok_to_fail']:
                        color = "#d5291a"
                    if 'signal' in proc_info[key]:
                        label = "%s\\n(received %s%s)" % (
                            label,
                            "friendly " if pid in
                            log['pipeline_log']['ok_to_fail'] else '',
                            proc_info[key]['signal_name']
                            if 'signal_name' in proc_info[key]
                            else 'signal %d' % proc_info[key]['signal'])
                    elif 'exit_code' in proc_info[key]:
                        if proc_info[key]['exit_code'] != 0:
                            label = "%s\\n(failed with exit code %d)" % (
                                label, proc_info[key]['exit_code'])
                    else:
                        label = "%s\\n(no exit code)" % label

                # add proc_which
                pipe_hash['nodes'][pid_hash(pid, which)] = {
                    'label': label,
                    'fillcolor': color
                }
                if 'sink_full_path' in proc_info[key]:
                    path = proc_info[key]['sink_full_path']
                    add_file_node(path)

    for proc_info in copy.deepcopy(log['pipeline_log']['processes']):
        pid = proc_info['pid']
        if 'use_stdin_of' in proc_info:
            other_pid = proc_info['use_stdin_of']
            key_value = (pid_hash(other_pid, 'stdout'), pid_hash(pid))
            pipe_hash['edges'][key_value] = dict()
        for which in ['stdout', 'stderr']:
            key = "%s_copy" % which
            if key in proc_info:
                other_pid = proc_info[key]['pid']
                key_value = (pid_hash(pid), pid_hash(pid, which))
                pipe_hash['edges'][key_value] = dict()
                if 'sink_full_path' in proc_info[key]:
                    pipe_hash['edges'][(
                        pid_hash(pid, which),
                        file_hash(proc_info[key]['sink_full_path']))] = dict()

    # define nodes which go into subgraph
    step_file_nodes = dict()
    for path, path_info in log['step']['known_paths'].items():
        if path_info['type'] == 'step_file':
            step_file_nodes[file_hash(path)] = path_info['designation']

    task_name = "%s/%s" % (log['step']['name'], log['run']['run_id'])
    cluster_hash = misc.str_to_sha256(task_name)
    pipe_hash['clusters'][cluster_hash] = dict()
    pipe_hash['clusters'][cluster_hash]['task_name'] = task_name
    pipe_hash['clusters'][cluster_hash]['group'] = list()
    for node in pipe_hash['nodes'].keys():
        if node not in step_file_nodes:
            pipe_hash['clusters'][cluster_hash]['group'].append(node)

    start_time = log['start_time']
    end_time = log['end_time']
    duration = end_time - start_time

    text = "Task: %s\\lHost: %s\\lDuration: %s\\l" % (
        task_name, socket.gethostname(),
        misc.duration_to_str(duration, long=True))
    pipe_hash['graph_labels'][task_name] = text
    if 'max' in log['pipeline_log']['process_watcher']:
        text = "CPU: %1.1f%%, %d CORES_Requested , RAM: %s (%1.1f%%)\\l" % (
            log['pipeline_log']['process_watcher']['max']['sum']
            ['cpu_percent'],
            log['step']['cores'],
            misc.bytes_to_str(
                log['pipeline_log']['process_watcher']['max']['sum']['rss']),
            log['pipeline_log']['process_watcher']['max']['sum']
            ['memory_percent'])
        pipe_hash['graph_labels'][task_name] += text
    if 'signal' in log:
        pipe_hash['graph_labels'][task_name] += "Caught signal: %s\\l" % (
            process_pool.ProcessPool.SIGNAL_NAMES[log['signal']])
    pipe_hash['graph_labels'][task_name] += "\\l"
    return pipe_hash
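
# NOTE: illustrative sketch, not part of uap. The main functional change in
# this version is the switch from sha1 to sha256 node keys; the helper
# misc.str_to_sha256 is not shown in this excerpt, but presumably amounts to
# something like this:
import hashlib


def str_to_sha256(s):
    return hashlib.sha256(s.encode('utf-8')).hexdigest()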
def run(self, run_id):
    '''
    Create a temporary output directory and execute a run. After the run
    has finished, it is checked that all output files are in place and
    the output files are moved to the final output location. Finally,
    YAML annotations are written.
    '''

    # this is the run we'll execute now
    run = self.get_run(run_id)

    # create the output directory if it doesn't exist yet
    if not os.path.isdir(run.get_output_directory()):
        os.makedirs(run.get_output_directory())

    # now write the run ping file
    executing_ping_path = run.get_executing_ping_file()
    if os.path.exists(executing_ping_path):
        raise UAPError("%s/%s seems to be already running, exiting..."
                       % (self, run_id))

    queued_ping_path = run.get_queued_ping_file()
    try:
        with open(queued_ping_path, 'r') as buff:
            info = yaml.load(buff, Loader=yaml.FullLoader)
        job_id = info['cluster job id']
    except (IOError, KeyError):
        job_id = None

    # create a temporary directory for the output files
    temp_directory = run.get_temp_output_directory()
    os.makedirs(temp_directory)

    # prepare known_paths
    known_paths = dict()
    for tag, tag_info in run.get_output_files_abspath().items():
        for output_path, input_paths in tag_info.items():
            # add the real output path
            if output_path is not None and input_paths is not None:
                known_paths[output_path] = {
                    'designation': 'output',
                    'label': os.path.basename(output_path),
                    'type': 'step_file'
                }
                # ...and also add the temporary output path
                known_paths[os.path.join(
                    temp_directory, os.path.basename(output_path))] = {
                        'designation': 'output',
                        'label': "%s\\n(%s)"
                                 % (os.path.basename(output_path), tag),
                        'type': 'step_file',
                        'real_path': output_path
                }
                for input_path in input_paths:
                    if input_path is not None:
                        known_paths[input_path] = {
                            'designation': 'input',
                            'label': os.path.basename(input_path),
                            'type': 'step_file'
                        }

    # now write the run ping file
    executing_ping_info = dict()
    executing_ping_info['start_time'] = datetime.now()
    executing_ping_info['host'] = socket.gethostname()
    executing_ping_info['pid'] = os.getpid()
    executing_ping_info['user'] = pwd.getpwuid(os.getuid())[0]
    executing_ping_info['temp_directory'] = run.get_temp_output_directory()
    if job_id:
        executing_ping_info['cluster job id'] = job_id

    with open(executing_ping_path, 'w') as f:
        f.write(yaml.dump(executing_ping_info, default_flow_style=False))

    executing_ping_pid = os.fork()
    if executing_ping_pid == 0:
        # this is the child process
        try:
            signal.signal(signal.SIGTERM, signal.SIG_DFL)
            signal.signal(signal.SIGINT, signal.SIG_IGN)
            while True:
                time.sleep(AbstractStep.PING_RENEW)
                # if the executing ping file is gone and the touching
                # operation fails, then SO BE IT!
                os.utime(executing_ping_path, None)
        finally:
            os._exit(0)

    def kill_exec_ping():
        try:
            os.kill(executing_ping_pid, signal.SIGTERM)
            os.waitpid(executing_ping_pid, 0)
        except OSError:
            # if the ping process was already killed, it's gone anyway
            pass
        self.remove_ping_file(executing_ping_path)

    p = self.get_pipeline()

    def ping_on_term(signum, frame):
        logger.warning('Received SIGTERM and moving execution ping file...')
        kill_exec_ping()
        self.remove_ping_file(queued_ping_path, bad_copy=True)
        p.caught_signal = signum
        process_pool.ProcessPool.kill()
        raise UAPError('Received TERM signal (canceled job).')

    def ping_on_int(signum, frame):
        logger.warning('Received SIGINT and moving execution ping file...')
        kill_exec_ping()
        self.remove_ping_file(queued_ping_path, bad_copy=True)
        p.caught_signal = signum
        process_pool.ProcessPool.kill()
        raise UAPError('Received INT signal (keyboard interrupt).')

    original_term_handler = signal.signal(signal.SIGTERM, ping_on_term)
    original_int_handler = signal.signal(signal.SIGINT, ping_on_int)

    self.start_time = datetime.now()
    message = "[START] starting %s/%s on %s" \
              % (self, run_id, socket.gethostname())
    if job_id:
        message += " with job id %s" % job_id
    p.notify(message)
    caught_exception = None
    self._state = AbstractStep.states.EXECUTING
    base_working_dir = os.getcwd()
    os.chdir(run.get_temp_output_directory())
    try:
        self.execute(run_id, run)
    except BaseException:
        # Oh my. We have a situation. This is awkward. Tell the process
        # pool to wrap up. This way, we can try to get process stats before
        # shutting everything down.
        process_pool.ProcessPool.kill()
        # Store the exception, re-raise it later
        caught_exception = sys.exc_info()
        error = ''.join(
            traceback.format_exception(*caught_exception)[-2:]).strip()
        logger.debug(error)
    finally:
        signal.signal(signal.SIGTERM, original_term_handler)
        signal.signal(signal.SIGINT, original_int_handler)
        self._state = AbstractStep.states.DEFAULT  # changes relative paths
        os.chdir(base_working_dir)

    self.end_time = datetime.now()
    # step has completed, invalidate the FS cache because things have
    # changed by now...
    run.reset_fsc()

    to_be_moved = dict()
    if not p.caught_signal and not caught_exception:
        # if we're here, we can assume the step has finished successfully
        # now log file stats
        try:
            for tag in run.get_output_files().keys():
                for out_file in run.get_output_files()[tag].keys():
                    # don't try to rename files if they were not meant to
                    # exist in our temporary directory
                    # 1. out_file should not be None (empty output
                    #    connection)
                    # 2. out_file should not contain a '/' (file belongs
                    #    to a source step)
                    if out_file is None or '/' in out_file:
                        continue
                    source_path = os.path.join(
                        run.get_temp_output_directory(),
                        os.path.basename(out_file))
                    new_path = os.path.join(
                        run.get_output_directory(),
                        os.path.basename(out_file))
                    # first, delete a possibly existing volatile placeholder
                    # file
                    path_volatile = new_path + AbstractStep.VOLATILE_SUFFIX
                    if os.path.exists(path_volatile):
                        logger.info("Now deleting: %s" % path_volatile)
                        os.unlink(path_volatile)
                    if os.path.exists(source_path):
                        known_paths.pop(source_path, None)
                        known_paths.setdefault(new_path, dict())
                        if known_paths[new_path]['designation'] == 'output':
                            to_be_moved[source_path] = new_path
                            size = run.fsc.getsize(source_path)
                            mtime = datetime.fromtimestamp(
                                run.fsc.getmtime(source_path))
                            known_paths[new_path]['size'] = size
                            known_paths[new_path]['modification time'] = \
                                mtime
                        if known_paths[new_path]['type'] != 'step_file':
                            logger.debug(
                                "Set %s 'type' info to 'step_file'"
                                % new_path)
                            known_paths[new_path]['type'] = 'step_file'
                    else:
                        raise UAPError(
                            'The step failed to produce an '
                            'announced output file: "%s".\n'
                            'Source file doesn\'t exist: "%s"'
                            % (out_file, source_path))
        except BaseException:
            caught_exception = sys.exc_info()

    pool = None

    class SignalError(Exception):
        def __init__(self, signum):
            self.signum = signum
            m = "Received signal %s during hashing!" \
                % process_pool.ProcessPool.SIGNAL_NAMES[signum]
            super(SignalError, self).__init__(m)

    if caught_exception is None and to_be_moved:
        p.notify("[INFO] %s/%s hashing %d output file(s)."
                 % (str(self), run_id, len(to_be_moved)))
        if p.has_interactive_shell() \
                and logger.getEffectiveLevel() > 20:
            show_progress = True
        else:
            show_progress = False
        try:
            def stop(signum, frame):
                raise SignalError(signum)
            original_term_handler = signal.signal(signal.SIGTERM, stop)
            original_int_handler = signal.signal(signal.SIGINT, stop)
            pool = multiprocessing.Pool(self.get_cores())
            total = len(to_be_moved)
            file_iter = pool.imap(misc.sha_and_file, to_be_moved.keys())
            file_iter = tqdm(
                file_iter,
                total=total,
                leave=False,
                bar_format='{desc}:{percentage:3.0f}%|{bar:10}{r_bar}',
                disable=not show_progress,
                desc='files')
            for i, (hashsum, path) in enumerate(file_iter):
                run.fsc.sha256sum_of(to_be_moved[path], value=hashsum)
                known_paths[to_be_moved[path]]['sha256'] = hashsum
                if not show_progress:
                    logger.info("sha256 [%d/%d] %s %s"
                                % (i + 1, total, hashsum, path))
        except BaseException:
            caught_exception = sys.exc_info()
            try:
                # removing the progress bar
                file_iter.close()
            except BaseException:
                pass
            error = caught_exception[1]
            if caught_exception[0] is SignalError:
                p.caught_signal = error.signum
            logger.error(error)
            if pool:
                pool.terminate()
        else:
            pool.close()
        signal.signal(signal.SIGTERM, original_term_handler)
        signal.signal(signal.SIGINT, original_int_handler)

    run.add_known_paths(known_paths)
    if not p.caught_signal and not caught_exception:
        try:
            for source_path, new_path in to_be_moved.items():
                logger.debug("Moving %s to %s." % (source_path, new_path))
                os.rename(source_path, new_path)
        except BaseException:
            caught_exception = sys.exc_info()

    error = None
    if p.caught_signal is not None:
        signum = p.caught_signal
        signame = process_pool.ProcessPool.SIGNAL_NAMES[signum]
        error = 'Pipeline stopped because it caught signal %d - %s' \
                % (signum, signame)
    elif caught_exception is not None:
        error = ''.join(
            traceback.format_exception(*caught_exception)[-2:]).strip()
    annotation_path = run.write_annotation_file(
        run.get_output_directory(), error=error, job_id=job_id)

    kill_exec_ping()
    self._state = AbstractStep.states.DEFAULT

    if error:
        message = "[BAD] %s/%s failed on %s after %s\n" \
                  % (str(self), run_id, socket.gethostname(),
                     misc.duration_to_str(self.end_time - self.start_time))
        message += "Here are the details: " + annotation_path + '\n'
        attachment = None
        if os.path.exists(annotation_path + '.png'):
            attachment = dict()
            attachment['name'] = 'details.png'
            attachment['data'] = open(annotation_path + '.png').read()
        p.notify(message, attachment)
        self.remove_ping_file(queued_ping_path, bad_copy=True)
        if caught_exception is not None:
            raise caught_exception[1].with_traceback(caught_exception[2])
    else:
        # finally, remove the temporary directory if it's empty
        try:
            os.rmdir(temp_directory)
        except OSError as e:
            logger.info('Could not remove temp dir "%s": %s'
                        % (temp_directory, e))
        temp = os.path.normpath(os.path.join(temp_directory, '..'))
        try:
            os.rmdir(temp)
        except OSError:
            # there may still be tasks in process
            pass
        remaining_task_info = self.get_run_info_str()
        message = "[OK] %s/%s successfully finished on %s after %s\n" \
                  % (str(self), run_id, socket.gethostname(),
                     misc.duration_to_str(self.end_time - self.start_time))
        message += str(self) + ': ' + remaining_task_info + "\n"
        attachment = None
        if os.path.exists(annotation_path + '.png'):
            attachment = dict()
            attachment['name'] = 'details.png'
            attachment['data'] = open(annotation_path + '.png').read()
        p.notify(message, attachment)
        self.remove_ping_file(queued_ping_path)
        self._reset()
    if pool is not None:
        pool.join()
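
# NOTE: illustrative sketch, not part of uap. run() above forks a heartbeat
# child that touches the executing ping file every PING_RENEW seconds, so
# that watchers can treat a stale mtime as a crashed run; the parent kills
# and reaps the child when the run ends. The PING_RENEW value here is a
# stand-in for AbstractStep.PING_RENEW.
import os
import signal
import time

PING_RENEW = 30


def start_heartbeat(ping_path):
    pid = os.fork()
    if pid == 0:
        # child: touch the ping file until killed; any failure (e.g. the
        # file was removed) simply ends the child
        try:
            while True:
                time.sleep(PING_RENEW)
                os.utime(ping_path, None)
        finally:
            os._exit(0)
    return pid


def stop_heartbeat(pid):
    try:
        os.kill(pid, signal.SIGTERM)
        os.waitpid(pid, 0)  # reap the child to avoid a zombie
    except OSError:
        pass  # already gone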
def check_ping_files(self, print_more_warnings=False, print_details=False,
                     fix_problems=False):
    run_problems = list()
    queue_problems = list()
    check_queue = True

    try:
        stat_output = subprocess.check_output(
            [self.get_cluster_command('stat')],
            stderr=subprocess.STDOUT)
    except (KeyError, OSError, subprocess.CalledProcessError):
        # we don't have a stat tool here, if subprocess.CalledProcessError
        # is raised
        check_queue = False

    if print_more_warnings and not check_queue:
        try:
            ce = self.get_cluster_command('stat')
        except KeyError:
            ce = "a cluster engine"
        print("Attention, we cannot check stale queued ping files because "
              "this host does not have %s." % ce)

    running_jids = set()

    if check_queue:
        for line in stat_output.split("\n"):
            try:
                jid = int(line.strip().split(' ')[0])
                running_jids.add(str(jid))
            except ValueError:
                # this is not a JID
                pass

    now = datetime.datetime.now()
    for task in self.all_tasks_topologically_sorted:
        exec_ping_file = task.get_run().get_executing_ping_file()
        queued_ping_file = task.get_run().get_queued_ping_file()
        if os.path.exists(exec_ping_file):
            info = yaml.load(open(exec_ping_file, 'r'))
            start_time = info['start_time']
            last_activity = datetime.datetime.fromtimestamp(
                abstract_step.AbstractStep.fsc.getmtime(exec_ping_file))
            last_activity_difference = now - last_activity
            if last_activity_difference.total_seconds() > \
                    abstract_step.AbstractStep.PING_TIMEOUT:
                run_problems.append(
                    (task, exec_ping_file, last_activity_difference,
                     last_activity - start_time))
        if os.path.exists(queued_ping_file) and check_queue:
            info = yaml.load(open(queued_ping_file, 'r'))
            if not str(info['job_id']) in running_jids:
                queue_problems.append(
                    (task, queued_ping_file, info['submit_time']))

    show_hint = False

    if len(run_problems) > 0:
        show_hint = True
        label = "Warning: There are %d stale run ping files." % len(
            run_problems)
        print(label)
        if print_details:
            print('-' * len(label))
            run_problems = sorted(run_problems, key=itemgetter(2, 3),
                                  reverse=True)
            for problem in run_problems:
                task = problem[0]
                path = problem[1]
                last_activity_difference = problem[2]
                ran_for = problem[3]
                print("dead since %13s, ran for %13s: %s"
                      % (misc.duration_to_str(last_activity_difference),
                         misc.duration_to_str(ran_for), task))
            print("")

    if len(queue_problems) > 0:
        show_hint = True
        label = "Warning: There are %d tasks marked as queued, but they " \
                "do not seem to be queued." % len(queue_problems)
        print(label)
        if print_details:
            print('-' * len(label))
            queue_problems = sorted(queue_problems, key=itemgetter(2),
                                    reverse=True)
            for problem in queue_problems:
                task = problem[0]
                path = problem[1]
                start_time = problem[2]
                print("submitted at %13s: %s" % (start_time, task))
            print("")

    if fix_problems:
        all_problems = run_problems
        all_problems.extend(queue_problems)
        for problem in all_problems:
            path = problem[1]
            print("Now deleting %s..." % path)
            os.unlink(path)

    if show_hint:
        if print_more_warnings and not print_details or not fix_problems:
            print("Hint: Run 'uap %s fix-problems --details' to see the "
                  "details." % self.get_config_filepath())
        if not fix_problems:
            print("Hint: Run 'uap %s fix-problems --srsly' to fix these "
                  "problems (that is, delete all problematic ping files)."
                  % self.get_config_filepath())
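
# NOTE: illustrative sketch, not part of uap. The JID harvesting above keeps
# the first whitespace-separated token of each qstat/squeue line only if it
# parses as an integer, which naturally skips header and separator lines.
# The sample scheduler output below is made up.
def running_job_ids(stat_output):
    jids = set()
    for line in stat_output.split("\n"):
        try:
            jids.add(str(int(line.strip().split(' ')[0])))
        except ValueError:
            pass  # header or separator line, not a JID
    return jids


# running_job_ids("JOBID PARTITION\n4711 main\n4712 main\n")
# -> {'4711', '4712'}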
def check_ping_files(self, print_more_warnings=False, print_details=False,
                     fix_problems=False):
    run_problems = list()
    queue_problems = list()
    bad_problems = list()
    check_queue = True

    try:
        stat_output = subprocess.check_output(
            [self.get_cluster_command('stat')],
            stderr=subprocess.STDOUT).decode('utf-8')
    except (KeyError, OSError, subprocess.CalledProcessError):
        # we don't have a stat tool here, if subprocess.CalledProcessError
        # is raised
        check_queue = False

    if print_more_warnings and not check_queue:
        try:
            ce = self.get_cluster_command('stat')
        except KeyError:
            ce = "a cluster engine"
        print("Attention, we cannot check stale queued ping files because "
              "this host does not have %s." % ce)

    running_jids = set()

    if check_queue:
        for line in stat_output.split("\n"):
            if 'COMPLETING' in line:
                # this is slurm specific, in case a completing job is stuck
                continue
            try:
                jid = int(line.strip().split(' ')[0].split('_')[0])
                running_jids.add(str(jid))
            except ValueError:
                # this is not a JID
                pass

    for task in self.all_tasks_topologically_sorted:
        queued_ping_file = task.get_run().get_queued_ping_file()
        bad_queued_ping_file = queued_ping_file + '.bad'
        exec_ping_file = task.get_run().get_executing_ping_file()
        stale = task.get_run().is_stale()
        if stale:
            try:
                info = yaml.load(open(exec_ping_file, 'r'),
                                 Loader=yaml.FullLoader)
            except IOError as e:
                if os.path.exists(exec_ping_file):
                    raise e
            else:
                start_time = info['start_time']
                last_activity = datetime.datetime.fromtimestamp(
                    task.get_run().fsc.getmtime(exec_ping_file))
                run_problems.append((task, exec_ping_file, stale,
                                     last_activity - start_time))
        if check_queue:
            try:
                info = yaml.load(open(queued_ping_file, 'r'),
                                 Loader=yaml.FullLoader)
            except IOError as e:
                if os.path.exists(queued_ping_file):
                    raise e
            else:
                if not str(info['cluster job id']) in running_jids:
                    queue_problems.append(
                        (task, queued_ping_file, info['submit_time'],
                         info['cluster job id']))
        try:
            info = yaml.load(open(bad_queued_ping_file, 'r'),
                             Loader=yaml.FullLoader)
        except IOError as e:
            if os.path.exists(bad_queued_ping_file):
                raise e
        else:
            bad_problems.append(
                (task, bad_queued_ping_file, info['submit_time'],
                 info['cluster job id']))

    show_hint = False

    if len(run_problems) > 0:
        show_hint = True
        label = "Warning: There are %d stale run ping files." % len(
            run_problems)
        print(label)
        if print_details:
            print('-' * len(label))
            run_problems = sorted(run_problems, key=itemgetter(2, 3),
                                  reverse=True)
            for problem in run_problems:
                task = problem[0]
                path = problem[1]
                last_activity_difference = problem[2]
                ran_for = problem[3]
                print("dead since %13s, ran for %13s: %s"
                      % (misc.duration_to_str(last_activity_difference),
                         misc.duration_to_str(ran_for), task))
            print("")

    if len(queue_problems) > 0:
        show_hint = True
        label = "Warning: There are %d tasks marked as queued, but they " \
                "do not seem to be queued." % len(queue_problems)
        print(label)
        if print_details:
            print('-' * len(label))
            queue_problems = sorted(queue_problems, key=itemgetter(2),
                                    reverse=True)
            for problem in queue_problems:
                task = problem[0]
                path = problem[1]
                start_time = problem[2]
                job_id = problem[3]
                print("submitted job %s at %13s: %s"
                      % (job_id, start_time, task))
            print("")

    if len(bad_problems) > 0:
        label = "Info: Found %d queue files of failed tasks." % len(
            bad_problems)
        print(label)
        if print_details:
            print('-' * len(label))
            bad_problems = sorted(bad_problems, key=itemgetter(2),
                                  reverse=True)
            for problem in bad_problems:
                task = problem[0]
                path = problem[1]
                start_time = problem[2]
                job_id = problem[3]
                print("submitted job %s at %13s: %s"
                      % (job_id, start_time, task))
            print("")

    if fix_problems:
        all_problems = run_problems
        all_problems.extend(queue_problems)
        all_problems.extend(bad_problems)
        for problem in all_problems:
            path = problem[1]
            print("Now deleting %s..." % path)
            os.unlink(path)

    if show_hint:
        if print_more_warnings and not print_details or not fix_problems:
            print("Hint: Run 'uap %s fix-problems --details' to see the "
                  "details." % self.args.config.name)
        if print_more_warnings and not fix_problems:
            print("Hint: Run 'uap %s fix-problems --first-error' to "
                  "investigate what happened." % self.args.config.name)
        if not fix_problems:
            print("Hint: Run 'uap %s fix-problems --srsly' to fix these "
                  "problems (that is, delete all problematic ping files)."
                  % self.args.config.name)
    else:
        print('No problematic ping files were found.')
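
# NOTE: illustrative sketch, not part of uap. This version adds slurm-aware
# JID parsing: array tasks show up in squeue as '4711_3', so the token is
# split on '_' to recover the base job id (COMPLETING lines are skipped
# earlier in the loop above).
def base_job_id(token):
    return str(int(token.strip().split(' ')[0].split('_')[0]))


# base_job_id('4711_3') -> '4711'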