def synthesis(prepare_res, analysis_res):
    separator, _, _, filename, _, labels, dw, bad_dw, skipped_dw, fds, success_fd, _, = prepare_res
    # Analysis may have gotten a perfectly legitimate EOF if something
    # went wrong in the reader process, so we need to check that all
    # went well.
    try:
        reader_res = os.read(success_fd, 1)
    except OSError:
        reader_res = None
    if reader_res != b"\0":
        raise Exception("Reader process failed")
    good_counts = []
    bad_counts = []
    skipped_counts = []
    for sliceno, (good_count, bad_count, skipped_count) in enumerate(analysis_res):
        dw.set_lines(sliceno, good_count)
        if bad_dw:
            bad_dw.set_lines(sliceno, bad_count)
        if skipped_dw:
            skipped_dw.set_lines(sliceno, skipped_count)
        good_counts.append(good_count)
        bad_counts.append(bad_count)
        skipped_counts.append(skipped_count)
    res = DotDict(
        num_lines=sum(good_counts),
        lines_per_slice=good_counts,
        num_broken_lines=sum(bad_counts),
        broken_lines_per_slice=bad_counts,
        num_skipped_lines=sum(skipped_counts),
        skipped_lines_per_slice=skipped_counts,
    )
    blob.save(res, 'import')
    write_report(res, labels)
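# --- Illustrative sketch (not part of this module) ----------------------------
# A rough sketch of how a downstream build script might read the stats saved
# above.  Assumes `imp` is a finished job of this method and that the standard
# Job.load() API is available; the 'import' pickle is the DotDict built above.
# example_report itself is hypothetical.
def example_report(imp):
    stats = imp.load('import')  # the DotDict saved with blob.save(res, 'import')
    print('imported %d lines (%d broken, %d skipped)' % (
        stats.num_lines, stats.num_broken_lines, stats.num_skipped_lines,
    ))
    # per-slice counts line up with the (good, bad, skipped) tuples from analysis
    for sliceno, n in enumerate(stats.lines_per_slice):
        print('  slice %d: %d lines' % (sliceno, n))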
def analysis(sliceno):
    data = {sliceno}
    if options.inner:
        if options.file.sliced:
            value = options.file.load(sliceno)
            assert value == data
        else:
            try:
                options.file.load(sliceno)
                raise Exception("Allowed sliced load of unsliced file")
            except AssertionError:
                pass
        blob.save({'inner': sliceno}, 'inner.pickle', sliceno, temp=False)
        json_save({'inner': sliceno}, 'inner.json', sliceno)
    else:
        blob.save(data, 'data', sliceno, temp=False)
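# --- Illustrative sketch (not part of the test) --------------------------------
# The per-slice saves above pair up with per-slice loads: saving with a sliceno
# writes one pickle per slice, and loading with the same sliceno reads it back.
# A minimal round-trip sketch; it assumes blob.load mirrors the save signature
# used in this file, and example_roundtrip is hypothetical.
def example_roundtrip(sliceno):
    blob.save({'n': sliceno}, 'example.pickle', sliceno, temp=False)
    assert blob.load('example.pickle', sliceno=sliceno) == {'n': sliceno}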
def synthesis(params, job):
    data = {'foo'}
    if options.inner:
        if options.file.sliced:
            try:
                options.file.load()
                raise Exception("Allowed unsliced load of sliced file")
            except AssertionError:
                pass
        else:
            value = options.file.load()
            assert value == data
        blob.save({'inner': None}, 'inner.pickle')
        json_save({'inner': None}, 'inner.json')
    else:
        blob.save(data, 'data')
        # use different ways to construct the jwf so both get tested.
        verify(params, JobWithFile(params.jobid, 'data'))
        verify(params, job.withfile('data', True))
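# --- Illustrative note (not part of the test) -----------------------------------
# The two constructions above are not interchangeable: JobWithFile(jobid, name)
# defaults to an unsliced file and points at the single pickle saved in
# synthesis, while job.withfile('data', True) marks the file sliced and points
# at the per-slice pickles saved in analysis.  A rough sketch, assuming the
# JobWithFile API as used in this file (example_jwf is hypothetical):
def example_jwf(params, job):
    unsliced = JobWithFile(params.jobid, 'data')  # sliced defaults to False
    sliced = job.withfile('data', True)           # sliced=True
    # An unsliced jwf is read with .load(); a sliced one with .load(sliceno).
    # Mixing them up is exactly what the asserts above guard against.
    return unsliced, sliced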
def call_analysis(analysis_func, sliceno_, delayed_start, q, preserve_result, parent_pid, output_fds, **kw):
    try:
        q.make_writer()
        # tell iowrapper our PID, so our output goes to the right status stack.
        # (the pty is not quite a transparent transport ('\n' transforms into
        # '\r\n'), so we use a fairly human readable encoding.)
        writeall(output_fds[sliceno_], b'%16x' % (os.getpid(),))
        # use our iowrapper fd instead of stdout/stderr
        os.dup2(output_fds[sliceno_], 1)
        os.dup2(output_fds[sliceno_], 2)
        for fd in output_fds:
            os.close(fd)
        os.close(_prof_fd)
        slicename = 'analysis(%d)' % (sliceno_,)
        setproctitle(slicename)
        if delayed_start:
            os.close(delayed_start[1])
            update = statmsg._start('waiting for concurrency limit (%d)' % (sliceno_,), parent_pid, True)
            if os.read(delayed_start[0], 1) != b'a':
                raise AcceleratorError('bad delayed_start, giving up')
            update(slicename)
            os.close(delayed_start[0])
        else:
            statmsg._start(slicename, parent_pid, True)
        kw['sliceno'] = g.sliceno = sliceno_
        for dw in dataset._datasetwriters.values():
            if dw._for_single_slice is None:
                dw._set_slice(sliceno_)
        res = analysis_func(**kw)
        if preserve_result:
            # Remove defaultdicts until we find one with a picklable default_factory.
            # (This is what you end up doing manually anyway.)
            def picklable(v):
                try:
                    pickle.dumps(v, pickle.HIGHEST_PROTOCOL)
                    return True
                except Exception:
                    return False
            def fixup(d):
                if isinstance(d, defaultdict) and not picklable(d.default_factory):
                    if not d:
                        return {}
                    v = next(iteritems(d))
                    if isinstance(v, defaultdict) and not picklable(v.default_factory):
                        return {k: fixup(v) for k, v in iteritems(d)}
                    else:
                        return dict(d)
                else:
                    return d
            def save(item, name):
                blob.save(fixup(item), name, sliceno=sliceno_, temp=True)
            if isinstance(res, tuple):
                if sliceno_ == 0:
                    blob.save(len(res), "Analysis.tuple", temp=True)
                for ix, item in enumerate(res):
                    save(item, "Analysis.%d." % (ix,))
            else:
                if sliceno_ == 0:
                    blob.save(False, "Analysis.tuple", temp=True)
                save(res, "Analysis.")
        from accelerator.extras import saved_files
        dw_lens = {}
        dw_minmax = {}
        dw_compressions = {}
        for name, dw in dataset._datasetwriters.items():
            if dw._for_single_slice or sliceno_ == 0:
                dw_compressions[name] = dw._compressions
            if dw._for_single_slice in (None, sliceno_,):
                dw.close()
                dw_lens[name] = dw._lens
                dw_minmax[name] = dw._minmax
        c_fflush()
        q.put((sliceno_, monotonic(), saved_files, dw_lens, dw_minmax, dw_compressions, None,))
        q.close()
    except:
        c_fflush()
        msg = fmt_tb(1)
        print(msg)
        q.put((sliceno_, monotonic(), {}, {}, {}, {}, msg,))
        q.close()
        sleep(5)  # give launcher time to report error (and kill us)
        exitfunction()
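# --- Illustrative sketch (separate from the function above) ---------------------
# delayed_start above is just a pipe used as a concurrency gate: a gated slice
# blocks on a one-byte read, and the launcher writes b'a' whenever a slot frees
# up.  A minimal standalone sketch of that pattern (simplified, single process,
# assumes concurrency >= 1; not the project's fork_analysis):
import os

def example_gate(concurrency, total):
    r, w = os.pipe()
    for _ in range(concurrency):      # release the first `concurrency` slots
        os.write(w, b'a')
    for _ in range(total):
        assert os.read(r, 1) == b'a'  # a gated worker would block here
        # ... run one slice, then release the next waiting one:
        os.write(w, b'a')
    os.close(r)
    os.close(w)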
def execute_process(workdir, jobid, slices, concurrency, result_directory, common_directory, input_directory, index=None, workdirs=None, server_url=None, subjob_cookie=None, parent_pid=0):
    WORKDIRS.update(workdirs)
    g.job = jobid
    setproctitle('launch')
    path = os.path.join(workdir, jobid)
    try:
        os.chdir(path)
    except Exception:
        print("Cannot cd to workdir", path)
        exit(1)
    g.params = params = job_params()
    method_ref = import_module(params.package + '.a_' + params.method)
    g.sliceno = -1
    g.job = CurrentJob(jobid, params, result_directory, input_directory)
    g.slices = slices
    g.options = params.options
    g.datasets = params.datasets
    g.jobs = params.jobs
    method_ref.options = params.options
    method_ref.datasets = params.datasets
    method_ref.jobs = params.jobs
    g.server_url = server_url
    g.running = 'launch'
    statmsg._start('%s %s' % (jobid, params.method), parent_pid)
    def dummy():
        pass
    prepare_func = getattr(method_ref, 'prepare', dummy)
    analysis_func = getattr(method_ref, 'analysis', dummy)
    synthesis_func = getattr(method_ref, 'synthesis', dummy)
    synthesis_needs_analysis = 'analysis_res' in getarglist(synthesis_func)
    fd2pid, names, masters, slaves = iowrapper.setup(slices, prepare_func is not dummy, analysis_func is not dummy)
    def switch_output():
        fd = slaves.pop()
        os.dup2(fd, 1)
        os.dup2(fd, 2)
        os.close(fd)
    if analysis_func is dummy:
        q = None
    else:
        q = LockFreeQueue()
    iowrapper.run_reader(fd2pid, names, masters, slaves, q=q)
    for fd in masters:
        os.close(fd)
    # A chain must be finished from the back, so sort on that.
    sortnum_cache = {}
    def dw_sortnum(name):
        if name not in sortnum_cache:
            dw = dataset._datasetwriters.get(name)
            if not dw:
                # manually .finish()ed
                num = -1
            elif dw.previous and dw.previous.startswith(jobid + '/'):
                pname = dw.previous.split('/')[1]
                num = dw_sortnum(pname) + 1
            else:
                num = 0
            sortnum_cache[name] = num
        return sortnum_cache[name]
    prof = {}
    if prepare_func is dummy:
        prof['prepare'] = 0  # truthish!
    else:
        t = monotonic()
        switch_output()
        g.running = 'prepare'
        g.subjob_cookie = subjob_cookie
        setproctitle(g.running)
        with statmsg.status(g.running):
            g.prepare_res = method_ref.prepare(**args_for(method_ref.prepare))
            to_finish = [dw.name for dw in dataset._datasetwriters.values() if dw._started]
            if to_finish:
                with statmsg.status("Finishing datasets"):
                    for name in sorted(to_finish, key=dw_sortnum):
                        dataset._datasetwriters[name].finish()
        c_fflush()
        prof['prepare'] = monotonic() - t
    switch_output()
    setproctitle('launch')
    from accelerator.extras import saved_files
    if analysis_func is dummy:
        prof['per_slice'] = []
        prof['analysis'] = 0
    else:
        t = monotonic()
        g.running = 'analysis'
        g.subjob_cookie = None  # subjobs are not allowed from analysis
        with statmsg.status('Waiting for all slices to finish analysis') as update:
            g.update_top_status = update
            prof['per_slice'], files, g.analysis_res = fork_analysis(slices, concurrency, analysis_func, args_for(analysis_func), synthesis_needs_analysis, slaves, q)
            del g.update_top_status
        prof['analysis'] = monotonic() - t
        saved_files.update(files)
    t = monotonic()
    g.running = 'synthesis'
    g.subjob_cookie = subjob_cookie
    setproctitle(g.running)
    with statmsg.status(g.running):
        synthesis_res = synthesis_func(**args_for(synthesis_func))
        if synthesis_res is not None:
            blob.save(synthesis_res, temp=False)
        if dataset._datasetwriters:
            with statmsg.status("Finishing datasets"):
                for name in sorted(dataset._datasetwriters, key=dw_sortnum):
                    dataset._datasetwriters[name].finish()
    if dataset._datasets_written:
        blob.save(dataset._datasets_written, 'DS/LIST', temp=False, _hidden=True)
    c_fflush()
    t = monotonic() - t
    prof['synthesis'] = t
    from accelerator.subjobs import _record
    return None, (prof, saved_files, _record)
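# --- Illustrative note (not part of launch) -------------------------------------
# Shape of the profiling data returned above, as built in execute_process()
# (values in seconds; per_slice holds one entry per analysis slice):
#
#   prof = {
#       'prepare':   1.3,
#       'per_slice': [0.7, 0.8, ...],
#       'analysis':  0.9,
#       'synthesis': 2.1,
#   }
#
# The function returns (None, (prof, saved_files, _record)), presumably unpacked
# by whatever launched the job for reporting and bookkeeping.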
def call_analysis(analysis_func, sliceno_, q, preserve_result, parent_pid, output_fds, **kw):
    try:
        # tell iowrapper our PID, so our output goes to the right status stack.
        os.write(output_fds[sliceno_], pack("=Q", os.getpid()))
        # use our iowrapper fd instead of stdout/stderr
        os.dup2(output_fds[sliceno_], 1)
        os.dup2(output_fds[sliceno_], 2)
        for fd in output_fds:
            os.close(fd)
        slicename = 'analysis(%d)' % (sliceno_,)
        status._start(slicename, parent_pid, 't')
        setproctitle(slicename)
        os.close(_prof_fd)
        kw['sliceno'] = g.sliceno = sliceno_
        for dw in dataset._datasetwriters.values():
            if dw._for_single_slice is None:
                dw._set_slice(sliceno_)
        res = analysis_func(**kw)
        if preserve_result:
            # Remove defaultdicts until we find one with a picklable default_factory.
            # (This is what you end up doing manually anyway.)
            def picklable(v):
                try:
                    pickle.dumps(v, pickle.HIGHEST_PROTOCOL)
                    return True
                except Exception:
                    return False
            def fixup(d):
                if isinstance(d, defaultdict) and not picklable(d.default_factory):
                    if not d:
                        return {}
                    v = next(iteritems(d))
                    if isinstance(v, defaultdict) and not picklable(v.default_factory):
                        return {k: fixup(v) for k, v in iteritems(d)}
                    else:
                        return dict(d)
                else:
                    return d
            def save(item, name):
                blob.save(fixup(item), name, sliceno=sliceno_, temp=True)
            if isinstance(res, tuple):
                if sliceno_ == 0:
                    blob.save(len(res), "Analysis.tuple", temp=True)
                for ix, item in enumerate(res):
                    save(item, "Analysis.%d." % (ix,))
            else:
                if sliceno_ == 0:
                    blob.save(False, "Analysis.tuple", temp=True)
                save(res, "Analysis.")
        from accelerator.extras import saved_files
        dw_lens = {}
        dw_minmax = {}
        for name, dw in dataset._datasetwriters.items():
            if dw._for_single_slice in (None, sliceno_,):
                dw.close()
                dw_lens[name] = dw._lens
                dw_minmax[name] = dw._minmax
        c_fflush()
        q.put((sliceno_, time(), saved_files, dw_lens, dw_minmax, None,))
    except:
        c_fflush()
        q.put((sliceno_, time(), {}, {}, {}, fmt_tb(1),))
        print_exc()
        sleep(5)  # give launcher time to report error (and kill us)
        exitfunction()
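# --- Illustrative sketch (not the project's fork_analysis) ----------------------
# Each analysis child reports back exactly one tuple on q; the last element is
# None on success or a formatted traceback on failure.  A simplified sketch of
# the collecting side, assuming a queue object with a blocking .get() (e.g. a
# multiprocessing.Queue stand-in; example_collect is hypothetical):
def example_collect(q, slices):
    per_slice = [None] * slices
    for _ in range(slices):
        sliceno, t, saved_files, dw_lens, dw_minmax, error = q.get()
        if error:
            raise Exception('analysis(%d) failed:\n%s' % (sliceno, error))
        per_slice[sliceno] = t
    return per_slice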