def __enter__(self):
    self._status = status('Saving ' + self.filename)
    self._status.__enter__()
    # stupid python3 feels that w and x are exclusive, while python2 requires both.
    fh = getattr(self, '_open', open)(self.tmp_filename, 'xb' if PY3 else 'wbx')
    self.close = fh.close
    return fh
def sort(columniter):
    with status('Determining sort order'):
        info = datasets.source.columns
        special_handling = set()
        for column in options.sort_columns:
            if info[column].type.startswith('float') or info[column].type == 'number': # for NaN
                special_handling.add(column)
            if info[column].none_support:
                special_handling.add(column)
        if special_handling:
            # At least one sort column can have unsortable values
            first = True
            iters = []
            for column in options.sort_columns:
                it = columniter(column, status_reporting=first)
                first = False
                if column in special_handling:
                    it = filter_unsortable(column, it)
                iters.append(it)
            if len(iters) == 1:
                # Special case to not make tuples when there is only one column.
                lst = list(iters[0])
            else:
                lst = list(izip(*iters))
        else:
            columns = options.sort_columns
            if len(columns) == 1:
                # Special case to not make tuples when there is only one column.
                columns = columns[0]
            lst = list(columniter(columns))
    if options.trigger_column:
        if len(options.sort_columns) == 1:
            sort_extra = lst
        else:
            with status('Creating trigger list'):
                ix = options.sort_columns.index(options.trigger_column)
                sort_extra = [el[ix] for el in lst]
    else:
        sort_extra = None
    reverse = (options.sort_order == 'descending')
    with status('Creating sort list'):
        return sorted(range(len(lst)), key=lst.__getitem__, reverse=reverse), sort_extra
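# A minimal sketch (illustrative only, not part of this method) of the argsort
# idiom used above: sort() returns row *indices* rather than rows, so the same
# permutation can later be applied to every column independently.
#
#     lst = ['b', 'a', 'c']
#     order = sorted(range(len(lst)), key=lst.__getitem__)  # -> [1, 0, 2]
#     reordered = [lst[ix] for ix in order]                 # -> ['a', 'b', 'c']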
def pickle_load(filename='result.pickle', jobid=None, sliceno=None, encoding='bytes'):
    filename = _fn(filename, jobid, sliceno)
    with status('Loading ' + filename):
        with open(filename, 'rb') as fh:
            if PY3:
                return pickle.load(fh, encoding=encoding)
            else:
                return pickle.load(fh)
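# Hypothetical usage sketch (the filenames, job and slice number are examples,
# not from this file):
#
#     result = pickle_load()                                  # ./result.pickle in the current job
#     part = pickle_load('mydata.pickle', jobid=some_job, sliceno=3)  # a per-slice pickle from another job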
def analysis(sliceno, params, prepare_res):
    dw, ds_list, sort_idx = prepare_res
    if options.sort_across_slices:
        sort_idx = sort_idx[sliceno]
        columniter = partial(ds_list.iterate, None, copy_mode=True)
    else:
        sort_idx, _ = sort(partial(ds_list.iterate, sliceno))
        columniter = partial(ds_list.iterate, sliceno, copy_mode=True)
    for ix, column in enumerate(datasets.source.columns, 1):
        colstat = '%r (%d/%d)' % (column, ix, len(datasets.source.columns),)
        with status('Reading ' + colstat):
            lst = list(columniter(column))
        with status('Writing ' + colstat):
            w = dw.writers[column].write
            for idx in sort_idx:
                w(lst[idx])
        # Delete the list before making a new one, so we use less memory.
        del lst
def build(method, options={}, datasets={}, jobs={}, name=None, caption=None, **kw):
    """Just like urd.build, but for making subjobs"""

    global _a, _bad_kws
    assert g.running != 'analysis', "Analysis is not allowed to make subjobs"
    assert g.subjob_cookie, "Can't build subjobs: out of cookies"
    if not _a:
        _a = Automata(g.server_url, subjob_cookie=g.subjob_cookie)
        _a.update_method_info()
        _a.record[None] = _a.jobs = globals()['jobs']
        _bad_kws = set(getarglist(_a.call_method))
    bad_kws = _bad_kws & set(kw)
    if bad_kws:
        raise Exception('subjobs.build does not accept these keywords: %r' % (bad_kws,))
    def run():
        return _a.call_method(method, options=options, datasets=datasets, jobs=jobs, record_as=name, caption=caption, **kw)
    try:
        if name or caption:
            msg = 'Building subjob %s' % (name or method,)
            if caption:
                msg += ' "%s"' % (caption,)
            with status(msg):
                jid = run()
        else:
            jid = run()
    except ServerError as e:
        raise ServerError(e.args[0])
    except JobError as e:
        raise JobError(e.job, e.method, e.status)
    for d in _a.job_retur.jobs.values():
        if d.link not in _record:
            _record[d.link] = bool(d.make)
    return jid
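# Hypothetical usage sketch: from prepare or synthesis (not analysis, as
# asserted above) a method can build a subjob much like urd.build. The method
# name and option below are only an example.
#
#     jid = build('csvimport', options=dict(filename='data.csv'), name='import')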
def prepare(params):
    if options.trigger_column:
        assert options.sort_across_slices, 'trigger_column is meaningless without sort_across_slices'
        assert options.trigger_column in options.sort_columns, 'can only trigger on a column that is sorted on'
    d = datasets.source
    ds_list = d.chain(stop_ds={datasets.previous: 'source'})
    if options.sort_across_slices:
        columniter = partial(Dataset.iterate_list, None, datasets=ds_list)
        sort_idx, sort_extra = sort(columniter)
        total = len(sort_idx)
        per_slice = [total // params.slices] * params.slices
        extra = total % params.slices
        if extra:
            # spread the leftover length over pseudo-randomly selected slices
            # (using the start of sort_idx to select slices).
            # this will always select the first slices if data is already sorted,
            # but at least it's deterministic.
            selector = sorted(range(min(params.slices, total)), key=sort_idx.__getitem__)
            for sliceno in selector[:extra]:
                per_slice[sliceno] += 1
        # Switch to tracking what line the slices end at
        slice_end = []
        end = 0
        for cnt in per_slice:
            end += cnt
            slice_end.append(end)
        if options.trigger_column:
            # append an extra element that definitely changes value last, to simplify the loop
            sort_extra.append(object())
            sort_idx.append(-1)
            # move slice_end counts around to only switch when trigger_column changes
            def fixup_fwd(cnt):
                trigger_v = sort_extra[sort_idx[cnt - 1]]
                while trigger_v == sort_extra[sort_idx[cnt]]:
                    cnt += 1
                return cnt
            def fixup_bck(cnt, min_cnt):
                trigger_v = sort_extra[sort_idx[cnt - 1]]
                while cnt > min_cnt and trigger_v == sort_extra[sort_idx[cnt]]:
                    cnt -= 1
                return cnt
            with status('Adjusting for trigger_column'):
                prev = 0
                for sliceno, cnt in enumerate(slice_end[:-1]):
                    if cnt:
                        cnt = max(cnt, prev)
                        choosen = fwd = fixup_fwd(cnt)
                        bck = fixup_bck(cnt, prev)
                        # This could be smarter
                        if (cnt - bck) <= (fwd - cnt):
                            choosen = bck
                        prev = slice_end[sliceno] = choosen
        # and now switch sort_idx to be per slice
        sort_idx = [sort_idx[start:end] for start, end in zip([0] + slice_end, slice_end)]
        assert sum(len(part) for part in sort_idx) == total # all rows used
        if not options.trigger_column:
            assert len(set(len(part) for part in sort_idx)) < 3 # only 1 or 2 lengths possible
    else:
        sort_idx = None
    if options.sort_across_slices:
        hashlabel = None
    else:
        hashlabel = d.hashlabel
    if len(ds_list) == 1:
        filename = d.filename
    else:
        filename = None
    dw = DatasetWriter(
        columns=d.columns,
        caption=params.caption,
        hashlabel=hashlabel,
        filename=filename,
        previous=datasets.previous,
    )
    return dw, ds_list, sort_idx
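# Worked example (illustrative only) of the row distribution above, assuming
# total = 10 rows and params.slices = 3:
#
#     per_slice = [10 // 3] * 3   # [3, 3, 3]
#     extra = 10 % 3              # 1 leftover row
#     # one pseudo-randomly selected slice gets it, e.g. per_slice = [4, 3, 3],
#     # and accumulating gives slice_end = [4, 7, 10]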
def execute_process(workdir, jobid, slices, concurrency, result_directory, common_directory, input_directory, index=None, workdirs=None, server_url=None, subjob_cookie=None, parent_pid=0):
    WORKDIRS.update(workdirs)

    g.job = jobid
    setproctitle('launch')
    path = os.path.join(workdir, jobid)
    try:
        os.chdir(path)
    except Exception:
        print("Cannot cd to workdir", path)
        exit(1)

    g.params = params = job_params()
    method_ref = import_module(params.package + '.a_' + params.method)
    g.sliceno = -1

    g.job = CurrentJob(jobid, params, result_directory, input_directory)
    g.slices = slices

    g.options = params.options
    g.datasets = params.datasets
    g.jobs = params.jobs

    method_ref.options = params.options
    method_ref.datasets = params.datasets
    method_ref.jobs = params.jobs

    g.server_url = server_url
    g.running = 'launch'
    statmsg._start('%s %s' % (jobid, params.method,), parent_pid)

    def dummy():
        pass

    prepare_func = getattr(method_ref, 'prepare', dummy)
    analysis_func = getattr(method_ref, 'analysis', dummy)
    synthesis_func = getattr(method_ref, 'synthesis', dummy)

    synthesis_needs_analysis = 'analysis_res' in getarglist(synthesis_func)

    fd2pid, names, masters, slaves = iowrapper.setup(slices, prepare_func is not dummy, analysis_func is not dummy)
    def switch_output():
        fd = slaves.pop()
        os.dup2(fd, 1)
        os.dup2(fd, 2)
        os.close(fd)
    if analysis_func is dummy:
        q = None
    else:
        q = LockFreeQueue()
    iowrapper.run_reader(fd2pid, names, masters, slaves, q=q)
    for fd in masters:
        os.close(fd)

    # A chain must be finished from the back, so sort on that.
    sortnum_cache = {}
    def dw_sortnum(name):
        if name not in sortnum_cache:
            dw = dataset._datasetwriters.get(name)
            if not dw: # manually .finish()ed
                num = -1
            elif dw.previous and dw.previous.startswith(jobid + '/'):
                pname = dw.previous.split('/')[1]
                num = dw_sortnum(pname) + 1
            else:
                num = 0
            sortnum_cache[name] = num
        return sortnum_cache[name]

    prof = {}
    if prepare_func is dummy:
        prof['prepare'] = 0 # truthish!
    else:
        t = monotonic()
        switch_output()
        g.running = 'prepare'
        g.subjob_cookie = subjob_cookie
        setproctitle(g.running)
        with statmsg.status(g.running):
            g.prepare_res = method_ref.prepare(**args_for(method_ref.prepare))
            to_finish = [dw.name for dw in dataset._datasetwriters.values() if dw._started]
            if to_finish:
                with statmsg.status("Finishing datasets"):
                    for name in sorted(to_finish, key=dw_sortnum):
                        dataset._datasetwriters[name].finish()
        c_fflush()
        prof['prepare'] = monotonic() - t
    switch_output()
    setproctitle('launch')

    from accelerator.extras import saved_files

    if analysis_func is dummy:
        prof['per_slice'] = []
        prof['analysis'] = 0
    else:
        t = monotonic()
        g.running = 'analysis'
        g.subjob_cookie = None # subjobs are not allowed from analysis
        with statmsg.status('Waiting for all slices to finish analysis') as update:
            g.update_top_status = update
            prof['per_slice'], files, g.analysis_res = fork_analysis(slices, concurrency, analysis_func, args_for(analysis_func), synthesis_needs_analysis, slaves, q)
            del g.update_top_status
        prof['analysis'] = monotonic() - t
        saved_files.update(files)

    t = monotonic()
    g.running = 'synthesis'
    g.subjob_cookie = subjob_cookie
    setproctitle(g.running)
    with statmsg.status(g.running):
        synthesis_res = synthesis_func(**args_for(synthesis_func))
        if synthesis_res is not None:
            blob.save(synthesis_res, temp=False)
        if dataset._datasetwriters:
            with statmsg.status("Finishing datasets"):
                for name in sorted(dataset._datasetwriters, key=dw_sortnum):
                    dataset._datasetwriters[name].finish()
    if dataset._datasets_written:
        blob.save(dataset._datasets_written, 'DS/LIST', temp=False, _hidden=True)
    c_fflush()
    t = monotonic() - t
    prof['synthesis'] = t

    from accelerator.subjobs import _record
    return None, (prof, saved_files, _record)