def write1d(outfile, result, res_desc, CImethod):
    """
    Write the result of a fitting and its evaluation to a CSV file.

    :param str outfile: Name of the file to write to
    :param ResultStruct result: Result of the fitting evaluation
        (e.g. output of :py:func:`fit_evaluation`)
    :param str res_desc: Description of the residuals
        (in more detail than just the name of the residuals)
    :param str CImethod: Description of the confidence interval estimation method
    """
    with open(outfile, CSV_WRITE_FLAGS) as f:
        w = csv_writer(f)
        w.writerow(["Function", result.fct.fct.description])
        w.writerow(["Residuals", result.res_name, res_desc])
        w.writerow(["Parameter", "Value"])
        for pn, pv in izip(result.param_names, result.popt):
            w.writerow([pn, "%.20g" % pv])
        #TODO w.writerow(["Regression Evaluation"])
        w.writerow([])
        w.writerow(["Data"])
        w.writerow([result.xname, result.yname, result.fct_desc,
                    "Residuals: %s" % result.res_name])
        w.writerows(c_[result.xdata, result.ydata, result.yopts, result.res])
        w.writerow([])
        w.writerow(['Model validation'])
        w.writerow([result.yname, 'Normalized residuals', 'Theoretical quantiles'])
        w.writerows(c_[result.sorted_yopts, result.scaled_res, result.normq])
        if result.eval_points is not result.xdata:
            w.writerow([])
            w.writerow(["Interpolated data"])
            w.writerow([result.xname, result.yname])
            w.writerows(c_[result.eval_points, result.interpolation])
        if result.CI:
            w.writerow([])
            w.writerow(["Confidence interval"])
            w.writerow(["Method", CImethod])
            head = ["Parameters"] + list(chain(*[["%g%% - low" % v, "%g%% - high" % v]
                                                 for v in result.CI]))
            w.writerow(head)
            for cis in izip(result.param_names, *chain(*result.CIs[1])):
                cistr = [cis[0]] + ["%.20g" % v for v in cis[1:]]
                w.writerow(cistr)
            w.writerow([result.yname])
            head[0] = result.xname
            w.writerow(head)
            w.writerows(c_[tuple(chain([result.eval_points], *result.CIs[0]))])

def plot1d(result, loc=0, fig=None, res_fig=None):
    """
    Use matplotlib to display the result of a fit, and return the list of plots used

    :rtype: :py:class:`Plot1dResult`
    :returns: handles to the various figures and plots
    """
    if fig is None:
        fig = figure()
    else:
        try:
            figure(fig)
        except TypeError:
            figure(fig.number)

    p_est = plot(result.eval_points, result.interpolation, label='estimated')[0]
    p_data = plot(result.xdata, result.ydata, '+', label='data')[0]
    p_CIs = []
    if result.CI:
        for p, (low, high) in izip(result.CI, result.CIs[0]):
            l = plot(result.eval_points, low, '--', label='%g%% CI' % (p,))[0]
            h = plot(result.eval_points, high, l.get_color() + '--')[0]
            p_CIs += [l, h]
    if result.param_names:
        param_strs = ", ".join("%s=%g" % (n, v)
                               for n, v in izip(result.param_names, result.popt))
    else:
        param_strs = ", ".join("%g" % v for v in result.popt)
    param_strs = "$%s$" % (param_strs,)

    title("Estimated function %s with params %s" % (result.fct_desc, param_strs))
    xlabel(result.xname)
    ylabel(result.yname)
    legend(loc=loc)

    plots = {"figure": fig, "estimate": p_est, "data": p_data, "CIs": p_CIs}

    prt = plot_residual_tests(result.xdata, result.yopts, result.res,
                              "{0} with params {1}".format(result.fct_desc, param_strs),
                              result.xname, result.yname, result.res_name,
                              result.sorted_yopts, result.scaled_res,
                              result.normq, res_fig)
    plots.update(prt._asdict())
    return Plot1dResult(**plots)

def sort(columniter):
    with status('Determining sort order'):
        info = datasets.source.columns
        if sum(info[column].type not in nononehandling_types
               for column in options.sort_columns):
            # At least one sort column can have unsortable values
            first = True
            iters = []
            for column in options.sort_columns:
                it = columniter(column, status_reporting=first)
                first = False
                if info[column].type not in nononehandling_types:
                    it = filter_unsortable(column, it)
                iters.append(it)
            if len(iters) == 1:
                # Special case to not make tuples when there is only one column.
                lst = list(iters[0])
            else:
                lst = list(izip(*iters))
        else:
            columns = options.sort_columns
            if len(columns) == 1:
                # Special case to not make tuples when there is only one column.
                columns = columns[0]
            lst = list(columniter(columns))
        reverse = (options.sort_order == 'descending')
        with status('Creating sort list'):
            return sorted(range(len(lst)), key=lst.__getitem__, reverse=reverse)

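# Self-contained illustration (made-up data) of the index-sort trick used above:
# sort returns row positions rather than values, so every column can later be
# reordered the same way.
ages = [40, 25, 33]
names = ['ann', 'bo', 'cy']
keys = list(zip(ages, names))                         # one key tuple per row
order = sorted(range(len(keys)), key=keys.__getitem__)
# order == [1, 2, 0]; applying it to any column reorders the whole dataset consistently:
sorted_names = [names[i] for i in order]              # ['bo', 'cy', 'ann']
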
def __init__(self, slices):
    slices = range(slices)
    self._slices = iter(slices)
    tuple_len = pickle_load("Analysis.tuple")
    if tuple_len is False:
        self._is_tupled = False
    else:
        self._is_tupled = True
        self._loaders = [self._loader(ix, iter(slices)) for ix in range(tuple_len)]
        self._tupled = izip(*self._loaders)

def bootstrap_result(worker, start_repeats, end_repeats):
    try:
        for i in irange(start_repeats, end_repeats):
            new_fit = fit(shuffled_x[..., i % nx, :], shuffled_y[i % ny, :],
                          *fit_args, **fit_kwrds)
            new_fit.fit()
            result_array[i + 1] = new_fit(eval_points)
            for ea, attr in izip(extra_arrays, extra_attrs):
                ea[i + 1] = getattr(new_fit, attr)
    except Exception:
        traceback.print_exc(None, sys.stderr)
        raise

def getCIs(CI, *arrays):
    if not np.iterable(CI):
        CI = (CI,)

    def make_CI(a):
        return np.zeros((len(CI), 2) + a.shape[1:], dtype=float)
    CIs = tuple(make_CI(a) for a in arrays)
    for i, ci in enumerate(CI):
        ci = (100. - ci) / 2
        for cis, arr in izip(CIs, arrays):
            low = np.percentile(arr, ci, axis=0)
            high = np.percentile(arr, 100 - ci, axis=0)
            cis[i] = [low, high]
    return CIs

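# For context, getCIs turns each array of bootstrap evaluations (samples along
# axis 0) into symmetric percentile intervals. A self-contained sketch of the
# same idea using plain NumPy; the sample data and the 95% level are made up:
import numpy as np

rng = np.random.default_rng(0)
samples = rng.normal(size=(1000, 50))   # 1000 bootstrap curves at 50 points

ci = 95.0
tail = (100.0 - ci) / 2                 # 2.5 for a 95% interval
low = np.percentile(samples, tail, axis=0)
high = np.percentile(samples, 100.0 - tail, axis=0)
# low/high bound the central 95% of the bootstrap distribution at each point,
# which is what getCIs stores for each requested CI level.
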
def main(n, timer):
    times = []
    for i in xrange(n):
        t0 = timer()
        u = [1] * DEFAULT_N
        for dummy in xrange(10):
            v = eval_AtA_times_u(u)
            u = eval_AtA_times_u(v)
        vBv = vv = 0
        for ue, ve in izip(u, v):
            vBv += ue * ve
            vv += ve * ve
        tk = timer()
        times.append(tk - t0)
    return times

def main(n: int, timer):
    times = []
    for i in xrange(n):
        t0 = timer()
        u = [1] * DEFAULT_N
        for dummy in xrange(10):
            v = eval_AtA_times_u(u)
            u = eval_AtA_times_u(v)
        vBv = vv = 0
        for ue, ve in izip(u, v):
            vBv += ue * ve
            vv += ve * ve
        tk = timer()
        times.append(tk - t0)
    return times

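# The two functions above are the classic spectral-norm benchmark kernel: u is
# repeatedly multiplied by A^T A (power iteration), and the largest singular value
# of A is then approximated as sqrt(vBv / vv). eval_AtA_times_u is not shown in
# this section; a sketch of the usual benchmark definition (assumed here, not
# taken from this code) would be:
def eval_A(i, j):
    # Entry (i, j) of the benchmark's infinite matrix, 0-based indices.
    return 1.0 / ((i + j) * (i + j + 1) // 2 + i + 1)

def eval_A_times_u(u):
    return [sum(eval_A(i, j) * uj for j, uj in enumerate(u)) for i in range(len(u))]

def eval_At_times_u(u):
    return [sum(eval_A(j, i) * uj for j, uj in enumerate(u)) for i in range(len(u))]

def eval_AtA_times_u(u):
    # One power-iteration step: u -> A^T (A u).
    return eval_At_times_u(eval_A_times_u(u))
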
def _iterate_datasets(to_iter, columns, pre_callback, post_callback, filter_func,
                      translation_func, translators, want_tuple, range,
                      status_reporting):
    skip_ds = None

    def argfixup(func, is_post):
        if func:
            if len(getargspec(func).args) == 1:
                seen_ds = [None]

                def wrapper(d, sliceno=None):
                    if d != seen_ds[0]:
                        if is_post:
                            if seen_ds[0] and seen_ds[0] != skip_ds:
                                func(seen_ds[0])
                        else:
                            func(d)
                        seen_ds[0] = d
                return wrapper, True
        return func, False

    pre_callback, unsliced_pre_callback = argfixup(pre_callback, False)
    post_callback, unsliced_post_callback = argfixup(post_callback, True)
    if not to_iter:
        return
    if range:
        range_k, (range_bottom, range_top,) = next(iteritems(range))
        range_check = range_check_function(range_bottom, range_top)
        if range_k in columns and range_k not in translators and not translation_func:
            has_range_column = True
            range_i = columns.index(range_k)
            if want_tuple:
                range_f = lambda t: range_check(t[range_i])
            else:
                range_f = range_check
        else:
            has_range_column = False
    if status_reporting:
        from status import status
    else:
        from status import dummy_status as status

    def fmt_dsname(d, sliceno, rehash):
        if rehash:
            return d + ':REHASH'
        else:
            return '%s:%d' % (d, sliceno)

    if len(to_iter) == 1:
        msg_head = 'Iterating ' + fmt_dsname(*to_iter[0])

        def update_status(update, ix, d, sliceno, rehash):
            pass
    else:
        msg_head = 'Iterating %s to %s' % (fmt_dsname(*to_iter[0]),
                                           fmt_dsname(*to_iter[-1]),)

        def update_status(update, ix, d, sliceno, rehash):
            update('%s, %d/%d (%s)' % (msg_head, ix, len(to_iter),
                                       fmt_dsname(d, sliceno, rehash)))

    with status(msg_head) as update:
        for ix, (d, sliceno, rehash) in enumerate(to_iter, 1):
            if unsliced_post_callback:
                post_callback(d)
            update_status(update, ix, d, sliceno, rehash)
            if pre_callback:
                if d == skip_ds:
                    continue
                try:
                    pre_callback(d, sliceno)
                except SkipSlice:
                    if unsliced_pre_callback:
                        skip_ds = d
                    continue
                except SkipJob:
                    skip_ds = d
                    continue
            it = d._iterator(None if rehash else sliceno, columns)
            for col_ix, trans in translators.items():
                it[col_ix] = imap(trans, it[col_ix])
            if want_tuple:
                it = izip(*it)
            else:
                it = it[0]
            if rehash:
                it = d._hashfilter(sliceno, rehash, it)
            if translation_func:
                it = imap(translation_func, it)
            if range:
                c = d.columns[range_k]
                if c.min is not None and (not range_check(c.min) or not range_check(c.max)):
                    if has_range_column:
                        it = ifilter(range_f, it)
                    else:
                        if rehash:
                            filter_it = d._hashfilter(sliceno, rehash,
                                                      d._column_iterator(None, range_k))
                        else:
                            filter_it = d._column_iterator(sliceno, range_k)
                        it = compress(it, imap(range_check, filter_it))
            if filter_func:
                it = ifilter(filter_func, it)
            yield it
            if post_callback and not unsliced_post_callback:
                post_callback(d, sliceno)
    if unsliced_post_callback:
        post_callback(None)

def setup():
    global TMP_PATH
    TMP_PATH = tempfile.mkdtemp()
    for _, fn in izip(xrange(NUM_FILES), generate_files()):
        with open(fn, "w") as f:
            f.write(fn)

def csvexport(sliceno, filename, labelsonfirstline):
    assert len(options.separator) == 1
    assert options.quote_fields in ('', "'", '"',)
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        if jobids.previous:
            prev_source = job_params(jobids.previous).datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if filename.lower().endswith('.gz'):
        mkwrite = mkwrite_gz
    elif filename.lower().endswith('.csv'):
        mkwrite = mkwrite_uncompressed
    else:
        raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
    iters = []
    first = True
    for label in options.labels:
        it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
        first = False
        t = d.columns[label].type
        if t == 'unicode' and PY2:
            it = imap(enc, it)
        elif t == 'bytes' and PY3:
            it = imap(lambda s: s.decode('utf-8', errors='backslashreplace'), it)
        elif t in ('float32', 'float64', 'number'):
            it = imap(repr, it)
        elif t == 'json':
            it = imap(dumps, it)
        elif t not in ('unicode', 'ascii', 'bytes'):
            it = imap(str, it)
        iters.append(it)
    it = izip(*iters)
    with mkwrite(filename) as write:
        q = options.quote_fields
        sep = options.separator
        if q:
            qq = q + q
            if labelsonfirstline:
                write(enc(sep.join(q + n.replace(q, qq) + q for n in options.labels)))
            for data in it:
                write(sep.join(q + n.replace(q, qq) + q for n in data))
        else:
            if labelsonfirstline:
                write(enc(sep.join(options.labels)))
            for data in it:
                write(sep.join(data))

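# Tiny illustration (made-up values) of the quoting used in csvexport above: any
# quote character inside a field is escaped by doubling it, then the field is
# wrapped in quotes and the fields are joined with the separator.
q, sep = '"', ','
fields = ['plain', 'says "hi"']
line = sep.join(q + f.replace(q, q + q) + q for f in fields)
# line == '"plain","says ""hi"""'
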
def bootstrap(fit, xdata, ydata, CI, shuffle_method=bootstrap_residuals,
              shuffle_args=(), shuffle_kwrds={}, repeats=3000,
              eval_points=None, full_results=False, nb_workers=None,
              extra_attrs=(), fit_args=(), fit_kwrds={}):
    """
    This function implements the bootstrap algorithm for a regression algorithm.
    It is capable of spreading the load across several worker processes using
    shared memory and the :py:mod:`multiprocessing` module.

    :type  fit: callable
    :param fit: Method used to compute regression. The call is::

            f = fit(xdata, ydata, *fit_args, **fit_kwrds)

        Fit should return an object that would evaluate the regression on a
        set of points. The next call will be::

            f(eval_points)

    :type  xdata: ndarray of shape (N,) or (k,N) for function with k predictors
    :param xdata: The independent variable where the data is measured

    :type  ydata: ndarray
    :param ydata: The dependent data

    :type  CI: tuple of float
    :param CI: List of percentiles to extract

    :type  shuffle_method: callable
    :param shuffle_method: Create shuffled dataset. The call is::

          shuffle_method(xdata, ydata, y_est, repeat=repeats, *shuffle_args, **shuffle_kwrds)

        where ``y_est`` is the estimated dependent variable on the xdata.

    :type  shuffle_args: tuple
    :param shuffle_args: List of arguments for the shuffle method

    :type  shuffle_kwrds: dict
    :param shuffle_kwrds: Dictionary of arguments for the shuffle method

    :type  repeats: int
    :param repeats: Number of repeats for the bootstrapping

    :type  eval_points: ndarray or None
    :param eval_points: List of points to evaluate. If None, eval_points is xdata.

    :type  full_results: bool
    :param full_results: if True, output also the whole set of evaluations

    :type  nb_workers: int or None
    :param nb_workers: Number of worker processes. If None, the number of detected
        CPUs will be used. And if 1 or less, a single process will be used.

    :type  extra_attrs: tuple of str
    :param extra_attrs: List of attributes of the fitting method to extract on
        top of the y values for confidence intervals

    :type  fit_args: tuple
    :param fit_args: List of extra arguments for the fit callable

    :type  fit_kwrds: dict
    :param fit_kwrds: Dictionary of extra named arguments for the fit callable

    :rtype: :py:class:`BootstrapResult`
    :return: Estimated y on the data, on the evaluation points, the requested
        confidence intervals and, if requested, the shuffled X, Y and the full
        estimated distributions.
    """
    xdata = np.asarray(xdata)
    ydata = np.asarray(ydata)
    y_fit = fit(xdata, ydata, *fit_args, **fit_kwrds)
    y_fit.fit()

    shuffled_x, shuffled_y = shuffle_method(y_fit, xdata, ydata,
                                            repeats=repeats,
                                            *shuffle_args, **shuffle_kwrds)
    nx = shuffled_x.shape[-2]
    ny = shuffled_y.shape[0]
    extra_values = []
    for attr in extra_attrs:
        extra_values.append(getattr(y_fit, attr))

    if eval_points is None:
        eval_points = xdata
    if nb_workers is None:
        nb_workers = mp.cpu_count()

    multiprocess = nb_workers > 1

    # Copy everything in shared mem
    if multiprocess:
        ra = sharedmem.zeros((repeats + 1, len(eval_points)), dtype=float)
        result_array = ra.np
        sx = sharedmem.array(shuffled_x)
        sy = sharedmem.array(shuffled_y)
        ep = sharedmem.array(eval_points)

        def make_ea(ev):
            return sharedmem.zeros((repeats + 1, len(ev)), dtype=float)
        eas = [make_ea(ev) for ev in extra_values]
        extra_arrays = [ea.np for ea in eas]
        pool = mp.Pool(mp.cpu_count(), bootstrap_workers.initialize_shared,
                       (nx, ny, ra, eas, sx, sy, ep, extra_attrs,
                        fit, fit_args, fit_kwrds))
    else:
        result_array = np.empty((repeats + 1, len(eval_points)), dtype=float)

        def make_ea(ev):
            return np.empty((repeats + 1, len(ev)), dtype=float)
        extra_arrays = [make_ea(ev) for ev in extra_values]
        bootstrap_workers.initialize(nx, ny, result_array, extra_arrays,
                                     shuffled_x, shuffled_y, eval_points,
                                     extra_attrs, fit, fit_args, fit_kwrds)

    result_array[0] = y_fit(eval_points)
    for ea, ev in izip(extra_arrays, extra_values):
        ea[0] = ev

    base_repeat = repeats // nb_workers
    if base_repeat * nb_workers < repeats:
        base_repeat += 1

    for i in irange(nb_workers):
        end_repeats = (i + 1) * base_repeat
        if end_repeats > repeats:
            end_repeats = repeats
        if multiprocess:
            pool.apply_async(bootstrap_workers.bootstrap_result,
                             (i, i * base_repeat, end_repeats))
        else:
            bootstrap_workers.bootstrap_result(i, i * base_repeat, end_repeats)
    if multiprocess:
        pool.close()
        pool.join()

    CIs = getCIs(CI, result_array, *extra_arrays)

    # copy the array to not return a view on a larger array
    y_eval = np.array(result_array[0])

    if not full_results:
        shuffled_y = shuffled_x = result_array = None
        extra_arrays = ()
    elif multiprocess:
        result_array = result_array.copy()  # copy in local memory
        extra_arrays = [ea.copy() for ea in extra_arrays]

    return BootstrapResult(y_fit, y_fit(xdata), eval_points, y_eval,
                           tuple(CI), CIs, shuffled_x, shuffled_y, result_array)

def _iterate_datasets(to_iter, columns, pre_callback, post_callback, filter_func,
                      translation_func, translators, want_tuple, range):
    skip_jobid = None

    def argfixup(func, is_post):
        if func:
            if len(getargspec(func).args) == 1:
                seen_jobid = [None]

                def wrapper(jobid, sliceno=None):
                    if jobid != seen_jobid[0]:
                        if is_post:
                            if seen_jobid[0] and seen_jobid[0] != skip_jobid:
                                func(seen_jobid[0])
                        else:
                            func(jobid)
                        seen_jobid[0] = jobid
                return wrapper, True
        return func, False

    pre_callback, unsliced_pre_callback = argfixup(pre_callback, False)
    post_callback, unsliced_post_callback = argfixup(post_callback, True)
    if not to_iter:
        return
    if range:
        range_k, (range_bottom, range_top,) = next(iteritems(range))
        range_check = range_check_function(range_bottom, range_top)
        if range_k in columns and range_k not in translators and not translation_func:
            has_range_column = True
            range_i = columns.index(range_k)
            if want_tuple:
                range_f = lambda t: range_check(t[range_i])
            else:
                range_f = range_check
        else:
            has_range_column = False
    starting_at = '%s:%d' % (to_iter[0][0], to_iter[0][2],)
    if len(to_iter) == 1:
        msg = 'Iterating ' + starting_at
    else:
        msg = 'Iterating %d dataset slices starting at %s' % (len(to_iter), starting_at,)
    with status(msg):
        for ix, (jobid, d, sliceno, rehash) in enumerate(to_iter):
            if unsliced_post_callback:
                post_callback(jobid)
            if pre_callback:
                if jobid == skip_jobid:
                    continue
                try:
                    pre_callback(jobid, sliceno)
                except SkipSlice:
                    if unsliced_pre_callback:
                        skip_jobid = jobid
                    continue
                except SkipJob:
                    skip_jobid = jobid
                    continue
            it = d._iterator(None if rehash else sliceno, columns)
            # Use a separate loop variable so the outer dataset index `ix`
            # stays valid for the status message below.
            for col_ix, trans in translators.items():
                it[col_ix] = imap(trans, it[col_ix])
            if want_tuple:
                it = izip(*it)
            else:
                it = it[0]
            if rehash:
                it = d._hashfilter(sliceno, rehash, it)
            if translation_func:
                it = imap(translation_func, it)
            if range:
                c = d.columns[range_k]
                if c.min is not None and (not range_check(c.min) or not range_check(c.max)):
                    if has_range_column:
                        it = ifilter(range_f, it)
                    else:
                        if rehash:
                            filter_it = d._hashfilter(sliceno, rehash,
                                                      d._column_iterator(None, range_k))
                        else:
                            filter_it = d._column_iterator(sliceno, range_k)
                        it = compress(it, imap(range_check, filter_it))
            if filter_func:
                it = ifilter(filter_func, it)
            with status('(%d/%d) %s:%s' % (ix, len(to_iter), jobid,
                                           'REHASH' if rehash else sliceno,)):
                yield it
            if post_callback and not unsliced_post_callback:
                post_callback(jobid, sliceno)
    if unsliced_post_callback:
        post_callback(None)