import numpy


def meanbin(dataset, stepsize=None, numbins=None, dim=None, mastercolumn=0,
            mastermax=None, mastermin=None, nanthreshold=0.5,
            original_if_supersample=False):
    """
    Do a mean binning of the dataset

    dataset: a dataset
    stepsize: the size of the steps in the new binning (this mode
        symmetrically trims the endpoints to fit a whole number of bins)
    numbins: the number of bins (overrides stepsize)
    dim: the dimension to do the binning along (None means largest)
    mastercolumn: the column to use as master (default 0)
    mastermax: the maximum value needed
    mastermin: the minimum value needed
    nanthreshold: the fraction of NaNs in a bin above which the bin
        becomes NaN
    original_if_supersample: return the range-limited original data
        instead of raising when the parameters would supersample

    Either stepsize or numbins must be provided.
    """
    data = numpy.array(dataset["data"], dtype=float)
    if data.ndim != 2:
        raise TypeError("Data must be 2-d")
    # if autodim, find the largest dimension
    if dim is None:
        dim = 0 if numpy.size(data, 0) > numpy.size(data, 1) else 1
    if dim == 1:
        data = data.transpose()
    if stepsize is None and numbins is None:
        raise TypeError("Must provide either stepsize or numbins")
    if stepsize is not None and numbins is not None:
        raise TypeError("Must provide either stepsize or numbins, but not both")
    # Sort data on mastercolumn
    sortorder = data.argsort(axis=0)[:, mastercolumn]
    data = data[sortorder, :]
    if mastermin is None:
        mastermin = data[0, mastercolumn]
    if mastermax is None:
        mastermax = data[-1, mastercolumn]
    # the extract method seems to be too slow for large datasets
    # maybe consider removing the min/max extractors
    master = data[:, mastercolumn]
    extract = numpy.all([(master > mastermin), (master <= mastermax)], 0)
    data = data[extract, :]
    # after the optional transpose, samples always run along axis 0
    datalen = numpy.size(data, 0)
    # Using a stepsize
    if stepsize is not None:
        if type(stepsize) not in [float, int]:
            raise TypeError("Stepsize must be a number")
        numbins = ((mastermax - mastermin) / stepsize) + 1
        extra = ((numbins - int(numbins)) * stepsize) / 2
        mastermin += extra
        mastermax -= extra
        numbins = int(numbins)
    # If trying to supersample
    if numbins > datalen:
        if original_if_supersample:
            dataset["derived"] = True
            if "warnings" not in dataset:
                dataset["warnings"] = []
            dataset["warnings"].append(
                "Data was not mean-binned, as the sampling parameters "
                "would result in supersampled data")
            # limit to the requested master range
            master = data[:, mastercolumn]
            extract = numpy.all([(master > mastermin), (master <= mastermax)], 0)
            data = data[extract, :]
            if dim == 1:
                data = data.transpose()
            dataset["data"] = nantonone(data.tolist())
            dataset["rows"] = len(dataset["data"])
            return dataset
        else:
            raise TypeError("Specified parameters result in supersampling")
    (bins, binwidth) = numpy.linspace(mastermin, mastermax, num=numbins,
                                      retstep=True)
    binhalf = binwidth / 2.0
    master = data[:, mastercolumn]
    outdata = numpy.empty([numbins, numpy.size(data, 1)])
    localend = 0
    for idx, bincenter in enumerate(bins):
        (binmin, binmax) = (bincenter - binhalf, bincenter + binhalf)
        # Non-optimized:
        # extract = numpy.all([(master > binmin), (master <= binmax)], 0)
        # local = data[extract, :]
        # Optimized extraction of values, thanks to the previous sort
        localstart = localend
        while localend < datalen and data[localend, mastercolumn] <= binmax:
            localend += 1
        local = data[localstart:localend, :]
        sums = numpy.nansum(local, 0)
        nans = numpy.isnan(local)
        nancount = nans.sum(0)
        nonnancount = (~nans).sum(0)
        row = sums / nonnancount
        nancover = nancount.astype(float) / (nonnancount + nancount)
        nanindex = (nancover > nanthreshold)
        # if the nancover is more than the threshold,
        # convert that point into a NaN
        row[nanindex] = numpy.nan
        # set the master element to the bin center instead of the mean
        # master value; for large bins of evenly spaced data these should
        # be almost the same, except for the endpoint
        row[mastercolumn] = bincenter
        outdata[idx, :] = row
    if dim == 1:
        outdata = outdata.transpose()
    dataset["data"] = nantonone(outdata.tolist())
    dataset["rows"] = len(dataset["data"])
    dataset["derived"] = True
    return dataset
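

# A minimal usage sketch for meanbin (not part of the original module
# logic): it assumes datasets are plain dicts with a "data" list of rows,
# and that nantonone (defined elsewhere in this module) maps NaN to None.
def _demo_meanbin():
    # 100 evenly spaced samples: column 0 is the master axis,
    # column 1 a dependent value
    demo = {"data": [[i * 0.1, i * 0.1 + 0.05] for i in range(100)]}
    binned = meanbin(demo, numbins=10)
    print(binned["rows"])     # 10
    print(binned["data"][0])  # [first bin center, mean of its samples]
    return binned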
def spans_to_time(depth, time):
    """
    Put discrete data on a timescale

    depth: sequence of depth values
    time: sequence of time values

    Based on Sune Olander Rasmussen's data_on_timescale.m (23 March 2006),
    which in turn is based on resampling code from Bo M. Vinther
    """
    ## Check consistency
    if depth["columns"] != 2:
        raise TypeError("depth must be specified with two columns of data")
    if time["columns"] != 2:
        raise TypeError("time must be specified with two columns of data")
    if depth["sequence"]["index_marker_type"] != "span":
        raise TypeError("depth must be a spanning sequence")
    if time["sequence"]["index_marker_type"] != "point":
        raise TypeError("time must be a point sequence")
    depthdata = numpy.asarray(depth["data"])
    timedata = numpy.asarray(time["data"])
    M = depth["rows"]
    N = time["rows"] - 1
    output = numpy.zeros((N, 3))
    output[:, 0] = timedata[1:, 0]
    output[:, 1] = timedata[1:, 1]
    minj = 0
    maxj = 0
    for i in range(N):
        # find the first data sample in the time interval
        for j in range(minj, M):
            if timedata[i, 0] < depthdata[j, 0]:
                minj = j
                break
        # find the last data sample in the time interval
        for j in range(minj, M):
            if timedata[i+1, 0] <= depthdata[j, 0]:
                maxj = j
                break
        mm = maxj - minj + 1  # number of samples in the time interval
        # FIXME: handle time starting before data, or the reverse
        # if minj == maxj and minj == 0:
        #     output[i, 2] = numpy.nan
        #     continue
        dz = numpy.zeros(mm)
        # FIXME: ignore nan stuff
        dz[0] = depthdata[minj, 0] - timedata[i, 0]
        for j in range(1, mm-1):
            dz[j] = depthdata[minj+j, 0] - depthdata[minj+j-1, 0]
        dz[mm-1] = timedata[i+1, 0] - depthdata[maxj-1, 0]
        DZ = numpy.sum(dz)
        # FIXME: nan stuff
        for j in range(mm):
            output[i, 2] += (dz[j] * depthdata[minj+j, 1]) / DZ
    time["data"] = nantonone(output.tolist())
    # keep the bookkeeping in step with the reshaped data
    time["rows"] = N
    time["columns"] = 3
    time["current_parameters"].append(depth["current_parameters"][0])
    return time
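

# A hypothetical usage sketch for spans_to_time (illustration only): it
# assumes the dict layout the function actually reads -- "columns",
# "rows", "sequence"/"index_marker_type", "data", "current_parameters" --
# and that nantonone is defined elsewhere in this module; the parameter
# names are made up for the example.
def _demo_spans_to_time():
    depth = {
        "columns": 2,
        "rows": 4,
        "sequence": {"index_marker_type": "span"},
        # span bottoms at depths 1..4 with measured values 10..40
        "data": [[1.0, 10.0], [2.0, 20.0], [3.0, 30.0], [4.0, 40.0]],
        "current_parameters": ["d18O"],
    }
    time = {
        "columns": 2,
        "rows": 3,
        "sequence": {"index_marker_type": "point"},
        # three time points bounding two intervals
        "data": [[0.5, 100.0], [2.5, 200.0], [3.5, 300.0]],
        "current_parameters": ["age"],
    }
    result = spans_to_time(depth, time)
    # depth-weighted means per interval:
    # [[2.5, 200.0, 20.0], [3.5, 300.0, 35.0]]
    print(result["data"])
    return result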