def clean_outliers(data, thresh):
    """Replace outliers in *data* with NaN.  (by Alejandro Nunez)

    A point *x* is identified as an outlier if |x - med| / MAD > thresh,
    where *med* is the median of the data values and *MAD* is the median
    absolute deviation, defined as 1.482 * median(|x - med|).  This
    function mimics IDL mc_findoutliers (by Mike Cushing), with output
    differences.

    Parameters
    ----------
    data : array-like
        Data values.
    thresh : float
        The sigma threshold that defines data outliers.

    Returns
    -------
    numpy.ndarray or None
        Float copy of *data* with outliers set to NaN, or None when the
        input is not indexable.
    """
    # Check inputs: data must be indexable (array-like).
    try:
        data[0]
    except TypeError:
        print('Data invalid.')
        return

    # Float copy so NaN can be assigned even for integer input.
    dataClean = np.array(data, dtype=float)

    # Median and median absolute deviation (nan-aware).
    # np.nanmedian replaces scipy.stats.nanmedian, removed in scipy 1.0.
    med = np.nanmedian(dataClean)
    mad = 1.482 * np.nanmedian(abs(dataClean - med))

    if mad == 0:
        print('MAD is equal to zero.')
    else:
        # Boolean mask instead of np.where: len() of the np.where tuple
        # was always 1, so the original emptiness check never triggered.
        outlierMask = abs((dataClean - med) / mad) > thresh
        dataClean[outlierMask] = np.nan

    return dataClean
def clean_outliers(data, thresh):
    """Replace outliers in *data* with NaN.  (by Alejandro Nunez)

    A point *x* is an outlier if |x - med| / MAD > thresh, where *med*
    is the median of the data and *MAD* is the median absolute deviation
    (1.482 * median(|x - med|)).  Mimics IDL mc_findoutliers (by Mike
    Cushing), with output differences.

    Parameters
    ----------
    data : array-like
        Data values.
    thresh : float
        The sigma threshold that defines data outliers.

    Returns
    -------
    numpy.ndarray or None
        Float copy of *data* with outliers replaced by NaN, or None when
        the input is not indexable.
    """
    # Check inputs: data must be indexable (array-like).
    try:
        data[0]
    except TypeError:
        print('Data invalid.')
        return

    # Float copy so NaN assignment works for integer input too.
    dataClean = np.array(data, dtype=float)

    # Calculate median and median absolute deviation.
    # np.nanmedian replaces scipy.stats.nanmedian, removed in scipy 1.0.
    med = np.nanmedian(dataClean)
    mad = 1.482 * np.nanmedian(abs(dataClean - med))

    if mad == 0:
        print('MAD is equal to zero.')
    else:
        # Boolean mask: the original len(np.where(...)) check compared the
        # length of the indices *tuple* (always 1) and was a no-op.
        outlierMask = abs((dataClean - med) / mad) > thresh
        dataClean[outlierMask] = np.nan

    return dataClean
def adaptiveMedianFilt(dat, kernel_size):
    """Perform a median filter with a sliding window.  For edges, we
    shrink the window.

    Interior samples use a centered window of ``kernel_size`` points;
    near each edge the window shrinks symmetrically (sizes 1, 3, 5, ...)
    so every output sample is defined.

    Parameters
    ----------
    dat : numpy.ndarray
        1-D input data (NaNs are ignored by the medians).
    kernel_size : int
        Odd window length.

    Returns
    -------
    numpy.ndarray
        Filtered data, same shape as *dat*.
    """
    assert kernel_size % 2 == 1, 'kernel_size must be odd'
    nobs = dat.size
    filt_data = np.nan * np.ones_like(dat)

    # Beginning region: symmetric windows of growing size 1, 3, 5, ...
    # np.nanmedian replaces the scipy.stats helper removed in scipy 1.0.
    halfWindow = 0
    for i in range(kernel_size // 2):
        filt_data[i] = np.nanmedian(dat[i - halfWindow:i + halfWindow + 1])
        halfWindow += 1

    # Middle region: full-size centered window.
    halfWindow = kernel_size // 2
    for i in range(halfWindow, nobs - halfWindow):
        filt_data[i] = np.nanmedian(dat[i - halfWindow:i + halfWindow + 1])

    # Ending region: mirror of the beginning, shrinking ..., 5, 3, 1.
    # Fix: the original started one sample too late and left the final
    # kernel_size // 2 samples unfiltered (NaN).
    halfWindow = kernel_size // 2 - 1
    for i in range(nobs - kernel_size // 2, nobs):
        filt_data[i] = np.nanmedian(dat[i - halfWindow:i + halfWindow + 1])
        halfWindow -= 1

    return filt_data
def calc_norm_summary_tables(accuracy_tbl, time_tbl):
    """
    Calculate normalized performance/ranking summary, as numpy matrices as
    usual for convenience, and matrices of additional statistics (min, max,
    percentiles, etc.)

    Here normalized means relative to the best which gets a 1, all others
    get the ratio resulting from dividing by the performance of the best.
    """
    # Min across all minimizers, i.e. for each fit problem what is the
    # lowest chi-squared and the lowest time.
    min_sum_err_sq = np.nanmin(accuracy_tbl, 1)
    min_runtime = np.nanmin(time_tbl, 1)

    # Create normalised tables: best minimizer gets 1, others the ratio.
    norm_acc_rankings = accuracy_tbl / min_sum_err_sq[:, None]
    norm_runtimes = time_tbl / min_runtime[:, None]

    # Rows: min, max, mean, median (per minimizer, across problems).
    # np.nanmean/np.nanmedian replace the scipy.stats helpers removed in
    # scipy 1.0.
    summary_cells_acc = np.array([np.nanmin(norm_acc_rankings, 0),
                                  np.nanmax(norm_acc_rankings, 0),
                                  np.nanmean(norm_acc_rankings, 0),
                                  np.nanmedian(norm_acc_rankings, 0)
                                  ])

    summary_cells_runtime = np.array([np.nanmin(norm_runtimes, 0),
                                      np.nanmax(norm_runtimes, 0),
                                      np.nanmean(norm_runtimes, 0),
                                      np.nanmedian(norm_runtimes, 0)
                                      ])

    return norm_acc_rankings, norm_runtimes, summary_cells_acc, summary_cells_runtime
def plotForce():
    # Plot force-field analysis results: panel 1 shows the force field via
    # EvalTraj.plotFF; panel 2 shows net-force magnitude vs. displacement of
    # the force origin for four precomputed Rdpse<i>.npy result files, then
    # saves summary statistics to 'force.npy'.
    # NOTE(review): Python 2 code (print statement); relies on module-level
    # helpers figure/subplot/subplot_annotate and on files in the CWD.
    figure(size=3, aspect=0.5)
    subplot(1, 2, 1)
    from EvalTraj import plotFF
    plotFF(vp=351, t=28, f=900, cm=0.6, foffset=8)
    subplot_annotate()

    subplot(1, 2, 2)
    for i in [1, 2, 3, 4]:
        R = np.squeeze(np.load('Rdpse%d.npy' % i))
        # median over the replication axis, dropping the first column
        R = stats.nanmedian(R, axis=2)[:, 1:, :]
        dps = np.linspace(-1, 1, 201)[1:]
        plt.plot(dps, R[:, :, 2].mean(0));
    plt.legend([0, 0.1, 0.2, 0.3], loc=3)

    # Re-load case i=2 to mark the per-subject minima.
    i = 2
    R = np.squeeze(np.load('Rdpse%d.npy' % i))
    R = stats.nanmedian(R, axis=2)[:, 1:, :]
    mn = np.argmin(R, axis=1)
    # small random vertical jitter so the '+' markers do not overlap
    y = np.random.randn(mn.shape[0]) * 0.00002 + 0.0438
    plt.plot(np.sort(dps[mn[:, 2]]), y, '+', mew=1, ms=6, mec=[0.39, 0.76, 0.64])
    plt.xlabel('Displacement of Force Origin')
    plt.ylabel('Average Net Force Magnitude')

    # Mean displacement of the minimum with 95% and 50% t-based intervals.
    hh = dps[mn[:, 2]]
    err = np.std(hh) / np.sqrt(hh.shape[0]) * stats.t.ppf(0.975, hh.shape[0])
    err2 = np.std(hh) / np.sqrt(hh.shape[0]) * stats.t.ppf(0.75, hh.shape[0])
    m = np.mean(hh)
    print m, m - err, m + err
    np.save('force', [m, m - err, m + err, m - err2, m + err2])
    plt.xlim([-0.5, 0.5])
    plt.ylim([0.0435, 0.046])
    plt.grid(b=True, axis='x')
    subplot_annotate()
def normalization_shift_rows_or_cols(self, axis, stacker, history):
    # Remove per-row or per-column offsets from every plate by subtracting
    # the median along *axis*, computed over non-control wells.  *stacker*
    # combines the per-plate arrays into one stack; *history* accumulates
    # the applied offsets.  Returns a dict mapping (plate, repindex) to the
    # corrected values.
    # NOTE(review): Python 2 code (dict.iteritems).  CONTROL_POPULATION,
    # fix_nans, conservative_nanmedian and nanmedian come from elsewhere in
    # the module.
    endshape = [-1, -1]
    endshape[axis] = 1  # reshape target: keep the corrected axis, length-1 the other
    if self.combine_replicates:
        # One offset vector shared by all replicates.
        # NOTE(review): stacker() is fed .keys() here, i.e. the
        # (plate, repindex) tuples rather than the plate arrays — looks like
        # .values() was intended; verify against the stacker implementation.
        all_plates = stacker(self.normalization_plate_values.keys())
        controls = (stacker([self.normalization_control_maps[pl]
                             for pl, _ in self.normalization_plate_values.keys()])
                    != CONTROL_POPULATION)
        all_plates[controls] = np.nan  # exclude non-population wells from the median
        # use conservative_nanmedian to avoid taking median of too few values
        offsets = fix_nans(conservative_nanmedian(all_plates, axis))
        if offsets is None:
            # too many NaNs to use conservative_nanmedian, try again.
            offsets = fix_nans(nanmedian(all_plates, axis))
        assert offsets is not None, "Too many bad values to correct row/column"
        offsets = offsets.reshape(endshape)
        # shift offsets to zero-median to keep things identifiable
        offsets -= np.median(offsets)
        history += offsets
        return dict(((plate, repindex), values - offsets)
                    for ((plate, repindex), values)
                    in self.normalization_plate_values.iteritems())
    else:
        # Separate offset vector per replicate index.
        offsets = {}
        for repindex in range(self.num_replicates):
            rep_plates = stacker([v for (_, rep), v
                                  in self.normalization_plate_values.iteritems()
                                  if repindex == rep])
            controls = (stacker([self.normalization_control_maps[pl]
                                 for pl, rep in self.normalization_plate_values.keys()
                                 if repindex == rep])
                        != CONTROL_POPULATION)
            rep_plates[controls] = np.nan  # exclude non-population wells
            # use conservative_nanmedian to avoid taking median of too few values
            offsets[repindex] = fix_nans(conservative_nanmedian(rep_plates, axis))
            if offsets[repindex] is None:
                # too many NaNs to use conservative_nanmedian, try again.
                offsets[repindex] = fix_nans(nanmedian(rep_plates, axis))
            assert offsets[repindex] is not None, "Too many bad values to correct row/column"
            offsets[repindex] = offsets[repindex].reshape(endshape)
            # shift offsets to zero-median to keep things identifiable
            offsets[repindex] -= np.median(offsets[repindex])
            history[repindex] += offsets[repindex]
        return dict(((plate, repindex), values - offsets[repindex])
                    for ((plate, repindex), values)
                    in self.normalization_plate_values.iteritems())
def robust_median(v, m=2.):
    # Robust outlier rejection based on distance to the median: any value
    # whose |value - median| / median ratio is >= m is set to NaN.
    # Returns the filtered data and the ratio array.
    # NOTE(review): Python 2 code (print statement); stats.nanmedian was
    # removed in scipy 1.0.
    print "Robust Method: \t\t\t Median-Distanz-Test"
    median = stats.nanmedian(v)
    # absolute distance of the measurements to the median
    dist_median = numpy.absolute(v - stats.nanmedian(v))
    # ratio between distance and median; the larger this ratio, the more
    # likely the measurement is an outlier
    ratio_dist_median = dist_median/median
    # set all outliers to NaN
    # NOTE(review): when the median is falsy (0 or NaN-free zero data) v is
    # replaced by the scalar 0, discarding the whole array — presumably a
    # guard against division artefacts; confirm this is intended.
    v=numpy.where(ratio_dist_median>=m,numpy.nan,v) if median else 0
    return v,ratio_dist_median
def return_speedup_stats(x, y):
    """Return a dict of speedup ratios comparing sample *x* against *y*.

    Parameters
    ----------
    x, y : array-like
        Timing (or score) samples; NaNs are ignored by the mean/median/std
        ratios but not by the min/max ratios.

    Returns
    -------
    dict
        Ratios of means, medians, standard deviations, max(x)/min(y) and
        min(x)/max(y).
    """
    # np.nanmean/np.nanmedian/np.nanstd replace the scipy.stats helpers
    # removed in scipy 1.0; ddof=1 preserves scipy.stats.nanstd's
    # bias-corrected (n-1) default.
    speedup_stats = {
        'ratio_of_the_means': np.nanmean(x) / np.nanmean(y),
        'ratio_of_the_medians': np.nanmedian(x) / np.nanmedian(y),
        'ratio_of_the_stddevs': np.nanstd(x, ddof=1) / np.nanstd(y, ddof=1),
        'ratio_max_to_min': np.amax(x) / np.amin(y),
        'ratio_min_to_max': np.amin(x) / np.amax(y),
    }
    return speedup_stats
def removeoutliers(inarray, stdcut=3.0):
    """Bonehead outlier cut: replace points further than *stdcut* sigma
    from the median with the value of the nearest non-outlier sample.

    Non-finite entries are first set to 0 (in place — the input array is
    modified and is also the returned array).

    Parameters
    ----------
    inarray : numpy.ndarray
        1-D data; modified in place.
    stdcut : float
        Sigma threshold defining outliers.

    Returns
    -------
    numpy.ndarray
        The (aliased) cleaned array.
    """
    # First mark the bad (non-finite) numbers.
    inarray[np.logical_not(np.isfinite(inarray))] = 0.
    indexarray = np.arange(len(inarray))
    # Hoist the median/std out of the comparisons (they were computed twice).
    med = nanmedian(inarray)
    sig = nanstd(inarray)
    badi = indexarray[np.abs(inarray - med) > stdcut * sig]
    goodi = indexarray[np.abs(inarray - med) <= stdcut * sig]
    outarray = inarray
    for i in badi:
        # Fix: map the argmin position back through goodi.  The original
        # used the position within goodi directly as an index into inarray,
        # which picked an unrelated element.
        outarray[i] = inarray[goodi[np.abs(goodi - i).argmin()]]
    return outarray
def normalization_shift_rows_or_cols(self, axis, stacker, history):
    # Remove per-row or per-column offsets from every plate by subtracting
    # the median along *axis*, computed over non-control wells.  *stacker*
    # combines per-plate arrays into one stack; *history* accumulates the
    # applied offsets.  Returns {(plate, repindex): corrected values}.
    # NOTE(review): Python 2 code (dict.iteritems).  CONTROL_POPULATION,
    # fix_nans, conservative_nanmedian and nanmedian are module-level names
    # defined elsewhere.
    endshape = [-1, -1]
    endshape[axis] = 1  # reshape target: length-1 on the non-corrected axis
    if self.combine_replicates:
        # One offset vector shared by all replicates.
        # NOTE(review): stacker() receives .keys() — the (plate, repindex)
        # tuples, not the plate arrays; .values() looks intended — verify.
        all_plates = stacker(self.normalization_plate_values.keys())
        controls = (stacker([
            self.normalization_control_maps[pl]
            for pl, _ in self.normalization_plate_values.keys()
        ]) != CONTROL_POPULATION)
        all_plates[controls] = np.nan  # exclude non-population wells
        # use conservative_nanmedian to avoid taking median of too few values
        offsets = fix_nans(conservative_nanmedian(all_plates, axis))
        if offsets is None:
            # too many NaNs to use conservative_nanmedian, try again.
            offsets = fix_nans(nanmedian(all_plates, axis))
        assert offsets is not None, "Too many bad values to correct row/column"
        offsets = offsets.reshape(endshape)
        # shift offsets to zero-median to keep things identifiable
        offsets -= np.median(offsets)
        history += offsets
        return dict(
            ((plate, repindex), values - offsets)
            for ((plate, repindex),
                 values) in self.normalization_plate_values.iteritems())
    else:
        # Separate offset vector per replicate index.
        offsets = {}
        for repindex in range(self.num_replicates):
            rep_plates = stacker([
                v for (
                    _,
                    rep), v in self.normalization_plate_values.iteritems()
                if repindex == rep
            ])
            controls = (stacker([
                self.normalization_control_maps[pl]
                for pl, rep in self.normalization_plate_values.keys()
                if repindex == rep
            ]) != CONTROL_POPULATION)
            rep_plates[controls] = np.nan  # exclude non-population wells
            # use conservative_nanmedian to avoid taking median of too few values
            offsets[repindex] = fix_nans(
                conservative_nanmedian(rep_plates, axis))
            if offsets[repindex] is None:
                # too many NaNs to use conservative_nanmedian, try again.
                offsets[repindex] = fix_nans(nanmedian(rep_plates, axis))
            assert offsets[
                repindex] is not None, "Too many bad values to correct row/column"
            offsets[repindex] = offsets[repindex].reshape(endshape)
            # shift offsets to zero-median to keep things identifiable
            offsets[repindex] -= np.median(offsets[repindex])
            history[repindex] += offsets[repindex]
        return dict(
            ((plate, repindex), values - offsets[repindex])
            for ((plate, repindex),
                 values) in self.normalization_plate_values.iteritems())
def normalization_align_plates(self):
    # Align plates by subtracting a per-(plate, replicate) offset: the
    # median of either the control-population wells (ALIGN_POPULATION) or
    # all wells (ALIGN_EVERYTHING).  Offsets are re-centred to zero median
    # (globally, or per replicate) so the overall scale stays identifiable.
    # Returns {(plate, repindex): shifted values}.
    # NOTE(review): Python 2 code (dict.iteritems, dict.values() passed to
    # np.array); nanmedian comes from elsewhere in the module.
    offsets = {}
    for (plate, repindex), values in self.normalization_plate_values.iteritems():
        control_map = self.normalization_control_maps[plate]
        if self.alignment_method == ALIGN_POPULATION:
            align_values = values[control_map == CONTROL_POPULATION]
        elif self.alignment_method == ALIGN_EVERYTHING:
            align_values = values
        else:
            assert False, "Unknown normalization method: %s" % (
                self.alignment_method)
        # XXX - should not shift plates that are more than half filled by controls
        # compute an offset per-plate and per-replicate
        if len(align_values) > 0:
            offsets[plate, repindex] = np.median(align_values)
        else:
            offsets[plate, repindex] = np.nan  # no wells to align on
    # shift offsets to zero-median to keep things identifiable
    if self.combine_replicates:
        # keep overall shift at 0
        global_shift = nanmedian(offsets.values())
        for plate, repindex in offsets:
            if np.isnan(offsets[plate, repindex]):
                offsets[plate, repindex] = 0.0  # leave unalignable plates as-is
            else:
                offsets[plate, repindex] -= global_shift
            self.normalization_total_plate_shifts[
                plate, repindex] += offsets[plate, repindex]
        return dict(
            ((plate, repindex), values - offsets[plate, repindex])
            for ((plate, repindex),
                 values) in self.normalization_plate_values.iteritems())
    else:
        # Re-centre offsets separately within each replicate index.
        replicate_indices = np.array([repindex for _, repindex in offsets])
        offset_vals = np.array(offsets.values())
        per_replicate_shifts = dict(
            (repindex, nanmedian(offset_vals[replicate_indices == repindex]))
            for repindex in range(self.num_replicates))
        for plate, repindex in offsets:
            if np.isnan(offsets[plate, repindex]):
                offsets[plate, repindex] = 0.0  # leave unalignable plates as-is
            else:
                offsets[plate, repindex] -= per_replicate_shifts[repindex]
            self.normalization_total_plate_shifts[
                plate, repindex] += offsets[plate, repindex]
        return dict(
            ((plate, repindex), values - offsets[plate, repindex])
            for ((plate, repindex),
                 values) in self.normalization_plate_values.iteritems())
def make_plots(self, num_bins=50):
    """Histogram the filament widths, lengths and RHT curvature stored in
    self.dataframe; optionally show the figures (self.verbose) and/or save
    them as PDFs (self.save, using self.save_name as prefix).

    Parameters
    ----------
    num_bins : int
        Number of histogram bins.

    Returns
    -------
    self
    """
    import matplotlib.pyplot as p

    # Histogram of Widths: keep only entries parseable as floats
    # (is_float_try helper from the module).
    widths = [float(x) for x in self.dataframe["Widths"] if is_float_try(x)]
    widths_stats = [nanmean(widths), nanstd(widths), nanmedian(widths)]

    # Histogram of Lengths
    lengths = self.dataframe["Lengths"]
    lengths_stats = [nanmean(lengths), nanstd(lengths), nanmedian(lengths)]

    # Histogram of Curvature
    rht_curvature = self.dataframe["RHT Curvature"]
    rht_curvature_stats = [nanmean(rht_curvature), nanstd(rht_curvature),
                           nanmedian(rht_curvature)]

    if self.verbose:
        # Parenthesized so these work as both a Python 2 print statement
        # and a Python 3 function call.
        print("Widths Stats: %s" % (widths_stats))
        print("Lengths Stats: %s" % (lengths_stats))
        print("Curvature Stats: %s" % (rht_curvature_stats))

        p.subplot(131)
        p.hist(widths, num_bins)
        p.xlabel("Widths (pc)")
        p.subplot(132)
        p.hist(lengths, num_bins)
        p.xlabel("Lengths (pc)")
        p.subplot(133)
        # Fix: the original passed the undefined name 'curvature' here,
        # raising NameError; the RHT curvature column was intended.
        p.hist(rht_curvature, num_bins)
        p.xlabel("Curvature")
        p.show()

    if self.save:
        p.hist(widths, num_bins)
        p.xlabel("Widths (pc)")
        p.savefig("".join([self.save_name, "_widths.pdf"]))
        p.close()

        p.hist(lengths, num_bins)
        p.xlabel("Lengths (pc)")
        p.savefig("".join([self.save_name, "_lengths.pdf"]))
        p.close()

        p.hist(rht_curvature, num_bins)
        p.xlabel("RHT Curvature")
        p.savefig("".join([self.save_name, "_rht_curvature.pdf"]))
        p.close()

    return self
def median_f(self, x):
    """Compute the NaN-aware median over the time varying axis (axis 1)
    of a front relative quantity, x.

    np.nanmedian replaces scipy.stats.nanmedian, removed in scipy 1.0.
    """
    # TODO: the axis used in nanmean is different for U and Uf
    # calcs - change Uf dims to make consistent?
    return np.nanmedian(x, axis=1)
def calc_stats(a, maskzero=False):
    """Calculate basic statistics of array *a*.

    Parameters
    ----------
    a : array-like
        Input data; NaNs are ignored by the statistics.
    maskzero : bool
        If True, treat exact zeros as blanked (NaN) pixels.

    Returns
    -------
    dict
        Keys: npix, stdev, mean, median, max, min, centmax, madfm, success.
        When fewer than 2 valid pixels exist, all statistics are zeroed and
        'success' is False.
    """
    statsDict = {}
    a = np.array(a)
    if maskzero:
        a = np.where(np.equal(a, 0.0), np.nan, a)
    # Check that array is not all NaNs
    statsDict['npix'] = int(np.sum(np.where(np.isnan(a), 0.0, 1.0)))
    if statsDict['npix'] >= 2:
        statsDict['stdev'] = float(stats.nanstd(a.flatten()))
        statsDict['mean'] = float(stats.nanmean(a.flatten()))
        statsDict['median'] = float(stats.nanmedian(a.flatten()))
        statsDict['max'] = float(np.nanmax(a))
        statsDict['min'] = float(np.nanmin(a))
        statsDict['centmax'] = list(np.unravel_index(np.nanargmax(a),
                                                     a.shape))
        statsDict['madfm'] = float(MAD(a.flatten()))
        statsDict['npix'] = int(np.sum(np.where(np.isnan(a), 0.0, 1.0)))
        statsDict['success'] = True
    else:
        # Fix: the original wrote "statsDict['npix'] == 0", a no-op
        # comparison instead of an assignment.
        statsDict['npix'] = 0
        statsDict['stdev'] = 0.0
        statsDict['mean'] = 0.0
        statsDict['median'] = 0.0
        statsDict['max'] = 0.0
        statsDict['min'] = 0.0
        statsDict['centmax'] = (0.0, 0.0)
        statsDict['madfm'] = 0.0
        statsDict['success'] = False
    return statsDict
def ComputeDensityChange(hpImage, lpImage, ws, ov):
    # Compute a voxel-wise density-change map between a high-pressure and a
    # low-pressure 3-D image, using overlapping blocks of size *ws* with
    # overlap ratio *ov*.  Each block contributes the median of its
    # (LP - HP) difference to every voxel it covers; the final map is the
    # per-voxel average over all covering blocks.
    # NOTE(review): Python 2 code (bare print statement inside the loop).
    SizeZVol = hpImage.shape[0]
    SizeXVol = hpImage.shape[1]
    SizeYVol = hpImage.shape[2]
    # Sigmoid intensity remapping of both images into the range [2, 256].
    hpImage = (254.0)/(1.0+np.exp(-(hpImage-200.0)/-100.0))+2.0
    lpImage = (254.0)/(1.0+np.exp(-(lpImage-200.0)/-100.0))+2.0
    # Step between block origins; larger overlap -> smaller step.
    BlocSpeed = int((1-ov) * ws)
    if BlocSpeed == 0 :
        print("The overlap ration is to big the bloc matching speed is set to 1 pixel")
        BlocSpeed = 1
    densityMap = np.zeros((SizeZVol,SizeXVol,SizeYVol))
    iterationMap = np.zeros((SizeZVol,SizeXVol,SizeYVol))  # per-voxel block count
    'Ready For Loop'
    for xRef in np.arange(0,SizeXVol -1 ,BlocSpeed):
        for yRef in np.arange(0,SizeYVol -1,BlocSpeed):
            for zRef in np.arange(0,SizeZVol -1,BlocSpeed):
                print xRef, yRef, zRef
                # Block bounds centred on (xRef, yRef, zRef), clipped to the volume.
                xMinRef = xRef-int(ws/2)
                yMinRef = yRef-int(ws/2)
                zMinRef = zRef-int(ws/2)
                xMaxRef = xRef+int(ws/2)
                yMaxRef = yRef+int(ws/2)
                zMaxRef = zRef+int(ws/2)
                if xMinRef < 0 :
                    xMinRef = 0
                if yMinRef < 0 :
                    yMinRef = 0
                if zMinRef < 0 :
                    zMinRef = 0
                if xMaxRef >= SizeXVol :
                    xMaxRef = SizeXVol - 1
                if yMaxRef >= SizeYVol :
                    yMaxRef = SizeYVol - 1
                if zMaxRef >= SizeZVol :
                    zMaxRef = SizeZVol - 1
                # +1.0 offsets both blocks equally; the difference is unchanged.
                BlocRefHP = hpImage[zMinRef:zMaxRef,xMinRef:xMaxRef,yMinRef:yMaxRef]+1.0
                BlocRefLP = lpImage[zMinRef:zMaxRef,xMinRef:xMaxRef,yMinRef:yMaxRef]+1.0
                BlocRef = (BlocRefLP - BlocRefHP)
                # Accumulate the block's scalar median difference over its voxels.
                # NOTE(review): stats.nanmedian was removed in scipy 1.0.
                densityMap[zMinRef:zMaxRef,xMinRef:xMaxRef,yMinRef:yMaxRef] += stats.nanmedian(BlocRef,axis=None)
                iterationMap[zMinRef:zMaxRef,xMinRef:xMaxRef,yMinRef:yMaxRef] += 1
    # Average the accumulated medians; voxels never covered yield 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        c = np.true_divide(densityMap,iterationMap)
        c[c == np.inf] = 0
        c = np.nan_to_num(c)
    return c
def get_munged_clean_data(self):
    # Build train/test design matrices from self.datamart: extracts the
    # 'Target' column as Y, appends NaN-indicator columns to X, imputes
    # remaining NaNs with per-column training medians, and drops
    # zero-variance ("boring") columns.  Returns (X_train, Y_train, X_test).
    # NOTE(review): Python 2 code (xrange); stats.nanmedian was removed in
    # scipy 1.0.
    train_df = self.datamart['train']
    Y_train_df = train_df['Target']
    Y_train = np.array(Y_train_df)
    del train_df['Target']  # drop the label from the features
    test_df = self.datamart['test']
    assert np.all(train_df.columns == test_df.columns)
    X_train = np.array(train_df)
    del train_df
    X_test = np.array(test_df)
    del test_df
    # Missingness indicators become extra feature columns.
    X_train_nan = np.isnan(X_train)
    X_test_nan = np.isnan(X_test)
    X_train = np.hstack((X_train,X_train_nan))
    X_test = np.hstack((X_test,X_test_nan))
    # Impute NaNs with the *training* column medians (also for the test set,
    # to avoid leaking test statistics).
    X_train_median = stats.nanmedian(X_train,axis=0)
    for i in xrange(X_train.shape[1]):
        X_train[np.isnan(X_train[:,i]),i] = X_train_median[i]
    for i in xrange(X_test.shape[1]):
        X_test[np.isnan(X_test[:,i]),i] = X_train_median[i]
    # Drop constant columns (zero standard deviation on the training set).
    keep_not_boring = X_train.std(axis=0) > 0.0
    X_train = X_train[:,keep_not_boring]
    X_test = X_test[:,keep_not_boring]
    return X_train, Y_train, X_test
def calc_stats_old(a, maskzero=False):
    """Calculate the statistics of an array.

    Parameters
    ----------
    a : array-like
        Input data; NaNs are ignored by the statistics.
    maskzero : bool
        If True, treat exact zeros as blanked (NaN) pixels.

    Returns
    -------
    dict
        Keys: npix, stdev, mean, median, max, min, centmax, madfm, success.
        When fewer than 2 valid pixels exist, the statistics are zeroed and
        'success' is False.
    """
    statsDict = {}
    a = np.array(a)
    if maskzero:
        a = np.where( np.equal(a, 0.0), np.nan, a)
    # Check that array is not all NaNs
    statsDict['npix'] = int(np.sum(np.where(np.isnan(a),0.0,1.0)))
    if statsDict['npix']>=2:
        statsDict['stdev'] = float(stats.nanstd(a.flatten()))
        statsDict['mean'] = float(stats.nanmean(a.flatten()))
        statsDict['median'] = float(stats.nanmedian(a.flatten()))
        statsDict['max'] = float(np.nanmax(a))
        statsDict['min'] = float(np.nanmin(a))
        statsDict['centmax'] = list(np.unravel_index(np.nanargmax(a), a.shape))
        statsDict['madfm'] = float(MAD(a.flatten()))
        statsDict['npix'] = int(np.sum(np.where(np.isnan(a),0.0,1.0)))
        statsDict['success'] = True
    else:
        # Fix: the original wrote "statsDict['npix'] == 0", a no-op
        # comparison instead of an assignment.
        statsDict['npix'] = 0
        statsDict['stdev'] = 0.0
        statsDict['mean'] = 0.0
        statsDict['median'] = 0.0
        statsDict['max'] = 0.0
        statsDict['min'] = 0.0
        statsDict['centmax'] = (0.0, 0.0)
        statsDict['madfm'] = 0.0
        statsDict['success'] = False
    return statsDict
def make_oiiewbins(zmin=_zmin, zmax=_zmax):
    """Define [OII] equivalent-width and luminosity bin edges (four schemes
    with 2, 3, 4 and 5 bins, packed into one 14-element array each) and
    compute the median EW / log-luminosity within each bin for galaxies
    with good redshifts in (zmin, zmax).

    Returns (oiiewmin, oiiewmax, oiiewbin, oiilummin, oiilummax, oiilumbin).
    """
    # Total slots: 2-bin + 3-bin + 4-bin + 5-bin schemes packed end-to-end.
    nbin = 2 + 3 + 4 + 5
    oiiewmin = np.zeros(nbin)
    oiiewmax = np.zeros(nbin)
    oiiewbin = np.zeros(nbin)
    # Equivalent-width edges for each scheme (slots 0:2, 2:5, 5:9, 9:14).
    oiiewmin[0:2] = [_EPS, 50.0]
    oiiewmax[0:2] = [50.0, 200.]
    oiiewmin[2:2 + 3] = [_EPS, 40.0, 70.0]
    oiiewmax[2:2 + 3] = [40.0, 70.0, 200.]
    oiiewmin[5:5 + 4] = [_EPS, 30.0, 50.0, 80.0]
    oiiewmax[5:5 + 4] = [30.0, 50.0, 80.0, 200.]
    oiiewmin[9:9 + 5] = [_EPS, 25.0, 45.0, 60.0, 90.0]
    oiiewmax[9:9 + 5] = [25.0, 45.0, 60.0, 90.0, 200.]

    oiilummin = np.zeros(nbin)
    oiilummax = np.zeros(nbin)
    oiilumbin = np.zeros(nbin)
    # log10 luminosity edges for the same four schemes.
    oiilummin[0:2] = [40.0, 41.6]
    oiilummax[0:2] = [41.6, 43.5]
    oiilummin[2:2 + 3] = [40.0, 41.4, 41.8]
    oiilummax[2:2 + 3] = [41.4, 41.8, 43.5]
    oiilummin[5:5 + 4] = [40.0, 41.3, 41.6, 41.9]
    oiilummax[5:5 + 4] = [41.3, 41.6, 41.9, 43.5]
    oiilummin[9:9 + 5] = [40.0, 41.2, 41.5, 41.7, 42.0]
    oiilummax[9:9 + 5] = [41.2, 41.5, 41.7, 42.0, 43.5]

    # Calculate the medians
    objs_ori = elg_readin()
    vac_objs = elg_readin(vac=True)  # value-added catalog with OII measurements
    nobj = objs_ori.size
    # Select galaxies with a good redshift inside (zmin, zmax).
    zindex = (np.where(
        np.logical_and(
            np.logical_and(
                np.logical_and(objs_ori['zGOOD'] == 1, objs_ori['Z'] > zmin),
                objs_ori['Z'] < zmax), objs_ori['CLASS'] == 'GALAXY')))[0]
    oiiew = vac_objs['OIIEW'][zindex]
    logoiilum = np.log10(vac_objs['OIILUM'][zindex])
    # Median within each bin (nan-aware).
    for i in np.arange(nbin):
        oiiewbin[i] = nanmedian(oiiew[((oiiew > oiiewmin[i]) & (oiiew < oiiewmax[i]))])
        oiilumbin[i] = nanmedian(logoiilum[((logoiilum > oiilummin[i]) & (logoiilum < oiilummax[i]))])
    return (oiiewmin, oiiewmax, oiiewbin, oiilummin, oiilummax, oiilumbin)
def calc_clipped_stats_old(data, clip=3.0, nIter=10):
    """Calculate the mean and stdev of an array given a sigma clip.

    Iteratively blanks values outside mean +/- clip*MAD and recomputes the
    statistics until they converge or nIter iterations have run.

    Parameters
    ----------
    data : array-like
        Input data (flattened; NaNs ignored).
    clip : float
        Clip level in units of the MAD; <= 0 disables clipping.
    nIter : int
        Maximum number of clipping iterations.

    Returns
    -------
    dict
        mean, median, stdev, madfm, npix, max, min and a 'success' flag
        (False when no valid pixels remain).
    """
    data = np.array(data).flatten()
    mean = float(stats.nanmean(data))
    std = float(stats.nanstd(data))
    mad = float(MAD(data))
    # Fix: defaults for the clip <= 0 path -- the original left 'median'
    # and 'npix' undefined there, raising NameError below.
    median = float(stats.nanmedian(data))
    npix = np.sum(np.where(np.isnan(data), 0.0, 1.0))
    if clip > 0.0:
        convergeFlg = 0
        itCnt = 0
        while convergeFlg == 0 and itCnt < nIter:
            meanOld, madOld = mean, mad
            minVal = mean - (clip * mad)
            maxVal = mean + (clip * mad)
            # Blank values outside the clip range
            dataMsk = np.where(np.greater(data, maxVal), np.nan, data)
            dataMsk = np.where(np.less(data, minVal), np.nan, dataMsk)
            # Measure the statistics
            mean = stats.nanmean(dataMsk)
            median = stats.nanmedian(dataMsk)
            std = stats.nanstd(dataMsk)
            mad = MAD(dataMsk)
            npix = np.sum(np.where(np.isnan(dataMsk), 0.0, 1.0))
            dataMsk = []
            # Fix: the original set the misspelled 'convergFlg', so
            # convergence was never detected and the loop always ran the
            # full nIter iterations.
            if mean == meanOld and mad == madOld:
                convergeFlg = 1
            itCnt += 1
    # Assemble the measurements into a dictionary
    m = {}
    m['mean'] = float(mean)
    m['median'] = float(median)
    m['stdev'] = float(std)
    m['madfm'] = float(mad)
    m['npix'] = int(npix)
    m['max'] = float(np.nanmax(data))
    m['min'] = float(np.nanmin(data))
    del data
    # If all nans
    if m['npix'] == 0:
        m['stdev'] = 0.0
        m['mean'] = 0.0
        m['median'] = 0.0
        m['max'] = 0.0
        m['min'] = 0.0
        m['centmax'] = (0.0, 0.0)
        m['madfm'] = 0.0
        m['success'] = False
    else:
        m['success'] = True
    return m
def summary(self):
    """ return summary statistics for the dataset """
    # Builds a multi-line report string: reaction-time statistics (overall
    # and per condition), GO-trial error and miss rates, and the SSD
    # (stop-signal delay) distribution.  GO trials are those with NaN SSD.
    # NOTE(review): scipy removed stats.nanmean/nanmedian (1.0) and
    # stats.itemfreq (1.3); this code targets an older scipy.
    r="RT (all): Min=%.2f, Max=%.2f, Mean=%.2f, Median=%.2f\n"%(np.nanmin(self.RT), np.nanmax(self.RT), stats.nanmean(self.RT), stats.nanmedian(self.RT))
    # Per-condition RT statistics.
    for cond in range(self.design.nconditions()):
        r+="RT ({cond}): Min={minrt}, Max={maxrt}, Mean={meanrt}, Median={medrt}\n".format(
            cond=":".join(self.design.condidx(cond)),
            minrt=np.nanmin(self.RT[self.condition==cond]),
            maxrt=np.nanmax(self.RT[self.condition==cond]),
            meanrt=stats.nanmean(self.RT[self.condition==cond]),
            medrt=stats.nanmedian(self.RT[self.condition==cond]))
    # Error rate on all GO trials (incorrect responses among NaN-SSD trials).
    r+='errors (all GO): {nerr}/{ntrials} ({errperc:.2f} %)\n'.format(
        nerr=np.sum(np.logical_not(self.correct[np.isnan(self.SSD)])),
        ntrials=len(self.correct[np.isnan(self.SSD)]),
        errperc=np.sum(np.logical_not(self.correct[np.isnan(self.SSD)]))/float(len(self.correct[np.isnan(self.SSD)])))
    # Per-condition GO error rates.
    for cond in range(self.design.nconditions()):
        r+='errors ({cond}): {nerr}/{ntrials} ({errperc:.2f} %)\n'.format(
            cond=":".join(self.design.condidx(cond)),
            nerr=np.sum(np.logical_not(self.correct[(self.condition==cond) & np.isnan(self.SSD)])),
            ntrials=len(self.correct[(self.condition==cond) & np.isnan(self.SSD)]),
            errperc=np.sum(np.logical_not(self.correct[(self.condition==cond) & np.isnan(self.SSD)])) /float(len(self.correct[(self.condition==cond) & np.isnan(self.SSD)])))
    # Missed GO trials (no response: NaN RT on a NaN-SSD trial).
    r+='miss GO (all): {nmiss}/{ntrials} ({missperc:.2f} %)\n'.format(
        nmiss=np.sum(np.isnan(self.RT[np.isnan(self.SSD)])),
        ntrials=self.ntrials,
        missperc=100.*np.sum(np.isnan(self.RT[np.isnan(self.SSD)]))/float(self.ntrials) )
    # Per-condition miss rates (percentages relative to the full trial count).
    for cond in range(self.design.nconditions()):
        r+="miss GO ({cond}): {nmiss}/{ntrials} ({missperc:.2f} %)\n".format(
            cond=":".join(self.design.condidx(cond)),
            ntrials=len(self.RT[self.condition==cond]),
            missperc=100.*np.sum(np.isnan(self.RT[(self.condition==cond) & np.isnan(self.SSD)]))/float(self.ntrials),
            nmiss=np.sum(np.isnan(self.RT[(self.condition==cond) & (np.isnan(self.SSD))])))
    # Frequency table of the finite stop-signal delays.
    r+="SSD-distribution\n"
    a=stats.itemfreq(self.SSD[np.isfinite(self.SSD)])#.astype(np.int)
    r+= " NUM | "+" ".join(["%7i"%int(i) for i in (a[:,1])]) + "\n"
    r+= " SSD | "+" ".join(["%7.2f"%(i) for i in (a[:,0])]) +"\n"
    return r
def buildtable(self):
    """ builds the table of stars """
    # Builds an (epochs x stars) magnitude matrix for the NGC 2281 field:
    # loads a cached .npy array if present, otherwise queries the 'wifsip'
    # database per objid, applies the per-frame calibration from FrameCal,
    # and caches the result.  Then zero entries are blanked to NaN and each
    # epoch is shifted by the median offset from the per-star means before
    # printing the per-star mean and scatter.
    # NOTE(review): Python 2 code (print statements); stats.nanmean/
    # nanmedian/nanstd were removed in scipy 1.0.  DataSource credentials
    # are masked in this copy ('******').
    import numpy as np
    epochs = len(self.objids)
    stars = len(self.stars)
    if fileexists('/work2/jwe/NGC2281/' + self.filter + 'array.npy'):
        # Cached matrix from a previous run.
        m = np.load('/work2/jwe/NGC2281/' + self.filter + 'array.npy')
    else:
        from datasource import DataSource
        from framecal import FrameCal
        fc = FrameCal(self.filter)
        m = np.zeros([epochs, stars])
        # objid is specific to a filter so we only need to query the objid
        wifsip = DataSource(host='pina', database='wifsip', user='******')
        for objid in self.objids:
            k = self.objids.index(objid)
            print k, epochs, objid,
            query = """SELECT matched.id, phot.mag_auto, phot.mag_errauto FROM phot, matched WHERE phot.objid like '%s' AND (matched.objid,matched.star) = (phot.objid,phot.star) AND phot.flags = 0;""" % objid
            result = wifsip.query(query)
            starids = [s[0] for s in result]
            mags = [s[1] for s in result]
            err = [s[2] for s in result]
            # Linear frame calibration for this epoch.
            slope, intercept, _, _, _ = fc.calframe(objid)
            print len(mags)
            for starid in starids:
                i = self.stars.index(starid)
                m[k, i] = mags[starids.index(starid)] * slope + intercept
        np.save('/work2/jwe/NGC2281/' + self.filter + 'array.npy', m)
        wifsip.close()
    # Unobserved entries were left at 0.0; blank them.
    i = np.where(m == 0.0)
    m[i] = np.nan
    from scipy import stats
    # calculate the observed average for the stars
    avg = stats.nanmean(m, axis=0)
    for k in range(epochs):
        print k, epochs, self.objids[k]
        # calculate the mean of offsets
        off = stats.nanmedian(m[k, :] - avg)
        # correct epoch for mean of offsets
        m[k, :] += off
    # calculate new corrected means
    avg = stats.nanmean(m, axis=0)
    std = stats.nanstd(m, axis=0)
    for i in range(len(self.stars)):
        print self.stars[i], avg[i], std[i]
def calc_clipped_stats_old(data, clip=3.0, nIter=10):
    """Calculate sigma-clipped statistics of an array.

    Iteratively blanks values outside mean +/- clip*MAD and recomputes the
    statistics until they converge or nIter iterations have run.

    Parameters
    ----------
    data : array-like
        Input data (flattened; NaNs ignored).
    clip : float
        Clip level in units of the MAD; <= 0 disables clipping.
    nIter : int
        Maximum number of clipping iterations.

    Returns
    -------
    dict
        mean, median, stdev, madfm, npix, max, min and a 'success' flag
        (False when no valid pixels remain).
    """
    data = np.array(data).flatten()
    mean = float(stats.nanmean(data))
    std = float(stats.nanstd(data))
    mad = float(MAD(data))
    # Fix: defaults for the clip <= 0 path -- the original left 'median'
    # and 'npix' undefined there, raising NameError below.
    median = float(stats.nanmedian(data))
    npix = np.sum(np.where(np.isnan(data), 0.0, 1.0))
    if clip > 0.0:
        convergeFlg = 0
        itCnt = 0
        while convergeFlg == 0 and itCnt < nIter:
            meanOld, madOld = mean, mad
            minVal = mean - (clip * mad)
            maxVal = mean + (clip * mad)
            # Blank values outside the clip range
            dataMsk = np.where(np.greater(data, maxVal), np.nan, data)
            dataMsk = np.where(np.less(data, minVal), np.nan, dataMsk)
            # Measure the statistics
            mean = stats.nanmean(dataMsk)
            median = stats.nanmedian(dataMsk)
            std = stats.nanstd(dataMsk)
            mad = MAD(dataMsk)
            npix = np.sum(np.where(np.isnan(dataMsk), 0.0, 1.0))
            dataMsk = []
            # Fix: the original set the misspelled 'convergFlg', so the
            # loop never detected convergence and always ran nIter times.
            if mean == meanOld and mad == madOld:
                convergeFlg = 1
            itCnt += 1
    # Assemble the measurements into a dictionary
    m = {}
    m['mean'] = float(mean)
    m['median'] = float(median)
    m['stdev'] = float(std)
    m['madfm'] = float(mad)
    m['npix'] = int(npix)
    m['max'] = float(np.nanmax(data))
    m['min'] = float(np.nanmin(data))
    del data
    # If all nans
    if m['npix'] == 0:
        m['stdev'] = 0.0
        m['mean'] = 0.0
        m['median'] = 0.0
        m['max'] = 0.0
        m['min'] = 0.0
        m['centmax'] = (0.0, 0.0)
        m['madfm'] = 0.0
        m['success'] = False
    else:
        m['success'] = True
    return m
def mgii_composite_jackknife():
    # Build jackknife composite spectra around the Mg II 2800 region:
    # masks known line windows out of the inverse variance, fits a cubic
    # continuum (in log-wavelength) per spectrum over the unmasked pixels,
    # normalizes by it, then computes leave-one-out mean and median
    # composites over the spectra selected by the module-level _mgii_index.
    # Returns (outwave, fluxmean, fluxmedian, fluxused).
    (master_wave, rest_allflux, rest_allivar) = rest_allspec_readin()
    master_loglam = np.log10(master_wave)

    # mask out useless wavelength ranges (set ivar to 0 so the continuum
    # fit ignores these pixels)
    # left 2300
    wave_pos = np.array([2200.])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[0:rest_loc[0],:] = 0.
    # Fe II 2350
    wave_pos = np.array([2330., 2420])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[rest_loc[0]:rest_loc[1],:] = 0.
    # Fe II 2600
    wave_pos = np.array([2570., 2640])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[rest_loc[0]:rest_loc[1],:] = 0.
    # Mg II 2800
    wave_pos = np.array([2770., 2820])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[rest_loc[0]:rest_loc[1],:] = 0.
    # Mg I 2853
    wave_pos = np.array([2843., 2863])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[rest_loc[0]:rest_loc[1],:] = 0.
    # right 2900
    wave_pos = np.array([2900.])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[rest_loc[0]:,:] = 0.

    # NOTE(review): this is an alias, not a copy -- the loop below also
    # overwrites rest_allflux in place; confirm that is intended.
    normalized_rest_allflux = rest_allflux
    for i in np.arange((rest_allflux.shape)[1]):
        # Pixels with positive inverse variance participate in the fit.
        imask = (np.where(rest_allivar[:,i]>0.))[0]
        if imask.size>0:
            x = np.log10(master_wave[imask])
            y = rest_allflux[imask, i]
            z = np.polyfit(x, y, 3)  # cubic continuum in log-lambda
            p = np.poly1d(z)
            continuum = p(master_loglam)
            normalized_rest_allflux[:,i] = rest_allflux[:,i]/continuum

    # Restrict the output to the 2200-2900 A window.
    wave_pos = np.array([2200., 2900.])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    outwave = master_wave[rest_loc[0]:rest_loc[1]]
    #tmp_fluxmean = nanmean(normalized_rest_allflux[rest_loc[0]:rest_loc[1], _mgii_index], 1)
    #tmp_fluxmedian = nanmedian(normalized_rest_allflux[rest_loc[0]:rest_loc[1], _mgii_index], 1)
    fluxused = normalized_rest_allflux[rest_loc[0]:rest_loc[1], _mgii_index]

    # Leave-one-out (jackknife) composites over the _mgii_index spectra.
    njack = len(_mgii_index)
    nwave = outwave.size
    fluxmean = np.zeros((nwave, njack))
    fluxmedian = np.zeros((nwave, njack))
    for ijack in np.arange(njack):
        fluxmean[:,ijack] = nanmean(normalized_rest_allflux[rest_loc[0]:rest_loc[1], np.r_[_mgii_index[:ijack], _mgii_index[ijack+1:]]], 1)
        fluxmedian[:,ijack] = nanmedian(normalized_rest_allflux[rest_loc[0]:rest_loc[1], np.r_[_mgii_index[:ijack], _mgii_index[ijack+1:]]], 1)
    #tmp_fluxmedian = nanmedian(normalized_rest_allflux[rest_loc[0]:rest_loc[1], _mgii_index], 1)

    return (outwave, fluxmean, fluxmedian, fluxused)
def fivenum(v):
    """Return Tukey's five-number summary of *v*, ignoring NaNs.

    Parameters
    ----------
    v : array-like
        Numeric data.

    Returns
    -------
    tuple or None
        (min, Q1, median, Q3, max), NaN-aware; None when the input is not
        numeric.
    """
    v = np.array(v)
    # Validate that the input is numeric; the original printed the message
    # and then crashed anyway on the percentile calls below.
    try:
        np.sum(v)
    except TypeError:
        print('Error: you must provide a list or array of only numbers')
        return None
    # np.percentile over the NaN-stripped data replaces scipy's deprecated
    # scoreatpercentile helper (same linear interpolation).
    finite = v[~np.isnan(v)]
    q1 = np.percentile(finite, 25)
    q3 = np.percentile(finite, 75)
    md = np.nanmedian(v)
    return np.nanmin(v), q1, md, q3, np.nanmax(v)
def aggregate_ftr_matrix(ftr_matrix):
    """Flatten a feature matrix into a signature vector.

    For every feature row, appends its median, mean, standard deviation,
    skew and kurtosis to the output list (5 values per feature).

    Parameters
    ----------
    ftr_matrix : iterable of array-like
        One numeric sequence per feature.

    Returns
    -------
    list
        Concatenated [median, mean, std, skew, kurtosis] per feature.
    """
    sig = []
    for ftr in ftr_matrix:
        # np.nanmedian/np.nanmean/np.nanstd replace the scipy.stats helpers
        # removed in scipy 1.0; ddof=1 keeps scipy.stats.nanstd's
        # bias-corrected default.
        median = np.nanmedian(ftr)
        mean = np.nanmean(ftr)
        std = np.nanstd(ftr, ddof=1)
        # Invalid double scalars warning appears here
        skew = stats.skew(ftr) if any(ftr) else 0.0
        kurtosis = stats.kurtosis(ftr)
        sig.extend([median, mean, std, skew, kurtosis])
    return sig
def normalization_align_plates(self):
    # Align plates by subtracting a per-(plate, replicate) offset: the
    # median of either the control-population wells (ALIGN_POPULATION) or
    # all wells (ALIGN_EVERYTHING).  Offsets are re-centred to zero median
    # (globally, or per replicate) so the overall scale stays identifiable.
    # Returns {(plate, repindex): shifted values}.
    # NOTE(review): Python 2 code (dict.iteritems, dict.values() passed to
    # np.array); nanmedian comes from elsewhere in the module.
    offsets = {}
    for (plate, repindex), values in self.normalization_plate_values.iteritems():
        control_map = self.normalization_control_maps[plate]
        if self.alignment_method == ALIGN_POPULATION:
            align_values = values[control_map == CONTROL_POPULATION]
        elif self.alignment_method == ALIGN_EVERYTHING:
            align_values = values
        else:
            assert False, "Unknown normalization method: %s"%(self.alignment_method)
        # XXX - should not shift plates that are more than half filled by controls
        # compute an offset per-plate and per-replicate
        if len(align_values) > 0:
            offsets[plate, repindex] = np.median(align_values)
        else:
            offsets[plate, repindex] = np.nan  # no wells to align on
    # shift offsets to zero-median to keep things identifiable
    if self.combine_replicates:
        # keep overall shift at 0
        global_shift = nanmedian(offsets.values())
        for plate, repindex in offsets:
            if np.isnan(offsets[plate, repindex]):
                offsets[plate, repindex] = 0.0  # leave unalignable plates as-is
            else:
                offsets[plate, repindex] -= global_shift
            self.normalization_total_plate_shifts[plate, repindex] += offsets[plate, repindex]
        return dict(((plate, repindex), values - offsets[plate, repindex])
                    for ((plate, repindex), values)
                    in self.normalization_plate_values.iteritems())
    else:
        # Re-centre offsets separately within each replicate index.
        replicate_indices = np.array([repindex for _, repindex in offsets])
        offset_vals = np.array(offsets.values())
        per_replicate_shifts = dict((repindex, nanmedian(offset_vals[replicate_indices == repindex]))
                                    for repindex in range(self.num_replicates))
        for plate, repindex in offsets:
            if np.isnan(offsets[plate, repindex]):
                offsets[plate, repindex] = 0.0  # leave unalignable plates as-is
            else:
                offsets[plate, repindex] -= per_replicate_shifts[repindex]
            self.normalization_total_plate_shifts[plate, repindex] += offsets[plate, repindex]
        return dict(((plate, repindex), values - offsets[plate, repindex])
                    for ((plate, repindex), values)
                    in self.normalization_plate_values.iteritems())
def calc_summary_table(minimizers, group_results):
    """
    Calculates a summary from problem-individual results. At the moment the
    only summary statistic calculated is the median.  The output is
    produced as numpy matrices.

    @param minimizers :: list of minimizers used (their names)

    @param group_results :: results from running fitting tests on different
    problems (list of lists, where the first level is the group, and the
    second level is the individual test).

    @returns two numpy matrices (where columns are the groups, and rows
    are the minimizers) with summary statistic (median) from the
    problem-individual results.
    """
    num_groups = len(group_results)
    num_minimizers = len(minimizers)

    groups_norm_acc = np.zeros((num_groups, num_minimizers))
    groups_norm_runtime = np.zeros((num_groups, num_minimizers))
    for group_idx, results_per_test in enumerate(group_results):
        num_tests = len(results_per_test)
        accuracy_tbl = np.zeros((num_tests, num_minimizers))
        time_tbl = np.zeros((num_tests, num_minimizers))
        for test_idx in range(0, num_tests):
            for minimiz_idx in range(0, num_minimizers):
                accuracy_tbl[test_idx, minimiz_idx] = \
                    results_per_test[test_idx][minimiz_idx].sum_err_sq
                time_tbl[test_idx, minimiz_idx] = \
                    results_per_test[test_idx][minimiz_idx].runtime

        # Min across all alternative runs/minimizers
        min_sum_err_sq = np.nanmin(accuracy_tbl, 1)
        min_runtime = np.nanmin(time_tbl, 1)

        norm_acc_rankings = accuracy_tbl / min_sum_err_sq[:, None]
        norm_runtime_rankings = time_tbl / min_runtime[:, None]

        # Median across problems; np.nanmedian replaces
        # scipy.stats.nanmedian, removed in scipy 1.0.
        groups_norm_acc[group_idx, :] = np.nanmedian(norm_acc_rankings, 0)
        groups_norm_runtime[group_idx, :] = np.nanmedian(norm_runtime_rankings, 0)

    return groups_norm_acc, groups_norm_runtime
def fivenum(v):
    """Return Tukey's five number summary for *v*.

    The summary is (minimum, lower-hinge, median, upper-hinge, maximum),
    where the hinges are median -/+ 1.5 times the interquartile distance.

    v : list or array of numbers.  Returns None (after printing an error)
        when *v* is not numeric.
    """
    # Validate input up front; bail out instead of falling through and
    # crashing on the percentile computations below (the original kept
    # going after printing the error).
    try:
        numpy.sum(v)
    except TypeError:
        print('Error: you must provide a list or array of only numbers')
        return
    q1 = scoreatpercentile(v, 25)
    q3 = scoreatpercentile(v, 75)
    iqd = q3 - q1
    # numpy.nanmedian replaces the long-removed scipy.stats.nanmedian.
    md = numpy.nanmedian(v)
    whisker = 1.5 * iqd
    return numpy.nanmin(v), md - whisker, md, md + whisker, numpy.nanmax(v)
def read_data(filename):
    # Load a CSV table, impute missing values with per-column medians, and
    # return libsvm-style sparse feature dicts plus per-target label lists.
    data = pd.read_table(filename, sep=',', warn_bad_lines=True, error_bad_lines=True)
    data = np.asarray(data.values, dtype = float)
    # Per-column imputation statistic.
    # NOTE(review): variable is named col_mean but holds *medians*.
    col_mean = stats.nanmedian(data,axis = 0)
    inds = np.where(np.isnan(data))
    data[inds] = np.take(col_mean,inds[1])
    #data=[np.concatenate((np.array([data[:,1]]).T,data[:,6:]),axis=1)]
    # Columns 6+ are features; columns 1..5 are the targets — layout is
    # assumed from the slicing here, confirm against the data file.
    X_train = data[:,6: ]
    Y_train = data[:,1:6]
    # 1-based {feature_index: value} dicts, skipping NaNs (libsvm format).
    # NOTE(review): relies on Python 2 map() returning lists.
    svm_x = map(lambda xr: { i+1: xr[i] for i in range(xr.shape[0]) if not np.isnan(xr[i]) } , X_train )
    svm_y_ary = map( lambda i : [ y for y in Y_train[:,i]], range(Y_train.shape[1]) )
    return svm_x, svm_y_ary
def get_no_nan_median(self):
    """Return the per-row median of self.C, treating 127 as missing.

    Rows whose current median is valid update the cached self.prev_med;
    rows that are entirely missing keep their previous cached value.
    Returns a copy so callers cannot mutate the cache.
    Assumes self.C holds float-capable values — TODO confirm.
    """
    tmp = 1 * self.C  # arithmetic copy; leaves self.C untouched
    tmp[tmp == 127] = np.nan  # 127 is the sentinel for missing samples
    # np.nanmedian replaces scipy.stats.nanmedian (removed in SciPy 1.0).
    cur_med = np.nanmedian(tmp, axis=1)
    if np.sum(np.isnan(cur_med)) > 0:
        # Only overwrite cache entries where a valid median exists.
        cur_med_is_not_nan_idx = np.logical_not(np.isnan(cur_med))
        self.prev_med[cur_med_is_not_nan_idx] = cur_med[cur_med_is_not_nan_idx]
    else:
        self.prev_med = cur_med
    return 1 * self.prev_med
def binMean(X, Y, numBins=8, xmin=None, xmax=None):
    """Bin Y by the value of X over numBins equal-width bins of (xmin, xmax].

    Returns (bin centres, per-bin nan-mean, per-bin nan-median, per-bin std).
    """
    xmin = X.min() if xmin is None else xmin
    xmax = X.max() if xmax is None else xmax
    edges = np.linspace(xmin, xmax, numBins + 1)
    # Boolean selector for each half-open bin (lo, hi].
    selections = [(X > lo) & (X <= hi) for lo, hi in zip(edges[:-1], edges[1:])]
    means = np.array([nanmean(Y[sel]) for sel in selections])
    medians = np.array([nanmedian(Y[sel]) for sel in selections])
    stds = np.array([np.std(Y[sel]) for sel in selections])
    centres = edges[:-1] + 0.5 * (edges[1] - edges[0])
    return centres, means, medians, stds
def median(self, files=[], bands=[1], doReproject=True, maskName='mask', **kwargs): '''Calculate median of input bands Memory and CPU greedy method. Generates 3D cube from bands of all input images and calculates median. Adds median bands to self Parameters ----------- files : list list of input files bands : list list of names/band_numbers to be processed doReproject : boolean, [True] reproject input files? maskName : str, ['mask'] name of the mask in input files nClass : child of Nansat, [Nansat] This class is used to read input files eResampleAlg : int, [0] agorithm for reprojection, see Nansat.reproject() period : [datetime0, datetime1] Start and stop datetime objects from pyhon datetime. ''' # check inputs if len(files) == 0: self.logger.error('No input files given!') return # modify default values self.bandIDs = bands self.doReproject = doReproject self.maskName = maskName self._set_defaults(kwargs) lastN = self._get_layer_image(files[-1]) # add medians of all bands for band in bands: bandCube, mask = self._get_cube(files, band) bandMedian = st.nanmedian(bandCube, axis=0) # get metadata of this band from the last image parameters = lastN.get_metadata(bandID=band) # add band and std with metadata self.add_band(array=bandMedian, parameters=parameters) self.add_band(array=mask, parameters={'name': 'mask'})
def na_median(X):
    '''Return a copy of X with NaNs replaced by the median of the
    non-NaN values of each column.
    '''
    # np.nanmedian replaces the removed scipy.stats.nanmedian.
    col_median = np.nanmedian(X, axis=0)
    a = np.copy(X)
    inds = np.where(np.isnan(a))
    if inds[0].shape[0] > 0:
        # Map each NaN to the median of *its own* column.  The original
        # assigned the whole col_median vector (a[inds] = col_median),
        # which only broadcasts correctly when the number of NaNs happens
        # to equal the number of columns.
        a[inds] = np.take(col_median, inds[1])
    return a
def main():
    # Preprocess train/test CSVs: add two difference features, impute NaNs
    # with per-column medians, standard-scale with a scaler fit on train,
    # and save numpy arrays to disk.  (Python 2 print statements.)
    dat=pd.read_table('data/train_v2.csv',sep=',')
    print "reading done, train"
    loss=np.asarray(dat.loss)
    dat=dat.drop(['loss','id'],1)
    dat['new1']=dat['f528']-dat['f527'] #golden feature 1
    dat['new2']=dat['f528']-dat['f274'] #golden feature 2
    dat=np.asarray(dat.values, dtype=float)
    col_med = stats.nanmedian(dat,axis=0)
    print "calculated medians, train"
    inds = np.where(np.isnan(dat))
    dat[inds]=np.take(col_med,inds[1])
    print "median imputation done, train"
    scaler=preprocessing.Scaler().fit(dat)
    dat=scaler.transform(dat)
    print "scaling done, train"
    # Binary classification target: any positive loss.
    labels=(loss>0).astype(int)
    np.save('data/x_train.npy',dat)
    np.save('data/y_train.npy',labels)
    np.save('data/loss.npy',loss)
    print "trainset done"
    dat=pd.read_table('data/test_v2.csv',sep=',')
    print "reading done, test"
    ids=np.asarray(dat.id)
    dat=dat.drop(['id'],1)
    dat['new1']=dat['f528']-dat['f527'] #golden feature 1
    dat['new2']=dat['f528']-dat['f274'] #golden feature 2
    dat=np.asarray(dat.values,dtype=float)
    # NOTE(review): test medians come from the test set itself, while the
    # scaler is reused from train — imputation is not train-consistent.
    col_med=stats.nanmedian(dat,axis=0)
    print "calculated medians, test"
    inds=np.where(np.isnan(dat))
    dat[inds]=np.take(col_med,inds[1])
    print "imputation done, test"
    dat=scaler.transform(dat)
    print "scaling done, test"
    np.save('data/x_test.npy',dat)
    np.save('data/ids.npy',ids)
    print "testset done"
def main():
    # Preprocess train/test CSVs: add two difference features, impute NaNs
    # with per-column medians, standard-scale with a scaler fit on train,
    # and save numpy arrays to disk.  (Python 2 print statements.)
    dat = pd.read_table('data/train_v2.csv', sep=',')
    print "reading done, train"
    loss = np.asarray(dat.loss)
    dat = dat.drop(['loss', 'id'], 1)
    dat['new1'] = dat['f528'] - dat['f527'] #golden feature 1
    dat['new2'] = dat['f528'] - dat['f274'] #golden feature 2
    dat = np.asarray(dat.values, dtype=float)
    col_med = stats.nanmedian(dat, axis=0)
    print "calculated medians, train"
    inds = np.where(np.isnan(dat))
    dat[inds] = np.take(col_med, inds[1])
    print "median imputation done, train"
    scaler = preprocessing.Scaler().fit(dat)
    dat = scaler.transform(dat)
    print "scaling done, train"
    # Binary classification target: any positive loss.
    labels = (loss > 0).astype(int)
    np.save('data/x_train.npy', dat)
    np.save('data/y_train.npy', labels)
    np.save('data/loss.npy', loss)
    print "trainset done"
    dat = pd.read_table('data/test_v2.csv', sep=',')
    print "reading done, test"
    ids = np.asarray(dat.id)
    dat = dat.drop(['id'], 1)
    dat['new1'] = dat['f528'] - dat['f527'] #golden feature 1
    dat['new2'] = dat['f528'] - dat['f274'] #golden feature 2
    dat = np.asarray(dat.values, dtype=float)
    # NOTE(review): test medians come from the test set itself, while the
    # scaler is reused from train — imputation is not train-consistent.
    col_med = stats.nanmedian(dat, axis=0)
    print "calculated medians, test"
    inds = np.where(np.isnan(dat))
    dat[inds] = np.take(col_med, inds[1])
    print "imputation done, test"
    dat = scaler.transform(dat)
    print "scaling done, test"
    np.save('data/x_test.npy', dat)
    np.save('data/ids.npy', ids)
    print "testset done"
def getAnnulusCounts(self, im, annulusInner, annulusOuter, center):
    # Estimate total background counts in an annulus around `center`:
    # the NaN-ignoring median pixel value scaled by the number of usable
    # (non-NaN) annulus pixels.  Returns [annulusCounts, nAnnPix].
    startpx = int(np.round(center[0]))
    startpy = int(np.round(center[1]))
    # aperture() is a project helper; presumably returns a 0/1 disc mask
    # of the given radius — TODO confirm.
    innerMask = aperture(startpx, startpy, annulusInner)
    outerMask = aperture(startpx, startpy, annulusOuter)
    annulusMask = outerMask-innerMask
    nanMask = np.isnan(im)
    # (row, col) coordinates of annulus pixels that hold valid data.
    annulusPixels = np.array(np.where(np.logical_and(annulusMask==1, nanMask==False)))
    nAnnPix = annulusPixels.shape[1]
    annulusCounts = nanmedian(im[annulusPixels[0],annulusPixels[1]])*nAnnPix
    if self.verbose:
        print "Annulus Counts = ", annulusCounts
        print "Annulus pixels = ", nAnnPix
    return [annulusCounts, nAnnPix]
def sdize_vector(vec, ignore_zeroes=True, use_median=True):
    """Standardize *vec* in place: subtract a centre and divide by a spread.

    With use_median=True the centre/spread are the NaN-ignoring median and
    MAD; otherwise the NaN-ignoring mean and std.  With ignore_zeroes=True
    the statistics are computed over the non-zero entries only.

    Note this is inplace! If you don't want that, pass vec.copy().
    Returns vec for convenience.
    """
    v = vec
    if ignore_zeroes:
        v = vec[vec != 0]
    if use_median:
        # Fix: `from scipy.stats import nanmedian` raises ImportError on
        # SciPy >= 1.0 (the function was removed); np.nanmedian is the
        # drop-in replacement.
        mn = np.nanmedian(v)
        sd = mad(v)
    else:
        mn = np.nanmean(v)
        sd = np.nanstd(v)
    vec -= mn
    vec /= (sd + 0.001)  ## small epsilon guards against a zero spread
    return vec
def plotAgdist():
    ''' plot average distance of the pursued agents'''
    # Builds, per participant (vp 1..4), forward- and reverse-aligned
    # distance traces for the nearest and farthest pursued agent, then
    # plots the NaN-ignoring median trace per agent-count group.
    # NOTE(review): relies on Python 2 map() returning lists and on
    # project globals computeTrackInfo, HZ, initVP, figpath.
    dist,discard,the,rest=computeTrackInfo()
    del discard,the,rest
    plt.figure(0,figsize=(10,8))
    for vp in range(1,5):
        xlim=500
        ys=dist[vp-1]
        # dat: traces aligned to trial start; datrev: aligned to trial end.
        # Third axis: 0 = agent with smallest median distance, 1 = largest.
        dat=np.zeros((len(ys),int(HZ*xlim/1000.0),2))*np.nan
        datrev=np.zeros((len(ys),int(HZ*500/1000.0),2))*np.nan
        #datN=np.zeros((len(ys),xlim/20))
        for i in range(len(ys)):
            ao=np.argsort(map(np.median,ys[i]))
            if len(ys[i])==0:continue
            N=ys[i][ao[0]].size
            if N==0:continue
            dat[i,:min(dat.shape[1],N),0]=ys[i][ao[0]][:min(dat.shape[1],N)]
            datrev[i,-min(datrev.shape[1],N):,0]=ys[i][ao[0]][-min(datrev.shape[1],N):]
            N=ys[i][ao[-1]].size
            dat[i,:min(dat.shape[1],N),1]=ys[i][ao[-1]][:min(dat.shape[1],N)]
            datrev[i,-min(datrev.shape[1],N):,1]=ys[i][ao[-1]][-min(datrev.shape[1],N):]
        # Group trials by how many agents were tracked.
        nrags=np.array(map(len,ys))
        ylims=[[[1,2.5]]*3,[[],[3,4],[3,5]]]
        for a in range(3)[::-1]:
            if a==2: sel=nrags>=(a+1)
            else: sel = nrags==(a+1)
            for i in range(2):
                if a==0 and i==1:continue
                plt.subplot(4,4,i*8+vp);plt.grid(b=False);#plt.ylim(ylims[i][a])
                plt.plot(np.linspace(0,xlim/1000.,dat.shape[1]),nanmedian(dat[sel,:,i],0));
                plt.subplot(4,4,i*8+vp+4);plt.grid(b=False);#plt.ylim(ylims[i][a])
                ss=datrev.shape[1]/HZ
                plt.plot(np.linspace(-ss,0,datrev.shape[1]),nanmedian(datrev[sel,:,i],0));
    plt.subplot(441)
    plt.legend(['> 2','2','1'],loc=4)
    initVP(1,1)
    plt.savefig(figpath+'trackAgdist')
def medsubtract(image,outname):
    # Subtract a per-column sky median from a 2-D spectral image: fit the
    # object trace with per-column Gaussians, mask +/-4 sigma around the
    # smoothed trace, and subtract each column's NaN-ignoring median.
    # NOTE(review): Python 2 — integer division and float slice indices
    # below would fail on Python 3 / modern NumPy.
    data = pyfits.open(image)[0].data.copy()
    if data.ndim==3:
        data = data[0].copy()
    tmp = data.copy()
    tmp[numpy.isnan(tmp)] = 0.
    # Subtract the 20th-percentile row as a rough background estimate.
    tmp -= numpy.sort(tmp,0)[tmp.shape[0]/5]
    trace = tmp.sum(1)
    peak = trace.argmax()
    center = numpy.empty(data.shape[1])
    w = center.copy()
    # Coarse trace: per-100-column peak position.
    for i in range(1+center.size/100):
        b = i*100
        e = b+100
        if e>center.size:
            e = center.size
        if b==e:
            continue
        center[b:e] = tmp[:,b:e].sum(1).argmax()
    bg = center.copy()
    x = numpy.arange(data.shape[0])
    # Refine per-column centre/width with a Gaussian fit (sf = project
    # fitting helpers).
    for i in range(center.size):
        d = tmp[:,i].copy()
        peak = center[i]
        if numpy.isnan(d[peak]):
            center[i] = peak
            continue
        fit = numpy.array([0.,d[peak],peak,1.])
        cond = ~numpy.isnan(d)
        input = numpy.empty((d[cond].size,2))
        input[:,0] = x[cond].copy()
        input[:,1] = d[cond].copy()
        fit,chi = sf.ngaussfit(input,fit)
        center[i] = fit[2]
        w[i] = fit[3]
    # Smooth the trace with a 5th-order polynomial fit.
    fit = sf.lsqfit(ndimage.median_filter(center,17),'polynomial',5)
    centroid = sf.genfunc(numpy.arange(bg.size),0.,fit)
    w = numpy.median(w)
    for i in range(bg.size):
        d = data[:,i].copy()
        # Mask the object before taking the sky median of the column.
        d[centroid[i]-w*4:centroid[i]+w*4] = numpy.nan
        data[:,i] -= stats.nanmedian(d)
    hdu = pyfits.open(image)[0]
    hdu.data = data.copy()
    hdu.writeto(outname,clobber=True)
def medianres(self, res, wrap=2*np.pi):
    """Detect unwrapping errors per interferogram slice.

    For every cycle involving slice k, reconstruct slice k from the other
    two legs of the cycle, express the residue in integer multiples of
    `wrap`, and store the per-pixel median residue in res.data[k].
    Side effects: self.cyccount (cycles per slice) and self.check
    (pixels per slice with non-zero median residue).

    res  : output stack with a .data array shaped like self.data
    wrap : phase ambiguity, one cycle (2*pi) by default
    """
    ncyc = self.cycs.shape[0]
    nwid = self.data.shape[2]
    nlen = self.data.shape[1]
    logger.info('Analyzing %d cycles for unwrapping errors'% (ncyc))
    # np.int was removed in NumPy 1.24; the builtin int is equivalent.
    numcheck = np.zeros(self.nslice, dtype=int)
    numcycper = np.zeros(self.nslice, dtype=int)
    progb = tsio.ProgressBar(maxValue=self.nslice)
    for kkk in range(self.nslice):
        cycind = np.flatnonzero(self.cycs[:, 0] == (kkk + 1))
        numcycper[kkk] = len(cycind)
        orig = self.data[kkk, :, :]
        resarr = np.zeros((numcycper[kkk], nlen, nwid), dtype=int)
        for img in range(numcycper[kkk]):
            ind = cycind[img]
            # cycs entries are signed 1-based interferogram indices.
            sgn1 = np.sign(self.cycs[ind, 1])
            ifg1 = np.abs(self.cycs[ind, 1]) - 1
            sgn2 = np.sign(self.cycs[ind, 2])
            # BUG FIX: mirror ifg1 — the index is abs(entry) - 1.  The
            # original used np.sign(...) - 1, which always gave 0 or -2.
            ifg2 = np.abs(self.cycs[ind, 2]) - 1
            p11 = self.data[ifg1, :, :]
            p22 = self.data[ifg2, :, :]
            recons = sgn1 * p11 + sgn2 * p22
            derr = orig - recons
            #refph = st.nanmedian(derr.flatten())
            #derr = derr - refph
            resarr[img, :, :] = (np.round(derr / wrap)).astype(int)
        medres = st.nanmedian(resarr, axis=0)
        #idict = {}
        #idict['orig'] = orig
        #idict['medres'] = medres/wrap
        #plots.imagemany(idict,show=True)
        res.data[kkk, :, :] = medres
        numcheck[kkk] = np.nansum(np.abs(medres) > 0)
        progb.update(kkk, every=3)
    progb.close()
    self.cyccount = numcycper
    self.check = numcheck
def medianres(self, res, wrap=2 * np.pi):
    """Detect unwrapping errors per interferogram slice.

    For every cycle involving slice k, reconstruct slice k from the other
    two legs of the cycle, express the residue in integer multiples of
    `wrap`, and store the per-pixel median residue in res.data[k].
    Side effects: self.cyccount (cycles per slice) and self.check
    (pixels per slice with non-zero median residue).

    res  : output stack with a .data array shaped like self.data
    wrap : phase ambiguity, one cycle (2*pi) by default
    """
    ncyc = self.cycs.shape[0]
    nwid = self.data.shape[2]
    nlen = self.data.shape[1]
    logger.info('Analyzing %d cycles for unwrapping errors' % (ncyc))
    # np.int was removed in NumPy 1.24; the builtin int is equivalent.
    numcheck = np.zeros(self.nslice, dtype=int)
    numcycper = np.zeros(self.nslice, dtype=int)
    progb = tsio.ProgressBar(maxValue=self.nslice)
    for kkk in range(self.nslice):
        cycind = np.flatnonzero(self.cycs[:, 0] == (kkk + 1))
        numcycper[kkk] = len(cycind)
        orig = self.data[kkk, :, :]
        resarr = np.zeros((numcycper[kkk], nlen, nwid), dtype=int)
        for img in range(numcycper[kkk]):
            ind = cycind[img]
            # cycs entries are signed 1-based interferogram indices.
            sgn1 = np.sign(self.cycs[ind, 1])
            ifg1 = np.abs(self.cycs[ind, 1]) - 1
            sgn2 = np.sign(self.cycs[ind, 2])
            # BUG FIX: mirror ifg1 — the index is abs(entry) - 1.  The
            # original used np.sign(...) - 1, which always gave 0 or -2.
            ifg2 = np.abs(self.cycs[ind, 2]) - 1
            p11 = self.data[ifg1, :, :]
            p22 = self.data[ifg2, :, :]
            recons = sgn1 * p11 + sgn2 * p22
            derr = orig - recons
            #refph = st.nanmedian(derr.flatten())
            #derr = derr - refph
            resarr[img, :, :] = (np.round(derr / wrap)).astype(int)
        medres = st.nanmedian(resarr, axis=0)
        #idict = {}
        #idict['orig'] = orig
        #idict['medres'] = medres/wrap
        #plots.imagemany(idict,show=True)
        res.data[kkk, :, :] = medres
        numcheck[kkk] = np.nansum(np.abs(medres) > 0)
        progb.update(kkk, every=3)
    progb.close()
    self.cyccount = numcycper
    self.check = numcheck
def sdize_vector(vec, ignore_zeroes=True, use_median=True):
    """Standardize *vec* in place: subtract a centre and divide by a spread.

    With use_median=True the centre/spread are the NaN-ignoring median and
    MAD; otherwise the NaN-ignoring mean and std.  With ignore_zeroes=True
    the statistics are computed over the non-zero entries only.

    Note this is inplace! If you don't want that, pass vec.copy().
    Returns vec for convenience.
    """
    v = vec
    if ignore_zeroes:
        v = vec[vec != 0]
    if use_median:
        # Fix: `from scipy.stats import nanmedian` raises ImportError on
        # SciPy >= 1.0 (the function was removed); np.nanmedian is the
        # drop-in replacement.
        mn = np.nanmedian(v)
        sd = mad(v)
    else:
        mn = np.nanmean(v)
        sd = np.nanstd(v)
    vec -= mn
    vec /= (sd + 0.001)  ## small epsilon guards against a zero spread
    return vec
def median(self, files=[], bands=[1], doReproject=True, maskName='mask', opener=Nansat, eResampleAlg=0, period=(None, None), vmin=-np.inf, vmax=np.inf): '''Calculate median of input bands Memory and CPU greedy method. Generates 3D cube from bands of all input images and calculates median. Adds median bands to self Parameters ----------- files : list list of input files bands : list list of names/band_numbers to be processed doReproject : boolean, [True] reproject input files? maskName : str, ['mask'] name of the mask in input files nClass : child of Nansat, [Nansat] This class is used to read input files eResampleAlg : int, [0] agorithm for reprojection, see Nansat.reproject() period : [datetime0, datetime1] Start and stop datetime objects from pyhon datetime. ''' # check inputs if len(files) == 0: self.logger.error('No input files given!') return # add medians of all bands for band in bands: cube, mask, metadata = self._get_cube(files, band, doReproject, maskName, opener, eResampleAlg, period, vmin, vmax) median = st.nanmedian(cube, axis=0) # add band and std with metadata self.add_band(array=median, parameters=metadata) self.add_band(array=mask, parameters={'name': 'mask'})
def _clean_nans(self, data): """ Substitute NaNs with the median value of the related features Parameters ---------- data : array, shape=[n_samples, n_features] Data array """ r, c = np.isnan(data).nonzero() my = dict() for ic in np.unique(c): my[ic] = nanmedian(data[:, ic]) for i in range(len(r)): data[r[i], c[i]] = my[c[i]] return data
def binMean(X, Y, numBins=8, xmin=None, xmax=None):
    """Bin Y by the value of X over numBins equal-width bins of (xmin, xmax].

    Returns (bin centres, per-bin nan-mean, per-bin nan-median, per-bin std).
    """
    if xmin is None:
        xmin = X.min()
    if xmax is None:
        xmax = X.max()
    edges = np.linspace(xmin, xmax, numBins + 1)
    means, medians, stds = [], [], []
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = Y[(X > lo) & (X <= hi)]
        means.append(nanmean(in_bin))
        medians.append(nanmedian(in_bin))
        stds.append(np.std(in_bin))
    half_width = (edges[1] - edges[0]) * 0.5
    return edges[:-1] + half_width, np.array(means), np.array(medians), np.array(stds)
def print_Description(merged_matrix):
    # Write per-feature summary statistics (scipy.stats.describe output plus
    # median and quantiles) to Descriptions.csv, ';'-separated with ',' as
    # the decimal mark (European CSV convention).
    file_name = 'Descriptions.csv'
    with open(file_name, 'w') as f:
        f.write('Feature; nobs; min; max; mean; variance; skeweness; kurtosis; median; Q1; median; Q3')
        f.write('\n')
    for row in range(0, len(merged_matrix)):
        print('Feature: ' + str((row + 1)))
        with open(file_name, 'a') as f:
            # NOTE(review): the console prints row+1 but the file writes row —
            # the two feature labels are off by one; confirm intended.
            line = 'Feature ' + str(row) + '; ' + \
                   str(stat.describe(merged_matrix[row])).replace('(', '').replace(')', '').replace(',',';') + \
                   ';' + str(stat.nanmedian(merged_matrix[row]))
            quantile_arr=list(stat.mstats.mquantiles(merged_matrix[row]))
            print(str(quantile_arr))
            line = line + ';' + '; '.join([str(quantile) for quantile in quantile_arr])
            print(line)
            # Convert decimal points to commas (also hits any '.' in text).
            line = line.replace('.', ',')
            f.write(line)
            f.write('\n')
def clean_spec_NaNs(flux):
    # Replace leading/trailing NaN runs in a 1-D spectrum (in place) with
    # linear ramps between the overall flux median and the first/last valid
    # samples.  Interior NaNs are detected but only reported/plotted.
    # NOTE(review): if flux is entirely NaN, leftEdgeIdx/rightEdgeIdx are
    # never assigned (their defaults are commented out) -> NameError.
    #fix initial nans on edges
    nanMap = np.isnan(flux)
    nanGroups, nNanGroups = label(nanMap)
    # leftEdgeIdx=0
    # rightEdgeIdx=len(flux)
    # plt.plot(nanMap)
    # plt.show()
    # nanMapIdx = np.where(nanMap==True) <<<<<make the next lines faster by using this
    if np.sum(nanMap)>0:
        print 'Found NaNs in flux array'
        # First valid sample from the left.
        for i,booI in enumerate(nanMap):
            if booI==False:
                leftEdgeIdx = i
                break
        # First valid sample from the right.
        for j,rbooI in enumerate(nanMap[::-1]):
            if rbooI==False:
                rightEdgeIdx = len(nanMap)-j
                break
        fluxMedian = stats.nanmedian(flux)
        if leftEdgeIdx>0:
            flux[:leftEdgeIdx] = np.linspace(fluxMedian, flux[leftEdgeIdx+1],leftEdgeIdx)
        if rightEdgeIdx<len(flux):
            flux[rightEdgeIdx:] = np.linspace(flux[rightEdgeIdx-1], fluxMedian, len(flux)-rightEdgeIdx)
        # Re-check: anything left is an interior NaN run.
        nanMap = np.isnan(flux)
        if np.sum(nanMap)>0:
            print 'NaNs remain in flux array'
            plt.plot(nanMap)
            plt.show()
def append_clean_nans(self):
    # Impute NaNs in the training and testing feature frames with the
    # *training* column medians, and build 0/1 indicator frames
    # (self.train_dummy_nan / self.test_dummy_nan, columns "nan_<name>")
    # recording where the NaNs were.
    # NOTE(review): DataFrame.ix was removed in pandas 1.0 — needs
    # .iloc/.loc on modern pandas.
    train_nan = np.isnan(self.training_x)
    train_median = stats.nanmedian(self.training_x)
    train_nan_locs = np.where(train_nan)
    ms, ns = train_nan_locs
    for m, n in zip(ms, ns):
        self.training_x.ix[m,n] = train_median[n]
    # Keep indicator columns only where at least one NaN occurred.
    cols_to_keep = train_nan.sum(axis=0) != 0
    index_cols_to_keep = cols_to_keep.ix[np.where(cols_to_keep)].index
    self.train_dummy_nan = train_nan[index_cols_to_keep].astype(float)
    n_columns = []
    for i in self.train_dummy_nan.columns.tolist():
        i = "nan_" + i
        n_columns.append(i)
    self.train_dummy_nan.columns = n_columns
    #self.training_x += self.train_dummy_nan
    # Test set: impute with the TRAIN medians for consistency.
    test_nan = np.isnan(self.testing_x)
    test_nan_locs = np.where(test_nan)
    ms, ns = test_nan_locs
    for m, n in zip(ms, ns):
        self.testing_x.ix[m,n] = train_median[n]
    self.test_dummy_nan = test_nan[index_cols_to_keep].astype(float)
    self.test_dummy_nan.columns = n_columns
def fitSky(self):
    # Estimate a 1-D sky spectrum as the column-wise NaN-ignoring median of
    # the two sky bands (rows SB1_x0..SB1_x1 and SB2_x0..SB2_x1 of data2D);
    # disabled bands are NaN-ed out first.  Result stored in self.sky1d.
    # if not self.ui.checkBoxdoSkySub.isChecked():
    # self.sky1d = np.zeros(self.data2D.shape[1])
    # return
    #self.useSB1 = True
    #self.useSB2 = True
    #print self.imagetype
    # NOTE(review): b0..b3 are floats; float slice indices are an error in
    # modern NumPy — would need int() casts.
    b0 = np.floor(self.SB1_x0) ; b1 = np.ceil(self.SB1_x1)
    b2 = np.floor(self.SB2_x0) ; b3 = np.ceil(self.SB2_x1)
    sky01 = np.copy(self.data2D[b0:b1,:])
    sky23 = np.copy(self.data2D[b2:b3,:])
    if (self.pars['useSB1'] == False):
        sky01[:,:] = np.nan#sky01*0.0
    if (self.pars['useSB2'] == False):
        sky23[:,:] = np.nan#sky23*0.0
    sky03 = np.append(sky01,sky23,axis=0)
    # print 'here'
    # print np.shape(sky03)
    # sky1d = np.median(sky03,0)
    # notNaN = np.where(np.isfinite(sky03))
    # sky1d = np.median(sky03[notNaN[0],notNaN[1]],0)
    sky1d = nanmedian(sky03,0)
    # NOTE(review): comparing the NaN count to the non-zero count is an odd
    # "all NaN" test — confirm intent before touching.
    if np.sum(np.isnan(sky1d))==np.count_nonzero(sky1d) :
        # in case both sky bands are turned off, replace with zeros
        #print 'sky all nans'
        sky1d[:]=0.
    self.sky1d = sky1d
    """
def rebin(self, field, shape):
    """Rebin field to a coarser matrix"""
    # View the array as (out_rows, row_factor, out_cols, col_factor) blocks
    # and collapse each block with two successive NaN-ignoring medians.
    row_factor = field.shape[0] // shape[0]
    col_factor = field.shape[1] // shape[1]
    blocks = field.reshape(shape[0], row_factor, shape[1], col_factor)
    return nanmedian(nanmedian(blocks, axis=-1), axis=1)
def make_hists(self, num_bins=None, use_prettyplotlib=True):
    # Plot a histogram of every dataframe column — on one subplot grid when
    # self.subplot, otherwise one figure per column — and collect
    # [mean, std, median] per column.  (Python 2 prints/division.)
    # NOTE(review): `p` below is presumably a module-level pyplot/pylab
    # alias, distinct from the locally imported `plt` — confirm.
    if use_prettyplotlib:
        try:
            import prettyplotlib as plt
        except ImportError:
            import matplotlib.pyplot as plt
            use_prettyplotlib = False
            print "prettyplotlib not installed. Using matplotlib..."
    else:
        import matplotlib.pyplot as plt
    # Setup subplots if plotting together
    if self.subplot:
        num = len(self.columns)
        if num <= 3:
            ncols = 1
            nrows = num
        elif num <= 8:
            ncols = 2
            nrows = num / 2
        else:
            # Max columns right now is 12
            ncols = 3
            nrows = num / 3
        # Check if we need an extra row.
        if num % ncols != 0:
            nrows += 1
        # Make the objects
        fig, axes = plt.subplots(nrows=nrows, ncols=ncols)
        # This is a bit awkward to get the indices, but my matplotlib version
        # doesn't use the same object type as prettyplotlib creates.
        posns = np.indices(axes.shape)
        x, y = posns[0].ravel(), posns[1].ravel()
    # Keep the mean, median, std.
    data_stats = {}
    for i, column in enumerate(self.columns):
        data = self.dataframe[column]
        data = data[np.isfinite(data)]
        # NOTE(review): num_bins is set once (from the first column's size)
        # and then reused for all later columns; sqrt gives a float.
        if num_bins is None:
            num_bins = np.sqrt(len(data))
        data_stats[column] = [nanmean(data), nanstd(data), nanmedian(data)]
        if self.subplot:
            if use_prettyplotlib:
                plt.hist(axes[x[i], y[i]],data, num_bins, grid="y")
            else:
                axes[x[i], y[i]].hist(data, num_bins)
            axes[x[i], y[i]].set_xlabel(column) # ADD UNITS!
        else:
            fig, axes = plt.subplots(1)
            axes.hist(data, num_bins)
            axes.set_xlabel(column) # ADD UNITS!
            if self.verbose and not self.subplot:
                print column+" Stats: %s" % (data_stats[column])
                p.show()
            elif not self.subplot:
                fig.savefig(self.save_name+"_"+column+"."+self.save_type)
                p.close()
    if self.subplot:
        p.tight_layout()
        if self.verbose:
            for column in self.columns:
                print column+" Stats: %s" % (data_stats[column])
            p.show()
        else:
            fig.savefig(self.save_name+"_hists."+self.save_type)
def test_nanmedian_none(self):
    """Check nanmedian when no values are nan."""
    result = stats.nanmedian(self.X)
    assert_approx_equal(result, np.median(self.X))
def test_nanmedian_some(self):
    """Check nanmedian when some values only are nan."""
    result = stats.nanmedian(self.Xsome)
    assert_approx_equal(result, np.median(self.Xsomet))
def test_nanmedian_all(self):
    """Check nanmedian when all values are nan."""
    result = stats.nanmedian(self.Xall)
    assert np.isnan(result)
import numpy as np
from scipy import stats

# Impute missing horsepower ('hp') values in ship-nmpg.csv with the rounded
# average of the column mean and median, then write the imputed table out.
data = np.genfromtxt('ship-nmpg.csv', delimiter=",", names=True, dtype="f8,i8,f8,f8,f8,f8,i8,i8,S35")
# NOTE(review): stats.nanmean/nanmedian were removed in SciPy 1.0;
# np.nanmean/np.nanmedian are the drop-in replacements.
hpmean = stats.nanmean(data['hp'])
hpmedian = stats.nanmedian(data['hp'])
imputeHP = np.round((hpmean + hpmedian) / 2.0)
for i in range(len(data['hp'])):
    if np.isnan(data['hp'][i]):
        data['hp'][i] = imputeHP ## assign value here
np.savetxt('ship-nmpg-imp.csv', data, delimiter=',', newline='\n', fmt="%f,%i,%f,%f,%f,%f,%i,%i,%s")
def mad(x):
    """Return the (unscaled) median absolute deviation of x, ignoring NaNs."""
    # scipy.stats.nanmedian was removed in SciPy 1.0; np.nanmedian is the
    # drop-in replacement (np is already used by the original body).
    return np.nanmedian(np.abs(x - np.nanmedian(x)))