def __calc_mul_multiband_cut_threshold():
    prefix = 197
    band_values = {k: [] for k in range(8)}
    band_cut_th = {k: dict(max=0, min=0) for k in range(8)}

    # for image_id in tqdm.tqdm(image_id_list[:500]):
    image_fn = "./img197/MUL-PanSharpen_AOI_3_Paris_img197.tif"
    with rasterio.open(image_fn, 'r') as f:
        values = f.read().astype(np.float32)
        for i_chan in range(8):
            values_ = values[i_chan].ravel().tolist()
            values_ = np.array([v for v in values_ if v != 0])  # Remove censored mask
            band_values[i_chan].append(values_)

    # for image_id in tqdm.tqdm(image_id_list[:500]):
    image_fn = "./img197/MUL-PanSharpen_AOI_3_Paris_img197.tif"
    with rasterio.open(image_fn, 'r') as f:
        values = f.read().astype(np.float32)
        for i_chan in range(8):
            values_ = values[i_chan].ravel().tolist()
            values_ = np.array([v for v in values_ if v != 0])  # Remove censored mask
            band_values[i_chan].append(values_)

    logger.info("Calc percentile point ...")
    for i_chan in range(8):
        band_values[i_chan] = np.concatenate(band_values[i_chan]).ravel()
        band_cut_th[i_chan]['max'] = scipy.percentile(band_values[i_chan], 98)
        band_cut_th[i_chan]['min'] = scipy.percentile(band_values[i_chan], 2)
    return band_cut_th

def print_all_stats(ctx, series):
    ftime = get_ftime(series)
    start = 0
    end = ctx.interval
    print('start-time, samples, min, avg, median, 90%, 95%, 99%, max')
    while (start < ftime):  # for each time interval
        end = ftime if ftime < end else end
        sample_arrays = [s.get_samples(start, end) for s in series]
        samplevalue_arrays = []
        for sample_array in sample_arrays:
            samplevalue_arrays.append(
                [sample.value for sample in sample_array])
        #print('samplevalue_arrays len: %d' % len(samplevalue_arrays))
        #print('samplevalue_arrays elements len: ' + \
        #str(map( lambda l: len(l), samplevalue_arrays)))
        # collapse list of lists of sample values into list of sample values
        samplevalues = reduce(array_collapser, samplevalue_arrays, [])
        #print('samplevalues: ' + str(sorted(samplevalues)))
        # compute all stats and print them
        myarray = scipy.fromiter(samplevalues, float)
        mymin = scipy.amin(myarray)
        myavg = scipy.average(myarray)
        mymedian = scipy.median(myarray)
        my90th = scipy.percentile(myarray, 90)
        my95th = scipy.percentile(myarray, 95)
        my99th = scipy.percentile(myarray, 99)
        mymax = scipy.amax(myarray)
        print('%f, %d, %f, %f, %f, %f, %f, %f, %f' % (
            start, len(samplevalues), mymin, myavg, mymedian,
            my90th, my95th, my99th, mymax))
        # advance to next interval
        start += ctx.interval
        end += ctx.interval

def fit_dispersion(counts, disp_raw, disp_conv, sf, CFG, dmatrix1):
    mean_count = sp.mean(counts / sf, axis=1)[:, sp.newaxis]
    index = sp.where(disp_conv)[0]

    lowerBound = sp.percentile(sp.unique(disp_raw[index]), 1)
    upperBound = sp.percentile(sp.unique(disp_raw[index]), 99)

    idx = sp.where((disp_raw > lowerBound) & (disp_raw < upperBound))[0]

    matrix = sp.ones((idx.shape[0], 2), dtype='float')
    matrix[:, 0] /= mean_count[idx].ravel()

    modGamma = sm.GLM(disp_raw[idx], matrix,
                      family=sm.families.Gamma(sm.families.links.identity))
    res = modGamma.fit()
    Lambda = res.params

    disp_fitted = disp_raw.copy()
    ok_idx = sp.where(~sp.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if sp.sum(disp_fitted > 0) > 0:
        print "Found dispersion fit"

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=counts,
                                disp=disp_fitted,
                                matrix=dmatrix1,
                                figtitle='Fitted Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'], 'dispersion_fitted.pdf'),
                                CFG=CFG)

    return (disp_fitted, Lambda, idx)

def fit_dispersion(counts, disp_raw, disp_conv, sf, options, dmatrix1, event_type):
    mean_count = sp.mean(counts / sf, axis=1)[:, sp.newaxis]
    index = sp.where(disp_conv)[0]

    lowerBound = sp.percentile(sp.unique(disp_raw[index]), 1)
    upperBound = sp.percentile(sp.unique(disp_raw[index]), 99)

    idx = sp.where((disp_raw > lowerBound) & (disp_raw < upperBound))[0]

    matrix = sp.ones((idx.shape[0], 2), dtype='float')
    matrix[:, 0] /= mean_count[idx].ravel()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        modGamma = sm.GLM(disp_raw[idx], matrix,
                          family=sm.families.Gamma(sm.families.links.identity()))
        res = modGamma.fit()
    Lambda = res.params

    disp_fitted = disp_raw.copy()
    ok_idx = sp.where(~sp.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if sp.sum(disp_fitted > 0) > 0:
        print("\nFound dispersion fit")

    if options.diagnose_plots:
        plot.mean_variance_plot(counts=counts,
                                disp=disp_fitted,
                                matrix=dmatrix1,
                                figtitle='Fitted Dispersion Estimate',
                                filename=os.path.join(options.plot_dir, 'dispersion_fitted_%s.%s' % (event_type, options.plot_format)),
                                options=options)

    return (disp_fitted, Lambda, idx)

def calc_multiband_norm(path_dir, image_list, image_feature_norm_csv,
                        kind='RGB-PanSharpen', channel_count=3, max_sample=100):
    band_values = {k: [] for k in range(channel_count)}
    band_cut_th = {k: dict(max=0, min=0) for k in range(channel_count)}

    # first get all data, then use first part 0:1000 to calc threshold, then update all data
    for image_id in tqdm.tqdm(image_list[:max_sample]):
        image_loc = get_image_path_based_type_imageid(path_dir, kind, image_id)
        with rasterio.open(image_loc, 'r') as f:
            values = f.read().astype(np.float32)
            for i_chan in range(channel_count):
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array(
                    [v for v in values_ if v != 0]
                )  # Remove censored mask
                band_values[i_chan].append(values_)

    logger.info("Calc percentile point for normalization")
    for i_chan in range(channel_count):
        band_values[i_chan] = np.concatenate(band_values[i_chan]).ravel()
        band_cut_th[i_chan]['max'] = scipy.percentile(band_values[i_chan], 98)
        band_cut_th[i_chan]['min'] = scipy.percentile(band_values[i_chan], 2)

    stat = dict()
    stat['path'] = path_dir
    for chan_i in band_cut_th.keys():
        stat['chan{}_max'.format(chan_i)] = band_cut_th[chan_i]['max']
        stat['chan{}_min'.format(chan_i)] = band_cut_th[chan_i]['min']
    pd.DataFrame(stat, index=[0]).to_csv(image_feature_norm_csv, index=False)

def __calc_mul_multiband_cut_threshold(area_id, datapath):
    prefix = area_id_to_prefix(area_id)
    band_cut_th = {k: dict(max=0, min=0) for k in range(8)}

    image_id_list = pd.read_csv(FMT_VALTRAIN_IMAGELIST_PATH.format(
        prefix=prefix)).ImageId.tolist()
    image_id_list2 = pd.read_csv(FMT_VALTEST_IMAGELIST_PATH.format(
        prefix=prefix)).ImageId.tolist()
    image_id_list.extend(image_id_list2)

    for i_chan in range(8):
        logger.info("Reading band {} of the dataset..".format(i_chan))
        band_values = []
        for image_id in tqdm.tqdm(image_id_list[:500]):
            image_fn = get_train_image_path_from_imageid(
                image_id, datapath, mul=True)
            with rasterio.open(image_fn, 'r') as f:
                values = f.read().astype(np.float32)
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array([v for v in values_ if v != 0])  # Remove censored mask
                band_values.append(values_)

        logger.info("Calc percentile point for band {}".format(i_chan))
        band_values = np.concatenate(band_values).ravel()
        band_cut_th[i_chan]['max'] = scipy.percentile(band_values, 98)
        band_cut_th[i_chan]['min'] = scipy.percentile(band_values, 2, overwrite_input=True)

    return band_cut_th

def cal_8band(file):
    print(file)
    ds = gdal.Open(path_main + r'\RGB-PanSharpen' + "\\" + file)
    data = ds.ReadAsArray()
    data3 = data.copy()
    # plt.imshow(grayImg)
    # plt.show()
    # img2 = np.array([datax,datax,datax,datax]).swapaxes(0,1).swapaxes(1,2)
    img = np.array(data3).swapaxes(0, 1).swapaxes(1, 2)
    # hsv_image = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    # r_min = min(data3[0])
    # r_max = max(data3[0])
    # g_min = min(data3[1])
    # g_max = max(data3[1])
    # b_min = min(data3[2])
    # b_max = max(data3[2])
    # hsv_image = rgb_to_hsv(img)
    # h, s, v = cv2.split(hsv_image)
    # band_gray = cv2.cvtColor(res2, cv2.COLOR_RGB2GRAY)
    bandstats = {k: dict(max=0, min=0) for k in range(3)}
    for i in range(3):
        bandstats[i]['min'] = scipy.percentile(data3[i], 2)
        bandstats[i]['max'] = scipy.percentile(data3[i], 98)
    for chan_i in range(3):
        min_val = bandstats[chan_i]['min']
        max_val = bandstats[chan_i]['max']
        data3[chan_i] = np.clip(data3[chan_i], min_val, max_val)
        data3[chan_i] = (data3[chan_i] - min_val)
    tgi_band = (data3[1] - 0.39 * data3[0] - 0.61 * data3[2]) * 10
    grayImg = 0.0722 * data3[0] + 0.7152 * data3[1] + 0.2126 * data3[2]
    # plt.imshow(tgi_band)
    # plt.show()
    data4 = data3.copy()
    data4 = np.asarray(data4, dtype=np.float32)
    for chan_i in range(3):
        min_val = bandstats[chan_i]['min']
        max_val = bandstats[chan_i]['max']
        data4[chan_i] = (data4[chan_i] / (max_val - min_val))
    img_1 = np.array(data4).swapaxes(0, 1).swapaxes(1, 2)
    hsv_image = rgb_to_hsv(img_1)
    # plt.imshow(hsv_image)
    # plt.show()
    data5 = np.array(hsv_image).swapaxes(2, 1).swapaxes(1, 0)
    img2 = np.array([
        data3[0], data3[1], data3[2],
        data5[0] * 360, data5[1] * 100, data5[2] * 100,
        tgi_band, grayImg
    ]).swapaxes(0, 1).swapaxes(1, 2)
    output = path_main + r'\MUL-PanSharpen_2' + "\\" + file
    driver = gdal.GetDriverByName("GTiff")
    dst_ds = driver.Create(output, ds.RasterXSize, ds.RasterYSize,
                           (img2.shape[2]), gdal.GDT_UInt16)  # gdal.GDT_Byte/GDT_UInt16
    for i in range(1, img2.shape[2] + 1):
        dst_ds.GetRasterBand(i).WriteArray(img2[:, :, i - 1])
        dst_ds.GetRasterBand(i).ComputeStatistics(False)
    dst_ds.SetProjection(ds.GetProjection())
    dst_ds.SetGeoTransform(ds.GetGeoTransform())
    return 0

def main(database):
    # Commits per committer, limited to the 40 committers with the highest
    # accumulated activity
    query = "select count(*) from scmlog group by committer_id order by count(*) desc limit 40"

    # Connecting to the data base and retrieving data
    connector = connect(database)
    results = int(connector.execute(query))
    if results > 0:
        results_aux = connector.fetchall()
    else:
        print("Error when retrieving data")
        return

    # Moving data to a list
    commits = []
    for commit in results_aux[5:]:
        # for commits in results_aux:
        commits.append(int(commit[0]))

    # Calculating basic statistics
    print "max: " + str(sp.amax(commits))
    print "min: " + str(sp.amin(commits))
    print "mean: " + str(sp.mean(commits))
    print "median: " + str(sp.median(commits))
    print "std: " + str(sp.std(commits))
    print ".25 quartile: " + str(sp.percentile(commits, 25))
    print ".50 quartile: " + str(sp.percentile(commits, 50))
    print ".75 quartile: " + str(sp.percentile(commits, 75))

def __calc_rgb_multiband_cut_threshold(area_id, datapath):
    prefix = area_id_to_prefix(area_id)
    band_values = {k: [] for k in range(3)}
    band_cut_th = {k: dict(max=0, min=0) for k in range(3)}

    image_id_list = pd.read_csv(
        FMT_VALTRAIN_IMAGELIST_PATH.format(prefix=prefix)).ImageId.tolist()
    for image_id in tqdm.tqdm(image_id_list[:500]):
        image_fn = get_train_image_path_from_imageid(image_id, datapath)
        with rasterio.open(image_fn, 'r') as f:
            values = f.read().astype(np.float32)
            for i_chan in range(3):
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array([v for v in values_ if v != 0])  # Remove censored mask
                band_values[i_chan].append(values_)

    image_id_list = pd.read_csv(
        FMT_VALTEST_IMAGELIST_PATH.format(prefix=prefix)).ImageId.tolist()
    for image_id in tqdm.tqdm(image_id_list[:500]):
        image_fn = get_train_image_path_from_imageid(image_id, datapath)
        with rasterio.open(image_fn, 'r') as f:
            values = f.read().astype(np.float32)
            for i_chan in range(3):
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array([v for v in values_ if v != 0])  # Remove censored mask
                band_values[i_chan].append(values_)

    logger.info("Calc percentile point ...")
    for i_chan in range(3):
        band_values[i_chan] = np.concatenate(band_values[i_chan]).ravel()
        band_cut_th[i_chan]['max'] = scipy.percentile(band_values[i_chan], 98)
        band_cut_th[i_chan]['min'] = scipy.percentile(band_values[i_chan], 2)

    return band_cut_th

def sliding_window(im_data, window_width, baseline_percentile):
    """Calculate df/f using a sliding window of given width centered around
    each timepoint and take the nth percentile as the baseline.
    """
    result = np.empty(im_data.shape)
    half_width = int(np.ceil(window_width / 2))  # int so it can serve as a slice bound
    for (roi, timepoint, cycle), value in np.ndenumerate(im_data):
        # define the window extent
        if timepoint - half_width < 0:
            window_start = 0
        else:
            window_start = timepoint - half_width
        if timepoint + half_width > im_data.shape[1]:
            window_end = im_data.shape[1]
        else:
            window_end = timepoint + half_width
        # calculate the baseline as a percentile within the window
        baseline = percentile(im_data[roi, window_start:window_end, cycle],
                              baseline_percentile)
        # calculate df/f
        result[roi, timepoint, cycle] = (value - baseline) / baseline
    return result

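# Minimal usage sketch for sliding_window above (hypothetical data; assumes
# the bare `percentile` resolves to numpy.percentile, which older SciPy
# re-exported as scipy.percentile), with im_data shaped (rois, timepoints, cycles):
import numpy as np
from numpy import percentile

im_data = 1.0 + np.random.rand(2, 50, 3)  # offset keeps baselines nonzero
dff = sliding_window(im_data, window_width=10, baseline_percentile=8)
print(dff.shape)  # (2, 50, 3)
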
def execLandMetric(self, name, nodata):
    if name == "LC_Mean":
        return unicode(name), numpy.mean(self.array[self.array != nodata], dtype=numpy.float64)
    if name == "LC_Sum":
        return unicode(name), numpy.sum(self.array[self.array != nodata], dtype=numpy.float64)
    if name == "LC_Min":
        return unicode(name), numpy.min(self.array[self.array != nodata])
    if name == "LC_Max":
        return unicode(name), numpy.max(self.array[self.array != nodata])
    if name == "LC_SD":
        return unicode(name), numpy.std(self.array[self.array != nodata], dtype=numpy.float64)
    if name == "LC_LQua":
        return unicode(name), scipy.percentile(self.array[self.array != nodata], 25)
    if name == "LC_Med":
        return unicode(name), numpy.median(self.array[self.array != nodata])
    if name == "LC_UQua":
        return unicode(name), scipy.percentile(self.array[self.array != nodata], 75)
    if name == "DIV_SH":
        if len(self.classes) == 1:
            func.DisplayError(self.iface, "LecoS: Warning",
                              "This tool needs at least two landcover classes to calculate landscape diversity!", "WARNING")
            return unicode(name), "NaN"
        else:
            return unicode(name), self.f_returnDiversity("shannon", nodata)
    if name == "DIV_EV":
        if len(self.classes) == 1:
            func.DisplayError(self.iface, "LecoS: Warning",
                              "This tool needs at least two landcover classes to calculate landscape diversity!", "WARNING")
            return unicode(name), "NaN"
        else:
            return unicode(name), self.f_returnDiversity("eveness", nodata)
    if name == "DIV_SI":
        if len(self.classes) == 1:
            func.DisplayError(self.iface, "LecoS: Warning",
                              "This tool needs at least two landcover classes to calculate landscape diversity!", "WARNING")
            return unicode(name), "NaN"
        else:
            return unicode(name), self.f_returnDiversity("simpson", nodata)

def main(database):
    # Commits per committer, limited to the 40 committers with the highest
    # accumulated activity
    query = "select count(*) from scmlog group by committer_id order by count(*) desc limit 40"

    # Connecting to the data base and retrieving data
    connector = connect(database)
    results = int(connector.execute(query))
    if results > 0:
        results_aux = connector.fetchall()
    else:
        print("Error when retrieving data")
        return

    # Moving data to a list
    commits = []
    for commit in results_aux[5:]:
        # for commits in results_aux:
        commits.append(int(commit[0]))

    # Calculating basic statistics
    print "max: " + str(sp.amax(commits))
    print "min: " + str(sp.amin(commits))
    print "mean: " + str(sp.mean(commits))
    print "median: " + str(sp.median(commits))
    print "std: " + str(sp.std(commits))
    print ".25 quartile: " + str(sp.percentile(commits, 25))
    print ".50 quartile: " + str(sp.percentile(commits, 50))
    print ".75 quartile: " + str(sp.percentile(commits, 75))

def execLandMetric(self, name, nodata):
    if name == "LC_Mean":
        return unicode(name), numpy.mean(self.array[self.array != nodata], dtype=numpy.float64)
    if name == "LC_Sum":
        return unicode(name), numpy.sum(self.array[self.array != nodata], dtype=numpy.float64)
    if name == "LC_Min":
        return unicode(name), numpy.min(self.array[self.array != nodata])
    if name == "LC_Max":
        return unicode(name), numpy.max(self.array[self.array != nodata])
    if name == "LC_SD":
        return unicode(name), numpy.std(self.array[self.array != nodata], dtype=numpy.float64)
    if name == "LC_LQua":
        return unicode(name), scipy.percentile(self.array[self.array != nodata], 25)
    if name == "LC_Med":
        return unicode(name), numpy.median(self.array[self.array != nodata])
    if name == "LC_UQua":
        return unicode(name), scipy.percentile(self.array[self.array != nodata], 75)
    if name == "DIV_SH":
        if len(self.classes) == 1:
            func.DisplayError(self.iface, "LecoS: Warning",
                              "This tool needs at least two landcover classes to calculate landscape diversity!", "WARNING")
            return unicode(name), "NaN"
        else:
            return unicode(name), self.f_returnDiversity("shannon", nodata)
    if name == "DIV_EV":
        if len(self.classes) == 1:
            func.DisplayError(self.iface, "LecoS: Warning",
                              "This tool needs at least two landcover classes to calculate landscape diversity!", "WARNING")
            return unicode(name), "NaN"
        else:
            return unicode(name), self.f_returnDiversity("eveness", nodata)
    if name == "DIV_SI":
        if len(self.classes) == 1:
            func.DisplayError(self.iface, "LecoS: Warning",
                              "This tool needs at least two landcover classes to calculate landscape diversity!", "WARNING")
            return unicode(name), "NaN"
        else:
            return unicode(name), self.f_returnDiversity("simpson", nodata)

def test_single_parameter_percentile():
    dist_f = PercentileDistanceFunction(measures_to_use=["a"])
    abc = MockABC([{"a": -3}, {"a": 3}, {"a": 10}])
    dist_f.initialize(abc.sample_from_prior())
    d = dist_f({"a": 1}, {"a": 2})
    expected = (
        1 / (sp.percentile([-3, 3, 10], 80) - sp.percentile([-3, 3, 10], 20)))
    assert expected == d

def stats(self, *args, **kwargs):
    import scipy as sp
    result = {}
    cp.thread_data.conn.execute(
        "SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED")
    just_diffs = '''
        select upm.eqdiff,
               (upm.ratinguser - upm.increment) - (upm.ratingpos + upm.increment)
        from usersposmatchids upm
        where upm.eqdiff is not null
          and upm.increment is not null
          and upm.submittedat >= date_add(utc_timestamp(), interval - %s day)
          and upm.plasubms >= %s
          and upm.possubms >= %s
        limit 100000
    '''
    # First remove the positions where there is no previous rating.
    rows = [
        r for r in cp.thread_data.conn.execute(just_diffs, [
            int(kwargs.get('ld', 5000)),
            int(kwargs.get('pls', 20)),
            int(kwargs.get('pos', 30))
        ]) if r[1] is not None
    ]
    ratings = [r[1] for r in rows]
    diffs = [r[0] for r in rows]
    rating_quintiles = sp.percentile(ratings, [20, 40, 60, 80])
    result['mean_rating_of_quintile_to_figs'] = {}
    last_quint = -1000
    # note: older numpy returned a plain list for a sequence of percentiles,
    # so the + below appends a sentinel rather than broadcasting an addition
    for quint in rating_quintiles + [100000]:
        diffs = [r[0] for r in rows if last_quint < r[1] < quint]
        ratings = [r[1] for r in rows if last_quint < r[1] < quint]
        mean_rating_of_quintile = sp.mean(ratings)
        if len(diffs) > 0:
            n = len(diffs)
            mean = sp.mean(diffs)
            std = sp.std(diffs)
            quartiles = sp.percentile(diffs, [25, 50, 75])
            last_quint = quint
            figs = {}
            result['mean_rating_of_quintile_to_figs'][mean_rating_of_quintile] = figs
            figs['n'] = n
            figs['mean'] = mean
            figs['std'] = std
            figs['quartiles'] = quartiles
    cp.thread_data.conn.commit()
    return result

def get_list_statistics(lst):
    return {
        'min': amin(lst),
        'max': amax(lst),
        'avg': mean(lst),
        'median': median(lst),
        'std': std(lst),
        'q1': percentile(lst, 25),
        'q3': percentile(lst, 75)
    }

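# Usage sketch for get_list_statistics (assumption: the bare names above come
# from an older SciPy import such as `from scipy import amin, amax, mean,
# median, std, percentile`; numpy exposes the same names on a modern stack):
from numpy import amin, amax, mean, median, std, percentile

stats = get_list_statistics([2, 4, 4, 4, 5, 5, 7, 9])
print(stats['median'])  # 4.5
print(stats['q1'])      # 4.0
print(stats['q3'])      # 5.5
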
def key_stats(values):
    return dict({
        'mean': sp.mean(values),
        'std': sp.std(values) / sp.sqrt(len(values)),  # standard error of the mean, despite the key name
        'max': sp.array(values).max(),
        'min': sp.array(values).min(),
        'median': sp.median(values),
        'p25': sp.percentile(values, 25),
        'p75': sp.percentile(values, 75),
        'values': sp.array(values)
    })

def Disp(Img, vmin=0, vmax=0, fname=""):
    from scipy import percentile
    import matplotlib.pyplot as plt
    if (vmin == vmax):
        vmin = percentile(Img, 2)
        vmax = percentile(Img, 98)
    plt.imshow(Img, vmin=vmin, vmax=vmax, cmap=plt.get_cmap('gray'), interpolation='None')
    plt.axis('off')
    if (fname != ""):
        plt.savefig(fname, bbox_inches='tight')

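# Usage sketch for Disp: show a synthetic image with a 2-98 percentile stretch
# and save it (the file name is illustrative; `from scipy import percentile`
# inside Disp assumes an older SciPy where that name still exists):
import numpy as np

img = np.random.randn(64, 64) ** 2  # heavy-tailed values benefit from clipping
Disp(img, fname="disp_example.png")
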
def stats(log_files):
    for log_file in log_files:
        print log_file
        f = open(log_file).read()

        print 'OVERALL'
        for field, num in re.findall(r'\[OVERALL\], ([a-zA-Z()/]*), (\d*)', f):
            print '%s\t%s' % (field.rjust(20), num)

        print 'READ'
        for field, num in re.findall(r'\[READ\], ([a-zA-Z()/]*), (\d*)', f):
            print '%s\t%s' % (field.rjust(20), num)
        read_lat = [float(x) for x in re.findall(r'\[READ\], \d*, ([\d.]*)', f)]
        c = scipy.percentile(read_lat, 25)
        o = scipy.percentile(read_lat, 75)
        h = scipy.percentile(read_lat, 99)
        l = scipy.percentile(read_lat, 1)
        print '%s\t%s' % ('open'.rjust(20), o)
        print '%s\t%s' % ('high'.rjust(20), h)
        print '%s\t%s' % ('low'.rjust(20), l)
        print '%s\t%s' % ('close'.rjust(20), c)

        print 'INSERT'
        for field, num in re.findall(r'\[INSERT\], ([a-zA-Z()/]*), (\d*)', f):
            print '%s\t%s' % (field.rjust(20), num)
        insert_lat = [float(x) for x in re.findall(r'\[INSERT\], \d*, ([\d.]*)', f)]
        c = scipy.percentile(insert_lat, 25)
        o = scipy.percentile(insert_lat, 75)
        h = scipy.percentile(insert_lat, 90)
        l = scipy.percentile(insert_lat, 10)
        print '%s\t%s' % ('open'.rjust(20), o)
        print '%s\t%s' % ('high'.rjust(20), h)
        print '%s\t%s' % ('low'.rjust(20), l)
        print '%s\t%s' % ('close'.rjust(20), c)

def resize_original_im(img):
    data3 = img.copy()
    bandstats = {k: dict(max=0, min=0) for k in range(3)}
    for i in range(3):
        bandstats[i]['min'] = scipy.percentile(data3[i], 2)
        bandstats[i]['max'] = scipy.percentile(data3[i], 98)
    for chan_i in range(3):
        min_val = bandstats[chan_i]['min']
        max_val = bandstats[chan_i]['max']
        data3[chan_i] = np.clip(data3[chan_i], min_val, max_val)
        data3[chan_i] = (data3[chan_i] - min_val) / (max_val - min_val) * 255
    img2 = np.array(data3).swapaxes(0, 1).swapaxes(1, 2)
    return img2

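# Usage sketch for resize_original_im (assumption: the input is a
# channel-first (3, H, W) float array, as the per-band data3[i] indexing
# implies, and `scipy` is an older release that still has scipy.percentile):
import numpy as np

raw = np.random.rand(3, 32, 32).astype(np.float32) * 2047.0
rgb = resize_original_im(raw)
print(rgb.shape)  # (32, 32, 3), values stretched into [0, 255]
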
def plot_correlations_vi(model, input, features_names: List = None, save_path=None):
    """ Plot the correlations """
    output = model.forward(*input)
    pred = output[2]
    num_dim = pred.shape[1]
    data_x = input[0]
    fig, axes = plt.subplots(num_dim, 1, figsize=(3 * 1, 3 * num_dim),
                             squeeze=False, sharex=False, sharey=False)
    for ix in range(num_dim):
        axes[ix, 0].axhline(y=0, xmin=-1, xmax=1, linestyle="--", color='red')
        axes[ix, 0].plot(data_x[:, ix].cpu().data.numpy(),
                         pred[:, ix].cpu().data.numpy(),
                         ms=4, marker=".", linestyle="")
        min_val = scipy.percentile(data_x[:, ix], 1)
        max_val = scipy.percentile(data_x[:, ix], 99)
        axes[ix, 0].set_xlim([min_val, max_val])
        min_val = scipy.percentile(pred[:, ix].cpu().data.numpy(), 1)
        max_val = scipy.percentile(pred[:, ix].cpu().data.numpy(), 99)
        axes[ix, 0].set_ylim([min_val, max_val])
        if features_names is None:
            axes[ix, 0].set_xlabel(f"target_{ix}")
            axes[ix, 0].set_ylabel(f"pred_{ix}")
        else:
            axes[ix, 0].set_xlabel(f"{features_names[ix]}")
            axes[ix, 0].set_ylabel(f"pred_{features_names[ix]}")
    fig.suptitle("Correlation between predictor and indicator")
    # plt.show()
    if save_path is not None:
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        fig.savefig(save_path, bbox_inches='tight', format='png', dpi=200)
        plt.close(fig)
    return fig

def get_color(listing, listings, f, cm):
    price = f(listing)
    prices = [f(l) for l in listings]
    lower = sp.percentile(prices, 10)
    upper = sp.percentile(prices, 90)
    relative_price = (price - lower) / (upper - lower)
    color = cm(sp.clip(relative_price, 0, 1))
    is_dark = sum(color[:3]) / 4 < 0.4
    background_color = tuple([int(255 * c) for c in color[:3]])
    text_color = (230, 230, 230) if is_dark else (50, 50, 50)
    return background_color, text_color

def make_features(self, s):
    from scipy import percentile
    features_koh, blocks = Kohlschuetter.make_features(s)
    if features_koh is None:
        return None, blocks

    features = np.zeros((features_koh.shape[0], 6 + 4 + 2))
    features[:, :6] = features_koh[:]

    # a global feature based on connected blocks of long text
    # inspired by Arias
    block_lengths = np.array([len(block.text) for block in blocks])
    index = block_lengths.argmax()
    k = 6
    for c in [0.15, 0.3333]:
        for window in [1, 4]:
            cutoff = int(percentile(block_lengths, 97) * c)
            lowindex, highindex = KohlschuetterExpanded.strip(block_lengths, index, window, cutoff)
            features[lowindex:(highindex + 1), k] = 1.0
            k += 1

    features[:, -2:] = capital_digit_features(blocks)
    normalize_features(features, self._mean_std)
    return features, blocks

def discretizeY(Y, col, firstThresh=33.3333, secondThresh=66.6666):
    '''
    Discretize and return a specific column of Y. The strategy is to keep
    the data with score <= 33rd percentile as the "low" group, score >= 66th
    percentile as the "high" group, and the middle as the "medium" group.
    '''
    y = Y[:, col]
    if kwlist[col] == 'Totalviews':
        y = np.log(y)
    lowthresh = sp.percentile(y, firstThresh)
    hithresh = sp.percentile(y, secondThresh)
    y[y <= lowthresh] = -1  # Low group
    y[y >= hithresh] = 1  # High group
    y[(y > lowthresh) * (y < hithresh)] = 0  # Medium group
    return y

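# Usage sketch for discretizeY (hypothetical inputs; note it edits the
# selected column of Y in place and relies on a module-level `kwlist`):
import numpy as np
import scipy as sp  # older SciPy, where sp.percentile exists

kwlist = ['Score']  # hypothetical keyword list for the columns of Y
Y = np.random.randn(300, 1)
labels = discretizeY(Y, 0)
print(np.unique(labels))  # [-1.  0.  1.]
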
def percentile(self, percentile):
    """Calculate a given spectral percentile for this `Spectrogram`.

    Parameters
    ----------
    percentile : `float`
        percentile (0 - 100) of the bins to compute

    Returns
    -------
    spectrum : `~gwpy.frequencyseries.FrequencySeries`
        the given percentile `FrequencySeries` calculated from this
        `SpectralVariance`
    """
    out = scipy.percentile(self.value, percentile, axis=0)
    name = '%s %s%% percentile' % (self.name, percentile)
    return FrequencySeries(
        out, epoch=self.epoch, channel=self.channel, name=name,
        f0=self.f0, df=self.df,
        frequencies=(hasattr(self, '_frequencies') and self.frequencies or None))

def two_channel_to_color(im):
    """Converts a two-channel microarray image to a color image, as described
    in the paper associated with this codebase"""
    lower = sp.percentile(im, 5)
    upper = sp.percentile(im, 98)

    channel_0 = sp.clip((im[:, :, 0] - lower) / (upper - lower), 0, 1)
    channel_2 = sp.clip((im[:, :, 1] - lower) / (upper - lower), 0, 1)
    channel_1 = ((channel_0 + channel_2) / 2.)

    im = sp.array((channel_0, channel_1, channel_2))
    im = sp.rollaxis(im, 0, 3)

    im = (255 * im).astype(sp.uint8)

    return im

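# Usage sketch for two_channel_to_color (assumption: `im` is an (H, W, 2)
# two-channel array; aliasing numpy as sp stands in for the older SciPy whose
# percentile/clip/rollaxis/uint8 names the function body uses):
import numpy as sp
import numpy as np

im = np.random.rand(16, 16, 2)
rgb = two_channel_to_color(im)
print(rgb.shape)  # (16, 16, 3)
print(rgb.dtype)  # uint8
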
def get_zbins(cat, nzbin):
    edges = [0]
    for i in range(1, nzbin + 1):
        edges += [sp.percentile(cat['PHOTOZ_GAUSSIAN'], i * 100. / nzbin)]
        print 'bin %d' % i
    return edges

def plotprofile(confs, nreps, path, tol=0.9, target=1e-6):
    f = []
    a = []
    pmax = 1
    for i in range(pmax):
        f_, a_ = plt.subplots(1)
        for item in ([a_.title, a_.xaxis.label, a_.yaxis.label] +
                     a_.get_xticklabels() + a_.get_yticklabels()):
            item.set_fontsize(10)
        f.append(f_)
        a.append(a_)
    colorlist = ['b', 'r', 'g', 'purple', 'k', 'grey', 'orange', 'c', 'lightgreen', 'lightblue', 'pink',
                 'b', 'r', 'g', 'purple', 'k', 'grey', 'orange', 'c', 'lightgreen', 'lightblue', 'pink']
    lslist = ['solid', 'dashed', 'dashdot', 'dotted', 'solid', 'dashed', 'dashdot', 'dotted',
              'solid', 'dashed', 'dashdot', 'dotted', 'solid', 'dashed', 'dashdot', 'dotted',
              'solid', 'dashed', 'dashdot', 'dotted']
    ci = -1
    for C in confs:
        print('plotting {}...'.format(C[0]))
        ci += 1
        col = colorlist[ci]
        line = lslist[ci]
        # collect the data
        data = []
        xoverheads = sp.empty(nreps)
        xntotarget = sp.zeros(nreps)
        overheads = sp.empty(nreps)
        noverheads = [sp.empty(nreps) for i in range(len(C[1]['N']))]
        ninrun = sp.zeros(nreps)
        support = sp.logspace(-2, 5, 200)
        success = sp.zeros(nreps)
        for ii in range(nreps):
            D = gpbo.optimize.readoptdata(os.path.join(path, '{}_{}.csv'.format(C[0], ii)))
            A = sp.array(D['trueyatxrecc'].values)
            if A.min() >= target:
                xoverheads[ii] = sum(D['taq'])
                overheads[ii] = sum(D['taq'])
                for k, n in enumerate(C[1]['N']):
                    noverheads[k][ii] = sum(D['taq'][:n])
            else:
                success[ii] = 1
                i = sp.argmax(A <= target)  # while A[i]>=target:
                xoverheads[ii] = sum(D['taq'][:i + 1])
                overheads[ii] = sum(D['taq'])
                xntotarget[ii] = i
                ninrun[ii] = len(D['taq'])
                for k, n in enumerate(C[1]['N']):
                    noverheads[k][ii] = sum(D['taq'].values[:n])
        if sp.mean(success) >= tol:
            if C[1]['oracle']:
                a[0].plot(support, sp.mean(xoverheads) + sp.mean(xntotarget) * support,
                          col, label=C[0] + 'oracle', linestyle='dashdot')
            if C[1]['full']:
                a[0].plot(support, sp.mean(overheads) + sp.mean(ninrun) * support,
                          col, label=C[0] + '_all', linestyle='dashed')
            for k, n in enumerate(C[1]['N']):
                if sp.percentile(xntotarget, int(tol * 100)) < n:
                    a[0].plot(support, sp.mean(noverheads[k]) + n * support,
                              col, label=C[0] + str(n), linestyle='solid')
        else:
            print('{} only achieved target on {}'.format(C[0], sp.mean(success)))
    a[0].set_xscale('log')
    a[0].set_yscale('log')
    a[0].legend()
    f[0].savefig(os.path.join(path, 'profile_{}.png'.format(sp.log10(target))),
                 bbox_inches='tight', pad_inches=0.1)

def credibility_interval(post, alpha=1.):
    """Calculate bayesian credibility interval.

    Parameters:
    -----------
    post : array_like
        The posterior sample over which to calculate the bayesian credibility
        interval.
    alpha : float, optional
        Confidence level.

    Returns:
    --------
    med : float
        Median of the posterior.
    low : float
        Lower part of the credibility interval.
    up : float
        Upper part of the credibility interval.
    """
    z = erf(alpha / sp.sqrt(2))
    lower_percentile = 100 * (1 - z) / 2
    upper_percentile = 100 * (1 + z) / 2
    low, med, up = sp.percentile(post, [lower_percentile, 50, upper_percentile])
    return med, low, up

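# Usage sketch for credibility_interval: alpha=1 gives the ~68.3% (1-sigma)
# region, so on standard-normal samples low/up land near -1/+1 (numpy aliased
# as sp supplies the mean/sqrt/percentile names the function body expects):
import numpy as sp
from scipy.special import erf

post = sp.random.randn(100000)
med, low, up = credibility_interval(post, alpha=1.)
print((round(med, 2), round(low, 2), round(up, 2)))  # roughly (0.0, -1.0, 1.0)
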
def func(ms_):
    # Unless the distribution is extremely skewed, the MAP estimate should
    # fall within the 95% credible interval.
    lo, hi = scipy.percentile(ms_, q=[2.5, 97.5])
    kde = gaussian_kde(ms_)
    xs = scipy.linspace(lo, hi, 1000)
    ys = kde.evaluate(xs)
    return xs[scipy.argmax(ys)]

def getSizeFactor(fn_anno, data, gid, mode='sum', withXYMT=True, filterbyPC=True):
    '''
    input annotation, counts and gene ids
    output sum of protein coding gene levels excluding sex chromosomes
    and mitochondria genes
    '''
    anno = sp.loadtxt(fn_anno, delimiter='\t', dtype='string', usecols=[0, 2, 8])
    anno = anno[anno[:, 1] == 'gene', :]

    if not withXYMT:  # filter xymt
        anno = anno[anno[:, 0] != 'MT', :]
        anno = anno[anno[:, 0] != 'Y', :]
        anno = anno[anno[:, 0] != 'X', :]

    agid = [x.split(';')[0] for x in anno[:, 2]]  # clean gene id's
    agid = sp.array([x.split(" ")[1].strip('\"') for x in agid])

    if filterbyPC:  # filter protein coding
        gtpe = [x.split(';')[2] for x in anno[:, 2]]
        gtpe = sp.array([x.split('\"')[1].split('\"')[0] for x in gtpe])
        iPC = sp.where(gtpe == 'protein_coding')[0]
        agid = agid[iPC]

    iGn = sp.in1d(gid, agid)
    libsize = sp.sum(data[iGn, :], axis=0)
    if mode == 'uq':
        libsize = sp.array([sp.percentile(x[x != 0], 75) for x in data[iGn, :].T]) * iGn.sum()
    return libsize

def two_channel_to_color(im):
    """Converts a two-channel microarray image to a color image, as described
    in the paper associated with this codebase"""
    lower = sp.percentile(im, 5)
    upper = sp.percentile(im, 98)

    channel_0 = sp.clip((im[:, :, 0] - lower) / (upper - lower), 0, 1)
    channel_2 = sp.clip((im[:, :, 1] - lower) / (upper - lower), 0, 1)
    channel_1 = ((channel_0 + channel_2) / 2.)

    im = sp.array((channel_0, channel_1, channel_2))
    im = sp.rollaxis(im, 0, 3)

    im = (255 * im).astype(sp.uint8)

    return im

def skip_extrem(im):
    # Get percentiles
    pm, pM = sp.percentile(im, [2, 98])
    ims = im.copy()  # work on a copy so the input image is not modified
    ims[im < pm] = pm
    ims[im > pM] = pM
    return ims

def alt_results(self, samples, kplanets):
    titles = sp.array(["Period", "Amplitude", "Longitude", "Phase", "Eccentricity",
                       'Acceleration', 'Jitter', 'Offset', 'MACoefficient',
                       'MATimescale', 'Stellar Activity'])
    namen = sp.array([])
    ndim = kplanets * 5 + self.nins * 2 * (self.MOAV + 1) + self.totcornum + 1 + self.PACC
    RESU = sp.zeros((ndim, 5))
    for k in range(kplanets):
        namen = sp.append(namen, [titles[i] + '_' + str(k) for i in range(5)])
    namen = sp.append(namen, titles[5])  # for acc
    if self.PACC:
        namen = sp.append(namen, 'Parabolic Acceleration')
    for i in range(self.nins):
        namen = sp.append(namen, [titles[ii] + '_' + str(i + 1) for ii in sp.arange(2) + 6])
        for c in range(self.MOAV):
            namen = sp.append(namen, [titles[ii] + '_' + str(i + 1) + '_' + str(c + 1)
                                      for ii in sp.arange(2) + 8])
    for h in range(self.totcornum):
        namen = sp.append(namen, titles[-1] + '_' + str(h + 1))
    if self.PM:
        for g in range(self.nins_pm):
            for gg in range(self.lenppm):
                namen = sp.append(namen, 'Photometry param' + str(g) + '_' + str(gg + 1))

    alt_res = list(map(lambda v: (v[2], v[3] - v[2], v[2] - v[1], v[4] - v[2], v[2] - v[0]),
                       zip(*np.percentile(samples, [2, 16, 50, 84, 98], axis=0))))
    logdat = '\nAlternative results with uncertainties based on the 2nd, 16th, 50th, 84th and 98th percentiles of the samples in the marginalized distributions'
    logdat += '\nFormat is like median +- 1-sigma, +- 2-sigma'
    for res in range(ndim):
        logdat += '\n' + namen[res] + ' : ' + str(alt_res[res][0]) + ' +- ' + str(alt_res[res][1:3]) + ' 2% +- ' + str(alt_res[res][3:5])
        RESU[res] = sp.percentile(samples, [2, 16, 50, 84, 98], axis=0)[:, res]
    print(logdat)
    return RESU

def plotquartsends(a, xdata_, ydata_, col, line, lab, log=False, mean=False):
    xdata = [sp.array(i) for i in xdata_]
    ydata = [sp.array(i) for i in ydata_]
    n = len(xdata)
    ints = []
    starts = sp.empty(n)
    ends = sp.empty(n)
    yends = sp.empty(n)
    for i in xrange(n):
        starts[i] = xdata[i][0]
        ends[i] = xdata[i][-1]
        yends[i] = ydata[i][-1]
    yendorder = sp.argsort(yends)
    Ydata = [sp.hstack([y[0], y, y[-1]]) for y in ydata]
    # the pad values are slightly outside the true range so that
    # exp(log(value)) stays in the interpolation range
    Xdata = [sp.hstack([0.999 * min(starts), x, max(ends) * 1.001]) for x in xdata]
    # print(min(starts), max(ends))
    for i in xrange(n):
        # print(Xdata[i][0], Xdata[i][-1])
        ints.append(sp.interpolate.interp1d(Xdata[i], Ydata[i]))
        # a.plot(Xdata[i], Ydata[i], 'lightblue')
    if log:
        x = sp.logspace(sp.log10(min(starts)), sp.log10(max(ends)), 200)
    else:
        x = sp.linspace(min(starts), max(ends), 200)
    # print(x)
    if mean:
        a.plot(x, map(lambda x: sp.mean([i(x) for i in ints]), x), color=col, label=lab)
    else:
        a.plot(x, map(lambda x: sp.percentile([i(x) for i in ints], 50), x), color=col, label=lab)
    # m = map(lambda x: sp.mean([i(x) for i in ints]), x)
    # v = map(lambda x: sp.mean([i(x) for i in ints]), x)
    # a.plot(x, map(lambda x: sp.mean([i(x) for i in ints]), x), color=col, label=lab)
    y25 = map(lambda x: sp.percentile([i(x) for i in ints], 25), x)
    y75 = map(lambda x: sp.percentile([i(x) for i in ints], 75), x)
    a.fill_between(x, y25, y75, edgecolor=col, facecolor=col, lw=0.0, alpha=0.1)
    # a.plot(ends[yendorder], yends[yendorder], '.', color=col, linestyle=line)
    # print("endvalues: {}".format(yends))
    a2 = a.twinx()
    a2.grid(False)
    a2.plot(ends[sp.argsort(ends)], sp.linspace(1, 0, n), color=col,
            linestyle='--', linewidth=0.4)
    a2.set_ylabel('fraction of optimizations still running')
    return

def topWords(self, vote, plotWordCloud=True):
    freqs = list(self.wordRelativeFreqDic.values())
    topWord = {}
    percLimit = 1
    # print("Top words on ", vote)
    if vote == 'Yes':
        thr = sp.percentile(freqs, q=100 - percLimit)
    else:
        thr = sp.percentile(freqs, q=percLimit)
    total = 0
    for pair in self.wordRelativeFreqDic.items():
        if (vote == 'Yes' and pair[1] > thr) or (vote == 'No' and pair[1] < thr):
            topWord[pair[0]] = abs(pair[1])
            total += topWord[pair[0]]
    if plotWordCloud:
        for key in topWord:
            topWord[key] = topWord[key] / total
            print(key, topWord[key])
        wordcloud = WordCloud(max_font_size=40, relative_scaling=.5,
                              background_color='white',
                              max_words=50).generate_from_frequencies(topWord.items())
        plt.figure()
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.savefig('Temp/WordCloud_TopOnly' + vote + '.png')
        plt.close()
    listKeys = list(topWord.keys())
    try:
        listKeys.remove('sim')
    except:
        pass
    try:
        listKeys.remove('nao')
    except:
        pass
    return listKeys

# topDif = topDifferentWords(getVoteData())
#
# print(topDif.topWords('Yes'))
# print(topDif.topWords('No'))

def returnArrayHigherQuant(self, array):
    if numpy.size(array) != 0 and self.count_nonzero(array) != 0:
        try:
            return scipy.percentile(array[array != self.nodata], 75)
        except ValueError:
            return None
    else:
        return None

def returnArrayHigherQuant(self, array):
    if numpy.size(array) != 0 and self.count_nonzero(array) != 0:
        try:
            return scipy.percentile(array[array != self.nodata], 75)
        except ValueError:
            return None
    else:
        return None

def _add_colorbar_(self, axcolorbar=None, label=None, verticale=True,
                   no_ticks=True, add_legend=True):
    """
    """
    if axcolorbar is None:
        if verticale:
            self.axcbar = self.fig.add_axes([0.9, 0.10, 0.03, 0.80])
        else:
            self.axcbar = self.fig.add_axes([0.10, 0.9, 0.80, 0.04])
    else:
        self.axcbar = axcolorbar
    # -----------------
    vmin, vmax = self._skyPlot_color_ranges_
    norm = P.matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
    if verticale:
        x, y = N.mgrid[1:10:0.05, 1:10]
        self._colorbar_ = self.axcbar.imshow(10 - x, cmap=self.scatter_cmap)
    else:
        x, y = N.mgrid[1:10, 1:10:0.1]
        self._colorbar_ = self.axcbar.imshow(10 - x, cmap=self.scatter_cmap)
    if label is not None:
        self.axcbar.set_xlabel(label, fontsize=fontsize_label)
    if no_ticks:
        self.axcbar.set_xticks([])
        self.axcbar.set_yticks([])
    if add_legend:
        if verticale:
            loc = (0.5, 1.)
        else:
            loc = (1., .5)
        print "legend"
        self.axcbar.text(loc[0], loc[1],
                         r"$\mathrm{%s}$" % self._skyPlot_colored_by_,
                         fontsize="large", va="bottom", ha="center",
                         transform=self.axcbar.transAxes)
    range_percent_to_show = [0, 0.33, 0.66, 1]
    if verticale:
        [self.axcbar.text(1.02, x,
                          r"$%+.1e$" % (percentile(self._colored_values_,
                                                   (x * (vmax - vmin) + vmin) * 100)),
                          fontsize="small", va="center", ha="left",
                          transform=self.axcbar.transAxes)
         for x in range_percent_to_show]
    else:
        [self.axcbar.text(1.0 - x * 0.95, -.2, r"$%+.1e$" % (x * 100),
                          fontsize="small", va="top", ha="center",
                          transform=self.axcbar.transAxes)
         for x in range_percent_to_show]

def _load_sky_scatter_color_(self, colored_by, default_color="b", vmin=0, vmax=1):
    """
    """
    self._skyPlot_colored_by_ = colored_by
    self._skyPlot_color_ranges_ = [vmin, vmax]
    if colored_by is None:
        self._colored_values_ = None
        self._color_used_ = default_color
        return None
    if colored_by not in dir(self.Samp):
        raise ValueError("Sorry I don't have any %s in module self.Samp" % colored_by)
    self._colored_values_ = N.asarray(self.Samp.__dict__[colored_by])
    self._color_used_ = self.scatter_cmap(
        (self._colored_values_ - percentile(self._colored_values_, vmin * 100.))
        / (percentile(self._colored_values_, vmax * 100.)
           - percentile(self._colored_values_, vmin * 100.)))

def find_photoz_bin_edges(self, nbin, sel=None, binning='mode'):
    if sel is None:
        sel = np.in1d(self.data['pz_cond_%s' % binning],
                      self.data['pz_cond_%s' % binning])
    self.data['tomographic_bin_edges'] = [min(self.data['pz_cond_%s' % binning][sel])]
    for i in range(1, nbin + 1):
        edge = sp.percentile(self.data['pz_cond_%s' % binning][sel], i * 100.0 / nbin)
        self.data['tomographic_bin_edges'] += [edge]
    self.data['tomographic_bin_edges'] += [max(self.data['pz_cond_%s' % binning][sel])]
    self.data['tomographic_bin_edges'] = np.array(self.data['tomographic_bin_edges'])

def pairwise_dists(data, nneighbors=10, folder='model', dist='l2'):
    '''
    Computes pairwise distances between bag-of-words vectors of articles

    INPUT
    folder      model folder
    nneighbors  number of closest neighbors to include in distance list
    '''
    stopwords = codecs.open("stopwords.txt", "r", encoding="utf-8",
                            errors='ignore').readlines()[5:]
    stops = map(lambda x: x.lower().strip(), stopwords)
    # using now stopwords and filtering out digits
    bow = TfidfVectorizer(min_df=2, stop_words=stops)
    X = bow.fit_transform(data)
    print 'Computing %s pairwise distances' % dist
    # KPCA transform bow vectors
    if dist == 'l2_kpca_zscore':
        K = pairwise_distances(X, metric='l2', n_jobs=1)
        perc = 50.0
        width = percentile(K.flatten(), perc)
        Xc = zscore(KernelPCA(n_components=50, kernel='rbf', gamma=width).fit_transform(X))
        K = pairwise_distances(Xc, metric='l2', n_jobs=1)
    elif dist == 'l2_kpca':
        K = pairwise_distances(X, metric='l2', n_jobs=1)
        perc = 100. / len(data)
        width = percentile(K.flatten(), perc)
        Xc = KernelPCA(n_components=50, kernel='rbf', gamma=width).fit_transform(X)
        K = pairwise_distances(Xc, metric='l2', n_jobs=1)
    elif dist == 'l2':
        K = pairwise_distances(X, metric='l2', n_jobs=1)
    elif dist == 'l1':
        K = pairwise_distances(X, metric='l1', n_jobs=1)
    # collect closest neighbors
    distances = []
    for urlidx in range(len(data)):
        idx = (K[urlidx, :]).argsort()[1:nneighbors + 1]
        for sidx in idx:
            distances.append([urlidx, sidx, (idx == sidx).nonzero()[0][0]])
    return distances

def f_MedianPatchArea(self, cl=None, name="Median patch area", niv=50):
    res = []
    for group in self.groups:
        r = self.returnGroupArea(group, cl)
        try:
            v = scipy.percentile(r, niv)
        except ValueError:  # Catch empty array
            v = "NULL"
        res.append([group, name, v])
    return res

def kpca_cluster(data, nclusters=100, ncomponents=40, topwhat=10, zscored=False):
    '''
    Computes clustering of bag-of-words vectors of articles

    INPUT
    folder      model folder
    nclusters   number of clusters
    '''
    from sklearn.cluster import KMeans
    # filtering out some noise words
    stops = map(lambda x: x.lower().strip(), open('stopwords.txt').readlines()[6:])

    # vectorize non-stopwords
    bow = TfidfVectorizer(min_df=2, stop_words=stops)
    X = bow.fit_transform(data)

    # creating bow-index-to-word map
    idx2word = dict(zip(bow.vocabulary_.values(), bow.vocabulary_.keys()))

    # using now stopwords and filtering out digits
    print 'Computing pairwise distances'
    K = pairwise_distances(X, metric='l2', n_jobs=1)
    perc = 50.0
    width = percentile(K.flatten(), perc)

    # KPCA transform bow vectors
    Xc = KernelPCA(n_components=ncomponents, kernel='rbf', gamma=width).fit_transform(X)

    if zscored:
        Xc = zscore(Xc)

    # compute clusters
    km = KMeans(n_clusters=nclusters).fit(Xc)
    Xc = km.predict(Xc)

    clusters = []
    for icluster in range(nclusters):
        nmembers = (Xc == icluster).sum()
        if True:  # nmembers < len(data) / 5.0 and nmembers > 1:
            # only group clusters big enough but not too big
            members = (Xc == icluster).nonzero()[0]
            topwordidx = array(X[members, :].sum(axis=0))[0].argsort()[-topwhat:][::-1]
            topwords = ' '.join([idx2word[wi] for wi in topwordidx])
            meanDist = triu(pairwise_distances(X[members, :], metric='l2', n_jobs=1)).sum()
            meanDist = meanDist / (len(members) + (len(members) ** 2 - len(members)) / 2.0)
            # print u'Cluster %d' % icluster + u' %d members' % nmembers + u' mean Distance %f' % meanDist + u'\n\t' + topwords
            clusters.append({
                'name': 'Cluster-%d' % icluster,
                'description': topwords,
                'members': list(members),
                'meanL2Distances': meanDist
            })

    return clusters

def __calc_mul_multiband_cut_threshold(area_id, datapath):
    prefix = area_id_to_prefix(area_id)
    band_values = {k: [] for k in range(8)}
    band_cut_th = {k: dict(max=0, min=0) for k in range(8)}

    image_id_list = pd.read_csv(FMT_VALTRAIN_IMAGELIST_PATH.format(
        prefix=prefix)).ImageId.tolist()
    for image_id in tqdm.tqdm(image_id_list[:500]):
        image_fn = get_train_image_path_from_imageid(
            image_id, datapath, mul=True)
        with rasterio.open(image_fn, 'r') as f:
            values = f.read().astype(np.float32)
            for i_chan in range(8):
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array(
                    [v for v in values_ if v != 0]
                )  # Remove censored mask
                band_values[i_chan].append(values_)

    image_id_list = pd.read_csv(FMT_VALTEST_IMAGELIST_PATH.format(
        prefix=prefix)).ImageId.tolist()
    for image_id in tqdm.tqdm(image_id_list[:500]):
        image_fn = get_train_image_path_from_imageid(
            image_id, datapath, mul=True)
        with rasterio.open(image_fn, 'r') as f:
            values = f.read().astype(np.float32)
            for i_chan in range(8):
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array(
                    [v for v in values_ if v != 0]
                )  # Remove censored mask
                band_values[i_chan].append(values_)

    logger.info("Calc percentile point ...")
    for i_chan in range(8):
        band_values[i_chan] = np.concatenate(band_values[i_chan]).ravel()
        band_cut_th[i_chan]['max'] = scipy.percentile(band_values[i_chan], 98)
        band_cut_th[i_chan]['min'] = scipy.percentile(band_values[i_chan], 2)

    return band_cut_th

def __call__(self, blocks, train=False):
    from scipy import percentile
    features = np.zeros((len(blocks), AriasFeatures.nfeatures))
    block_lengths = np.array([len(block.text) for block in blocks])
    index = block_lengths.argmax()
    cutoff = int(percentile(block_lengths, self._percent_cutoff))
    lowindex, highindex = AriasFeatures.strip(block_lengths, index,
                                              self._window, cutoff)
    features[lowindex:(highindex + 1), 0] = 1.0
    return features

def fit_dispersion(counts, disp_raw, disp_conv, sf, CFG):
    mean_count = sp.mean(counts / sf, axis=1)[:, sp.newaxis]
    index = sp.where(disp_conv)[0]

    lowerBound = sp.percentile(sp.unique(disp_raw[index]), 1)
    upperBound = sp.percentile(sp.unique(disp_raw[index]), 99)

    idx = sp.where((disp_raw > lowerBound) & (disp_raw < upperBound))[0]

    matrix = sp.ones((idx.shape[0], 2), dtype='float')
    matrix[:, 0] /= mean_count[idx].ravel()

    modGamma = sm.GLM(disp_raw[idx], matrix,
                      family=sm.families.Gamma(sm.families.links.identity))
    res = modGamma.fit()
    Lambda = res.params

    disp_fitted = disp_raw.copy()
    ok_idx = sp.where(~sp.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if sp.sum(disp_fitted > 0) > 0:
        print "Found dispersion fit"

    if CFG['debug']:
        fig = plt.figure(figsize=(8, 6), dpi=100)
        ax = fig.add_subplot(111)
        idx = sp.where(~sp.isnan(disp_fitted))[0]
        ax.plot(sp.mean(sp.log10(counts + 1), axis=1)[idx], disp_fitted[idx], 'bo')
        ax.set_title('Fitted Dispersion Estimate')
        ax.set_xlabel('Mean expression count')
        ax.set_ylabel('Dispersion')
        plt.savefig('dispersion_fitted.pdf', format='pdf', bbox_inches='tight')
        plt.close(fig)

    return (disp_fitted, Lambda, idx)

def percentile(self, percentile):
    """Calculate a given spectral percentile for this `Spectrogram`.

    Parameters
    ----------
    percentile : `float`
        percentile (0 - 100) of the bins to compute

    Returns
    -------
    spectrum : `~gwpy.spectrum.Spectrum`
        the given percentile `Spectrum` calculated from this
        `SpectralVariance`
    """
    out = scipy.percentile(self.value, percentile, axis=0)
    name = '%s %s%% percentile' % (self.name, percentile)
    return Spectrum(out, epoch=self.epoch, channel=self.channel,
                    name=name, f0=self.f0, df=self.df,
                    frequencies=(hasattr(self, '_frequencies') and
                                 self.frequencies or None))

def compute_profiles(data_grouped):
    profiles = {}
    r2_min = +1e16
    r2_max = -1e16

    for (index, resonance_id), profile in data_grouped.items():
        mag_ref = sp.mean(
            [data_pt.val for data_pt in profile if data_pt.par['ncyc'] == 0])
        r2_profile = []
        for data_pt in profile:
            ncyc = data_pt.par['ncyc']
            time_t2 = data_pt.par['time_t2']
            frq = ncyc / time_t2
            if frq:
                mag_cal = data_pt.cal
                mag_exp = data_pt.val
                mag_err = data_pt.err
                mag_ens = sp.random.normal(mag_exp, mag_err, 10000)

                r2_cal = -sp.log(mag_cal / mag_ref) / time_t2
                r2_exp = -sp.log(mag_exp / mag_ref) / time_t2
                r2_ens = -sp.log(mag_ens / mag_ref) / time_t2
                r2_err = abs(sp.percentile(r2_ens, [15.9, 84.1]) - r2_exp)
                r2_erd, r2_eru = r2_err

                r2_profile.append([frq, r2_cal, r2_exp, r2_erd, r2_eru])

                r2_min = min(r2_min, r2_cal, r2_exp - r2_erd)
                r2_max = max(r2_max, r2_cal, r2_exp + r2_eru)

        r2_profile = zip(*sorted(r2_profile))
        profiles.setdefault((index, resonance_id), []).append(r2_profile)

    return profiles, r2_min, r2_max

def percentile(self, percentile):
    """Calculate a given spectral percentile for this `Spectrogram`.

    Parameters
    ----------
    percentile : `float`
        percentile (0 - 100) of the bins to compute

    Returns
    -------
    spectrum : `~gwpy.frequencyseries.FrequencySeries`
        the given percentile `FrequencySeries` calculated from this
        `SpectralVariance`
    """
    out = scipy.percentile(self.value, percentile, axis=0)
    if self.name is not None:
        name = '{}: {} percentile'.format(self.name, _ordinal(percentile))
    else:
        name = None
    return FrequencySeries(out, epoch=self.epoch, channel=self.channel,
                           name=name, f0=self.f0, df=self.df,
                           frequencies=(hasattr(self, '_frequencies') and
                                        self.frequencies or None))

def summarize_sampler(sampler, burn=0, thin=1, ci=0.95):
    r"""Create summary statistics of the flattened chain of the sampler.

    The confidence regions are computed from the quantiles of the data.

    Parameters
    ----------
    sampler : :py:class:`emcee.EnsembleSampler` instance
        The sampler to summarize the chains of.
    burn : int, optional
        The number of samples to burn from the beginning of the chain.
        Default is 0 (no burn).
    thin : int, optional
        The step size to thin with. Default is 1 (no thinning).
    ci : float, optional
        A number between 0 and 1 indicating the confidence region to
        compute. Default is 0.95 (return upper and lower bounds of the
        95% confidence interval).

    Returns
    -------
    mean : array, (num_params,)
        Mean values of each of the parameters sampled.
    ci_l : array, (num_params,)
        Lower bounds of the `ci*100%` confidence intervals.
    ci_u : array, (num_params,)
        Upper bounds of the `ci*100%` confidence intervals.
    """
    flat_trace = sampler.chain[:, burn::thin, :]
    flat_trace = flat_trace.reshape((-1, flat_trace.shape[2]))

    mean = scipy.mean(flat_trace, axis=0)
    cibdry = 100.0 * (1.0 - ci) / 2.0
    ci_l, ci_u = scipy.percentile(flat_trace, [cibdry, 100.0 - cibdry], axis=0)

    return (mean, ci_l, ci_u)

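# Usage sketch for summarize_sampler against a toy emcee run. Assumptions:
# emcee < 3, where sampler.chain has shape (nwalkers, nsteps, ndim) as the
# slicing above expects, and an older SciPy where scipy.mean/scipy.percentile
# are the numpy re-exports; the target below is illustrative only.
import emcee
import numpy as np

def log_prob(x):
    return -0.5 * np.sum(x ** 2)  # standard normal target

ndim, nwalkers = 2, 16
sampler = emcee.EnsembleSampler(nwalkers, ndim, log_prob)
sampler.run_mcmc(np.random.randn(nwalkers, ndim), 500)
mean, ci_l, ci_u = summarize_sampler(sampler, burn=100)
print(mean)  # ~[0, 0], with ci_l ~ -2 and ci_u ~ +2 per dimension
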
def calc_noise(wav_filename, bins=None):
    window_buffers, sample_rate = stft.get_buffers_from_file(wav_filename)

    bins = numpy.empty([
        stft.WINDOWSIZE / 2 + 1,
        len(window_buffers),
    ])
    for i, window_buffer in enumerate(window_buffers):
        fft_amplitude = stft.stft_amplitude(window_buffer)
        bins[:, i] = fft_amplitude
    print bins.shape

    freqs = [stft.bin2hertz(i, sample_rate)
             for i in range(stft.WINDOWSIZE / 2 + 1)]
    # for i, window_buffer in enumerate(window_buffers):
    #     pylab.plot(freqs,
    #                stft.amplitude2db(bins[:, i]),
    #                color="blue")

    noise = numpy.empty(len(bins[:, 0]))
    means = numpy.empty(len(bins[:, 0]))
    mins = numpy.empty(len(bins[:, 0]))
    stds = numpy.empty(len(bins[:, 0]))
    for i, bin_spot in enumerate(bins):
        detected_noise = scipy.percentile(bin_spot, defs.NOISE_PERCENTILE_BELOW)
        # noise[i] = stft.db2amplitude(stft.amplitude2db(detected_noise))
        noise[i] = detected_noise
        means[i] = scipy.mean(bin_spot)
        mins[i] = bin_spot.min()
        stds[i] = scipy.std(bin_spot, ddof=1)
        # if i == 100:
        #     numpy.savetxt("noise.csv", bin_spot, delimiter=', ')
    # return noise, freqs, variance
    return noise, freqs, means, mins, stds

def E_step(x1, x0, lam, data):
    a1 = scipy.log(ss.norm.pdf(data, loc=x1[0], scale=x1[1]) * lam)
    a0 = scipy.log(ss.norm.pdf(data, loc=x0[0], scale=x0[1]) * (1 - lam))
    lratio = a1 - a0
    return inv_logit(lratio)


eps = 1e-4
data1 = np.array(ss.norm.rvs(loc=20, scale=5, size=1000))
data0 = np.array(ss.norm.rvs(loc=0, scale=5, size=300))
data = np.concatenate([data1, data0])
x1_old = np.array([6, 1])
x0_old = np.array([-3, 2])
xx_old = np.concatenate((x1_old, x0_old))
Z = np.ones(len(data)) * 0.5
Z[data > scipy.percentile(data, 90)] = 1
Z[data < scipy.percentile(data, 10)] = 0
# Z[1:10] = 1
# Z[-10:-1] = 0
lam = 0.5
ans = np.zeros(len(data))
ans[0:1001] = 1
diff = 1
cnt = 0
while diff > eps:
    lam = sum(Z) / len(data)
    res = op.minimize(f_sum, xx_old, args=(data, Z, lam),
                      method='Nelder-Mead',
                      options={'xtol': 1e-8, 'disp': False})
    xx = res.x
    diff = max(abs(xx - xx_old))
    x1 = xx[0:2]
    x0 = xx[2:]
    # assumed continuation (the original snippet is truncated here): refresh
    # the responsibilities with the new parameters and advance the iterate
    Z = E_step(x1, x0, lam, data)
    xx_old = xx
    cnt += 1

def plotBias(vals, fn_plot, myidx, logScale=False, refname='TCGA'):
    iqr = ((sp.percentile(vals[~myidx], 75) - sp.percentile(vals[~myidx], 25)) * 1.5)
    iqr2 = ((sp.percentile(vals[myidx], 75) - sp.percentile(vals[myidx], 25)) * 1.5)

    sidx = sp.argsort(vals)
    vals = vals[sidx]
    myidx = myidx[sidx]

    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111)
    ax_c = ax.twinx()
    ax.vlines(sp.array(sp.arange(sp.sum(vals.shape[0])))[myidx], [0],
              vals[myidx], label='%s Reference' % refname)
    ax.vlines(sp.array(sp.arange(sp.sum(vals.shape[0])))[~myidx], [0],
              vals[~myidx], color='r', label='Your Samples')
    ax.plot([0, vals.shape[0]], [3, 3], '--', color='green')
    ax.plot([0, vals.shape[0]], [5, 5], '--', color='green')
    ax.plot([0, vals.shape[0]],
            [iqr + sp.percentile(vals[~myidx], 75), iqr + sp.percentile(vals[~myidx], 75)],
            '--', color='green')
    ax.plot([0, vals.shape[0]],
            [iqr2 + sp.percentile(vals[myidx], 75), iqr2 + sp.percentile(vals[myidx], 75)],
            '--', color='green')
    # ax.plot([0, vals.shape[0]], [6.25, 6.25], '--', color='green')
    ax.plot([0, vals.shape[0]], [10, 10], '--', color='green')
    ax.set_ylabel('Median 3\'/5\' Bias')
    ax.set_xlim(0, vals.shape[0])
    if logScale:
        ax.set_yscale('log')
        ax_c.set_yscale('log')
    ax_c.set_ylim(ax.get_ylim())

    # add right side ticks
    if logScale:
        tick_thresholds = sp.array([3, 5, iqr + sp.percentile(vals[~myidx], 75),
                                    iqr2 + sp.percentile(vals[myidx], 75), 10])
        # sp.array(sp.log([3, 5, iqr + sp.percentile(vals, 75), 10, 50]))
    else:
        tick_thresholds = sp.array([3, 5, iqr + sp.percentile(vals[~myidx], 75),
                                    iqr2 + sp.percentile(vals[myidx], 75), 10])
    tick_idx = sp.argsort(tick_thresholds)
    tick_thresholds = tick_thresholds[tick_idx]
    tick_thresholds = sp.around(tick_thresholds, decimals=2)
    ax_c.set_yticks(tick_thresholds)
    tick_thresholds = tick_thresholds.astype('|S4')
    tick_thresholds = tick_thresholds.astype('|S50')
    tick_thresholds[tick_idx == 2] = tick_thresholds[tick_idx == 2][0] + ' (Your Filter)'
    # tick_thresholds[tick_idx == 3] = tick_thresholds[tick_idx == 3][0] + ' (PRAD Filter)'
    tick_thresholds[tick_idx == 3] = tick_thresholds[tick_idx == 3][0] + ' (%s Filter)' % (refname)
    ax_c.set_yticklabels(tick_thresholds)
    ax.grid()
    ax.legend(loc=2)
    plt.tight_layout()
    plt.savefig(fn_plot, dpi=300)
    plt.clf()

import pandas as pd
import random
random.seed(10)
import numpy as np
from scipy import percentile
import json

df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
out = []
for row in df.T.itertuples(index=False):
    out.append({'max': max(row),
                'min': min(row),
                'Q1': percentile(row, 25),
                'median': percentile(row, 50),
                'Q3': percentile(row, 75)})
print out
with open("../quartiles/quartiles.json", "w") as f:
    json.dump(out, f)