def fit_line(self, x, y, startvalues=None):
    # popt, pcov = curve_fit(self.linear, x, y, p0=startvalues)
    # return popt
    #slope = np.median((y[1:]-y[:-1]))/np.median((x[1:]-x[:-1]))
    #slope = np.median((y[1:]-y[:-1])/(x[1:]-x[:-1]))
    #return np.array((slope, np.median(y[1:-1]) - slope*np.median(x[1:-1])))
    #return theilslopes(y, x)[0:2]
    x_part = theilslopes(x)[0:2]
    y_part = theilslopes(y)[0:2]
    p = np.array((y_part[1], x_part[1]))
    q = np.array((y_part[0], x_part[0])) + p
    return (p, q)
def trend_theilsen(self, alpha=0.05):
    from scipy.stats.mstats import theilslopes
    xaxis = self.data_ts.index.to_julian_date().values
    yaxis = self.data_ts.values
    theilsen_result = theilslopes(yaxis, x=xaxis, alpha=alpha)
    slope, intercept, slope_low, slope_up = theilsen_result
    self.fitted['theilsen'] = xaxis * slope + intercept
    assert (slope_low <= slope <= slope_up)  # Just to be safe, check this
    slope_sign = np.sign(slope)
    if not slope_low < 0.0 < slope_up:
        sign = int(np.sign(slope))
    else:
        sign = int(0)
    results_dict = {}
    results_dict['sign'] = sign
    results_dict['slope'] = slope * (365.25 * 10)  # From /day to /decade
    results_dict['slope_low'] = slope_low * (365.25 * 10)  # From /day to /decade
    results_dict['slope_up'] = slope_up * (365.25 * 10)  # From /day to /decade
    # Add a sign to Theilsen
    self.__add_to_logbook__('Calculated Theil-Sen slope')
    results_dict['method'] = 'theilsen'
    results_dict['pvalue'] = None  # Trend Theilsen has no pvalue
    return results_dict
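# A minimal, self-contained sketch of the significance logic used in
# trend_theilsen above: the trend sign is only reported when the Theil-Sen
# confidence interval excludes zero. The synthetic data and variable names
# below are illustrative assumptions, not part of the original sources.
import numpy as np
from scipy.stats import theilslopes

rng = np.random.default_rng(0)
days = np.arange(0, 3650, 30, dtype=float)                # ~10 years, sampled every 30 days
values = 0.002 * days + rng.normal(0.0, 1.0, days.size)   # true slope: 0.002 per day

slope, intercept, slope_low, slope_up = theilslopes(values, days, alpha=0.05)
sign = int(np.sign(slope)) if not (slope_low < 0.0 < slope_up) else 0
print('slope per decade:', slope * 365.25 * 10, 'significant sign:', sign)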
def get_TheilSen(_y, what="slope"):
    if not np.ma.is_masked(_y):
        if what == "slope":
            return mstats.theilslopes(np.ma.masked_invalid(_y))[0]
        else:
            _x = np.arange(len(_y))
            return kendalltau(_x, _y, nan_policy='omit')[1]
    return np.nan
def test_theilslopes():
    # Test for basic slope and intercept.
    slope, intercept, lower, upper = mstats.theilslopes([0, 1, 1])
    assert_almost_equal(slope, 0.5)
    assert_almost_equal(intercept, 0.5)

    # Test for correct masking.
    y = np.ma.array([0, 1, 100, 1], mask=[False, False, True, False])
    slope, intercept, lower, upper = mstats.theilslopes(y)
    assert_almost_equal(slope, 1. / 3)
    assert_almost_equal(intercept, 2. / 3)

    # Test of confidence intervals from example in Sen (1968).
    x = [1, 2, 3, 4, 10, 12, 18]
    y = [9, 15, 19, 20, 45, 55, 78]
    slope, intercept, lower, upper = mstats.theilslopes(y, x, 0.07)
    assert_almost_equal(slope, 4)
    assert_almost_equal(upper, 4.38, decimal=2)
    assert_almost_equal(lower, 3.71, decimal=2)
def get_TheilSen(_x, what, _nboot, _y):
    import numpy as np
    import pandas as pd
    # the x y are weird, it appears that apply passes the dataframe column as last element
    from arch.bootstrap import StationaryBootstrap, IIDBootstrap
    from scipy.stats import mstats, mannwhitneyu, t, kendalltau
    from statsmodels.distributions.empirical_distribution import ECDF
    try:
        if what == "slope":
            return mstats.theilslopes(np.ma.masked_invalid(_y.values), _x)[0]*86400*365*1000000000
        elif what == "pval_tau":
            return kendalltau(_x, _y)[1]/2
        elif what == "pval_autocorr":
            res0 = mstats.theilslopes(_y, _x, alpha=0.95)[0]
            bs = StationaryBootstrap(3, np.array(range(len(_y))))
            bs_slopes = []
            for data in bs.bootstrap(_nboot):
                ind = data[0][0]
                res = mstats.theilslopes(_y[ind], _x, alpha=0.95)
                bs_slopes = bs_slopes + [res[0]]
            ecdf = ECDF(bs_slopes)
            pvalue = ecdf(res0)
            if pvalue > 0.5:
                pvalue = 1 - pvalue
            # print pvalue
            return pvalue
        elif what == "pval":
            bs = IIDBootstrap(np.array(range(len(_y))))
            bs_slopes = []
            for data in bs.bootstrap(_nboot):
                ind = data[0][0]
                res = mstats.theilslopes(_y[ind], _x, alpha=0.95)
                bs_slopes = bs_slopes + [res[0]]
            ecdf = ECDF(bs_slopes)
            pvalue = ecdf(0)
            if pvalue > 0.5:
                pvalue = 1 - pvalue
            # print pvalue
            return pvalue
    except:
        return np.nan
def _populate_from_gc_bias_metrics(self, run_dir):
    for k, run_element in self.barcodes_info.items():
        if run_element.get('barcode') == 'unknown' or run_element[ELEMENT_NB_READS_PASS_FILTER] == 0:
            self.info('No reads for %s, not expecting GC bias data', run_element['run_element_id'])
            continue

        metrics_file = util.find_file(
            run_dir,
            run_element['project_id'],
            run_element['sample_id'],
            '*_S*_L00%s_gc_bias.metrics' % run_element['lane']
        )
        with open(metrics_file) as f:
            header = ''
            while not header.startswith('ACCUMULATION_LEVEL'):
                header = f.readline()

            reader = csv.DictReader(f, header.split('\t'), delimiter='\t')
            lines = [l for l in reader]

            # gc slope
            data_points = [float(l['NORMALIZED_COVERAGE']) for l in lines if 20 <= int(l['GC']) <= 80]
            gc_slope = theilslopes(data_points)
            self.info('Calculated a GC slope of %s from %s data points', gc_slope, len(data_points))

            # deviation from normal
            total_windows = sum([int(l['WINDOWS']) for l in lines])
            # total_windows * 0.0004 gives approximately the same number of data points as 20 <= GC <= 80
            threshold = total_windows * 0.0004
            diffs = [abs(1 - float(l['NORMALIZED_COVERAGE'])) for l in lines if int(l['WINDOWS']) > threshold]
            normal_dev = sum(diffs) / len(diffs)
            self.info('Calculated a normal deviation of %s from %s data points', normal_dev, len(diffs))

            run_element['gc_bias'] = {
                'slope': gc_slope[0],
                'mean_deviation': normal_dev
            }
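# A small illustrative version of the GC-slope calculation above: when only
# y-values are passed, theilslopes uses the sample index (here, the position
# within the 20-80% GC window) as the x-axis. The coverage numbers below are
# made up for demonstration.
from scipy.stats import theilslopes

normalized_coverage = [0.92, 0.95, 0.99, 1.01, 1.02, 1.05, 1.08]  # hypothetical GC bins
gc_slope = theilslopes(normalized_coverage)
print('slope per GC bin:', gc_slope[0])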
mm2 = pd.DataFrame(tc2)
mm2['yearfrac'] = np.arange(0, len(tc2))/12.
ssd2 = decompose(tc2, frequency=12, s_window=35, robust=True, s_degree=0)
mm2['deseas'] = ssd2['trend'] + ssd2['residual']

tc = pd.Series(data=df['total_column']).resample('M').mean()
mm = pd.DataFrame(tc)
mm['yearfrac'] = np.arange(0, len(tc))/12.
ssd = decompose(tc, frequency=12, s_window=35, robust=True, s_degree=0)
mm['deseas'] = ssd['trend'] + ssd['residual']

s1 = theilslopes(mm1['deseas'], mm1['yearfrac'])
s2 = theilslopes(mm2['deseas'], mm2['yearfrac'])
mm1['trend'] = mm1['yearfrac']*s1[0] + s1[1]
mm2['trend'] = mm2['yearfrac']*s2[0] + s2[1]

#plt.plot(df1.index.to_pydatetime(),df1['total_column'],'.')
plt.plot(mm1.index.to_pydatetime(), mm1['total_column'])
plt.plot(mm1.index.to_pydatetime(), mm1['deseas'])
plt.plot(mm1.index.to_pydatetime(), mm1['trend'])
plt.plot(mm2.index.to_pydatetime(), mm2['total_column'])
plt.plot(mm2.index.to_pydatetime(), mm2['deseas'])
plt.plot(mm2.index.to_pydatetime(), mm2['trend'])
#plt.xlim(datetime.date(2007,1,1),datetime.date(2009,1,1))
plt.show()
def calc_result_frame(self, trim=True):
    '''Return a result_frame

    Returns a result_frame which contains the characteristics of each
    soiling interval. An updated version of the pm_frame is stored as
    self.pm_frame.

    Parameters
    ----------
    trim (boolean): whether to trim (remove) the first and last soiling
                    intervals to avoid inclusion of partial intervals
    '''

    # Estimate slope of each soiling interval, store results in a dataframe
    result_list = []
    if trim:
        res_loop = sorted(list(set(self['run'])))[1:-1]  # ignore first and last interval
    else:
        res_loop = sorted(list(set(self['run'])))

    for r in res_loop:
        run = self[self.run == r]
        length = (run.day[-1] - run.day[0])
        start_day = run.day[0]
        end_day = run.day[-1]
        run = run[run.pi_norm > 0]
        if len(run) > 2 and run.pi_norm.sum() > 0:
            fit = theilslopes(run.pi_norm, run.day)
            fit_poly = np.poly1d(fit[0:2])
            result_list.append({
                'start': run.index[0],
                'end': run.index[-1],
                'length': length,
                'run': r,
                'run_slope': fit[0],
                'run_slope_low': fit[2],
                'run_slope_high': min([0.0, fit[3]]),
                'max_neg_step': min(run.delta),
                'start_loss': 1,
                'clean_wo_precip': run.clean_wo_precip[0],
                'inferred_start_loss': fit_poly(start_day),
                'inferred_end_loss': fit_poly(end_day),
                'valid': True
            })
        else:
            run = self[self.run == r]
            result_list.append({
                'start': run.index[0],
                'end': run.index[-1],
                'length': length,
                'run': r,
                'run_slope': 0,
                'run_slope_low': 0,
                'run_slope_high': 0,
                'max_neg_step': min(run.delta),
                'start_loss': 1,
                'clean_wo_precip': run.clean_wo_precip[0],
                'inferred_start_loss': run.pi_norm.mean(),
                'inferred_end_loss': run.pi_norm.mean(),
                'valid': False
            })

    results = pd.DataFrame(result_list)
    if results.empty:
        raise NoValidIntervalError('No valid soiling intervals were found')

    # Filter results for each interval setting invalid interval to slope of 0
    results['slope_err'] = (results.run_slope_high - results.run_slope_low) / abs(results.run_slope)
    # criteria for exclusions
    filt = (
        (results.run_slope > 0) |
        (results.slope_err > 5) |
        (results.max_neg_step <= -0.05)
    )
    results.loc[filt, 'run_slope'] = 0
    results.loc[filt, 'run_slope_low'] = 0
    results.loc[filt, 'run_slope_high'] = 0
    results.loc[filt, 'valid'] = False

    # Calculate the next inferred start loss from next valid interval
    results['next_inferred_start_loss'] = np.clip(results[results.valid].inferred_start_loss.shift(-1), 0, 1)
    # Calculate the inferred recovery at the end of each interval
    results['inferred_recovery'] = np.clip(results.next_inferred_start_loss - results.inferred_end_loss, 0, 1)

    # Don't consider data outside of first and last valid intervals
    if len(results[results.valid]) == 0:
        raise NoValidIntervalError('No valid soiling intervals were found')
    new_start = results[results.valid].start.iloc[0]
    new_end = results[results.valid].end.iloc[-1]
    pm_frame_out = self[new_start:new_end]
    pm_frame_out = pm_frame_out.reset_index().merge(results, how='left', on='run').set_index('date')
    pm_frame_out['loss_perfect_clean'] = np.nan
    pm_frame_out['loss_inferred_clean'] = np.nan
    pm_frame_out['days_since_clean'] = (pm_frame_out.index - pm_frame_out.start).dt.days

    # Calculate the daily derate
    pm_frame_out['loss_perfect_clean'] = pm_frame_out.start_loss + pm_frame_out.days_since_clean * pm_frame_out.run_slope
    # filling the flat intervals may need to be recalculated for different assumptions
    pm_frame_out.loss_perfect_clean = pm_frame_out.loss_perfect_clean.fillna(1)

    pm_frame_out['loss_inferred_clean'] = pm_frame_out.inferred_start_loss + pm_frame_out.days_since_clean * pm_frame_out.run_slope
    # filling the flat intervals may need to be recalculated for different assumptions
    pm_frame_out.loss_inferred_clean = pm_frame_out.loss_inferred_clean.fillna(1)

    out = result_frame(results)
    out.pm_frame = pm_frame_out

    return out
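# A minimal sketch (with made-up data) of the per-interval fit used in the
# soiling code above: theilslopes returns (slope, intercept, low_slope,
# high_slope), so the first two values can be fed straight into np.poly1d to
# evaluate the inferred loss at an interval's first and last day.
import numpy as np
from scipy.stats import theilslopes

days = np.arange(0, 30, dtype=float)
pi_norm = 1.0 - 0.002 * days + np.random.normal(0.0, 0.005, days.size)

fit = theilslopes(pi_norm, days)
fit_poly = np.poly1d(fit[0:2])

run_slope = fit[0]                                # loss rate per day
inferred_start_loss = fit_poly(days[0])
inferred_end_loss = fit_poly(days[-1])
slope_err = (fit[3] - fit[2]) / abs(run_slope)    # relative CI width, as used in the exclusion filter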
def draw_corplot(x, y, xname, yname, add_robust=False, save_to_file=True, \ ax=None, stats_title=True, stats_legend=False, customcol=None, \ legendprefix=''): # Choose the right colour for the plot. if customcol is None: regress_col = PLOTCOLS['regression'] sample_col = PLOTCOLS['samples'] sample_alpha = 0.75 else: regress_col = customcol sample_col = customcol sample_alpha = 0.75 # Create a new plot. if ax is None: fig, ax = pyplot.subplots(nrows=1, ncols=1) # Plot a scatter plot of the x and y values. ax.plot(x, y, 'o', color=sample_col, alpha=sample_alpha) # Plot the regression line. if add_robust: # Perform a linear regression. slope, intercept, lo_slope, up_slope = theilslopes(y, x, alpha=0.95) # Plot the regression line. x_pred = numpy.array([numpy.min(x), numpy.max(x)]) y_pred = slope * x_pred + intercept y_lo = lo_slope * x_pred + intercept y_up = up_slope * x_pred + intercept ax.plot(x_pred, y_pred, '-', color=regress_col) ax.fill_between(x_pred, y_lo, y_up, linewidth=3, alpha=0.2, \ color=PLOTCOLS['regression']) # Perform a linear regression. model = linregress(x, y) try: r = model.rvalue p = model.pvalue slope = model.slope intercept = model.intercept except: slope, intercept, r, p, stderr = model # Perform a Spearman correlation. spearman = spearmanr(x, y) try: spearman_rho = spearman.correlation spearman_p = spearman.pvalue except: spearman_rho, spearman_p = spearman # Compute Kendall's Tau. kendall = kendalltau(x, y) try: kendall_tau = kendall.correlation kendall_p = kendall.pvalue except: kendall_tau, kendall_p = kendall # Set the regression line's label. if stats_legend: # Uncomment if you'd like to see both parametric and non-parametric # test results. #lbl = r"$R=%.2f, p=%.2f$" % (r, p) #lbl = lbl + "\n" + r"$\tau=%.2f, p=%.2f$" % (kendall_tau, kendall_p) # Show Kendall's tau, as we're using a lowish N. if kendall_p < 0.001: kendall_pstr = r"p<0.001" else: kendall_pstr = r"p=%.3f" % (kendall_p) lbl = r"%s$\tau=%.2f, %s$" % (legendprefix, kendall_tau, kendall_pstr) else: lbl = None # Plot the regression line. x_pred = numpy.array([numpy.min(x), numpy.max(x)]) y_pred = slope * x_pred + intercept ax.plot(x_pred, y_pred, '-', color=regress_col, linewidth=3, label=lbl) # Finish the plot. ax.set_xlabel(xname.capitalize(), fontsize=FONTSIZE['label']) ax.set_ylabel(yname.capitalize(), fontsize=FONTSIZE['label']) if stats_title: ax.set_title("R=%.2f, p=%.3f; Rho=%.2f, p=%.3f; Tau=%.3f, p=%.3f" % \ (r, p, spearman_rho, spearman_p, kendall_tau, kendall_p)) if stats_legend: ax.legend(loc="best", fontsize=FONTSIZE['legend']) # Save the plot. if save_to_file: fig.savefig(os.path.join(OUTDIR, "corplot_%sx%s.png" % (xname, yname))) if ax is None: pyplot.close(fig)
def subtract_psfs(image, psf, radius, x, y, e_limit=0.1, **kwargs):
    """
    Subtract PSFs from an image using linear regression

    Parameters
    ----------
    image : 2D array
        the image from which the PSFs will be subtracted
    psf : 2D array
        image of the PSF, doesn't need to be size-matched to the image
    radius : int
        radius of box around the PSF to be used for matching the height and base
    x, y : list, array
        pixel coordinates
    e_limit : float
        If the relative difference between theilslopes fits is less than some value,
        reject the subtraction
        i.e. (m_high - m_low) / m < 2 * e_limit

    Returns
    -------
    im_new : 2D array
    results : list
        The results of the fit as returned by ``scipy.stats.linregress``
        i.e. gradient, intercept, r-value, p-value, err

    """

    im_new = np.copy(image)

    if np.shape(psf)[0] < 2 * np.shape(image)[0]:
        w, h = np.shape(image)
        pw, ph = np.shape(psf)
        pad_w = int(w-pw/2)+1
        psf_pad = np.pad(psf, pad_w, mode="constant")
    else:
        psf_pad = psf

    q = int(radius)
    cy, cx = np.where(psf_pad == psf_pad.max())
    cy, cx = cy[0], cx[0]
    psf_cutout = np.copy(psf_pad[cx-q : cx+q+1, cy-q : cy+q+1])
    psf_flat = psf_cutout.ravel()

    fit_results = []
    for xx, yy in zip(x, y):
        xii, yii = int(xx), int(yy)
        w, h = im_new.shape
        dx0, dx1 = xii, w-xii
        dy0, dy1 = yii, h-yii
        q1, q2, q3, q4 = min(q, dx0), min(q, dx1), min(q, dy0), min(q, dy1)

        im_cutout = np.copy(im_new[xii-q1 : xii+q2+1, yii-q3 : yii+q4+1])
        im_flat = im_cutout.ravel()
        if len(psf_flat) != len(im_flat):
            fit_results += [[0]*7]
            continue

        m, c, r, p, e = linregress(psf_flat, im_flat)
        m, c, a, b = theilslopes(im_flat, psf_flat)
        e = 0.5*(b-a)/m

        # If it fails the null-hypothesis test - i.e. p > 0.1
        if p > 0.1:
            fit_results += [[0]*7]
            continue

        # If the relative difference between theilslopes fits is less than
        # some value, reject the subtraction
        # i.e. the (m_high - m_low) < 2 * m * e_limit
        if e > e_limit:
            fit_results += [[0]*7]
            continue

        # If the fitted slope is less than zero, forget the fit
        if m < 0:
            fit_results += [[0]*7]
            continue
        else:
            fit_results += [[m, c, r, p, e, a, b]]
            psf_cutout = np.copy(psf_pad[cx-dx0 : cx+dx1, cy-dy0 : cy+dy1])
            psf_cutout *= m
            im_new[xii-dx0 : xii+dx1, yii-dy0 : yii+dy1] -= psf_cutout

    return im_new, fit_results
def compute_trend(self, start_year, stop_year, season=None, slope_confidence=.68): if slope_confidence is None: slope_confidence = .68 if self._mobs is None: raise ValueError('Cannot compute trends: monthly data is not ' 'available') mobs = self._mobs start_year, stop_year, period_str, yrs = _init_period(mobs, start_year, stop_year) if season in [None, 'all']: seas = 'all' elif season in SEASONS: seas = season if not 'seas' in self.yearly: self['yearly'][seas] = yearly = _get_yearly(mobs, seas, yrs) else: yearly = self['yearly'][seas] dates = yearly.index.values values = yearly.values (start_date, stop_date, period_index, num_dates_period) = _init_period_dates(start_year, stop_year, seas) # get period filter mask tmask = np.logical_and(dates>=start_date, dates<=stop_date) # apply period mask to jsdate vector and value vector dates_data = dates[tmask] # vector containing data values vals = values[tmask] valid = ~np.isnan(vals) #works only on not nan values dates_data = dates_data[valid] vals = vals[valid] num_dates_data = dates_data.astype('datetime64[Y]').astype(np.float64) # create empty dictionary that is used to store trends results result = _init_trends_result_dict(start_year) #TODO: len(y) is number of years - 1 due to midseason averages result['n'] = len(vals) if len(vals) > 2: result['y_mean'] = np.nanmean(vals) result['y_min'] = np.nanmin(vals) result['y_max'] = np.nanmax(vals) #Mann / Kendall test [tau, pval] = kendalltau(x=num_dates_data, y=vals) (slope, yoffs, slope_low, slope_up) = theilslopes(y=vals, x=num_dates_data, alpha=slope_confidence) # estimate error of slope at input confidence level slope_err = np.mean([abs(slope - slope_low), abs(slope - slope_up)]) reg_data = slope * num_dates_data + yoffs reg_period = slope * num_dates_period + yoffs # value used for normalisation of slope to compute trend T # T=m / v0 v0_data = reg_data[0] v0_period = reg_period[0] # Compute the mean residual value, which is used to estimate # the uncertainty in the normalisation value used to compute # trend mean_residual = np.mean(np.abs(vals - reg_data)) # trend is slope normalised by first reference value. # 2 trends are computed, 1. the trend using the first value of # the regression line at the first available data year, 2. the # trend corresponding to the value corresponding to the first # year of the considered period. trend_data = slope / v0_data * 100 trend_period = slope / v0_period * 100 # Compute errors of normalisation values v0_err_data = mean_residual t0_data, tN_data = num_dates_data[0], num_dates_data[-1] t0_period = num_dates_period[0] # sanity check assert t0_data < tN_data assert t0_period <= t0_data dt_ratio = (t0_data - t0_period) / (tN_data - t0_data) v0_err_period = v0_err_data * (1 + dt_ratio) trend_data_err = _compute_trend_error(m=slope, m_err=slope_err, v0=v0_data, v0_err=v0_err_data) trend_period_err = _compute_trend_error(m=slope, m_err=slope_err, v0=v0_period, v0_err=v0_err_period) result['pval'] = pval result['m'] = slope result['m_err'] =slope_err result['yoffs'] = yoffs result['slp'] = trend_data result['slp_err'] = trend_data_err result['reg0'] = v0_data tp, tperr, v0p = None, None, None if v0_period > 0: tp = trend_period tperr = trend_period_err v0p = v0_period result['slp_{}'.format(start_year)] = tp result['slp_{}_err'.format(start_year)] = tperr result['reg0_{}'.format(start_year)] = v0p result['period'] = period_str if not seas in self.results: self.results[seas] = od() self.results[seas][period_str] = result return result
y = sdatasok
X, Y = [], []
#only work with notnan values
for i in range(0, len(x)):
    if not np.isnan(x[i]) and not np.isnan(y[i]):
        X.append(x[i])
        Y.append(y[i])

if lok >= nmkmin:
    p_stat = 'yes'
    #Mann-Kendall test
    [tau, pval] = kendalltau(X, Y)
    print(tau, pval)
    #theil slope
    res = theilslopes(Y, X, sig)
    reg = res[0]*np.asarray(X) + res[1]*np.ones(len(X))
    regg = res[0]*np.asarray(mods) + res[1]*np.ones(len(mods))
    spyr = res[0]*365*100/abs(reg[0])  #% per year
    reg0 = reg[0]
else:
    tau, pval, spyr = float('NaN'), float('NaN'), float('NaN')

str_tau = "%5.2f" % tau
str_pval = "%5.4f" % pval
str_a = "%4.1f" % spyr

# - - - - - - - - Model - - - - - - - - -
mod_tau, mod_pval, mod_spyr, mod_reg0 = float('NaN'), float('NaN'), float('NaN'), float('NaN')
mod_X, mod_Y, mod_sdatas, mod_sdatasok, mod_sodsok = [], [], [], [], []
lok = 0
def compute_trends_new(s_monthly, periods, only_yearly=True): #sm = to_monthly_current_trends_interface(s0, MIN_DIM) d = dict(month=s_monthly.index.month, year=s_monthly.index.year, value=s_monthly.values) mobs = pd.DataFrame(d) mobs['season'] = mobs.apply( lambda row: _get_season_new(row['month'], row['year']), axis=1) mobs = mobs.dropna(subset=['value']) seasons = ['JFM', 'AMJ', 'JAS', 'OND', 'all'] #trends with yearly and seasonal averages # get all years that are contained in data yrs = np.unique(mobs['year']) data = {} for i, seas in enumerate(seasons): if only_yearly and not seas == 'all': continue #initialize seasonal object data[seas] = {'date': [], 'jsdate': [], 'val': [], 'trends': {}} dates = [] #filter the months for yr in yrs: if seas != 'all': catch = mobs[mobs['season'].str.contains('{}-{}'.format( seas, yr))] else: catch = mobs[mobs['season'].str.contains('-{}'.format(yr))] date = _mid_season_new(seas, yr) dates.append(date) #needs 4 seasons to compute seasonal average to avoid biases if seas == 'all' and len(np.unique(catch['season'].values)) < 4: data[seas]['val'].append(np.nan) else: data[seas]['val'].append(np.nanmean(catch['value'])) data[seas]['date'] = np.asarray(dates) data[seas]['jsdate'] = to_jsdate(data[seas]['date']) #filter period for period in periods: data[seas]['trends'][period] = {} # desired start / stop year (note, that this may change if first # or last value in tseries (or both) is NaN) start_yr, stop_yr = years_from_periodstr(period) num_yrs = stop_yr - start_yr #filtering to the period limit jsp0 = to_jsdate(np.datetime64('{}-01-01'.format(start_yr))) jsp1 = to_jsdate(np.datetime64('{}-12-31'.format(stop_yr))) # vector containing numerical timestamps in javascript format jsdate = data[seas]['jsdate'] # get period filter mask tmask = np.logical_and(jsdate >= jsp0, jsdate <= jsp1) # filter data by period jsdate = jsdate[tmask] # vector containing data values y = np.asarray(data[seas]['val'])[tmask] # ============================================================================= # num_leap_years = np.sum(dt_idx.is_leap_year) # # secs_per_year = np.mean(([86400 * 365] * (num_yrs-num_leap_years) + # [86400 * 366] * num_leap_years)) # # # ============================================================================= valid = ~np.isnan(y) # Remove NaNs for Mann-Kendall test and regression _jsdate = jsdate[valid] _y = y[valid] if len(_jsdate) > 2: #kendall [tau, pval] = kendalltau(_jsdate, _y) data[seas]['trends'][period]['pval'] = pval #theil slope res = theilslopes(_y, _jsdate, 0.9) slope = res[0] yoffs = res[1] # regression line (evaluate at ACTUAL time-stamps corresponding # to input period -> jsdate and not _jsdate, which may have # removed first or last year, or both) reg = slope * jsdate + yoffs # ============================================================================= # # time difference between start and stop in full years. 
# dt = (np.datetime64(dates[-1]) - # np.datetime64(dates[0])).astype('timedelta64[s]').astype(int) # # ============================================================================= # ============================================================================= # from numpy.testing import assert_allclose # try: # assert_allclose(dt, secs_per_year * num_yrs, rtol=1e-3) # except: # print(start_yr, stop_yr) # print(dates[0], dates[-1]) # print(yrs) # ============================================================================= #dt = dt / secs_per_year #time diff in units of years # compute slope in units of %/yr-1 normalised by first value # of considered time-series slp = (reg[-1] - reg[0]) / (num_yrs * reg[0]) * 100 data[seas]['trends'][period]['slp'] = slp data[seas]['trends'][period]['reg0'] = reg[0] data[seas]['trends'][period]['t0'] = jsdate[0] data[seas]['trends'][period]['n'] = len(y) else: data[seas]['trends'][period]['pval'] = None data[seas]['trends'][period]['slp'] = None data[seas]['trends'][period]['reg0'] = None data[seas]['trends'][period]['t0'] = None data[seas]['trends'][period]['n'] = len(y) return data
def processInput_trends(subchunk, parent_iteration, child_iteration): """This is the main file that calculate trends""" #print('INFO: see pid.<pid>.out to monitor trend computation progress') #sys.stdout = open('pid.'+str(os.getpid()) + '.out', 'w') #print('INFO: see trend.out to monitor trend computation progress') #sys.stdout = open('trend.out', 'a') ## Debug tool to print process Ids process = psutil.Process(os.getpid()) current = current_process() print(process, current._identity, '{} Mo'.format(process.memory_info().rss/1024/1024)) if subchunk.input=='box': print('### Chunk {} > subchunk {} started: COL: [{}:{}] ROW: [{}:{}]'.format(parent_iteration, child_iteration, *subchunk.get_limits('local', 'str'))) write_string0 = (param.hash+"_CHUNK" + np.str(parent_iteration) + "_SUBCHUNK" + np.str(child_iteration) + "_" + '_'.join(subchunk.get_limits('global', 'str')) + '.nc') subchunk_fname = param.output_path / write_string0 ## Check if cache file already exists and must be overwritten if not param.b_delete: if subchunk_fname.is_file(): print ('INFO: {} already exists. Use -d option to overwrite it.'.format(write_string0)) return elif subchunk.input=='points': print('### Chunk {} > subchunk {} started.'.format(parent_iteration, child_iteration)) print(param.input_file) str_date_range = param.input_file.stem.replace('timeseries','') write_string0 = 'merged_trends{}.h5'.format(str_date_range) subchunk_fname = param.output_path / write_string0 ## Result file is always overwritten in the case of point input ## Read the input time series file from main chunk, configured length of time X 500 X 500; it may vary if different chunks are used hdf_ts = h5py.File(param.input_file, 'r') ## Create temporary storage with size of sub chunks in main chunk, currently configured 100 by 100 blocks var_temp_output = np.empty([*subchunk.dim,4]) var_temp_output[:] = np.nan # NaN matrix by default ## Parameters for te loop b_deb = 0 # flag to print time profiling t00 = timer() t000 = timer() t_mean = 0. #print(current._identity, f'{process.memory_info().rss/1024/1024} Mo') offsetx = subchunk.get_limits('local', 'tuple')[0] offsety = subchunk.get_limits('local', 'tuple')[2] print_freq = 20 tab_prof_valid = [] tab_prof_zero = [] hf = h5py.File(subchunk_fname, 'w') for tsvar in hdf_ts['vars'].keys(): for jj_sub in range(subchunk.dim[0]): #for jj_sub in range(61,80): #debug # dimension of variable: time,x,y # preload all the y data here to avoid overhead due to calling Dataset.variables at each iteration in the inner loop data_test0 = hdf_ts['vars/'+tsvar][:,jj_sub+offsety,offsetx:offsetx+subchunk.dim[1]] #data_test0 = hdf_ts.variables[tsvar][:500,sub_chunks_x[ii_sub],:] for ii_sub in range(subchunk.dim[1]): #for ii_sub in range(55,100): if b_deb: print('---------------') if b_deb: print('jj: {} - ii: {} '.format(jj_sub, ii_sub)) t0 = timer() data_test = data_test0[:,ii_sub] ## remove tie group data_test[1:][np.diff(data_test)==0.] 
= np.nan #data_test=hdf_ts.variables[tsvar][:,sub_chunks_x[ii_sub],sub_chunks_y[jj_sub]] slope=999.0 if b_deb: print('t0', timer()-t0) t0 = timer() if b_deb: print('Data valid:', data_test.size - np.isnan(data_test).sum(), '/', data_test.size) if 0: print('Use mstats') data_sen=np.ma.masked_array(data_test, mask=np.isnan(data_test)) t0 = timer() slope, intercept, lo_slope, up_slope = mstats.theilslopes(data_sen, alpha=0.1) print('slope, intercept, lo_slope, up_slop:') print(slope, intercept, lo_slope, up_slope) if b_deb: print('t02', timer()-t0) np.savetxt('data_test.dat', data_test.T) sys.exit() t0 = timer() # this mstats give correct slope and is consistent with python man-kendall score of Sn; this is fast than Fortran''' # stats.theilslopes is giving incorrect values when NaN are inside data''' if b_deb: print('t2', timer()-t0) t0 = timer() if 1: ## orinal mann-kendall test : bla = data_test[~np.isnan(data_test)] if bla.size > 0: #print('min/mean/max/nb/nb_unique', bla.min(), bla.mean(), bla.max(), len(bla), len(np.unique(bla))) #print(bla) if len(np.unique(bla))==1: p,z,Sn,nx = [0,0,0,0] else: #data_test = data_test[-10:] # debug line to speed up #try: # p,z,Sn,nx = mk_test_timeout(data_test) #except TimeoutError as e: # print('timeout!') # p,z,Sn,nx = [0,0,0,0] p,z,Sn,nx = m.mk_trend(len(data_test), np.arange(len(data_test)), data_test) else: p,z,Sn,nx = m.mk_trend(len(data_test), np.arange(len(data_test)), data_test) # if data_test = [], the test return (p,z,Sn,nx) = (1.0, 0.0, 0.5, 0.0) else: ## other test p,z,Sn,nx = [0,0,0,0] z = data_test.mean() if b_deb: t4 = timer()-t0 if bla.size>0: if bla.mean()==0.0: tab_prof_zero.append(t4) else: tab_prof_valid.append(t4) print('t4=', t4) if 0: import matplotlib.pyplot as plt plt.clf() plt.plot(bla) plt.ylim(0,6.1) ti1 = '{}/{} - {:.3f} s'.format(jj_sub, ii_sub, t4) ti2 = 'min/mean/max/nb/nb_unique {:.3f} {:.3f} {:.3f} {} {}'.format(bla.min(), bla.mean(), bla.max(), len(bla), len(np.unique(bla))) ti3 = 'slope: {}'.format(Sn) plt.title(ti1+'\n'+ti2+'\n'+ti3) if Sn==0.0: plt.savefig('bla.Sn0.{}.{}.png'.format(jj_sub, ii_sub)) else: plt.savefig('bla.{}.{}.png'.format(jj_sub, ii_sub)) t0 = timer() if b_deb: print('p,z,slope,nx', p,z,slope,nx) if b_deb: print('p,z,Sn,nx', p,z,Sn,nx) var_temp_output[jj_sub,ii_sub,0] = p var_temp_output[jj_sub,ii_sub,1] = z var_temp_output[jj_sub,ii_sub,2] = Sn var_temp_output[jj_sub,ii_sub,3] = nx ## Print efficiency stats if (jj_sub+1)%print_freq==0: elapsed = timer()-t00 data_stat = hdf_ts.variables[tsvar][:,jj_sub+1+offsety-print_freq:jj_sub+1+offsety,offsetx:offsetx+subchunk.dim[1]] valid = 100.*(data_stat.size - np.count_nonzero(np.isnan(data_stat)))/data_stat.size eff = 1e6*elapsed/data_stat.size #print(subchunk.dim, data_test0.shape) print('{} : {}.{}.block[{}-{}] : {:.3f}s elapsed : {:.3f} us/pix/date : {:.2f}% valid'.format(datetime.datetime.now(), parent_iteration, child_iteration, jj_sub+1-print_freq, jj_sub+1, elapsed, eff, valid)) t00 = timer() sys.stdout.flush() if 0: t_mean += timer()-t00 #print(f't00 {ii_sub} {t_mean/(ii_sub+1)}') #print(f't00.p{current._identity[0]}.it{ii_sub} {t_mean/(ii_sub+1):.3f}s {process.memory_info().rss/1024/1024:.2f}Mo') print('t00.p{}.it{} {:.3f}s {:.2f}Mo'.format(current._identity[0], ii_sub, t_mean/(ii_sub+1), process.memory_info().rss/1024/1024)) v = var_temp_output[ii_sub,:,:] for ii in range(4): #print(f'{np.count_nonzero(np.isnan(v[:,ii]))/v[:,ii].size:.3f}', np.nanmin(v[:,ii]), np.nanmax(v[:,ii])) 
print('{:.3f}'.format(np.count_nonzero(np.isnan(v[:,ii]))/v[:,ii].size), np.nanmin(v[:,ii]), np.nanmax(v[:,ii])) print(np.nanmin(v), np.nanmax(v)) t00 = timer() if b_deb: #if 1: valid = np.array(tab_prof_valid) zero = np.array(tab_prof_zero) #print('valid:', valid.size, valid.min(), valid.mean(), valid.max()) #print('zero:', zero.size, zero.min(), zero.mean(), zero.max()) print(valid.mean()) print(zero.mean()) #return #sys.exit() print('t000tot.p{} {:.3f}s {:.2f}Mo'.format(current._identity, timer()-t000, process.memory_info().rss/1024/1024)) hf.create_dataset(tsvar+'/pval', data=var_temp_output[:,:,0]) hf.create_dataset(tsvar+'/zval', data=var_temp_output[:,:,1]) hf.create_dataset(tsvar+'/slope', data=var_temp_output[:,:,2]) hf.create_dataset(tsvar+'/len', data=var_temp_output[:,:,3]) hf.close() print ('Subchunk {} completed, save to {}'.format(child_iteration, subchunk_fname)) return None
def _calc_result_df(self, trim=False, max_relative_slope_error=500.0,
                    max_negative_step=0.05, min_interval_length=2):
    '''
    Calculates self.result_df, a pandas dataframe summarizing the soiling
    intervals identified and self.analyzed_daily_df, a version of
    self.daily_df with additional columns calculated during analysis.

    Parameters
    ----------
    trim : bool, default False
        whether to trim (remove) the first and last soiling intervals to
        avoid inclusion of partial intervals
    max_relative_slope_error : float, default 500
        the maximum relative size of the slope confidence interval for an
        interval to be considered valid (percentage).
    max_negative_step : float, default 0.05
        The maximum magnitude of negative discrete steps allowed in an
        interval for the interval to be considered valid (units of
        normalized performance metric).
    min_interval_length : int, default 2
        The minimum duration for an interval to be considered valid.
        Cannot be less than 2 (days).
    '''

    daily_df = self.daily_df
    result_list = []
    if trim:
        # ignore first and last interval
        res_loop = sorted(list(set(daily_df['run'])))[1:-1]
    else:
        res_loop = sorted(list(set(daily_df['run'])))

    for r in res_loop:
        run = daily_df[daily_df['run'] == r]
        length = (run.day[-1] - run.day[0])
        start_day = run.day[0]
        end_day = run.day[-1]
        start = run.index[0]
        end = run.index[-1]
        run_filtered = run[run.pi_norm > 0]
        # use the filtered version if it contains any points
        # otherwise use the unfiltered version to populate a
        # valid=False row
        if not run_filtered.empty:
            run = run_filtered
        result_dict = {
            'start': start,
            'end': end,
            'length': length,
            'run': r,
            'run_slope': 0,
            'run_slope_low': 0,
            'run_slope_high': 0,
            'max_neg_step': min(run.delta),
            'start_loss': 1,
            'inferred_start_loss': run.pi_norm.mean(),
            'inferred_end_loss': run.pi_norm.mean(),
            'valid': False
        }
        if len(run) > min_interval_length and run.pi_norm.sum() > 0:
            fit = theilslopes(run.pi_norm, run.day)
            fit_poly = np.poly1d(fit[0:2])
            result_dict['run_slope'] = fit[0]
            result_dict['run_slope_low'] = fit[2]
            result_dict['run_slope_high'] = min([0.0, fit[3]])
            result_dict['inferred_start_loss'] = fit_poly(start_day)
            result_dict['inferred_end_loss'] = fit_poly(end_day)
            result_dict['valid'] = True
        result_list.append(result_dict)

    results = pd.DataFrame(result_list)

    if results.empty:
        raise NoValidIntervalError('No valid soiling intervals were found')

    # Filter results for each interval,
    # setting invalid interval to slope of 0
    results['slope_err'] = (results.run_slope_high - results.run_slope_low) \
        / abs(results.run_slope)
    # criteria for exclusions
    filt = ((results.run_slope > 0) |
            (results.slope_err >= max_relative_slope_error / 100.0) |
            (results.max_neg_step <= -1.0 * max_negative_step))

    results.loc[filt, 'run_slope'] = 0
    results.loc[filt, 'run_slope_low'] = 0
    results.loc[filt, 'run_slope_high'] = 0
    results.loc[filt, 'valid'] = False

    # Calculate the next inferred start loss from next valid interval
    results['next_inferred_start_loss'] = np.clip(
        results[results.valid].inferred_start_loss.shift(-1), 0, 1)
    # Calculate the inferred recovery at the end of each interval
    results['inferred_recovery'] = np.clip(
        results.next_inferred_start_loss - results.inferred_end_loss, 0, 1)

    # Don't consider data outside of first and last valid intervals
    if len(results[results.valid]) == 0:
        raise NoValidIntervalError('No valid soiling intervals were found')
    new_start = results[results.valid].start.iloc[0]
    new_end = results[results.valid].end.iloc[-1]
    pm_frame_out = daily_df[new_start:new_end]
    pm_frame_out = pm_frame_out.reset_index() \
                               .merge(results, how='left', on='run') \
                               .set_index('date')
    pm_frame_out['loss_perfect_clean'] = np.nan
    pm_frame_out['loss_inferred_clean'] = np.nan
    pm_frame_out['days_since_clean'] = \
        (pm_frame_out.index - pm_frame_out.start).dt.days

    # Calculate the daily derate
    pm_frame_out['loss_perfect_clean'] = \
        pm_frame_out.start_loss + \
        pm_frame_out.days_since_clean * pm_frame_out.run_slope
    # filling the flat intervals may need to be recalculated
    # for different assumptions
    pm_frame_out.loss_perfect_clean = \
        pm_frame_out.loss_perfect_clean.fillna(1)

    pm_frame_out['loss_inferred_clean'] = \
        pm_frame_out.inferred_start_loss + \
        pm_frame_out.days_since_clean * pm_frame_out.run_slope
    # filling the flat intervals may need to be recalculated
    # for different assumptions
    pm_frame_out.loss_inferred_clean = \
        pm_frame_out.loss_inferred_clean.fillna(1)

    self.result_df = results
    self.analyzed_daily_df = pm_frame_out
y = sdatasok
X, Y = [], []
#only work with notnan values
for i in range(0, len(x)):
    if not np.isnan(x[i]) and not np.isnan(y[i]):
        X.append(x[i])
        Y.append(y[i])

if lok >= nmkmin:
    p_stat = 'yes'
    #Mann-Kendall test
    [tau, pval] = kendalltau(X, Y)
    print(tau, pval)
    #theil slope
    res = theilslopes(Y, X, sig)
    reg = res[0]*np.asarray(X) + res[1]*np.ones(len(X))
    regg = res[0]*np.asarray(mods) + res[1]*np.ones(len(mods))
    spyr = res[0]*365*100/reg[0]  #% per year
    reg0 = reg[0]
    #str_b = "%3.2f" % res[1]
else:
    tau, pval, spyr, reg0 = float('NaN'), float('NaN'), float('NaN'), float('NaN')

str_tau = "%5.2f" % tau
str_pval = "%5.4f" % pval
str_a = "%4.1f" % spyr

#listing of statistics
taus.append(tau), pvals.append(pval), spyrs.append(spyr), reg0s.append(reg0)

#plotting
filename2)).variables[var][stmon:, :, :] mask = IO.Land_Mask(root, model) data_land_monthly = nanmean(nanmean(data * mask, axis=2), axis=1) # calculate global annual mean data_land_annual = vstack([ sum(data_land_monthly[mon:mon + 12]) for mon in xrange(0, len(data_land_monthly), 12) ]) # calculate moving trend slope = np.empty((edyr - styr - 9, edyr - styr - 9)) slope.fill(np.nan) for st in xrange(0, edyr - styr - 9): for ed in xrange(st + 10, edyr - styr + 1): slope[ed - 10, st] = mstats.theilslopes(data_land_annual[st:ed], alpha=0.95)[0] # Mapping print model, var x = np.arange(styr, edyr - 9, 1.) y = np.arange(styr + 10, edyr + 1, 1.) X, Y = np.meshgrid(x, y) # create figure clevs = arange(limits[v][0], limits[v][1] + 0.01, (limits[v][1] - limits[v][0]) / 100) cblevs = arange(limits[v][0], limits[v][1] + 0.01, round((limits[v][1] - limits[v][0]) / 10, 2)) fig = plt.figure(figsize=(12, 8), dpi=100, facecolor="white") font = { 'family': 'serif', 'color': 'darkred',
def rna_dna_correspondence_main(loomfile, args_seg_file, args_patient_column, args_patient, args_time_point, output=None): """ Parameters ---------- loomfile : args_seg_file : args_patient_column : args_patient : args_time_point : output : (Default value = None) Returns ------- """ with loompy.connect(loomfile, validate=False) as loom: genes = loom.ra['gene'] metadata = pd.DataFrame(loom.ca['patient_ID']).astype(str) metadata.columns = ['patient_ID'] metadata['complexity'] = loom.ca['complexity'] metadata['cell_type'] = loom.ca['cell_type'] metadata['time_point'] = loom.ca['time_point'] list_of_gene_windows = wme.get_list_of_gene_windows( genes, window_size=800, window_step=300) segmentation = pd.read_table(args_seg_file) copy_ratio_dict = dna.segmentation_to_copy_ratio_dict( genes, segmentation, chrom_col='Chromosome', start_col='Start.bp', end_col='End.bp', score_col='tau', log2=True) gotten_cr = [ np.mean([copy_ratio_dict[gene] for gene in window]) for window in list_of_gene_windows ] gotten_wme, gotten_wme_metadata = wme.get_windowed_mean_expression(loom, list_of_gene_windows, patient=args_patient, patient_column='patient_ID', upper_cut=0) ps = [] ps_tumor = [] ps_tumor_low_complexity = [] ps_nontumor = [] ps_nontumor_low_complexity = [] sorted_cr = np.argsort(gotten_cr) # sorted_cr = np.hstack((sorted_cr[0:len(sorted_cr) // 5], # sorted_cr[4 * len(sorted_cr) // 5:len(sorted_cr)])) celltypes = metadata[metadata['patient_ID'] == args_patient]['cell_type'].values complexities = metadata[metadata['patient_ID'] == args_patient]['complexity'].values tps = metadata[metadata['patient_ID'] == args_patient]['time_point'].values for i in tqdm(range(gotten_wme.shape[1])): # bah = mk.original_test(gotten_wme[:, i][sorted_cr]) # p = (1 - stats.norm.cdf(bah.z)) p, a, b, c = theilslopes(gotten_wme[:,i], gotten_cr) if complexities[i] > 500: if celltypes[i] == 'tumor': ps_tumor.append(p) else: ps_nontumor.append(p) else: if celltypes[i] == 'tumor': ps_tumor_low_complexity.append(p) else: ps_nontumor_low_complexity.append(p) # if complexities[i] > 500: # ps_tumor.append(p) # else: # ps_tumor_low_complexity.append(p) # celltypes = metadata.query( # 'patient_ID == {}'.format(args_patient))['cell_type'].values # complexities = metadata.query( # 'patient_ID == {}'.format(args_patient))['complexity'].values # tps = metadata.query( # 'patient_ID == {}'.format(args_patient))['time_point'].values # iterator = 0 # for i in tqdm(range(gotten_wme.shape[1])): # if tps[i] == args_time_point: # if celltypes[i] != 'tumor': # bah = mk.original_test(gotten_wme[:, i][sorted_cr]) # p = (1 - stats.norm.cdf(bah.z)) # p = bah.z # #if p > 1: # # plt.plot(gotten_wme[:,i][sorted_cr]) # # plt.show() # ps.append(p) # if celltypes[i] == 'tumor': # ps_tumor.append(p) # else: # ps_nontumor.append(p) # if iterator == 500: # break # iterator += 1 # iterator = 0 # sorted_cr = np.argsort(gotten_cr) ## celltypes = metadata.query( ## 'patient_ID == {}'.format(args_patient))['cell_type'].values # for i in tqdm(range(gotten_wme.shape[1])): # if tps[i] == args_time_point: # if celltypes[i] == 'tumor': # bah = mk.original_test(gotten_wme[:, i][sorted_cr]) # p = (1 - stats.norm.cdf(bah.z)) # p = bah.z # ps.append(p) # if celltypes[i] == 'tumor': # # if p < 3: # # plt.plot(gotten_wme[:,i][sorted_cr]) # # plt.show() # if complexities[i] > 1000: # ps_tumor.append(p) # else: # ps_tumor_low_complexity.append(p) # else: # ps_nontumor.append(p) # if iterator > 500 and len(ps_tumor_low_complexity) > 0: # break # iterator += 1 sns.distplot(ps_nontumor, 
label='non-malignant', color='r') sns.distplot(ps_nontumor_low_complexity, label='low complexity non-malignant', color='y') sns.distplot(ps_tumor, label='high complexity_tumor', color='k') sns.distplot(ps_tumor_low_complexity, label='low complexity tumor', color='b') plt.legend() #from IPython.core.debugger import set_trace; set_trace() # The IPython is not in the dependencies and the output is disabled from Feb 2020. # Ipython debug mode should be removed. #if output: warn(f"Output is disabled from 20 Feb 2020. Saving to {output}") plt.savefig(output)
y = np.sin(2 * np.pi * x)

slope = 0.2  # [1/year]

# sinus + slope
y += slope * x

# random + slope only
#y = 0.1*np.random.rand(len(x)) + slope*x

#plt.plot(x,y)
#plt.show()

p, z, Sn, nx = m.mk_trend(len(y), np.arange(len(y)), y)
print('p, z, Sn, nx:')
print(p, z, Sn, nx)

slope2, intercept, lo_slope, up_slope = mstats.theilslopes(y)
print('slope2, intercept, lo_slope, up_slope:')
print(slope2, intercept, lo_slope, up_slope)

res_smk = sk.seakeni(y, 365)
print(res_smk)

print('Summary:')
print("mk fortran : {}, err[%] = {:.2f}".format(
    Sn * freq, 100 * (slope - Sn * freq) / slope))
print("mk scipy   : {}, err[%] = {:.2f}".format(
    slope2 * freq, 100 * (slope - slope2 * freq) / slope))
print("smk fortran: {}, err[%] = {:.2e}".format(
    res_smk[1], 100 * (slope - res_smk[1]) / slope))
def trend_CI(x_var, y_var, n_boot=1000, ci=95, trendtype="linreg", q=0.5, frac=0.6, it=3, autocorr=None, CItype="bootstrap"): """calculates bootstrap confidence interval and significance level for trend, ignoring autocorrelation or accounting for it Parameters ---------- x_var : list independent variable y_var : list dependent variable, same length as x_var q : int, optional, only if trendtype==quantreg quantile for which regression is to be calculated n : int, optional number of bootstrap samples ci : int, optional confidence level. Default is for 95% confidence interval frac : int, optional, only if trendtype==lowess lowess parameter (fraction of time period length used in local regression) it : int, optional, only if trendtype==lowess lowess parameter (numbre of iterations) autocorr : str, optional way of accounting for autocorrelation, possible values: None, "bootstrap" trendtype : str, optional method of trend derivation, possible values: lowess, linreg, quantreg, TheilSen CItype : str, optional method of CI derivation, possible values: "analytical" and "bootstrap". if trendtype is "lowess", CItype will be set to None if CItype is "analytical": autocorrelation will be set to None Results ------- returns library with following elements: slope - slope of the trend CI_high - CI on the slope value CI_low - as above pvalue - trend's significance level trend - trend line, or rather its y values for all x_var trendCI_high - confidence interval for each value of y trendCI_low - as above Remarks ------- the fit function ocassionally crashes on resampled data. The workaround is to use try statement """ import numpy as np import pandas as pd #for linreg import statsmodels.api as sm from statsmodels.regression.linear_model import OLS #for arima import statsmodels.tsa as tsa #for quantreg import statsmodels.formula.api as smf from statsmodels.regression.quantile_regression import QuantReg #for lowess import statsmodels.nonparametric.api as npsm #other from statsmodels.distributions.empirical_distribution import ECDF from scipy.stats import mstats, mannwhitneyu, t, kendalltau from arch.bootstrap import StationaryBootstrap, IIDBootstrap #preparing data if CItype=="analytical" and trendtype=="TheilSen": CItype="bootstrap" x_var=np.array(x_var) y_var=np.ma.masked_invalid(y_var) n_data=len(y_var) ci_low=(100-ci)/2 ci_high=100-ci_low #setting bootstrapping function if autocorr=="bootstrap": bs=StationaryBootstrap(3, np.array(range(len(y_var)))) else: bs=IIDBootstrap(np.array(range(len(y_var)))) if trendtype=="quantreg": print "Quantile regression, CI type: "+CItype+", autocorrelation adjustment: "+str(autocorr)+"\n" xydata=pd.DataFrame(np.column_stack([x_var, y_var]), columns=['X', 'Y']) model=smf.quantreg('Y ~ X', xydata) res=model.fit(q=q) intcpt=res.params.Intercept slope=res.params.X pvalue=res.pvalues[1] CI_low=res.conf_int()[0]['X'] CI_high=res.conf_int()[1]['X'] y_pred=res.predict(xydata) #calculating residuals resids=y_var-y_pred #calculate autocorrelation indices autocorr_test(x_var, resids) if CItype=="bootstrap": #bootstrapping bs_trends=np.copy(y_pred).reshape(-1,1) bs_slopes=[] bs_intcpts=[] for data in bs.bootstrap(n_boot): ind=data[0][0] model = smf.quantreg('Y ~ X', xydata.ix[ind,:]) try: res = model.fit(q=q) bs_slopes=bs_slopes+[res.params.X] bs_intcpts=bs_intcpts+[res.params.Intercept] bs_trends=np.append(bs_trends,res.predict(xydata).reshape(-1,1), 1) except: goingdownquietly=1 if trendtype=="linreg": print "Linear regression, CI type: "+CItype+", autocorrelation adjustment: 
"+str(autocorr)+"\n" x_varOLS = sm.add_constant(x_var) model = sm.OLS(y_var, x_varOLS, hasconst=True, missing='drop') res = model.fit() intcpt,slope=res.params pvalue=res.pvalues[1] CI_low,CI_high=res.conf_int()[1] y_pred=res.predict(x_varOLS) #calculating residuals resids=y_var-y_pred #calculate autocorrelation indices autocorr_test(x_var, resids) if CItype=="bootstrap": #bootstrapping for confidence intervals bs_slopes=[] bs_intcpts=[] bs_trends=np.copy(y_pred).reshape(-1,1) for data in bs.bootstrap(n_boot): ind=data[0][0] model = sm.OLS(y_var[ind], x_varOLS[ind,:], hasconst=True, missing='drop') try: res = model.fit() bs_slopes=bs_slopes+[res.params[1]] bs_intcpts=bs_intcpts+[res.params[0]] bs_trends=np.append(bs_trends,res.predict(x_varOLS).reshape(-1,1), 1) except: goingdownquietly=1 if trendtype=="TheilSen": # print "Theil-Sen slope, CI type: "+CItype+", autocorrelation adjustment: "+str(autocorr)+"\n" #significance of MK tau tau,pvalue=kendalltau(x_var, y_var) # print "raw MK tau:", tau, "raw MK pvalue:", pvalue #TS slope and confidence intervals slope,intercept,CI_low,CI_high=mstats.theilslopes(y_var, x_var, alpha=0.95) #getting slope line's y values y_pred=intercept+slope*x_var #calculating residuals resids=y_var-y_pred #calculate autocorrelation indices autocorr_test(x_var, resids) if CItype=="bootstrap": #bootstrapping for confidence intervals bs_slopes=[] bs_intcpts=[] bs_trends=np.copy(y_pred).reshape(-1,1) for data in bs.bootstrap(n_boot): ind=data[0][0] res=mstats.theilslopes(y_var[ind], x_var[ind], alpha=0.95) bs_slopes=bs_slopes+[res[0]] bs_intcpts=bs_intcpts+[res[1]] bs_trends=np.append(bs_trends, (res[1]+res[0]*x_var).reshape(-1,1), 1) if trendtype=="lowess": print "Lowess\n" temp=dict(npsm.lowess(y_var, x_var, frac=frac, it=it, missing="drop")) y_pred=np.array(map(temp.get, x_var)).astype("float").reshape(-1,1) bs_trends=np.copy(y_pred) for data in bs.bootstrap(n_boot): ind=data[0][0] try: temp = dict(npsm.lowess(y_var[ind], x_var[ind], frac=frac, it=it, missing="drop")) temp=np.array(map(temp.get, x_var)).astype("float").reshape(-1,1) pred=pd.DataFrame(temp, index=x_var) temp_interp=pred.interpolate().values bs_trends=np.append(bs_trends, temp_interp, 1) except: goingdownquietly=1 #calculating final values of CI and p-value #skipping when lowess if trendtype=="lowess": CI_low=np.nan CI_high=np.nan slope=np.nan intcpt=np.nan pvalue=np.nan confint=np.nanpercentile(bs_trends, [ci_low,ci_high], 1) trendCI_low=confint[:,0] trendCI_high=confint[:,1] else: if CItype=="bootstrap": #values for slope, intercept and trend can be obtained as medians of bootstrap distributions, but normally analytical parameters are used instead # it the bootstrap bias (difference between analytical values and bootstap median) is strong, it might be better to use bootstrap values. # These three lines would need to be uncommented then # slope=np.median(bs_slopes) # intcpt=np.median(bs_intcpts) # trend=intcpt+slope*x_var #these are from bootstrap too, but needs to be used for this accounts for autocorrelation, which is the point of this script CI_low,CI_high=np.percentile(bs_slopes, [5, 95]) ecdf=ECDF(bs_slopes) pvalue=ecdf(0) #this makes sure we are calculating p-value on the correct side of the distribution. 
That will be one-sided pvalue if pvalue>0.5: pvalue=1-pvalue confint=np.nanpercentile(bs_trends, [ci_low,ci_high], 1) print "bs_trends:", bs_trends.shape, confint.shape trendCI_low=confint[:,0] trendCI_high=confint[:,1] else: #this is for analytical calculation of trend confidence interval #it happens in the same way for each of the trend types, thus it is done here, not under the trendtype subroutines #making sure x are floats xtemp=np.array(x_var)*1.0 #squared anomaly squanom=(xtemp-np.mean(xtemp))**2 temp=((1./len(x_var))+(squanom/sum(squanom)))**0.5 #standard error of estmation see=(np.nansum((np.array(y_var)-np.nanmean(y_pred))**2)/len(x_var))**0.5 #adjusting ci ci_adj=1-((1-ci/100.)/2) #accounting for uncertainty in mean through student's t tcomp=t.ppf(ci_adj, len(x_var)-2) #confidence interval cint=tcomp*see*temp #for trend only trendCI_high=y_pred+cint trendCI_low=y_pred-cint print trendtype, "slope:",slope, "pvalue (one sided):", pvalue, "conf interval:", CI_low, CI_high, "autocorrelation adjustment:", autocorr, "\n" output={"slope":slope, "CI_high":CI_high, "CI_low":CI_high, "pvalue":pvalue, "trend": y_pred, "trendCI_low":trendCI_low, "trendCI_high":trendCI_high} return output
def compute_trends_current(s_monthly, periods, only_yearly=True): """Compute trends for station Slightly modified code from original trends interface developed by A. Mortier. Main changes applied: - Keep NaNs """ #sm = to_monthly_current_trends_interface(s0, MIN_DIM) d = dict(month=s_monthly.index.month, year=s_monthly.index.year, value=s_monthly.values) mobs = pd.DataFrame(d) mobs['season'] = mobs.apply( lambda row: _get_season_current(row['month'], row['year']), axis=1) mobs = mobs.dropna(subset=['value']) #trends with yearly and seasonal averages seasons = ['spring', 'summer', 'autumn', 'winter', 'all'] yrs = np.unique(mobs['year']) data = {} for i, seas in enumerate(seasons): if only_yearly and not seas == 'all': continue #initialize seasonal object data[seas] = {'date': [], 'jsdate': [], 'val': []} #filter the months for yr in yrs: if seas != 'all': catch = mobs[mobs['season'].str.contains(seas + '-' + str(yr))] else: catch = mobs[mobs['season'].str.contains('-' + str(yr))] date = _mid_season_current(seas, yr) data[seas]['date'].append(date) epoch = datetime.datetime(1970, 1, 1) data[seas]['jsdate'] = [(dat - epoch).total_seconds() * 1000 for dat in data[seas]['date']] #needs 4 seasons to compute seasonal average to avoid biases if (seas == 'all') & (len(np.unique(catch['season'].values)) < 4): data[seas]['val'].append(np.nan) else: data[seas]['val'].append(np.nanmean(catch['value'])) #trends for this season data[seas]['trends'] = {} #filter period for period in periods: p0 = int(period[:4]) p1 = int(period[5:]) data[seas]['trends'][period] = {} #Mann-Kendall test x = np.array(data[seas]['jsdate']) y = np.array(data[seas]['val']) #works only on not nan values x = x[~np.isnan(y)] y = y[~np.isnan(y)] #filtering to the period limit jsp0 = (datetime.datetime(p0, 1, 1) - epoch).total_seconds() * 1000 jsp1 = (datetime.datetime(p1, 12, 31) - epoch).total_seconds() * 1000 y = y[(x >= jsp0) & (x <= jsp1)] x = x[(x >= jsp0) & (x <= jsp1)] if len(x) > 2: #kendall [tau, pval] = kendalltau(x, y) data[seas]['trends'][period]['pval'] = pval #theil slope res = theilslopes(y, x, 0.9) reg = res[0] * np.asarray(x) + res[1] * np.ones(len(x)) slp = res[0] * 1000 * 60 * 60 * 24 * 365 / reg[ 0] #slp per milliseconds to slp per year data[seas]['trends'][period]['slp'] = slp * 100 #in percent data[seas]['trends'][period]['reg0'] = reg[0] data[seas]['trends'][period]['t0'] = x[0] data[seas]['trends'][period]['n'] = len(y) else: data[seas]['trends'][period]['pval'] = None data[seas]['trends'][period]['slp'] = None data[seas]['trends'][period]['reg0'] = None data[seas]['trends'][period]['t0'] = None data[seas]['trends'][period]['n'] = len(y) return data
def _calc_skew_angle(cls, source: OffsetSeries) -> Tuple[float, float]:
    # We use ordinary linear regression just for the R^2 value (below function does not provide it)
    _, _, rval, _, _ = scipy_stats.linregress(source.reception_times, source.offsets)
    # Apply robust linear regression - note that (x, y) is flipped in the argument list
    medslope, _, _, _ = scipy_mstats.theilslopes(source.offsets, source.reception_times)
    return medslope, rval ** 2
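# An illustrative stand-alone version of the skew estimate above; the data and
# names are invented. Theil-Sen supplies the robust slope (the clock skew),
# while ordinary linregress is only consulted for its r-value.
import numpy as np
from scipy import stats as scipy_stats
from scipy.stats import mstats as scipy_mstats

reception_times = np.linspace(0.0, 100.0, 200)
offsets = 1e-4 * reception_times + np.random.normal(0.0, 1e-3, reception_times.size)

_, _, rval, _, _ = scipy_stats.linregress(reception_times, offsets)
medslope, _, _, _ = scipy_mstats.theilslopes(offsets, reception_times)
skew, r_squared = medslope, rval ** 2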
def compute_trends_current(s_monthly, periods, only_yearly=True): """ Compute trends for station. Used in the trends interface to compute the trends before. 05.06.19. Slightly modified code from original trends interface developed by A. Mortier. Parameters ------------------------ s_monthly : pd.DataFrame Dataframe containing montly values of data. periods : list[str] List containing periods. only_yearly : boolean Returns ------------------------ data : dict Dictionary containing the trends. Main changes applied: - Keep NaNs """ d = dict(month=s_monthly.index.month, year=s_monthly.index.year, value=s_monthly.values) mobs = pd.DataFrame(d) mobs['season'] = mobs.apply( lambda row: _get_season_current(row['month'], row['year']), axis=1) # drop rows where value = nan. mobs = mobs.dropna(subset=['value']) # trends with yearly and seasonal averages seasons = ['spring', 'summer', 'autumn', 'winter', 'all'] yrs = np.unique(mobs['year']) #print('yrs {}'.format(yrs)) data = {} # added to minimize the computation for i, seas in enumerate(seasons): if only_yearly and not seas == 'all': continue # initialize seasonal object data[seas] = {'date': [], 'jsdate': [], 'val': []} # filter the months for yr in yrs: if seas != 'all': catch = mobs[mobs['season'].str.contains(seas + '-' + str(yr))] else: catch = mobs[mobs['season'].str.contains('-' + str(yr))] date = _mid_season_current(seas, yr) data[seas]['date'].append(date) epoch = datetime.datetime(1970, 1, 1) data[seas]['jsdate'] = [(dat - epoch).total_seconds() * 1000 for dat in data[seas]['date']] # needs 4 seasons to compute seasonal average to avoid biases if (seas == 'all') & (len(np.unique(catch['season'].values)) < 4): data[seas]['val'].append(np.nan) else: #print(catch['value']) data[seas]['val'].append(np.nanmean(catch['value'])) # trends for this season data[seas]['trends'] = {} # filter period for period in periods: p0 = int(period[:4]) p1 = int(period[5:]) data[seas]['trends'][period] = {} # Mann-Kendall test x = np.array(data[seas]['jsdate']) y = np.array(data[seas]['val']) len_period = len(y) # works only on not nan values x = x[~np.isnan(y)] # Better ith np.isfinite() y = y[~np.isnan(y)] # filtering to the period limit jsp0 = (datetime.datetime(p0, 1, 1) - epoch).total_seconds() * 1000 jsp1 = (datetime.datetime(p1, 12, 31) - epoch).total_seconds() * 1000 y = y[(x >= jsp0) & (x <= jsp1)] x = x[(x >= jsp0) & (x <= jsp1)] # Making sure there is at least 75% coverage in the data period. 
# and that we have more than two points if len(y) / len_period >= 0.75 and len(y) > 1: # Kendall # TODO THIS IS WHERE YOU SHOULD ASK AUGUSTIN HOW THINGS # SHOULD BE RESTRICTED BY KENTAL TAu [tau, pval] = kendalltau(x, y) #print('pval {}'.format(pval)) data[seas]['trends'][period]['pval'] = pval if pval < 0.1: # Theil slope res = theilslopes(y, x, 0.9) medslope, medintercept, lo_slope, up_slope = res reg = medslope * np.asarray(x) + medintercept * np.ones( len(x)) slp = res[0] * 1000 * 60 * 60 * 24 * 365.25 / reg[ 0] # slp per milliseconds to slp per year data[seas]['trends'][period][ 'slp'] = slp * 100 # in percent data[seas]['trends'][period]['reg0'] = reg[0] data[seas]['trends'][period]['t0'] = x[0] data[seas]['trends'][period]['n'] = len(y) else: data[seas]['trends'][period]['pval'] = None data[seas]['trends'][period]['slp'] = None data[seas]['trends'][period]['reg0'] = None data[seas]['trends'][period]['t0'] = None data[seas]['trends'][period]['n'] = len(y) else: data[seas]['trends'][period]['pval'] = None data[seas]['trends'][period]['slp'] = None data[seas]['trends'][period]['reg0'] = None data[seas]['trends'][period]['t0'] = None data[seas]['trends'][period]['n'] = len(y) return data """ def test_unitconversion_surface_conc(): a = 10 temp = unitconv_sfc_conc(a, 2) A = unitconv_sfc_conc_bck(temp, 2) assert np.abs(a - A) < 0.000001 """ """
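# A sketch of the unit handling in the trend computations above, with invented
# numbers: x is in milliseconds since the epoch, so the raw Theil-Sen slope is
# "per millisecond" and is converted to percent per year by scaling with the
# number of milliseconds in a year and normalising by the first point of the
# regression line.
import numpy as np
from scipy.stats import theilslopes

ms_per_year = 1000 * 60 * 60 * 24 * 365.25
x = np.arange(10) * ms_per_year            # ten hypothetical yearly timestamps [ms]
y = 0.5 + 0.01 * np.arange(10)             # value rising by 0.01 per year

slope, intercept, _, _ = theilslopes(y, x, 0.9)
reg = slope * x + intercept
slp_percent_per_year = slope * ms_per_year / reg[0] * 100   # ~2% per year relative to the first value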
matched = np.array(
    [x[idx], xr, y[idx], yr, lums[idx], results["m"]]).T
names = [
    "x_orig", "x_match", "y_orig", "y_match", "lum_orig", "lum_match"
]
tbl_matched = Table(data=matched, names=names)

###################################################################

n = 26
bins = np.logspace(-3, 2, n)

mask = tbl_matched["lum_orig"] > 0.5
f = theilslopes(tbl_matched["lum_match"][mask],
                tbl_matched["lum_orig"][mask])[0]

mass_orig = imf.mass_from_luminosity(tbl_matched["lum_orig"])
mass_found = imf.mass_from_luminosity(results["m"] / f)
mass_match = imf.mass_from_luminosity(tbl_matched["lum_match"] / f)
mass_ratio = mass_match / mass_orig

results.add_column(Column(data=mass_found, name="mass_found"))

tbl_mass = Table(data=[mass_orig, mass_match],
                 names=["mass_orig", "mass_match"])
tbl_matched = hstack(tbl_matched, tbl_mass)

tbl_stats = imf.binned_clipped_stats(mass_orig, mass_ratio, bins)

# mask = (tbl_stats["std"] > 0) * (tbl_stats["std"] < 99) * (bins > 1E-2)[1:]
# tstep = len(dates) # tstep = (edyr-styr+1)*12 for m, model in enumerate(models): for v, var in enumerate(vars): data = Dataset('%s%s/%s/%s_Amon_%s_%s' % (root, model, res[1], var, model, filename2)).variables[var][stmon:, :, :] mask = IO.Land_Mask(root, model) data_land_monthly = nanmean(nanmean(data*mask, axis=2), axis=1) # calculate global annual mean data_land_annual = vstack([sum(data_land_monthly[mon:mon+12]) for mon in xrange(0, len(data_land_monthly), 12)]) # calculate moving trend slope = np.empty((edyr-styr-9, edyr-styr-9)) slope.fill(np.nan) for st in xrange(0, edyr - styr - 9): for ed in xrange(st+10, edyr - styr + 1): slope[ed - 10, st] = mstats.theilslopes(data_land_annual[st:ed], alpha=0.95)[0] # Mapping print model, var x = np.arange(styr, edyr-9, 1.) y = np.arange(styr+10, edyr+1, 1.) X, Y = np.meshgrid(x, y) # create figure clevs = arange(limits[v][0], limits[v][1]+0.01, (limits[v][1]-limits[v][0])/100) cblevs = arange(limits[v][0], limits[v][1]+0.01, round((limits[v][1]-limits[v][0])/10, 2)) fig = plt.figure(figsize=(12, 8), dpi=100, facecolor="white") font = {'family': 'serif', 'color': 'darkred', 'weight': 'normal', 'size': 50} im = plt.contourf(X, Y, slope, clevs, cmap=plt.cm.seismic) cb = plt.colorbar(im, ticks=cblevs) plt.xlabel("STARTING YEAR") plt.ylabel("ENDING YEAR")
def TheilSenXY(x, y):
    res = mstats.theilslopes(x, y)
    return res[0][0], res[1][0]
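# For reference, a plain call with 1-D inputs (made-up numbers): the slope and
# intercept come back as scalar floats, so no extra indexing is needed in the
# common case.
import numpy as np
from scipy.stats import mstats

y = np.array([1.0, 2.1, 2.9, 4.2])
x = np.array([0.0, 1.0, 2.0, 3.0])
slope, intercept, lo_slope, up_slope = mstats.theilslopes(y, x)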