def legacy_wrapper(data_test0, subchunk, b_deb):
    """Legacy Fortran wrapper that loops over each pixel"""
    res = np.empty([subchunk.dim[1], 4])
    res[:] = np.nan  # NaN matrix by default
    for ii_sub in range(subchunk.dim[1]):
        data_test = data_test0[:, ii_sub]
        ## Remove tie groups
        data_test[1:][np.diff(data_test) == 0.] = np.nan
        if b_deb:
            print('Data valid:', data_test.size - np.isnan(data_test).sum(), '/', data_test.size)
        if 1:
            ## Original Mann-Kendall test
            bla = data_test[~np.isnan(data_test)]
            if bla.size > 0:
                if len(np.unique(bla)) == 1:
                    p, z, Sn, nx = [0, 0, 0, 0]
                else:
                    p, z, Sn, nx = m.mk_trend(len(data_test), np.arange(len(data_test)), data_test)
            else:
                ## If data_test is empty, the test returns (p, z, Sn, nx) = (1.0, 0.0, 0.5, 0.0)
                p, z, Sn, nx = m.mk_trend(len(data_test), np.arange(len(data_test)), data_test)
        else:
            ## Other test (debug switch)
            p, z, Sn, nx = [0, 0, 0, 0]
            z = data_test.mean()
        res[ii_sub, 0] = p
        res[ii_sub, 1] = z
        res[ii_sub, 2] = Sn
        res[ii_sub, 3] = nx
    if b_deb:
        print('p,z,Sn,nx')
        print(res)
    return res
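## A minimal sketch (not part of the original pipeline) showing how legacy_wrapper
## could be exercised on synthetic data; _demo_legacy_wrapper and _SubchunkStub are
## hypothetical names, and the only attribute the wrapper actually reads is
## subchunk.dim. It assumes `m` is the compiled Fortran mk_trend extension.
def _demo_legacy_wrapper():
    class _SubchunkStub:
        dim = (1, 5)  # (rows, pixels); legacy_wrapper only uses dim[1]
    rng = np.random.default_rng(0)
    data_test0 = rng.random((36, 5))  # 36 dates x 5 pixels
    res = legacy_wrapper(data_test0, _SubchunkStub(), b_deb=0)
    print(res.shape)  # (5, 4): one (p, z, Sn, nx) row per pixel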
def mk_test(y):
    n_zero = (y == 0.0).sum()
    #print("{:.4f} / {} / {} / {}".format(np.isnan(y).sum()/len(y), np.nanmin(y), np.nanmax(y), n_zero))
    ## To avoid 0.0 tie groups that make the sort algorithm of the Fortran
    ## median computation stall in very long loops, add very low fake noise
    ## to all 0.0 values so that the sort performs efficiently.
    y[y == 0.0] = 1.e-6 * np.random.rand(n_zero)
    return m.mk_trend(len(y), np.arange(len(y)), y, 3)
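## Hedged sketch illustrating why mk_test jitters exact zeros: repeated 0.0
## values form one large tie group that the Fortran median sort handles very
## slowly, while noise of at most 1e-6 is far below the signal and leaves the
## estimated slope essentially unchanged. _demo_mk_test is a hypothetical name.
def _demo_mk_test():
    rng = np.random.default_rng(1)
    y = rng.random(120)
    y[::4] = 0.0  # inject the kind of repeated zeros seen in real series
    p, z, Sn, nx = mk_test(y.copy())  # copy: mk_test modifies y in place
    print(p, z, Sn, nx)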
def proc(y):
    n_zero = (y == 0.0).sum()
    #print("{:.4f} / {} / {} / {}".format(np.isnan(y).sum()/len(y), np.nanmin(y), np.nanmax(y), n_zero))
    ## To avoid 0.0 tie groups that make the sort algorithm of the Fortran
    ## median computation stall in very long loops, add very low fake noise
    ## to all 0.0 values so that the sort performs efficiently.
    y[y == 0.0] = 1.e-6 * np.random.rand(n_zero)
    ## p_test and b_deb are module-level globals; y is expected to be a named
    ## pandas Series (y.name), e.g. a column passed in by DataFrame.apply.
    if not p_test[y.name]:
        if b_deb:
            print("INFO: p-value > 0.05 for {}".format(y.name))
        return tuple([np.nan] * 4)
    return m.mk_trend(len(y), np.arange(len(y)), y, 2)
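## proc reads the module-level globals p_test and b_deb and uses y.name, so it
## is meant to be applied column-wise to a pandas DataFrame. A hedged sketch of
## that calling pattern; the column names and p_test content are illustrative.
def _demo_proc():
    import pandas as pd
    global p_test, b_deb
    b_deb = 0
    p_test = {'var_a': True, 'var_b': False}  # e.g. precomputed significance flags
    df = pd.DataFrame(np.random.rand(60, 2), columns=['var_a', 'var_b'])
    res = df.apply(proc)  # one (p, z, Sn, nx) tuple per column
    print(res)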
x = np.arange(freq * 10) / (1. * freq)
y = np.sin(2 * np.pi * x)
slope = 0.2  # [1/year]

# sinus + slope
y += slope * x
# random + slope only
#y = 0.1 * np.random.rand(len(x)) + slope * x

#plt.plot(x, y)
#plt.show()

p, z, Sn, nx = m.mk_trend(len(y), np.arange(len(y)), y)
print('p, z, Sn, nx:')
print(p, z, Sn, nx)

slope2, intercept, lo_slope, up_slope = mstats.theilslopes(y)
print('slope2, intercept, lo_slope, up_slope:')
print(slope2, intercept, lo_slope, up_slope)

res_smk = sk.seakeni(y, 365)
print(res_smk)

print('Summary:')
print("mk fortran : {}, err[%] = {:.2f}".format(Sn * freq, 100 * (slope - Sn * freq) / slope))
print("mk scipy   : {}, err[%] = {:.2f}".format(slope2 * freq, 100 * (slope - slope2 * freq) / slope))
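## The script above assumes `freq` (samples per year, e.g. 365 for daily data)
## is defined earlier. Since Sn from mk_trend is a slope per time step, the
## per-year slope is Sn * freq, which the summary compares to the known 0.2/yr.
## A hedged self-check along the same lines; the 5% tolerance is illustrative.
def _check_slope_recovery(freq=365):
    x = np.arange(freq * 10) / (1. * freq)
    y = np.sin(2 * np.pi * x) + 0.2 * x
    _, _, Sn, _ = m.mk_trend(len(y), np.arange(len(y)), y)
    assert abs(Sn * freq - 0.2) / 0.2 < 0.05  # slope recovered within ~5%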
def processInput_trends(subchunk, parent_iteration, child_iteration):
    """Main routine that computes the trends for one subchunk."""
    #print('INFO: see pid.<pid>.out to monitor trend computation progress')
    #sys.stdout = open('pid.' + str(os.getpid()) + '.out', 'w')
    #print('INFO: see trend.out to monitor trend computation progress')
    #sys.stdout = open('trend.out', 'a')

    ## Debug tool to print process ids and memory usage
    process = psutil.Process(os.getpid())
    current = current_process()
    print(process, current._identity, '{} Mo'.format(process.memory_info().rss / 1024 / 1024))

    if subchunk.input == 'box':
        print('### Chunk {} > subchunk {} started: COL: [{}:{}] ROW: [{}:{}]'.format(
            parent_iteration, child_iteration, *subchunk.get_limits('local', 'str')))
        write_string0 = (param.hash + "_CHUNK" + str(parent_iteration)
                         + "_SUBCHUNK" + str(child_iteration) + "_"
                         + '_'.join(subchunk.get_limits('global', 'str')) + '.nc')
        subchunk_fname = param.output_path / write_string0
        ## Check if the cache file already exists and must be overwritten
        if not param.b_delete:
            if subchunk_fname.is_file():
                print('INFO: {} already exists. Use -d option to overwrite it.'.format(write_string0))
                return
    elif subchunk.input == 'points':
        print('### Chunk {} > subchunk {} started.'.format(parent_iteration, child_iteration))
        print(param.input_file)
        str_date_range = param.input_file.stem.replace('timeseries', '')
        write_string0 = 'merged_trends{}.h5'.format(str_date_range)
        subchunk_fname = param.output_path / write_string0
        ## The result file is always overwritten in the case of point input

    ## Read the input time series file from the main chunk, shaped
    ## (length of time, 500, 500); it may vary if different chunks are used.
    hdf_ts = h5py.File(param.input_file, 'r')

    ## Create temporary storage with the size of the subchunks in the main
    ## chunk, currently configured as 100x100 blocks.
    var_temp_output = np.empty([*subchunk.dim, 4])
    var_temp_output[:] = np.nan  # NaN matrix by default

    ## Parameters for the loop
    b_deb = 0  # flag to print time profiling
    t00 = timer()
    t000 = timer()
    t_mean = 0.
    offsetx = subchunk.get_limits('local', 'tuple')[0]
    offsety = subchunk.get_limits('local', 'tuple')[2]
    print_freq = 20
    tab_prof_valid = []
    tab_prof_zero = []

    hf = h5py.File(subchunk_fname, 'w')

    for tsvar in hdf_ts['vars'].keys():
        for jj_sub in range(subchunk.dim[0]):
            ## Dimension of the variable: (time, x, y).
            ## Preload all the y data here to avoid the overhead of indexing
            ## the dataset at each iteration of the inner loop.
            data_test0 = hdf_ts['vars/' + tsvar][:, jj_sub + offsety, offsetx:offsetx + subchunk.dim[1]]
            for ii_sub in range(subchunk.dim[1]):
                if b_deb:
                    print('---------------')
                    print('jj: {} - ii: {} '.format(jj_sub, ii_sub))
                t0 = timer()
                data_test = data_test0[:, ii_sub]
                ## Remove tie groups
                data_test[1:][np.diff(data_test) == 0.] = np.nan
                slope = 999.0
                if b_deb:
                    print('t0', timer() - t0)
                t0 = timer()
                if b_deb:
                    print('Data valid:', data_test.size - np.isnan(data_test).sum(), '/', data_test.size)

                if 0:
                    ## Disabled debug path: Theil-Sen slope via mstats.
                    ## mstats.theilslopes gives the correct slope, consistent with the
                    ## Python Mann-Kendall Sn score, and is faster than the Fortran
                    ## version; stats.theilslopes gives incorrect values when NaN are
                    ## inside the data.
                    print('Use mstats')
                    data_sen = np.ma.masked_array(data_test, mask=np.isnan(data_test))
                    t0 = timer()
                    slope, intercept, lo_slope, up_slope = mstats.theilslopes(data_sen, alpha=0.1)
                    print('slope, intercept, lo_slope, up_slope:')
                    print(slope, intercept, lo_slope, up_slope)
                    if b_deb:
                        print('t02', timer() - t0)
                    np.savetxt('data_test.dat', data_test.T)
                    sys.exit()

                if b_deb:
                    print('t2', timer() - t0)
                t0 = timer()

                if 1:
                    ## Original Mann-Kendall test
                    bla = data_test[~np.isnan(data_test)]
                    if bla.size > 0:
                        if len(np.unique(bla)) == 1:
                            p, z, Sn, nx = [0, 0, 0, 0]
                        else:
                            #try:
                            #    p, z, Sn, nx = mk_test_timeout(data_test)
                            #except TimeoutError as e:
                            #    print('timeout!')
                            #    p, z, Sn, nx = [0, 0, 0, 0]
                            p, z, Sn, nx = m.mk_trend(len(data_test), np.arange(len(data_test)), data_test)
                    else:
                        ## If data_test is empty, the test returns (p, z, Sn, nx) = (1.0, 0.0, 0.5, 0.0)
                        p, z, Sn, nx = m.mk_trend(len(data_test), np.arange(len(data_test)), data_test)
                else:
                    ## Other test (debug switch)
                    p, z, Sn, nx = [0, 0, 0, 0]
                    z = data_test.mean()

                if b_deb:
                    t4 = timer() - t0
                    if bla.size > 0:
                        if bla.mean() == 0.0:
                            tab_prof_zero.append(t4)
                        else:
                            tab_prof_valid.append(t4)
                    print('t4=', t4)
                    if 0:
                        ## Disabled debug plot of the current pixel time series
                        import matplotlib.pyplot as plt
                        plt.clf()
                        plt.plot(bla)
                        plt.ylim(0, 6.1)
                        ti1 = '{}/{} - {:.3f} s'.format(jj_sub, ii_sub, t4)
                        ti2 = 'min/mean/max/nb/nb_unique {:.3f} {:.3f} {:.3f} {} {}'.format(
                            bla.min(), bla.mean(), bla.max(), len(bla), len(np.unique(bla)))
                        ti3 = 'slope: {}'.format(Sn)
                        plt.title(ti1 + '\n' + ti2 + '\n' + ti3)
                        if Sn == 0.0:
                            plt.savefig('bla.Sn0.{}.{}.png'.format(jj_sub, ii_sub))
                        else:
                            plt.savefig('bla.{}.{}.png'.format(jj_sub, ii_sub))

                t0 = timer()
                if b_deb:
                    print('p,z,slope,nx', p, z, slope, nx)
                    print('p,z,Sn,nx', p, z, Sn, nx)
                var_temp_output[jj_sub, ii_sub, 0] = p
                var_temp_output[jj_sub, ii_sub, 1] = z
                var_temp_output[jj_sub, ii_sub, 2] = Sn
                var_temp_output[jj_sub, ii_sub, 3] = nx

            ## Print efficiency stats
            if (jj_sub + 1) % print_freq == 0:
                elapsed = timer() - t00
                data_stat = hdf_ts['vars/' + tsvar][:, jj_sub + 1 + offsety - print_freq:jj_sub + 1 + offsety,
                                                    offsetx:offsetx + subchunk.dim[1]]
                valid = 100. * (data_stat.size - np.count_nonzero(np.isnan(data_stat))) / data_stat.size
                eff = 1e6 * elapsed / data_stat.size
                print('{} : {}.{}.block[{}-{}] : {:.3f}s elapsed : {:.3f} us/pix/date : {:.2f}% valid'.format(
                    datetime.datetime.now(), parent_iteration, child_iteration,
                    jj_sub + 1 - print_freq, jj_sub + 1, elapsed, eff, valid))
                t00 = timer()
                sys.stdout.flush()

            if 0:
                ## Disabled per-process profiling output
                t_mean += timer() - t00
                print('t00.p{}.it{} {:.3f}s {:.2f}Mo'.format(
                    current._identity[0], ii_sub, t_mean / (ii_sub + 1), process.memory_info().rss / 1024 / 1024))
                v = var_temp_output[ii_sub, :, :]
                for ii in range(4):
                    print('{:.3f}'.format(np.count_nonzero(np.isnan(v[:, ii])) / v[:, ii].size),
                          np.nanmin(v[:, ii]), np.nanmax(v[:, ii]))
                print(np.nanmin(v), np.nanmax(v))
                t00 = timer()

        if b_deb:
            valid = np.array(tab_prof_valid)
            zero = np.array(tab_prof_zero)
            print(valid.mean())
            print(zero.mean())

        print('t000tot.p{} {:.3f}s {:.2f}Mo'.format(
            current._identity, timer() - t000, process.memory_info().rss / 1024 / 1024))

        hf.create_dataset(tsvar + '/pval', data=var_temp_output[:, :, 0])
        hf.create_dataset(tsvar + '/zval', data=var_temp_output[:, :, 1])
        hf.create_dataset(tsvar + '/slope', data=var_temp_output[:, :, 2])
        hf.create_dataset(tsvar + '/len', data=var_temp_output[:, :, 3])

    hf.close()
    print('Subchunk {} completed, saved to {}'.format(child_iteration, subchunk_fname))
    return None
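## Hedged sketch of how the per-subchunk output written above could be read
## back. The dataset layout (<tsvar>/pval, /zval, /slope, /len) follows the
## create_dataset calls in processInput_trends; _read_subchunk_trends is a
## hypothetical helper name, and fname is whatever subchunk_fname was.
def _read_subchunk_trends(fname, tsvar):
    with h5py.File(fname, 'r') as hf:
        pval = hf[tsvar + '/pval'][:]
        slope = hf[tsvar + '/slope'][:]
    return pval, slope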
def mk_test_timeout(data_test):
    ## Placeholder hook: intended to wrap m.mk_trend with a timeout (see the
    ## disabled try/except in processInput_trends); currently a direct call.
    return m.mk_trend(len(data_test), np.arange(len(data_test)), data_test)
def preproc(y):
    ## Second 70% threshold: compute the MK test only if at least 70% of the data are valid
    if (np.count_nonzero(np.isnan(y)) / len(y)) > 0.3:
        if b_deb:
            print("WARNING: More than 30% of NaN")
        return tuple([np.nan] * 4)
    return m.mk_trend(len(y), np.arange(len(y)), y, 1)
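## Quick sketch of the preproc gate: a series with more than 30% NaN is
## short-circuited to an all-NaN result before the Fortran test is called.
## _demo_preproc is a hypothetical name; b_deb is the module-level debug flag.
def _demo_preproc():
    global b_deb
    b_deb = 0
    y = np.random.rand(100)
    y[:40] = np.nan  # 40% NaN > 30% threshold -> no MK test
    print(preproc(y))  # (nan, nan, nan, nan)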