def prepare_ocn_wv_data(pattern_path):
    """
    Open Sentinel-1 WV OCN products and build the stacked cross-spectra array.

    :param pattern_path: glob pattern (could also be a list of paths) forwarded
        to ``xarray.open_mfdataset``
    :return: tuple ``(spectrum, ocn_wv_ds)`` where ``spectrum`` stacks the
        converted real and imaginary cross-spectra on a new axis 3 and
        ``ocn_wv_ds`` is the combined dataset
    """
    # example of pattern:
    # '/home/datawork-cersat-public/cache/project/mpc-sentinel1/data/esa/sentinel-1a/L2/WV/S1A_WV_OCN__2S/2020/129/*.SAFE/measurement/s1*nc'
    logging.info('start reading S1 WV OCN data')
    # ``concat_dim`` is intentionally not given: with combine='by_coords' the
    # files are ordered from their coordinates, older xarray releases ignore
    # ``concat_dim`` in that mode and recent ones raise a ValueError when both
    # are passed together.
    ocn_wv_ds = xarray.open_mfdataset(pattern_path,
                                      combine='by_coords',
                                      preprocess=preproc_ocn_wv)
    logging.info('Nb pts in dataset: %s', ocn_wv_ds['todSAR'].size)
    logging.info('SAR data ready to be used')
    cspc_re = ocn_wv_ds['oswQualityCrossSpectraRe'].values
    cspc_im = ocn_wv_ds['oswQualityCrossSpectraIm'].values
    # local names avoid shadowing the standard ``re`` module
    real_part = preprocess.conv_real(cspc_re)
    imag_part = preprocess.conv_imaginary(cspc_im)
    logging.info('re : %s', real_part.shape)
    logging.info('im : %s', imag_part.shape)
    # real and imaginary parts become the two entries of a new axis 3
    spectrum = np.stack((real_part, imag_part), axis=3)
    logging.info('spectrum shape : %s', spectrum.shape)
    return spectrum, ocn_wv_ds
def read_input_files(single_input_ref_file):
    """
    Open the reference dataset and build the stacked cross-spectra array.

    The dataset was provided by J. Stopa (16 Dec 2020); input and output are
    stored in the same files.

    :param single_input_ref_file: path or pattern forwarded to
        ``xarray.open_mfdataset``
    :return: tuple ``(spectrum, dsref)`` where ``spectrum`` stacks the converted
        real and imaginary cross-spectra on a new axis 3 and ``dsref`` is the
        opened dataset
    """
    logging.info('arbitrary chosen input ref file : %s', single_input_ref_file)
    # times are left undecoded; the whole dataset is used (no date filtering)
    dsref = xarray.open_mfdataset(
        single_input_ref_file,
        preprocess=preproc_ref_input,
        decode_times=False,
    )
    logging.info('timeSAR : %s', dsref['timeSAR'].size)
    raw_re, raw_im = [
        dsref[name].values.squeeze() for name in ('cspcRe', 'cspcIm')
    ]
    real_part = preprocess.conv_real(raw_re)
    imag_part = preprocess.conv_imaginary(raw_im)
    spectrum = np.stack((real_part, imag_part), axis=3)
    logging.debug('spectrum shape : %s', spectrum.shape)
    return spectrum, dsref
def _basename_as_text(path):
    """Return the basename of *path* as ``str``, decoding it when it is ``bytes``."""
    name = os.path.basename(path)
    if isinstance(name, bytes):
        name = name.decode()
    return name


def prepare_training_dataset_core(ds_train_raw, validation_dataset=False):
    """
    Build the normalized dataset and the spectrum array, for training and also
    for validation.

    Every output variable is indexed on a ``time`` coordinate parsed from the
    fifth dash-separated field of each ``fileNameL2`` basename (format
    ``%Y%m%dt%H%M%S``).

    :param ds_train_raw: dataset providing ``timeSAR``, ``py_S``, ``sigma0``,
        ``normalizedVariance``, ``fileNameL2`` (optionally ``fileNameFull``),
        ``th``, ``k``, ``cspcRe``, ``cspcIm``, ``latSAR``, ``lonSAR``,
        ``incidenceAngle`` and optionally ``hsSM``
    :param validation_dataset: when True, ``py_cspcImX``, ``py_cspcReX`` and
        ``fileNameL2`` are copied to the output as well
    :return: tuple ``(spectrum, ds_training_normalized)`` where ``spectrum``
        stacks the converted ``cspcRe`` / ``cspcIm`` on a new axis 3
    """
    logging.info('Nb pts in dataset: %s', ds_train_raw['timeSAR'].size)
    varstoadd = [
        'S', 'cwave', 'dxdt', 'latlonSARcossin', 'todSAR', 'incidence',
        'satellite', 'oswQualityCrossSpectraRe', 'oswQualityCrossSpectraIm'
    ]
    # additional_vars_for_validation = ['oswLon','oswLat','oswLandFlag','oswIncidenceAngle','oswWindSpeed','platformName',
    #                                   'nrcs','nv','heading','oswK','oswNrcs']
    # varstoadd += additional_vars_for_validation
    if validation_dataset:
        varstoadd += ['py_cspcImX', 'py_cspcReX', 'fileNameL2']
    if 'hsSM' in ds_train_raw:
        varstoadd.append('hsSM')

    S = ds_train_raw['py_S'].values
    s0 = ds_train_raw['sigma0']
    nv = ds_train_raw['normalizedVariance'].values
    ds_training_normalized = xarray.Dataset()
    timeSAR_vals = ds_train_raw['timeSAR'].values  # hours since ....

    # time axis shared by all output variables, parsed from the L2 file names
    timeSAR_seconds = np.array([
        datetime.datetime.strptime(
            _basename_as_text(fup).split('-')[4], '%Y%m%dt%H%M%S')
        for fup in ds_train_raw['fileNameL2'].values
    ])
    ths1 = ds_train_raw['th'].values
    ks1 = ds_train_raw['k'].values
    if 'fileNameFull' in ds_train_raw:
        fpaths = ds_train_raw['fileNameFull'].values
    else:
        # 2019 dataset is a bit different
        fpaths = ds_train_raw['fileNameL2'].values

    # File names can be stored as bytes. Indexing bytes yields an int in
    # Python 3, so comparing the third character with 'a' would always be
    # False; the names are decoded to text before the comparison.
    satellite_codes = [_basename_as_text(fpath)[0:3] for fpath in fpaths]
    sattelites = np.array(satellite_codes)
    satellites_int = np.array([code[2] == 'a'
                               for code in satellite_codes]).astype(int)

    cspcRe = ds_train_raw['cspcRe'].values
    cspcIm = ds_train_raw['cspcIm'].values

    # NOTE: several branches below ('oswK', 'incidence_angle', 'platformName',
    # 'nrcs', 'heading', 'nv') are only reached if the matching names are added
    # to ``varstoadd`` (see the commented list above).
    for vv in varstoadd:
        logging.info('start format variable :%s', vv)
        if vv == 'cwave':
            dimszi = ['time', 'cwavedim']
            coordi = {'time': timeSAR_seconds, 'cwavedim': np.arange(22)}
            logging.debug('S %s s0: %s nv: %s', S.shape, s0.shape, nv.shape)
            # S columns followed by sigma0 and normalized variance
            cwave = np.vstack([S.T, s0, nv]).T  # found L77 in preprocess.py
            logging.debug('cwave vals: %s', cwave.shape)
            cwave = preprocess.conv_cwave(cwave)
            ds_training_normalized[vv] = xarray.DataArray(data=cwave,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['fileNameFull', 'fileNameL2']:
            # the paths are stored exactly as read from the input dataset
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(data=fpaths,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv == 'S':  # to ease the comparison with Justin files
            dimszi = ['time', 'Sdim']
            coordi = {'time': timeSAR_seconds, 'Sdim': np.arange(20)}
            ds_training_normalized[vv] = xarray.DataArray(data=S,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv == 'dxdt':
            # dx and dt and delta from coloc with alti,
            # see /home/cercache/users/jstopa/sar/empHs/cwaveV5;
            # constant placeholders are used here: a column of zeros and a
            # column of ones
            dxdt = np.column_stack([np.zeros(s0.shape), np.ones(s0.shape)])
            dimszi = ['time', 'dxdtdim']
            coordi = {'time': timeSAR_seconds, 'dxdtdim': np.arange(2)}
            ds_training_normalized[vv] = xarray.DataArray(data=dxdt,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv == 'latlonSARcossin':
            latSARcossin = preprocess.conv_position(
                ds_train_raw['latSAR'].values)  # Gets cos and sin
            lonSARcossin = preprocess.conv_position(
                ds_train_raw['lonSAR'].values)
            latlonSARcossin = np.hstack([latSARcossin, lonSARcossin])
            dimszi = ['time', 'latlondim']
            coordi = {'time': timeSAR_seconds, 'latlondim': np.arange(4)}
            ds_training_normalized[vv] = xarray.DataArray(
                data=latlonSARcossin, dims=dimszi, coords=coordi)
        elif vv == 'todSAR':
            dimszi = ['time']
            new_dates_dt = np.array(
                [from_np64_to_dt(dt64) for dt64 in timeSAR_vals])
            # see https://github.com/grouny/sar_hs_nn/blob/c05322e6635c6d77409e36537d7c3b58788e7322/sarhspredictor/lib/sarhs/preprocess.py#L11
            unit = "hours since 2010-01-01T00:00:00Z UTC"
            new_dates_num = np.array(
                [netCDF4.date2num(dfg, unit) for dfg in new_dates_dt])
            coordi = {'time': timeSAR_seconds}
            todSAR = conv_time(new_dates_num)
            ds_training_normalized[vv] = xarray.DataArray(data=todSAR,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv == 'oswK':
            dimszi = ['time', 'oswWavenumberBinSize']
            coordi = {
                'time': timeSAR_seconds,
                'oswWavenumberBinSize': np.arange(len(ks1))
            }
            ds_training_normalized[vv] = xarray.DataArray(data=ks1,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv == 'incidence':
            dimszi = ['time', 'incdim']
            coordi = {'time': timeSAR_seconds, 'incdim': np.arange(2)}
            incidence = preprocess.conv_incidence(
                ds_train_raw['incidenceAngle'].values.squeeze())
            ds_training_normalized[vv] = xarray.DataArray(data=incidence,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv == 'incidence_angle':
            dimszi = ['time']
            olddims = [
                x for x in ds_train_raw['incidenceAngle'].dims
                if x not in ['oswAzSize', 'oswRaSize']
            ]
            coordi = {}
            for didi in olddims:
                coordi[didi] = ds_train_raw['incidenceAngle'].coords[
                    didi].values
            coordi['time'] = timeSAR_seconds
            incidence = np.array(
                [ds_train_raw['incidenceAngle'].values.squeeze()])
            ds_training_normalized[vv] = xarray.DataArray(data=incidence,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv == 'satellite':
            # 1 when the third character of the file basename is 'a', else 0
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(
                data=satellites_int, dims=dimszi, coords=coordi)
        elif vv == 'platformName':
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(data=sattelites,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv == 'nrcs':
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(data=s0,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv == 'heading':
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(
                data=ds_train_raw['trackAngle'].values,
                dims=dimszi,
                coords=coordi)
        elif vv == 'nv':
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(data=nv,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['oswQualityCrossSpectraRe', 'oswQualityCrossSpectraIm']:
            # the raw (unconverted) cross spectra are stored under the OCN names
            if vv == 'oswQualityCrossSpectraRe':
                datatmp = cspcRe
            else:
                datatmp = cspcIm
            coordi = {
                'time': timeSAR_seconds,
                'oswAngularBinSize': np.arange(len(ths1)),
                'oswWavenumberBinSize': np.arange(len(ks1)),
            }
            dimsadd = ['time', 'oswAngularBinSize', 'oswWavenumberBinSize']
            ds_training_normalized[vv] = xarray.DataArray(data=datatmp,
                                                          dims=dimsadd,
                                                          coords=coordi)
        elif vv in ['py_cspcImX', 'py_cspcReX']:
            # original dims and coords are kept, only 'time' is replaced
            datatmp = ds_train_raw[vv].values
            coordi = ds_train_raw[vv].coords
            coordi['time'] = timeSAR_seconds
            dimsadd = ds_train_raw[vv].dims
            ds_training_normalized[vv] = xarray.DataArray(data=datatmp,
                                                          dims=dimsadd,
                                                          coords=coordi)
        else:
            # any other variable (e.g. 'hsSM'): squeezed values stored on 'time'
            datatmp = ds_train_raw[vv].values.squeeze()
            olddims = [
                x for x in ds_train_raw[vv].dims
                if x not in ['oswAzSize', 'oswRaSize']
            ]
            coordi = {}
            for didi in olddims:
                coordi[didi] = ds_train_raw[vv].coords[didi].values
            coordi['time'] = timeSAR_seconds
            dimsadd = ['time']
            logging.info('data: %s', datatmp.shape)
            ds_training_normalized[vv] = xarray.DataArray(data=datatmp,
                                                          dims=dimsadd,
                                                          coords=coordi)
    logging.debug('newds: %s', ds_training_normalized)
    logging.info('SAR data ready to be used')
    # local names avoid shadowing the standard ``re`` module
    real_part = preprocess.conv_real(cspcRe)
    imag_part = preprocess.conv_imaginary(cspcIm)
    logging.info('re : %s', real_part.shape)
    logging.info('im : %s', imag_part.shape)
    spectrum = np.stack((real_part, imag_part), axis=3)
    logging.info('spectrum shape : %s', spectrum.shape)
    return spectrum, ds_training_normalized
def split_aggregated_ds(file_src, file_dest):
    """
    Split an aggregated HDF5 file into one group per period of years.

    For each group, the examples of the listed years that have no NaN in
    ``py_S``, ``sigma0`` and ``normalizedVariance`` are selected, converted
    with the ``preprocess`` helpers and written to ``file_dest``.

    :param file_src: path of the aggregated HDF5 file to read
    :param file_dest: path of the HDF5 file to create (opened in 'w' mode)
    :return: None
    """
    groups = {'2015_2016': [2015, 2016], '2017': [2017], '2018': [2018]}

    # List the fields of the source file with their dtype.
    with h5py.File(file_src, 'r') as f:
        for field in list(f.keys()):
            print(f'{field}: {f[field].dtype}')

    with h5py.File(file_src, 'r') as fs, h5py.File(file_dest, 'w') as fd:
        for group_name, years in groups.items():
            grp = fd.create_group(group_name)

            def write_column(name, values):
                # 1D values are stored with an explicit (n, 1) shape
                grp.create_dataset(name,
                                   data=values,
                                   shape=(values.shape[0], 1))

            # Mask of the examples belonging to the requested years.
            all_years = fs['year'][:]
            indices = np.zeros(all_years.shape, dtype=bool)
            for wanted_year in years:
                indices |= (all_years == wanted_year)

            # Discard the examples holding NaN values.
            indices[np.isnan(fs['py_S'][:]).any(axis=1)] = False
            indices[np.isnan(fs['sigma0'][:])] = False
            indices[np.isnan(fs['normalizedVariance'][:])] = False

            n_found = indices.sum()
            print(f'Found {n_found} events from years: ', years)

            grp.create_dataset('year', data=fs['year'][indices])

            # 22 CWAVE features: py_S columns, then sigma0, then normalized variance.
            cwave_parts = [
                fs['py_S'][indices, ...],
                fs['sigma0'][indices].reshape(-1, 1),
                fs['normalizedVariance'][indices].reshape(-1, 1),
            ]
            # Remove extrema, then standardize with hardcoded mean,vars.
            grp.create_dataset('cwave',
                               data=preprocess.conv_cwave(
                                   np.hstack(cwave_parts)))

            # Additional features.
            dx = preprocess.conv_dx(fs['dx'][indices])
            dt = preprocess.conv_dt(fs['dt'][indices])
            grp.create_dataset('dxdt', data=np.column_stack([dx, dt]))

            latSAR = fs['latSAR'][indices]
            lonSAR = fs['lonSAR'][indices]
            lat_cossin = preprocess.conv_position(latSAR)  # Gets cos and sin
            lon_cossin = preprocess.conv_position(lonSAR)
            grp.create_dataset('latlonSAR',
                               data=np.column_stack([latSAR, lonSAR]))
            grp.create_dataset('latlonSARcossin',
                               data=np.hstack([lat_cossin, lon_cossin]))

            timeSAR = fs['timeSAR'][indices]
            todSAR = preprocess.conv_time(timeSAR)
            write_column('timeSAR', timeSAR)
            write_column('todSAR', todSAR)

            # Separates into 2 var.
            grp.create_dataset('incidence',
                               data=preprocess.conv_incidence(
                                   fs['incidenceAngle'][indices]))

            write_column('satellite', fs['satellite'][indices])

            # Altimeter
            write_column('hsALT', fs['hsALT'][indices])

            # Spectral data: real and imaginary parts stacked on a new axis 3.
            real_part = preprocess.conv_real(fs['cspcRe'][indices, ...])
            imag_part = preprocess.conv_imaginary(fs['cspcIm'][indices, ...])
            grp.create_dataset('spectrum',
                               data=np.stack((real_part, imag_part), axis=3))
            print(f'Done with {years}')
    print('Done')