Exemplo n.º 1
0
def prepare_ocn_wv_data(pattern_path):
    """
    Open Sentinel-1 WV OCN files and build the cross-spectra array.

    :param pattern_path: glob pattern, or a list of paths, forwarded to
        xarray.open_mfdataset, e.g.
        '.../S1A_WV_OCN__2S/2020/129/*.SAFE/measurement/s1*nc'
    :return: tuple (spectrum, ocn_wv_ds). `spectrum` is the result of
        preprocess.conv_real and preprocess.conv_imaginary applied to
        'oswQualityCrossSpectraRe' / 'oswQualityCrossSpectraIm' and stacked
        along a new axis 3; `ocn_wv_ds` is the combined dataset.
    """
    logging.info('start reading S1 WV OCN data')
    # With combine='by_coords' the concatenation is inferred from the
    # coordinates. `concat_dim` has no effect in that mode and recent xarray
    # releases reject the combination with a ValueError, so it is not passed.
    ocn_wv_ds = xarray.open_mfdataset(pattern_path,
                                      combine='by_coords',
                                      preprocess=preproc_ocn_wv)
    logging.info('Nb pts in dataset: %s', ocn_wv_ds['todSAR'].size)
    logging.info('SAR data ready to be used')
    cspcRe = ocn_wv_ds['oswQualityCrossSpectraRe'].values
    cspcIm = ocn_wv_ds['oswQualityCrossSpectraIm'].values
    re = preprocess.conv_real(cspcRe)
    im = preprocess.conv_imaginary(cspcIm)
    logging.info('re : %s', re.shape)
    logging.info('im : %s', im.shape)
    # Real and imaginary parts become the two entries of the last axis.
    spectrum = np.stack((re, im), axis=3)
    logging.info('spectrum shape : %s', spectrum.shape)
    return spectrum, ocn_wv_ds
def read_input_files(single_input_ref_file):
    """
    Read the reference dataset (provided by J. Stopa, 16 Dec 2020; inputs and
    outputs live in the same files) and build its cross-spectra array.

    :param single_input_ref_file: path(s) handed to xarray.open_mfdataset
    :return: tuple (spectrum, dsref). `spectrum` stacks the outputs of
        preprocess.conv_real and preprocess.conv_imaginary along a new
        axis 3; `dsref` is the opened dataset (times are not decoded).
    """
    logging.info('arbitrary chosen input ref file : %s', single_input_ref_file)
    dsref = xarray.open_mfdataset(
        single_input_ref_file,
        preprocess=preproc_ref_input,
        decode_times=False,
    )
    logging.info('timeSAR : %s', dsref['timeSAR'].size)

    # Pull both squeezed components out of the dataset before converting them.
    raw_real, raw_imag = (dsref[name].values.squeeze()
                          for name in ('cspcRe', 'cspcIm'))
    converted = (preprocess.conv_real(raw_real),
                 preprocess.conv_imaginary(raw_imag))
    spectrum = np.stack(converted, axis=3)
    logging.debug('spectrum shape : %s', spectrum.shape)
    return spectrum, dsref
def prepare_training_dataset_core(ds_train_raw, validation_dataset=False):
    """
    Build the normalized dataset and the spectrum array from a raw dataset.

    Used both to build the training dataset and the validation dataset.

    :param ds_train_raw: dataset to convert. The variables read here are
        'timeSAR', 'py_S', 'sigma0', 'normalizedVariance', 'fileNameL2',
        'th', 'k', 'cspcRe', 'cspcIm', 'latSAR', 'lonSAR' and
        'incidenceAngle'; 'fileNameFull' and 'hsSM' are used when present.
    :param validation_dataset: when True, 'py_cspcImX', 'py_cspcReX' and
        'fileNameL2' are copied into the output as well.
    :return: tuple (spectrum, ds_training_normalized). `spectrum` stacks the
        outputs of preprocess.conv_real / preprocess.conv_imaginary along a
        new axis 3; `ds_training_normalized` is a new xarray.Dataset whose
        'time' coordinate is parsed from the 'fileNameL2' basenames.
    """
    logging.info('Nb pts in dataset: %s', ds_train_raw['timeSAR'].size)
    varstoadd = [
        'S', 'cwave', 'dxdt', 'latlonSARcossin', 'todSAR', 'incidence',
        'satellite', 'oswQualityCrossSpectraRe', 'oswQualityCrossSpectraIm'
    ]
    if validation_dataset:
        varstoadd += ['py_cspcImX', 'py_cspcReX', 'fileNameL2']
    if 'hsSM' in ds_train_raw:
        varstoadd += ['hsSM']
    S = ds_train_raw['py_S'].values
    s0 = ds_train_raw['sigma0']
    nv = ds_train_raw['normalizedVariance'].values
    ds_training_normalized = xarray.Dataset()
    timeSAR_vals = ds_train_raw['timeSAR'].values  # hours since ....
    # The 5th '-'-separated token of each L2 basename is parsed as the
    # timestamp that becomes the 'time' coordinate.
    timeSAR_seconds = np.array([
        datetime.datetime.strptime(
            os.path.basename(fup.decode()).split('-')[4], '%Y%m%dt%H%M%S')
        for fup in ds_train_raw['fileNameL2'].values
    ])
    ths1 = ds_train_raw['th'].values
    ks1 = ds_train_raw['k'].values
    if 'fileNameFull' in ds_train_raw:
        fpaths = ds_train_raw['fileNameFull'].values
    else:
        # 2019 dataset is a bit different
        fpaths = ds_train_raw['fileNameL2'].values
    sattelites = np.array([os.path.basename(hhy)[0:3] for hhy in fpaths])
    # File names may be bytes (they are decoded above for the timestamps).
    # Indexing bytes yields an int on Python 3, which never equals 'a', so
    # decode first; otherwise every sample would be flagged 0.
    satellites_int = np.array([
        (threelettersat.decode() if isinstance(threelettersat, bytes) else
         threelettersat)[2] == 'a' for threelettersat in sattelites
    ]).astype(int)
    cspcRe = ds_train_raw['cspcRe'].values
    cspcIm = ds_train_raw['cspcIm'].values
    for vv in varstoadd:
        logging.info('start format variable :%s', vv)
        if vv in ['cwave']:
            dimszi = ['time', 'cwavedim']
            coordi = {'time': timeSAR_seconds, 'cwavedim': np.arange(22)}
            logging.debug('S %s s0: %s nv: %s', S.shape, s0.shape, nv.shape)
            # Columns: the py_S values, then sigma0, then normalizedVariance
            # (see preprocess.py).
            cwave = np.vstack([S.T, s0, nv]).T
            logging.debug('cwave vals: %s', cwave.shape)
            cwave = preprocess.conv_cwave(cwave)
            ds_training_normalized[vv] = xarray.DataArray(data=cwave,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['fileNameFull', 'fileNameL2']:
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(data=fpaths,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv == 'S':  # to ease the comparison with Justin files
            dimszi = ['time', 'Sdim']
            coordi = {'time': timeSAR_seconds, 'Sdim': np.arange(20)}
            ds_training_normalized[vv] = xarray.DataArray(data=S,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['dxdt']:
            # dx, dt come from the colocation with the altimeter (see
            # /home/cercache/users/jstopa/sar/empHs/cwaveV5); at this stage
            # placeholders are used: zeros in column 0, ones in column 1.
            dxdt = np.column_stack([np.zeros(s0.shape), np.ones(s0.shape)])
            dimszi = ['time', 'dxdtdim']
            coordi = {'time': timeSAR_seconds, 'dxdtdim': np.arange(2)}
            ds_training_normalized[vv] = xarray.DataArray(data=dxdt,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['latlonSARcossin']:
            latSARcossin = preprocess.conv_position(
                ds_train_raw['latSAR'].values)  # Gets cos and sin
            lonSARcossin = preprocess.conv_position(
                ds_train_raw['lonSAR'].values)
            latlonSARcossin = np.hstack([latSARcossin, lonSARcossin])
            dimszi = ['time', 'latlondim']
            coordi = {'time': timeSAR_seconds, 'latlondim': np.arange(4)}
            ds_training_normalized[vv] = xarray.DataArray(data=latlonSARcossin,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['todSAR']:
            dimszi = ['time']
            new_dates_dt = np.array(
                [from_np64_to_dt(dt64) for dt64 in timeSAR_vals])
            unit = "hours since 2010-01-01T00:00:00Z UTC"  # see https://github.com/grouny/sar_hs_nn/blob/c05322e6635c6d77409e36537d7c3b58788e7322/sarhspredictor/lib/sarhs/preprocess.py#L11
            new_dates_num = np.array(
                [netCDF4.date2num(dfg, unit) for dfg in new_dates_dt])
            coordi = {'time': timeSAR_seconds}
            todSAR = conv_time(new_dates_num)
            ds_training_normalized[vv] = xarray.DataArray(data=todSAR,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['oswK']:
            dimszi = ['time', 'oswWavenumberBinSize']
            coordi = {
                'time': timeSAR_seconds,
                'oswWavenumberBinSize': np.arange(len(ks1))
            }
            ds_training_normalized[vv] = xarray.DataArray(data=ks1,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['incidence']:
            dimszi = ['time', 'incdim']
            coordi = {'time': timeSAR_seconds, 'incdim': np.arange(2)}
            incidence = preprocess.conv_incidence(
                ds_train_raw['incidenceAngle'].values.squeeze())
            ds_training_normalized[vv] = xarray.DataArray(data=incidence,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['incidence_angle']:
            dimszi = ['time']
            olddims = [
                x for x in ds_train_raw['incidenceAngle'].dims
                if x not in ['oswAzSize', 'oswRaSize']
            ]
            coordi = {}
            for didi in olddims:
                coordi[didi] = ds_train_raw['incidenceAngle'].coords[
                    didi].values
            coordi['time'] = timeSAR_seconds
            incidence = np.array(
                [ds_train_raw['incidenceAngle'].values.squeeze()])
            ds_training_normalized[vv] = xarray.DataArray(data=incidence,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['satellite']:
            # 1 when the third character of the file basename is 'a', else 0.
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(data=satellites_int,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['platformName']:
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            satellite_int = sattelites
            ds_training_normalized[vv] = xarray.DataArray(data=satellite_int,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['nrcs']:
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(data=s0,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['heading']:
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(
                data=ds_train_raw['trackAngle'].values,
                dims=dimszi,
                coords=coordi)
        elif vv in ['nv']:
            dimszi = ['time']
            coordi = {'time': timeSAR_seconds}
            ds_training_normalized[vv] = xarray.DataArray(data=nv,
                                                          dims=dimszi,
                                                          coords=coordi)
        elif vv in ['oswQualityCrossSpectraRe', 'oswQualityCrossSpectraIm']:
            # The output keeps the OCN variable names but is filled from the
            # raw 'cspcRe' / 'cspcIm' arrays.
            if vv == 'oswQualityCrossSpectraRe':
                datatmp = cspcRe
            else:
                datatmp = cspcIm
            coordi = {}
            coordi['time'] = timeSAR_seconds
            coordi['oswAngularBinSize'] = np.arange(len(ths1))
            coordi['oswWavenumberBinSize'] = np.arange(len(ks1))
            dimsadd = ['time', 'oswAngularBinSize', 'oswWavenumberBinSize']
            ds_training_normalized[vv] = xarray.DataArray(data=datatmp,
                                                          dims=dimsadd,
                                                          coords=coordi)
        elif vv in ['py_cspcImX', 'py_cspcReX']:
            datatmp = ds_train_raw[vv].values
            # NOTE(review): coordi is the coords object of ds_train_raw[vv],
            # not a plain dict copy -- confirm the in-place 'time' assignment
            # below is intended.
            coordi = ds_train_raw[vv].coords
            coordi['time'] = timeSAR_seconds
            dimsadd = ds_train_raw[vv].dims
            ds_training_normalized[vv] = xarray.DataArray(data=datatmp,
                                                          dims=dimsadd,
                                                          coords=coordi)
        else:
            # Generic case: copy the squeezed values on the 'time' dimension,
            # carrying over the coordinates of every dimension other than
            # 'oswAzSize' / 'oswRaSize'.
            datatmp = ds_train_raw[vv].values.squeeze()
            olddims = [
                x for x in ds_train_raw[vv].dims
                if x not in ['oswAzSize', 'oswRaSize']
            ]
            coordi = {}
            for didi in olddims:
                coordi[didi] = ds_train_raw[vv].coords[didi].values
            coordi['time'] = timeSAR_seconds
            dimsadd = ['time']
            logging.info('data: %s', datatmp.shape)
            ds_training_normalized[vv] = xarray.DataArray(data=datatmp,
                                                          dims=dimsadd,
                                                          coords=coordi)
    logging.debug('newds: %s', ds_training_normalized)
    logging.info('SAR data ready to be used')
    re = preprocess.conv_real(cspcRe)
    im = preprocess.conv_imaginary(cspcIm)
    logging.info('re : %s', re.shape)
    logging.info('im : %s', im.shape)
    # Real and imaginary parts become the two entries of the last axis.
    spectrum = np.stack((re, im), axis=3)
    logging.info('spectrum shape : %s', spectrum.shape)
    return spectrum, ds_training_normalized
Exemplo n.º 4
0
def split_aggregated_ds(file_src, file_dest):
    """
    Split an aggregated HDF5 file into per-year groups of preprocessed data.

    For each group ('2015_2016', '2017', '2018') the samples of the matching
    years that have no NaN in 'py_S', 'sigma0' or 'normalizedVariance' are
    selected, converted with the `preprocess` helpers and written to a group
    of the destination file.

    :param file_src: path of the aggregated HDF5 file to read
    :param file_dest: path of the HDF5 file to write (opened in 'w' mode)
    :return: None
    """
    groups = {'2015_2016': [2015, 2016], '2017': [2017], '2018': [2018]}
    # Print fields of source file.
    with h5py.File(file_src, 'r') as f:
        for k in f:
            print(f'{k}: {f[k].dtype}')

    # Create h5.
    with h5py.File(file_src, 'r') as fs, h5py.File(file_dest, 'w') as fd:
        # The year column and the NaN screening do not depend on the group,
        # so read the source arrays and build the validity mask only once.
        all_years = fs['year'][:]
        valid = np.ones_like(all_years, dtype='bool')
        valid[np.any(np.isnan(fs['py_S'][:]), axis=1)] = False
        valid[np.isnan(fs['sigma0'][:])] = False
        valid[np.isnan(fs['normalizedVariance'][:])] = False

        for group_name, years in groups.items():
            grp = fd.create_group(group_name)

            # Examples of the specified years that don't have nans.
            indices = np.isin(all_years, years) & valid
            num_examples = indices.sum()
            print(f'Found {num_examples} events from years: ', years)

            # Write data from this year.
            grp.create_dataset('year', data=fs['year'][indices])

            # Get 22 CWAVE features.
            cwave = np.hstack([
                fs['py_S'][indices, ...], fs['sigma0'][indices].reshape(-1, 1),
                fs['normalizedVariance'][indices].reshape(-1, 1)
            ])
            cwave = preprocess.conv_cwave(
                cwave
            )  # Remove extrema, then standardize with hardcoded mean,vars.
            grp.create_dataset('cwave', data=cwave)

            # Additional features.
            dx = preprocess.conv_dx(fs['dx'][indices])
            dt = preprocess.conv_dt(fs['dt'][indices])
            grp.create_dataset('dxdt', data=np.column_stack([dx, dt]))

            latSAR = fs['latSAR'][indices]
            lonSAR = fs['lonSAR'][indices]
            latSARcossin = preprocess.conv_position(latSAR)  # Gets cos and sin
            lonSARcossin = preprocess.conv_position(lonSAR)
            grp.create_dataset('latlonSAR',
                               data=np.column_stack([latSAR, lonSAR]))
            grp.create_dataset('latlonSARcossin',
                               data=np.hstack([latSARcossin, lonSARcossin]))

            # The 1-D per-sample fields below are stored as column vectors.
            timeSAR = fs['timeSAR'][indices]
            todSAR = preprocess.conv_time(timeSAR)
            grp.create_dataset('timeSAR',
                               data=timeSAR,
                               shape=(timeSAR.shape[0], 1))
            grp.create_dataset('todSAR',
                               data=todSAR,
                               shape=(todSAR.shape[0], 1))

            incidence = preprocess.conv_incidence(
                fs['incidenceAngle'][indices])  # Separates into 2 var.
            grp.create_dataset('incidence', data=incidence)

            satellite = fs['satellite'][indices]
            grp.create_dataset('satellite',
                               data=satellite,
                               shape=(satellite.shape[0], 1))

            # Altimeter
            hsALT = fs['hsALT'][indices]
            grp.create_dataset('hsALT', data=hsALT, shape=(hsALT.shape[0], 1))

            # Get spectral data: real and imaginary parts on the last axis.
            x = np.stack((
                preprocess.conv_real(fs['cspcRe'][indices, ...]),
                preprocess.conv_imaginary(fs['cspcIm'][indices, ...]),
            ),
                         axis=3)
            grp.create_dataset('spectrum', data=x)
            print(f'Done with {years}')
    print('Done')