def extract_eq_var_at_pressure_level(path=mri_path, var='vmrh2o',
                                     plevel=8000):
    """Extract an equatorial-mean MRI variable at a fixed pressure level
    (plevel, in Pa) from hybrid-sigma model levels and save it to netCDF."""
    import xarray as xr
    from aux_functions_strat import path_glob
    from aux_functions_strat import save_ncfile
    file = path_glob(path, '{}_*equatorial*.nc'.format(var))[0]
    var_ds = xr.load_dataset(file)
    ps_file = path_glob(path, 'ps_*equatorial*.nc')[0]
    ps = xr.load_dataset(ps_file)['ps']
    a, b, p0 = load_a_b_p0(path=path)
    plev = convert_hybrid_sigma_to_pressure(a, b, p0, ps)
    # the pressure of each hybrid level varies with time (through ps), so
    # pick the nearest level to plevel separately for every time step:
    var_dts = []
    for dt in plev.time:
        var_dt = var_ds.sel(time=dt)
        pl = plev.sel(time=dt)
        var_dt['lev'] = pl
        var_dt = var_dt.sel(lev=plevel, method='nearest')
        var_dts.append(var_dt)
    var_plev = xr.concat(var_dts, 'time')
    var_at_plevel = var_plev.reset_coords(drop=True)[var]
    var_at_plevel.attrs['plevel'] = str(plevel) + 'Pa'
    yrmax = var_at_plevel.time.max().dt.year.item()
    yrmin = var_at_plevel.time.min().dt.year.item()
    filename = '{}_equatorial_{}_Pa_{}-{}.nc'.format(var, plevel, yrmin,
                                                     yrmax)
    save_ncfile(var_at_plevel, path, filename)
    return var_at_plevel
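
# Hedged usage sketch: pulls equatorial water vapor at 80 hPa (8000 Pa). It
# assumes `mri_path` holds a '{var}_*equatorial*.nc' file, the matching
# 'ps_*equatorial*.nc' surface-pressure file, and the hybrid-sigma
# coefficients read by load_a_b_p0. The `_example_*` helper is illustrative
# and not part of the original module.
def _example_extract_eq_vmrh2o():
    da = extract_eq_var_at_pressure_level(var='vmrh2o', plevel=8000)
    print(da.attrs['plevel'])  # '8000Pa'
    return da
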
def proc_era5(path, field, model_name):
    """Concatenate raw ERA5 downloads of `field` into 4x-daily netCDF files:
    a single file for single-level data, one file per year for
    pressure-level data."""
    import xarray as xr
    import numpy as np
    from aux_functions_strat import path_glob
    # from aux_functions_strat import save_ncfile
    files = sorted(path_glob(path, 'era5_{}_*.nc'.format(field)))
    # drop files whose names contain 'mm':
    files = [x for x in files if 'mm' not in x.as_posix()]
    print(files)
    if 'single' in model_name:
        era5 = xr.open_mfdataset(files)
        era5 = concat_era5T(era5)
        start = era5.time.dt.year[0].values.item()
        end = era5.time.dt.year[-1].values.item()
        filename = '_'.join(['era5', str(field), '4Xdaily',
                             str(start) + '-' + str(end)])
        filename += '.nc'
        era5.to_netcdf(path / filename)
        print('Done!')
    elif 'pressure' in model_name:
        era5 = xr.open_mfdataset(files)
        era5 = concat_era5T(era5)
        start = era5.time.dt.year[0].values.item()
        end = era5.time.dt.year[-1].values.item()
        years = np.arange(start, end + 1).tolist()
        # save one file per year:
        for year in years:
            era5_yearly = era5.sel(time=str(year))
            filename = '_'.join(['era5', str(field), '4Xdaily', str(year)])
            filename += '.nc'
            era5_yearly.to_netcdf(path / filename)
        print('Done!')
    return
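
# Hedged usage sketch: assumes `path` is a pathlib.Path containing raw
# 'era5_{field}_*.nc' downloads and that `model_name` contains either
# 'single' or 'pressure'. The directory and field below are illustrative.
def _example_proc_era5():
    from pathlib import Path
    proc_era5(Path('/data/era5'), field='t', model_name='pressure-levels')
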
def load_one_gridsearchcv_object(path=ml_path, model_name='SVM',
                                 verbose=True):
    """Load one saved GridSearchCV object matching `model_name` and return
    its best-results DataFrame via read_one_gridsearchcv_object."""
    from aux_functions_strat import path_glob
    import joblib
    # first filter for model name:
    if verbose:
        print('loading GridsearchCVs results for {} model'.format(model_name))
    model_files = path_glob(path, 'GRSRCHCV_*.pkl')
    model_files = [x for x in model_files if model_name in x.as_posix()]
    # now select features:
    # if verbose:
    #     print('loading GridsearchCVs results with {} features'.format(features))
    # model_features = [x.as_posix().split('/')[-1].split('_')[3] for x in model_files]
    # feat_ind = get_feature_set_from_list(model_features, features)
    # also get the test ratio and seed number:
    # if len(feat_ind) > 1:
    #     if verbose:
    #         print('found {} GR objects.'.format(len(feat_ind)))
    #     files = sorted([model_files[x] for x in feat_ind])
    #     outer_splits = [x.as_posix().split('/')[-1].split('.')[0].split('_')[-3] for x in files]
    #     grs = [joblib.load(x) for x in files]
    #     best_dfs = [read_one_gridsearchcv_object(x) for x in grs]
    #     di = dict(zip(outer_splits, best_dfs))
    #     return di
    # else:
    #     file = model_files[feat_ind]
    #     seed = file.as_posix().split('/')[-1].split('.')[0].split('_')[-1]
    #     outer_splits = file.as_posix().split('/')[-1].split('.')[0].split('_')[-3]
    # load and produce best_df:
    gr = joblib.load(model_files[0])
    best_df = read_one_gridsearchcv_object(gr)
    return best_df
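
# Hedged usage sketch: assumes `ml_path` contains at least one
# 'GRSRCHCV_*.pkl' pickle whose filename mentions the model name.
def _example_load_gridsearchcv():
    best_df = load_one_gridsearchcv_object(model_name='SVM', verbose=True)
    print(best_df.head())
    return best_df
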
def get_all_swoosh_files(path):
    """Read all SWOOSH datasets downloaded from the SWOOSH website and write
    them as xarray netCDF files, adding a datetime index and keeping only
    the 'combined' fields."""
    import xarray as xr
    import pandas as pd
    from aux_functions_strat import path_glob
    filenames = path_glob(path, 'swoosh-v02.6*.nc')
    datasets = []
    for file in filenames:
        dataset = xr.open_dataset(file, decode_times=False)
        dataset.attrs['filename'] = file.as_posix().split('/')[-1]
        datasets.append(dataset)
        print('importing {}'.format(file.as_posix().split('/')[-1]))
    data_dict = {}
    for dataset in datasets:
        combined_list = [name for name in dataset.data_vars
                         if 'combined' in name.lower()]
        dataset = dataset[combined_list]
        time = pd.date_range('1984-01-01', freq='MS',
                             periods=len(dataset.time))
        dataset['time'] = time
        # derive the dataset name (grid type) from the filename:
        if 'latpress' in dataset.attrs['filename'].split('-'):
            dname = '-'.join(dataset.attrs['filename'].split('-')[-3:-1])
        elif 'lonlatpress' in dataset.attrs['filename'].split('-'):
            dname = '-'.join(dataset.attrs['filename'].split('-')[-4:-1])
        elif 'lattheta' in dataset.attrs['filename'].split('-'):
            dname = '-'.join(dataset.attrs['filename'].split('-')[-3:-1])
        data_dict[dname] = dataset
        data_dict[dname].to_netcdf(path / 'swoosh_{}.nc'.format(dname))
        print('Saved {} to nc file, in {}'.format(dname, path))
    print('Done!')
    return
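
# Hedged usage sketch: assumes the raw 'swoosh-v02.6*.nc' downloads sit
# under the given pathlib.Path; the directory below is illustrative.
def _example_get_swoosh():
    from pathlib import Path
    get_all_swoosh_files(Path('/data/swoosh'))
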
def get_HP_params_from_optimized_model(path=ml_path, model='SVM'):
    import joblib
    from aux_functions_strat import path_glob
    files = path_glob(path, 'GRSRCHCV_*.pkl')
    file = [x for x in files if model in x.as_posix()][0]
    gr = joblib.load(file)
    df = read_one_gridsearchcv_object(gr)
    return df.iloc[0][:-2].to_dict()
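
# Hedged usage sketch: returns the top row of the best-results DataFrame,
# minus its last two entries, as a dict of hyper-parameters. Assumes the
# same 'GRSRCHCV_*.pkl' pickles as load_one_gridsearchcv_object above.
def _example_get_hp_params():
    hp = get_HP_params_from_optimized_model(model='SVM')
    print(hp)
    return hp
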
def produce_eq_means_from_3D_MRI(path=mri_path, var='vmrh2o',
                                 lat_slice=[-30, 30]):
    from aux_functions_strat import path_glob
    import xarray as xr
    from aux_functions_strat import save_ncfile
    from aux_functions_strat import lat_mean
    print('producing equatorial means for {}.'.format(var))
    filestr = 'monthly_MRI-ESM1r1_refC2_r1i1p1'
    files = sorted(path_glob(path, '{}_{}*.nc'.format(var, filestr)))
    dsl = [xr.open_dataset(x) for x in files]
    eqs = []
    for ds in dsl:
        eq_var = ds[var].mean('lon', keep_attrs=True)
        eq_var = lat_mean(eq_var.sel(lat=slice(*lat_slice)))
        eqs.append(eq_var)
    eq_da = xr.concat(eqs, 'time')
    eq_da = eq_da.sortby('time')
    yrmax = eq_da.time.max().dt.year.item()
    yrmin = eq_da.time.min().dt.year.item()
    filename = '{}_{}_equatorial_{}-{}.nc'.format(var, filestr, yrmin, yrmax)
    save_ncfile(eq_da, path, filename)
    return
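
# Hedged usage sketch: computes the zonal-mean, latitude-weighted 30S-30N
# mean of monthly MRI-ESM1r1 refC2 output and saves it; assumes `mri_path`
# holds the '{var}_monthly_MRI-ESM1r1_refC2_r1i1p1*.nc' files.
def _example_produce_eq_means():
    produce_eq_means_from_3D_MRI(var='vmrh2o', lat_slice=[-30, 30])
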
def siphon_igra2_to_xarray(station, path=sound_path,
                           fields=['temperature', 'pressure'],
                           times=['1984-01-01', '2019-12-31'],
                           derived=False):
    from siphon.simplewebservice.igra2 import IGRAUpperAir
    import pandas as pd
    import numpy as np
    import xarray as xr
    from urllib.error import URLError
    import logging
    from aux_functions_strat import path_glob
    logger = logging.getLogger('strato_sounding')
    # logging.basicConfig(filename=path / 'siphon.log', level=logging.INFO,
    #                     format='%(asctime)s %(levelname)-10s %(processName)s %(name)s %(message)s')
    # check for already downloaded files:
    files = path_glob(path, '*_derived.nc')
    names = [x.as_posix().split('/')[-1].split('.')[0] for x in files]
    if station in names:
        logger.warning(
            'station {} already downloaded, skipping'.format(station))
        return '1'
    logger.info('fields chosen are: {}'.format(fields))
    logger.info('dates chosen are: {}'.format(times))
    dates = pd.to_datetime(times)
    dates = [x.to_pydatetime() for x in dates]
    logger.info('getting {} from IGRA2...'.format(station))
    try:
        df, header = IGRAUpperAir.request_data(dates, station,
                                               derived=derived)
    except URLError:
        logger.warning('file not found using siphon, skipping...')
        return '2'
    header = header[header['number_levels'] > 25]  # enough resolution
    dates = header['date'].values
    logger.info('splicing dataframe and converting to xarray dataset...')
    ds_list = []
    for date in dates:
        dff = df[fields].loc[df['date'] == date]
        # release = dff.iloc[0, 1]
        dss = dff.to_xarray()
        # dss.attrs['release'] = release
        ds_list.append(dss)
    # pad every sounding with NaNs up to the longest profile:
    max_ind = np.max([ds.index.size for ds in ds_list])
    vars_ = np.nan * np.ones((len(dates), len(fields), max_ind))
    for i, ds in enumerate(ds_list):
        size = ds[[x for x in ds.data_vars][0]].size
        vars_[i, :, 0:size] = ds.to_array().values
    Vars = xr.DataArray(vars_, dims=['time', 'var', 'point'])
    Vars['time'] = dates
    Vars['var'] = fields
    ds = Vars.to_dataset(dim='var')
    for field in fields:
        ds[field].attrs['units'] = df.units[field]
    ds.attrs['site_id'] = header.loc[:, 'site_id'].values[0]
    ds.attrs['lat'] = header.loc[:, 'latitude'].values[0]
    ds.attrs['lon'] = header.loc[:, 'longitude'].values[0]
    logger.info('Done!')
    if derived:
        filename = station + '_derived' + '.nc'
    else:
        filename = station + '_not_derived' + '.nc'
    comp = dict(zlib=True, complevel=9)  # best compression
    encoding = {var: comp for var in ds.data_vars}
    ds.to_netcdf(path / filename, 'w', encoding=encoding)
    logger.info('saved {} to {}.'.format(filename, path))
    return ds
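
# Hedged usage sketch: downloads one IGRA2 station via siphon and saves it
# under `sound_path`. 'RMM00091376' (Majuro) is the station id also
# special-cased in calc_cold_point_from_sounding below.
def _example_siphon_majuro():
    return siphon_igra2_to_xarray('RMM00091376', derived=True)
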
def calc_cold_point_from_sounding(path=sound_path, times=['1993', '2017'],
                                  plot=True, return_mean=True,
                                  return_anom=True):
    import xarray as xr
    # import seaborn as sns
    from aux_functions_strat import deseason_xr
    from aux_functions_strat import path_glob

    def return_one_station(file, name, times):
        print('processing station {}:'.format(name))
        station = xr.open_dataset(file)
        if times is None:
            first = station['time'].min().dt.strftime('%Y-%m')
            last = station['time'].max().dt.strftime('%Y-%m')
            times = [first, last]
        station = station.sel(time=slice(times[0], times[1]))
        # take Majuro station data after 2011 only at midnight:
        if 'RMM00091376' in name:
            print('taking just the midnight soundings after 2011 for '
                  '{}'.format(name))
            station_after_2011 = station.sel(
                time=slice('2011', times[1])).where(
                station['time.hour'] == 0)
            station_before_2011 = station.sel(time=slice(times[0], '2010'))
            station = xr.concat([station_before_2011, station_after_2011],
                                'time')
        # cold point: minimum temperature between 80 and 120 hPa:
        cold = station['temperature'].where(station.pressure <= 120).where(
            station.pressure >= 80).min(dim='point')
        # ensure the cold point is below -72 degC:
        cold = cold.where(cold < -72)
        cold.attrs = station.attrs
        try:
            cold = cold.resample(time='MS').mean()
        except IndexError:
            return
        if return_anom:
            anom = deseason_xr(cold, how='mean')
            anom.name = name
            return anom
        cold.name = name
        return cold

    da_list = []
    for file in path_glob(path, '*_derived.nc'):
        if file.is_dir():
            continue
        name = file.as_posix().split('/')[-1].split('.')[0]
        da = return_one_station(file, name, times)
        da_list.append(da)
    # argmin_point = station.temperature.argmin(dim='point').values
    # p_points = []
    # for i, argmin in enumerate(argmin_point):
    #     p = station.pressure.sel(point=argmin).isel(time=i).values.item()
    #     p_points.append(p)
    # sns.distplot(p_points, bins=100, color='c',
    #              label='pressure_cold_points_' + name)
    ds = xr.merge(da_list)
    da = ds.to_array(dim='name')
    if return_anom:
        da.name = 'radiosonde_cold_point_anomalies'
    else:
        da.name = 'radiosonde_cold_point'
    # mean_da = da.where(np.abs(da) < 3).mean('name')
    mean_da = da.mean('name')
    if plot:
        da.to_dataset('name').to_dataframe().plot()
    if return_mean:
        return mean_da
    return da
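
# Hedged usage sketch: assumes `sound_path` already holds '*_derived.nc'
# files produced by siphon_igra2_to_xarray above.
def _example_cold_point_anomalies():
    return calc_cold_point_from_sounding(times=['1993', '2017'], plot=False,
                                         return_mean=True, return_anom=True)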