def main(sDir, f):
    """Save one multi-axis timeseries figure per dataset listed in a CSV file.

    The CSV ``f`` (located in ``sDir``) supplies an ``outputUrl`` column that is
    resolved to NetCDF dataset URLs. For every dataset, each science variable
    of its stream is drawn against time on its own y-axis, and the figure is
    saved as ``<fname>_timeseries`` under
    ``sDir/<subsite>/<refdes>/<deployment>``.

    :param sDir: directory containing the CSV; also the root of the output tree
    :param f: file name of the CSV holding the ``outputUrl`` column
    """
    time_fmt = '%Y-%m-%dT%H:%M:%S'
    url_table = pd.read_csv(os.path.join(sDir, f))
    datasets = cf.get_nc_urls(url_table['outputUrl'].tolist())

    for d in datasets:
        print(d)
        fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(d)
        save_dir = os.path.join(sDir, subsite, refdes, deployment)
        cf.create_dir(save_dir)

        sci_vars = cf.return_science_vars(stream)
        n_vars = len(sci_vars)
        # one colour per science variable
        colors = cm.jet(np.linspace(0, 1, n_vars))

        with xr.open_dataset(d, mask_and_scale=False) as ds:
            ds = ds.swap_dims({'obs': 'time'})
            t = ds['time'].data
            t0 = pd.to_datetime(t.min()).strftime(time_fmt)
            t1 = pd.to_datetime(t.max()).strftime(time_fmt)
            title = ' '.join((deployment, refdes, method))

            fig, ax = plt.subplots()
            # every variable after the first gets a twin of the base axes so
            # that it has an independent y-axis over the shared x-axis
            axes = [ax] + [ax.twinx() for _ in range(1, n_vars)]

            fig.subplots_adjust(right=0.6)
            right_additive = (0.98 - 0.6) / float(5)

            for i, (axis, var_name, c) in enumerate(zip(axes, sci_vars, colors)):
                if i > 0:
                    # shift each extra y-axis spine further to the right
                    axis.spines['right'].set_position(('axes', 1. + right_additive * i))

                y = ds[var_name]
                ind = cf.reject_outliers(y, 5)
                axis.plot(t[ind], y.data[ind], '.', markersize=2, color=c)
                axis.set_ylabel((y.name + " (" + y.units + ")"), color=c, fontsize=9)
                axis.tick_params(axis='y', colors=c)

                if i == n_vars - 1:  # the last variable has been plotted
                    pf.format_date_axis(axis, fig)

            axes[0].set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
            sfile = '_'.join((fname, 'timeseries'))
            pf.save_fig(save_dir, sfile)
def main(sDir, url_list, deployment_num):
    """Analyze the NetCDF files of each reference designator, deployment by deployment.

    For every reference designator found in ``url_list`` the files of each
    deployment marked 'for review' in the data review list are opened one at
    a time, the science variables are accumulated in memory, and a summary
    (deployment info, sampling rates, time gaps, pressure comparison and
    per-variable statistics) is written to
    ``<sDir>/<subsite>/<refdes>/<deployment>-<refdes>-file_analysis.json``.
    Collocated instruments found among the files are written to
    ``<refdes>-dependencies.txt`` in the same directory.

    :param sDir: root directory for the output files
    :param url_list: list of dataset catalog URLs
    :param deployment_num: deployment number to analyze, or None to analyze
        every deployment that is for review
    :return: list of paths of the JSON files that were written
    """
    ts_fmt = '%Y-%m-%dT%H:%M:%S'
    reviewlist = pd.read_csv(
        'https://raw.githubusercontent.com/ooi-data-lab/data-review-prep/master/review_list/data_review_list.csv')

    # unique reference designators, in the order they first appear in url_list
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    json_file_list = []
    for r in rd_list:
        dependencies = []
        print('\n{}'.format(r))
        data = OrderedDict(deployments=OrderedDict())
        save_dir = os.path.join(sDir, r.split('-')[0], r)
        cf.create_dir(save_dir)

        # Deployment location test
        data['location_comparison'] = cf.deploy_location_check(r)

        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            catalog_rms = '-'.join((r, splitter[-2], splitter[-1]))

            # complete the analysis by reference designator
            if rd_check != r:
                continue

            udatasets = cf.get_nc_urls([u])

            # check for the OOI 1.0 datasets for review
            rl_filtered = reviewlist.loc[(reviewlist['Reference Designator'] == r)
                                         & (reviewlist['status'] == 'for review')]
            review_deployments = ['deployment%04d' % int(x)
                                  for x in rl_filtered['deploymentNumber'].tolist()]

            for rev_dep in review_deployments:
                # compare by value: identity ("is not") on ints is not a reliable equality test
                if deployment_num is not None and int(rev_dep[-4:]) != deployment_num:
                    print('\nskipping {}'.format(rev_dep))
                    continue

                rdatasets = sorted(s for s in udatasets if rev_dep in s)
                if not rdatasets:
                    continue

                datasets = []
                for dss in rdatasets:  # filter out collocated data files
                    if catalog_rms == dss.split('/')[-1].split('_20')[0][15:]:
                        datasets.append(dss)
                    else:
                        drd = dss.split('/')[-1].split('_20')[0][15:42]
                        if drd not in dependencies and drd != r:
                            dependencies.append(drd)

                if not datasets:
                    # only collocated files for this deployment: nothing to analyze
                    continue

                notes = []
                time_ascending = ''
                sci_vars_dict = {}
                for i, nc_file in enumerate(datasets):
                    ds = xr.open_dataset(nc_file, mask_and_scale=False)
                    ds = ds.swap_dims({'obs': 'time'})
                    print('\nAppending data from {}: file {} of {}'.format(rev_dep, i + 1, len(datasets)))

                    # when opening multiple datasets, don't check that the timestamps are in ascending order
                    time_ascending = 'not_tested'

                    if i == 0:
                        # the first file supplies the metadata for the whole deployment
                        fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(nc_file)
                        fname = fname.split('_20')[0]

                        # Get info from the data review database
                        dr_data = cf.refdes_datareview_json(refdes)
                        stream_vars = cf.return_stream_vars(data_stream)
                        sci_vars = cf.return_science_vars(data_stream)
                        node = refdes.split('-')[1]
                        if 'cspp' in data_stream or 'WFP' in node:
                            sci_vars.append('int_ctd_pressure')

                        # Add pressure to the list of science variables
                        press = pf.pressure_var(ds, list(ds.coords.keys()))
                        if press is None:
                            press = pf.pressure_var(ds, list(ds.data_vars.keys()))
                        if press is not None:
                            sci_vars.append(press)
                        sci_vars.append('time')
                        sci_vars = list(np.unique(sci_vars))
                        if 'ADCP' in r:
                            sci_vars = [x for x in sci_vars if 'beam' not in x]

                        # one accumulator per variable: values, units seen, fill values seen
                        for sci_var in sci_vars:
                            if sci_var == 'time':
                                sci_vars_dict.update(
                                    {sci_var: dict(values=np.array([], dtype=np.datetime64), units=[], fv=[])})
                            else:
                                sci_vars_dict.update({sci_var: dict(values=np.array([]), units=[], fv=[])})

                        deploy_info = get_deployment_information(dr_data, int(deployment[-4:]))

                        # Grab deployment Variables
                        deploy_start = str(deploy_info['start_date'])
                        deploy_stop = str(deploy_info['stop_date'])
                        deploy_lon = deploy_info['longitude']
                        deploy_lat = deploy_info['latitude']
                        deploy_depth = deploy_info['deployment_depth']

                        # Calculate days deployed
                        if deploy_stop != 'None':
                            r_deploy_start = pd.to_datetime(deploy_start).replace(hour=0, minute=0, second=0)
                            if deploy_stop.split('T')[1] == '00:00:00':
                                r_deploy_stop = pd.to_datetime(deploy_stop)
                            else:
                                # round a partial last day up to the next midnight
                                r_deploy_stop = (pd.to_datetime(deploy_stop) + timedelta(days=1)).replace(
                                    hour=0, minute=0, second=0)
                            n_days_deployed = (r_deploy_stop - r_deploy_start).days
                        else:
                            n_days_deployed = None

                        # Add reference designator to dictionary
                        if 'refdes' not in data:
                            data['refdes'] = refdes

                    # append data for the deployment into a dictionary
                    for s_v in sci_vars_dict.keys():
                        vv = ds[s_v]
                        store = sci_vars_dict[s_v]
                        try:
                            if vv.units not in store['units']:
                                store['units'].append(vv.units)
                        except AttributeError:
                            print('')
                        try:
                            if vv._FillValue not in store['fv']:
                                store['fv'].append(vv._FillValue)
                        except AttributeError:
                            print('')

                        if len(vv.dims) == 1:
                            if s_v in ['wavelength_a', 'wavelength_c']:
                                # if the array is not same as the array that was already appended for these
                                # two OPTAA variables, append. if it's already there, don't append
                                if np.sum(vv.values == store['values']) != len(vv.values):
                                    store['values'] = np.append(store['values'], vv.values)
                            else:
                                store['values'] = np.append(store['values'], vv.values)
                        elif len(vv.dims) == 2:  # appending 2D datasets
                            vD = vv.values.T
                            if len(store['values']) == 0:
                                store['values'] = vD
                            else:
                                store['values'] = np.concatenate((store['values'], vD), axis=1)

                time_values = sci_vars_dict['time']['values']
                data_start = pd.to_datetime(min(time_values)).strftime(ts_fmt)
                data_stop = pd.to_datetime(max(time_values)).strftime(ts_fmt)

                # Add deployment and info to dictionary and initialize delivery method sub-dictionary
                if deployment not in data['deployments']:
                    data['deployments'][deployment] = OrderedDict(deploy_start=deploy_start,
                                                                  deploy_stop=deploy_stop,
                                                                  n_days_deployed=n_days_deployed,
                                                                  lon=deploy_lon,
                                                                  lat=deploy_lat,
                                                                  deploy_depth=deploy_depth,
                                                                  method=OrderedDict())

                # Add delivery methods to dictionary and initialize stream sub-dictionary
                methods = data['deployments'][deployment]['method']
                if method not in methods:
                    methods[method] = OrderedDict(stream=OrderedDict())

                # Add streams to dictionary and initialize file sub-dictionary
                streams = methods[method]['stream']
                if data_stream not in streams:
                    streams[data_stream] = OrderedDict(file=OrderedDict())
                files = streams[data_stream]['file']

                # Get a list of data gaps >1 day
                time_df = pd.DataFrame(time_values, columns=['time'])
                time_df = time_df.sort_values(by=['time'])
                gap_list = cf.timestamp_gap_test(time_df)

                # Calculate the sampling rate to the nearest second
                time_df['diff'] = time_df['time'].diff().astype('timedelta64[s]')
                rates_df = time_df.groupby(['diff']).agg(['count'])
                n_diff_calc = len(time_df) - 1
                rates = dict(n_unique_rates=len(rates_df), common_sampling_rates=dict())
                for rate, row in rates_df.iterrows():
                    percent = (float(row['time']['count']) / float(n_diff_calc))
                    if percent > 0.1:
                        rates['common_sampling_rates'].update({int(rate): '{:.2%}'.format(percent)})

                # a rate shared by more than half of the time steps is reported as the sampling rate
                sampling_rt_sec = None
                for k, v in rates['common_sampling_rates'].items():
                    if float(v.strip('%')) > 50.00:
                        sampling_rt_sec = k

                if not sampling_rt_sec:
                    sampling_rt_sec = 'no consistent sampling rate: {}'.format(rates['common_sampling_rates'])

                # Don't do : Check that the timestamps in the file are unique
                time_test = ''

                # Count the number of days for which there is at least 1 timestamp
                n_days = len(np.unique(time_values.astype('datetime64[D]')))

                # Compare variables in file to variables in Data Review Database
                # (uses the last file opened for this deployment)
                ds_variables = list(ds.data_vars.keys()) + list(ds.coords.keys())
                ds_variables = eliminate_common_variables(ds_variables)
                ds_variables = [x for x in ds_variables if 'qc' not in x]
                [_, unmatch1] = compare_lists(stream_vars, ds_variables)
                [_, unmatch2] = compare_lists(ds_variables, stream_vars)

                # calculate mean pressure from data, excluding outliers +/- 3 SD
                try:
                    pressure = sci_vars_dict[press]
                    # NOTE(review): ``pressure`` is the accumulator dict (values/units/fv), so
                    # len(pressure) is always 3 and the single-data-point branch below never
                    # runs; presumably len(pressure['values']) was intended - confirm before changing.
                    if len(pressure) > 1:
                        # reject NaNs
                        p_nonan = pressure['values'][~np.isnan(pressure['values'])]

                        # reject fill values (only when a fill value was recorded)
                        if pressure['fv']:
                            p_nonan_nofv = p_nonan[p_nonan != pressure['fv'][0]]
                        else:
                            p_nonan_nofv = p_nonan

                        # reject data outside of global ranges
                        [pg_min, pg_max] = cf.get_global_ranges(r, press)
                        if pg_min is not None and pg_max is not None:
                            pgr_ind = cf.reject_global_ranges(p_nonan_nofv, pg_min, pg_max)
                            p_nonan_nofv_gr = p_nonan_nofv[pgr_ind]
                        else:
                            p_nonan_nofv_gr = p_nonan_nofv

                        if (len(p_nonan_nofv_gr) > 0):
                            [press_outliers, pressure_mean, _, pressure_max, _, _] = cf.variable_statistics(
                                p_nonan_nofv_gr, 3)
                            pressure_mean = round(pressure_mean, 2)
                            pressure_max = round(pressure_max, 2)
                        else:
                            press_outliers = None
                            pressure_mean = None
                            pressure_max = None
                            if len(pressure) > 0 and len(p_nonan) == 0:
                                notes.append('Pressure variable all NaNs')
                            elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) == 0:
                                notes.append('Pressure variable all fill values')
                            elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) > 0 \
                                    and len(p_nonan_nofv_gr) == 0:
                                notes.append('Pressure variable outside of global ranges')
                    else:  # if there is only 1 data point
                        press_outliers = 0
                        pressure_mean = round(ds[press].values.tolist()[0], 2)
                        pressure_max = round(ds[press].values.tolist()[0], 2)

                    try:
                        pressure_units = pressure['units'][0]
                    except (AttributeError, IndexError):
                        # the units list stays empty when no file carried a units attribute
                        pressure_units = 'no units attribute for pressure'

                    if pressure_mean:
                        if 'SF' in node:
                            pressure_compare = int(round(pressure_max))
                        else:
                            pressure_compare = int(round(pressure_mean))

                        if pressure_units == '0.001 dbar':
                            pressure_max = round((pressure_max / 1000), 2)
                            pressure_mean = round((pressure_mean / 1000), 2)
                            pressure_compare = round((pressure_compare / 1000), 2)
                            notes.append('Pressure converted from 0.001 dbar to dbar for pressure comparison')
                        elif pressure_units == 'daPa':
                            pressure_max = round((pressure_max / 1000), 2)
                            pressure_mean = round((pressure_mean / 1000), 2)
                            pressure_compare = round((pressure_compare / 1000), 2)
                            notes.append('Pressure converted from daPa to dbar for pressure comparison')
                    else:
                        pressure_compare = None

                    if (not deploy_depth) or (not pressure_mean):
                        pressure_diff = None
                    else:
                        pressure_diff = pressure_compare - deploy_depth

                except KeyError:
                    # no pressure variable was accumulated for this deployment
                    press = 'no seawater pressure in file'
                    pressure_diff = None
                    pressure_mean = None
                    pressure_max = None
                    pressure_compare = None
                    press_outliers = None
                    pressure_units = None

                # Add files and info to dictionary
                if fname not in files:
                    files[fname] = OrderedDict(
                        file_downloaded=pd.to_datetime(splitter[0][0:15]).strftime(ts_fmt),
                        file_coordinates=list(ds.coords.keys()),
                        sampling_rate_seconds=sampling_rt_sec,
                        sampling_rate_details=rates,
                        data_start=data_start,
                        data_stop=data_stop,
                        time_gaps=gap_list,
                        unique_timestamps=time_test,
                        n_timestamps=len(time_values),
                        n_days=n_days,
                        notes=notes,
                        ascending_timestamps=time_ascending,
                        pressure_comparison=dict(pressure_mean=pressure_mean, units=pressure_units,
                                                 num_outliers=press_outliers, diff=pressure_diff,
                                                 pressure_max=pressure_max, variable=press,
                                                 pressure_compare=pressure_compare),
                        vars_in_file=ds_variables,
                        vars_not_in_file=[x for x in unmatch1 if 'time' not in x],
                        vars_not_in_db=unmatch2,
                        sci_var_stats=OrderedDict())

                # calculate statistics for science variables, excluding outliers +/- 5 SD
                for sv in sci_vars_dict.keys():
                    if sv == 't_max' or sv == 'time':  # t_max: for ADCP
                        continue
                    print(sv)
                    var = sci_vars_dict[sv]
                    vD = var['values']
                    var_units = var['units']
                    vnum_dims = len(np.shape(vD))
                    # first recorded fill value, or None when the variable has no _FillValue
                    first_fv = var['fv'][0] if var['fv'] else None

                    num_outliers = None
                    mean = None
                    vmin = None
                    vmax = None
                    sd = None

                    if sv == 'wavelength_a' or sv == 'wavelength_c':
                        # for OPTAA wavelengths, print the array
                        [g_min, g_max] = cf.get_global_ranges(r, sv)
                        n_all = len(vD)  # number of values, not the size of the accumulator dict
                        mean = list(vD)
                        n_stats = 'not calculated'
                        n_nan = None
                        n_fv = None
                        n_grange = 'no global ranges'
                        fv = first_fv
                    elif vnum_dims > 2:
                        print('variable has more than 2 dimensions')
                        n_stats = 'variable has more than 2 dimensions'
                        n_nan = None
                        n_fv = None
                        n_grange = None
                        fv = None
                        n_all = None
                        # no global ranges are looked up for this case
                        g_min = None
                        g_max = None
                    else:
                        if vnum_dims > 1:
                            n_all = [len(vD), len(vD.flatten())]
                        else:
                            n_all = len(vD)
                        n_nan = int(np.sum(np.isnan(vD)))
                        fv = first_fv
                        if fv is not None:
                            vD[vD == fv] = np.nan  # turn fill values to nans
                        n_fv = int(np.sum(np.isnan(vD))) - n_nan

                        [g_min, g_max] = cf.get_global_ranges(r, sv)
                        n_stats = 0
                        n_grange = None
                        if list(np.unique(np.isnan(vD))) != [True]:
                            # reject data outside of global ranges
                            if g_min is not None and g_max is not None:
                                # turn data outside of global ranges to nans
                                vD[vD < g_min] = np.nan
                                vD[vD > g_max] = np.nan
                                n_grange = int(np.sum(np.isnan(vD)) - n_fv - n_nan)
                            else:
                                n_grange = 'no global ranges'

                            if list(np.unique(np.isnan(vD))) != [True]:
                                if sv == 'spkir_abj_cspp_downwelling_vector':
                                    # don't remove outliers from dataset
                                    [num_outliers, mean, vmin, vmax, sd, n_stats] = \
                                        cf.variable_statistics_spkir(vD)
                                else:
                                    var_gr = vD.flatten() if vnum_dims > 1 else vD
                                    # drop nans before calculating stats
                                    var_gr = var_gr[~np.isnan(var_gr)]
                                    [num_outliers, mean, vmin, vmax, sd, n_stats] = \
                                        cf.variable_statistics(var_gr, 5)
                            else:
                                # nothing left after the global range test
                                n_grange = None

                    if vnum_dims > 1:
                        stat_key = '{} (dims: {})'.format(sv, list(np.shape(var['values'])))
                    else:
                        stat_key = sv
                    files[fname]['sci_var_stats'][stat_key] = dict(n_outliers=num_outliers, mean=mean,
                                                                   min=vmin, max=vmax, stdev=sd,
                                                                   n_stats=n_stats, units=var_units,
                                                                   n_nans=n_nan, n_fillvalues=n_fv,
                                                                   fill_value=str(fv),
                                                                   global_ranges=[g_min, g_max],
                                                                   n_grange=n_grange, n_all=n_all)

                sfile = os.path.join(save_dir, '{}-{}-file_analysis.json'.format(rev_dep, r))
                with open(sfile, 'w') as outfile:
                    json.dump(data, outfile)
                json_file_list.append(str(sfile))

        depfile = os.path.join(save_dir, '{}-dependencies.txt'.format(r))
        with open(depfile, 'w') as depf:
            depf.write(str(dependencies))

    return json_file_list
def _scatter_profile(ax, ds, sv, pvarname):
    """Scatter the values of ``sv`` against pressure for one dataset.

    Nothing is drawn when the variable is all NaNs, all fill values, or has
    no value above the per-variable minimum used to get rid of 0.0 data.

    :param ax: axes to draw on
    :param ds: dataset providing ``sv`` and ``pvarname``
    :param sv: name of the science variable
    :param pvarname: name of the pressure variable
    :return: the ``units`` attribute of ``sv``
    """
    # values at or below these minimums are dropped; any other variable uses 0
    min_valid = {
        'salinity': 1,
        'density': 1000,
        'conductivity': 0.1,
        'dissolved_oxygen': 160,
        'estimated_oxygen_concentration': 200,
    }
    zpressure = ds[pvarname].values
    z1 = ds[sv].values
    fv = ds[sv]._FillValue
    sv_units = ds[sv].units

    # Check if the array is all NaNs
    if sum(np.isnan(z1)) == len(z1):
        print('Array of all NaNs - skipping plot.')
        return sv_units

    # Check if the array is all fill values
    if len(z1[z1 != fv]) == 0:
        print('Array of all fill values - skipping plot.')
        return sv_units

    # get rid of 0.0 data
    ind = z1 > min_valid.get(sv, 0)
    if np.any(ind):
        ax.scatter(z1[ind], zpressure[ind], s=2, edgecolor='None')
    return sv_units


def main(url_list, sDir, stime, etime):
    """Plot science variables against pressure for two instruments on shared axes.

    For every deployment of the first instrument, the first matching file of
    each instrument is opened and, for each science variable whose name does
    not contain 'pressure', both datasets are scattered against pressure on
    one figure that is saved under
    ``sDir/<array>/<subsite>/<inst>/profile_plots/<deployment>``.

    :param url_list: list of exactly two dataset catalog URLs; any other
        length prints a message and returns without plotting
    :param sDir: root directory for the saved figures
    :param stime: start of the time range to plot, or None
    :param etime: end of the time range to plot, or None; the range is only
        applied when both ``stime`` and ``etime`` are given
    """
    if len(url_list) != 2:
        print('Please provide 2 reference designators for plotting')
        return

    uu0 = url_list[0]
    uu1 = url_list[1]
    rd0 = uu0.split('/')[-2][20:47]
    rd1 = uu1.split('/')[-2][20:47]
    array = rd0[0:2]
    inst = rd0.split('-')[-1]

    datasets0 = list(cf.get_nc_urls([uu0]))
    datasets1 = list(cf.get_nc_urls([uu1]))

    fdatasets0_sel = cf.filter_collocated_instruments(rd0.split('-')[-1], datasets0)
    fdatasets1_sel = cf.filter_collocated_instruments(rd1.split('-')[-1], datasets1)

    deployments = [dd.split('/')[-1].split('_')[0] for dd in fdatasets0_sel]

    for d in deployments:
        fd0 = [x for x in fdatasets0_sel if d in x]
        fd1 = [x for x in fdatasets1_sel if d in x]
        if not fd1:
            # the second instrument has no file for this deployment
            print('No file for {} from the second instrument - skipping.'.format(d))
            continue

        # context managers make sure both files are closed once the deployment is plotted
        with xr.open_dataset(fd0[0], mask_and_scale=False) as raw0, \
                xr.open_dataset(fd1[0], mask_and_scale=False) as raw1:
            ds0 = raw0.swap_dims({'obs': 'time'})
            ds1 = raw1.swap_dims({'obs': 'time'})

            if stime is not None and etime is not None:
                ds0 = ds0.sel(time=slice(stime, etime))
                ds1 = ds1.sel(time=slice(stime, etime))
                if len(ds0['time'].values) == 0:
                    print('No data to plot for specified time range: ({} to {})'.format(stime, etime))
                    continue

            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd0[0])
            sci_vars = cf.return_science_vars(stream)

            save_dir_profile = os.path.join(sDir, array, subsite, inst, 'profile_plots', deployment)
            cf.create_dir(save_dir_profile)

            # get pressure variable
            pvarname, y1, y_units, press, y_fillvalue = cf.add_pressure_to_dictionary_of_sci_vars(ds0)

            for sv in sci_vars:
                print('')
                print(sv)
                if 'pressure' in sv:
                    continue

                fig, ax = plt.subplots()
                plt.margins(y=.08, x=.02)
                plt.grid()
                title = ' '.join((deployment, subsite, inst, method))
                sname = '-'.join((subsite, inst, method, sv))

                # both instruments share the axes; the x label uses the units
                # reported by the last dataset drawn
                for ds in (ds0, ds1):
                    sv_units = _scatter_profile(ax, ds, sv, pvarname)

                xlabel = sv + " (" + sv_units + ")"
                ylabel = press[0] + " (" + y_units[0] + ")"

                ax.invert_yaxis()
                ax.set_xlabel(xlabel, fontsize=9)
                ax.set_ylabel(ylabel, fontsize=9)
                ax.set_title(title + '\nWFP02 (blue) & WFP03 (orange)', fontsize=9)
                fig.tight_layout()
                pf.save_fig(save_dir_profile, sname)
def _holds_file_list(cell):
    """Return True when ``cell`` is not numeric, i.e. ``np.isnan`` rejects it with TypeError.

    Numeric cells (such as the NaN of an empty cell) return False.
    """
    try:
        np.isnan(cell)
    except TypeError:
        return True
    return False


def _open_time_indexed(files):
    """Open one file, or several files as a single dataset, with 'obs' swapped for 'time'."""
    if len(files) == 1:
        ds = xr.open_dataset(files[0])
        ds = ds.swap_dims({'obs': 'time'})
    else:
        ds = xr.open_mfdataset(files)
        ds = ds.swap_dims({'obs': 'time'})
        ds = ds.chunk({'time': 100})
    return ds


def _missing_exact(method, merged, column, blank_dict):
    """Describe data missing from ``column`` when timestamps are matched exactly."""
    if 'telemetered' in method:
        return 'method not checked for missing data'
    missing = merged.loc[merged[column].isnull()]
    if len(missing) > 0:
        missing_dict = missing_data_times(missing)
        if missing_dict == blank_dict:
            missing_dict = 'no missing data'
        return missing_dict
    return 'no missing data'


def _missing_hourly(method, umerged, column, blank_dict):
    """Describe data missing from ``column`` when timestamps are rounded to the hour."""
    if 'telemetered' in method:
        return 'method not checked for missing data'
    missing = umerged.loc[umerged[column].isnull()]
    if len(missing) > 0:
        missing_dict = missing_data_times(missing)
        if missing_dict != blank_dict:
            # the counts are in hours for this comparison, so rename the keys
            missing_dict['n_hours_missing'] = missing_dict.pop('n_missing')
            missing_dict['n_hours_missing_total'] = missing_dict.pop('n_missing_total')
            return missing_dict
    return 'timestamps rounded to the hour: no missing data'


def compare_data(df):
    """Compare the science variables of every pair of columns in ``df``, row by row.

    Each row is one deployment (the index value is used as the deployment
    key). Each column name has the form '<method>-<stream>'. A cell is
    either numeric (e.g. NaN, skipped) or a list of NetCDF file paths.
    For every pair of file-list cells in a row, variables with the same long
    name are compared: units, number of overlapping timestamps, min/max
    absolute difference, and data missing from either side.

    :param df: DataFrame of file lists indexed by deployment
    :return: nested dict
        ``{'deployments': {deployment: {'comparison': {'<col0> <col1>': {'vars': {...}}}}}}``
    """
    names = df.columns
    summary = dict(deployments=dict())
    for d, row in df.iterrows():
        for i, n in enumerate(names):
            if i == 0:
                continue  # the first column has no earlier column to pair with
            f1 = row[n]
            if not _holds_file_list(f1):
                continue

            for x in range(i):
                f0 = row[names[x]]
                if not _holds_file_list(f0):
                    continue

                compare = '{} {}'.format(names[x], n)
                dep_summary = summary['deployments'].setdefault(d, dict(comparison=dict()))
                pair_summary = dep_summary['comparison'].setdefault(compare, dict(vars=dict()))

                ds0 = _open_time_indexed(f0)
                splt0 = compare.split(' ')[0].split('-')
                ds0_sci_vars = cf.return_science_vars(splt0[1])
                ds0_method = splt0[0]

                ds1 = _open_time_indexed(f1)
                splt1 = compare.split(' ')[1].split('-')
                ds1_sci_vars = cf.return_science_vars(splt1[1])
                ds1_method = splt1[0]

                # find where the variable long names are the same
                ds0names = long_names(ds0, ds0_sci_vars)
                ds0names.rename(columns={'name': 'name_ds0'}, inplace=True)
                ds1names = long_names(ds1, ds1_sci_vars)
                ds1names.rename(columns={'name': 'name_ds1'}, inplace=True)
                mapping = pd.merge(ds0names, ds1names, on='long_name', how='inner')
                print('----------------------')
                print('{}: {}'.format(d, compare))
                print('----------------------')

                # what missing_data_times is expected to return when nothing is missing
                blank_dict = {
                    'missing_data_gaps': [],
                    'n_missing': [],
                    'n_missing_days_total': 0,
                    'n_missing_total': 0
                }

                for rr in mapping.itertuples():
                    index, name_ds0, long_name, name_ds1 = rr
                    print(long_name)

                    # Compare data from two data streams (round timestamps to the nearest second).
                    ds0_rename = '_'.join((str(name_ds0), 'ds0'))
                    [ds0_df, ds0_units, n0, n0_nan] = get_ds_variable_info(ds0, name_ds0, ds0_rename)

                    ds1_rename = '_'.join((str(name_ds1), 'ds1'))
                    [ds1_df, ds1_units, n1, n1_nan] = get_ds_variable_info(ds1, name_ds1, ds1_rename)

                    # Compare units
                    unit_test = 'pass' if ds0_units == ds1_units else 'fail'

                    # Defaults for every case in which no point-by-point comparison
                    # is made, so a failed unit test never reuses a previous
                    # variable's results or hits unassigned names.
                    n_comparison = 0
                    n_diff_g_zero = None
                    min_diff = None
                    max_diff = None
                    ds0_missing_dict = 'not compared: units do not match'
                    ds1_missing_dict = 'not compared: units do not match'

                    if unit_test == 'pass':
                        # skip if the variables have more than 1 dimension
                        # (presumably signalled by a str in place of the dataframe)
                        if isinstance(ds0_df, str) or isinstance(ds1_df, str):
                            ds0_missing_dict = '2D dataset'
                            ds1_missing_dict = '2D dataset'
                        else:
                            # Merge dataframes from both methods
                            merged = pd.merge(ds0_df, ds1_df, on='time', how='outer')

                            # Drop rows where both variables are NaNs
                            merged.dropna(subset=[ds0_rename, ds1_rename], how='all', inplace=True)
                            if len(merged) == 0:
                                print('No valid data to compare')
                                ds0_missing_dict = 'No valid data to compare'
                                ds1_missing_dict = 'No valid data to compare'
                            else:
                                # make sure the timestamps are in order
                                merged = merged.sort_values('time').reset_index(drop=True)
                                m_intersect = merged[merged[ds0_rename].notnull()
                                                     & merged[ds1_rename].notnull()]

                                # If the number of data points for comparison is less than 1% of the
                                # smaller sample size compare the timestamps by rounding to the nearest hour
                                if len(m_intersect) == 0 or \
                                        float(len(m_intersect)) / float(min(n0, n1)) * 100 < 1.00:
                                    utime_df0 = unique_timestamps_hour(ds0)
                                    utime_df0['ds0'] = 'ds0'
                                    utime_df1 = unique_timestamps_hour(ds1)
                                    utime_df1['ds1'] = 'ds1'
                                    umerged = pd.merge(utime_df0, utime_df1, on='time', how='outer')
                                    umerged = umerged.sort_values('time').reset_index(drop=True)

                                    ds0_missing_dict = _missing_hourly(ds0_method, umerged, 'ds0', blank_dict)
                                    ds1_missing_dict = _missing_hourly(ds1_method, umerged, 'ds1', blank_dict)
                                else:
                                    # Find where data are available in one dataset and missing in the
                                    # other if timestamps match exactly. Don't check for missing data in
                                    # telemetered datasets.
                                    ds0_missing_dict = _missing_exact(ds0_method, merged, ds0_rename, blank_dict)
                                    ds1_missing_dict = _missing_exact(ds1_method, merged, ds1_rename, blank_dict)

                                    # Where the data intersect, calculate the difference between the methods
                                    diff = m_intersect[ds0_rename] - m_intersect[ds1_rename]
                                    # NOTE(review): this literal is parsed as the float 1.0, so this
                                    # counts differences greater than 1.0 although the field is named
                                    # "greater zero" - confirm the intended threshold.
                                    n_diff_g_zero = sum(abs(diff) > 0.99999999999999999)

                                    min_diff = round(min(abs(diff)), 10)
                                    max_diff = round(max(abs(diff)), 10)
                                    n_comparison = len(diff)

                    pair_summary['vars'][str(long_name)] = dict(
                        ds0=dict(name=name_ds0, units=ds0_units, n=n0, n_nan=n0_nan,
                                 missing=ds0_missing_dict),
                        ds1=dict(name=name_ds1, units=ds1_units, n=n1, n_nan=n1_nan,
                                 missing=ds1_missing_dict),
                        unit_test=unit_test,
                        n_comparison=n_comparison,
                        n_diff_greater_zero=n_diff_g_zero,
                        min_abs_diff=min_diff,
                        max_abs_diff=max_diff)
    return summary
def main(sDir, url_list): reviewlist = pd.read_csv( 'https://raw.githubusercontent.com/ooi-data-lab/data-review-prep/master/review_list/data_review_list.csv' ) rd_list = [] for uu in url_list: elements = uu.split('/')[-2].split('-') rd = '-'.join((elements[1], elements[2], elements[3], elements[4])) if rd not in rd_list: rd_list.append(rd) json_file_list = [] for r in rd_list: dependencies = [] print('\n{}'.format(r)) data = OrderedDict(deployments=OrderedDict()) save_dir = os.path.join(sDir, r.split('-')[0], r) cf.create_dir(save_dir) # Deployment location test deploy_loc_test = cf.deploy_location_check(r) data['location_comparison'] = deploy_loc_test for u in url_list: splitter = u.split('/')[-2].split('-') rd_check = '-'.join( (splitter[1], splitter[2], splitter[3], splitter[4])) catalog_rms = '-'.join((r, splitter[-2], splitter[-1])) # complete the analysis by reference designator if rd_check == r: udatasets = cf.get_nc_urls([u]) # check for the OOI 1.0 datasets for review rl_filtered = reviewlist.loc[ (reviewlist['Reference Designator'] == r) & (reviewlist['status'] == 'for review')] review_deployments = rl_filtered['deploymentNumber'].tolist() review_deployments_int = [ 'deployment%04d' % int(x) for x in review_deployments ] for rev_dep in review_deployments_int: rdatasets = [s for s in udatasets if rev_dep in s] if len(rdatasets) > 0: datasets = [] for dss in rdatasets: # filter out collocated data files if catalog_rms == dss.split('/')[-1].split( '_20')[0][15:]: datasets.append(dss) else: drd = dss.split('/')[-1].split('_20')[0][15:42] if drd not in dependencies and drd != r: dependencies.append(drd) notes = [] time_ascending = '' if len(datasets) == 1: try: ds = xr.open_dataset(datasets[0], mask_and_scale=False) ds = ds.swap_dims({'obs': 'time'}) fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes( datasets[0]) except OSError: print('OSError - skipping file {}'.format( datasets[0])) continue elif len(datasets) > 1: ds = 
xr.open_mfdataset(datasets, mask_and_scale=False) ds = ds.swap_dims({'obs': 'time'}) #ds = ds.chunk({'time': 100}) fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes( datasets[0]) fname = fname.split('_20')[0] notes.append('multiple deployment .nc files') # when opening multiple datasets, don't check that the timestamps are in ascending order time_ascending = 'not_tested' else: continue print('\nAnalyzing file: {}'.format(fname)) # Get info from the data review database dr_data = cf.refdes_datareview_json(refdes) stream_vars = cf.return_stream_vars(data_stream) sci_vars = cf.return_science_vars(data_stream) node = refdes.split('-')[1] if 'cspp' in data_stream or 'WFP' in node: sci_vars.append('int_ctd_pressure') # if 'FDCHP' in refdes: # remove_vars = ['fdchp_wind_x', 'fdchp_wind_y', 'fdchp_wind_z', 'fdchp_speed_of_sound_sonic', # 'fdchp_x_accel_g', 'fdchp_y_accel_g', 'fdchp_z_accel_g'] # rv_regex = re.compile('|'.join(remove_vars)) # rv_sci_vars = [nn for nn in sci_vars if not rv_regex.search(nn)] # sci_vars = rv_sci_vars deploy_info = get_deployment_information( dr_data, int(deployment[-4:])) # Grab deployment Variables deploy_start = str(deploy_info['start_date']) deploy_stop = str(deploy_info['stop_date']) deploy_lon = deploy_info['longitude'] deploy_lat = deploy_info['latitude'] deploy_depth = deploy_info['deployment_depth'] # Calculate days deployed if deploy_stop != 'None': r_deploy_start = pd.to_datetime( deploy_start).replace(hour=0, minute=0, second=0) if deploy_stop.split('T')[1] == '00:00:00': r_deploy_stop = pd.to_datetime(deploy_stop) else: r_deploy_stop = (pd.to_datetime(deploy_stop) + timedelta(days=1)).replace( hour=0, minute=0, second=0) n_days_deployed = (r_deploy_stop - r_deploy_start).days else: n_days_deployed = None # Add reference designator to dictionary try: data['refdes'] except KeyError: data['refdes'] = refdes deployments = data['deployments'].keys() data_start = pd.to_datetime(min( 
ds['time'].values)).strftime('%Y-%m-%dT%H:%M:%S') data_stop = pd.to_datetime(max( ds['time'].values)).strftime('%Y-%m-%dT%H:%M:%S') # Add deployment and info to dictionary and initialize delivery method sub-dictionary if deployment not in deployments: data['deployments'][deployment] = OrderedDict( deploy_start=deploy_start, deploy_stop=deploy_stop, n_days_deployed=n_days_deployed, lon=deploy_lon, lat=deploy_lat, deploy_depth=deploy_depth, method=OrderedDict()) # Add delivery methods to dictionary and initialize stream sub-dictionary methods = data['deployments'][deployment][ 'method'].keys() if method not in methods: data['deployments'][deployment]['method'][ method] = OrderedDict(stream=OrderedDict()) # Add streams to dictionary and initialize file sub-dictionary streams = data['deployments'][deployment]['method'][ method]['stream'].keys() if data_stream not in streams: data['deployments'][deployment]['method'][method][ 'stream'][data_stream] = OrderedDict( file=OrderedDict()) # Get a list of data gaps >1 day time_df = pd.DataFrame(ds['time'].values, columns=['time']) gap_list = cf.timestamp_gap_test(time_df) # Calculate the sampling rate to the nearest second time_df['diff'] = time_df['time'].diff().astype( 'timedelta64[s]') rates_df = time_df.groupby(['diff']).agg(['count']) n_diff_calc = len(time_df) - 1 rates = dict(n_unique_rates=len(rates_df), common_sampling_rates=dict()) for i, row in rates_df.iterrows(): percent = (float(row['time']['count']) / float(n_diff_calc)) if percent > 0.1: rates['common_sampling_rates'].update( {int(i): '{:.2%}'.format(percent)}) sampling_rt_sec = None for k, v in rates['common_sampling_rates'].items(): if float(v.strip('%')) > 50.00: sampling_rt_sec = k if not sampling_rt_sec: sampling_rt_sec = 'no consistent sampling rate: {}'.format( rates['common_sampling_rates']) # Check that the timestamps in the file are unique time = ds['time'] len_time = time.__len__() len_time_unique = np.unique(time).__len__() if len_time == 
len_time_unique: time_test = 'pass' else: time_test = 'fail' # Check that the timestamps in the file are in ascending order if time_ascending != 'not_tested': # convert time to number time_in = [ dt.datetime.utcfromtimestamp( np.datetime64(x).astype('O') / 1e9) for x in ds['time'].values ] time_data = nc.date2num( time_in, 'seconds since 1900-01-01') # Create a list of True or False by iterating through the array of time and checking # if every time stamp is increasing result = [(time_data[k + 1] - time_data[k]) > 0 for k in range(len(time_data) - 1)] # Print outcome of the iteration with the list of indices when time is not increasing if result.count(True) == len(time) - 1: time_ascending = 'pass' else: ind_fail = { k: time_in[k] for k, v in enumerate(result) if v is False } time_ascending = 'fail: {}'.format(ind_fail) # Count the number of days for which there is at least 1 timestamp n_days = len( np.unique(time.values.astype('datetime64[D]'))) # Compare variables in file to variables in Data Review Database ds_variables = list(ds.data_vars.keys()) + list( ds.coords.keys()) #ds_variables = [k for k in ds] ds_variables = eliminate_common_variables(ds_variables) ds_variables = [ x for x in ds_variables if 'qc' not in x ] [_, unmatch1] = compare_lists(stream_vars, ds_variables) [_, unmatch2] = compare_lists(ds_variables, stream_vars) # Check deployment pressure from asset management against pressure variable in file press = pf.pressure_var(ds, list(ds.coords.keys())) if press is None: press = pf.pressure_var(ds, list(ds.data_vars.keys())) # calculate mean pressure from data, excluding outliers +/- 3 SD try: pressure = ds[press] num_dims = len(pressure.dims) if len(pressure) > 1: # if the pressure variable is an array of all zeros (as in the case of pressure_depth # for OPTAAs on surface piercing profilers if (len(np.unique(pressure)) == 1) & ( np.unique(pressure)[0] == 0.0): try: pressure = ds['int_ctd_pressure'] press = 'int_ctd_pressure' except KeyError: pressure 
= pressure # reject NaNs p_nonan = pressure.values[~np.isnan(pressure. values)] # reject fill values p_nonan_nofv = p_nonan[ p_nonan != pressure._FillValue] # reject data outside of global ranges [pg_min, pg_max] = cf.get_global_ranges(r, press) if pg_min is not None and pg_max is not None: pgr_ind = cf.reject_global_ranges( p_nonan_nofv, pg_min, pg_max) p_nonan_nofv_gr = p_nonan_nofv[pgr_ind] else: p_nonan_nofv_gr = p_nonan_nofv if (len(p_nonan_nofv_gr) > 0) and (num_dims == 1): [ press_outliers, pressure_mean, _, pressure_max, _, _ ] = cf.variable_statistics( p_nonan_nofv_gr, 3) pressure_mean = round(pressure_mean, 2) pressure_max = round(pressure_max, 2) elif (len(p_nonan_nofv_gr) > 0) and (num_dims > 1): print('variable has more than 1 dimension') press_outliers = 'not calculated: variable has more than 1 dimension' pressure_mean = round( np.nanmean(p_nonan_nofv_gr), 2) pressure_max = round( np.nanmax(p_nonan_nofv_gr), 2) else: press_outliers = None pressure_mean = None pressure_max = None if len(pressure) > 0 and len(p_nonan) == 0: notes.append( 'Pressure variable all NaNs') elif len(pressure) > 0 and len( p_nonan) > 0 and len( p_nonan_nofv) == 0: notes.append( 'Pressure variable all fill values' ) elif len(pressure) > 0 and len( p_nonan) > 0 and len( p_nonan_nofv) > 0 and len( p_nonan_nofv_gr) == 0: notes.append( 'Pressure variable outside of global ranges' ) else: # if there is only 1 data point press_outliers = 0 pressure_mean = round( ds[press].values.tolist()[0], 2) pressure_max = round( ds[press].values.tolist()[0], 2) try: pressure_units = pressure.units except AttributeError: pressure_units = 'no units attribute for pressure' if pressure_mean: if ('WFP' in node) or ('MOAS' in subsite) or ( 'SP' in node): pressure_compare = int(round(pressure_max)) else: pressure_compare = int( round(pressure_mean)) if pressure_units == '0.001 dbar': pressure_max = round((pressure_max / 1000), 2) pressure_mean = round( (pressure_mean / 1000), 2) pressure_compare = 
round( (pressure_compare / 1000), 2) notes.append( 'Pressure converted from 0.001 dbar to dbar for pressure comparison' ) elif pressure_units == 'daPa': pressure_max = round((pressure_max / 1000), 2) pressure_mean = round( (pressure_mean / 1000), 2) pressure_compare = round( (pressure_compare / 1000), 2) notes.append( 'Pressure converted from daPa to dbar for pressure comparison' ) else: pressure_compare = None if (not deploy_depth) or (not pressure_mean): pressure_diff = None else: pressure_diff = pressure_compare - deploy_depth except KeyError: press = 'no seawater pressure in file' pressure_diff = None pressure_mean = None pressure_max = None pressure_compare = None press_outliers = None pressure_units = None # Add files and info to dictionary filenames = data['deployments'][deployment]['method'][ method]['stream'][data_stream]['file'].keys() if fname not in filenames: data['deployments'][deployment]['method'][method][ 'stream'][data_stream]['file'][ fname] = OrderedDict( file_downloaded=pd.to_datetime( splitter[0][0:15]).strftime( '%Y-%m-%dT%H:%M:%S'), file_coordinates=list( ds.coords.keys()), sampling_rate_seconds=sampling_rt_sec, sampling_rate_details=rates, data_start=data_start, data_stop=data_stop, time_gaps=gap_list, unique_timestamps=time_test, n_timestamps=len_time, n_days=n_days, notes=notes, ascending_timestamps=time_ascending, pressure_comparison=dict( pressure_mean=pressure_mean, units=pressure_units, num_outliers=press_outliers, diff=pressure_diff, pressure_max=pressure_max, variable=press, pressure_compare=pressure_compare), vars_in_file=ds_variables, vars_not_in_file=[ x for x in unmatch1 if 'time' not in x ], vars_not_in_db=unmatch2, sci_var_stats=OrderedDict()) # calculate statistics for science variables, excluding outliers +/- 5 SD for sv in sci_vars: if sv != 't_max': # for ADCP if sv != 'wavss_a_buoymotion_time': print(sv) try: var = ds[sv] # need to round SPKIR values to 1 decimal place to match the global ranges. 
# otherwise, values that round to zero (e.g. 1.55294e-05) will be excluded by # the global range test # if 'spkir' in sv: # vD = np.round(var.values, 1) # else: # vD = var.values vD = var.values if 'timedelta' not in str( var.values.dtype): # for OPTAA wavelengths: when multiple files are opened with xr.open_mfdataset # xarray automatically forces all variables to have the same number of # dimensions. So in this case wavelength_a and wavelength_c have 1 dimension # in the individual files, so I'm forcing the analysis to treat them like # they have 1 dimension (when there are multiple files for 1 deployment) if sv == 'wavelength_a' or sv == 'wavelength_c': [g_min, g_max] = cf.get_global_ranges( r, sv) vnum_dims = len(var.dims) if vnum_dims == 1: n_all = len(var) mean = list(vD) else: vnum_dims = 1 n_all = len(vD[0]) mean = list(vD[0]) num_outliers = None vmin = None vmax = None sd = None n_stats = 'not calculated' var_units = var.units n_nan = None n_fv = None n_grange = 'no global ranges' fv = var._FillValue else: vnum_dims = len(var.dims) if vnum_dims > 2: print( 'variable has more than 2 dimensions' ) num_outliers = None mean = None vmin = None vmax = None sd = None n_stats = 'variable has more than 2 dimensions' var_units = var.units n_nan = None n_fv = None n_grange = None fv = None n_all = None else: if vnum_dims > 1: n_all = [ len(vD), len(vD.flatten()) ] else: n_all = len(vD) n_nan = int( np.sum(np.isnan(vD))) fv = var._FillValue var_nofv = var.where( var != fv) n_fv = int( np.sum( np.isnan( var_nofv.values ))) - n_nan try: var_units = var.units except AttributeError: var_units = 'no_units' [g_min, g_max ] = cf.get_global_ranges( r, sv) if list( np.unique( np.isnan( var_nofv)) ) != [True]: # reject data outside of global ranges if g_min is not None and g_max is not None: var_gr = var_nofv.where( (var_nofv >= g_min) & (var_nofv <= g_max)) n_grange = int( np.sum( np.isnan( var_gr) ) - n_fv - n_nan) else: n_grange = 'no global ranges' var_gr = var_nofv if list( 
np.unique( np.isnan( var_gr) )) != [True]: if sv == 'spkir_abj_cspp_downwelling_vector': # don't remove outliers from dataset [ num_outliers, mean, vmin, vmax, sd, n_stats ] = cf.variable_statistics_spkir( var_gr) else: if vnum_dims > 1: var_gr = var_gr.values.flatten( ) # drop nans before calculating stats var_gr = var_gr[ ~np.isnan( var_gr )] [ num_outliers, mean, vmin, vmax, sd, n_stats ] = cf.variable_statistics( var_gr, 5) else: num_outliers = None mean = None vmin = None vmax = None sd = None n_stats = 0 n_grange = None else: num_outliers = None mean = None vmin = None vmax = None sd = None n_stats = 0 n_grange = None except KeyError: if sv == 'int_ctd_pressure': continue else: num_outliers = None mean = None vmin = None vmax = None sd = None n_stats = 'variable not found in file' var_units = None n_nan = None n_fv = None fv = None n_grange = None n_all = None if vnum_dims > 1: sv = '{} (dims: {})'.format( sv, list(var.dims)) else: sv = sv if 'timedelta' not in str( var.values.dtype): data['deployments'][deployment][ 'method'][method]['stream'][ data_stream]['file'][fname][ 'sci_var_stats'][sv] = dict( n_outliers=num_outliers, mean=mean, min=vmin, max=vmax, stdev=sd, n_stats=n_stats, units=var_units, n_nans=n_nan, n_fillvalues=n_fv, fill_value=str(fv), global_ranges=[ g_min, g_max ], n_grange=n_grange, n_all=n_all) sfile = os.path.join(save_dir, '{}-file_analysis.json'.format(r)) with open(sfile, 'w') as outfile: json.dump(data, outfile) depfile = os.path.join(save_dir, '{}-dependencies.txt'.format(r)) with open(depfile, 'w') as depf: depf.write(str(dependencies)) json_file_list.append(str(sfile)) return json_file_list
def main(sDir, url_list, start_time, end_time, preferred_only):
    """Plot the science variables of every dataset reachable from ``url_list``.

    For each reference designator parsed from ``url_list`` the NetCDF files
    are listed with ``cf.get_nc_urls``, optionally restricted to the
    preferred streams, and every remaining science variable is plotted twice
    (once without and once with the plotting helper's outlier option).
    Variables with one dimension go through ``pf.plot_timeseries``; all
    others are plotted against ``bin_depths`` with ``pf.plot_adcp``.

    :param sDir: root directory under which the plot directories are created
    :param url_list: URLs whose second-to-last path component is
        dash-delimited; fields 1-4 form the reference designator and fields
        5-6 are used when matching preferred streams
    :param start_time: start of the time slice passed to ``ds.sel``; the
        slice is applied only when both bounds are not None
    :param end_time: end of the time slice passed to ``ds.sel``
    :param preferred_only: the string 'yes' restricts the files to those
        matching ``cf.get_preferred_stream_info``; any other value keeps all
    """
    # unique reference designators, in first-seen order
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))

        # gather the NetCDF files of every catalog that belongs to this reference designator
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.append(cf.get_nc_urls([u]))
        datasets = list(itertools.chain(*datasets))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # the cell is not a string, so there is nothing to match
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        fdatasets = np.unique(fdatasets).tolist()
        for fd in fdatasets:
            # the context manager releases the file handle before the next file is opened
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                sci_vars = cf.return_science_vars(stream)
                # drop the following list of key words from science variables list
                sci_vars = notin_list(sci_vars, ['bin_depths', 'salinity', 'temperature', 'beam'])
                sci_vars = [name for name in sci_vars if ds[name].units != 'mm s-1']

                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes, 'preferred_method_plots', deployment)
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title_text = ' '.join((deployment, refdes, method))

                for var in sci_vars:
                    print(var)
                    v = ds[var]
                    fv = v._FillValue
                    v_name = v.long_name

                    if len(v.dims) == 1:
                        v, n_nan, n_fv, n_ev, n_grange, g_min, g_max, n_std = reject_err_data_1_dims(
                            v, fv, r, var, n=5)

                        # Plot all data
                        fig, ax = pf.plot_timeseries(tm, v, v_name, stdev=None)
                        ax.set_title((title_text + '\n' + t0 + ' - ' + t1), fontsize=9)
                        sfile = '-'.join((filename, v_name, t0[:10]))
                        pf.save_fig(save_dir, sfile)

                        # Plot data with outliers removed
                        fig, ax = pf.plot_timeseries(tm, v, v_name, stdev=5)
                        title_i = 'removed: {} nans, {} fill values, {} extreme values, {} GR [{}, {}],' \
                                  ' {} outliers +/- 5 SD'.format(n_nan, n_fv, n_ev, n_grange, g_min, g_max, n_std)
                        ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + title_i), fontsize=8)
                        sfile = '-'.join((filename, v_name, t0[:10])) + '_rmoutliers'
                        pf.save_fig(save_dir, sfile)
                    else:
                        v = v.values.T.astype(float)
                        v_bad_beams = ds['percent_bad_beams']  # get bad beams percent
                        fv_bad_beam = v_bad_beams._FillValue
                        v_bad_beams = v_bad_beams.values.T.astype(float)
                        v_bad_beams[v_bad_beams == fv_bad_beam] = np.nan  # mask fill values

                        v, n_nan, n_fv, n_ev, n_bb, n_grange, g_min, g_max = reject_err_data_2_dims(
                            v, v_bad_beams, fv, r, var)

                        ylabel = 'bin_depths ({})'.format(ds['bin_depths'].units)
                        clabel = '{} ({})'.format(var, ds[var].units)

                        # check bin depths for extreme values
                        y = ds['bin_depths'].values.T
                        y_nan = np.sum(np.isnan(y))
                        y = np.where(y < 6000, y, np.nan)  # replace extreme bin_depths by nans
                        bin_nan = np.sum(np.isnan(y)) - y_nan
                        bin_title = 'removed: {} bin depths > 6000'.format(bin_nan)

                        if 'echo' in var:
                            color = 'BuGn'
                        else:
                            color = 'RdBu'

                        # keep only the rows/columns of the data that survive the bin-depth NaN drop
                        new_y = dropna(y, axis=1)  # convert to DataFrame to drop nan
                        y_mask = new_y.loc[list(new_y.index), list(new_y.columns)]
                        v_new = pd.DataFrame(v)
                        v_mask = v_new.loc[list(new_y.index), list(new_y.columns)]
                        tm_mask = tm[new_y.columns]

                        fig, ax, __ = pf.plot_adcp(tm_mask, np.array(y_mask), np.array(v_mask), ylabel, clabel,
                                                   color, n_stdev=None)
                        if bin_nan > 0:
                            ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + bin_title), fontsize=8)
                        else:
                            ax.set_title((title_text + '\n' + t0 + ' - ' + t1), fontsize=8)
                        sfile = '-'.join((filename, var, t0[:10]))
                        pf.save_fig(save_dir, sfile)

                        fig, ax, n_nans_all = pf.plot_adcp(tm_mask, np.array(y_mask), np.array(v_mask), ylabel,
                                                           clabel, color, n_stdev=5)
                        title_i = 'removed: {} nans {} fill values, {} extreme values, {} bad beams, {} GR [{}, {}]'.format(
                            n_nan, n_fv, n_ev, n_bb, n_grange, g_min, g_max)
                        if bin_nan > 0:
                            ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + title_i + '\n' + bin_title),
                                         fontsize=8)
                        else:
                            ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + title_i), fontsize=8)
                        sfile = '-'.join((filename, var, t0[:10])) + '_rmoutliers'
                        pf.save_fig(save_dir, sfile)
def main(url_list, sDir, deployment_num, start_time, end_time, preferred_only,
         n_std, surface_params, depth_params):
    """Create profile, cross-section and (for gliders) 4D plots per science variable.

    For each non-engineering reference designator parsed from ``url_list``
    the matching NetCDF files are listed, optionally restricted to the
    preferred streams, and filtered with ``cf.filter_collocated_instruments``.
    Every science variable whose name does not contain 'pressure' is plotted
    once with all data and once after the rejection helpers have been applied.

    :param url_list: URLs whose second-to-last path component is
        dash-delimited; fields 1-4 form the reference designator
    :param sDir: root directory under which the plot directories are created
    :param deployment_num: when not None, only the deployment with this
        number is processed
    :param start_time: start of the time slice; must provide ``strftime``
        because it is also used to name the output directory. The slice is
        applied only when both bounds are not None
    :param end_time: end of the time slice (same requirements as start_time)
    :param preferred_only: the string 'yes' restricts the files to those
        matching ``cf.get_preferred_stream_info``
    :param n_std: accepted for interface compatibility; the value handed to
        ``reject_timestamps_in_groups`` is always rebuilt from
        ``surface_params[3]`` and ``depth_params[3]``
    :param surface_params: indexable; items 0-2 are the ``range`` arguments
        of the first set of depth groups, item 3 is the value repeated for
        each of those groups
    :param depth_params: same layout as ``surface_params`` for the second
        set of depth groups (its stop value is extended by one step)
    """
    # unique reference designators, skipping the engineering ones
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'ENG' not in rd:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))

        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.append(cf.get_nc_urls([u]))
        datasets = list(itertools.chain(*datasets))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for fd in fdatasets_sel:
            part_d = fd.split('/')[-1]
            print('\n{}'.format(part_d))

            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)

            # Compare by value, and read the number from the trailing digits.
            # Splitting on '0' breaks for numbers ending in 0 ('deployment0010').
            # assumes the 'deploymentNNNN' naming used elsewhere in this file - TODO confirm
            if deployment_num is not None and int(deployment[-4:]) != deployment_num:
                print('\nskipping {}'.format(deployment))
                continue

            array = subsite[0:2]
            sci_vars = cf.return_science_vars(stream)

            # bathymetry inputs for the cross-section plot; stay None unless found below
            m_water_depth = None
            t_eng = None
            if 'CE05MOAS' in r or 'CP05MOAS' in r:
                # for coastal gliders, get m_water_depth for bathymetry
                eng = '-'.join((r.split('-')[0], r.split('-')[1], '00-ENG000000', method, 'glider_eng'))
                eng_url = [s for s in url_list if eng in s]
                if len(eng_url) == 1:
                    eng_datasets = cf.get_nc_urls(eng_url)
                    # filter out collocated datasets
                    eng_dataset = [j for j in eng_datasets
                                   if (eng in j.split('/')[-1] and deployment in j.split('/')[-1])]
                    if len(eng_dataset) > 0:
                        with xr.open_dataset(eng_dataset[0], mask_and_scale=False) as ds_eng:
                            t_eng = ds_eng['time'].values
                            m_water_depth = ds_eng['m_water_depth'].values
                            # m_altimeter_status = 0 means a good reading (not nan or -1)
                            eng_ind = ds_eng['m_altimeter_status'].values == 0
                        m_water_depth = m_water_depth[eng_ind]
                        t_eng = t_eng[eng_ind]
                    else:
                        print('No engineering file for deployment {}'.format(deployment))

            # the context manager releases the file handle before the next file is opened
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue
                    stime = start_time.strftime('%Y-%m-%d')
                    etime = end_time.strftime('%Y-%m-%d')
                    ext = stime + 'to' + etime
                    save_dir_profile = os.path.join(sDir, array, subsite, refdes, 'profile_plots', deployment, ext)
                    save_dir_xsection = os.path.join(sDir, array, subsite, refdes, 'xsection_plots', deployment, ext)
                    save_dir_4d = os.path.join(sDir, array, subsite, refdes, 'xsection_plots_4d', deployment, ext)
                else:
                    save_dir_profile = os.path.join(sDir, array, subsite, refdes, 'profile_plots', deployment)
                    save_dir_xsection = os.path.join(sDir, array, subsite, refdes, 'xsection_plots', deployment)
                    save_dir_4d = os.path.join(sDir, array, subsite, refdes, 'xsection_plots_4d', deployment)

                tm = ds['time'].values
                try:
                    ds_lat = ds['lat'].values
                except KeyError:
                    ds_lat = None
                    print('No latitude variable in file')
                try:
                    ds_lon = ds['lon'].values
                except KeyError:
                    ds_lon = None
                    print('No longitude variable in file')

                # get pressure variable
                y, y_units, press = cf.add_pressure_to_dictionary_of_sci_vars(ds)

                for sv in sci_vars:
                    print(sv)
                    if 'pressure' in sv:
                        continue

                    z = ds[sv].values
                    fv = ds[sv]._FillValue
                    sv_units = ds[sv].units

                    # Check if the array is all NaNs
                    if sum(np.isnan(z)) == len(z):
                        print('Array of all NaNs - skipping plot.')
                        continue
                    # Check if the array is all fill values
                    if len(z[z != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    # reject erroneous data
                    dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max, lat, lon = \
                        cf.reject_erroneous_data(r, sv, tm, y, z, fv, ds_lat, ds_lon)

                    # get rid of 0.0 data
                    if 'CTD' in r:
                        ind = zpressure > 0.0
                    else:
                        ind = ndata > 0.0

                    lenzero = np.sum(~ind)
                    dtime = dtime[ind]
                    zpressure = zpressure[ind]
                    ndata = ndata[ind]
                    if ds_lat is not None and ds_lon is not None:
                        lat = lat[ind]
                        lon = lon[ind]
                    else:
                        lat = None
                        lon = None

                    t0 = pd.to_datetime(dtime.min()).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(dtime.max()).strftime('%Y-%m-%dT%H:%M:%S')
                    title = ' '.join((deployment, refdes, method)) + '\n' + t0 + ' to ' + t1

                    # reject time range from data portal file export
                    t_portal, z_portal, y_portal, lat_portal, lon_portal = \
                        cf.reject_timestamps_dataportal(subsite, r, dtime, zpressure, ndata, lat, lon)

                    print('removed {} data points using visual inspection of data'.format(
                        len(ndata) - len(z_portal)))

                    # create data groups
                    columns = ['tsec', 'dbar', str(sv)]
                    range1 = list(range(surface_params[0], surface_params[1], surface_params[2]))
                    range2 = list(range(depth_params[0], depth_params[1] + depth_params[2], depth_params[2]))
                    ranges = range1 + range2

                    groups, d_groups = gt.group_by_depth_range(t_portal, y_portal, z_portal, columns, ranges)

                    # One value per depth group. The same list is passed for both arguments,
                    # which keeps the caller-supplied n_std parameter untouched.
                    inpercentile = [surface_params[3]] * len(range1) + [depth_params[3]] * len(range2)
                    n_std_groups = [surface_params[3]] * len(range1) + [depth_params[3]] * len(range2)

                    y_plt, n_med, n_min, n_max, n0_std, n1_std, l_arr, time_ex = reject_timestamps_in_groups(
                        groups, d_groups, n_std_groups, inpercentile)

                    sname = '-'.join((r, method, sv))

                    # ---- Plot all data ----
                    if len(tm) > 0:
                        cf.create_dir(save_dir_profile)
                        cf.create_dir(save_dir_xsection)
                        sfileall = '_'.join(('all_data', sname))

                        # profile plot
                        xlabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"
                        clabel = 'Time'

                        fig, ax = pf.plot_profiles(z, y, tm, ylabel, xlabel, clabel, stdev=None)
                        ax.set_title(title, fontsize=9)
                        fig.tight_layout()
                        pf.save_fig(save_dir_profile, sfileall)

                        # xsection plot
                        clabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"

                        fig, ax, bar = pf.plot_xsection(subsite, tm, y, z, clabel, ylabel, t_eng, m_water_depth,
                                                        inpercentile=None, stdev=None)
                        ax.set_title(title, fontsize=9)
                        fig.tight_layout()
                        pf.save_fig(save_dir_xsection, sfileall)

                    # ---- Plot cleaned-up data ----
                    if len(dtime) > 0:
                        sfile = '_'.join(('rm_erroneous_data', sname))

                        # profile plot
                        xlabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"
                        clabel = 'Time'

                        fig, ax = pf.plot_profiles(z_portal, y_portal, t_portal, ylabel, xlabel, clabel, stdev=None)
                        ax.set_title(title, fontsize=9)
                        ax.plot(n_med, y_plt, '.k')
                        ax.fill_betweenx(y_plt, n0_std, n1_std, color='m', alpha=0.2)

                        # single-element tuple: one legend entry holding the whole summary text
                        leg_text = (
                            'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}], '
                            '{} zeros'.format(lenfv, lennan, lenev, lengr, global_min, global_max, lenzero) +
                            '\nexcluded {} suspect data points when inspected visually'.format(
                                len(ndata) - len(z_portal)) +
                            '\n(black) data median in {} dbar segments (break at {} dbar)'.format(
                                [surface_params[2], depth_params[2]], depth_params[0]) +
                            '\n(magenta) upper and lower {} percentile envelope in {} dbar segments'.format(
                                [surface_params[3], depth_params[3]], [surface_params[2], depth_params[2]]),
                        )
                        ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                        fig.tight_layout()
                        pf.save_fig(save_dir_profile, sfile)

                        # xsection plot
                        clabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"

                        # plot non-erroneous data
                        fig, ax, bar = pf.plot_xsection(subsite, t_portal, y_portal, z_portal, clabel, ylabel,
                                                        t_eng, m_water_depth, inpercentile=None, stdev=None)
                        ax.set_title(title, fontsize=9)
                        leg_text = (
                            'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}], '
                            '{} zeros'.format(lenfv, lennan, lenev, lengr, global_min, global_max, lenzero) +
                            '\nexcluded {} suspect data points when inspected visually'.format(
                                len(ndata) - len(z_portal)),
                        )
                        ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                        fig.tight_layout()
                        pf.save_fig(save_dir_xsection, sfile)

                        # 4D plot for gliders only
                        if 'MOAS' in r:
                            if ds_lat is not None and ds_lon is not None:
                                cf.create_dir(save_dir_4d)

                                clabel = sv + " (" + sv_units + ")"
                                zlabel = press[0] + " (" + y_units[0] + ")"

                                fig = plt.figure()
                                ax = fig.add_subplot(111, projection='3d')
                                sct = ax.scatter(lon_portal, lat_portal, y_portal, c=z_portal, s=2)
                                cbar = plt.colorbar(sct, label=clabel, extend='both')
                                cbar.ax.tick_params(labelsize=8)
                                ax.invert_zaxis()
                                ax.view_init(25, 32)
                                ax.invert_xaxis()
                                ax.invert_yaxis()
                                ax.set_zlabel(zlabel, fontsize=9)
                                ax.set_ylabel('Latitude', fontsize=9)
                                ax.set_xlabel('Longitude', fontsize=9)

                                ax.set_title(title, fontsize=9)
                                pf.save_fig(save_dir_4d, sfile)
def main(sDir, url_list, start_time, end_time, preferred_only):
    """Plot the science variables of every PRESF dataset reachable from ``url_list``.

    Only reference designators containing 'PRESF' are processed, and only
    files whose name contains 'PRESF' are kept. Variables with one dimension
    are plotted with ``pf.plot_timeseries`` (with and without the helper's
    outlier option); all others are plotted with ``pf.plot_presf_2d`` before
    and, when any value was removed, after global-range rejection.

    :param sDir: root directory under which the plot directories are created
    :param url_list: URLs whose second-to-last path component is
        dash-delimited; fields 1-4 form the reference designator
    :param start_time: start of the time slice passed to ``ds.sel``; the
        slice is applied only when both bounds are not None
    :param end_time: end of the time slice passed to ``ds.sel``
    :param preferred_only: the string 'yes' restricts the files to those
        matching ``cf.get_preferred_stream_info``
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'PRESF' in rd:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))

        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                # filter out collocated data files
                datasets.extend(ud for ud in cf.get_nc_urls([u]) if 'PRESF' in ud.split('/')[-1])

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        fdatasets = np.unique(fdatasets).tolist()
        for fd in fdatasets:
            # the context manager releases the file handle before the next file is opened
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                sci_vars = cf.return_science_vars(stream)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes, 'timeseries_plots', deployment)
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))

                for var in sci_vars:
                    print(var)
                    if var == 'id':
                        continue

                    y = ds[var]
                    fv = y._FillValue

                    if len(y.dims) == 1:
                        if np.isnan(y.values).all():
                            print('Array of all NaNs - skipping plot.')
                        elif not (y.values != fv).any():
                            print('Array of all fill values - skipping plot.')
                        else:
                            # reject fill values
                            ind = y.values != fv
                            t = tm[ind]
                            y = y[ind]

                            # Plot all data
                            fig, ax = pf.plot_timeseries(t, y, y.name, stdev=None)
                            ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                            sfile = '-'.join((filename, y.name, t0[:10]))
                            pf.save_fig(save_dir, sfile)

                            # Plot data with outliers removed
                            fig, ax = pf.plot_timeseries(t, y, y.name, stdev=5)
                            ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                            sfile = '-'.join((filename, y.name, t0[:10])) + '_rmoutliers'
                            pf.save_fig(save_dir, sfile)
                    else:
                        # Work on a float copy: NaN can then be assigned whatever the
                        # stored dtype is, and the dataset's own array is not modified.
                        v = y.values.T.astype(float)
                        n_nan = np.sum(np.isnan(v))

                        # convert fill values to nans
                        v[v == fv] = np.nan
                        n_fv = np.sum(np.isnan(v)) - n_nan

                        # plot before global ranges are removed
                        fig, ax = pf.plot_presf_2d(tm, v, y.name, y.units)
                        ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                        sfile = '-'.join((filename, var, t0[:10]))
                        pf.save_fig(save_dir, sfile)

                        # reject data outside of global ranges
                        [g_min, g_max] = cf.get_global_ranges(r, var)
                        if g_min is None or g_max is None:
                            # no global ranges: nothing further to remove or plot
                            continue

                        v[v < g_min] = np.nan
                        v[v > g_max] = np.nan
                        n_grange = np.sum(np.isnan(v)) - n_fv - n_nan

                        if n_grange > 0:
                            # don't plot if the array is all nans
                            if np.isnan(v).all():
                                continue

                            # plot after global ranges are removed
                            fig, ax = pf.plot_presf_2d(tm, v, y.name, y.units)
                            title2 = 'removed: {} global ranges [{}, {}]'.format(n_grange, g_min, g_max)
                            ax.set_title((title + '\n' + t0 + ' - ' + t1 + '\n' + title2), fontsize=9)
                            sfile = '-'.join((filename, var, t0[:10], 'rmgr'))
                            pf.save_fig(save_dir, sfile)
def main(sDir, url_list, start_time, end_time, preferred_only):
    """Plot each science variable for the whole deployment and then month by month.

    For each reference designator parsed from ``url_list`` the NetCDF files
    are listed, optionally restricted to the preferred streams, and every
    science variable is drawn with ``pf.plot_spkir``: for the full time range
    before and after global-range rejection, then once per calendar month
    after rejection (saved under a 'monthly' sub-directory).

    :param sDir: root directory under which the plot directories are created
    :param url_list: URLs whose second-to-last path component is
        dash-delimited; fields 1-4 form the reference designator
    :param start_time: start of the time slice passed to ``ds.sel``; the
        slice is applied only when both bounds are not None
    :param end_time: end of the time slice passed to ``ds.sel``
    :param preferred_only: the string 'yes' restricts the files to those
        matching ``cf.get_preferred_stream_info``
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))

        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.append(cf.get_nc_urls([u]))
        datasets = list(itertools.chain(*datasets))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        fdatasets = np.unique(fdatasets).tolist()
        for fd in fdatasets:
            # the context manager releases the file handle before the next file is opened
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                sci_vars = cf.return_science_vars(stream)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes, 'timeseries_plots')
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))

                # -------- plot entire deployment --------
                for var in sci_vars:
                    print(var)
                    vv = ds[var]
                    fv = vv._FillValue

                    # Mask a float copy (transposed), never the dataset's own array.
                    # Writing NaNs through vv.values would blank the data before the
                    # monthly pass below reads it, making its removal counts wrong.
                    # The copy also allows NaN assignment for integer-typed variables.
                    v = vv.values.T.astype(float)
                    n_nan = np.sum(np.isnan(v))

                    # convert fill values to nans
                    v[v == fv] = np.nan
                    n_fv = np.sum(np.isnan(v)) - n_nan

                    # plot before global ranges are removed
                    fig, ax = pf.plot_spkir(tm, v, vv.name, vv.units)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                    sfile = '-'.join((filename, var, t0[:10]))
                    pf.save_fig(save_dir, sfile)

                    # reject data outside of global ranges
                    [g_min, g_max] = cf.get_global_ranges(r, var)
                    if g_min is not None and g_max is not None:
                        v[v < g_min] = np.nan
                        v[v > g_max] = np.nan
                        n_grange = np.sum(np.isnan(v)) - n_fv - n_nan
                    else:
                        n_grange = 'no global ranges'

                    # plot after global ranges are removed
                    fig, ax = pf.plot_spkir(tm, v, vv.name, vv.units)
                    title2 = 'removed: {} global ranges [{}, {}]'.format(n_grange, g_min, g_max)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1 + '\n' + title2), fontsize=9)
                    sfile = '-'.join((filename, var, t0[:10], 'rmgr'))
                    pf.save_fig(save_dir, sfile)

                # -------- break the deployment into months and plot --------
                save_dir = os.path.join(sDir, array, subsite, refdes, 'timeseries_plots', 'monthly')
                cf.create_dir(save_dir)

                # create list of start and end dates
                dt_start = dt.datetime.strptime(t0, '%Y-%m-%dT%H:%M:%S')
                dt_end = dt.datetime.strptime(t1, '%Y-%m-%dT%H:%M:%S')
                start_dates = [dt_start.strftime('%m-%d-%YT00:00:00')]
                end_dates = []
                ts1 = dt_start
                while ts1 <= dt_end:
                    # step one day at a time; a month change closes one window and opens the next
                    ts2 = ts1 + dt.timedelta(days=1)
                    if ts2.month != ts1.month:
                        start_dates.append(ts2.strftime('%m-%d-%YT00:00:00'))
                        end_dates.append(ts1.strftime('%m-%d-%YT23:59:59'))
                    ts1 = ts2
                end_dates.append(dt_end.strftime('%m-%d-%YT23:59:59'))

                for sd, ed in zip(start_dates, end_dates):
                    sd_format = dt.datetime.strptime(sd, '%m-%d-%YT%H:%M:%S')
                    ed_format = dt.datetime.strptime(ed, '%m-%d-%YT%H:%M:%S')
                    ds_month = ds.sel(time=slice(sd_format, ed_format))
                    if len(ds_month['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(sd, ed))
                        continue

                    tm = ds_month['time'].values
                    t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')

                    for var in sci_vars:
                        print(var)
                        vv = ds_month[var]
                        fv = vv._FillValue
                        v = vv.values.T.astype(float)  # transposed float copy, safe to mask
                        n_nan = np.sum(np.isnan(v))

                        # convert fill values to nans
                        v[v == fv] = np.nan
                        n_fv = np.sum(np.isnan(v)) - n_nan

                        # reject data outside of global ranges
                        [g_min, g_max] = cf.get_global_ranges(r, var)
                        if g_min is not None and g_max is not None:
                            v[v < g_min] = np.nan
                            v[v > g_max] = np.nan
                            n_grange = np.sum(np.isnan(v)) - n_fv - n_nan
                        else:
                            n_grange = 'no global ranges'

                        # plot after global ranges are removed
                        fig, ax = pf.plot_spkir(tm, v, vv.name, vv.units)
                        title2 = 'removed: {} global ranges [{}, {}]'.format(n_grange, g_min, g_max)
                        ax.set_title((title + '\n' + t0 + ' - ' + t1 + '\n' + title2), fontsize=9)
                        sfile = '-'.join((filename, var, t0[:7], 'rmgr'))
                        pf.save_fig(save_dir, sfile)
def main(sDir, ncdir, start_time, end_time):
    """Plot timeseries of the 1-D variables in the NetCDF files found under ncdir.

    Every ``.nc`` file below ``ncdir`` is opened (files whose name contains
    ``_blank`` are skipped). For each plotted variable two figures are saved:
    one with all data and one where ``stdev=5`` is passed to
    ``pf.plot_timeseries`` (saved with the ``_rmoutliers`` suffix).

    :param sDir: root directory under which the plots are saved
    :param ncdir: directory that is searched recursively for ``.nc`` files; its
        second-to-last '/'-separated component is printed as the run label
    :param start_time: start of the time range to plot, or None
    :param end_time: end of the time range to plot, or None; the data are only
        subset when both start_time and end_time are given
    """
    rd_list = [ncdir.split('/')[-2]]

    for r in rd_list:
        print('\n{}'.format(r))

        # Keep the directory each file was found in: joining only the bare
        # file name with ncdir cannot open files located in sub-directories.
        datasets = []
        for root, _dirs, files in os.walk(ncdir):
            datasets.extend(os.path.join(root, f) for f in files if f.endswith('.nc'))

        for fd in datasets:
            if '_blank' in os.path.basename(fd):
                continue

            # The context manager closes the file on every exit path,
            # including the early `continue` for an empty time range.
            with xr.open_dataset(fd, mask_and_scale=False) as nc:
                ds = nc.swap_dims({'obs': 'time'})
                # data variables plus any pressure variable stored as a coordinate
                ds_vars = list(ds.data_vars.keys()) + [x for x in ds.coords.keys() if 'pressure' in x]

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)

                # The original test was `'VEL3D in refdes'` (one always-true
                # string), which made the raw-variable branch unreachable.
                if 'NUTNR' in refdes or 'VEL3D' in refdes:
                    plot_vars = cf.return_science_vars(stream)
                else:
                    plot_vars = cf.return_raw_vars(ds_vars)

                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes, 'timeseries_plots', deployment)
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))

                for var in plot_vars:
                    print(var)
                    if var in ('id', 'record_type', 'unique_id'):
                        continue

                    y = ds[var]
                    fv = getattr(y, '_FillValue', np.nan)
                    if len(y.dims) != 1:
                        continue

                    y[y == fv] = np.nan  # turn fill values to nans

                    # Fill values are NaN at this point, so rejecting them means
                    # rejecting NaNs (comparing against fv again would keep everything).
                    keep = ~np.isnan(y.values)
                    if not keep.any():
                        print('Array of all NaNs and/or fill values - skipping plot.')
                        continue

                    t = tm[keep]
                    y = y[keep]

                    # Plot all data
                    fig, ax = pf.plot_timeseries(t, y, y.name, stdev=None)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                    sfile = '-'.join((filename, y.name, t0[:10]))
                    pf.save_fig(save_dir, sfile)

                    # Plot data with outliers removed
                    fig, ax = pf.plot_timeseries(t, y, y.name, stdev=5)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                    sfile = '-'.join((filename, y.name, t0[:10])) + '_rmoutliers'
                    pf.save_fig(save_dir, sfile)
def _open_method_files(files):
    """Open one file, or several files as a single dataset, without further processing."""
    if len(files) == 1:
        return xr.open_dataset(files[0])
    return xr.open_mfdataset(files)


def _index_by_time(raw, n_files, start_time, end_time):
    """Swap obs for time, chunk multi-file datasets and apply the optional time range."""
    ds = raw.swap_dims({'obs': 'time'})
    if n_files != 1:
        ds = ds.chunk({'time': 100})
    if start_time is not None and end_time is not None:
        ds = ds.sel(time=slice(start_time, end_time))
    return ds


def _has_values(data_array):
    """Return True when the variable holds at least one value that is not NaN."""
    # .values materialises dask-backed data, on which len() of a boolean
    # selection is not reliable.
    return bool((~np.isnan(data_array.values)).any())


def _plot_method_pair(d, r, name0, f0, name1, f1, start_time, end_time, sDir, strm):
    """Plot every variable that two delivery-method/stream columns share by long name."""
    compare = '{} {}'.format(name0, name1)
    ranged = start_time is not None and end_time is not None

    # Both files are closed when the comparison ends, including the early returns.
    with _open_method_files(f0) as raw0:
        ds0 = _index_by_time(raw0, len(f0), start_time, end_time)
        splt0 = str(name0).split('-')
        ds0_sci_vars = cf.return_science_vars(splt0[1])
        ds0_method = splt0[0]
        if ranged and len(ds0['time'].values) == 0:
            print('No {} data to plot for specified time range: ({} to {})'.format(
                ds0_method, start_time, end_time))
            return

        with _open_method_files(f1) as raw1:
            ds1 = _index_by_time(raw1, len(f1), start_time, end_time)
            splt1 = str(name1).split('-')
            ds1_sci_vars = cf.return_science_vars(splt1[1])
            ds1_method = splt1[0]
            if ranged and len(ds1['time'].values) == 0:
                print('No {} data to plot for specified time range: ({} to {})'.format(
                    ds1_method, start_time, end_time))
                return

            t0 = ds0['time']
            t1 = ds1['time']

            # find where the variable long names are the same
            ds0names = long_names(ds0, ds0_sci_vars)
            ds0names.rename(columns={'name': 'name_ds0'}, inplace=True)
            ds1names = long_names(ds1, ds1_sci_vars)
            ds1names.rename(columns={'name': 'name_ds1'}, inplace=True)
            mapping = pd.merge(ds0names, ds1names, on='long_name', how='inner')
            print('----------------------')
            print('{}: {}'.format(d, compare))
            print('----------------------')

            subsite = r.split('-')[0]
            array = subsite[0:2]
            if ranged:
                stime = start_time.strftime('%Y-%m-%d')
                etime = end_time.strftime('%Y-%m-%d')
                ext = '-'.join((d, compare)) + '-' + stime + 'to' + etime
                save_dir = os.path.join(sDir, array, subsite, r, 'method_compare_plots', ext)
            else:
                save_dir = os.path.join(sDir, array, subsite, r, 'method_compare_plots',
                                        '-'.join((ds0_method, ds1_method)))
            cf.create_dir(save_dir)

            title = ' '.join((d, r, '{} vs {}'.format(ds0_method, ds1_method)))

            for rr in mapping.itertuples():
                index, name_ds0, long_name, name_ds1 = rr
                print(long_name)

                ds0_var = ds0[name_ds0]
                ds1_var = ds1[name_ds1]

                # only plot if both arrays have data
                if not (_has_values(ds0_var) and _has_values(ds1_var)):
                    continue

                name_parts = (d, r, long_name, strm) if strm else (d, r, long_name)

                # Plot all data
                fig, ax = pf.plot_timeseries_compare(t0, t1, ds0_var, ds1_var, ds0_method, ds1_method,
                                                     long_name, stdev=None)
                ax.set_title(title, fontsize=9)
                pf.save_fig(save_dir, '_'.join(name_parts))

                # Plot data with outliers removed
                fig, ax = pf.plot_timeseries_compare(t0, t1, ds0_var, ds1_var, ds0_method, ds1_method,
                                                     long_name, stdev=5)
                ax.set_title(title, fontsize=9)
                pf.save_fig(save_dir, '_'.join(name_parts + ('rmoutliers',)))


def compare_plot_datasets(df, r, start_time, end_time, sDir, strm=None):
    """Plot variables shared between every pair of columns of df, row by row.

    Each cell of ``df`` is either a list of dataset files or a float (treated as
    "no files"). For every row, each column is compared with all columns to its
    left; variables are matched on their long name and plotted twice (all data,
    and with ``stdev=5``) via ``pf.plot_timeseries_compare``.

    :param df: DataFrame whose index labels the rows (used in titles and file
        names) and whose column names look like '<method>-<stream>'
    :param r: reference designator, used to build the save directory
    :param start_time: start of the time range to plot, or None
    :param end_time: end of the time range to plot, or None; the data are only
        subset when both start_time and end_time are given
    :param sDir: root directory under which the plots are saved
    :param strm: optional string appended to the saved file names
    """
    names = df.columns
    for d, row in df.iterrows():
        print('\n{}'.format(d))
        for i, n in enumerate(names):
            if i == 0:
                continue  # the first column has nothing to its left to compare with
            f1 = row[n]
            if not isinstance(f1, list):
                continue
            for x in range(i):
                f0 = row[names[x]]
                if isinstance(f0, list):
                    _plot_method_pair(d, r, names[x], f0, n, f1, start_time, end_time, sDir, strm)
def _append_adcp_values(ds, sci_vars_dict, percentgood):
    """Append the values and metadata of one dataset to both accumulators, in place."""
    for s_v, info in sci_vars_dict.items():
        print(s_v)
        vv = ds[s_v]
        try:
            if vv.units not in info['units']:
                info['units'].append(vv.units)
        except AttributeError:
            print('no units')

        try:
            if vv._FillValue not in info['fv']:
                info['fv'].append(vv._FillValue)
        except AttributeError:
            print('no fill value')

        try:
            if vv.long_name not in info['ln']:
                info['ln'].append(vv.long_name)
        except AttributeError:
            print('no long name')

        if len(vv.dims) == 1:
            info['values'] = np.append(info['values'], vv.values)
        elif len(info['values']) == 0:
            info['values'] = vv.values.T
        else:
            info['values'] = np.concatenate((info['values'], vv.values.T), axis=1)

    # append percent good beams
    for j, k in percentgood.items():
        pgvv = ds[j]
        fv_pgvv = pgvv._FillValue
        pgvv = pgvv.values.T.astype(float)
        pgvv[pgvv == fv_pgvv] = np.nan
        if len(k['values']) == 0:
            k['values'] = pgvv
        else:
            k['values'] = np.concatenate((k['values'], pgvv), axis=1)


def main(sDir, url_list, start_time, end_time, deployment_num, interval):
    """Plot the science variables of each deployment, one time interval at a time.

    The data of every deployment are read interval by interval, appended to
    in-memory dictionaries, and plotted: 1-D variables with
    ``pf.plot_timeseries`` and 2-D variables against bin depth with
    ``pf.plot_adcp``, each once with all data and once with 5 standard
    deviations passed to the plotting function.

    :param sDir: root directory under which the plots are saved
    :param url_list: list of dataset catalog URLs
    :param start_time: start of the time range to plot, or None
    :param end_time: end of the time range to plot, or None; when either is
        None the deployment dates from the data review database are split
        into 4 segments instead
    :param deployment_num: deployment number to plot, or None for all
    :param interval: 1-based number of the single interval to plot, or None to
        plot every interval
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        deployments = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.extend(udatasets)
                for ud in udatasets:
                    deploy_name = ud.split('/')[-1].split('_')[0]
                    if deploy_name not in deployments:
                        deployments.append(deploy_name)
        datasets = cf.filter_collocated_instruments(r, datasets)
        deployments.sort()

        fdatasets = np.unique(datasets).tolist()
        for deploy in deployments:
            # Compare by value: `is not` tests object identity, which is not a
            # reliable equality test for integers.
            if deployment_num is not None and int(deploy[-4:]) != deployment_num:
                print('\nskipping {}'.format(deploy))
                continue

            rdatasets = [s for s in fdatasets if deploy in s]

            # break deployment into 4 segments or make a list of the time range specified
            if start_time is not None and end_time is not None:
                dt_range = [dt.datetime.strftime(start_time, '%Y-%m-%d'),
                            dt.datetime.strftime(end_time, '%Y-%m-%d')]
            else:
                # Get deployment info from the data review database
                dr_data = cf.refdes_datareview_json(r)
                d_info = [x for x in dr_data['instrument']['deployments']
                          if x['deployment_number'] == int(deploy[-4:])]
                d_info = d_info[0]
                deploy_start = dt.datetime.strptime(str(d_info['start_date']).split('T')[0], '%Y-%m-%d')
                deploy_stop = dt.datetime.strptime(str(d_info['stop_date']).split('T')[0], '%Y-%m-%d') \
                    + dt.timedelta(days=1)
                dt_range = list(date_range(deploy_start, deploy_stop, 4))

            # NOTE(review): these accumulators are created once per deployment, so
            # each interval's plots also contain the data of the intervals read
            # before it - confirm whether they should be reset per interval.
            sci_vars_dict = {'time': dict(values=np.array([], dtype=np.datetime64), fv=[], ln=[]),
                             'bin_depths': dict(values=np.array([]), units=[], fv=[], ln=[])}
            percentgood = {'percent_good_beam1': dict(values=np.array([])),
                           'percent_good_beam2': dict(values=np.array([])),
                           'percent_good_beam3': dict(values=np.array([])),
                           'percent_good_beam4': dict(values=np.array([]))}

            if interval is None:
                toplot = range(len(dt_range) - 1)
            else:
                toplot = [interval - 1]

            # File attributes and the science variable list are loaded from the
            # first non-empty file of *this* deployment. The original relied on a
            # NameError from an unset `fname`, which only fired for the first
            # deployment and left later ones with stale metadata.
            attrs_loaded = False

            for dtri in toplot:
                stime = dt.datetime.strptime(dt_range[dtri], '%Y-%m-%d')
                etime = dt.datetime.strptime(dt_range[dtri + 1], '%Y-%m-%d')

                for i, dataset in enumerate(rdatasets):
                    # closes the file on every exit path, including `continue`
                    with xr.open_dataset(dataset, mask_and_scale=False) as nc:
                        ds = nc.swap_dims({'obs': 'time'})
                        print('\nAppending data from {}: file {} of {}'.format(deploy, i + 1, len(rdatasets)))

                        ds = ds.sel(time=slice(stime, etime))
                        if len(ds['time'].values) == 0:
                            # report the interval that was actually selected
                            print('No data to plot for specified time range: ({} to {})'.format(stime, etime))
                            continue

                        if not attrs_loaded:
                            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(rdatasets[0])
                            array = subsite[0:2]
                            sci_vars = cf.return_science_vars(stream)
                            # drop the following list of key words from science variables list
                            sci_vars = notin_list(sci_vars, ['salinity', 'temperature', 'bin_depths', 'beam'])
                            sci_vars = [name for name in sci_vars if ds[name].units != 'mm s-1']

                            for sci_var in sci_vars:
                                sci_vars_dict.update({sci_var: dict(values=np.array([]), units=[], fv=[], ln=[])})
                            attrs_loaded = True

                        # append data for the deployment into a dictionary
                        _append_adcp_values(ds, sci_vars_dict, percentgood)

                if len(sci_vars_dict['time']['values']) == 0:
                    continue

                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes, 'plots', deployment)
                cf.create_dir(save_dir)

                tm = sci_vars_dict['time']['values']
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title_text = ' '.join((deployment, refdes, method))

                bd = sci_vars_dict['bin_depths']
                ylabel = 'bin_depths ({})'.format(bd['units'][0])

                print('\nPlotting interval {}'.format(int(dtri) + 1))
                for var in sci_vars:
                    print('----{}'.format(var))
                    v = sci_vars_dict[var]
                    fv = v['fv'][0]
                    v_name = v['ln'][0]
                    units = v['units'][0]

                    if len(np.shape(v['values'])) == 1:
                        v, n_nan, n_fv, n_ev, n_grange, g_min, g_max, n_std = reject_err_data_1_dims(
                            v['values'], fv, r, var, n=5)

                        if len(tm) > np.sum(np.isnan(v)):  # only plot if the array contains values
                            # Plot all data
                            fig, ax = pf.plot_timeseries(tm, v, v_name, stdev=None)
                            ax.set_title((title_text + '\n' + t0 + ' - ' + t1), fontsize=9)
                            sfile = '-'.join((filename, v_name, t0[:10]))
                            pf.save_fig(save_dir, sfile)

                            # Plot data with outliers removed
                            fig, ax = pf.plot_timeseries(tm, v, v_name, stdev=5)
                            title_i = 'removed: {} nans, {} fill values, {} extreme values, {} GR [{}, {}],' \
                                      ' {} outliers +/- 5 SD'.format(n_nan, n_fv, n_ev, n_grange, g_min, g_max, n_std)

                            ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + title_i), fontsize=8)
                            sfile = '-'.join((filename, v_name, t0[:10])) + '_rmoutliers'
                            pf.save_fig(save_dir, sfile)
                        else:
                            print('Array of all nans - skipping plot')
                    else:
                        v, n_nan, n_fv, n_ev, n_bb, n_grange, g_min, g_max = reject_err_data_2_dims(
                            v['values'], percentgood, fv, r, var)

                        clabel = '{} ({})'.format(var, units)

                        # check bin depths for extreme values
                        y = bd['values']
                        # if all the values are negative, take the absolute value
                        # (cabled data bin depths are negative)
                        if int(np.nanmin(y)) < 0 and int(np.nanmax(y)) < 0:
                            y = abs(y)
                        y_nan = np.sum(np.isnan(y))
                        y = np.where(y < 6000, y, np.nan)  # replace extreme bin_depths by nans
                        bin_nan = np.sum(np.isnan(y)) - y_nan
                        bin_title = 'removed: {} bin depths > 6000'.format(bin_nan)

                        if 'echo' in var:
                            color = 'BuGn'
                        else:
                            color = 'RdBu'

                        new_y = dropna(y, axis=1)  # convert to DataFrame to drop nan
                        y_mask = new_y.loc[list(new_y.index), list(new_y.columns)]
                        v_new = pd.DataFrame(v)
                        v_mask = v_new.loc[list(new_y.index), list(new_y.columns)]
                        tm_mask = tm[new_y.columns]

                        fig, ax, __ = pf.plot_adcp(tm_mask, np.array(y_mask), np.array(v_mask), ylabel, clabel,
                                                   color, n_stdev=None)

                        if bin_nan > 0:
                            ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + bin_title), fontsize=8)
                        else:
                            ax.set_title((title_text + '\n' + t0 + ' - ' + t1), fontsize=8)

                        sfile = '-'.join((filename, var, t0[:10]))
                        pf.save_fig(save_dir, sfile)

                        fig, ax, n_nans_all = pf.plot_adcp(tm_mask, np.array(y_mask), np.array(v_mask), ylabel,
                                                           clabel, color, n_stdev=5)
                        title_i = 'removed: {} nans, {} fill values, {} extreme values, {} bad beams, ' \
                                  '{} GR [{}, {}]'.format(n_nan, n_fv, n_ev, n_bb, n_grange, g_min, g_max)

                        if bin_nan > 0:
                            ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + title_i + '\n' + bin_title),
                                         fontsize=8)
                        else:
                            ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + title_i), fontsize=8)

                        sfile = '-'.join((filename, var, t0[:10])) + '_rmoutliers'
                        pf.save_fig(save_dir, sfile)
def main(sDir, url_list, start_time, end_time, preferred_only):
    """Plot timeseries of the 1-D variables of every dataset in url_list.

    Datasets are grouped by reference designator, optionally restricted to the
    preferred stream of each deployment, and filtered for collocated
    instruments. For each plotted variable two figures are saved: one with all
    data and one where ``stdev=5`` is passed to ``pf.plot_timeseries`` (saved
    with the ``_rmoutliers`` suffix).

    :param sDir: root directory under which the plots are saved
    :param url_list: list of dataset catalog URLs
    :param start_time: start of the time range to plot, or None
    :param end_time: end of the time range to plot, or None; the data are only
        subset when both start_time and end_time are given
    :param preferred_only: 'yes' to plot only the preferred stream of each
        deployment; any other value plots every dataset
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # the cell does not hold a string, so there is nothing to match
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        fdatasets = np.unique(fdatasets).tolist()
        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for fd in fdatasets:
            if '_blank' in fd:
                continue

            # The context manager closes the file on every exit path,
            # including the early `continue` for an empty time range.
            with xr.open_dataset(fd, mask_and_scale=False) as nc:
                ds = nc.swap_dims({'obs': 'time'})
                # data variables plus any pressure variable stored as a coordinate
                ds_vars = list(ds.data_vars.keys()) + [x for x in ds.coords.keys() if 'pressure' in x]

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                if 'NUTNR' in refdes:
                    plot_vars = cf.return_science_vars(stream)
                else:
                    plot_vars = cf.return_raw_vars(ds_vars)

                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes, 'timeseries_plots', deployment)
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))

                for var in plot_vars:
                    print(var)
                    if var == 'id':
                        continue

                    y = ds[var]
                    fv = getattr(y, '_FillValue', np.nan)
                    if len(y.dims) != 1:
                        continue

                    # Check if the array is all NaNs
                    if sum(np.isnan(y.values)) == len(y.values):
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    if len(y[y != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    # reject fill values
                    ind = y.values != fv
                    t = tm[ind]
                    y = y[ind]

                    # Plot all data
                    fig, ax = pf.plot_timeseries(t, y, y.name, stdev=None)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                    sfile = '-'.join((filename, y.name, t0[:10]))
                    pf.save_fig(save_dir, sfile)

                    # Plot data with outliers removed
                    fig, ax = pf.plot_timeseries(t, y, y.name, stdev=5)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                    sfile = '-'.join((filename, y.name, t0[:10])) + '_rmoutliers'
                    pf.save_fig(save_dir, sfile)
def main(sDir, f, start_time, end_time):
    """Create cross-section plots for the datasets listed in a CSV file.

    The CSV file ``f`` (located in ``sDir``) must have an 'outputUrl' column.
    Each science variable of each dataset is plotted against pressure twice
    with ``pf.plot_xsection``: once with ``stdev=None`` and once with
    ``stdev=5`` (saved with the ``rmoutliers`` suffix).

    :param sDir: directory that holds the CSV file and receives the plots
    :param f: name of the CSV file
    :param start_time: start of the time range to plot, or None
    :param end_time: end of the time range to plot, or None; the data are only
        subset when both start_time and end_time are given
    """
    url_table = pd.read_csv(os.path.join(sDir, f))
    url_list = url_table['outputUrl'].tolist()
    n_urls = len(url_list)
    subset_requested = start_time is not None and end_time is not None
    time_format = '%Y-%m-%dT%H:%M:%S'

    for url_number, u in enumerate(url_list, start=1):
        print('\nUrl {} of {}: {}'.format(url_number, n_urls, u))
        main_sensor = u.split('/')[-2].split('-')[4]
        datasets_sel = cf.filter_collocated_instruments(main_sensor, cf.get_nc_urls([u]))
        n_datasets = len(datasets_sel)

        for dataset_number, d in enumerate(datasets_sel, start=1):
            print('\nDataset {} of {}: {}'.format(dataset_number, n_datasets, d))
            with xr.open_dataset(d, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if subset_requested:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(d)
                data_var_names = ds.data_vars.keys()

                # for glider CTDs, pressure is a coordinate
                is_glider_ctd = 'MOAS' in subsite and 'CTD' in main_sensor
                pressure = 'sci_water_pressure_dbar' if is_glider_ctd else pf.pressure_var(ds, data_var_names)

                # remove pressure from the science variables
                sci_vars = [name for name in cf.return_science_vars(stream) if name != pressure]

                save_dir = os.path.join(sDir, subsite, refdes, 'xsection_plots', deployment)
                cf.create_dir(save_dir)

                t = ds['time'].values
                t0 = pd.to_datetime(t.min()).strftime(time_format)
                t1 = pd.to_datetime(t.max()).strftime(time_format)
                full_title = ' '.join((deployment, refdes, method)) + '\n' + t0 + ' - ' + t1
                file_prefix = fname[0:-46]

                y = ds[pressure]

                print('Plotting variables...')
                for var in sci_vars:
                    print(var)
                    z = ds[var]
                    clabel = ''.join((var, " (", z.units, ")"))
                    ylabel = ''.join((pressure, " (", y.units, ")"))

                    # first all data, then data with outliers removed
                    for stdev, name_parts in ((None, (file_prefix, z.name)),
                                              (5, (file_prefix, z.name, 'rmoutliers'))):
                        fig, ax = pf.plot_xsection(subsite, t, y, z, clabel, ylabel, stdev=stdev)
                        ax.set_title(full_title, fontsize=9)
                        pf.save_fig(save_dir, '_'.join(name_parts))
def main(url_list, sDir, plot_type, deployment_num, start_time, end_time, preferred_only, glider, zdbar, n_std,
         inpercentile, zcell_size):
    """Create cross-section plots after removing erroneous and suspect data.

    For every science variable (except those containing 'sci_water_pressure')
    of every selected dataset two figures may be saved:
    ``rm_erroneous_data_*`` after ``cf.reject_erroneous_data`` and removal of
    zeros, and ``rm_suspect_data_*`` after additionally removing percentile
    outliers, data-portal exclusions and, when ``zdbar`` is set, values that
    are not below ``zdbar``. The second figure is only drawn when those steps
    removed something.

    :param url_list: list of dataset catalog URLs
    :param sDir: root directory under which the plots are saved
    :param plot_type: name of the sub-directory the plots are saved in
    :param deployment_num: deployment number to plot, or None for all
    :param start_time: start of the time range to plot, or None
    :param end_time: end of the time range to plot, or None; the data are only
        subset when both start_time and end_time are given
    :param preferred_only: 'yes' to plot only the preferred stream of each
        deployment; any other value plots every dataset
    :param glider: 'no' to plot without the engineering depth series; 'yes' to
        additionally crop that series to the time range of the cleaned data
    :param zdbar: threshold in dbar; only data below it are kept in the second
        plot (a falsy value disables the filter)
    :param n_std: passed to cf.reject_timestamps_in_groups
    :param inpercentile: percentile passed to cf.reject_timestamps_in_groups
    :param zcell_size: size, in dbar, of the depth groups used for the
        percentile analysis
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'ENG' not in rd:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # the cell does not hold a string, so there is nothing to match
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for fd in fdatasets_sel:
            part_d = fd.split('/')[-1]
            print(part_d)
            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)

            # Skip unwanted deployments before any file is opened. The number is
            # read from the last four characters: splitting on '0' breaks for
            # deployments such as 0010 or 0102, and `is not` tests identity
            # rather than equality.
            if deployment_num is not None and int(deployment[-4:]) != deployment_num:
                print('\nskipping {}'.format(deployment))
                continue

            array = subsite[0:2]
            sci_vars = cf.return_science_vars(stream)

            # Reset for every file so that a missing engineering file can never
            # reuse the series of a previous dataset (or raise a NameError).
            t_eng = None
            m_water_depth = None
            if 'CE05MOAS' in r or 'CP05MOAS' in r:  # for coastal gliders, get m_water_depth for bathymetry
                eng = '-'.join((r.split('-')[0], r.split('-')[1], '00-ENG000000', method, 'glider_eng'))
                eng_url = [s for s in url_list if eng in s]
                if len(eng_url) == 1:
                    eng_datasets = cf.get_nc_urls(eng_url)
                    # filter out collocated datasets
                    eng_dataset = [j for j in eng_datasets
                                   if (eng in j.split('/')[-1] and deployment in j.split('/')[-1])]
                    if len(eng_dataset) > 0:
                        with xr.open_dataset(eng_dataset[0], mask_and_scale=False) as ds_eng:
                            t_eng = ds_eng['time'].values
                            m_water_depth = ds_eng['m_water_depth'].values
                            # m_altimeter_status = 0 means a good reading (not nan or -1)
                            eng_ind = ds_eng['m_altimeter_status'].values == 0
                        m_water_depth = m_water_depth[eng_ind]
                        t_eng = t_eng[eng_ind]
                    else:
                        print('No engineering file for deployment {}'.format(deployment))

            # The context manager closes the file on every exit path.
            with xr.open_dataset(fd, mask_and_scale=False) as nc:
                ds = nc.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue
                    stime = start_time.strftime('%Y-%m-%d')
                    etime = end_time.strftime('%Y-%m-%d')
                    ext = stime + 'to' + etime
                    save_dir = os.path.join(sDir, array, subsite, refdes, plot_type, deployment, ext)
                else:
                    save_dir = os.path.join(sDir, array, subsite, refdes, plot_type, deployment)

                cf.create_dir(save_dir)

                tm = ds['time'].values

                # get pressure variable
                y, y_units, press = cf.add_pressure_to_dictionary_of_sci_vars(ds)
                print(y_units, press)

                for sv in sci_vars:
                    print(sv)
                    if 'sci_water_pressure' in sv:
                        continue

                    z = ds[sv].values
                    fv = ds[sv]._FillValue
                    z_units = ds[sv].units

                    # Check if the array is all NaNs
                    if np.isnan(z).all():
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    if len(z[z != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    """
                    clean up data
                    """
                    # reject erroneous data
                    dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max = \
                        cf.reject_erroneous_data(r, sv, tm, y, z, fv)

                    # get rid of 0.0 data
                    if 'CTD' in r:
                        ind = zpressure > 0.0
                    else:
                        ind = ndata > 0.0

                    lenzero = np.sum(~ind)
                    dtime = dtime[ind]
                    zpressure = zpressure[ind]
                    ndata = ndata[ind]

                    # creating data groups
                    columns = ['tsec', 'dbar', str(sv)]
                    min_r = int(round(min(zpressure) - zcell_size))
                    max_r = int(round(max(zpressure) + zcell_size))
                    ranges = list(range(min_r, max_r, zcell_size))

                    groups, d_groups = gt.group_by_depth_range(dtime, zpressure, ndata, columns, ranges)

                    # rejecting timestamps from percentile analysis
                    y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr, time_ex = cf.reject_timestamps_in_groups(
                        groups, d_groups, n_std, inpercentile)

                    t_nospct, z_nospct, y_nospct = cf.reject_suspect_data(dtime, zpressure, ndata, time_ex)

                    print('removed {} data points using {} percentile of data grouped in {} dbar segments'.format(
                        len(zpressure) - len(z_nospct), inpercentile, zcell_size))

                    # reject time range from data portal file export
                    t_portal, z_portal, y_portal = cf.reject_timestamps_dataportal(subsite, r,
                                                                                   t_nospct, y_nospct, z_nospct)
                    print('removed {} data points using visual inspection of data'.format(
                        len(z_nospct) - len(z_portal)))

                    # reject data in a depth range
                    if zdbar:
                        y_ind = y_portal < zdbar
                        n_zdbar = np.sum(~y_ind)
                        t_array = t_portal[y_ind]
                        y_array = y_portal[y_ind]
                        z_array = z_portal[y_ind]
                    else:
                        n_zdbar = 0
                        t_array = t_portal
                        y_array = y_portal
                        z_array = z_portal
                    print('{} in water depth > {} dbar'.format(n_zdbar, zdbar))

                    """
                    Plot data
                    """
                    if len(dtime) == 0:
                        continue

                    sname = '-'.join((r, method, sv))
                    clabel = sv + " (" + z_units + ")"
                    ylabel = press[0] + " (" + y_units[0] + ")"

                    # Work on per-variable copies of the engineering series so the
                    # cropping below does not shrink it for the next variable.
                    if glider == 'no':
                        bathy_t = None
                        bathy_depth = None
                    else:
                        bathy_t = t_eng
                        bathy_depth = m_water_depth

                    removed_text = (
                        'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}], '
                        '{} zeros'.format(lenfv, lennan, lenev, lengr, global_min, global_max, lenzero))

                    # plot non-erroneous data
                    fig, ax, bar = pf.plot_xsection(subsite, dtime, zpressure, ndata, clabel, ylabel,
                                                    bathy_t, bathy_depth, inpercentile, stdev=None)

                    t0 = pd.to_datetime(dtime.min()).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(dtime.max()).strftime('%Y-%m-%dT%H:%M:%S')
                    title = ' '.join((deployment, refdes, method)) + '\n' + t0 + ' to ' + t1

                    ax.set_title(title, fontsize=9)
                    leg_text = (removed_text,)
                    ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                    fig.tight_layout()
                    sfile = '_'.join(('rm_erroneous_data', sname))
                    pf.save_fig(save_dir, sfile)

                    # plots removing all suspect data - only when something was removed
                    if len(t_array) == 0 or len(t_array) == len(dtime):
                        continue

                    # plot bathymetry only within data time ranges
                    if glider == 'yes' and bathy_t is not None:
                        eng_ind = (bathy_t >= np.min(t_array)) & (bathy_t <= np.max(t_array))
                        bathy_t = bathy_t[eng_ind]
                        bathy_depth = bathy_depth[eng_ind]

                    fig, ax, bar = pf.plot_xsection(subsite, t_array, y_array, z_array, clabel, ylabel,
                                                    bathy_t, bathy_depth, inpercentile, stdev=None)

                    t0 = pd.to_datetime(t_array.min()).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(t_array.max()).strftime('%Y-%m-%dT%H:%M:%S')
                    title = ' '.join((deployment, refdes, method)) + '\n' + t0 + ' to ' + t1

                    ax.set_title(title, fontsize=9)
                    suspect_text = (
                        removed_text
                        + '\nremoved {} in the upper and lower {}th percentile of data grouped in {} dbar segments'.format(
                            len(zpressure) - len(z_nospct), inpercentile, zcell_size)
                        + '\nexcluded {} suspect data points when inspected visually'.format(
                            len(z_nospct) - len(z_portal)))
                    if zdbar:
                        suspect_text += '\nexcluded {} suspect data in water depth greater than {} dbar'.format(
                            n_zdbar, zdbar)
                    leg_text = (suspect_text,)
                    ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                    fig.tight_layout()

                    sfile = '_'.join(('rm_suspect_data', sname))
                    pf.save_fig(save_dir, sfile)
def _stacked_axes(num_plots):
    """Create a figure with num_plots vertically stacked axes that share the y-axis."""
    # squeeze=False always returns a 2-D array of axes, so indexing by row
    # also works when num_plots is 1.
    fig, axes = pyplot.subplots(nrows=num_plots, ncols=1, sharey=True, squeeze=False)
    fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    return fig, axes[:, 0]


def _cleaned_variable(ds, r, var_name):
    """Return (cleaned values, units, long name, error count) for one variable of ds."""
    da = ds[var_name]
    units = da.units
    long_name = da.long_name
    values, err_count = reject_err_data_1_dims(da.values, da._FillValue, r, var_name, n=5)
    return values, units, long_name, err_count


def plot_velocity_variables(r, fdatasets, num_plots, save_dir):
    """Plot currents, velocity components, pressure, roll and pitch, one panel per file.

    Five figures are created, each with ``num_plots`` stacked panels. Panel
    ``ii`` shows the data of ``fdatasets[ii]``; samples where pitch or roll
    exceeds 200 (decidegrees) are highlighted in red and their percentage is
    used as the label of the red series. The figures are saved in ``save_dir``
    under names made of a fixed prefix followed by the deployment numbers.

    :param r: reference designator, passed on to the data-rejection and
        axis-formatting helpers
    :param fdatasets: list of dataset files, one per panel
    :param num_plots: number of panels of each figure; panels beyond
        ``len(fdatasets)`` are switched off
    :param save_dir: directory the figures are saved in
    """
    fig, ax = _stacked_axes(num_plots)
    fig_file = 'calculated_currents_plot'

    fig0, ax0 = _stacked_axes(num_plots)
    fig0_file = 'uvw_plots'

    fig1, ax1 = _stacked_axes(num_plots)
    fig1_file = 'pressure_plots'

    fig2, ax2 = _stacked_axes(num_plots)
    fig2_file = 'roll_plots'

    fig3, ax3 = _stacked_axes(num_plots)
    fig3_file = 'pitch_plots'

    # Switch off the panels that no dataset will fill (done once, not per file).
    if fdatasets:
        for jj in range(len(fdatasets), num_plots):
            for axes in (ax, ax0, ax1, ax2, ax3):
                axes[jj].axis('off')

    n_files = len(fdatasets)
    for ii, dataset in enumerate(fdatasets):
        print('\n', dataset.split('/')[-1])
        deployment = int(dataset.split('/')[-1].split('_')[0].split('deployment')[-1])

        # All values are read inside the context manager, which closes the file.
        with xr.open_dataset(dataset, mask_and_scale=False) as ds:
            tm = ds['time'].values

            '''
            science variables
            '''
            sci_var = cf.return_science_vars(ds.stream)

            # The variable name is passed to the rejection helper, as is done for
            # roll and pitch below. The original passed the first character of
            # the long name instead.
            z_var = [name for name in sci_var if 'pressure' in name]
            z, z_unit, z_name, err_count_z = _cleaned_variable(ds, r, z_var[0])

            w_var = [name for name in sci_var if 'upward_velocity' in name]
            w, _w_unit, _w_name, _err_count_w = _cleaned_variable(ds, r, w_var[0])

            u_var = [name for name in sci_var if 'eastward_velocity' in name]
            u, u_unit, _u_name, err_count_u = _cleaned_variable(ds, r, u_var[0])

            v_var = [name for name in sci_var if 'northward_velocity' in name]
            v, _v_unit, _v_name, _err_count_v = _cleaned_variable(ds, r, v_var[0])

            '''
            non science variables
            According to VELPT manufacturer, data are suspect when this instrument is tilted more than 20 degrees
            redmine ticket: Marine Hardware #12960
            '''
            roll, roll_unit, roll_name, err_count_r = _cleaned_variable(ds, r, 'roll_decidegree')
            pitch, pitch_units, pitch_name, err_count_p = _cleaned_variable(ds, r, 'pitch_decidegree')

        uv_magnitude = np.sqrt(u ** 2 + v ** 2)
        # nanmax: the builtin max() gives order-dependent results when NaNs are present
        uv_maxmag = np.nanmax(uv_magnitude)

        tilt_ind = np.logical_or(pitch > 200, roll > 200)

        '''
        Plot pressure
        '''
        z_fit = z[tilt_ind]
        percent_good = ((len(z) - len(z_fit)) / len(u)) * 100
        ax1[ii].plot(tm, z, color='b', linestyle='--', linewidth=.6)
        ax1[ii].plot(tm[tilt_ind], z_fit, color='r', linestyle='None', marker='.', markersize=0.5,
                     label=str(round(100 - percent_good, 3)) + '%')
        prepare_axis(r, tm, deployment, ax1[ii], ii, n_files, z_name, z_unit, err_count_z)
        fig1_file = fig1_file + str(deployment)

        '''
        plot roll
        '''
        roll_fit = roll[tilt_ind]
        percent_good = ((len(roll) - len(roll_fit)) / len(u)) * 100
        ax2[ii].plot(tm, roll, color='b', linestyle='--', linewidth=.6)
        ax2[ii].plot(tm[tilt_ind], roll_fit, color='r', linestyle='None', marker='.', markersize=0.5,
                     label=str(round(100 - percent_good, 3)) + '%')
        prepare_axis(r, tm, deployment, ax2[ii], ii, n_files, roll_name, roll_unit, err_count_r)
        fig2_file = fig2_file + str(deployment)

        '''
        plot pitch
        '''
        pitch_fit = pitch[tilt_ind]
        percent_good = ((len(pitch) - len(pitch_fit)) / len(u)) * 100
        ax3[ii].plot(tm, pitch, color='b', linestyle='--', linewidth=.6)
        ax3[ii].plot(tm[tilt_ind], pitch_fit, color='r', linestyle='None', marker='.', markersize=0.5,
                     label=str(round(100 - percent_good, 3)) + '%')
        prepare_axis(r, tm, deployment, ax3[ii], ii, n_files, pitch_name, pitch_units, err_count_p)
        fig3_file = fig3_file + str(deployment)

        '''
        1D Quiver plot
        '''
        u_fit = u[tilt_ind]
        v_fit = v[tilt_ind]
        percent_good = ((len(u) - len(u_fit)) / len(u)) * 100
        ax[ii].quiver(tm, 0, u, v,
                      color='b',
                      units='y',
                      scale_units='y',
                      scale=1,
                      headlength=1,
                      headaxislength=1,
                      width=0.004,
                      alpha=0.5)
        ax[ii].quiver(tm[tilt_ind], 0, u_fit, v_fit,
                      color='r',
                      units='y',
                      scale_units='y',
                      scale=1,
                      headlength=1,
                      headaxislength=1,
                      width=0.004,
                      alpha=0.5,
                      label=str(round(100 - percent_good, 3)) + '%')
        ax[ii].set_ylim(-uv_maxmag, uv_maxmag)
        prepare_axis(r, tm, deployment, ax[ii], ii, n_files, 'Current Velocity', u_unit, err_count_u)
        fig_file = fig_file + str(deployment)

        '''
        Plot u and v components
        '''
        ax0[ii].plot(tm, v, color='b', linestyle='--', linewidth=.6, label='V')
        ax0[ii].plot(tm, u, color='g', linestyle='--', linewidth=.6, label='U')
        ax0[ii].plot(tm, w, color='m', linestyle='--', linewidth=.6, label='W')
        prepare_axis(r, tm, deployment, ax0[ii], ii, n_files, 'Velocity Components', u_unit, err_count_u)
        fig0_file = fig0_file + str(deployment)

    for figure, file_name in ((fig1, fig1_file), (fig, fig_file), (fig0, fig0_file),
                              (fig2, fig2_file), (fig3, fig3_file)):
        save_file = os.path.join(save_dir, file_name)
        figure.savefig(str(save_file), dpi=150, bbox_inches='tight')
def main(url_list, sDir, deployment_num, start_time, end_time, preferred_only,
         n_std, inpercentile, zcell_size, zdbar):
    """Plot profiles and cross-sections per deployment, appending all files of a deployment.

    For every reference designator found in ``url_list`` (engineering and ADCP
    designators are ignored) the data files of each deployment are appended
    into one set of arrays, filtered, and plotted once per science variable.

    :param url_list: catalog URLs; the second-to-last path component is a
        dash-separated name whose fields 1-4 form the reference designator and
        whose last two fields are the method and stream
    :param sDir: root directory under which the plot directories are created
    :param deployment_num: deployment number to plot, or None to plot all
    :param start_time: start of the time window (object with ``strftime``), or None
    :param end_time: end of the time window (object with ``strftime``), or None
    :param preferred_only: 'yes' to keep only the preferred method/stream files
    :param n_std: not used by this function; kept for interface compatibility
    :param inpercentile: not used by this function; kept for interface compatibility
    :param zcell_size: not used by this function; kept for interface compatibility
    :param zdbar: if truthy, only data with 0 < pressure < zdbar are kept
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'ENG' not in rd and 'ADCP' not in rd:
            rd_list.append(rd)

    time_window = start_time is not None and end_time is not None

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        deployments = []
        for url in url_list:
            splitter = url.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            catalog_rms = '-'.join((r, splitter[-2], splitter[-1]))
            if rd_check != r:
                continue
            for u in cf.get_nc_urls([url]):
                # filter out collocated data files
                if catalog_rms == u.split('/')[-1].split('_20')[0][15:]:
                    datasets.append(u)
                    deployments.append(int(u.split('/')[-1].split('_')[0][-4:]))
        deployments = np.unique(deployments).tolist()

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # cell is not a string (no stream listed) - nothing to match
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for dep in deployments:
            # compare by value: identity ("is not") is only accidentally true for small ints
            if deployment_num is not None and dep != deployment_num:
                print('\nskipping deployment {}'.format(dep))
                continue

            dep_name = 'deployment%04d' % dep
            rdatasets = sorted(s for s in fdatasets_sel if dep_name in s)
            if not rdatasets:
                continue

            # output directories depend only on the designator, deployment and time window
            array = r[0:2]
            subsite = r.split('-')[0]
            if time_window:
                ext = start_time.strftime('%Y-%m-%d') + 'to' + end_time.strftime('%Y-%m-%d')
                save_dir_profile = os.path.join(sDir, array, subsite, r, 'profile_plots', dep_name, ext)
                save_dir_xsection = os.path.join(sDir, array, subsite, r, 'xsection_plots', dep_name, ext)
            else:
                save_dir_profile = os.path.join(sDir, array, subsite, r, 'profile_plots', dep_name)
                save_dir_xsection = os.path.join(sDir, array, subsite, r, 'xsection_plots', dep_name)

            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(rdatasets[0])
            sci_vars = cf.return_science_vars(stream)
            if 'CTDPF' not in r:
                sci_vars.append('int_ctd_pressure')
            sci_vars.append('time')

            # initialize the dictionary that accumulates the whole deployment
            sci_vars_dict = {}
            for sci_var in sorted(set(sci_vars)):
                if sci_var == 'time':
                    values = np.array([], dtype=np.datetime64)
                else:
                    values = np.array([])
                sci_vars_dict[sci_var] = dict(values=values, units=[], fv=[])

            for i, rdataset in enumerate(rdatasets):
                print('\nAppending data from {}: file {} of {}'.format(dep_name, i + 1, len(rdatasets)))
                # the context manager closes the file once its values are copied out
                with xr.open_dataset(rdataset, mask_and_scale=False) as ds:
                    ds = ds.swap_dims({'obs': 'time'})
                    if time_window:
                        ds = ds.sel(time=slice(start_time, end_time))
                        if len(ds['time'].values) == 0:
                            print('No data to plot for specified time range: ({} to {})'
                                  .format(start_time, end_time))
                            continue

                    # append data for the deployment into the dictionary
                    for s_v, entry in sci_vars_dict.items():
                        vv = ds[s_v]
                        vv_data = vv.values
                        try:
                            if vv.units not in entry['units']:
                                entry['units'].append(vv.units)
                        except AttributeError:
                            print('')
                        try:
                            if vv._FillValue not in entry['fv']:
                                entry['fv'].append(vv._FillValue)
                            try:
                                vv_data[vv_data == vv._FillValue] = np.nan  # turn fill values to nans
                            except ValueError:
                                # e.g. integer arrays cannot hold NaN - leave the fill values in place
                                print('')
                        except AttributeError:
                            print('')

                        if len(vv.dims) > 1:
                            print('Skipping plot: variable has >1 dimension')
                        else:
                            entry['values'] = np.append(entry['values'], vv_data)

            # plot after appending all data into one file
            time1 = sci_vars_dict['time']['values']
            if len(time1) == 0:
                # every file was skipped (e.g. nothing inside the time window)
                print('No data appended for {} - skipping plots'.format(dep_name))
                continue

            # lat/lon are not appended above; placeholders keep the helper signatures satisfied
            ds_lat1 = np.empty(np.shape(time1))
            ds_lon1 = np.empty(np.shape(time1))

            # define pressure variable
            try:
                pname = 'seawater_pressure'
                press = sci_vars_dict[pname]
            except KeyError:
                pname = 'int_ctd_pressure'
                press = sci_vars_dict[pname]
            y1 = press['values']
            try:
                y_units = press['units'][0]
            except IndexError:
                y_units = ''

            for sv in sci_vars_dict:
                print('')
                print(sv)
                if sv in ('seawater_pressure', 'int_ctd_pressure', 'time'):
                    continue

                z1 = sci_vars_dict[sv]['values']
                fv = sci_vars_dict[sv]['fv'][0]
                sv_units = sci_vars_dict[sv]['units'][0]

                # Check if the array is all NaNs
                if np.isnan(z1).all():
                    print('Array of all NaNs - skipping plot.')
                    continue
                # Check if the array is all fill values
                if len(z1[z1 != fv]) == 0:
                    print('Array of all fill values - skipping plot.')
                    continue

                # remove unreasonable pressure data (e.g. for surface piercing profilers)
                if zdbar:
                    po_ind = (0 < y1) & (y1 < zdbar)
                    tm = time1[po_ind]
                    y = y1[po_ind]
                    z = z1[po_ind]
                    ds_lat = ds_lat1[po_ind]
                    ds_lon = ds_lon1[po_ind]
                else:
                    tm = time1
                    y = y1
                    z = z1
                    ds_lat = ds_lat1
                    ds_lon = ds_lon1

                # reject erroneous data
                dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max, lat, lon = \
                    cf.reject_erroneous_data(r, sv, tm, y, z, fv, ds_lat, ds_lon)

                # get rid of 0.0 data
                if 'CTD' in r:
                    ind = zpressure > 0.0
                else:
                    ind = ndata > 0.0

                dtime = dtime[ind]
                zpressure = zpressure[ind]
                ndata = ndata[ind]
                if ds_lat is not None and ds_lon is not None:
                    lat = lat[ind]
                    lon = lon[ind]
                else:
                    lat = None
                    lon = None

                if len(dtime) == 0:
                    continue

                # reject time range from data portal file export
                t_portal, z_portal, y_portal, lat_portal, lon_portal = \
                    cf.reject_timestamps_dataportal(subsite, r, dtime, zpressure, ndata, lat, lon)
                print('removed {} data points using visual inspection of data'
                      .format(len(ndata) - len(z_portal)))

                # Plot all data. Guard on the arrays that are actually plotted:
                # min()/max() below raise on an empty array.
                if len(t_portal) == 0:
                    continue

                cf.create_dir(save_dir_profile)
                cf.create_dir(save_dir_xsection)
                sname = '-'.join((r, method, sv))
                sfileall = '_'.join((sname, pd.to_datetime(t_portal.min()).strftime('%Y%m%d')))
                tm0 = pd.to_datetime(t_portal.min()).strftime('%Y-%m-%dT%H:%M:%S')
                tm1 = pd.to_datetime(t_portal.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method)) + '\n' + tm0 + ' to ' + tm1
                if 'SPKIR' in r:
                    title = title + '\nWavelength = 510 nm'

                # profile plot
                xlabel = sv + " (" + sv_units + ")"
                ylabel = pname + " (" + y_units + ")"
                clabel = 'Time'
                fig, ax = pf.plot_profiles(z_portal, y_portal, t_portal, ylabel, xlabel, clabel, stdev=None)
                ax.set_title(title, fontsize=9)
                fig.tight_layout()
                pf.save_fig(save_dir_profile, sfileall)

                # xsection plot
                clabel = sv + " (" + sv_units + ")"
                ylabel = pname + " (" + y_units + ")"
                fig, ax, bar = pf.plot_xsection(subsite, t_portal, y_portal, z_portal, clabel, ylabel,
                                                t_eng=None, m_water_depth=None, inpercentile=None,
                                                stdev=None)
                if fig:
                    ax.set_title(title, fontsize=9)
                    fig.tight_layout()
                    pf.save_fig(save_dir_xsection, sfileall)
def main(sDir, url_list, preferred_only):
    """Plot pressure, roll, pitch, heading and current velocity, one panel per data file.

    For every reference designator in ``url_list`` five multi-panel figures
    are written to ``<sDir>/<array>/<subsite>/<refdes>/preferred_method_plots``.
    Samples where pitch or roll exceed 200 (decidegree variables) are
    highlighted; the in-code note attributes this to manufacturer guidance
    that data are suspect beyond 20 degrees of tilt
    (redmine ticket: Marine Hardware #12960).

    :param sDir: root directory under which the plot directory is created
    :param url_list: catalog URLs; the second-to-last path component is a
        dash-separated name whose fields 1-4 form the reference designator
    :param preferred_only: 'yes' to keep only the preferred method/stream files
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    tilt_label = 'Pitch or roll > 200'

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]

        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            fdatasets = []
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # cell is not a string (no stream listed) - nothing to match
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, fdatasets)

        n_panels = len(fdatasets)
        if n_panels == 0:
            # pyplot.subplots cannot build a figure with zero rows
            print('No datasets to plot for {}'.format(r))
            continue

        save_dir = os.path.join(sDir, array, subsite, r, 'preferred_method_plots')
        cf.create_dir(save_dir)

        def new_figure(**layout):
            # squeeze=False always returns a 2-D array of axes, so indexing by
            # panel also works when there is a single data file
            figure, axes = pyplot.subplots(nrows=n_panels, ncols=1, sharey=True, squeeze=False)
            figure.tight_layout(**layout)
            return figure, axes[:, 0]

        def decorate(axis, panel, deployment, title):
            # shared panel styling: deployment number on the right, x label on
            # the last panel only, title on the first panel only
            axis.set_ylabel(str(deployment), rotation=0, fontsize=8, color='b', labelpad=11)
            axis.yaxis.set_label_position("right")
            axis.tick_params(which='both', color='r', labelsize=7, labelcolor='m',
                             pad=0.1, length=1, rotation=0)
            if panel < n_panels - 1:
                axis.set_xlabel(' ')
            else:
                axis.set_xlabel('Time', rotation=0, fontsize=8, color='b')
            if panel == 0:
                axis.set_title(title, fontsize=8)

        fig, ax = new_figure(pad=0.4, w_pad=0.5, h_pad=1.0)  # current velocity
        fig1, ax1 = new_figure()  # pressure
        fig2, ax2 = new_figure()  # roll
        fig3, ax3 = new_figure()  # pitch
        fig4, ax4 = new_figure()  # heading

        for ii, fdataset in enumerate(fdatasets):
            print('\n', fdataset)
            deployment = int(fdataset.split('/')[-1].split('_')[0].split('deployment')[-1])

            # read everything that is plotted, then release the file
            with xr.open_dataset(fdataset, mask_and_scale=False) as ds:
                time = ds['time'].values
                sci_var = cf.return_science_vars(ds.stream)

                z_name = [z_var for z_var in sci_var if 'pressure' in z_var]
                z = ds[z_name[0]].values
                z_unit = ds[z_name[0]].units

                # non science variables
                roll = ds['roll_decidegree'].values
                roll_unit = ds['roll_decidegree'].units
                pitch = ds['pitch_decidegree'].values
                pitch_units = ds['pitch_decidegree'].units
                headng = ds['heading_decidegree'].values
                headng_units = ds['heading_decidegree'].units

                # velocity variables
                u_name = [u_var for u_var in sci_var if 'eastward_velocity' in u_var]
                v_name = [v_var for v_var in sci_var if 'northward_velocity' in v_var]
                w_name = [w_var for w_var in sci_var if 'upward_velocity' in w_var]
                w_unit = ds[w_name[0]].units
                u = ds[u_name[0]].values
                v = ds[v_name[0]].values

            if len(time) == 0:
                print('No data in file - leaving panel empty')
                continue

            # Plot pressure
            ax1[ii].plot(time, z, color='b', linestyle='--', linewidth=.6, label='Pressure')
            decorate(ax1[ii], ii, deployment, r + ' - Pressure ' + z_unit)

            # flag samples where the instrument is tilted by more than 200 decidegrees
            tilt_ind = np.logical_or(pitch > 200, roll > 200)

            # roll, pitch and heading share one layout; each is drawn on its own figure
            attitude = (
                (ax2, roll, 'Roll', roll_unit, dict(linestyle='--', linewidth=.6)),
                (ax3, pitch, 'Pitch', pitch_units, dict(linestyle='--', linewidth=.6)),
                (ax4, headng, 'Heading', headng_units,
                 dict(linestyle='None', marker='.', markersize=0.5)),
            )
            for axes, values, name, unit, style in attitude:
                panel = axes[ii]
                panel.plot(time, values, color='b', label=name, **style)
                panel.plot(time[tilt_ind], values[tilt_ind], color='g', linestyle='None',
                           marker='.', markersize=0.5, label=tilt_label)
                decorate(panel, ii, deployment, r + ' - ' + name + ' ' + unit)
                if ii == 0:
                    panel.legend(fontsize=6, bbox_to_anchor=(0., 0.80, 1., .102), loc=3,
                                 ncol=3, mode="expand", borderaxespad=0., frameon=False)

            # 1D Quiver plot: all currents in red, tilted samples over-plotted in blue
            uv_magnitude = np.sqrt(u ** 2 + v ** 2)
            uv_maxmag = uv_magnitude.max()
            ax[ii].quiver(time, 0, u, v, color='r', units='y', scale_units='y', scale=1,
                          headlength=1, headaxislength=1, width=0.004, alpha=0.5)
            u_fit = u[tilt_ind]
            v_fit = v[tilt_ind]
            ax[ii].quiver(time[tilt_ind], 0, u_fit, v_fit, color='b', units='y', scale_units='y',
                          scale=1, headlength=1, headaxislength=1, width=0.004, alpha=0.5)

            # share of samples flagged as tilted (the blue arrows)
            percent_bad = round((len(u_fit) / len(u)) * 100, 2)
            print(len(u_fit), len(u), percent_bad)
            ax[ii].text(time[-1], 0, ' ' + str(percent_bad) + '%', fontsize=5,
                        style='italic', color='blue')
            ax[ii].set_ylim(-uv_maxmag, uv_maxmag)
            decorate(ax[ii], ii, deployment,
                     r + ' - Current Velocity ' + w_unit + '\n' +
                     ' Currents in blue when pitch or roll are > 20 degrees')

        # write each figure once, after every panel is drawn, then free it
        outputs = (
            (fig1, 'pressure_plots', {}),
            (fig2, 'roll_plots', {}),
            (fig3, 'pitch_plots', {}),
            (fig4, 'heading_plots', {}),
            (fig, 'current_plot', {'bbox_inches': 'tight'}),
        )
        for figure, sfile, extra in outputs:
            save_file = os.path.join(save_dir, sfile)
            figure.savefig(str(save_file), dpi=150, **extra)
            pyplot.close(figure)
def main(sDir, url_list, start_time, end_time, preferred_only):
    """Create one timeseries panel plot per data file, one panel per science variable.

    Files with a single science variable are skipped, since a panel plot adds
    nothing for them. Figures are written to
    ``<sDir>/<array>/<subsite>/<refdes>/timeseries_panel_plots``.

    :param sDir: root directory under which the plot directory is created
    :param url_list: catalog URLs; the second-to-last path component is a
        dash-separated name whose fields 1-4 form the reference designator
    :param start_time: start of the time window, or None
    :param end_time: end of the time window, or None
    :param preferred_only: 'yes' to keep only the preferred method/stream files
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # cell is not a string (no stream listed); skip it the
                        # same way the other plotting entry points do
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for fd in fdatasets_sel:
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'
                              .format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                save_dir = os.path.join(sDir, array, subsite, refdes, 'timeseries_panel_plots')
                # file name without its last underscore-separated field
                filename = '_'.join(fname.split('_')[:-1])
                sci_vars = cf.return_science_vars(stream)

                if len(sci_vars) <= 1:
                    print('Only one science variable in file, no panel plots necessary')
                    continue

                cf.create_dir(save_dir)
                colors = cm.jet(np.linspace(0, 1, len(sci_vars)))
                t = ds['time'].values
                t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))

                # Plot data with outliers removed
                fig, ax = pf.plot_timeseries_panel(ds, t, sci_vars, colors, 5)
                plt.xticks(fontsize=7)
                ax[0].set_title((title + '\n' + t0 + ' - ' + t1), fontsize=7)
                sfile = '-'.join((filename, 'timeseries_panel', t0[:10]))
                pf.save_fig(save_dir, sfile)
def main(url_list, sDir, deployment_num, start_time, end_time, preferred_only,
         zdbar, n_std, inpercentile, zcell_size):
    """Plot profiles and cross-sections per data file after removing suspect data.

    For every science variable of every file the data are filtered in stages
    (pressure window, erroneous values, low values, data-portal time ranges,
    per-depth-bin outliers). The timestamps rejected by the depth-bin analysis
    are appended to a CSV file, and figures are written only when that last
    stage changed the data set.

    :param url_list: catalog URLs; the second-to-last path component is a
        dash-separated name whose fields 1-4 form the reference designator
    :param sDir: root directory under which the output directories are created
    :param deployment_num: deployment number to plot, or None to plot all
    :param start_time: start of the time window (object with ``strftime``), or None
    :param end_time: end of the time window (object with ``strftime``), or None
    :param preferred_only: 'yes' to keep only the preferred method/stream files
    :param zdbar: if truthy, only data with 0 < pressure < zdbar are kept
    :param n_std: number of standard deviations handed to the depth-bin
        rejection; None (always used for 'scatter' variables) selects the
        percentile method
    :param inpercentile: percentile handed to the depth-bin rejection
    :param zcell_size: integer height of the depth bins in dbar
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'ENG' not in rd and 'ADCP' not in rd:
            rd_list.append(rd)

    time_window = start_time is not None and end_time is not None

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # cell is not a string (no stream listed) - nothing to match
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for fd in fdatasets_sel:
            part_d = fd.split('/')[-1]
            print('\n{}'.format(part_d))

            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
            # compare by value: identity ("is not") is only accidentally true for small ints
            if deployment_num is not None and int(deployment[-4:]) != deployment_num:
                print('\nskipping deployment {}'.format(deployment))
                continue

            array = subsite[0:2]
            sci_vars = cf.return_science_vars(stream)

            # the context manager closes the file when this dataset is done
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if time_window:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'
                              .format(start_time, end_time))
                        continue
                    ext = start_time.strftime('%Y-%m-%d') + 'to' + end_time.strftime('%Y-%m-%d')
                    save_dir_profile = os.path.join(sDir, array, subsite, refdes,
                                                    'profile_plots', deployment, ext)
                    save_dir_xsection = os.path.join(sDir, array, subsite, refdes,
                                                     'xsection_plots', deployment, ext)
                else:
                    save_dir_profile = os.path.join(sDir, array, subsite, refdes,
                                                    'profile_plots', deployment)
                    save_dir_xsection = os.path.join(sDir, array, subsite, refdes,
                                                     'xsection_plots', deployment)

                texclude_dir = os.path.join(sDir, array, subsite, refdes, 'time_to_exclude')
                cf.create_dir(texclude_dir)

                time1 = ds['time'].values
                try:
                    ds_lat1 = ds['lat'].values
                except KeyError:
                    ds_lat1 = None
                    print('No latitude variable in file')
                try:
                    ds_lon1 = ds['lon'].values
                except KeyError:
                    ds_lon1 = None
                    print('No longitude variable in file')

                # get pressure variable
                pvarname, y1, y_units, press, y_fillvalue = cf.add_pressure_to_dictionary_of_sci_vars(ds)

                # prepare file to list timestamps with suspect data for each data parameter
                stat_data = pd.DataFrame(columns=['deployments', 'time_to_exclude'])
                file_exclude = '{}/{}_{}_{}_excluded_timestamps.csv'.format(
                    texclude_dir, deployment, refdes, method)
                stat_data.to_csv(file_exclude, index=True)

                # loop through sensor-data parameters
                for sv in sci_vars:
                    print(sv)
                    if 'pressure' in sv:
                        continue

                    z1 = ds[sv].values
                    fv = ds[sv]._FillValue
                    sv_units = ds[sv].units

                    # Check if the array is all NaNs
                    if np.isnan(z1).all():
                        print('Array of all NaNs - skipping plot.')
                        continue
                    # Check if the array is all fill values
                    if len(z1[z1 != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    # remove unreasonable pressure data (e.g. for surface piercing profilers)
                    if zdbar:
                        po_ind = (0 < y1) & (y1 < zdbar)
                        n_zdbar = np.sum(~po_ind)
                        tm = time1[po_ind]
                        y = y1[po_ind]
                        z = z1[po_ind]
                        # lat/lon are None when the file has no such variable
                        ds_lat = ds_lat1[po_ind] if ds_lat1 is not None else None
                        ds_lon = ds_lon1[po_ind] if ds_lon1 is not None else None
                        print('{} in water depth > {} dbar'.format(n_zdbar, zdbar))
                    else:
                        tm = time1
                        y = y1
                        z = z1
                        ds_lat = ds_lat1
                        ds_lon = ds_lon1

                    # reject erroneous data
                    dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max, lat, lon = \
                        cf.reject_erroneous_data(r, sv, tm, y, z, fv, ds_lat, ds_lon)

                    # get rid of 0.0 data (variable-specific lower bounds)
                    if sv == 'salinity':
                        ind = ndata > 30
                    elif sv == 'density':
                        ind = ndata > 1022.5
                    elif sv == 'conductivity':
                        ind = ndata > 3.45
                    else:
                        ind = ndata > 0

                    lenzero = np.sum(~ind)
                    dtime = dtime[ind]
                    zpressure = zpressure[ind]
                    ndata = ndata[ind]
                    if ds_lat is not None and ds_lon is not None:
                        lat = lat[ind]
                        lon = lon[ind]
                    else:
                        lat = None
                        lon = None

                    if len(dtime) == 0:
                        continue

                    # reject time range from data portal file export
                    t_portal, z_portal, y_portal, lat_portal, lon_portal = \
                        cf.reject_timestamps_dataportal(subsite, r, dtime, zpressure, ndata, lat, lon)
                    n_visual = len(ndata) - len(z_portal)
                    print('removed {} data points using visual inspection of data'.format(n_visual))

                    if len(y_portal) == 0:
                        continue

                    # create data groups
                    columns = ['tsec', 'dbar', str(sv)]
                    min_r = int(round(np.nanmin(y_portal) - zcell_size))
                    max_r = int(round(np.nanmax(y_portal) + zcell_size))
                    ranges = list(range(min_r, max_r, zcell_size))
                    groups, d_groups = gt.group_by_depth_range(t_portal, y_portal, z_portal,
                                                               columns, ranges)

                    # 'scatter' variables always use the percentile method. Use a
                    # per-variable value so the caller's n_std still applies to
                    # the variables and files that follow.
                    sv_n_std = None if 'scatter' in sv else n_std

                    # identifying timestamps from percentile analysis
                    y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr, time_ex = \
                        cf.reject_timestamps_in_groups(groups, d_groups, sv_n_std, inpercentile)

                    # writing timestamps to .csv file to use with data_range.py script
                    if len(time_ex) != 0:
                        t_exclude = ', '.join('{}'.format(t_ex) for t_ex in time_ex)
                        stat_data = pd.DataFrame({'deployments': deployment,
                                                  'time_to_exclude': t_exclude}, index=[sv])
                        stat_data.to_csv(file_exclude, index=True, mode='a', header=False)

                    # rejecting timestamps from percentile analysis
                    if len(time_ex) > 0:
                        t_nospct, z_nospct, y_nospct = cf.reject_suspect_data(
                            t_portal, y_portal, z_portal, time_ex)
                    else:
                        t_nospct = t_portal
                        z_nospct = z_portal
                        y_nospct = y_portal

                    # Plot data: only when something is left and some data were removed
                    if len(t_nospct) == 0 or len(t_nospct) == len(dtime):
                        continue

                    cf.create_dir(save_dir_profile)
                    cf.create_dir(save_dir_xsection)
                    sname = '-'.join((r, method, sv))
                    sfile = '_'.join(('rm_suspect_data', sname,
                                      pd.to_datetime(t_nospct.min()).strftime('%Y%m%d')))
                    t0 = pd.to_datetime(t_nospct.min()).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(t_nospct.max()).strftime('%Y-%m-%dT%H:%M:%S')
                    title = ' '.join((deployment, refdes, method)) + '\n' + t0 + ' to ' + t1

                    # legend text summarising every filtering stage that was applied
                    n_grouped = len(z_portal) - len(z_nospct)
                    summary = ('removed {} fill values, {} NaNs, {} Extreme Values (1e7), '
                               '{} Global ranges [{} - {}], {} unreasonable values'
                               .format(lenfv, lennan, lenev, lengr, global_min, global_max, lenzero))
                    if sv_n_std:
                        summary += ('\nremoved {} data points +/- {} SD of data grouped in {} dbar segments'
                                    .format(n_grouped, sv_n_std, zcell_size))
                    else:
                        summary += ('\nremoved {} in the upper and lower {} percentile of data grouped '
                                    'in {} dbar segments'.format(n_grouped, inpercentile, zcell_size))
                    summary += ('\nexcluded {} suspect data points when inspected visually'
                                .format(n_visual))
                    if zdbar:
                        summary += ('\nexcluded {} suspect data in water depth greater than {} dbar'
                                    .format(n_zdbar, zdbar))
                    leg_text = (summary,)

                    # profile plot
                    xlabel = sv + " (" + sv_units + ")"
                    ylabel = press[0] + " (" + y_units[0] + ")"
                    clabel = 'Time'

                    # plot non-erroneous data
                    print('plotting profile')
                    fig, ax = pf.plot_profiles(z_nospct, y_nospct, t_nospct, ylabel, xlabel,
                                               clabel, stdev=None)
                    ax.set_title(title, fontsize=9)
                    ax.plot(n_avg, y_avg, '-k')
                    ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                    fig.tight_layout()
                    pf.save_fig(save_dir_profile, sfile)

                    # xsection plot
                    print('plotting xsection')
                    clabel = sv + " (" + sv_units + ")"
                    ylabel = press[0] + " (" + y_units[0] + ")"

                    # plot non-erroneous data
                    fig, ax, bar = pf.plot_xsection(subsite, t_nospct, y_nospct, z_nospct, clabel,
                                                    ylabel, t_eng=None, m_water_depth=None,
                                                    inpercentile=inpercentile, stdev=None)
                    # same guard as the append-deployments variant of this script
                    if fig:
                        ax.set_title(title, fontsize=9)
                        ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17),
                                  fontsize=6)
                        fig.tight_layout()
                        pf.save_fig(save_dir_xsection, sfile)
# Lower bound used to flag "unreasonable" values per science variable; any
# variable not listed here is only required to be greater than zero.
_MIN_REASONABLE_VALUE = {'salinity': 30, 'density': 1022.5, 'conductivity': 3.45}


def _refdes_from_url(url):
    """Return the reference designator encoded in a dataset URL.

    The designator is assembled from the 2nd through 5th dash-separated
    fields of the URL's second-to-last path component.
    """
    parts = url.split('/')[-2].split('-')
    return '-'.join((parts[1], parts[2], parts[3], parts[4]))


def _select_datasets(r, url_list, preferred_only):
    """Return the NetCDF urls for reference designator *r*.

    When *preferred_only* is 'yes' the list is restricted to files whose
    stream and deployment match an entry returned by
    ``cf.get_preferred_stream_info``.
    """
    datasets = []
    for u in url_list:
        if _refdes_from_url(u) == r:
            datasets.extend(cf.get_nc_urls([u]))

    if preferred_only != 'yes':
        return datasets

    fdatasets = []
    ps_df, n_streams = cf.get_preferred_stream_info(r)
    for index, row in ps_df.iterrows():
        for ii in range(n_streams):
            try:
                rms = '-'.join((r, row[ii]))
            except TypeError:
                # entry is not a string (no stream listed in this slot)
                continue
            for dd in datasets:
                spl = dd.split('/')[-2].split('-')
                catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                fdeploy = dd.split('/')[-1].split('_')[0]
                if rms == catalog_rms and fdeploy == row['deployment']:
                    fdatasets.append(dd)
    return fdatasets


def _plot_variable(ds, sv, r, subsite, refdes, method, deployment, time1, y1, ds_lat1, ds_lon1, press, y_units,
                   save_dirs, n_std, inpercentile, zcell_size, zdbar):
    """Create the "all data" and "cleaned-up data" plots for one science variable.

    *save_dirs* is the tuple (profile dir, cross-section dir, 4D dir). The
    function returns without plotting when the variable holds only NaNs or
    only fill values.
    """
    save_dir_profile, save_dir_xsection, save_dir_4d = save_dirs

    z1 = ds[sv].values
    fv = ds[sv]._FillValue
    sv_units = ds[sv].units

    if np.isnan(z1).all():
        print('Array of all NaNs - skipping plot.')
        return
    if len(z1[z1 != fv]) == 0:
        print('Array of all fill values - skipping plot.')
        return

    # remove unreasonable pressure data (e.g. for surface piercing profilers)
    if zdbar:
        po_ind = (0 < y1) & (y1 < zdbar)
        tm = time1[po_ind]
        y = y1[po_ind]
        z = z1[po_ind]
        # lat/lon are None when the file has no such variable
        ds_lat = ds_lat1[po_ind] if ds_lat1 is not None else None
        ds_lon = ds_lon1[po_ind] if ds_lon1 is not None else None
    else:
        tm, y, z, ds_lat, ds_lon = time1, y1, z1, ds_lat1, ds_lon1

    # reject erroneous data
    dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max, lat, lon = \
        cf.reject_erroneous_data(r, sv, tm, y, z, fv, ds_lat, ds_lon)

    # drop zero / implausibly low values
    ind = ndata > _MIN_REASONABLE_VALUE.get(sv, 0)
    lenzero = np.sum(~ind)
    dtime = dtime[ind]
    zpressure = zpressure[ind]
    ndata = ndata[ind]
    if ds_lat is not None and ds_lon is not None:
        lat = lat[ind]
        lon = lon[ind]
    else:
        lat = None
        lon = None

    # Per-variable override kept local so it does not leak into the
    # variables and datasets processed afterwards.
    sv_n_std = None if 'scatter' in sv else n_std  # None -> use percentile

    have_clean_data = False
    if len(dtime) > 0:
        # reject time range from data portal file export
        t_portal, z_portal, y_portal, lat_portal, lon_portal = \
            cf.reject_timestamps_dataportal(subsite, r, dtime, zpressure, ndata, lat, lon)
        print('removed {} data points using visual inspection of data'.format(len(ndata) - len(z_portal)))

        if len(y_portal) > 0:
            have_clean_data = True
            # create data groups
            columns = ['tsec', 'dbar', str(sv)]
            min_r = int(round(np.nanmin(y_portal) - zcell_size))
            max_r = int(round(np.nanmax(y_portal) + zcell_size))
            ranges = list(range(min_r, max_r, zcell_size))
            groups, d_groups = gt.group_by_depth_range(t_portal, y_portal, z_portal, columns, ranges)

            # get percentile analysis for printing on the profile plot
            y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr, time_ex = cf.reject_timestamps_in_groups(
                groups, d_groups, sv_n_std, inpercentile)

    sname = '-'.join((r, method, sv))
    var_label = sv + " (" + sv_units + ")"
    press_label = press[0] + " (" + y_units[0] + ")"

    # ---- Plot all data ----
    if len(time1) > 0:
        cf.create_dir(save_dir_profile)
        cf.create_dir(save_dir_xsection)
        sfileall = '_'.join(('all_data', sname, pd.to_datetime(time1.min()).strftime('%Y%m%d')))
        tm0 = pd.to_datetime(time1.min()).strftime('%Y-%m-%dT%H:%M:%S')
        tm1 = pd.to_datetime(time1.max()).strftime('%Y-%m-%dT%H:%M:%S')
        title = ' '.join((deployment, refdes, method)) + '\n' + tm0 + ' to ' + tm1
        if 'SPKIR' in r:
            title = title + '\nWavelength = 510 nm'

        # profile plot
        fig, ax = pf.plot_profiles(z1, y1, time1, press_label, var_label, 'Time', stdev=None)
        ax.set_title(title, fontsize=9)
        fig.tight_layout()
        pf.save_fig(save_dir_profile, sfileall)

        # xsection plot
        fig, ax, bar = pf.plot_xsection(subsite, time1, y1, z1, var_label, press_label, t_eng=None,
                                        m_water_depth=None, inpercentile=None, stdev=None)
        if fig:
            ax.set_title(title, fontsize=9)
            fig.tight_layout()
            pf.save_fig(save_dir_xsection, sfileall)

    # ---- Plot cleaned-up data ----
    if not have_clean_data:
        return

    sfile = '_'.join(('rm_erroneous_data', sname, pd.to_datetime(t_portal.min()).strftime('%Y%m%d')))
    t0 = pd.to_datetime(t_portal.min()).strftime('%Y-%m-%dT%H:%M:%S')
    t1 = pd.to_datetime(t_portal.max()).strftime('%Y-%m-%dT%H:%M:%S')
    title = ' '.join((deployment, refdes, method)) + '\n' + t0 + ' to ' + t1
    if 'SPKIR' in r:
        title = title + '\nWavelength = 510 nm'

    # legend text shared by the profile and cross-section plots
    summary = (
        'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}], '
        '{} unreasonable values'.format(lenfv, lennan, lenev, lengr, global_min, global_max, lenzero)
        + '\nexcluded {} suspect data points when inspected visually'.format(len(ndata) - len(z_portal)))
    average_line = '\n(black) data average in {} dbar segments'.format(zcell_size)

    # profile plot
    fig, ax = pf.plot_profiles(z_portal, y_portal, t_portal, press_label, var_label, 'Time', stdev=None)
    ax.set_title(title, fontsize=9)
    ax.plot(n_avg, y_avg, '-k')
    ax.fill_betweenx(y_avg, n0_std, n1_std, color='m', alpha=0.2)
    if inpercentile:
        leg_text = (summary + average_line
                    + '\n(magenta) {} percentile envelope in {} dbar segments'.format(
                        int(100 - inpercentile * 2), zcell_size),)
    elif sv_n_std:
        leg_text = (summary + average_line
                    + '\n(magenta) +/- {} SD envelope in {} dbar segments'.format(int(sv_n_std), zcell_size),)
    else:
        # neither envelope setting is available: report the removal summary only
        leg_text = (summary,)
    ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
    fig.tight_layout()
    pf.save_fig(save_dir_profile, sfile)

    # xsection plot of the non-erroneous data
    fig, ax, bar = pf.plot_xsection(subsite, t_portal, y_portal, z_portal, var_label, press_label, t_eng=None,
                                    m_water_depth=None, inpercentile=None, stdev=None)
    if fig:  # same guard as the all-data cross-section
        ax.set_title(title, fontsize=9)
        ax.legend((summary,), loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
        fig.tight_layout()
        pf.save_fig(save_dir_xsection, sfile)

    # 4D plot for gliders only
    if 'MOAS' in r and ds_lat is not None and ds_lon is not None:
        cf.create_dir(save_dir_4d)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        sct = ax.scatter(lon_portal, lat_portal, y_portal, c=z_portal, s=2)
        cbar = plt.colorbar(sct, label=var_label, extend='both')
        cbar.ax.tick_params(labelsize=8)
        ax.invert_zaxis()
        ax.view_init(25, 32)
        ax.invert_xaxis()
        ax.invert_yaxis()
        ax.set_zlabel(press_label, fontsize=9)
        ax.set_ylabel('Latitude', fontsize=9)
        ax.set_xlabel('Longitude', fontsize=9)
        ax.set_title(title, fontsize=9)
        pf.save_fig(save_dir_4d, sfile)


def _plot_dataset(ds, r, subsite, refdes, method, stream, deployment, sDir, start_time, end_time, n_std,
                  inpercentile, zcell_size, zdbar):
    """Plot every non-pressure science variable of one opened dataset."""
    ds = ds.swap_dims({'obs': 'time'})
    array = subsite[0:2]
    sci_vars = cf.return_science_vars(stream)

    # NOTE: no glider bathymetry is overlaid; every pf.plot_xsection call
    # passes t_eng=None and m_water_depth=None.

    subdirs = [deployment]
    if start_time is not None and end_time is not None:
        ds = ds.sel(time=slice(start_time, end_time))
        if len(ds['time'].values) == 0:
            print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
            return
        stime = start_time.strftime('%Y-%m-%d')
        etime = end_time.strftime('%Y-%m-%d')
        subdirs.append(stime + 'to' + etime)

    out_root = os.path.join(sDir, array, subsite, refdes)
    save_dirs = (os.path.join(out_root, 'profile_plots', *subdirs),
                 os.path.join(out_root, 'xsection_plots', *subdirs),
                 os.path.join(out_root, 'xsection_plots_4d', *subdirs))
    save_dir_profile, save_dir_xsection = save_dirs[0], save_dirs[1]

    time1 = ds['time'].values
    try:
        ds_lat1 = ds['lat'].values
    except KeyError:
        ds_lat1 = None
        print('No latitude variable in file')
    try:
        ds_lon1 = ds['lon'].values
    except KeyError:
        ds_lon1 = None
        print('No longitude variable in file')

    # get pressure variable
    pvarname, y1, y_units, press, y_fillvalue = cf.add_pressure_to_dictionary_of_sci_vars(ds)

    for sv in sci_vars:
        print('')
        print(sv)
        if 'pressure' in sv:
            continue
        if sv == 'spkir_abj_cspp_downwelling_vector':
            pxso.pf_xs_spkir(ds, sv, time1, y1, ds_lat1, ds_lon1, zcell_size, inpercentile, save_dir_profile,
                             save_dir_xsection, deployment, press, y_units, n_std, zdbar)
        elif 'OPTAA' in r:
            if sv not in ['wavelength_a', 'wavelength_c']:
                pxso.pf_xs_optaa(ds, sv, time1, y1, ds_lat1, ds_lon1, zcell_size, inpercentile, save_dir_profile,
                                 save_dir_xsection, deployment, press, y_units, n_std, zdbar)
        else:
            _plot_variable(ds, sv, r, subsite, refdes, method, deployment, time1, y1, ds_lat1, ds_lon1, press,
                           y_units, save_dirs, n_std, inpercentile, zcell_size, zdbar)


def main(url_list, sDir, deployment_num, start_time, end_time, preferred_only, n_std, inpercentile, zcell_size,
         zdbar):
    """Create profile, cross-section and (for gliders) 4D plots for each reference designator in *url_list*.

    For every science variable two sets of plots are saved: one with all data
    and one after erroneous data and visually rejected time ranges are removed.

    :param url_list: dataset URLs; the reference designator is parsed from each URL.
        Designators containing 'ENG' or 'ADCP' are ignored.
    :param sDir: root directory under which the plot directories are created
    :param deployment_num: deployment number to plot, or None to plot all deployments
    :param start_time: start of the time range to plot (needs ``strftime``), or None
    :param end_time: end of the time range to plot (needs ``strftime``), or None.
        The range is applied only when both limits are given.
    :param preferred_only: 'yes' to restrict the files to the preferred streams
    :param n_std: number of standard deviations passed to ``cf.reject_timestamps_in_groups``;
        None is used instead for variables whose name contains 'scatter'
    :param inpercentile: percentile passed to ``cf.reject_timestamps_in_groups``
    :param zcell_size: step (dbar) of the pressure ranges used to group the data
    :param zdbar: if truthy, only data with 0 < pressure < zdbar are analyzed
    """
    rd_list = []
    for uu in url_list:
        rd = _refdes_from_url(uu)
        if rd not in rd_list and 'ENG' not in rd and 'ADCP' not in rd:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        fdatasets = _select_datasets(r, url_list, preferred_only)
        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for fd in fdatasets_sel:
            part_d = fd.split('/')[-1]
            print('\n{}'.format(part_d))
            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)

            # Compare by value (not identity) and filter before opening the
            # file so skipped deployments cost nothing.
            if deployment_num is not None and int(deployment[-4:]) != deployment_num:
                continue

            # the context manager closes the file once its plots are written
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                _plot_dataset(ds, r, subsite, refdes, method, stream, deployment, sDir, start_time, end_time,
                              n_std, inpercentile, zcell_size, zdbar)
def _read_cleaned(ds, name, r):
    """Return the values of variable *name* after passing them through ``reject_err_data``."""
    var = ds[name]
    return reject_err_data(var.values, var._FillValue, r, name)


def plot_data(fig, ax, fdatasets, save_dir, r):
    """Plot pressure, attitude and current-velocity time series, one subplot per dataset.

    According to the VELPT manufacturer, data are suspect when the instrument
    is tilted more than 20 degrees (redmine ticket: Marine Hardware #12960).
    Samples with pitch or roll above 200 decidegrees are highlighted.

    NOTE(review): besides its parameters this function uses ``ax0``-``ax4``,
    ``fig0``-``fig4``, ``num_plots`` and ``group_num``, none of which are
    defined here; they presumably live at module level. Confirm before reuse.

    :param fig: figure holding the quiver (current velocity) subplots
    :param ax: sequence of axes for the quiver plot, indexed by dataset position
    :param fdatasets: list of NetCDF file paths/urls; the deployment number is
        parsed from each file name
    :param save_dir: directory the figures are saved to
    :param r: reference designator, used in titles and passed to ``reject_err_data``
    """
    for ii, dataset in enumerate(fdatasets):
        print('\n', dataset)
        deployment = int(dataset.split('/')[-1].split('_')[0].split('deployment')[-1])

        # Everything is read into memory inside the context manager so the
        # file is closed before plotting starts.
        with xr.open_dataset(dataset, mask_and_scale=False) as ds:
            tm = ds['time'].values

            # science variables
            sci_var = cf.return_science_vars(ds.stream)
            z_name = [name for name in sci_var if 'pressure' in name][0]
            w_name = [name for name in sci_var if 'upward_velocity' in name][0]
            u_name = [name for name in sci_var if 'eastward_velocity' in name][0]
            v_name = [name for name in sci_var if 'northward_velocity' in name][0]

            z_unit = ds[z_name].units
            z = _read_cleaned(ds, z_name, r)
            w_unit = ds[w_name].units
            w = _read_cleaned(ds, w_name, r)
            u = _read_cleaned(ds, u_name, r)
            v = _read_cleaned(ds, v_name, r)

            # non-science (attitude) variables
            roll_unit = ds['roll_decidegree'].units
            roll = _read_cleaned(ds, 'roll_decidegree', r)
            pitch_units = ds['pitch_decidegree'].units
            pitch = _read_cleaned(ds, 'pitch_decidegree', r)
            headng_units = ds['heading_decidegree'].units
            headng = _read_cleaned(ds, 'heading_decidegree', r)

        uv_magnitude = np.sqrt(u**2 + v**2)
        # nanmax: built-in max() is unreliable when the array contains NaN
        uv_maxmag = np.nanmax(uv_magnitude)

        # suspect samples: tilted by more than 200 decidegrees
        tilt_ind = np.logical_or(pitch > 200, roll > 200)
        tilt_label = 'Pitch or roll > 200'
        dashed = dict(linestyle='--', linewidth=.6)
        dots = dict(linestyle='None', marker='.', markersize=0.5)

        # Plot pressure
        ax1[ii].plot(tm, z, color='b', label='Pressure', **dashed)
        if ii == 0:
            ax1[ii].set_title(r + ' - Pressure ' + z_unit, fontsize=8)
        prepare_axis(tm, deployment, ax1[ii], ii, num_plots)
        fig1.savefig(str(os.path.join(save_dir, 'pressure_plots' + group_num)), dpi=150)

        # Plot roll
        ax2[ii].plot(tm, roll, color='b', label='Roll', **dashed)
        ax2[ii].plot(tm[tilt_ind], roll[tilt_ind], color='g', label=tilt_label, **dots)
        prepare_axis(tm, deployment, ax2[ii], ii, num_plots)
        if ii == 0:
            ax2[ii].set_title(r + ' - Roll ' + roll_unit, fontsize=8)
        ax2[ii].legend()
        fig2.savefig(str(os.path.join(save_dir, 'roll_plots' + group_num)), dpi=150)

        # Plot pitch
        ax3[ii].plot(tm, pitch, color='b', label='Pitch', **dashed)
        ax3[ii].plot(tm[tilt_ind], pitch[tilt_ind], color='g', label=tilt_label, **dots)
        prepare_axis(tm, deployment, ax3[ii], ii, num_plots)
        if ii == 0:
            ax3[ii].set_title(r + ' - Pitch ' + pitch_units, fontsize=8)
        ax3[ii].legend()
        fig3.savefig(str(os.path.join(save_dir, 'pitch_plots' + group_num)), dpi=150)

        # Plot heading
        ax4[ii].plot(tm, headng, color='b', label='Heading', **dots)
        ax4[ii].plot(tm[tilt_ind], headng[tilt_ind], color='g', label=tilt_label, **dots)
        prepare_axis(tm, deployment, ax4[ii], ii, num_plots)
        if ii == 0:
            ax4[ii].set_title(r + ' - Heading ' + headng_units, fontsize=8)
        ax4[ii].legend()
        fig4.savefig(str(os.path.join(save_dir, 'heading_plots' + group_num)), dpi=150)

        # 1D quiver plot: all currents in red, suspect (tilted) currents in blue
        quiver_style = dict(units='y', scale_units='y', scale=1, headlength=1, headaxislength=1, width=0.004,
                            alpha=0.5)
        ax[ii].quiver(tm, 0, u, v, color='r', **quiver_style)
        u_fit = u[tilt_ind]
        v_fit = v[tilt_ind]
        ax[ii].quiver(tm[tilt_ind], 0, u_fit, v_fit, color='b', **quiver_style)

        # share of samples flagged as tilted (u_fit holds the flagged samples)
        percent_bad = round((len(u_fit) / len(u)) * 100, 2)
        print(len(u_fit), len(u), percent_bad)
        ax[ii].text(tm[-1], 0, ' ' + str(percent_bad) + '%', fontsize=5, style='italic', color='blue')
        ax[ii].set_ylim(-uv_maxmag, uv_maxmag)
        prepare_axis(tm, deployment, ax[ii], ii, num_plots)
        if ii == 0:
            ax[ii].set_title(
                r + ' - Current Velocity ' + w_unit + '\n'
                + ' Currents in blue when pitch or roll are > 20 degrees', fontsize=8)
        ax[ii].legend()
        fig.savefig(str(os.path.join(save_dir, 'current_plot' + group_num)), dpi=150, bbox_inches='tight')

        # Plot u, v and w components
        ax0[ii].plot(tm, v, color='b', label='V', **dashed)
        ax0[ii].plot(tm, u, color='g', label='U', **dashed)
        ax0[ii].plot(tm, w, color='r', label='W', **dashed)
        prepare_axis(tm, deployment, ax0[ii], ii, num_plots)
        if ii == 0:
            ax0[ii].set_title(r + ' - Velocity Components' + w_unit, fontsize=8)
        ax0[ii].legend()
        fig0.savefig(str(os.path.join(save_dir, 'uv_plots' + group_num)), dpi=150)