# NOTE: imports reconstructed for these scripts; the cf/pf/cd/gt module paths
# are assumed from the data-review-tools repo layout (functions/common.py,
# functions/plotting.py, functions/combine_datasets.py,
# functions/group_by_timerange.py). index_dataset and index_dataset_2d are
# helpers defined elsewhere in this script (see the sketch after this function).
import os
import itertools
import numpy as np
import pandas as pd
import xarray as xr
from matplotlib import cm
from matplotlib.patches import Rectangle
import functions.common as cf
import functions.plotting as pf
import functions.combine_datasets as cd
import functions.group_by_timerange as gt


def main(sDir, plotting_sDir, url_list, sd_calc):
    dr = pd.read_csv('https://datareview.marine.rutgers.edu/notes/export')
    drn = dr.loc[dr.type == 'exclusion']

    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = []

        # get the preferred stream information
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        pms = []
        for index, row in ps_df.iterrows():
            for ii in range(n_streams):
                try:
                    rms = '-'.join((r, row[ii]))
                    pms.append(row[ii])
                except TypeError:
                    continue
                for dd in datasets:
                    spl = dd.split('/')[-2].split('-')
                    catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                    fdeploy = dd.split('/')[-1].split('_')[0]
                    if rms == catalog_rms and fdeploy == row['deployment']:
                        fdatasets.append(dd)

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        # find time ranges to exclude from analysis for data review database
        subsite = r.split('-')[0]
        subsite_node = '-'.join((subsite, r.split('-')[1]))
        drne = drn.loc[drn.reference_designator.isin([subsite, subsite_node, r])]
        et = []
        for i, row in drne.iterrows():
            sdate = cf.format_dates(row.start_date)
            edate = cf.format_dates(row.end_date)
            et.append([sdate, edate])

        # get science variable long names from the Data Review Database
        stream_sci_vars = cd.sci_var_long_names(r)

        # check if the science variable long names are the same for each stream
        sci_vars_dict = cd.sci_var_long_names_check(stream_sci_vars)

        # build dictionary of science data from the preferred dataset for each deployment
        print('\nAppending data from files')
        sci_vars_dict, pressure_unit, pressure_name = cd.append_science_data(
            ps_df, n_streams, r, fdatasets_sel, sci_vars_dict, et)

        # analyze combined dataset
        print('\nAnalyzing combined dataset and writing summary file')

        array = subsite[0:2]
        save_dir = os.path.join(sDir, array, subsite)
        cf.create_dir(save_dir)

        rows = []
        if ('FLM' in r) and ('CTDMO' in r):  # calculate Flanking Mooring CTDMO stats based on pressure
            headers = ['common_stream_name', 'preferred_methods_streams', 'deployments', 'long_name', 'units',
                       't0', 't1', 'fill_value', 'global_ranges', 'n_all', 'press_min_max', 'n_excluded_forpress',
                       'n_nans', 'n_fillvalues', 'n_grange', 'define_stdev', 'n_outliers', 'n_stats', 'mean',
                       'min', 'max', 'stdev', 'note']
        else:
            headers = ['common_stream_name', 'preferred_methods_streams', 'deployments', 'long_name', 'units',
                       't0', 't1', 'fill_value', 'global_ranges', 'n_all', 'n_nans', 'n_fillvalues', 'n_grange',
                       'define_stdev', 'n_outliers', 'n_stats', 'mean', 'min', 'max', 'stdev']

        for m, n in sci_vars_dict.items():
            print('\nSTREAM: ', m)
            if m == 'common_stream_placeholder':
                m = 'science_data_stream'
            if m == 'metbk_hourly':  # don't calculate ranges for metbk_hourly
                continue

            if ('FLM' in r) and ('CTDMO' in r):  # calculate Flanking Mooring CTDMO stats based on pressure
                # index the pressure variable to filter and calculate stats on the rest of the variables
                sv_press = 'Seawater Pressure'
                vinfo_press = n['vars'][sv_press]

                # first, index where data are nans, fill values, and outside of global ranges
                fv_press = list(np.unique(vinfo_press['fv']))[0]
                pdata = vinfo_press['values']
                [pind, __, __, __, __, __] = index_dataset(r, vinfo_press['var_name'], pdata, fv_press)

                pdata_filtered = pdata[pind]
                [__, pmean, __, __, psd, __] = cf.variable_statistics(pdata_filtered, None)

                # index of pressure = average of all 'valid' pressure data +/- 1 SD
                ipress_min = pmean - psd
                ipress_max = pmean + psd
                ind_press = (pdata >= ipress_min) & (pdata <= ipress_max)

                # calculate stats for all variables
                print('\nPARAMETERS:')
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    fv_lst = np.unique(vinfo['fv']).tolist()
                    if len(fv_lst) == 1:
                        fill_value = fv_lst[0]
                    else:
                        print('No unique fill value for {}'.format(sv))

                    lunits = np.unique(vinfo['units']).tolist()
                    n_all = len(vinfo['t'])

                    # filter data based on pressure index
                    t_filtered = vinfo['t'][ind_press]
                    data_filtered = vinfo['values'][ind_press]
                    deploy_filtered = vinfo['deployments'][ind_press]
                    n_excluded = n_all - len(t_filtered)

                    [dataind, g_min, g_max, n_nan, n_fv, n_grange] = index_dataset(
                        r, vinfo['var_name'], data_filtered, fill_value)

                    t_final = t_filtered[dataind]
                    data_final = data_filtered[dataind]
                    deploy_final = deploy_filtered[dataind]

                    t0 = pd.to_datetime(min(t_final)).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(max(t_final)).strftime('%Y-%m-%dT%H:%M:%S')
                    deploy = list(np.unique(deploy_final))
                    deployments = [int(dd) for dd in deploy]

                    if len(data_final) > 1:
                        [num_outliers, mean, vmin, vmax, sd, n_stats] = cf.variable_statistics(data_final, sd_calc)
                    else:
                        num_outliers = None  # fix: was left unset, raising NameError in rows.append below
                        mean = None
                        vmin = None
                        vmax = None
                        sd = None
                        n_stats = None

                    note = 'restricted stats calculation to data points where pressure is within defined ranges' \
                           ' (average of all pressure data +/- 1 SD)'

                    rows.append([m, list(np.unique(pms)), deployments, sv, lunits, t0, t1, fv_lst,
                                 [g_min, g_max], n_all, [round(ipress_min, 2), round(ipress_max, 2)], n_excluded,
                                 n_nan, n_fv, n_grange, sd_calc, num_outliers, n_stats, mean, vmin, vmax, sd, note])

                    # plot CTDMO data used for stats
                    psave_dir = os.path.join(plotting_sDir, array, subsite, r, 'timeseries_plots_stats')
                    cf.create_dir(psave_dir)

                    dr_data = cf.refdes_datareview_json(r)
                    deployments = []
                    end_times = []
                    for index, row in ps_df.iterrows():
                        deploy = row['deployment']
                        deploy_info = cf.get_deployment_information(dr_data, int(deploy[-4:]))
                        deployments.append(int(deploy[-4:]))
                        end_times.append(pd.to_datetime(deploy_info['stop_date']))

                    sname = '-'.join((r, sv))
                    fig, ax = pf.plot_timeseries_all(t_final, data_final, sv, lunits[0], stdev=None)
                    ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                 fontsize=8)
                    for etimes in end_times:
                        ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                    pf.save_fig(psave_dir, sname)

                    if sd_calc:
                        sname = '-'.join((r, sv, 'rmoutliers'))
                        fig, ax = pf.plot_timeseries_all(t_final, data_final, sv, lunits[0], stdev=sd_calc)
                        ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                     fontsize=8)
                        for etimes in end_times:
                            ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                        pf.save_fig(psave_dir, sname)
            else:
                if not sd_calc:
                    sdcalc = None
                print('\nPARAMETERS: ')
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    fv_lst = np.unique(vinfo['fv']).tolist()
                    if len(fv_lst) == 1:
                        fill_value = fv_lst[0]
                    else:
                        print(fv_lst)
                        print('No unique fill value for {}'.format(sv))

                    lunits = np.unique(vinfo['units']).tolist()

                    t = vinfo['t']
                    if len(t) > 1:
                        data = vinfo['values']
                        n_all = len(t)

                        if 'SPKIR' in r or 'presf_abc_wave_burst' in m:
                            if 'SPKIR' in r:
                                [dd_data, g_min, g_max, n_nan, n_fv, n_grange] = index_dataset_2d(
                                    r, 'spkir_abj_cspp_downwelling_vector', data, fill_value)
                            else:
                                [dd_data, g_min, g_max, n_nan, n_fv, n_grange] = index_dataset_2d(
                                    r, 'presf_wave_burst_pressure', data, fill_value)
                            t_final = t
                            t0 = pd.to_datetime(min(t_final)).strftime('%Y-%m-%dT%H:%M:%S')
                            t1 = pd.to_datetime(max(t_final)).strftime('%Y-%m-%dT%H:%M:%S')
                            deploy_final = vinfo['deployments']
                            deploy = list(np.unique(deploy_final))
                            deployments = [int(dd) for dd in deploy]

                            num_outliers = []
                            mean = []
                            vmin = []
                            vmax = []
                            sd = []
                            n_stats = []
                            for i in range(len(dd_data)):
                                dd = data[i]
                                # drop nans before calculating stats
                                dd = dd[~np.isnan(dd)]
                                [num_outliersi, meani, vmini, vmaxi, sdi, n_statsi] = cf.variable_statistics(
                                    dd, sd_calc)
                                num_outliers.append(num_outliersi)
                                mean.append(meani)
                                vmin.append(vmini)
                                vmax.append(vmaxi)
                                sd.append(sdi)
                                n_stats.append(n_statsi)
                        else:
                            [dataind, g_min, g_max, n_nan, n_fv, n_grange] = index_dataset(
                                r, vinfo['var_name'], data, fill_value)

                            t_final = t[dataind]
                            if len(t_final) > 0:
                                t0 = pd.to_datetime(min(t_final)).strftime('%Y-%m-%dT%H:%M:%S')
                                t1 = pd.to_datetime(max(t_final)).strftime('%Y-%m-%dT%H:%M:%S')

                                data_final = data[dataind]
                                # if sv == 'Dissolved Oxygen Concentration':
                                #     xx = (data_final > 0) & (data_final < 400)
                                #     data_final = data_final[xx]
                                #     t_final = t_final[xx]
                                # if sv == 'Seawater Conductivity':
                                #     xx = (data_final > 1) & (data_final < 400)
                                #     data_final = data_final[xx]
                                #     t_final = t_final[xx]

                                deploy_final = vinfo['deployments'][dataind]
                                deploy = list(np.unique(deploy_final))
                                deployments = [int(dd) for dd in deploy]

                                if len(data_final) > 1:
                                    [num_outliers, mean, vmin, vmax, sd, n_stats] = cf.variable_statistics(
                                        data_final, sd_calc)
                                else:
                                    sdcalc = None
                                    num_outliers = None
                                    mean = None
                                    vmin = None
                                    vmax = None
                                    sd = None
                                    n_stats = None
                            else:
                                sdcalc = None
                                num_outliers = None
                                mean = None
                                vmin = None
                                vmax = None
                                sd = None
                                n_stats = None
                                deployments = None
                                t0 = None
                                t1 = None
                    else:
                        sdcalc = None
                        num_outliers = None
                        mean = None
                        vmin = None
                        vmax = None
                        sd = None
                        n_stats = None
                        deployments = None
                        t0 = None
                        t1 = None
                        t_final = []
                        # fix: referenced in rows.append below but never set on this path
                        n_all = len(t)
                        g_min = None
                        g_max = None
                        n_nan = None
                        n_fv = None
                        n_grange = None

                    if sd_calc:
                        print_sd = sd_calc
                    else:
                        print_sd = sdcalc

                    rows.append([m, list(np.unique(pms)), deployments, sv, lunits, t0, t1, fv_lst,
                                 [g_min, g_max], n_all, n_nan, n_fv, n_grange, print_sd, num_outliers, n_stats,
                                 mean, vmin, vmax, sd])

                    if len(t_final) > 0:
                        # plot data used for stats
                        psave_dir = os.path.join(plotting_sDir, array, subsite, r, 'timeseries_reviewed_datarange')
                        cf.create_dir(psave_dir)

                        dr_data = cf.refdes_datareview_json(r)
                        deployments = []
                        end_times = []
                        for index, row in ps_df.iterrows():
                            deploy = row['deployment']
                            deploy_info = cf.get_deployment_information(dr_data, int(deploy[-4:]))
                            deployments.append(int(deploy[-4:]))
                            end_times.append(pd.to_datetime(deploy_info['stop_date']))

                        sname = '-'.join((r, sv))

                        # plot hourly averages for streaming data
                        if 'streamed' in sci_vars_dict[list(sci_vars_dict.keys())[0]]['ms'][0]:
                            sname = '-'.join((sname, 'hourlyavg'))
                            df = pd.DataFrame({'dfx': t_final, 'dfy': data_final})
                            dfr = df.resample('H', on='dfx').mean()

                            # Plot all data
                            fig, ax = pf.plot_timeseries_all(dfr.index, dfr['dfy'], sv, lunits[0], stdev=None)
                            ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                         fontsize=8)
                            for etimes in end_times:
                                ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                            pf.save_fig(psave_dir, sname)

                            if sd_calc:
                                sname = '-'.join((sname, 'hourlyavg_rmoutliers'))
                                fig, ax = pf.plot_timeseries_all(dfr.index, dfr['dfy'], sv, lunits[0], stdev=sd_calc)
                                ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                             fontsize=8)
                                for etimes in end_times:
                                    ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                                pf.save_fig(psave_dir, sname)
                        elif 'SPKIR' in r:
                            fig, ax = pf.plot_spkir(t_final, dd_data, sv, lunits[0])
                            ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                         fontsize=8)
                            for etimes in end_times:
                                ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                            pf.save_fig(psave_dir, sname)

                            # plot each wavelength
                            wavelengths = ['412nm', '443nm', '490nm', '510nm', '555nm', '620nm', '683nm']
                            for wvi in range(len(dd_data)):
                                fig, ax = pf.plot_spkir_wv(t_final, dd_data[wvi], sv, lunits[0], wvi)
                                ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                             fontsize=8)
                                for etimes in end_times:
                                    ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                                snamewvi = '-'.join((sname, wavelengths[wvi]))
                                pf.save_fig(psave_dir, snamewvi)
                        elif 'presf_abc_wave_burst' in m:
                            fig, ax = pf.plot_presf_2d(t_final, dd_data, sv, lunits[0])
                            ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                         fontsize=8)
                            for etimes in end_times:
                                ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                            snamewave = '-'.join((sname, m))
                            pf.save_fig(psave_dir, snamewave)
                        else:
                            # plot all data if not streamed
                            fig, ax = pf.plot_timeseries_all(t_final, data_final, sv, lunits[0], stdev=None)
                            ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                         fontsize=8)
                            for etimes in end_times:
                                ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                            pf.save_fig(psave_dir, sname)

                            if sd_calc:
                                sname = '-'.join((r, sv, 'rmoutliers'))
                                fig, ax = pf.plot_timeseries_all(t_final, data_final, sv, lunits[0], stdev=sd_calc)
                                ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1),
                                             fontsize=8)
                                for etimes in end_times:
                                    ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                                pf.save_fig(psave_dir, sname)

        fsum = pd.DataFrame(rows, columns=headers)
        fsum.to_csv('{}/{}_data_ranges.csv'.format(save_dir, r), index=False)
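
# ---------------------------------------------------------------------------
# index_dataset (used above) is defined elsewhere in this script. As a reader
# reference, here is a minimal sketch of its assumed 1D contract: return a
# boolean keep-mask that drops NaNs, fill values, and points outside the
# global ranges, plus the counts of each rejection. The global-range lookup is
# stubbed with a keyword argument here; the real code presumably queries the
# QC table via cf.get_global_ranges(refdes, var_name).
def _index_dataset_sketch(refdes, var_name, data, fill_value, global_range=(-5.0, 50.0)):
    import numpy as np
    g_min, g_max = global_range  # stand-in for the QC-table lookup
    nan_mask = ~np.isnan(data)
    fv_mask = data != fill_value
    gr_mask = (data >= g_min) & (data <= g_max)
    mask = nan_mask & fv_mask & gr_mask
    n_nan = int(np.sum(~nan_mask))
    n_fv = int(np.sum(~fv_mask))
    n_grange = int(np.sum(~gr_mask))
    return [mask, g_min, g_max, n_nan, n_fv, n_grange]

# Example (hypothetical values):
#   import numpy as np
#   mask = _index_dataset_sketch('GI03FLMA-RIM01-02-CTDMOG040', 'seawater_temperature',
#                                np.array([5.1, np.nan, -9999999.0]), -9999999.0)[0]
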
def main(url_list, sDir, plot_type, start_time, end_time, deployment_num):
    for i, u in enumerate(url_list):
        elements = u.split('/')[-2].split('-')
        r = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = u.split(r + '-')[1].split('/')[0]
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        datasets = cf.get_nc_urls([u])
        datasets_sel = cf.filter_collocated_instruments(main_sensor, datasets)

        save_dir = os.path.join(sDir, array, subsite, r, plot_type)
        cf.create_dir(save_dir)
        sname = '-'.join((r, ms, 'track'))

        print('Appending....')
        sh = pd.DataFrame()
        deployments = []
        end_times = []
        for ii, d in enumerate(datasets_sel):
            print('\nDataset {} of {}: {}'.format(ii + 1, len(datasets_sel), d.split('/')[-1]))
            ds = xr.open_dataset(d, mask_and_scale=False)
            ds = ds.swap_dims({'obs': 'time'})

            if start_time is not None and end_time is not None:
                ds = ds.sel(time=slice(start_time, end_time))
                if len(ds['time'].values) == 0:
                    print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                    continue

            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(d)

            if deployment_num is not None:
                # fix: 'is not' compared object identity, not integer value
                if int(deployment.split('0')[-1]) != deployment_num:
                    print('Skipping {}: deployment number does not match {}'.format(deployment, deployment_num))
                    continue

            # get end times of deployments
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            dr_data = cf.refdes_datareview_json(r)
            for index, row in ps_df.iterrows():
                deploy = row['deployment']
                deploy_info = cf.get_deployment_information(dr_data, int(deploy[-4:]))
                if int(deploy[-4:]) not in deployments:
                    deployments.append(int(deploy[-4:]))
                if pd.to_datetime(deploy_info['stop_date']) not in end_times:
                    end_times.append(pd.to_datetime(deploy_info['stop_date']))

            data = {'lat': ds['lat'].values, 'lon': ds['lon'].values}
            new_r = pd.DataFrame(data, columns=['lat', 'lon'], index=ds['time'].values)
            sh = sh.append(new_r)

        xD = sh.lon.values
        yD = sh.lat.values
        tD = sh.index.values
        clabel = 'Time'
        ylabel = 'Latitude'
        xlabel = 'Longitude'

        fig, ax = pf.plot_profiles(xD, yD, tD, ylabel, xlabel, clabel, end_times, deployments, stdev=None)
        ax.invert_yaxis()
        ax.set_title('Glider Track - ' + r + '\n' + 'x: platform location', fontsize=9)
        ax.set_xlim(-71.75, -69.75)
        ax.set_ylim(38.75, 40.75)
        # cbar.ax.set_yticklabels(end_times)

        # add Pioneer glider sampling area
        ax.add_patch(Rectangle((-71.5, 39.0), 1.58, 1.67, linewidth=3, edgecolor='b', facecolor='none'))
        ax.text(-71, 40.6, 'Pioneer Glider Sampling Area', color='blue', fontsize=8)
        # add Pioneer AUV sampling area
        # ax.add_patch(Rectangle((-71.17, 39.67), 0.92, 1.0, linewidth=3, edgecolor='m', facecolor='none'))

        array_loc = cf.return_array_subsites_standard_loc(array)
        ax.scatter(array_loc.lon, array_loc.lat, s=40, marker='x', color='k', alpha=0.3)
        # ax.legend(legn, array_loc.index, scatterpoints=1, loc='lower left', ncol=4, fontsize=8)

        pf.save_fig(save_dir, sname)
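
# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the track-building step above: each
# deployment file contributes a time-indexed lat/lon frame, and the frames are
# accumulated into one DataFrame. DataFrame.append (used above) is deprecated
# in modern pandas; collecting frames in a list and calling pd.concat once is
# the equivalent, and avoids quadratic copying. Values below are made up.
def _build_track_sketch():
    import numpy as np
    import pandas as pd
    frames = []
    for day in ('2016-01-01', '2016-01-02'):
        times = pd.date_range(day, periods=3, freq='H')
        frames.append(pd.DataFrame({'lat': np.linspace(39.5, 39.6, 3),
                                    'lon': np.linspace(-71.2, -71.1, 3)}, index=times))
    sh = pd.concat(frames)  # one concatenation instead of repeated .append
    return sh.lon.values, sh.lat.values, sh.index.values
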
def main(url_list, sDir, mDir, zcell_size, zdbar, start_time, end_time, inpercentile):
    """
    url_list     : paths to instrument data by delivery method
    sDir         : path to the directory on your machine to save plots
    mDir         : path to the directory on your machine to save data ranges
    zcell_size   : depth cell size to group data
    zdbar        : define depth where suspect data are identified
    start_time   : select start date to slice timeseries
    end_time     : select end date to slice timeseries
    inpercentile : upper and lower percentile used to color/flag suspect data
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get the preferred stream information
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get science variable long names from the Data Review Database
        stream_sci_vars = cd.sci_var_long_names(r)

        # check if the science variable long names are the same for each stream and initialize empty arrays
        sci_vars_dict0 = cd.sci_var_long_names_check(stream_sci_vars)

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # select the list of data files from the preferred dataset for each deployment
        fdatasets_final = []
        for ii in range(len(ps_df)):
            for x in fdatasets:
                if ps_df['deployment'][ii] in x and ps_df[0][ii] in x:
                    fdatasets_final.append(x)

        # build dictionary of science data from the preferred dataset for each deployment
        print('\nAppending data from files')
        sci_vars_dict, y_unit, y_name, l0 = cd.append_evaluated_science_data(
            sDir, ps_df, n_streams, r, fdatasets_final, sci_vars_dict0, zdbar, start_time, end_time)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = cf.get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # create data range output folders
        save_dir_stat = os.path.join(mDir, array, subsite)
        cf.create_dir(save_dir_stat)
        # create plots output folder
        save_fdir = os.path.join(sDir, array, subsite, r, 'data_range')
        cf.create_dir(save_fdir)
        stat_df = pd.DataFrame()

        """
        create data ranges csv file and figures
        """
        for m, n in sci_vars_dict.items():
            for sv, vinfo in n['vars'].items():
                print('\n' + vinfo['var_name'])
                if len(vinfo['t']) < 1:
                    print('no variable data to plot')
                    continue
                else:
                    sv_units = vinfo['units'][0]
                    fv = vinfo['fv'][0]
                    t = vinfo['t']
                    z = vinfo['values']
                    y = vinfo['pressure']

                    # Check if the array is all NaNs
                    if sum(np.isnan(z)) == len(z):
                        print('Array of all NaNs - skipping plot.')
                        continue
                    # Check if the array is all fill values
                    elif len(z[z != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue
                    else:
                        if len(y) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((vinfo['var_name'], r))
                            else:
                                sname = '-'.join((vinfo['var_name'], r, m))

                            """
                            create data ranges for non-pressure data only
                            """
                            if 'pressure' in vinfo['var_name']:
                                pass
                            else:
                                columns = ['tsec', 'dbar', str(vinfo['var_name'])]
                                # create depth ranges
                                min_r = int(round(min(y) - zcell_size))
                                max_r = int(round(max(y) + zcell_size))
                                ranges = list(range(min_r, max_r, zcell_size))

                                # group data by depth
                                groups, d_groups = gt.group_by_depth_range(t, y, z, columns, ranges)

                                print('writing data ranges for {}'.format(vinfo['var_name']))
                                stat_data = groups.describe()[vinfo['var_name']]
                                stat_data.insert(loc=0, column='parameter', value=sv, allow_duplicates=False)
                                t_deploy = ', '.join(str(d) for d in deployments)
                                stat_data.insert(loc=1, column='deployments', value=t_deploy, allow_duplicates=False)
                                stat_df = stat_df.append(stat_data, ignore_index=False)

                            """
                            plot full time range free from errors and suspect data
                            """
                            clabel = sv + " (" + sv_units + ")"
                            ylabel = (y_name[0][0] + " (" + y_unit[0][0] + ")")
                            t_eng = None
                            m_water_depth = None

                            # plot data free of erroneous and suspect values
                            fig, ax, bar = pf.plot_xsection(subsite, t, y, z, clabel, ylabel, t_eng, m_water_depth,
                                                            inpercentile, stdev=None)

                            title0 = 'Data colored using the upper and lower {} percentile.'.format(inpercentile)
                            ax.set_title(r + '\n' + title0, fontsize=9)

                            # fix: len(t) / l0 is the fraction retained, so the removed
                            # percentage is its complement
                            leg_text = ('{} % erroneous values removed after Human In the Loop review'.format(
                                round((1 - len(t) / l0) * 100, 2)),)
                            ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)

                            for ii in range(len(end_times)):
                                ax.axvline(x=end_times[ii], color='b', linestyle='--', linewidth=.8)
                                ax.text(end_times[ii], min(y) - 5, 'End' + str(deployments[ii]),
                                        fontsize=6, style='italic',
                                        bbox=dict(boxstyle='round', ec=(0., 0.5, 0.5), fc=(1., 1., 1.)))

                            # fig.tight_layout()
                            sfile = '_'.join(('data_range', sname))
                            pf.save_fig(save_fdir, sfile)

        # write stat file
        stat_df.to_csv('{}/{}_data_ranges.csv'.format(save_dir_stat, r), index=True, float_format='%11.6f')
# inpercentile is used in the body but was missing from the original signature;
# it is added here so the function is self-consistent.
def main(url_list, sDir, mDir, zcell_size, zdbar, start_time, end_time, inpercentile):
    """
    url_list     : paths to instrument data by delivery method
    sDir         : path to the directory on your machine to save plots
    mDir         : path to the directory on your machine to save data ranges
    zcell_size   : depth cell size to group data
    zdbar        : define depth where suspect data are identified
    start_time   : select start date to slice timeseries
    end_time     : select end date to slice timeseries
    inpercentile : upper and lower percentile used to flag suspect data
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    '''
    separate different instruments
    '''
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get the preferred stream information
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get science variable long names from the Data Review Database
        stream_sci_vars = cd.sci_var_long_names(r)
        # stream_vars = cd.var_long_names(r)

        # check if the science variable long names are the same for each stream and initialize empty arrays
        sci_vars_dict0 = cd.sci_var_long_names_check(stream_sci_vars)

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # select the list of data files from the preferred dataset for each deployment
        fdatasets_final = []
        for ii in range(len(ps_df)):
            for x in fdatasets:
                if ps_df['deployment'][ii] in x and ps_df[0][ii] in x:
                    fdatasets_final.append(x)

        # build dictionary of science data from the preferred dataset for each deployment
        print('\nAppending data from files')
        et = []
        sci_vars_dict, y_unit, y_name = cd.append_evaluated_science_data(
            sDir, ps_df, n_streams, r, fdatasets_final, sci_vars_dict0, et, start_time, end_time)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = cf.get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        """
        create a data-ranges table and figure for full data time range
        """
        # create a folder to save data ranges
        save_dir_stat = os.path.join(mDir, array, subsite)
        cf.create_dir(save_dir_stat)

        save_fdir = os.path.join(sDir, array, subsite, r, 'data_range')
        cf.create_dir(save_fdir)
        stat_df = pd.DataFrame()

        for m, n in sci_vars_dict.items():
            for sv, vinfo in n['vars'].items():
                print(vinfo['var_name'])
                if len(vinfo['t']) < 1:
                    print('no variable data to plot')
                    continue
                else:
                    sv_units = vinfo['units'][0]
                    fv = vinfo['fv'][0]
                    t = vinfo['t']
                    z = vinfo['values']
                    y = vinfo['pressure']

                    # Check if the array is all NaNs
                    if sum(np.isnan(z)) == len(z):
                        print('Array of all NaNs - skipping plot.')
                        continue
                    # Check if the array is all fill values
                    elif len(z[z != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue
                    else:
                        """
                        clean up data
                        """
                        # reject erroneous data
                        dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max = \
                            cf.reject_erroneous_data(r, sv, t, y, z, fv)

                        # reject timestamps from stat analysis
                        Dpath = '{}/{}/{}/{}/{}'.format(sDir, array, subsite, r, 'time_to_exclude')
                        onlyfiles = []
                        for item in os.listdir(Dpath):
                            if not item.startswith('.') and os.path.isfile(os.path.join(Dpath, item)):
                                onlyfiles.append(os.path.join(Dpath, item))

                        dre = pd.DataFrame()
                        for nn in onlyfiles:
                            dr = pd.read_csv(nn)
                            dre = dre.append(dr, ignore_index=True)

                        drn = dre.loc[dre['Unnamed: 0'] == vinfo['var_name']]
                        list_time = []
                        for itime in drn.time_to_exclude:
                            ntime = itime.split(', ')
                            list_time.extend(ntime)

                        u_time_list = np.unique(list_time)
                        if len(u_time_list) != 0:
                            t_nospct, z_nospct, y_nospct = cf.reject_suspect_data(dtime, zpressure, ndata, u_time_list)
                        else:
                            # fix: nothing flagged for this variable; the originals were
                            # otherwise referenced below without being defined
                            t_nospct, z_nospct, y_nospct = dtime, ndata, zpressure

                        print('{} using {} percentile of data grouped in {} dbar segments'.format(
                            len(zpressure) - len(z_nospct), inpercentile, zcell_size))

                        # reject time range from data portal file export
                        t_portal, z_portal, y_portal = cf.reject_timestamps_dataportal(
                            subsite, r, t_nospct, y_nospct, z_nospct)
                        print('{} using visual inspection of data'.format(len(z_nospct) - len(z_portal)))

                        # reject data in a depth range
                        if zdbar is not None:
                            y_ind = y_portal < zdbar
                            t_array = t_portal[y_ind]
                            y_array = y_portal[y_ind]
                            z_array = z_portal[y_ind]
                        else:
                            y_ind = []
                            t_array = t_portal
                            y_array = y_portal
                            z_array = z_portal
                        # fix: report the number of excluded points, not the mask length
                        print('{} in water depth > {} dbar'.format(len(y_portal) - len(y_array), zdbar))

                        if len(y_array) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((vinfo['var_name'], r))
                            else:
                                sname = '-'.join((vinfo['var_name'], r, m))

                            """
                            create data ranges for non-pressure data only
                            """
                            if 'pressure' in vinfo['var_name']:
                                pass
                            else:
                                columns = ['tsec', 'dbar', str(vinfo['var_name'])]
                                # create depth ranges
                                min_r = int(round(min(y_array) - zcell_size))
                                max_r = int(round(max(y_array) + zcell_size))
                                ranges = list(range(min_r, max_r, zcell_size))

                                # group data by depth
                                groups, d_groups = gt.group_by_depth_range(t_array, y_array, z_array, columns, ranges)

                                print('writing data ranges for {}'.format(vinfo['var_name']))
                                stat_data = groups.describe()[vinfo['var_name']]
                                stat_data.insert(loc=0, column='parameter', value=sv, allow_duplicates=False)
                                t_deploy = ', '.join(str(d) for d in deployments)
                                stat_data.insert(loc=1, column='deployments', value=t_deploy, allow_duplicates=False)
                                stat_df = stat_df.append(stat_data, ignore_index=True)

                            """
                            plot full time range free from errors and suspect data
                            """
                            clabel = sv + " (" + sv_units + ")"
                            ylabel = (y_name[0][0] + " (" + y_unit[0][0] + ")")
                            title = ' '.join((r, m))

                            # plot data free of erroneous and suspect values
                            fig, ax, bar = pf.plot_xsection(subsite, t_array, y_array, z_array, clabel, ylabel,
                                                            inpercentile=None, stdev=None)
                            ax.set_title(title, fontsize=9)
                            leg_text = (
                                'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}]'.format(
                                    len(z) - lenfv, len(z) - lennan, len(z) - lenev, lengr, global_min, global_max) +
                                '\n' +
                                'removed {} in the upper and lower {} percentile of data grouped in {} dbar segments'.format(
                                    len(zpressure) - len(z_nospct), inpercentile, zcell_size),)
                            ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)

                            for ii in range(len(end_times)):
                                ax.axvline(x=end_times[ii], color='b', linestyle='--', linewidth=.8)
                                ax.text(end_times[ii], min(y_array) - 5, 'End' + str(deployments[ii]),
                                        fontsize=6, style='italic',
                                        bbox=dict(boxstyle='round', ec=(0., 0.5, 0.5), fc=(1., 1., 1.)))

                            fig.tight_layout()
                            sfile = '_'.join(('data_range', sname))
                            pf.save_fig(save_fdir, sfile)

        # write stat file
        stat_df.to_csv('{}/{}_data_ranges.csv'.format(save_dir_stat, r), index=True, float_format='%11.6f')
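
# ---------------------------------------------------------------------------
# A sketch of the suspect-timestamp rejection assumed above: the
# *_excluded_timestamps.csv files store one comma-separated string of
# timestamps per variable, and cf.reject_suspect_data presumably keeps only
# the rows whose timestamps are not in that list. np.isin over datetime
# indexes is one way to implement that contract; array inputs are assumed
# to be numpy arrays aligned on the time axis.
def _reject_suspect_sketch(dtime, zpressure, ndata, u_time_list):
    import numpy as np
    import pandas as pd
    exclude = pd.to_datetime(np.asarray(u_time_list))
    keep = ~np.isin(pd.to_datetime(dtime), exclude)
    # returns (t_nospct, z_nospct, y_nospct), matching the call order above
    return dtime[keep], ndata[keep], zpressure[keep]
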
def main(url_list, sDir, plot_type):
    """
    url_list  : paths to instrument data by delivery method
    sDir      : path to the directory on your machine to save files
    plot_type : folder name for a plot type
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    '''
    separate different instruments
    '''
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get preferred stream
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = cf.get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        '''
        separate data files by methods
        '''
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a folder to save figures
            save_dir = os.path.join(sDir, array, subsite, r, plot_type, ms.split('-')[0])
            cf.create_dir(save_dir)

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)
            print('\nAppending data from files: {}'.format(ms))
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print('\nAppending data file: {}'.format(fd.split('/')[-1]))
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                            else:
                                pressure = 'int_ctd_pressure'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                            y = ds[pressure].values

                        sh['pressure'] = np.append(sh['pressure'], y)

                        try:
                            ds[pressure].units
                            if ds[pressure].units not in y_unit:
                                y_unit.append(ds[pressure].units)
                        except AttributeError:
                            print('pressure attributes missing units')
                            if 'pressure unit missing' not in y_unit:
                                y_unit.append('pressure unit missing')

                        try:
                            ds[pressure].long_name
                            if ds[pressure].long_name not in y_name:
                                y_name.append(ds[pressure].long_name)
                        except AttributeError:
                            print('pressure attributes missing long_name')
                            if 'pressure long name missing' not in y_name:
                                y_name.append('pressure long name missing')

            # create a csv file with diagnostic results:
            if len(y_unit) != 1:
                print('pressure unit varies')
                if 'dbar' in y_unit:
                    y_unit = 'dbar'
                print(y_unit)
            else:
                y_unit = y_unit[0]

            if len(y_name) != 1:
                print('pressure long name varies')
                if 'Seawater Pressure' in y_name:
                    y_name = 'Seawater Pressure'
                print(y_name)
            else:
                y_name = y_name[0]

            # create a folder to save variables statistics
            mDir = '/Users/leila/Documents/NSFEduSupport/github/data-review-tools/data_review/final_stats'
            save_dir_stat = os.path.join(mDir, array, subsite)
            cf.create_dir(save_dir_stat)

            stat_df = pd.DataFrame()
            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        fv = vinfo['fv'][0]
                        t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t = vinfo['t']
                        z = vinfo['values']
                        y = vinfo['pressure']
                        title = ' '.join((r, ms))

                        # Check if the array is all NaNs
                        if sum(np.isnan(z)) == len(z):
                            print('Array of all NaNs - skipping plot.')
                            continue
                        # Check if the array is all fill values
                        elif len(z[z != fv]) == 0:
                            print('Array of all fill values - skipping plot.')
                            continue
                        else:
                            # reject fill values
                            fv_ind = z != fv
                            y_nofv = y[fv_ind]
                            t_nofv = t[fv_ind]
                            z_nofv = z[fv_ind]
                            # fix: count rejected points; len(z) - len(mask) is always 0
                            print(len(z) - int(np.sum(fv_ind)), ' fill values')

                            # reject NaNs
                            nan_ind = ~np.isnan(z_nofv)
                            t_nofv_nonan = t_nofv[nan_ind]
                            y_nofv_nonan = y_nofv[nan_ind]
                            z_nofv_nonan = z_nofv[nan_ind]
                            print(len(z_nofv) - int(np.sum(nan_ind)), ' NaNs')

                            # reject extreme values (assumes reject_extreme_values returns a boolean mask)
                            ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                            t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                            y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                            z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                            print(len(z_nofv_nonan) - int(np.sum(ev_ind)), ' Extreme Values', '|1e7|')

                            # reject values outside global ranges:
                            global_min, global_max = cf.get_global_ranges(r, sv)
                            # platform not in qc-table (parad_k_par)
                            # global_min = 0
                            # global_max = 2500
                            print('global ranges for : {}-{} {} - {}'.format(r, sv, global_min, global_max))
                            if isinstance(global_min, (int, float)) and isinstance(global_max, (int, float)):
                                gr_ind = cf.reject_global_ranges(z_nofv_nonan_noev, global_min, global_max)
                                t_nofv_nonan_noev_nogr = t_nofv_nonan_noev[gr_ind]
                                y_nofv_nonan_noev_nogr = y_nofv_nonan_noev[gr_ind]
                                z_nofv_nonan_noev_nogr = z_nofv_nonan_noev[gr_ind]
                            else:
                                t_nofv_nonan_noev_nogr = t_nofv_nonan_noev
                                y_nofv_nonan_noev_nogr = y_nofv_nonan_noev
                                z_nofv_nonan_noev_nogr = z_nofv_nonan_noev

                            if len(z_nofv_nonan_noev) > 0:
                                if m == 'common_stream_placeholder':
                                    sname = '-'.join((r, sv))
                                else:
                                    sname = '-'.join((r, m, sv))

                                # group by depth range
                                sname = '_'.join((sname, sv_units))

                                # if sv != 'pressure':
                                #     columns = ['tsec', 'dbar', str(sv)]
                                #     # select depth bin size for the data group function
                                #     bin_size = 10
                                #     min_r = int(round(min(y_nofv_nonan_noev) - bin_size))
                                #     max_r = int(round(max(y_nofv_nonan_noev) + bin_size))
                                #     ranges = list(range(min_r, max_r, bin_size))
                                #     groups, d_groups = gt.group_by_depth_range(t_nofv_nonan_noev_nogr,
                                #                                                y_nofv_nonan_noev_nogr,
                                #                                                z_nofv_nonan_noev_nogr,
                                #                                                columns, ranges)
                                #     if (ms.split('-')[0]) == (ps_df[0].values[0].split('-')[0]):
                                #         if 'pressure' not in sv:
                                #             print('final_stats_{}-{}-{}-{}'.format(
                                #                 r, ms.split('-')[0], ps_df[0].values[0].split('-')[0], sv))
                                #             stat_data = groups.describe()[sv]
                                #             stat_data.insert(loc=0, column='parameter', value=sv,
                                #                              allow_duplicates=False)
                                #             stat_df = stat_df.append(stat_data)

                                # if sv == 'optical_backscatter':
                                #     less_ind = z_nofv_nonan_noev < 0.0004
                                #     print(sv, ' < 0.0004', len(less_ind))
                                #     more_ind = z_nofv_nonan_noev > 0.01
                                #     print(sv, ' > 0.01', len(more_ind))

                                # Plot all data
                                clabel = sv + " (" + sv_units + ")"
                                ylabel = y_name + " (" + y_unit + ")"

                                fig, ax = pf.plot_xsection(subsite, t_nofv_nonan_noev, y_nofv_nonan_noev,
                                                           z_nofv_nonan_noev, clabel, ylabel, stdev=None)
                                ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                                pf.save_fig(save_dir, sname)

                                # Plot data with outliers removed
                                fig, ax = pf.plot_xsection(subsite, t_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                                                           z_nofv_nonan_noev_nogr, clabel, ylabel, stdev=5)
                                ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                                sfile = '_'.join((sname, 'rmoutliers'))
                                pf.save_fig(save_dir, sfile)

                                # plot data with excluded time range removed
                                dr = pd.read_csv('https://datareview.marine.rutgers.edu/notes/export')
                                drn = dr.loc[dr.type == 'exclusion']
                                if len(drn) != 0:
                                    subsite_node = '-'.join((subsite, r.split('-')[1]))
                                    drne = drn.loc[drn.reference_designator.isin([subsite, subsite_node, r])]

                                    t_ex = t_nofv_nonan_noev_nogr
                                    y_ex = y_nofv_nonan_noev_nogr
                                    z_ex = z_nofv_nonan_noev_nogr
                                    for i, row in drne.iterrows():
                                        sdate = cf.format_dates(row.start_date)
                                        edate = cf.format_dates(row.end_date)
                                        ts = np.datetime64(sdate)
                                        te = np.datetime64(edate)
                                        ind = np.where((t_ex < ts) | (t_ex > te), True, False)
                                        if len(ind) != 0:
                                            t_ex = t_ex[ind]
                                            z_ex = z_ex[ind]
                                            y_ex = y_ex[ind]

                                    fig, ax = pf.plot_xsection(subsite, t_ex, y_ex, z_ex, clabel, ylabel, stdev=None)
                                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                                    sfile = '_'.join((sname, 'rmsuspectdata'))
                                    pf.save_fig(save_dir, sfile)
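
# ---------------------------------------------------------------------------
# The fill value / NaN / extreme value / global range rejection above is the
# same chain the later scripts wrap in cf.reject_erroneous_data. A compact,
# self-contained sketch of that chain, under the assumptions that extreme
# values are |z| >= 1e7 and the global range is a closed interval:
def _reject_erroneous_sketch(t, y, z, fill_value, global_min=None, global_max=None):
    import numpy as np
    mask = (z != fill_value) & ~np.isnan(z) & (np.abs(z) < 1e7)
    if isinstance(global_min, (int, float)) and isinstance(global_max, (int, float)):
        mask = mask & (z >= global_min) & (z <= global_max)
    # keep time, pressure and data aligned on the same mask
    return t[mask], y[mask], z[mask]
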
def main(url_list, sDir, plot_type, deployment_num, start_time, end_time, method_num, zdbar, n_std,
         inpercentile, zcell_size):
    for i, u in enumerate(url_list):
        print('\nUrl {} of {}: {}'.format(i + 1, len(url_list), u))
        elements = u.split('/')[-2].split('-')
        r = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = u.split(r + '-')[1].split('/')[0]
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        # read URL to get data
        datasets = cf.get_nc_urls([u])
        datasets_sel = cf.filter_collocated_instruments(main_sensor, datasets)

        # get sci data review list
        dr_data = cf.refdes_datareview_json(r)
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = cf.get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # create a dictionary for science variables from analysis file
        stream_sci_vars_dict = dict()
        for x in dr_data['instrument']['data_streams']:
            dr_ms = '-'.join((x['method'], x['stream_name']))
            if ms == dr_ms:
                stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                sci_vars = dict()
                for y in x['stream']['parameters']:
                    if y['data_product_type'] == 'Science Data':
                        sci_vars.update({y['name']: dict(db_units=y['unit'])})
                if len(sci_vars) > 0:
                    stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

        for ii, d in enumerate(datasets_sel):
            part_d = d.split('/')[-1]
            print('\nDataset {} of {}: {}'.format(ii + 1, len(datasets_sel), part_d))
            with xr.open_dataset(d, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(d)

                if method_num is not None:
                    if method != method_num:
                        print('Skipping method {} (requested {})'.format(method, method_num))
                        continue

                if deployment_num is not None:
                    # fix: 'is not' compared object identity, not integer value
                    if int(deployment.split('0')[-1]) != deployment_num:
                        print('Skipping {}: deployment number does not match {}'.format(deployment, deployment_num))
                        continue

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue
                    stime = start_time.strftime('%Y-%m-%d')
                    etime = end_time.strftime('%Y-%m-%d')
                    ext = stime + 'to' + etime
                    save_dir = os.path.join(sDir, array, subsite, refdes, plot_type, ms.split('-')[0],
                                            deployment, ext)
                else:
                    save_dir = os.path.join(sDir, array, subsite, refdes, plot_type, ms.split('-')[0], deployment)
                cf.create_dir(save_dir)

                texclude_dir = os.path.join(sDir, array, subsite, refdes, 'time_to_exclude')
                cf.create_dir(texclude_dir)

                # initialize an empty data array for science variables in dictionary
                sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)

                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)
                        sh['t'] = np.append(sh['t'], ds['time'].values)
                        sh['values'] = np.append(sh['values'], ds[var].values)
                        y, y_unit, y_name = cf.add_pressure_to_dictionary_of_sci_vars(ds)
                        sh['pressure'] = np.append(sh['pressure'], y)

                stat_data = pd.DataFrame(columns=['deployments', 'time_to_exclude'])
                file_exclude = '{}/{}_{}_{}_excluded_timestamps.csv'.format(texclude_dir, deployment, refdes, method)
                stat_data.to_csv(file_exclude, index=True)

                for m, n in sci_vars_dict.items():
                    for sv, vinfo in n['vars'].items():
                        print(sv)
                        if len(vinfo['t']) < 1:
                            print('no variable data to plot')
                        else:
                            sv_units = vinfo['units'][0]
                            fv = vinfo['fv'][0]
                            t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                            t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                            colors = cm.rainbow(np.linspace(0, 1, len(vinfo['t'])))
                            t = vinfo['t']
                            z = vinfo['values']
                            y = vinfo['pressure']

                            # Check if the array is all NaNs
                            if sum(np.isnan(z)) == len(z):
                                print('Array of all NaNs - skipping plot.')
                                continue
                            # Check if the array is all fill values
                            elif len(z[z != fv]) == 0:
                                print('Array of all fill values - skipping plot.')
                                continue
                            else:
                                # reject erroneous data
                                dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max = \
                                    cf.reject_erroneous_data(r, sv, t, y, z, fv)

                                # create data groups
                                columns = ['tsec', 'dbar', str(sv)]
                                min_r = int(round(min(zpressure) - zcell_size))
                                max_r = int(round(max(zpressure) + zcell_size))
                                ranges = list(range(min_r, max_r, zcell_size))
                                groups, d_groups = gt.group_by_depth_range(dtime, zpressure, ndata, columns, ranges)

                                # rejecting timestamps from percentile analysis
                                if 'scatter' in sv:
                                    n_std = None  # use the percentile envelope instead of n standard deviations

                                y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr, time_ex, \
                                    t_nospct, z_nospct, y_nospct = cf.reject_timestamps_in_groups(
                                        groups, d_groups, n_std, dtime, zpressure, ndata, inpercentile)

                                print('{} using {} percentile of data grouped in {} dbar segments'.format(
                                    len(zpressure) - len(z_nospct), inpercentile, zcell_size))

                                """
                                writing timestamps to .csv file to use with data_range.py script
                                """
                                if len(time_ex) != 0:
                                    t_exclude = ', '.join(str(tx) for tx in time_ex)
                                    stat_data = pd.DataFrame({'deployments': deployment,
                                                              'time_to_exclude': t_exclude}, index=[sv])
                                    stat_data.to_csv(file_exclude, index=True, mode='a', header=False)

                                # reject time range from data portal file export
                                t_portal, z_portal, y_portal = cf.reject_timestamps_dataportal(
                                    subsite, r, t_nospct, z_nospct, y_nospct)
                                print('{} using visual inspection of data'.format(len(z_nospct) - len(z_portal)))

                                # reject data in a depth range
                                if zdbar is not None:
                                    y_ind = y_portal < zdbar
                                    t_array = t_portal[y_ind]
                                    y_array = y_portal[y_ind]
                                    z_array = z_portal[y_ind]
                                else:
                                    y_ind = []
                                    t_array = t_portal
                                    y_array = y_portal
                                    z_array = z_portal
                                # fix: report the number of excluded points, not the mask length
                                print('{} in water depth > {} dbar'.format(len(y_portal) - len(y_array), zdbar))

                                """
                                Plot data
                                """
                                if len(t_array) > 0:
                                    if m == 'common_stream_placeholder':
                                        sname = '-'.join((sv, r))
                                    else:
                                        sname = '-'.join((sv, r, m))

                                    xlabel = sv + " (" + sv_units + ")"
                                    ylabel = y_name[0] + " (" + y_unit[0] + ")"
                                    clabel = 'Time'
                                    title = ' '.join((deployment, r, m))

                                    # plot non-erroneous data
                                    fig, ax = pf.plot_profiles(ndata, zpressure, dtime, ylabel, xlabel, clabel,
                                                               end_times, deployments, stdev=None)
                                    ax.set_title(title, fontsize=9)
                                    ax.plot(n_avg, y_avg, '-k')
                                    ax.fill_betweenx(y_avg, n0_std, n1_std, color='m', alpha=0.2)
                                    leg_text = (
                                        'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}]'.format(
                                            len(z) - lenfv, len(z) - lennan, len(z) - lenev, lengr,
                                            global_min, global_max) +
                                        '\n' +
                                        '(black) data average in {} dbar segments'.format(zcell_size) +
                                        '\n' +
                                        '(magenta) upper and lower {} percentile envelope in {} dbar segments'.format(
                                            inpercentile, zcell_size),)
                                    ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                                    fig.tight_layout()
                                    sfile = '_'.join(('rm_erroneous_data', sname))
                                    pf.save_fig(save_dir, sfile)

                                    # plot excluding time ranges for suspect data
                                    if len(z_nospct) != len(zpressure):
                                        fig, ax = pf.plot_profiles(z_nospct, y_nospct, t_nospct, ylabel, xlabel,
                                                                   clabel, end_times, deployments, stdev=None)
                                        ax.set_title(title, fontsize=9)
                                        leg_text = (
                                            'removed {} in the upper and lower {} percentile of data grouped in {} dbar segments'.format(
                                                len(zpressure) - len(z_nospct), inpercentile, zcell_size),)
                                        ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17),
                                                  fontsize=6)
                                        fig.tight_layout()
                                        sfile = '_'.join(('rm_suspect_data', sname))
                                        pf.save_fig(save_dir, sfile)

                                    # plot excluding time ranges from data portal export
                                    if len(z_nospct) - len(z_portal):
                                        fig, ax = pf.plot_profiles(z_portal, y_portal, t_portal, ylabel, xlabel,
                                                                   clabel, end_times, deployments, stdev=None)
                                        ax.set_title(title, fontsize=9)
                                        leg_text = ('excluded {} suspect data when inspected visually'.format(
                                            len(z_nospct) - len(z_portal)),)
                                        ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17),
                                                  fontsize=6)
                                        fig.tight_layout()
                                        sfile = '_'.join(('rm_v_suspect_data', sname))
                                        pf.save_fig(save_dir, sfile)

                                    # Plot excluding a selected depth value
                                    # fix: was len(z_array) != len(z_array), which is always False
                                    if len(z_array) != len(z_portal):
                                        fig, ax = pf.plot_profiles(z_array, y_array, t_array, ylabel, xlabel,
                                                                   clabel, end_times, deployments, stdev=None)
                                        ax.set_title(title, fontsize=9)
                                        leg_text = ('excluded {} suspect data in water depth greater than {} dbar'.format(
                                            len(y_portal) - len(y_array), zdbar),)
                                        ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17),
                                                  fontsize=6)
                                        fig.tight_layout()
                                        sfile = '_'.join(('rm_depth_range', sname))
                                        pf.save_fig(save_dir, sfile)
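
# ---------------------------------------------------------------------------
# A hypothetical invocation of the profile-evaluation main() above. The
# THREDDS url (and the reference designator embedded in it) is a placeholder,
# not a real review output path; the parameter choices (5 dbar cells, 95th
# percentile envelope) are illustrative, not prescribed defaults.
if __name__ == '__main__':
    pd.set_option('display.width', 320)
    url_list = ['https://opendap.oceanobservatories.org/thredds/catalog/ooi/'
                'example-CE09OSPM-WFP01-03-CTDPFK000-recovered_wfp-ctdpf_ckl_wfp_instrument_recovered/catalog.html']
    main(url_list, sDir='/path/to/plots', plot_type='profile_plots',
         deployment_num=1, start_time=None, end_time=None, method_num=None,
         zdbar=None, n_std=None, inpercentile=95, zcell_size=5)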