def reject_erroneous_data(r, v, t, y, z, d, fz):
    """
    Remove fill values, NaNs, extreme values '|1e7|' and data outside global
    ranges from a data array, keeping the time/pressure/deployment arrays
    aligned with the data array.

    :param r: reference designator
    :param v: data parameter name
    :param t: time array
    :param y: pressure array
    :param z: data values
    :param d: deployment number
    :param fz: fill values defined in the data file
    :return: (time, pressure, data, deployment) arrays with bad values removed
    """
    # reject fill values
    fv_ind = z != fz
    y_nofv = y[fv_ind]
    t_nofv = t[fv_ind]
    z_nofv = z[fv_ind]
    d_nofv = d[fv_ind]
    print(len(z) - len(z_nofv), ' fill values')

    # reject NaNs
    nan_ind = ~np.isnan(z_nofv)
    t_nofv_nonan = t_nofv[nan_ind]
    y_nofv_nonan = y_nofv[nan_ind]
    z_nofv_nonan = z_nofv[nan_ind]
    d_nofv_nonan = d_nofv[nan_ind]
    print(len(z_nofv) - len(z_nofv_nonan), ' NaNs')

    # reject extreme values
    ev_ind = cf.reject_extreme_values(z_nofv_nonan)
    t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
    y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
    z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
    d_nofv_nonan_noev = d_nofv_nonan[ev_ind]
    print(len(z_nofv_nonan) - len(z_nofv_nonan_noev), ' Extreme Values', '|1e7|')

    # reject values outside global ranges (only when both bounds are numeric;
    # get_global_ranges may return non-numeric placeholders)
    global_min, global_max = cf.get_global_ranges(r, v)
    if isinstance(global_min, (int, float)) and isinstance(global_max, (int, float)):
        gr_ind = cf.reject_global_ranges(z_nofv_nonan_noev, global_min, global_max)
        dtime = t_nofv_nonan_noev[gr_ind]
        zpressure = y_nofv_nonan_noev[gr_ind]
        ndata = z_nofv_nonan_noev[gr_ind]
        ndeploy = d_nofv_nonan_noev[gr_ind]
    else:
        dtime = t_nofv_nonan_noev
        zpressure = y_nofv_nonan_noev
        ndata = z_nofv_nonan_noev
        ndeploy = d_nofv_nonan_noev

    # BUG FIX: report the number of points removed (before - after); the
    # original printed after - before, which is never positive.
    print('{} global ranges [{} - {}]'.format(
        len(z_nofv_nonan_noev) - len(ndata), global_min, global_max))

    return dtime, zpressure, ndata, ndeploy
def plot_timeseries(x, y, y_name, stdev=None):
    """
    Create a simple timeseries plot
    :param x: array containing data for x-axis (e.g. time)
    :param y: .nc data array for plotting on the y-axis, including data values,
    coordinates, and variable attributes
    :param y_name: variable name used to label the y-axis
    :param stdev: desired standard deviation to exclude from plotting
    :return: (fig, ax) tuple
    """
    if type(y) is not np.ndarray:
        yval = y.values
    else:
        yval = y
    if type(x) is not np.ndarray:
        x = x.values

    if stdev is None:
        xD = x
        yD = yval
        leg_text = ()
    else:
        # drop extreme values first, then outliers beyond stdev
        ind = cf.reject_extreme_values(yval)
        ydata = yval[ind]
        xdata = x[ind]
        if len(xdata) > 0:
            ind2 = cf.reject_outliers(ydata, stdev)
            yD = ydata[ind2]
            xD = xdata[ind2]
            outliers = str(len(y) - len(yD))
            leg_text = ('removed {} outliers (SD={})'.format(outliers, stdev), )
        else:
            # BUG FIX: everything was rejected - yD and leg_text must still be
            # defined, otherwise ax.legend(leg_text, ...) below raises NameError
            xD = []
            yD = []
            leg_text = ()

    fig, ax = plt.subplots()
    plt.grid()
    if len(xD) > 0:
        plt.plot(xD, yD, '.', markersize=2)
    y_units = get_units(y)
    ax.set_ylabel((y_name + " (" + y_units + ")"), fontsize=9)
    format_date_axis(ax, fig)
    y_axis_disable_offset(ax)
    ax.legend(leg_text, loc='best', fontsize=6)
    return fig, ax
def plot_timeseries_panel(ds, x, vars, colors, stdev=None):
    """
    Create a timeseries plot with one horizontal panel per science parameter.
    :param ds: dataset (e.g. .nc file opened with xarray) containing data for plotting
    :param x: array containing data for x-axis (e.g. time)
    :param vars: list of science variables to plot
    :param colors: list of colors to be used for plotting
    :param stdev: desired standard deviation to exclude from plotting
    :return: (fig, ax) tuple
    """
    n_panels = len(vars)
    fig, ax = plt.subplots(n_panels, sharex=True)
    for i, vname in enumerate(vars):
        y = ds[vname]
        if stdev is None:
            xD, yD = x, y.values
            leg_text = ()
        else:
            # drop extreme values first, then outliers beyond stdev
            keep = cf.reject_extreme_values(y.values)
            ydata = y[keep]
            xdata = x[keep]
            keep2 = cf.reject_outliers(ydata.values, stdev)
            yD = ydata[keep2].values
            xD = xdata[keep2]
            n_removed = str(len(y) - len(yD))
            leg_text = ('{}: rm {} outliers'.format(vname, n_removed), )
        y_units = get_units(y)
        panel = ax[i]
        panel.plot(xD, yD, '.', markersize=2, color=colors[i])
        panel.set_ylabel(('(' + y_units + ')'), fontsize=5)
        panel.tick_params(axis='y', labelsize=6)
        panel.legend(leg_text, loc='best', fontsize=4)
        y_axis_disable_offset(panel)
        if i == n_panels - 1:  # only the bottom panel gets date formatting
            format_date_axis(panel, fig)
    return fig, ax
def plot_timeseries_all(x, y, y_name, y_units, stdev=None):
    """
    Create a simple timeseries plot of all data.
    :param x: array containing data for x-axis (e.g. time)
    :param y: array containing data for y-axis
    :param y_name: variable name used in the y-axis label
    :param y_units: units used in the y-axis label
    :param stdev: desired standard deviation to exclude from plotting
    :return: (fig, ax) tuple
    """
    if stdev is None:
        xD, yD = x, y
        leg_text = ()
    else:
        # drop extreme values first, then outliers beyond stdev
        keep = cf.reject_extreme_values(y)
        ydata = y[keep]
        xdata = x[keep]
        keep2 = cf.reject_outliers(ydata, stdev)
        yD = ydata[keep2]
        xD = xdata[keep2]
        n_removed = str(len(y) - len(yD))
        leg_text = ('removed {} outliers (SD={})'.format(n_removed, stdev), )

    fig, ax = plt.subplots()
    plt.grid()
    plt.plot(xD, yD, '.', markersize=2)
    ax.set_ylabel((y_name + " (" + y_units + ")"), fontsize=9)
    format_date_axis(ax, fig)
    y_axis_disable_offset(ax)
    ax.legend(leg_text, loc='best', fontsize=6)
    return fig, ax
def main(sDir, url_list, preferred_only):
    """
    Build yearly timeseries plots (per-year data panels with one-day rolling
    mean/2-STD bands, plus a KDE/statistics-table figure) for each reference
    designator found in url_list.

    :param sDir: root directory where plots are saved
    :param url_list: list of THREDDS dataset urls
    :param preferred_only: 'yes' to restrict datasets to the preferred stream
        for each deployment; any other value uses all datasets
    """
    # collect unique reference designators and method-streams from the urls
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]

        # filter datasets
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # row[ii] can be non-string (e.g. NaN) - skip
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, fdatasets)
        # ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        # NOTE(review): ps_df is only assigned when preferred_only == 'yes';
        # this loop raises NameError otherwise - confirm intended usage.
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)
        # map each dataset file to its 'method-stream' string
        methodstream = []
        for f in fdatasets:
            methodstream.append('-'.join((f.split('/')[-2].split('-')[-2], f.split('/')[-2].split('-')[-1])))

        ms_dict = save_dir_path(ms_list)
        for ms in np.unique(methodstream):
            fdatasets_sel = [x for x in fdatasets if ms in x]
            check_ms = ms.split('-')[1]
            if 'recovered' in check_ms:
                check_ms = check_ms.split('_recovered')[0]
            # use a flat save directory when the method has only one stream
            if ms_dict['ms_count'][ms_dict['ms_unique'] == ms.split('-')[0]] == 1:
                save_dir = os.path.join(sDir, array, subsite, r, 'timeseries_yearly_plot', ms.split('-')[0])
            else:
                save_dir = os.path.join(sDir, array, subsite, r, 'timeseries_yearly_plot', ms.split('-')[0],
                                        check_ms)
            cf.create_dir(save_dir)

            # build {method-stream: {'vars': {name: {'db_units': ...}}}} for
            # the science variables of this stream, from the review database
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)

            print('\nAppending data from files: {}'.format(ms))
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print(fd)
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    try:
                        ds[var]
                        print(var)
                        deployment_num = np.unique(ds['deployment'].values)[0]
                        sh['deployments'] = np.append(sh['deployments'], deployment_num)
                        # only append data whose units match the database units
                        if ds[var].units == sh['db_units']:
                            if ds[var]._FillValue not in sh['fv']:
                                sh['fv'].append(ds[var]._FillValue)
                            if ds[var].units not in sh['units']:
                                sh['units'].append(ds[var].units)
                            tD = ds['time'].values
                            varD = ds[var].values
                            sh['t'] = np.append(sh['t'], tD)
                            sh['values'] = np.append(sh['values'], varD)
                    except KeyError:
                        # variable not present in this file
                        print('KeyError: ', var)

            print('\nPlotting data')
            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        deployments_num = vinfo['deployments']
                        fv = vinfo['fv'][0]
                        t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        x = vinfo['t']
                        y = vinfo['values']

                        # reject NaNs
                        nan_ind = ~np.isnan(y)
                        x_nonan = x[nan_ind]
                        y_nonan = y[nan_ind]

                        # reject fill values
                        fv_ind = y_nonan != vinfo['fv'][0]
                        x_nonan_nofv = x_nonan[fv_ind]
                        y_nonan_nofv = y_nonan[fv_ind]

                        # reject extreme values
                        Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                        y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                        x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                        # reject values outside global ranges:
                        global_min, global_max = cf.get_global_ranges(r, sv)
                        print('global ranges: ', global_min, global_max)
                        if global_min and global_max:
                            gr_ind = cf.reject_global_ranges(y_nonan_nofv_nE, global_min, global_max)
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]
                        else:
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE

                        # check array length
                        if len(y_nonan_nofv_nE_nogr) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                                print(var, 'empty array')
                            else:
                                sname = '-'.join((r, m, sv))

                            # group data by year ('A' = annual frequency)
                            groups, g_data = gt.group_by_time_range(x_nonan_nofv_nE_nogr, y_nonan_nofv_nE_nogr, 'A')

                            # preparing color palette (one color per year)
                            colors = color_names[:len(groups)]

                            # figure 0: KDE subplot plus basic statistics table
                            fig0, ax0 = pyplot.subplots(nrows=2, ncols=1)
                            ax0[1].axis('off')
                            ax0[1].axis('tight')
                            the_table = ax0[1].table(cellText=groups.describe().round(2).values,
                                                     rowLabels=groups.describe().index.year,
                                                     rowColours=colors,
                                                     colLabels=groups.describe().columns.levels[1],
                                                     loc='center')
                            the_table.set_fontsize(5)

                            # subplot for data: one panel per year
                            fig, ax = pyplot.subplots(nrows=len(groups), ncols=1, sharey=True)
                            if len(groups) == 1:
                                # keep ax indexable when there is a single panel
                                ax = [ax]
                            t = 1
                            for ny in range(len(groups)):
                                # prepare data for plotting: g_data interleaves
                                # time/value columns per year group
                                y_data = g_data[ny + (t + 1)].dropna(axis=0)
                                x_time = g_data[ny + t].dropna(axis=0)
                                t += 1
                                if len(y_data) != 0 and len(x_time) != 0:
                                    n_year = x_time[0].year
                                    col_name = str(n_year)
                                    serie_n = pd.DataFrame(columns=[col_name], index=x_time)
                                    serie_n[col_name] = list(y_data[:])

                                    # plot kernel density estimate (needs > 1 point)
                                    if len(serie_n) != 1:
                                        serie_n.plot.kde(ax=ax0[0], color=colors[ny])
                                        ax0[0].legend(fontsize=8, bbox_to_anchor=(0., 1.12, 1., .102), loc=3,
                                                      ncol=len(groups), mode="expand", borderaxespad=0.)
                                        ax0[0].set_xlabel('Observation Ranges', fontsize=8)
                                        ax0[0].set_ylabel('Density', fontsize=8)  # 'Number of Observations'
                                        ax0[0].set_title(ms.split('-')[0] + ' (' + sv + ', ' + sv_units + ')' +
                                                         ' Kernel Density Estimates', fontsize=8)

                                    # plot data
                                    serie_n.plot(ax=ax[ny], linestyle='None', marker='.', markersize=0.5,
                                                 color=colors[ny])
                                    ax[ny].legend().set_visible(False)

                                    # plot Mean and Standard deviation (one-day rolling window)
                                    ma = serie_n.rolling('86400s').mean()
                                    mstd = serie_n.rolling('86400s').std()
                                    ax[ny].plot(ma.index, ma[col_name].values, 'k', linewidth=0.15)
                                    ax[ny].fill_between(mstd.index,
                                                        ma[col_name].values - 2 * mstd[col_name].values,
                                                        ma[col_name].values + 2 * mstd[col_name].values,
                                                        color='b', alpha=0.2)

                                    # prepare the time axis parameters
                                    datemin = datetime.date(n_year, 1, 1)
                                    datemax = datetime.date(n_year, 12, 31)
                                    ax[ny].set_xlim(datemin, datemax)
                                    xlocator = mdates.MonthLocator()  # every month
                                    myFmt = mdates.DateFormatter('%m')
                                    ax[ny].xaxis.set_minor_locator(xlocator)
                                    ax[ny].xaxis.set_major_formatter(myFmt)

                                    # prepare the y axis parameters
                                    ylocator = MaxNLocator(prune='both', nbins=3)
                                    ax[ny].yaxis.set_major_locator(ylocator)

                                    # format figure
                                    ax[ny].tick_params(axis='both', color='r', labelsize=7, labelcolor='m')
                                    if ny < len(groups) - 1:
                                        # hide x labels on all but the bottom panel
                                        ax[ny].tick_params(which='both', pad=0.1, length=1, labelbottom=False)
                                        ax[ny].set_xlabel(' ')
                                    else:
                                        ax[ny].tick_params(which='both', color='r', labelsize=7, labelcolor='m',
                                                           pad=0.1, length=1, rotation=0)
                                        ax[ny].set_xlabel('Months', rotation=0, fontsize=8, color='b')

                                    ax[ny].set_ylabel(n_year, rotation=0, fontsize=8, color='b', labelpad=20)
                                    ax[ny].yaxis.set_label_position("right")

                                    # figure title goes on the top panel only
                                    if ny == 0:
                                        if global_min and global_max:
                                            ax[ny].set_title(
                                                sv + '( ' + sv_units + ') -- Global Range: [' +
                                                str(int(global_min)) + ',' + str(int(global_max)) + '] \n'
                                                'Plotted: Data, Mean and 2STD (Method: One day rolling window calculations) \n',
                                                fontsize=8)
                                        else:
                                            ax[ny].set_title(
                                                sv + '( ' + sv_units + ') -- Global Range: [] \n'
                                                'Plotted: Data, Mean and 2STD (Method: One day rolling window calculations) \n',
                                                fontsize=8)

                                    # mark deployment end times on figure
                                    ymin, ymax = ax[ny].get_ylim()
                                    for etimes in range(len(end_times)):
                                        if end_times[etimes].year == n_year:
                                            ax[ny].axvline(x=end_times[etimes], color='b', linestyle='--',
                                                           linewidth=.6)
                                            ax[ny].text(end_times[etimes], ymin,
                                                        'End' + str(deployments_num[etimes]),
                                                        fontsize=6, style='italic',
                                                        bbox=dict(boxstyle='round', ec=(0., 0.5, 0.5),
                                                                  fc=(1., 1., 1.)))

                            # save figure to a file
                            sfile = '_'.join(('all', sname))
                            save_file = os.path.join(save_dir, sfile)
                            fig.savefig(str(save_file), dpi=150)

                            sfile = '_'.join(('Statistics', sname))
                            save_file = os.path.join(save_dir, sfile)
                            fig0.savefig(str(save_file), dpi=150)

                            pyplot.close()
def plot_xsection(subsite, x, y, z, clabel, ylabel, t_eng=None, m_water_depth=None, inpercentile=None, stdev=None):
    """
    Create a cross-section plot for mobile instruments
    :param subsite: subsite part of reference designator to plot
    :param x: array containing data for x-axis (e.g. time)
    :param y: .nc data array containing data for plotting on the y-axis (e.g. pressure)
    :param z: .nc data array containing data for plotting variable of interest (e.g. density)
    :param clabel: label for the colorbar
    :param ylabel: label for the y-axis
    :param t_eng: .nc data array containing engineering timestamps (to plot water depth)
    :param m_water_depth: .nc data array containing water depth data from the engineering data stream
    :param inpercentile: percentile of data to exclude from plot
    :param stdev: desired standard deviation to exclude from plotting
    :return: (fig, ax, bar) tuple; all None if the plot cannot be generated
    """
    if type(z) is not np.ndarray:
        z = z.values
    if type(y) is not np.ndarray:
        y = y.values
    if type(x) is not np.ndarray:
        x = x.values

    # IMPROVEMENT: initialize the legend counters explicitly instead of the
    # original try/except NameError probing - same observable behavior.
    zeros = None
    outliers = None

    # when plotting gliders, remove zeros (glider fill values) and negative numbers
    if 'MOAS' in subsite:
        z[z <= 0.0] = np.nan
        zeros = str(len(z) - np.count_nonzero(~np.isnan(z)))

    if stdev is None:
        xD = x
        yD = y
        zD = z
    else:
        # drop extreme values first, then outliers beyond stdev
        ind = cf.reject_extreme_values(z)
        xdata = x[ind]
        ydata = y[ind]
        zdata = z[ind]
        ind2 = cf.reject_outliers(zdata, stdev)
        xD = xdata[ind2]
        yD = ydata[ind2]
        zD = zdata[ind2]
        outliers = str(len(zdata) - len(zD))

    fig, ax = plt.subplots()
    plt.margins(y=.08, x=.02)
    try:
        xc = ax.scatter(xD, yD, c=zD, s=2, edgecolor='None')
        ax.invert_yaxis()

        # add bathymetry for coastal gliders
        if t_eng is not None and m_water_depth is not None:
            if len(t_eng) > 1:
                ax.fill_between(t_eng, m_water_depth, np.max(m_water_depth) + 2, facecolor='k', alpha=0.4)

        # add color bar
        bar = fig.colorbar(xc, ax=ax, label=clabel, extend='both')
        bar.formatter.set_useOffset(False)
        bar.ax.tick_params(labelsize=8)
        if inpercentile is not None:
            # clip the color scale to the requested percentile band
            upper_lim = np.percentile(zD, 100 - inpercentile)
            lower_lim = np.percentile(zD, inpercentile)
            bar.set_clim(lower_lim, upper_lim)
            bar.set_ticks([lower_lim, upper_lim], update_ticks=True)

        ax.set_ylabel(ylabel, fontsize=9)
        format_date_axis(ax, fig)

        # legend describes which points were removed
        if zeros is None and type(outliers) is str:
            leg = ('rm: {} outliers (SD={})'.format(outliers, stdev), )
            ax.legend(leg, loc=1, fontsize=6)
        if type(zeros) is str and outliers is None:
            leg = ('rm: {} values <=0.0'.format(zeros), )
            ax.legend(leg, loc=1, fontsize=6)
        if type(zeros) is str and type(outliers) is str:
            leg = ('rm: {} values <=0.0, rm: {} outliers (SD={})'.format(zeros, outliers, stdev), )
            ax.legend(leg, loc=1, fontsize=6)
    except ValueError:
        # scatter/colorbar can raise on degenerate input (e.g. all-NaN)
        print("plot can't be generated")
        fig = None
        ax = None
        bar = None

    return fig, ax, bar
def plot_timeseries_compare(t0, t1, var0, var1, m0, m1, long_name, stdev=None):
    """
    Create a timeseries plot overlaying two datasets.
    :param t0: data array of time for dataset 0
    :param t1: data array of time for dataset 1
    :param var0: .nc data array for plotting on the y-axis for dataset 0, including data values and variable attributes
    :param var1: .nc data array for plotting on the y-axis for dataset 1, including data values and variable attributes
    :param m0: legend label (e.g. delivery method) for dataset 0
    :param m1: legend label (e.g. delivery method) for dataset 1
    :param long_name: variable long name used to label the y-axis
    :param stdev: desired standard deviation to exclude from plotting
    :return: (fig, ax) tuple
    """
    if stdev is None:
        t0_data = t0.values
        var0_data = var0.values
        t1_data = t1.values
        var1_data = var1.values
        leg_text = ('{}'.format(m0), '{}'.format(m1))
    else:
        def _reject(tt, vv, label):
            # drop extreme values first, then outliers beyond stdev; the
            # removed count also includes NaNs remaining in the kept data
            keep = cf.reject_extreme_values(vv.values)
            ti = tt[keep]
            vi = vv[keep]
            keep2 = cf.reject_outliers(vi.values, stdev)
            tdat = ti[keep2].values
            vdat = vi[keep2].values
            n_rm = str((len(vv) - len(vdat)) + (len(tdat) - np.count_nonzero(~np.isnan(vdat))))
            return tdat, vdat, '{}: removed {} outliers (SD={})'.format(label, n_rm, stdev)

        t0_data, var0_data, leg0 = _reject(t0, var0, m0)
        t1_data, var1_data, leg1 = _reject(t1, var1, m1)
        leg_text = (leg0, leg1)

    y_units = get_units(var0)
    fig, ax = plt.subplots()
    plt.grid()
    # dataset 0 as open red circles, dataset 1 as small blue dots
    ax.plot(t0_data, var0_data, 'o', markerfacecolor='none', markeredgecolor='r', markersize=5, lw=.75)
    ax.plot(t1_data, var1_data, '.', markeredgecolor='b', markersize=2)
    ax.set_ylabel((long_name + " (" + y_units + ")"), fontsize=9)
    format_date_axis(ax, fig)
    y_axis_disable_offset(ax)
    ax.legend(leg_text, loc='best', fontsize=6)
    return fig, ax
def main(sDir, url_list, start_time, end_time):
    """
    Plot timeseries of science variables from the preferred data stream of
    each reference designator in url_list, with special handling for 2D
    SPKIR spectra and PRESF wave-burst data.

    :param sDir: root directory where plots are saved
    :param url_list: list of THREDDS dataset urls
    :param start_time: datetime restricting plotting to a time range (or None)
    :param end_time: datetime restricting plotting to a time range (or None)
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = []

        # get the preferred stream information
        ps_df, n_streams = cf.get_preferred_stream_info(r)
        for index, row in ps_df.iterrows():
            for ii in range(n_streams):
                try:
                    rms = '-'.join((r, row[ii]))
                except TypeError:
                    # row[ii] can be non-string (e.g. NaN) - skip
                    continue
                for dd in datasets:
                    spl = dd.split('/')[-2].split('-')
                    catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                    fdeploy = dd.split('/')[-1].split('_')[0]
                    if rms == catalog_rms and fdeploy == row['deployment']:
                        fdatasets.append(dd)

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        # get science variable long names from the Data Review Database
        # stream_sci_vars = cd.sci_var_long_names(r)
        if 'SPKIR' in r or 'PRESF' in r:  # only get the main science variable for SPKIR
            stream_vars = cd.sci_var_long_names(r)
        else:
            stream_vars = var_long_names(r)

        # check if the science variable long names are the same for each stream and initialize empty arrays
        sci_vars_dict = cd.sci_var_long_names_check(stream_vars)

        # get the preferred stream information
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # build dictionary of science data from the preferred dataset for each deployment
        print('\nAppending data from files')
        et = []
        sci_vars_dict, __, __ = cd.append_science_data(ps_df, n_streams, r, fdatasets_sel, sci_vars_dict, et,
                                                       start_time, end_time)

        # get end times of deployments
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        dend_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            dend_times.append(pd.to_datetime(deploy_info['stop_date']))

        subsite = r.split('-')[0]
        array = subsite[0:2]
        save_dir = os.path.join(sDir, array, subsite, r, 'timeseries_plots_preferred_all')
        cf.create_dir(save_dir)

        print('\nPlotting data')
        for m, n in sci_vars_dict.items():
            for sv, vinfo in n['vars'].items():
                print(sv)
                if 'SPKIR' in r:
                    # 2D SPKIR data: plot all wavelengths together, then each one
                    fv_lst = np.unique(vinfo['fv']).tolist()
                    if len(fv_lst) == 1:
                        fill_value = fv_lst[0]
                    else:
                        print(fv_lst)
                        print('No unique fill value for {}'.format(sv))
                    sv_units = np.unique(vinfo['units']).tolist()
                    t = vinfo['t']
                    if len(t) > 1:
                        data = vinfo['values']
                        [dd_data, g_min, g_max] = index_dataset_2d(r, 'spkir_abj_cspp_downwelling_vector', data,
                                                                   fill_value)
                        t0 = pd.to_datetime(min(t)).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(t)).strftime('%Y-%m-%dT%H:%M:%S')
                        deploy_final = vinfo['deployments']
                        deploy = list(np.unique(deploy_final))
                        deployments = [int(dd) for dd in deploy]
                        sname = '-'.join((r, sv))
                        fig, ax = pf.plot_spkir(t, dd_data, sv, sv_units[0])
                        ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1 +
                                      '\n' + 'removed global ranges +/- [{} - {}]'.format(g_min, g_max)),
                                     fontsize=8)
                        for etimes in dend_times:
                            ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                        pf.save_fig(save_dir, sname)

                        # plot each wavelength
                        wavelengths = ['412nm', '443nm', '490nm', '510nm', '555nm', '620nm', '683nm']
                        for wvi in range(len(dd_data)):
                            fig, ax = pf.plot_spkir_wv(t, dd_data[wvi], sv, sv_units[0], wvi)
                            ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' +
                                          t1 + '\n' + 'removed global ranges +/- [{} - {}]'.format(g_min, g_max)),
                                         fontsize=8)
                            for etimes in dend_times:
                                ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                            snamewvi = '-'.join((sname, wavelengths[wvi]))
                            pf.save_fig(save_dir, snamewvi)
                elif 'presf_abc_wave_burst' in m:
                    # 2D PRESF wave-burst pressure data
                    fv_lst = np.unique(vinfo['fv']).tolist()
                    if len(fv_lst) == 1:
                        fill_value = fv_lst[0]
                    else:
                        print(fv_lst)
                        print('No unique fill value for {}'.format(sv))
                    sv_units = np.unique(vinfo['units']).tolist()
                    t = vinfo['t']
                    if len(t) > 1:
                        data = vinfo['values']
                        [dd_data, g_min, g_max] = index_dataset_2d(r, 'presf_wave_burst_pressure', data, fill_value)
                        t0 = pd.to_datetime(min(t)).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(t)).strftime('%Y-%m-%dT%H:%M:%S')
                        deploy_final = vinfo['deployments']
                        deploy = list(np.unique(deploy_final))
                        deployments = [int(dd) for dd in deploy]
                        sname = '-'.join((r, sv))
                        fig, ax = pf.plot_presf_2d(t, dd_data, sv, sv_units[0])
                        ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 + ' - ' + t1 +
                                      '\n' + 'removed global ranges +/- [{} - {}]'.format(g_min, g_max)),
                                     fontsize=8)
                        for etimes in dend_times:
                            ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
                        pf.save_fig(save_dir, sname)
                else:
                    if type(vinfo['values']) != dict:  # if the variable is not a 2D array
                        if 'Spectra' not in sv:
                            if len(vinfo['t']) < 1:
                                print('no variable data to plot')
                            else:
                                sv_units = vinfo['units'][0]
                                sv_name = vinfo['var_name']
                                t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                                t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                                x = vinfo['t']
                                y = vinfo['values']

                                # reject NaNs and values of 0.0
                                nan_ind = (~np.isnan(y)) & (y != 0.0)
                                x_nonan = x[nan_ind]
                                y_nonan = y[nan_ind]

                                # reject fill values
                                fv_ind = y_nonan != vinfo['fv'][0]
                                x_nonan_nofv = x_nonan[fv_ind]
                                y_nonan_nofv = y_nonan[fv_ind]

                                # reject extreme values
                                Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                                y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                                x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                                # reject values outside global ranges:
                                global_min, global_max = cf.get_global_ranges(r, sv_name)
                                if any(e is None for e in [global_min, global_max]):
                                    y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                                    x_nonan_nofv_nE_nogr = x_nonan_nofv_nE
                                else:
                                    gr_ind = cf.reject_global_ranges(y_nonan_nofv_nE, global_min, global_max)
                                    y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                                    x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]

                                if len(y_nonan_nofv) > 0:
                                    if m == 'common_stream_placeholder':
                                        sname = '-'.join((r, sv))
                                    else:
                                        sname = '-'.join((r, m, sv))
                                    plt_deploy = [int(x) for x in list(np.unique(vinfo['deployments']))]

                                    # plot hourly averages for cabled and FDCHP data
                                    if 'streamed' in sci_vars_dict[list(sci_vars_dict.keys())[0]]['ms'][0] or 'FDCHP' in r:
                                        sname = '-'.join((sname, 'hourlyavg'))
                                        df = pd.DataFrame({'dfx': x_nonan_nofv_nE_nogr,
                                                           'dfy': y_nonan_nofv_nE_nogr})
                                        dfr = df.resample('H', on='dfx').mean()

                                        # Plot all data
                                        fig, ax = pf.plot_timeseries_all(dfr.index, dfr['dfy'], sv, sv_units,
                                                                         stdev=None)
                                        ax.set_title((r + '\nDeployments: ' + str(plt_deploy) + '\n' + t0 +
                                                      ' - ' + t1), fontsize=8)

                                        # if plotting a specific time range, plot deployment lines only for those deployments
                                        if type(start_time) == dt.datetime:
                                            for e in list(np.unique(vinfo['deployments'])):
                                                etime = dend_times[int(e) - 1]
                                                ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        else:
                                            for etime in dend_times:
                                                ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        pf.save_fig(save_dir, sname)
                                    else:
                                        # Plot all data
                                        fig, ax = pf.plot_timeseries_all(x_nonan_nofv, y_nonan_nofv, sv, sv_units,
                                                                         stdev=None)
                                        ax.set_title((r + '\nDeployments: ' + str(plt_deploy) + '\n' + t0 +
                                                      ' - ' + t1), fontsize=8)

                                        # if plotting a specific time range, plot deployment lines only for those deployments
                                        if type(start_time) == dt.datetime:
                                            etime = dend_times[int(list(np.unique(vinfo['deployments']))[0]) - 1]
                                            ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        else:
                                            for etime in dend_times:
                                                ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        pf.save_fig(save_dir, sname)

                                        # Plot data with outliers removed
                                        fig, ax = pf.plot_timeseries_all(x_nonan_nofv_nE_nogr,
                                                                         y_nonan_nofv_nE_nogr, sv, sv_units,
                                                                         stdev=5)
                                        ax.set_title((r + '\nDeployments: ' + str(plt_deploy) + '\n' + t0 +
                                                      ' - ' + t1), fontsize=8)
                                        if type(start_time) == dt.datetime:
                                            etime = dend_times[int(list(np.unique(vinfo['deployments']))[0]) - 1]
                                            ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        else:
                                            for etime in dend_times:
                                                ax.axvline(x=etime, color='b', linestyle='--', linewidth=.6)
                                        sfile = '_'.join((sname, 'rmoutliers'))
                                        pf.save_fig(save_dir, sfile)
def main(sDir, ncdir):
    """
    Plot timeseries of all science variables found in local .nc files under
    ncdir (the reference designator is taken from the directory name),
    producing one plot with all data and one with outliers removed.

    :param sDir: root directory where plots are saved
    :param ncdir: directory containing .nc data files; its second-to-last
        path element is the reference designator
    """
    rd_list = [ncdir.split('/')[-2]]

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # filter datasets: collect all .nc filenames under ncdir
        fdatasets = []
        for root, dirs, files in os.walk(ncdir):
            for f in files:
                if f.endswith('.nc'):
                    fdatasets.append(f)

        # map each file to its 'method-stream' string parsed from the filename
        methodstream = []
        for f in fdatasets:
            strm = '_'.join((f.split('-')[-2].split('_')[0], f.split('-')[-2].split('_')[1]))
            methodstream.append('-'.join((f.split('-')[-3], strm)))

        for ms in np.unique(methodstream):
            fdatasets_sel = [x for x in fdatasets if ms in x]
            save_dir = os.path.join(sDir, array, subsite, r, 'timeseries_plots_all')
            cf.create_dir(save_dir)

            # build {method-stream: {'vars': {name: {'db_units': ...}}}} for
            # the science variables of this stream, from the review database
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)

            print('\nAppending data from files: {}'.format(ms))
            for fd in fdatasets_sel:
                ds = xr.open_dataset(os.path.join(ncdir, fd), mask_and_scale=False)
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    # only append data whose units match the database units
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)
                        tD = ds['time'].values
                        varD = ds[var].values
                        sh['t'] = np.append(sh['t'], tD)
                        sh['values'] = np.append(sh['values'], varD)

            print('\nPlotting data')
            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        x = vinfo['t']
                        y = vinfo['values']

                        # reject NaNs
                        nan_ind = ~np.isnan(y)
                        x_nonan = x[nan_ind]
                        y_nonan = y[nan_ind]

                        # reject fill values
                        fv_ind = y_nonan != vinfo['fv'][0]
                        x_nonan_nofv = x_nonan[fv_ind]
                        y_nonan_nofv = y_nonan[fv_ind]

                        # reject extreme values
                        Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                        y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                        x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                        # reject values outside global ranges:
                        global_min, global_max = cf.get_global_ranges(r, sv)
                        if global_min is not None and global_max is not None:
                            gr_ind = cf.reject_global_ranges(y_nonan_nofv_nE, global_min, global_max)
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]
                        else:
                            y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                            x_nonan_nofv_nE_nogr = x_nonan_nofv_nE

                        title = ' '.join((r, ms.split('-')[0]))
                        if len(y_nonan_nofv) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                            else:
                                sname = '-'.join((r, m, sv))

                            # Plot all data
                            fig, ax = pf.plot_timeseries_all(x_nonan_nofv, y_nonan_nofv, sv, sv_units, stdev=None)
                            ax.set_title((title + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 +
                                          ' - ' + t1), fontsize=8)
                            for etimes in end_times:
                                ax.axvline(x=etimes, color='b', linestyle='--', linewidth=.6)
                            pf.save_fig(save_dir, sname)

                            # Plot data with extreme values, data outside global ranges and outliers removed
                            fig, ax = pf.plot_timeseries_all(x_nonan_nofv_nE_nogr, y_nonan_nofv_nE_nogr, sv,
                                                             sv_units, stdev=5)
                            ax.set_title((title + '\nDeployments: ' + str(sorted(deployments)) + '\n' + t0 +
                                          ' - ' + t1), fontsize=8)
                            for etimes in end_times:
                                ax.axvline(x=etimes, color='b', linestyle='--', linewidth=.6)
                            sfile = '_'.join((sname, 'rmoutliers'))
                            pf.save_fig(save_dir, sfile)
def main(sDir, url_list):
    """
    Create monthly timeseries panel plots (one figure per year, one panel per
    calendar month) plus a statistics/KDE figure for each science variable of
    each reference designator found in url_list.

    :param sDir: top-level directory on the local machine to save figures
    :param url_list: list of THREDDS urls containing .nc file datasets
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # filter datasets belonging to this reference designator
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)

        methodstream = []
        for f in fdatasets:
            methodstream.append('-'.join((f.split('/')[-2].split('-')[-2],
                                          f.split('/')[-2].split('-')[-1])))

        for ms in np.unique(methodstream):
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # BUG FIX: the original indexed check_ms[0] even when check_ms was
            # still a string (streams without 'recovered'), which produced a
            # one-character folder name. Normalize to the stream base name.
            check_ms = ms.split('-')[1]
            if 'recovered' in check_ms:
                check_ms = check_ms.split('_recovered')[0]
            save_dir = os.path.join(sDir, array, subsite, r, 'timeseries_monthly_plot',
                                    check_ms, ms.split('-')[0])
            cf.create_dir(save_dir)

            # science variables (from the analysis file) for this method-stream
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)

            print('\nAppending data from files: {}'.format(ms))
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)
                        tD = ds['time'].values
                        varD = ds[var].values
                        sh['t'] = np.append(sh['t'], tD)
                        sh['values'] = np.append(sh['values'], varD)

            print('\nPlotting data')
            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                        continue

                    sv_units = vinfo['units'][0]
                    t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                    x = vinfo['t']
                    y = vinfo['values']

                    # reject NaNs
                    nan_ind = ~np.isnan(y)
                    x_nonan = x[nan_ind]
                    y_nonan = y[nan_ind]

                    # reject fill values
                    fv_ind = y_nonan != vinfo['fv'][0]
                    x_nonan_nofv = x_nonan[fv_ind]
                    y_nonan_nofv = y_nonan[fv_ind]

                    # reject extreme values (|1e7|)
                    Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                    y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                    x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                    # reject values outside global ranges
                    # BUG FIX: guard against missing (None) global ranges before
                    # using them for rejection, histogram bins and titles --
                    # consistent with the guarded siblings in this file
                    global_min, global_max = cf.get_global_ranges(r, sv)
                    has_gr = isinstance(global_min, (int, float)) and isinstance(global_max, (int, float))
                    if has_gr:
                        gr_ind = cf.reject_global_ranges(y_nonan_nofv_nE, global_min, global_max)
                        y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                        x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]
                    else:
                        y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                        x_nonan_nofv_nE_nogr = x_nonan_nofv_nE

                    title = ' '.join((r, ms.split('-')[0]))

                    if len(y_nonan_nofv) > 0:
                        if m == 'common_stream_placeholder':
                            sname = '-'.join((r, sv))
                        else:
                            sname = '-'.join((r, m, sv))

                        # 1st group by year
                        ygroups, gy_data = gt.group_by_timerange(x_nonan_nofv_nE_nogr, y_nonan_nofv_nE_nogr, 'A')
                        tn = 1
                        for yr in range(len(ygroups)):
                            x_time = gy_data[yr + tn].dropna(axis=0)
                            y_data = gy_data[yr + (tn + 1)].dropna(axis=0)
                            y_data = y_data.astype(float)

                            # 2nd group by month
                            mgroups, gm_data = gt.group_by_timerange(x_time.values, y_data.values, 'M')
                            x_year = x_time[0].year
                            print(x_year)

                            # create bins for the observation-range axis
                            # NOTE(review): assumes the grouped column is named 'DO'
                            # -- confirm against gt.group_by_timerange
                            mgroups_min = min(mgroups.describe()['DO']['min'])
                            mgroups_max = max(mgroups.describe()['DO']['max'])
                            lower_bound = int(round(mgroups_min))
                            upper_bound = int(round(mgroups_max + (mgroups_max / 50)))
                            step_bound = int(round((mgroups_max - mgroups_min) / 10))
                            if has_gr:
                                # prefer global-range-based bins when ranges exist
                                # (the data-based bounds above are the fallback)
                                lower_bound = int(round(global_min))
                                upper_bound = int(round(global_max + (global_max / 50)))
                                step_bound = int(round((global_max - global_min) / 10))
                            if step_bound == 0:
                                step_bound += 1
                            if (upper_bound - lower_bound) == step_bound:
                                lower_bound -= 1
                                upper_bound += 1
                            if (upper_bound - lower_bound) < step_bound:
                                step_bound = int(round(step_bound / 10))
                            bin_range = list(range(lower_bound, upper_bound, step_bound))
                            print(bin_range)

                            # color palette: one color per month group
                            colors = color_names[:len(mgroups)]
                            print('1--- ', len(colors))
                            print(colors)

                            # figure 0: basic statistics table + KDE on shared axis
                            fig0, ax0 = pyplot.subplots(nrows=2, ncols=1)
                            ax0[0].axis('off')
                            ax0[0].axis('tight')
                            the_table = ax0[0].table(cellText=mgroups.describe().round(2).values,
                                                     rowLabels=mgroups.describe().index.month,
                                                     rowColours=colors,
                                                     colLabels=mgroups.describe().columns.levels[1],
                                                     loc='center')
                            the_table.set_fontsize(5)

                            # figure: 12 stacked month panels sharing the y axis
                            fig, ax = pyplot.subplots(nrows=12, ncols=1, sharey=True)
                            if has_gr:
                                gr_label = ' Global Range: [' + str(int(global_min)) + ',' + str(int(global_max)) + ']'
                            else:
                                gr_label = ' Global Range: [n/a]'
                            for kk in list(range(0, 12)):
                                ax[kk].tick_params(axis='both', which='both', color='r', labelsize=7,
                                                   labelcolor='m', rotation=0, pad=0.1, length=1)
                                month_name = calendar.month_abbr[kk + 1]
                                ax[kk].set_ylabel(month_name, rotation=0, fontsize=8, color='b', labelpad=20)
                                if kk == 0:
                                    ax[kk].set_title(str(x_year) + '\n ' + sv + " (" + sv_units + ")" + gr_label +
                                                     '\n End of deployments are marked with a vertical line \n ' +
                                                     'Plotted: Data, Mean and STD (Method: 1 day' +
                                                     ' rolling window calculations)', fontsize=8)
                                if kk < 11:
                                    ax[kk].tick_params(labelbottom=False)
                                if kk == 11:
                                    ax[kk].set_xlabel('Days', rotation=0, fontsize=8, color='b')

                            tm = 1
                            for mt in range(len(mgroups)):
                                x_time = gm_data[mt + tm].dropna(axis=0)
                                y_data = gm_data[mt + (tm + 1)].dropna(axis=0)
                                if len(x_time) == 0:
                                    # NOTE(review): 'continue' skips the tm increment
                                    # below, mirroring the original indexing behavior
                                    continue
                                x_month = x_time[0].month
                                col_name = str(x_month)
                                series_m = pd.DataFrame(columns=[col_name], index=x_time)
                                series_m[col_name] = list(y_data[:])

                                # kernel density estimate of the month on figure 0
                                series_m.plot.kde(ax=ax0[0], color=colors[mt])
                                ax0[0].legend(fontsize=8, bbox_to_anchor=(0., 1.12, 1., .102), loc=3,
                                              ncol=len(mgroups), mode="expand", borderaxespad=0.)
                                ax0[0].set_xlabel('Observation Ranges' + ' (' + sv + ', ' + sv_units + ')', fontsize=8)
                                ax0[0].set_ylabel('Density', fontsize=8)
                                ax0[0].set_title('Kernel Density Estimates', fontsize=8)
                                ax0[0].tick_params(which='both', labelsize=7, pad=0.1, length=1, rotation=0)

                                plt_index = x_month - 1

                                # data + 1-day rolling mean and +/-3 std envelope
                                series_m.plot(ax=ax[plt_index], linestyle='None', marker='.', markersize=1)
                                ax[plt_index].legend().set_visible(False)
                                ma = series_m.rolling('86400s').mean()
                                mstd = series_m.rolling('86400s').std()
                                ax[plt_index].plot(ma.index, ma[col_name].values, 'b')
                                ax[plt_index].fill_between(mstd.index,
                                                           ma[col_name].values - 3 * mstd[col_name].values,
                                                           ma[col_name].values + 3 * mstd[col_name].values,
                                                           color='b', alpha=0.2)

                                # time axis: day-of-month ticks over the full month
                                mm, nod = monthrange(x_year, x_month)
                                datemin = datetime.date(x_year, x_month, 1)
                                datemax = datetime.date(x_year, x_month, nod)
                                ax[plt_index].set_xlim(datemin, datemax)
                                xlocator = mdates.DayLocator()  # every day
                                myFmt = mdates.DateFormatter('%d')
                                ax[plt_index].xaxis.set_major_locator(xlocator)
                                ax[plt_index].xaxis.set_major_formatter(myFmt)
                                ax[plt_index].xaxis.set_minor_locator(pyplot.NullLocator())
                                ax[plt_index].xaxis.set_minor_formatter(pyplot.NullFormatter())

                                ylocator = MaxNLocator(prune='both', nbins=3)
                                ax[plt_index].yaxis.set_major_locator(ylocator)
                                if x_month != 12:
                                    ax[plt_index].tick_params(which='both', labelbottom=False, pad=0.1, length=1)
                                    ax[plt_index].set_xlabel(' ')
                                else:
                                    ax[plt_index].tick_params(which='both', color='r', labelsize=7,
                                                              labelcolor='m', pad=0.1, length=1, rotation=0)
                                    ax[plt_index].set_xlabel('Days', rotation=0, fontsize=8, color='b')

                                # mark deployment end times
                                dep = 1
                                for etimes in end_times:
                                    ax[plt_index].axvline(x=etimes, color='b', linestyle='--', linewidth=.8)
                                    if ma[col_name].values.any():
                                        ax[plt_index].text(etimes, max(ma[col_name].dropna(axis=0)),
                                                           'End' + str(dep), fontsize=6, style='italic',
                                                           bbox=dict(boxstyle='round', ec=(0., 0.5, 0.5),
                                                                     fc=(1., 1., 1.)))
                                    else:
                                        # BUG FIX: original indexed series_m['DO_n'], a column
                                        # that does not exist here (columns are named by month
                                        # number) -> KeyError; use col_name instead
                                        ax[plt_index].text(etimes, min(series_m[col_name]),
                                                           'End' + str(dep), fontsize=6, style='italic',
                                                           bbox=dict(boxstyle='round', ec=(0., 0.5, 0.5),
                                                                     fc=(1., 1., 1.)))
                                    dep += 1
                                tm += 1
                            tn += 1

                            sfile = '_'.join((str(x_year), sname))
                            save_file = os.path.join(save_dir, sfile)
                            fig.savefig(str(save_file), dpi=150)

                            sfile = '_'.join(('Statistics', str(x_year), sname))
                            save_file = os.path.join(save_dir, sfile)
                            fig0.savefig(str(save_file), dpi=150)
def main(url_list, sDir, plot_type):
    """
    Create profile plots (science variable vs pressure, with depth-binned mean
    and std envelope) for each science variable of each reference designator
    found in url_list.

    :param url_list: list of THREDDS urls containing .nc file datasets (by method)
    :param sDir: path to the directory on the local machine to save files
    :param plot_type: folder name for this plot type
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the data files; filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # separate the data files by methods
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a folder to save figures
            save_dir = os.path.join(sDir, array, subsite, r, plot_type, ms.split('-')[0])
            cf.create_dir(save_dir)

            # create a dictionary for science variables from the analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)

            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print('\nAppending data file: {}'.format(fd.split('/')[-1]))
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:
                                # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                            else:
                                pressure = 'int_ctd_pressure'
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                        y = ds[pressure].values
                        if ds[pressure].units not in y_unit:
                            y_unit.append(ds[pressure].units)
                        if ds[pressure].long_name not in y_name:
                            y_name.append(ds[pressure].long_name)
                        sh['pressure'] = np.append(sh['pressure'], y)

            if len(y_unit) != 1:
                print('pressure unit varies!')
            else:
                y_unit = y_unit[0]
            if len(y_name) != 1:
                print('pressure long name varies!')
            else:
                y_name = y_name[0]

            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print('\nWorking on variable: {}'.format(sv))
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                        continue

                    sv_units = vinfo['units'][0]
                    fv = vinfo['fv'][0]
                    t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                    t = vinfo['t']
                    x = vinfo['values']
                    y = vinfo['pressure']

                    # Check if the array is all NaNs
                    if sum(np.isnan(x)) == len(x):
                        print('Array of all NaNs - skipping plot.')
                        continue
                    # Check if the array is all fill values
                    elif len(x[x != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue
                    else:
                        # reject fill values
                        fv_ind = x != fv
                        y_nofv = y[fv_ind]
                        t_nofv = t[fv_ind]
                        c_nofv = cm.rainbow(np.linspace(0, 1, len(t[fv_ind])))
                        x_nofv = x[fv_ind]
                        # BUG FIX: original printed len(x) - len(fv_ind), which is
                        # always 0 (a boolean mask has the same length as x)
                        print(len(x) - len(x_nofv), ' fill values')

                        # reject NaNs
                        # BUG FIX: original computed the mask on the unfiltered x
                        # but indexed the fill-value-filtered arrays -> length
                        # mismatch; the mask must come from x_nofv
                        nan_ind = ~np.isnan(x_nofv)
                        t_nofv_nonan = t_nofv[nan_ind]
                        c_nofv_nonan = c_nofv[nan_ind]
                        y_nofv_nonan = y_nofv[nan_ind]
                        x_nofv_nonan = x_nofv[nan_ind]
                        print(len(x_nofv) - len(x_nofv_nonan), ' NaNs')

                        # reject extreme values
                        ev_ind = cf.reject_extreme_values(x_nofv_nonan)
                        t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                        c_nofv_nonan_noev = c_nofv_nonan[ev_ind]
                        y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                        x_nofv_nonan_noev = x_nofv_nonan[ev_ind]
                        # BUG FIX: original used a stale 'z' from the appending loop
                        print(len(x_nofv_nonan) - len(x_nofv_nonan_noev), ' Extreme Values', '|1e7|')

                        # reject values outside global ranges
                        global_min, global_max = cf.get_global_ranges(r, sv)
                        # platform not in qc-table (parad_k_par)
                        # global_min = 0
                        # global_max = 2500
                        print('global ranges for : {}-{} {} - {}'.format(r, sv, global_min, global_max))
                        if isinstance(global_min, (int, float)) and isinstance(global_max, (int, float)):
                            gr_ind = cf.reject_global_ranges(x_nofv_nonan_noev, global_min, global_max)
                            t_nofv_nonan_noev_nogr = t_nofv_nonan_noev[gr_ind]
                            y_nofv_nonan_noev_nogr = y_nofv_nonan_noev[gr_ind]
                            x_nofv_nonan_noev_nogr = x_nofv_nonan_noev[gr_ind]
                        else:
                            t_nofv_nonan_noev_nogr = t_nofv_nonan_noev
                            y_nofv_nonan_noev_nogr = y_nofv_nonan_noev
                            x_nofv_nonan_noev_nogr = x_nofv_nonan_noev

                        if len(x_nofv_nonan_noev) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                            else:
                                sname = '-'.join((r, m, sv))

                            if sv != 'pressure':
                                columns = ['tsec', 'dbar', str(sv)]

                                # select depth bin size for the data group function
                                bin_size = 10
                                min_r = int(round(min(y_nofv_nonan_noev) - bin_size))
                                max_r = int(round(max(y_nofv_nonan_noev) + bin_size))
                                ranges = list(range(min_r, max_r, bin_size))
                                groups, d_groups = gt.group_by_depth_range(t_nofv_nonan_noev_nogr,
                                                                           y_nofv_nonan_noev_nogr,
                                                                           x_nofv_nonan_noev_nogr,
                                                                           columns, ranges)

                                # per-depth-bin statistics for the mean/std overlay
                                y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr = [], [], [], [], [], [], []
                                tm = 1
                                for ii in range(len(groups)):
                                    nan_ind = d_groups[ii + tm].notnull()
                                    xtime = d_groups[ii + tm][nan_ind]
                                    colors = cm.rainbow(np.linspace(0, 1, len(xtime)))
                                    ypres = d_groups[ii + tm + 1][nan_ind]
                                    nval = d_groups[ii + tm + 2][nan_ind]
                                    tm += 2
                                    l_arr.append(len(nval))  # count of data to filter out small groups
                                    y_avg.append(ypres.mean())
                                    n_avg.append(nval.mean())
                                    n_min.append(nval.min())
                                    n_max.append(nval.max())
                                    n_std = 3
                                    n0_std.append(nval.mean() + n_std * nval.std())
                                    n1_std.append(nval.mean() - n_std * nval.std())

                                # Plot all data
                                ylabel = y_name + " (" + y_unit + ")"
                                xlabel = sv + " (" + sv_units + ")"
                                clabel = 'Time'
                                fig, ax = pf.plot_profiles(x_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                                                           t_nofv_nonan_noev_nogr, ylabel, xlabel, clabel,
                                                           end_times, deployments, stdev=None)
                                title_text = ' '.join((r, ms.split('-')[-1])) + '\n' \
                                             + t0 + ' - ' + t1 + '\n' + str(bin_size) + \
                                             ' m average and ' + str(n_std) + ' std shown'
                                ax.set_title(title_text, fontsize=9)
                                ax.plot(n_avg, y_avg, '-k')
                                ax.fill_betweenx(y_avg, n0_std, n1_std, color='m', alpha=0.2)
                                pf.save_fig(save_dir, sname)

                                # Plot data with outliers removed
                                fig, ax = pf.plot_profiles(x_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                                                           t_nofv_nonan_noev_nogr, ylabel, xlabel, clabel,
                                                           end_times, deployments, stdev=5)
                                ax.set_title(' '.join((r, ms.split('-')[-1])) + '\n'
                                             + t0 + ' - ' + t1, fontsize=9)
                                sfile = '_'.join((sname, 'rmoutliers'))
                                pf.save_fig(save_dir, sfile)
def main(sDir, url_list):
    """
    Create daily timeseries panel plots (one figure per month, one 7x5-grid
    panel per calendar day) for each science variable of each reference
    designator found in url_list.

    :param sDir: top-level directory on the local machine to save figures
    :param url_list: list of THREDDS urls containing .nc file datasets
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # filter datasets belonging to this reference designator
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)

        methodstream = []
        for f in fdatasets:
            methodstream.append('-'.join((f.split('/')[-2].split('-')[-2],
                                          f.split('/')[-2].split('-')[-1])))

        for ms in np.unique(methodstream):
            fdatasets_sel = [x for x in fdatasets if ms in x]
            save_dir = os.path.join(sDir, array, subsite, r, 'timeseries_daily_plots', ms.split('-')[0])
            cf.create_dir(save_dir)

            # science variables (from the analysis file) for this method-stream
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)

            print('\nAppending data from files: {}'.format(ms))
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)
                        tD = ds['time'].values
                        varD = ds[var].values
                        sh['t'] = np.append(sh['t'], tD)
                        sh['values'] = np.append(sh['values'], varD)

            print('\nPlotting data')
            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                        continue

                    sv_units = vinfo['units'][0]
                    t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                    x = vinfo['t']
                    y = vinfo['values']

                    # reject NaNs
                    nan_ind = ~np.isnan(y)
                    x_nonan = x[nan_ind]
                    y_nonan = y[nan_ind]

                    # reject fill values
                    fv_ind = y_nonan != vinfo['fv'][0]
                    x_nonan_nofv = x_nonan[fv_ind]
                    y_nonan_nofv = y_nonan[fv_ind]

                    # reject extreme values (|1e7|)
                    Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                    y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                    x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                    # reject values outside global ranges
                    # BUG FIX: guard against missing (None) global ranges before
                    # rejecting, consistent with the other routines in this file
                    global_min, global_max = cf.get_global_ranges(r, sv)
                    if isinstance(global_min, (int, float)) and isinstance(global_max, (int, float)):
                        gr_ind = cf.reject_global_ranges(y_nonan_nofv_nE, global_min, global_max)
                        y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                        x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]
                    else:
                        y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                        x_nonan_nofv_nE_nogr = x_nonan_nofv_nE

                    if len(y_nonan_nofv) > 0:
                        if m == 'common_stream_placeholder':
                            sname = '-'.join((r, sv))
                        else:
                            sname = '-'.join((r, m, sv))

                        # 1st group by year
                        ygroups, gy_data = gt.group_by_timerange(x_nonan_nofv_nE_nogr, y_nonan_nofv_nE_nogr, 'A')
                        tn = 1
                        for yr in range(len(ygroups)):
                            x_time = gy_data[yr + tn].dropna(axis=0)
                            y_data = gy_data[yr + (tn + 1)].dropna(axis=0)

                            # 2nd group by month
                            mgroups, gm_data = gt.group_by_timerange(x_time.values, y_data.values, 'M')
                            if len(x_time) == 0:
                                continue

                            td = 1
                            for jj in range(len(mgroups)):
                                x_time = gm_data[jj + td].dropna(axis=0)
                                y_data = gm_data[jj + (td + 1)].dropna(axis=0)
                                if len(x_time) == 0:
                                    # NOTE(review): 'continue' skips the td increment
                                    # below, mirroring the original indexing behavior
                                    continue

                                # 3rd group by day
                                dgroups, gd_data = gt.group_by_timerange(x_time.values, y_data.values, 'D')
                                x_year = x_time[0].year
                                x_month = x_time[0].month
                                month_name = calendar.month_abbr[x_month]
                                print(x_year, x_month)
                                sfile = '_'.join((str(x_year), str(x_month), sname))

                                # plot layout: 7x5 grid of calendar-day panels
                                fig, ax = pyplot.subplots(nrows=7, ncols=5, sharey=True)
                                title_in = month_name + '-' + str(x_year) + \
                                           ' calendar days \n Parameter: ' + \
                                           sv + " (" + sv_units + ")"
                                ax[0][2].text(0.5, 1.5, title_in, horizontalalignment='center',
                                              fontsize=8, transform=ax[0][2].transAxes)

                                # number the panels 1..35 and map day-of-month -> grid cell
                                num_i = 0
                                day_i = {}
                                for kk in list(range(0, 7)):
                                    for ff in list(range(0, 5)):
                                        num_i += 1
                                        day_i[num_i] = [kk, ff]
                                        ax[kk][ff].tick_params(axis='both', which='both', color='r',
                                                               labelsize=7, labelcolor='m', rotation=0)
                                        ax[kk][ff].text(0.1, 0.75, str(num_i), horizontalalignment='center',
                                                        fontsize=7, transform=ax[kk][ff].transAxes,
                                                        bbox=dict(boxstyle="round", ec=(0., 0.5, 0.5),
                                                                  fc=(1., 1., 1.)))
                                        # BUG FIX: replaced 'is'/'is not' identity checks on int
                                        # literals with ==/!= (SyntaxWarning; relied on CPython
                                        # small-int caching)
                                        if kk != 6:
                                            ax[kk][ff].tick_params(labelbottom=False)
                                        if ff != 0:
                                            ax[kk][ff].tick_params(labelright=False)
                                        if kk == 6 and ff == 0:
                                            ax[kk][ff].set_xlabel('Hours', rotation=0, fontsize=8, color='b')
                                        if kk == 6 and ff in list(range(1, 5)):
                                            fig.delaxes(ax[kk][ff])

                                tm = 1
                                for mt in range(len(dgroups)):
                                    x_time = gd_data[mt + tm].dropna(axis=0)
                                    y_DO = gd_data[mt + (tm + 1)].dropna(axis=0)
                                    series_m = pd.DataFrame(columns=['DO_n'], index=x_time)
                                    series_m['DO_n'] = list(y_DO[:])
                                    if len(x_time) == 0:
                                        continue

                                    x_day = x_time[0].day
                                    print(x_time[0].year, x_time[0].month, x_day)
                                    i0 = day_i[x_day][0]
                                    i1 = day_i[x_day][1]

                                    # data + 1-hour rolling mean and +/-3 std envelope
                                    series_m.plot(ax=ax[i0][i1], linestyle='None', marker='.', markersize=1)
                                    ax[i0][i1].legend().set_visible(False)
                                    ma = series_m.rolling('3600s').mean()
                                    mstd = series_m.rolling('3600s').std()
                                    ax[i0][i1].plot(ma.index, ma.DO_n, 'b', linewidth=0.25)
                                    ax[i0][i1].fill_between(mstd.index, ma.DO_n - 3 * mstd.DO_n,
                                                            ma.DO_n + 3 * mstd.DO_n,
                                                            color='b', alpha=0.2)

                                    # time axis: hours within the day
                                    datemin = datetime.datetime(x_year, x_month, x_day, 0)
                                    datemax = datetime.datetime(x_year, x_month, x_day, 23)
                                    ax[i0][i1].set_xlim(datemin, datemax)
                                    xLocator = mdates.HourLocator(interval=4)  # every 4 hours
                                    myFmt = mdates.DateFormatter('%H')
                                    ax[i0][i1].xaxis.set_minor_locator(xLocator)
                                    ax[i0][i1].xaxis.set_minor_formatter(myFmt)
                                    ax[i0][i1].xaxis.set_major_locator(pyplot.NullLocator())
                                    ax[i0][i1].xaxis.set_major_formatter(pyplot.NullFormatter())
                                    yLocator = MaxNLocator(prune='both', nbins=3)
                                    ax[i0][i1].yaxis.set_major_locator(yLocator)

                                    if x_day != 31:
                                        ax[i0][i1].tick_params(labelbottom=False)
                                        ax[i0][i1].set_xlabel(' ')
                                    else:
                                        ax[i0][i1].tick_params(which='both', color='r', labelsize=7,
                                                               labelcolor='m', length=0.1, pad=0.1)
                                        ax[i0][i1].set_xlabel('Hours', rotation=0, fontsize=8, color='b')

                                    # mark deployment end times
                                    ymin, ymax = ax[i0][i1].get_ylim()
                                    dep = 1
                                    for etimes in end_times:
                                        ax[i0][i1].axvline(x=etimes, color='b', linestyle='--', linewidth=.6)
                                        ax[i0][i1].text(etimes, ymin + 50, str(dep), fontsize=6, style='italic',
                                                        bbox=dict(boxstyle="round", ec=(0., 0.5, 0.5),
                                                                  fc=(1., 1., 1.)))
                                        dep += 1
                                    tm += 1
                                td += 1
                                pf.save_fig(save_dir, sfile)
                            tn += 1
def main(url_list, sDir, plot_type):
    """
    Create cross-section plots (time vs pressure, colored by the science
    variable) for each science variable of each reference designator found in
    url_list, including versions with outliers and annotated "suspect" time
    ranges removed.

    :param url_list: list of THREDDS urls containing .nc file datasets (by method)
    :param sDir: path to the directory on the local machine to save files
    :param plot_type: folder name for this plot type
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        # get preferred stream (FIX: the original called
        # cf.get_preferred_stream_info(r) twice; the duplicate call is removed)
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = cf.get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the data files; filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # separate data files by methods
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a folder to save figures
            save_dir = os.path.join(sDir, array, subsite, r, plot_type, ms.split('-')[0])
            cf.create_dir(save_dir)

            # create a dictionary for science variables from the analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)

            print('\nAppending data from files: {}'.format(ms))
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print('\nAppending data file: {}'.format(fd.split('/')[-1]))
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:
                                # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                            else:
                                pressure = 'int_ctd_pressure'
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                        y = ds[pressure].values
                        sh['pressure'] = np.append(sh['pressure'], y)

                        # record pressure units / long_name, tolerating missing
                        # attributes (the original guarded only the non-MOAS branch)
                        try:
                            if ds[pressure].units not in y_unit:
                                y_unit.append(ds[pressure].units)
                        except AttributeError:
                            print('pressure attributes missing units')
                            if 'pressure unit missing' not in y_unit:
                                y_unit.append('pressure unit missing')
                        try:
                            if ds[pressure].long_name not in y_name:
                                y_name.append(ds[pressure].long_name)
                        except AttributeError:
                            print('pressure attributes missing long_name')
                            if 'pressure long name missing' not in y_name:
                                y_name.append('pressure long name missing')

            # resolve pressure unit / long name when files disagree
            # NOTE(review): if units vary and 'dbar' is not among them, y_unit
            # remains a list and the axis-label concatenation below will fail --
            # preserved from the original; confirm intended handling
            if len(y_unit) != 1:
                print('pressure unit varies')
                if 'dbar' in y_unit:
                    y_unit = 'dbar'
                    print(y_unit)
            else:
                y_unit = y_unit[0]
            if len(y_name) != 1:
                print('pressure long name varies')
                if 'Seawater Pressure' in y_name:
                    y_name = 'Seawater Pressure'
                    print(y_name)
            else:
                y_name = y_name[0]

            # create a folder to save variables statistics
            # TODO(review): hardcoded user-specific path -- should be configurable
            mDir = '/Users/leila/Documents/NSFEduSupport/github/data-review-tools/data_review/final_stats'
            save_dir_stat = os.path.join(mDir, array, subsite)
            cf.create_dir(save_dir_stat)

            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                        continue

                    sv_units = vinfo['units'][0]
                    fv = vinfo['fv'][0]
                    t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                    t = vinfo['t']
                    z = vinfo['values']
                    y = vinfo['pressure']
                    title = ' '.join((r, ms))

                    # Check if the array is all NaNs
                    if sum(np.isnan(z)) == len(z):
                        print('Array of all NaNs - skipping plot.')
                        continue
                    # Check if the array is all fill values
                    elif len(z[z != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue
                    else:
                        # reject fill values
                        fv_ind = z != fv
                        y_nofv = y[fv_ind]
                        t_nofv = t[fv_ind]
                        z_nofv = z[fv_ind]
                        # BUG FIX: original printed len(z) - len(fv_ind), which is
                        # always 0 (a boolean mask has the same length as z); diff
                        # the filtered array lengths (as reject_erroneous_data does)
                        print(len(z) - len(z_nofv), ' fill values')

                        # reject NaNs
                        nan_ind = ~np.isnan(z_nofv)
                        t_nofv_nonan = t_nofv[nan_ind]
                        y_nofv_nonan = y_nofv[nan_ind]
                        z_nofv_nonan = z_nofv[nan_ind]
                        print(len(z_nofv) - len(z_nofv_nonan), ' NaNs')

                        # reject extreme values
                        ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                        t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                        y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                        z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                        print(len(z_nofv_nonan) - len(z_nofv_nonan_noev), ' Extreme Values', '|1e7|')

                        # reject values outside global ranges
                        global_min, global_max = cf.get_global_ranges(r, sv)
                        # platform not in qc-table (parad_k_par)
                        # global_min = 0
                        # global_max = 2500
                        print('global ranges for : {}-{} {} - {}'.format(r, sv, global_min, global_max))
                        if isinstance(global_min, (int, float)) and isinstance(global_max, (int, float)):
                            gr_ind = cf.reject_global_ranges(z_nofv_nonan_noev, global_min, global_max)
                            t_nofv_nonan_noev_nogr = t_nofv_nonan_noev[gr_ind]
                            y_nofv_nonan_noev_nogr = y_nofv_nonan_noev[gr_ind]
                            z_nofv_nonan_noev_nogr = z_nofv_nonan_noev[gr_ind]
                        else:
                            t_nofv_nonan_noev_nogr = t_nofv_nonan_noev
                            y_nofv_nonan_noev_nogr = y_nofv_nonan_noev
                            z_nofv_nonan_noev_nogr = z_nofv_nonan_noev

                        if len(z_nofv_nonan_noev) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                            else:
                                sname = '-'.join((r, m, sv))
                            sname = '_'.join((sname, sv_units))
                            # (a large commented-out depth-binned statistics export
                            # block was removed here as dead code)

                            # Plot all data
                            clabel = sv + " (" + sv_units + ")"
                            ylabel = y_name + " (" + y_unit + ")"
                            fig, ax = pf.plot_xsection(subsite, t_nofv_nonan_noev, y_nofv_nonan_noev,
                                                       z_nofv_nonan_noev, clabel, ylabel, stdev=None)
                            ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                            pf.save_fig(save_dir, sname)

                            # Plot data with outliers removed
                            fig, ax = pf.plot_xsection(subsite, t_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                                                       z_nofv_nonan_noev_nogr, clabel, ylabel, stdev=5)
                            ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                            sfile = '_'.join((sname, 'rmoutliers'))
                            pf.save_fig(save_dir, sfile)

                            # plot data with annotated exclusion time ranges removed
                            dr = pd.read_csv('https://datareview.marine.rutgers.edu/notes/export')
                            drn = dr.loc[dr.type == 'exclusion']
                            if len(drn) != 0:
                                subsite_node = '-'.join((subsite, r.split('-')[1]))
                                drne = drn.loc[drn.reference_designator.isin([subsite, subsite_node, r])]
                                t_ex = t_nofv_nonan_noev_nogr
                                y_ex = y_nofv_nonan_noev_nogr
                                z_ex = z_nofv_nonan_noev_nogr
                                for i, row in drne.iterrows():
                                    sdate = cf.format_dates(row.start_date)
                                    edate = cf.format_dates(row.end_date)
                                    ts = np.datetime64(sdate)
                                    te = np.datetime64(edate)
                                    # keep only samples outside the excluded window
                                    ind = np.where((t_ex < ts) | (t_ex > te), True, False)
                                    if len(ind) != 0:
                                        t_ex = t_ex[ind]
                                        z_ex = z_ex[ind]
                                        y_ex = y_ex[ind]

                                fig, ax = pf.plot_xsection(subsite, t_ex, y_ex, z_ex,
                                                           clabel, ylabel, stdev=None)
                                ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                                sfile = '_'.join((sname, 'rmsuspectdata'))
                                pf.save_fig(save_dir, sfile)
def main(url_list, sDir, plot_type, deployment_num, start_time, end_time):
    """
    Build per-variable profile plots and depth-binned statistics for each
    reference designator / delivery method found in url_list.

    :param url_list: list of THREDDS urls to instrument data, by delivery method
    :param sDir: path to the directory on your machine to save files
    :param plot_type: folder name for a plot type
    :param deployment_num: deployment number (int) to plot, or None for all
    :param start_time: optional start of a time range to plot (None = all data)
    :param end_time: optional end of a time range to plot (None = all data)
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    ''' separate different instruments '''
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        ''' separate the data files by methods '''
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)
            print('\nAppending data from files: {}'.format(ms))
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print(fd)
                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'
                            .format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(
                    fd)
                if deployment_num is not None:
                    # parse the deployment number the same way as above
                    # (int(deploy[-4:])); splitting on '0' breaks for
                    # deployments >= 10, and `is not` compared int identity
                    # instead of equality
                    if int(deployment[-4:]) != deployment_num:
                        print('skipping {} (deployment {} requested)'.format(
                            deployment, deployment_num))
                        continue

                save_dir = os.path.join(sDir, array, subsite, refdes,
                                        plot_type, ms.split('-')[0],
                                        deployment)
                cf.create_dir(save_dir)

                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(
                            t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(
                            t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                            else:
                                pressure = 'int_ctd_pressure'
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                        y = ds[pressure].values
                        if ds[pressure].units not in y_unit:
                            y_unit.append(ds[pressure].units)
                        if ds[pressure].long_name not in y_name:
                            y_name.append(ds[pressure].long_name)

                        sh['pressure'] = np.append(sh['pressure'], y)

            if len(y_unit) != 1:
                print('pressure unit varies UHHHHHHHHH')
            else:
                y_unit = y_unit[0]

            if len(y_name) != 1:
                print('pressure long name varies UHHHHHHHHH')
            else:
                y_name = y_name[0]

            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        fv = vinfo['fv'][0]
                        t0 = pd.to_datetime(min(
                            vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(
                            vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t = vinfo['t']
                        z = vinfo['values']
                        y = vinfo['pressure']

                        title = ' '.join((r, ms.split('-')[1]))

                        # Check if the array is all NaNs
                        if sum(np.isnan(z)) == len(z):
                            print('Array of all NaNs - skipping plot.')

                        # Check if the array is all fill values
                        elif len(z[z != fv]) == 0:
                            print('Array of all fill values - skipping plot.')

                        else:
                            # reject fill values
                            fv_ind = z != fv
                            y_nofv = y[fv_ind]
                            t_nofv = t[fv_ind]
                            z_nofv = z[fv_ind]
                            # count the rejected points (a boolean mask has the
                            # same length as z, so len(z) - len(fv_ind) was
                            # always 0)
                            print(len(z) - len(z_nofv), ' fill values')

                            # reject NaNs: the mask must be built from the
                            # already-filtered array so its length matches the
                            # arrays it indexes (same pattern as
                            # reject_erroneous_data)
                            nan_ind = ~np.isnan(z_nofv)
                            t_nofv_nonan = t_nofv[nan_ind]
                            y_nofv_nonan = y_nofv[nan_ind]
                            z_nofv_nonan = z_nofv[nan_ind]
                            print(len(z_nofv) - len(z_nofv_nonan), ' NaNs')

                            # reject extreme values
                            ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                            t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                            colors = cm.rainbow(
                                np.linspace(0, 1, len(t_nofv_nonan_noev)))
                            y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                            z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                            print(len(z_nofv_nonan) - len(z_nofv_nonan_noev),
                                  ' Extreme Values', '|1e7|')

                            if len(y_nofv_nonan_noev) > 0:
                                if m == 'common_stream_placeholder':
                                    sname = '-'.join((r, sv))
                                else:
                                    sname = '-'.join((r, m, sv))

                                # Plot all data
                                ylabel = y_name + " (" + y_unit + ")"
                                xlabel = sv + " (" + sv_units + ")"
                                clabel = sv + " (" + sv_units + ")"

                                fig, ax = pf.plot_profiles(z_nofv_nonan_noev,
                                                           y_nofv_nonan_noev,
                                                           colors,
                                                           xlabel,
                                                           ylabel,
                                                           stdev=None)
                                ax.set_title((
                                    title + '\n' + str(deployment_num) + ': ' +
                                    t0 + ' - ' + t1 + '\n' +
                                    'used bin = 2 dbar to calculate an average profile (black line) and 3-STD envelope (shaded area)'
                                ), fontsize=9)

                                # group by depth range (1-dbar bins spanning
                                # the observed pressure range)
                                columns = ['time', 'pressure', str(sv)]
                                ranges = list(
                                    range(int(round(min(y_nofv_nonan_noev))),
                                          int(round(max(y_nofv_nonan_noev))),
                                          1))
                                groups, d_groups = gt.group_by_depth_range(
                                    t_nofv_nonan_noev, y_nofv_nonan_noev,
                                    z_nofv_nonan_noev, columns, ranges)

                                # write statistics only for bins that have data
                                ind = groups.describe()[sv]['mean'].notnull()
                                groups.describe()[sv][ind].to_csv(
                                    '{}/{}_statistics.csv'.format(
                                        save_dir, sname), index=True)

                                # one combined scatter of all depth bins; a
                                # single axes is used throughout, so create a
                                # single subplot (subplots(nrows=2) returns an
                                # array of axes and would break ax.scatter)
                                n_std = 5  # SD threshold passed to reject_outliers
                                tm = 1
                                fig, ax = pyplot.subplots()
                                pyplot.margins(y=.08, x=.02)
                                pyplot.grid()
                                y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr = [], [], [], [], [], [], []
                                for ii in range(len(groups)):
                                    # d_groups holds (time, pressure, value)
                                    # column triplets per depth bin
                                    nan_ind = d_groups[ii + tm].notnull()
                                    xtime = d_groups[ii + tm][nan_ind]
                                    colors = cm.rainbow(
                                        np.linspace(0, 1, len(xtime)))
                                    ypres = d_groups[ii + tm + 1][nan_ind]
                                    nval = d_groups[ii + tm + 2][nan_ind]
                                    tm += 2

                                    ind2 = cf.reject_outliers(nval, n_std)
                                    xD = nval[ind2]
                                    yD = ypres[ind2]
                                    nZ = colors[ind2]
                                    outliers = str(len(nval) - len(xD))
                                    # report the SD actually used (the bare
                                    # name `stdev` was undefined here)
                                    leg_text = (
                                        'removed {} outliers (SD={})'.format(
                                            outliers, n_std), )

                                    ax.scatter(xD, yD, c=nZ, s=2,
                                               edgecolor='None')
                                    ax.invert_yaxis()
                                    ax.set_xlabel(clabel, fontsize=9)
                                    ax.set_ylabel(ylabel, fontsize=9)
                                    ax.legend(leg_text, loc='best', fontsize=6)
                                    ax.set_title((title + '\n' + t0 + ' - ' +
                                                  t1), fontsize=9)
                                    l_arr.append(
                                        len(nval)
                                    )  # count of data to filter out small groups
                                    y_avg.append(ypres.mean())
                                    n_avg.append(nval.mean())
                                    n_min.append(nval.min())
                                    n_max.append(nval.max())
                                    n0_std.append(nval.mean() +
                                                  3 * nval.std())
                                    n1_std.append(nval.mean() -
                                                  3 * nval.std())

                                # average profile (black) and 3-STD envelope
                                ax.plot(n_avg, y_avg, '-k')
                                ax.fill_betweenx(y_avg, n0_std, n1_std,
                                                 color='m', alpha=0.2)
                                sfile = '_'.join((sname, 'statistics'))
                                pf.save_fig(save_dir, sfile)