def _url_refdes(url):
    """Return the reference designator encoded in a data URL.

    The second-to-last path component of ``url`` is split on '-' and fields
    1 through 4 are re-joined with '-'.
    """
    parts = url.split('/')[-2].split('-')
    return '-'.join((parts[1], parts[2], parts[3], parts[4]))


def _fmt_time_bounds(times):
    """Return the earliest and latest entries of ``times`` as ISO-like strings."""
    fmt = '%Y-%m-%dT%H:%M:%S'
    return (pd.to_datetime(min(times)).strftime(fmt),
            pd.to_datetime(max(times)).strftime(fmt))


def _deployment_numbers(deploy_values):
    """Return the unique values of ``deploy_values`` as a list of ints."""
    return [int(dd) for dd in np.unique(deploy_values)]


def _deployment_marks(cache, r, ps_df):
    """Return ``(deployment numbers, deployment end times)`` for ``r``.

    The values are derived from the 'deployment' column of ``ps_df`` and the
    deployment information returned by the ``cf`` helpers.  The result is
    stored in ``cache`` (keyed by ``r``) so the lookup runs at most once per
    reference designator, and only when a plot is actually produced.
    """
    if r not in cache:
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        end_times = []
        for _, row in ps_df.iterrows():
            deploy_num = int(row['deployment'][-4:])
            deploy_info = cf.get_deployment_information(dr_data, deploy_num)
            deployments.append(deploy_num)
            end_times.append(pd.to_datetime(deploy_info['stop_date']))
        cache[r] = (deployments, end_times)
    return cache[r]


def _title_mark_and_save(ax, r, deployments, t0, t1, end_times, save_dir,
                         fname):
    """Title the axes, draw a dashed line at each end time, and save the figure."""
    ax.set_title((r + '\nDeployments: ' + str(sorted(deployments)) + '\n' +
                  t0 + ' - ' + t1),
                 fontsize=8)
    for etimes in end_times:
        ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
    pf.save_fig(save_dir, fname)


def _pressure_stream_rows(r, m, n, sd_calc, preferred_ms, plot_root, ps_df,
                          marks_cache):
    """Build summary rows (and plots) for one stream, restricted by pressure.

    Only data points whose 'Seawater Pressure' lies within one standard
    deviation of the mean of the valid pressure data are used.  Returns the
    list of summary rows, one per variable in ``n['vars']``.
    """
    # index the pressure variable to filter and calculate stats on the rest
    # of the variables
    vinfo_press = n['vars']['Seawater Pressure']
    fv_press = list(np.unique(vinfo_press['fv']))[0]
    pdata = vinfo_press['values']

    # valid pressure = not nan, not fill value, inside global ranges
    [pind, __, __, __, __, __] = index_dataset(r, vinfo_press['var_name'],
                                               pdata, fv_press)
    [__, pmean, __, __, psd,
     __] = cf.variable_statistics(pdata[pind], None)

    # pressure window = average of all 'valid' pressure data +/- 1 SD
    ipress_min = pmean - psd
    ipress_max = pmean + psd
    ind_press = (pdata >= ipress_min) & (pdata <= ipress_max)
    press_range = [round(ipress_min, 2), round(ipress_max, 2)]

    note = 'restricted stats calculation to data points where pressure is within defined ranges' \
           ' (average of all pressure data +/- 1 SD)'

    rows = []
    print('\nPARAMETERS:')
    for sv, vinfo in n['vars'].items():
        print(sv)
        fv_lst = np.unique(vinfo['fv']).tolist()
        if len(fv_lst) != 1:
            print('No unique fill value for {}'.format(sv))
        # Fall back to the first unique fill value (as is done for pressure
        # above) instead of reusing whatever the previous variable left behind.
        fill_value = fv_lst[0] if fv_lst else None

        lunits = np.unique(vinfo['units']).tolist()
        n_all = len(vinfo['t'])

        # filter data based on pressure index
        t_filtered = vinfo['t'][ind_press]
        data_filtered = vinfo['values'][ind_press]
        deploy_filtered = vinfo['deployments'][ind_press]
        n_excluded = n_all - len(t_filtered)

        [dataind, g_min, g_max, n_nan, n_fv,
         n_grange] = index_dataset(r, vinfo['var_name'], data_filtered,
                                   fill_value)

        t_final = t_filtered[dataind]
        data_final = data_filtered[dataind]
        deploy_final = deploy_filtered[dataind]

        # An empty selection has no time bounds; min()/max() would raise.
        has_data = len(t_final) > 0
        if has_data:
            t0, t1 = _fmt_time_bounds(t_final)
        else:
            t0 = t1 = None
        deployments = _deployment_numbers(deploy_final)

        # Reset every statistic so a variable with too little data never
        # reports the previous variable's numbers.
        num_outliers = mean = vmin = vmax = sd = n_stats = None
        if len(data_final) > 1:
            [num_outliers, mean, vmin, vmax, sd,
             n_stats] = cf.variable_statistics(data_final, sd_calc)

        rows.append([
            m, preferred_ms, deployments, sv, lunits, t0, t1, fv_lst,
            [g_min, g_max], n_all, press_range, n_excluded, n_nan, n_fv,
            n_grange, sd_calc, num_outliers, n_stats, mean, vmin, vmax, sd,
            note
        ])

        if not has_data:
            continue

        # plot CTDMO data used for stats
        psave_dir = os.path.join(plot_root, 'timeseries_plots_stats')
        cf.create_dir(psave_dir)
        plot_deployments, end_times = _deployment_marks(marks_cache, r, ps_df)

        fig, ax = pf.plot_timeseries_all(t_final, data_final, sv, lunits[0],
                                         stdev=None)
        _title_mark_and_save(ax, r, plot_deployments, t0, t1, end_times,
                             psave_dir, '-'.join((r, sv)))

        if sd_calc:
            fig, ax = pf.plot_timeseries_all(t_final, data_final, sv,
                                             lunits[0], stdev=sd_calc)
            _title_mark_and_save(ax, r, plot_deployments, t0, t1, end_times,
                                 psave_dir, '-'.join((r, sv, 'rmoutliers')))
    return rows


def _stream_rows(r, m, n, sd_calc, preferred_ms, plot_root, ps_df,
                 marks_cache, sci_vars_dict):
    """Build summary rows (and plots) for one stream without pressure filtering.

    Variables of SPKIR instruments and of 'presf_abc_wave_burst' streams are
    handled through ``index_dataset_2d`` and get one statistic per row of the
    returned data; all other variables go through ``index_dataset``.  Returns
    the list of summary rows, one per variable in ``n['vars']``.
    """
    is_spkir = 'SPKIR' in r
    is_2d = is_spkir or 'presf_abc_wave_burst' in m
    # value recorded in the 'define_stdev' column
    print_sd = sd_calc if sd_calc else None

    rows = []
    print('\nPARAMETERS: ')
    for sv, vinfo in n['vars'].items():
        print(sv)

        fv_lst = np.unique(vinfo['fv']).tolist()
        if len(fv_lst) != 1:
            print(fv_lst)
            print('No unique fill value for {}'.format(sv))
        # Fall back to the first unique fill value instead of reusing
        # whatever the previous variable left behind.
        fill_value = fv_lst[0] if fv_lst else None

        lunits = np.unique(vinfo['units']).tolist()

        t = vinfo['t']
        n_all = len(t)

        # Per-variable defaults: every field of the summary row is defined
        # even when there is nothing to analyse.
        g_min = g_max = n_nan = n_fv = n_grange = None
        deployments = t0 = t1 = None
        num_outliers = mean = vmin = vmax = sd = n_stats = None
        t_final = []
        data_final = None
        dd_data = None

        if n_all > 1:
            data = vinfo['values']
            if is_2d:
                var_2d = ('spkir_abj_cspp_downwelling_vector'
                          if is_spkir else 'presf_wave_burst_pressure')
                [dd_data, g_min, g_max, n_nan, n_fv,
                 n_grange] = index_dataset_2d(r, var_2d, data, fill_value)
                t_final = t
                t0, t1 = _fmt_time_bounds(t_final)
                deployments = _deployment_numbers(vinfo['deployments'])

                num_outliers = []
                mean = []
                vmin = []
                vmax = []
                sd = []
                n_stats = []
                for i in range(len(dd_data)):
                    # NOTE(review): statistics are taken from data[i], not
                    # dd_data[i] -- confirm index_dataset_2d filters in place,
                    # otherwise this should probably read dd_data[i].
                    dd = data[i]
                    # drop nans before calculating stats
                    dd = dd[~np.isnan(dd)]
                    [num_outliersi, meani, vmini, vmaxi, sdi,
                     n_statsi] = cf.variable_statistics(dd, sd_calc)
                    num_outliers.append(num_outliersi)
                    mean.append(meani)
                    vmin.append(vmini)
                    vmax.append(vmaxi)
                    sd.append(sdi)
                    n_stats.append(n_statsi)
            else:
                [dataind, g_min, g_max, n_nan, n_fv,
                 n_grange] = index_dataset(r, vinfo['var_name'], data,
                                           fill_value)
                t_final = t[dataind]
                if len(t_final) > 0:
                    t0, t1 = _fmt_time_bounds(t_final)
                    data_final = data[dataind]
                    deployments = _deployment_numbers(
                        vinfo['deployments'][dataind])
                    if len(data_final) > 1:
                        [num_outliers, mean, vmin, vmax, sd,
                         n_stats] = cf.variable_statistics(
                             data_final, sd_calc)

        rows.append([
            m, preferred_ms, deployments, sv, lunits, t0, t1, fv_lst,
            [g_min, g_max], n_all, n_nan, n_fv, n_grange, print_sd,
            num_outliers, n_stats, mean, vmin, vmax, sd
        ])

        if len(t_final) == 0:
            continue

        # plot data used for stats
        psave_dir = os.path.join(plot_root, 'timeseries_reviewed_datarange')
        cf.create_dir(psave_dir)
        plot_deployments, end_times = _deployment_marks(marks_cache, r, ps_df)
        sname = '-'.join((r, sv))

        # 2-D variables are dispatched first: the 1-D branches below need
        # data_final, which does not exist for 2-D data.
        if is_spkir:
            fig, ax = pf.plot_spkir(t_final, dd_data, sv, lunits[0])
            _title_mark_and_save(ax, r, plot_deployments, t0, t1, end_times,
                                 psave_dir, sname)

            # plot each wavelength
            wavelengths = [
                '412nm', '443nm', '490nm', '510nm', '555nm', '620nm', '683nm'
            ]
            for wvi in range(len(dd_data)):
                fig, ax = pf.plot_spkir_wv(t_final, dd_data[wvi], sv,
                                           lunits[0], wvi)
                _title_mark_and_save(ax, r, plot_deployments, t0, t1,
                                     end_times, psave_dir,
                                     '-'.join((sname, wavelengths[wvi])))

        elif is_2d:  # presf_abc_wave_burst
            fig, ax = pf.plot_presf_2d(t_final, dd_data, sv, lunits[0])
            _title_mark_and_save(ax, r, plot_deployments, t0, t1, end_times,
                                 psave_dir, '-'.join((sname, m)))

        elif 'streamed' in sci_vars_dict[next(iter(sci_vars_dict))]['ms'][0]:
            # plot hourly averages for streaming data
            sname = '-'.join((sname, 'hourlyavg'))
            df = pd.DataFrame({'dfx': t_final, 'dfy': data_final})
            dfr = df.resample('H', on='dfx').mean()

            fig, ax = pf.plot_timeseries_all(dfr.index, dfr['dfy'], sv,
                                             lunits[0], stdev=None)
            _title_mark_and_save(ax, r, plot_deployments, t0, t1, end_times,
                                 psave_dir, sname)

            if sd_calc:
                # file name kept exactly as before (it already carries the
                # 'hourlyavg' suffix added above)
                sname = '-'.join((sname, 'hourlyavg_rmoutliers'))
                fig, ax = pf.plot_timeseries_all(dfr.index, dfr['dfy'], sv,
                                                 lunits[0], stdev=sd_calc)
                _title_mark_and_save(ax, r, plot_deployments, t0, t1,
                                     end_times, psave_dir, sname)

        else:  # plot all data if not streamed
            fig, ax = pf.plot_timeseries_all(t_final, data_final, sv,
                                             lunits[0], stdev=None)
            _title_mark_and_save(ax, r, plot_deployments, t0, t1, end_times,
                                 psave_dir, sname)

            if sd_calc:
                fig, ax = pf.plot_timeseries_all(t_final, data_final, sv,
                                                 lunits[0], stdev=sd_calc)
                _title_mark_and_save(ax, r, plot_deployments, t0, t1,
                                     end_times, psave_dir,
                                     '-'.join((r, sv, 'rmoutliers')))
    return rows


def main(sDir, plotting_sDir, url_list, sd_calc):
    """Summarise the data range of every science variable and plot the data used.

    For each reference designator found in ``url_list`` the preferred
    datasets are collected, time ranges flagged as 'exclusion' in the data
    review notes are passed on to ``cd.append_science_data``, statistics are
    computed per science variable, and

    * one summary file ``<sDir>/<array>/<subsite>/<refdes>_data_ranges.csv``
      is written, and
    * timeseries plots are saved below
      ``<plotting_sDir>/<array>/<subsite>/<refdes>/``.

    Flanking-mooring CTDMO instruments ('FLM' and 'CTDMO' both in the
    reference designator) are analysed only where pressure is within one
    standard deviation of the mean valid pressure.

    :param sDir: root directory for the summary CSV files
    :param plotting_sDir: root directory for the plots
    :param url_list: data URLs; the second-to-last path component of each
        URL must encode the reference designator (see ``_url_refdes``)
    :param sd_calc: forwarded to ``cf.variable_statistics`` and, as
        ``stdev``, to ``pf.plot_timeseries_all``; a falsy value disables the
        extra 'rmoutliers' plots
    :return: None
    """
    rd_list = []
    for uu in url_list:
        rd = _url_refdes(uu)
        if rd not in rd_list:
            rd_list.append(rd)

    # Nothing to analyse: skip the remote download of the review notes.
    if not rd_list:
        return

    dr = pd.read_csv('https://datareview.marine.rutgers.edu/notes/export')
    drn = dr.loc[dr.type == 'exclusion']

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = [cf.get_nc_urls([u]) for u in url_list
                    if _url_refdes(u) == r]
        datasets = list(itertools.chain(*datasets))

        # get the preferred stream information (fetched once per refdes)
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # keep only files of the preferred method-stream for each deployment
        fdatasets = []
        pms = []
        for index, row in ps_df.iterrows():
            for ii in range(n_streams):
                try:
                    rms = '-'.join((r, row[ii]))
                except TypeError:
                    # non-string entry (no stream in this slot)
                    continue
                pms.append(row[ii])
                for dd in datasets:
                    spl = dd.split('/')[-2].split('-')
                    catalog_rms = '-'.join(
                        (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                    fdeploy = dd.split('/')[-1].split('_')[0]
                    if rms == catalog_rms and fdeploy == row['deployment']:
                        fdatasets.append(dd)
        preferred_ms = list(np.unique(pms))

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(
            main_sensor, fdatasets)

        # find time ranges to exclude from analysis for data review database
        subsite = r.split('-')[0]
        subsite_node = '-'.join((subsite, r.split('-')[1]))

        drne = drn.loc[drn.reference_designator.isin(
            [subsite, subsite_node, r])]
        et = []
        for i, row in drne.iterrows():
            sdate = cf.format_dates(row.start_date)
            edate = cf.format_dates(row.end_date)
            et.append([sdate, edate])

        # get science variable long names from the Data Review Database
        stream_sci_vars = cd.sci_var_long_names(r)

        # check if the science variable long names are the same for each stream
        sci_vars_dict = cd.sci_var_long_names_check(stream_sci_vars)

        # build dictionary of science data from the preferred dataset for
        # each deployment
        print('\nAppending data from files')
        sci_vars_dict, pressure_unit, pressure_name = cd.append_science_data(
            ps_df, n_streams, r, fdatasets_sel, sci_vars_dict, et)

        # analyze combined dataset
        print('\nAnalyzing combined dataset and writing summary file')

        array = subsite[0:2]
        save_dir = os.path.join(sDir, array, subsite)
        cf.create_dir(save_dir)
        plot_root = os.path.join(plotting_sDir, array, subsite, r)

        # Flanking Mooring CTDMO stats are calculated based on pressure
        by_pressure = ('FLM' in r) and ('CTDMO' in r)
        if by_pressure:
            headers = [
                'common_stream_name', 'preferred_methods_streams',
                'deployments', 'long_name', 'units', 't0', 't1', 'fill_value',
                'global_ranges', 'n_all', 'press_min_max',
                'n_excluded_forpress', 'n_nans', 'n_fillvalues', 'n_grange',
                'define_stdev', 'n_outliers', 'n_stats', 'mean', 'min', 'max',
                'stdev', 'note'
            ]
        else:
            headers = [
                'common_stream_name', 'preferred_methods_streams',
                'deployments', 'long_name', 'units', 't0', 't1', 'fill_value',
                'global_ranges', 'n_all', 'n_nans', 'n_fillvalues', 'n_grange',
                'define_stdev', 'n_outliers', 'n_stats', 'mean', 'min', 'max',
                'stdev'
            ]

        rows = []
        marks_cache = {}
        for m, n in sci_vars_dict.items():
            print('\nSTREAM: ', m)
            if m == 'common_stream_placeholder':
                m = 'science_data_stream'
            if m == 'metbk_hourly':  # don't calculate ranges for metbk_hourly
                continue

            if by_pressure:
                rows.extend(
                    _pressure_stream_rows(r, m, n, sd_calc, preferred_ms,
                                          plot_root, ps_df, marks_cache))
            else:
                rows.extend(
                    _stream_rows(r, m, n, sd_calc, preferred_ms, plot_root,
                                 ps_df, marks_cache, sci_vars_dict))

        fsum = pd.DataFrame(rows, columns=headers)
        fsum.to_csv('{}/{}_data_ranges.csv'.format(save_dir, r), index=False)
예제 #2
0
def main(url_list, sDir, plot_type, start_time, end_time, deployment_num):
    """Plot the latitude/longitude track of the platform behind each URL.

    For every URL the matching data files are opened, optionally restricted
    to a time window and/or a single deployment, and their 'lat'/'lon'
    values are combined into one track that is plotted (coloured by time)
    and saved under ``<sDir>/<array>/<subsite>/<refdes>/<plot_type>``.

    :param url_list: data URLs; the second-to-last path component of each
        URL must encode the reference designator and method-stream
    :param sDir: root directory for the plots
    :param plot_type: name of the sub-directory the plot is saved in
    :param start_time: start of the time window, or None; the window is
        applied only when both ``start_time`` and ``end_time`` are given
    :param end_time: end of the time window, or None
    :param deployment_num: deployment number (int) to plot, or None for all
    :return: None
    """
    for u in url_list:
        elements = u.split('/')[-2].split('-')
        r = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = u.split(r + '-')[1].split('/')[0]
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]
        datasets = cf.get_nc_urls([u])
        datasets_sel = cf.filter_collocated_instruments(main_sensor, datasets)

        save_dir = os.path.join(sDir, array, subsite, r, plot_type)
        cf.create_dir(save_dir)
        sname = '-'.join((r, ms, 'track'))

        print('Appending....')
        frames = []
        deployments = []
        end_times = []
        marks_loaded = False
        for ii, d in enumerate(datasets_sel):
            print('\nDataset {} of {}: {}'.format(ii + 1, len(datasets_sel),
                                                  d.split('/')[-1]))
            # the context manager closes the file once its values are read
            with xr.open_dataset(d, mask_and_scale=False) as raw:
                ds = raw.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'.
                            format(start_time, end_time))
                        continue

                # ds_subsite is kept separate so the subsite derived from the
                # URL above is not overwritten
                (fname, ds_subsite, refdes, method, stream,
                 deployment) = cf.nc_attributes(d)

                if deployment_num is not None:
                    # Compare by value, and read the number from the last four
                    # characters as is done for ps_df deployments below
                    # (assumes 'deploymentNNNN' -- TODO confirm against
                    # cf.nc_attributes).
                    if int(deployment[-4:]) != deployment_num:
                        continue

                # get end times of deployments; they depend only on r, so
                # look them up once, for the first dataset that is kept
                if not marks_loaded:
                    ps_df, n_streams = cf.get_preferred_stream_info(r)
                    dr_data = cf.refdes_datareview_json(r)

                    for index, row in ps_df.iterrows():
                        deploy_num = int(row['deployment'][-4:])
                        deploy_info = cf.get_deployment_information(
                            dr_data, deploy_num)
                        if deploy_num not in deployments:
                            deployments.append(deploy_num)
                        stop_date = pd.to_datetime(deploy_info['stop_date'])
                        if stop_date not in end_times:
                            end_times.append(stop_date)
                    marks_loaded = True

                data = {'lat': ds['lat'].values, 'lon': ds['lon'].values}
                frames.append(
                    pd.DataFrame(data,
                                 columns=['lat', 'lon'],
                                 index=ds['time'].values))

        if not frames:
            # nothing survived the time/deployment filters for this URL
            print('No data to plot for {}'.format(r))
            continue

        # single concat instead of repeated DataFrame.append
        sh = pd.concat(frames)

        xD = sh.lon.values
        yD = sh.lat.values
        tD = sh.index.values

        clabel = 'Time'
        ylabel = 'Latitude'
        xlabel = 'Longitude'

        fig, ax = pf.plot_profiles(xD,
                                   yD,
                                   tD,
                                   ylabel,
                                   xlabel,
                                   clabel,
                                   end_times,
                                   deployments,
                                   stdev=None)
        ax.invert_yaxis()
        ax.set_title('Glider Track - ' + r + '\n' + 'x: platform location',
                     fontsize=9)
        ax.set_xlim(-71.75, -69.75)
        ax.set_ylim(38.75, 40.75)
        #cbar.ax.set_yticklabels(end_times)

        # add Pioneer glider sampling area
        ax.add_patch(
            Rectangle((-71.5, 39.0),
                      1.58,
                      1.67,
                      linewidth=3,
                      edgecolor='b',
                      facecolor='none'))
        ax.text(-71,
                40.6,
                'Pioneer Glider Sampling Area',
                color='blue',
                fontsize=8)
        # add Pioneer AUV sampling area
        # ax.add_patch(Rectangle((-71.17, 39.67), 0.92, 1.0, linewidth=3, edgecolor='m', facecolor='none'))

        array_loc = cf.return_array_subsites_standard_loc(array)

        ax.scatter(array_loc.lon,
                   array_loc.lat,
                   s=40,
                   marker='x',
                   color='k',
                   alpha=0.3)
        #ax.legend(legn, array_loc.index, scatterpoints=1, loc='lower left', ncol=4, fontsize=8)

        pf.save_fig(save_dir, sname)
예제 #3
0
def main(url_list, sDir, mDir, zcell_size, zdbar, start_time, end_time, inpercentile):
    """Write depth-binned data ranges and cross-section plots for each instrument.

    For every reference designator found in ``url_list`` the data files of the
    preferred stream are selected, the science data are appended, descriptive
    statistics grouped by depth range are written to
    ``<mDir>/<array>/<subsite>/<refdes>_data_ranges.csv`` and one cross-section
    figure per science variable is saved under
    ``<sDir>/<array>/<subsite>/<refdes>/data_range``.

    url_list : paths to instrument data by methods
    sDir : path to the directory on your machine to save plots
    mDir : path to the directory on your machine to save data ranges
    zcell_size : depth cell size to group data
    zdbar : define depth where suspect data are identified
    start_time : select start date to slice timeseries
    end_time : select end date to slice timeseries
    inpercentile : percentile handed to the cross-section plot and quoted in the figure title
    """
    # unique reference designators and method-stream names present in the urls
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get the preferred stream information
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get science variable long names from the Data Review Database
        stream_sci_vars = cd.sci_var_long_names(r)

        # check if the science variable long names are the same for each stream and initialize empty arrays
        sci_vars_dict0 = cd.sci_var_long_names_check(stream_sci_vars)

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # select the list of data files from the preferred dataset for each deployment
        fdatasets_final = [
            x
            for deploy, stream in zip(ps_df['deployment'], ps_df[0])
            for x in fdatasets
            if deploy in x and stream in x
        ]

        # build dictionary of science data from the preferred dataset for each deployment
        print('\nAppending data from files')
        sci_vars_dict, y_unit, y_name, l0 = cd.append_evaluated_science_data(
            sDir, ps_df, n_streams, r, fdatasets_final, sci_vars_dict0, zdbar, start_time, end_time)

        # get end times of deployments
        deployments = []
        end_times = []
        for _, row in ps_df.iterrows():
            deploy_num = int(row['deployment'][-4:])
            deploy_info = cf.get_deployment_information(dr_data, deploy_num)
            deployments.append(deploy_num)
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # comma-separated deployment numbers stored next to every statistics row
        t_deploy = ', '.join(str(d) for d in deployments)

        # create data range output folders
        save_dir_stat = os.path.join(mDir, array, subsite)
        cf.create_dir(save_dir_stat)
        # create plots output folder
        save_fdir = os.path.join(sDir, array, subsite, r, 'data_range')
        cf.create_dir(save_fdir)

        # per-variable statistics tables; concatenated when the csv is written
        # (DataFrame.append no longer exists in pandas >= 2.0)
        stat_frames = []

        # create data ranges csv file and figures
        for m, n in sci_vars_dict.items():
            for sv, vinfo in n['vars'].items():
                print('\n' + vinfo['var_name'])
                if len(vinfo['t']) < 1:
                    print('no variable data to plot')
                    continue

                sv_units = vinfo['units'][0]
                fv = vinfo['fv'][0]
                t = vinfo['t']
                z = vinfo['values']
                y = vinfo['pressure']

                # Check if the array is all NaNs
                if np.isnan(z).all():
                    print('Array of all NaNs - skipping plot.')
                    continue
                # Check if the array is all fill values
                if len(z[z != fv]) == 0:
                    print('Array of all fill values - skipping plot.')
                    continue
                # nothing to bin or plot without pressure data
                if len(y) == 0:
                    continue

                if m == 'common_stream_placeholder':
                    sname = '-'.join((vinfo['var_name'], r))
                else:
                    sname = '-'.join((vinfo['var_name'], r, m))

                # create data ranges for non-pressure data only
                if 'pressure' not in vinfo['var_name']:
                    columns = ['tsec', 'dbar', str(vinfo['var_name'])]
                    # create depth ranges
                    min_r = int(round(min(y) - zcell_size))
                    max_r = int(round(max(y) + zcell_size))
                    ranges = list(range(min_r, max_r, zcell_size))

                    # group data by depth
                    groups, _ = gt.group_by_depth_range(t, y, z, columns, ranges)

                    print('writing data ranges for {}'.format(vinfo['var_name']))
                    stat_data = groups.describe()[vinfo['var_name']]
                    stat_data.insert(loc=0, column='parameter', value=sv, allow_duplicates=False)
                    stat_data.insert(loc=1, column='deployments', value=t_deploy, allow_duplicates=False)
                    stat_frames.append(stat_data)

                # plot full time range free from errors and suspect data
                clabel = sv + " (" + sv_units + ")"
                ylabel = (y_name[0][0] + " (" + y_unit[0][0] + ")")

                t_eng = None
                m_water_depth = None

                # plot non-erroneous -suspect data
                fig, ax, bar = pf.plot_xsection(subsite, t, y, z, clabel, ylabel, t_eng, m_water_depth,
                                                inpercentile, stdev=None)

                title0 = 'Data colored using the upper and lower {} percentile.'.format(inpercentile)
                ax.set_title(r+'\n'+title0, fontsize=9)
                # NOTE(review): len(t)/l0 looks like the share of values kept rather than
                # removed, if l0 is the sample count before review - confirm against
                # cd.append_evaluated_science_data before changing the wording or the formula.
                leg_text = ('{} % erroneous values removed after Human In the Loop review'.format(
                    (len(t)/l0) * 100),)
                ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)

                # mark the end of every deployment on the time axis
                label_y = min(y) - 5
                for end_time_i, deploy_i in zip(end_times, deployments):
                    ax.axvline(x=end_time_i, color='b', linestyle='--', linewidth=.8)
                    ax.text(end_time_i, label_y, 'End' + str(deploy_i),
                            fontsize=6, style='italic',
                            bbox=dict(boxstyle='round',
                                      ec=(0., 0.5, 0.5),
                                      fc=(1., 1., 1.),
                                      ))

                # fig.tight_layout()
                sfile = '_'.join(('data_range', sname))
                pf.save_fig(save_fdir, sfile)

            # write stat file (rewritten after each stream with everything gathered so far)
            stat_df = pd.concat(stat_frames) if stat_frames else pd.DataFrame()
            stat_df.to_csv('{}/{}_data_ranges.csv'.format(save_dir_stat, r), index=True, float_format='%11.6f')
예제 #4
0
def main(url_list, sDir, mDir, zcell_size, zdbar, start_time, end_time, inpercentile=None):
    """Clean each instrument's science data, then write data ranges and plots.

    For every reference designator found in ``url_list`` the data files of the
    preferred stream are selected and the science data appended.  Each science
    variable is then passed through ``cf.reject_erroneous_data``, the timestamps
    listed in ``<sDir>/<array>/<subsite>/<refdes>/time_to_exclude`` are removed,
    ``cf.reject_timestamps_dataportal`` is applied and, when ``zdbar`` is given,
    only samples with ``y_portal < zdbar`` are kept.  Statistics grouped by
    depth range go to ``<mDir>/<array>/<subsite>/<refdes>_data_ranges.csv`` and
    one cross-section figure per variable is saved under
    ``<sDir>/<array>/<subsite>/<refdes>/data_range``.

    url_list : paths to instrument data by methods
    sDir : path to the directory on your machine to save plots (also holds the
           time_to_exclude files that are read back in)
    mDir : path to the directory on your machine to save data ranges
    zcell_size : depth cell size to group data
    zdbar : threshold used to drop samples (None keeps everything)
    start_time : passed to cd.append_evaluated_science_data
    end_time : passed to cd.append_evaluated_science_data
    inpercentile : percentile quoted in the log and legend text only; it was
                   referenced without being defined, so it is now an optional
                   keyword (default None)
    """
    # unique reference designators and method-stream names present in the urls
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get the preferred stream information
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get science variable long names from the Data Review Database
        stream_sci_vars = cd.sci_var_long_names(r)

        # check if the science variable long names are the same for each stream and initialize empty arrays
        sci_vars_dict0 = cd.sci_var_long_names_check(stream_sci_vars)

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # select the list of data files from the preferred dataset for each deployment
        fdatasets_final = [
            x
            for deploy, stream in zip(ps_df['deployment'], ps_df[0])
            for x in fdatasets
            if deploy in x and stream in x
        ]

        # build dictionary of science data from the preferred dataset for each deployment
        print('\nAppending data from files')
        et = []  # empty list handed over in the slot before start_time - TODO confirm its meaning
        sci_vars_dict, y_unit, y_name = cd.append_evaluated_science_data(
            sDir, ps_df, n_streams, r, fdatasets_final, sci_vars_dict0, et,
            start_time, end_time)

        # get end times of deployments
        deployments = []
        end_times = []
        for _, row in ps_df.iterrows():
            deploy_num = int(row['deployment'][-4:])
            deploy_info = cf.get_deployment_information(dr_data, deploy_num)
            deployments.append(deploy_num)
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # comma-separated deployment numbers stored next to every statistics row
        t_deploy = ', '.join(str(d) for d in deployments)

        # create a data-ranges table and figure for full data time range
        # create a folder to save data ranges
        save_dir_stat = os.path.join(mDir, array, subsite)
        cf.create_dir(save_dir_stat)

        save_fdir = os.path.join(sDir, array, subsite, r, 'data_range')
        cf.create_dir(save_fdir)

        # per-variable statistics tables; concatenated when the csv is written
        # (DataFrame.append no longer exists in pandas >= 2.0)
        stat_frames = []

        # the exclusion files only depend on the instrument, so they are read
        # once (on first use) instead of once per science variable
        Dpath = '{}/{}/{}/{}/{}'.format(sDir, array, subsite, r,
                                        'time_to_exclude')
        exclude_df = None

        for m, n in sci_vars_dict.items():
            for sv, vinfo in n['vars'].items():
                print(vinfo['var_name'])
                if len(vinfo['t']) < 1:
                    print('no variable data to plot')
                    continue

                sv_units = vinfo['units'][0]
                fv = vinfo['fv'][0]
                t = vinfo['t']
                z = vinfo['values']
                y = vinfo['pressure']

                # Check if the array is all NaNs
                if np.isnan(z).all():
                    print('Array of all NaNs - skipping plot.')
                    continue
                # Check if the array is all fill values
                if len(z[z != fv]) == 0:
                    print('Array of all fill values - skipping plot.')
                    continue

                # clean up data
                # reject erroneous data
                dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max = \
                    cf.reject_erroneous_data(r, sv, t, y, z, fv)

                # reject timestamps from stat analysis
                if exclude_df is None:
                    exclude_df = _read_time_to_exclude(Dpath)
                u_time_list = _excluded_timestamps(exclude_df, vinfo['var_name'])

                if len(u_time_list) != 0:
                    t_nospct, z_nospct, y_nospct = cf.reject_suspect_data(
                        dtime, zpressure, ndata, u_time_list)
                else:
                    # Nothing to exclude: pass the data through unchanged. Previously
                    # these names were left unbound (NameError) or still held the
                    # previous variable's arrays.
                    # NOTE(review): assumes cf.reject_suspect_data returns its three
                    # arrays in the same order it receives them - confirm.
                    t_nospct, z_nospct, y_nospct = dtime, zpressure, ndata

                print(
                    '{} using {} percentile of data grouped in {} dbar segments'
                    .format(
                        len(zpressure) - len(z_nospct), inpercentile,
                        zcell_size))

                # reject time range from data portal file export
                t_portal, z_portal, y_portal = cf.reject_timestamps_dataportal(
                    subsite, r, t_nospct, y_nospct, z_nospct)

                print('{} using visual inspection of data'.format(
                    len(z_nospct) - len(z_portal)))

                # reject data in a depth range
                if zdbar is not None:
                    y_ind = y_portal < zdbar
                    t_array = t_portal[y_ind]
                    y_array = y_portal[y_ind]
                    z_array = z_portal[y_ind]
                else:
                    t_array = t_portal
                    y_array = y_portal
                    z_array = z_portal
                # report how many samples the depth filter dropped (the mask length
                # printed before was the number of samples tested, not rejected)
                print('{} in water depth > {} dbar'.format(
                    len(y_portal) - len(y_array), zdbar))

                if len(y_array) == 0:
                    continue

                if m == 'common_stream_placeholder':
                    sname = '-'.join((vinfo['var_name'], r))
                else:
                    sname = '-'.join((vinfo['var_name'], r, m))

                # create data ranges for non-pressure data only
                if 'pressure' not in vinfo['var_name']:
                    columns = ['tsec', 'dbar', str(vinfo['var_name'])]
                    # create depth ranges
                    min_r = int(round(min(y_array) - zcell_size))
                    max_r = int(round(max(y_array) + zcell_size))
                    ranges = list(range(min_r, max_r, zcell_size))

                    # group data by depth
                    groups, _ = gt.group_by_depth_range(
                        t_array, y_array, z_array, columns, ranges)

                    print('writing data ranges for {}'.format(
                        vinfo['var_name']))
                    stat_data = groups.describe()[vinfo['var_name']]
                    stat_data.insert(loc=0,
                                     column='parameter',
                                     value=sv,
                                     allow_duplicates=False)
                    stat_data.insert(loc=1,
                                     column='deployments',
                                     value=t_deploy,
                                     allow_duplicates=False)
                    # collected only here: pressure variables have no stat_data, so
                    # collecting outside this branch re-added the previous variable's
                    # table (or raised NameError)
                    stat_frames.append(stat_data)

                # plot full time range free from errors and suspect data
                clabel = sv + " (" + sv_units + ")"
                ylabel = (y_name[0][0] + " (" + y_unit[0][0] + ")")
                title = ' '.join((r, m))

                # plot non-erroneous -suspect data
                fig, ax, bar = pf.plot_xsection(subsite,
                                                t_array,
                                                y_array,
                                                z_array,
                                                clabel,
                                                ylabel,
                                                inpercentile=None,
                                                stdev=None)

                ax.set_title(title, fontsize=9)
                leg_text = (
                    'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}]'
                    .format(
                        len(z) - lenfv,
                        len(z) - lennan,
                        len(z) - lenev, lengr, global_min,
                        global_max) + '\n' +
                    ('removed {} in the upper and lower {} percentile of data grouped in {} dbar segments'
                     .format(
                         len(zpressure) - len(z_nospct), inpercentile,
                         zcell_size)), )

                ax.legend(leg_text,
                          loc='upper center',
                          bbox_to_anchor=(0.5, -0.17),
                          fontsize=6)

                # mark the end of every deployment on the time axis
                label_y = min(y_array) - 5
                for end_time_i, deploy_i in zip(end_times, deployments):
                    ax.axvline(x=end_time_i,
                               color='b',
                               linestyle='--',
                               linewidth=.8)
                    ax.text(end_time_i,
                            label_y,
                            'End' + str(deploy_i),
                            fontsize=6,
                            style='italic',
                            bbox=dict(
                                boxstyle='round',
                                ec=(0., 0.5, 0.5),
                                fc=(1., 1., 1.),
                            ))

                fig.tight_layout()
                sfile = '_'.join(('data_range', sname))
                pf.save_fig(save_fdir, sfile)

            # write stat file (rewritten after each stream with everything gathered so far)
            if stat_frames:
                stat_df = pd.concat(stat_frames, ignore_index=True)
            else:
                stat_df = pd.DataFrame()
            stat_df.to_csv('{}/{}_data_ranges.csv'.format(save_dir_stat, r),
                           index=True,
                           float_format='%11.6f')


def _read_time_to_exclude(dpath):
    """Read every non-hidden regular file in ``dpath`` as csv into one DataFrame.

    Returns an empty DataFrame when the directory holds no such file.  A missing
    directory still raises, as ``os.listdir`` does.
    """
    frames = []
    for item in os.listdir(dpath):
        fpath = os.path.join(dpath, item)
        if not item.startswith('.') and os.path.isfile(fpath):
            frames.append(pd.read_csv(fpath))
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def _excluded_timestamps(exclude_df, var_name):
    """Return the unique timestamps listed for ``var_name`` in the exclusion table.

    Rows are matched on the 'Unnamed: 0' column and their 'time_to_exclude'
    entries are split on ', '.  Non-string entries (e.g. empty cells read as
    NaN) are ignored, and a table lacking either column yields an empty array.
    """
    if 'Unnamed: 0' not in exclude_df.columns or 'time_to_exclude' not in exclude_df.columns:
        return np.unique([])
    rows = exclude_df.loc[exclude_df['Unnamed: 0'] == var_name]
    list_time = []
    for itime in rows['time_to_exclude']:
        if isinstance(itime, str):
            list_time.extend(itime.split(', '))
    return np.unique(list_time)
예제 #5
0
def main(url_list, sDir, plot_type):
    """""
    URL : path to instrument data by methods
    sDir : path to the directory on your machine to save files
    plot_type: folder name for a plot type
    
    """ ""
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)
    ''' 
    separate different instruments
    '''
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get preferred stream
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = cf.get_deployment_information(
                dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)

        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)
        '''
        separate data files by methods
        '''
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a folder to save figures
            save_dir = os.path.join(sDir, array, subsite, r, plot_type,
                                    ms.split('-')[0])
            cf.create_dir(save_dir)

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict,
                                                       ms)

            print('\nAppending data from files: {}'.format(ms))
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print('\nAppending data file: {}'.format(fd.split('/')[-1]))
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(
                            t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(
                            t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                            else:
                                pressure = 'int_ctd_pressure'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                            y = ds[pressure].values

                        sh['pressure'] = np.append(sh['pressure'], y)

                        try:
                            ds[pressure].units
                            if ds[pressure].units not in y_unit:
                                y_unit.append(ds[pressure].units)
                        except AttributeError:
                            print('pressure attributes missing units')
                            if 'pressure unit missing' not in y_unit:
                                y_unit.append('pressure unit missing')

                        try:
                            ds[pressure].long_name
                            if ds[pressure].long_name not in y_name:
                                y_name.append(ds[pressure].long_name)
                        except AttributeError:
                            print('pressure attributes missing long_name')
                            if 'pressure long name missing' not in y_name:
                                y_name.append('pressure long name missing')

            # create a csv file with diagnostic results:

                if len(y_unit) != 1:
                    print('pressure unit varies')
                    if 'dbar' in y_unit:
                        y_unit = 'dbar'
                    print(y_unit)
                else:
                    y_unit = y_unit[0]

                if len(y_name) != 1:
                    print('pressure long name varies')
                    if 'Seawater Pressure' in y_name:
                        y_name = 'Seawater Pressure'
                    print(y_name)
                else:
                    y_name = y_name[0]

                # create a folder to save variables statistics
                mDir = '/Users/leila/Documents/NSFEduSupport/github/data-review-tools/data_review/final_stats'
                save_dir_stat = os.path.join(mDir, array, subsite)
                cf.create_dir(save_dir_stat)
                stat_df = pd.DataFrame()
                for m, n in sci_vars_dict.items():
                    for sv, vinfo in n['vars'].items():
                        print(sv)
                        if len(vinfo['t']) < 1:
                            print('no variable data to plot')
                        else:
                            sv_units = vinfo['units'][0]
                            fv = vinfo['fv'][0]
                            t0 = pd.to_datetime(min(
                                vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                            t1 = pd.to_datetime(max(
                                vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                            t = vinfo['t']
                            z = vinfo['values']
                            y = vinfo['pressure']

                            title = ' '.join((r, ms))

                        # Check if the array is all NaNs
                        if sum(np.isnan(z)) == len(z):
                            print('Array of all NaNs - skipping plot.')
                            continue

                        # Check if the array is all fill values
                        elif len(z[z != fv]) == 0:
                            print('Array of all fill values - skipping plot.')
                            continue

                        else:
                            # reject fill values
                            fv_ind = z != fv
                            y_nofv = y[fv_ind]
                            t_nofv = t[fv_ind]
                            z_nofv = z[fv_ind]
                            print(len(z) - len(fv_ind), ' fill values')

                            # reject NaNs
                            nan_ind = ~np.isnan(z_nofv)
                            t_nofv_nonan = t_nofv[nan_ind]
                            y_nofv_nonan = y_nofv[nan_ind]
                            z_nofv_nonan = z_nofv[nan_ind]
                            print(len(z) - len(nan_ind), ' NaNs')

                            # reject extreme values
                            ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                            t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                            y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                            z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                            print(
                                len(z) - len(ev_ind), ' Extreme Values',
                                '|1e7|')

                            # reject values outside global ranges:
                            global_min, global_max = cf.get_global_ranges(
                                r, sv)
                            # platform not in qc-table (parad_k_par)
                            # global_min = 0
                            # global_max = 2500
                            print('global ranges for : {}-{}  {} - {}'.format(
                                r, sv, global_min, global_max))
                            if isinstance(global_min,
                                          (int, float)) and isinstance(
                                              global_max, (int, float)):
                                gr_ind = cf.reject_global_ranges(
                                    z_nofv_nonan_noev, global_min, global_max)
                                t_nofv_nonan_noev_nogr = t_nofv_nonan_noev[
                                    gr_ind]
                                y_nofv_nonan_noev_nogr = y_nofv_nonan_noev[
                                    gr_ind]
                                z_nofv_nonan_noev_nogr = z_nofv_nonan_noev[
                                    gr_ind]
                            else:
                                t_nofv_nonan_noev_nogr = t_nofv_nonan_noev
                                y_nofv_nonan_noev_nogr = y_nofv_nonan_noev
                                z_nofv_nonan_noev_nogr = z_nofv_nonan_noev

                        if len(z_nofv_nonan_noev) > 0:
                            if m == 'common_stream_placeholder':
                                sname = '-'.join((r, sv))
                            else:
                                sname = '-'.join((r, m, sv))

                        # group by depth range
                        sname = '_'.join((sname, sv_units))

                        # if sv != 'pressure':
                        #     columns = ['tsec', 'dbar', str(sv)]
                        #
                        #     # select depth bin size for the data group function
                        #     bin_size = 10
                        #     min_r = int(round(min(y_nofv_nonan_noev) - bin_size))
                        #     max_r = int(round(max(y_nofv_nonan_noev) + bin_size))
                        #     ranges = list(range(min_r, max_r, bin_size))
                        #     groups, d_groups = gt.group_by_depth_range(t_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                        #                                                z_nofv_nonan_noev_nogr, columns, ranges)
                        #

                        # if (ms.split('-')[0]) == (ps_df[0].values[0].split('-')[0]):
                        #     if 'pressure' not in sv:
                        #         print('final_stats_{}-{}-{}-{}'.format(r,
                        #                                                ms.split('-')[0],
                        #                                                ps_df[0].values[0].split('-')[0],
                        #                                                sv))
                        #         stat_data = groups.describe()[sv]
                        #         stat_data.insert(loc=0, column='parameter', value=sv, allow_duplicates=False)
                        #         stat_df = stat_df.append(stat_data)

                        # if sv == 'optical_backscatter':
                        #     less_ind = z_nofv_nonan_noev < 0.0004
                        #     print(sv, ' < 0.0004', len(less_ind))
                        #     more_ind = z_nofv_nonan_noev > 0.01
                        #     print(sv, ' > 0.01', len(more_ind))

                        # Plot all data
                        clabel = sv + " (" + sv_units + ")"
                        ylabel = y_name + " (" + y_unit + ")"

                        fig, ax = pf.plot_xsection(subsite,
                                                   t_nofv_nonan_noev,
                                                   y_nofv_nonan_noev,
                                                   z_nofv_nonan_noev,
                                                   clabel,
                                                   ylabel,
                                                   stdev=None)

                        ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                     fontsize=9)

                        pf.save_fig(save_dir, sname)

                        # Plot data with outliers removed
                        fig, ax = pf.plot_xsection(subsite,
                                                   t_nofv_nonan_noev_nogr,
                                                   y_nofv_nonan_noev_nogr,
                                                   z_nofv_nonan_noev_nogr,
                                                   clabel,
                                                   ylabel,
                                                   stdev=5)
                        ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                     fontsize=9)
                        sfile = '_'.join((sname, 'rmoutliers'))
                        pf.save_fig(save_dir, sfile)

                        # plot data with excluded time range removed
                        dr = pd.read_csv(
                            'https://datareview.marine.rutgers.edu/notes/export'
                        )
                        drn = dr.loc[dr.type == 'exclusion']
                        if len(drn) != 0:
                            subsite_node = '-'.join((subsite, r.split('-')[1]))
                            drne = drn.loc[drn.reference_designator.isin(
                                [subsite, subsite_node, r])]

                            t_ex = t_nofv_nonan_noev_nogr
                            y_ex = y_nofv_nonan_noev_nogr
                            z_ex = z_nofv_nonan_noev_nogr
                            for i, row in drne.iterrows():
                                sdate = cf.format_dates(row.start_date)
                                edate = cf.format_dates(row.end_date)
                                ts = np.datetime64(sdate)
                                te = np.datetime64(edate)
                                ind = np.where((t_ex < ts) | (t_ex > te), True,
                                               False)
                                if len(ind) != 0:
                                    t_ex = t_ex[ind]
                                    z_ex = z_ex[ind]
                                    y_ex = y_ex[ind]

                            fig, ax = pf.plot_xsection(subsite,
                                                       t_ex,
                                                       y_ex,
                                                       z_ex,
                                                       clabel,
                                                       ylabel,
                                                       stdev=None)
                            ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                         fontsize=9)

                            sfile = '_'.join((sname, 'rmsuspectdata'))
                            pf.save_fig(save_dir, sfile)
def main(url_list, sDir, plot_type, deployment_num, start_time, end_time, method_num, zdbar, n_std, inpercentile, zcell_size):
    """Plot profiles of science variables after successive data-rejection steps.

    For every URL in ``url_list`` the netCDF datasets are listed, filtered to
    the main sensor, and for every science variable of the URL's
    method-stream the following figures are saved:

    * ``rm_erroneous_data_*``  - data left by ``cf.reject_erroneous_data``
    * ``rm_suspect_data_*``    - data left by ``cf.reject_timestamps_in_groups``
    * ``rm_v_suspect_data_*``  - data left by ``cf.reject_timestamps_dataportal``
    * ``rm_depth_range_*``     - data with pressure below ``zdbar``

    The timestamps returned by ``cf.reject_timestamps_in_groups`` are also
    appended to a ``*_excluded_timestamps.csv`` file under
    ``<sDir>/<array>/<subsite>/<refdes>/time_to_exclude``.

    :param url_list: URLs whose second-to-last path segment is a
        '-'-separated name; elements 1-4 form the reference designator and
        the remainder is the method-stream
    :param sDir: base directory under which figures and csv files are saved
    :param plot_type: directory name inserted in the save path
    :param deployment_num: if not None, only datasets whose deployment number
        equals this integer are processed
    :param start_time: if both ``start_time`` and ``end_time`` are not None the
        data are sliced to that time range; both must provide ``strftime``
    :param end_time: see ``start_time``
    :param method_num: if not None, only datasets whose method (as returned
        by ``cf.nc_attributes``) equals this value are processed
    :param zdbar: if not None, only data with pressure < ``zdbar`` are kept in
        the depth-range plot
    :param n_std: passed to ``cf.reject_timestamps_in_groups``; replaced by
        None for variables whose name contains 'scatter'
    :param inpercentile: passed to ``cf.reject_timestamps_in_groups``
    :param zcell_size: integer step (dbar) of the depth bins used for grouping
    :return: None
    """
    for i, u in enumerate(url_list):
        print('\nUrl {} of {}: {}'.format(i + 1, len(url_list), u))
        elements = u.split('/')[-2].split('-')
        r = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        # method-stream: whatever follows "<refdes>-" up to the next '/'
        ms = u.split(r + '-')[1].split('/')[0]
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        # read URL to get data
        datasets = cf.get_nc_urls([u])
        datasets_sel = cf.filter_collocated_instruments(main_sensor, datasets)

        # get sci data review list
        dr_data = cf.refdes_datareview_json(r)

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for _, row in ps_df.iterrows():
            deploy_number = int(row['deployment'][-4:])
            deploy_info = cf.get_deployment_information(dr_data, deploy_number)
            deployments.append(deploy_number)
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # create a dictionary for science variables from analysis file
        stream_sci_vars_dict = dict()
        for x in dr_data['instrument']['data_streams']:
            dr_ms = '-'.join((x['method'], x['stream_name']))
            if ms == dr_ms:
                stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                sci_vars = dict()
                for y in x['stream']['parameters']:
                    if y['data_product_type'] == 'Science Data':
                        sci_vars.update({y['name']: dict(db_units=y['unit'])})
                if len(sci_vars) > 0:
                    stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

        for ii, d in enumerate(datasets_sel):
            part_d = d.split('/')[-1]
            print('\nDataset {} of {}: {}'.format(ii + 1, len(datasets_sel), part_d))

            # The attributes only need the file name, so apply the method and
            # deployment filters before paying for opening the dataset.
            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(d)

            if method_num is not None and method != method_num:
                print(method_num, method)
                continue

            if deployment_num is not None:
                # Same parsing as for the preferred-stream table above. Compare
                # by value: identity ("is not") is not reliable for integers,
                # and splitting on '0' breaks for numbers such as 0010.
                dataset_deploy_number = int(deployment[-4:])
                if dataset_deploy_number != deployment_num:
                    print(deployment_num, dataset_deploy_number)
                    continue

            # Everything that reads from the dataset stays inside the context
            # manager so no values are pulled from an already closed file.
            with xr.open_dataset(d, mask_and_scale=False) as raw_ds:
                ds = raw_ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue
                    stime = start_time.strftime('%Y-%m-%d')
                    etime = end_time.strftime('%Y-%m-%d')
                    ext = stime + 'to' + etime
                    save_dir = os.path.join(sDir, array, subsite, refdes, plot_type, ms.split('-')[0], deployment, ext)
                else:
                    save_dir = os.path.join(sDir, array, subsite, refdes, plot_type, ms.split('-')[0], deployment)

                cf.create_dir(save_dir)

                texclude_dir = os.path.join(sDir, array, subsite, refdes, 'time_to_exclude')
                cf.create_dir(texclude_dir)

                # initialize an empty data array for science variables in dictionary
                sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)

                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    # only use the variable when the file units agree with the database units
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        sh['t'] = np.append(sh['t'], ds['time'].values)
                        sh['values'] = np.append(sh['values'], ds[var].values)

                        y, y_unit, y_name = cf.add_pressure_to_dictionary_of_sci_vars(ds)
                        sh['pressure'] = np.append(sh['pressure'], y)

            # start the excluded-timestamps file with a header row only;
            # one row per variable is appended below
            stat_data = pd.DataFrame(columns=['deployments', 'time_to_exclude'])
            file_exclude = '{}/{}_{}_{}_excluded_timestamps.csv'.format(texclude_dir,
                                                                                   deployment, refdes, method)
            stat_data.to_csv(file_exclude, index=True)

            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        # nothing was collected for this variable: skip it instead
                        # of falling through with undefined or stale arrays
                        print('no variable data to plot')
                        continue

                    sv_units = vinfo['units'][0]
                    fv = vinfo['fv'][0]
                    t = vinfo['t']
                    z = vinfo['values']
                    y = vinfo['pressure']

                    # Check if the array is all NaNs
                    if np.isnan(z).all():
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    if len(z[z != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    # reject erroneous data
                    dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max = \
                        cf.reject_erroneous_data(r, sv, t, y, z, fv)

                    if len(zpressure) == 0:
                        # min()/max() below would raise on an empty sequence
                        print('No data left after rejecting erroneous data - skipping plot.')
                        continue

                    # create data groups
                    columns = ['tsec', 'dbar', str(sv)]
                    min_r = int(round(min(zpressure) - zcell_size))
                    max_r = int(round(max(zpressure) + zcell_size))
                    ranges = list(range(min_r, max_r, zcell_size))

                    groups, d_groups = gt.group_by_depth_range(dtime, zpressure, ndata, columns, ranges)

                    # 'scatter' variables use the percentile instead of n_std.
                    # A per-variable local is used so that the n_std argument is
                    # not lost for the variables processed afterwards.
                    sv_n_std = None if 'scatter' in sv else n_std

                    # rejecting timestamps from percentile analysis
                    y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr, time_ex, \
                    t_nospct, z_nospct, y_nospct = cf.reject_timestamps_in_groups(groups, d_groups, sv_n_std,
                                                                                  dtime, zpressure, ndata,
                                                                                  inpercentile)
                    print('{} using {} percentile of data grouped in {} dbar segments'.format(
                                                len(zpressure) - len(z_nospct), inpercentile, zcell_size))

                    # writing timestamps to .csv file to use with data_range.py script
                    if len(time_ex) != 0:
                        t_exclude = time_ex[0]
                        for extra_time in time_ex[1:]:
                            t_exclude = '{}, {}'.format(t_exclude, extra_time)

                        stat_data = pd.DataFrame({'deployments': deployment,
                                                  'time_to_exclude': t_exclude}, index=[sv])
                        stat_data.to_csv(file_exclude, index=True, mode='a', header=False)

                    # reject time range from data portal file export
                    t_portal, z_portal, y_portal = cf.reject_timestamps_dataportal(subsite, r,
                                                                                   t_nospct, z_nospct, y_nospct)
                    print('{} using visual inspection of data'.format(len(z_nospct) - len(z_portal)))

                    # reject data in a depth range
                    if zdbar is not None:
                        y_ind = y_portal < zdbar
                        t_array = t_portal[y_ind]
                        y_array = y_portal[y_ind]
                        z_array = z_portal[y_ind]
                    else:
                        t_array = t_portal
                        y_array = y_portal
                        z_array = z_portal
                    # number of points dropped by the depth filter (the length of
                    # the boolean mask would be the total number of points)
                    n_depth_removed = len(z_portal) - len(z_array)
                    print('{} in water depth > {} dbar'.format(n_depth_removed, zdbar))

                    # Plot data
                    # The file name is always built so that a figure is never
                    # saved under the name left over from a previous variable.
                    if m == 'common_stream_placeholder':
                        sname = '-'.join((sv, r))
                    else:
                        sname = '-'.join((sv, r, m))

                    xlabel = sv + " (" + sv_units + ")"
                    ylabel = y_name[0] + " (" + y_unit[0] + ")"
                    clabel = 'Time'
                    title = ' '.join((deployment, r, m))

                    # plot non-erroneous data
                    fig, ax = pf.plot_profiles(ndata, zpressure, dtime,
                                               ylabel, xlabel, clabel, end_times, deployments, stdev=None)
                    ax.set_title(title, fontsize=9)
                    ax.plot(n_avg, y_avg, '-k')
                    ax.fill_betweenx(y_avg, n0_std, n1_std, color='m', alpha=0.2)
                    leg_text = (
                        'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}]'.format(
                            len(z) - lenfv, len(z) - lennan, len(z) - lenev, lengr, global_min, global_max) + '\n' +
                        ('(black) data average in {} dbar segments'.format(zcell_size)) + '\n' +
                        ('(magenta) upper and lower {} percentile envelope in {} dbar segments'.format(
                                                                                            inpercentile, zcell_size)),)
                    ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                    fig.tight_layout()
                    sfile = '_'.join(('rm_erroneous_data', sname))
                    pf.save_fig(save_dir, sfile)

                    # plot excluding time ranges for suspect data
                    if len(z_nospct) != len(zpressure):
                        fig, ax = pf.plot_profiles(z_nospct, y_nospct, t_nospct,
                                                   ylabel, xlabel, clabel, end_times, deployments, stdev=None)

                        ax.set_title(title, fontsize=9)
                        leg_text = (
                         'removed {} in the upper and lower {} percentile of data grouped in {} dbar segments'.format(
                                                             len(zpressure) - len(z_nospct), inpercentile, zcell_size),)
                        ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                        fig.tight_layout()
                        sfile = '_'.join(('rm_suspect_data', sname))
                        pf.save_fig(save_dir, sfile)

                    # plot excluding time ranges from data portal export
                    if len(z_nospct) != len(z_portal):
                        fig, ax = pf.plot_profiles(z_portal, y_portal, t_portal,
                                                   ylabel, xlabel, clabel, end_times, deployments, stdev=None)
                        ax.set_title(title, fontsize=9)
                        leg_text = ('excluded {} suspect data when inspected visually'.format(
                                                                                        len(z_nospct) - len(z_portal)),)
                        ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                        fig.tight_layout()
                        sfile = '_'.join(('rm_v_suspect_data', sname))
                        pf.save_fig(save_dir, sfile)

                    # Plot excluding a selected depth value
                    # (only when the depth filter actually removed something)
                    if len(z_array) != len(z_portal):
                        fig, ax = pf.plot_profiles(z_array, y_array, t_array,
                                                   ylabel, xlabel, clabel, end_times, deployments, stdev=None)

                        ax.set_title(title, fontsize=9)
                        leg_text = ('excluded {} suspect data in water depth greater than {} dbar'.format(
                            n_depth_removed, zdbar),)
                        ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                        fig.tight_layout()
                        sfile = '_'.join(('rm_depth_range', sname))
                        pf.save_fig(save_dir, sfile)