Exemplo n.º 1
0
def _url_refdes(url):
    """Return the reference designator embedded in a dataset URL.

    The second-to-last path component of ``url`` is split on ``-`` and
    fields 1 through 4 are re-joined with ``-``. Raises ``IndexError`` when
    the component has fewer than five fields.
    """
    parts = url.split('/')[-2].split('-')
    return '-'.join((parts[1], parts[2], parts[3], parts[4]))


def _preferred_datasets(r, datasets):
    """Return the entries of ``datasets`` that match the preferred streams.

    A dataset is kept when fields 1-6 of its second-to-last path component
    equal ``<r>-<preferred method/stream>`` and the text before the first
    ``_`` of its file name equals the row's ``'deployment'`` value.
    """
    ps_df, n_streams = cf.get_preferred_stream_info(r)
    selected = []
    for _, row in ps_df.iterrows():
        for ii in range(n_streams):
            try:
                rms = '-'.join((r, row[ii]))
            except TypeError:
                # non-string cell (presumably a missing stream) - nothing to match
                continue
            for dd in datasets:
                spl = dd.split('/')[-2].split('-')
                catalog_rms = '-'.join(
                    (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                fdeploy = dd.split('/')[-1].split('_')[0]
                if rms == catalog_rms and fdeploy == row['deployment']:
                    selected.append(dd)
    return selected


def _month_windows(dt_start, dt_end):
    """Split the span ``dt_start``..``dt_end`` into calendar-month windows.

    Returns a list of ``(start, end)`` datetimes. Each start is at 00:00:00
    and each end at 23:59:59; the first window starts on the day of
    ``dt_start`` and the last one ends on the day of ``dt_end``. No window
    starting after ``dt_end`` is produced (the previous day-stepping loop
    emitted such an always-empty window when the data ended on the last
    day of a month).
    """
    windows = []
    day = dt_start.date()
    final_day = dt_end.date()
    while day <= final_day:
        # day 1 of a month + 32 days always lands in the following month
        next_month = (day.replace(day=1) + dt.timedelta(days=32)).replace(day=1)
        last = min(next_month - dt.timedelta(days=1), final_day)
        windows.append((dt.datetime.combine(day, dt.time(0, 0, 0)),
                        dt.datetime.combine(last, dt.time(23, 59, 59))))
        day = next_month
    return windows


def _span_labels(tm):
    """Return the min and max of ``tm`` formatted as ``YYYY-mm-ddTHH:MM:SS``."""
    fmt = '%Y-%m-%dT%H:%M:%S'
    return (pd.to_datetime(tm.min()).strftime(fmt),
            pd.to_datetime(tm.max()).strftime(fmt))


def _mask_fill_values(v, fill_value):
    """Replace fill values in ``v`` with NaN, in place.

    Returns ``(n_nan, n_fv)``: the NaN count before masking and the number
    of additional NaNs introduced by the masking.
    """
    n_nan = np.sum(np.isnan(v))
    v[v == fill_value] = np.nan
    n_fv = np.sum(np.isnan(v)) - n_nan
    return n_nan, n_fv


def _mask_global_ranges(v, g_min, g_max, n_masked):
    """Replace values outside ``[g_min, g_max]`` with NaN, in place.

    ``n_masked`` is the number of NaNs already present in ``v``. Returns
    the number of newly rejected values, or the string
    ``'no global ranges'`` when either bound is ``None``.
    """
    if g_min is not None and g_max is not None:
        v[v < g_min] = np.nan
        v[v > g_max] = np.nan
        return np.sum(np.isnan(v)) - n_masked
    return 'no global ranges'


def _plot_file(sDir, r, fd, ds, start_time, end_time):
    """Create the full-range and monthly plots for one opened dataset.

    ``ds`` is the dataset opened from ``fd``; the caller owns and closes it.
    """
    ds = ds.swap_dims({'obs': 'time'})

    if start_time is not None and end_time is not None:
        ds = ds.sel(time=slice(start_time, end_time))
        if len(ds['time'].values) == 0:
            print('No data to plot for specified time range: ({} to {})'.
                  format(start_time, end_time))
            return

    fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
    sci_vars = cf.return_science_vars(stream)
    print('\nPlotting {} {}'.format(r, deployment))
    array = subsite[0:2]
    filename = '_'.join(fname.split('_')[:-1])
    save_dir = os.path.join(sDir, array, subsite, refdes, 'timeseries_plots')
    cf.create_dir(save_dir)

    tm = ds['time'].values
    t0, t1 = _span_labels(tm)
    title = ' '.join((deployment, refdes, method))

    # -------- plot entire deployment --------
    for var in sci_vars:
        print(var)
        vv = ds[var]
        fv = vv._FillValue
        # Values are used unrounded (an earlier variant rounded to one decimal
        # so near-zero values would survive the global range test).
        # NOTE(review): v may be a view of data cached by xarray, in which case
        # the in-place masking below would carry over into the monthly
        # selections further down - confirm.
        v = vv.values.T  # .T = transpose 2D array
        n_nan, n_fv = _mask_fill_values(v, fv)

        # plot before global ranges are removed
        fig, ax = pf.plot_spkir(tm, v, vv.name, vv.units)
        ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
        pf.save_fig(save_dir, '-'.join((filename, var, t0[:10])))

        # reject data outside of global ranges
        [g_min, g_max] = cf.get_global_ranges(r, var)
        n_grange = _mask_global_ranges(v, g_min, g_max, n_fv + n_nan)

        # plot after global ranges are removed
        fig, ax = pf.plot_spkir(tm, v, vv.name, vv.units)
        title2 = 'removed: {} global ranges [{}, {}]'.format(
            n_grange, g_min, g_max)
        ax.set_title((title + '\n' + t0 + ' - ' + t1 + '\n' + title2),
                     fontsize=9)
        pf.save_fig(save_dir, '-'.join((filename, var, t0[:10], 'rmgr')))

    # -------- break the deployment into months and plot --------
    save_dir = os.path.join(sDir, array, subsite, refdes,
                            'timeseries_plots', 'monthly')
    cf.create_dir(save_dir)

    dt_start = dt.datetime.strptime(t0, '%Y-%m-%dT%H:%M:%S')
    dt_end = dt.datetime.strptime(t1, '%Y-%m-%dT%H:%M:%S')
    for win_start, win_end in _month_windows(dt_start, dt_end):
        ds_month = ds.sel(time=slice(win_start, win_end))
        if len(ds_month['time'].values) == 0:
            print('No data to plot for specified time range: ({} to {})'.
                  format(win_start.strftime('%m-%d-%YT%H:%M:%S'),
                         win_end.strftime('%m-%d-%YT%H:%M:%S')))
            continue
        tm = ds_month['time'].values
        t0, t1 = _span_labels(tm)

        for var in sci_vars:
            print(var)
            vv = ds_month[var]
            fv = vv._FillValue
            v = vv.values.T  # transpose 2D array
            n_nan, n_fv = _mask_fill_values(v, fv)

            # reject data outside of global ranges
            [g_min, g_max] = cf.get_global_ranges(r, var)
            n_grange = _mask_global_ranges(v, g_min, g_max, n_fv + n_nan)

            # plot after global ranges are removed
            fig, ax = pf.plot_spkir(tm, v, vv.name, vv.units)
            title2 = 'removed: {} global ranges [{}, {}]'.format(
                n_grange, g_min, g_max)
            ax.set_title((title + '\n' + t0 + ' - ' + t1 + '\n' + title2),
                         fontsize=9)
            pf.save_fig(save_dir, '-'.join((filename, var, t0[:7], 'rmgr')))


def main(sDir, url_list, start_time, end_time, preferred_only):
    """Plot the science variables of every dataset reachable from ``url_list``.

    For each reference designator found in ``url_list`` the NetCDF files are
    listed via ``cf.get_nc_urls`` and, per file and science variable, plots
    are saved for the whole time range (before and after global-range
    rejection) and for each calendar month (after rejection).

    Parameters
    ----------
    sDir : str
        Root output directory; plots go to
        ``<sDir>/<array>/<subsite>/<refdes>/timeseries_plots`` and its
        ``monthly`` sub-directory.
    url_list : list of str
        Dataset URLs whose second-to-last path component is ``-`` delimited
        with the reference designator in fields 1-4.
    start_time, end_time
        When both are not ``None`` each dataset is sliced to this time range
        first; files with no data in the range are skipped with a message.
    preferred_only : str
        ``'yes'`` restricts the files to those matching the preferred stream
        information; any other value uses every file.

    Returns
    -------
    None
    """
    rd_list = []
    for uu in url_list:
        rd = _url_refdes(uu)
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = list(itertools.chain.from_iterable(
            cf.get_nc_urls([u]) for u in url_list if _url_refdes(u) == r))

        if preferred_only == 'yes':
            fdatasets = _preferred_datasets(r, datasets)
        else:
            fdatasets = datasets

        # de-duplicate (np.unique also sorts the file list)
        fdatasets = np.unique(fdatasets).tolist()
        for fd in fdatasets:
            # the context manager releases the file handle once plotting is done
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                _plot_file(sDir, r, fd, ds, start_time, end_time)
def _time_bounds(times):
    """Return the min and max of ``times`` formatted as ``YYYY-mm-ddTHH:MM:SS``."""
    fmt = '%Y-%m-%dT%H:%M:%S'
    return (pd.to_datetime(min(times)).strftime(fmt),
            pd.to_datetime(max(times)).strftime(fmt))


def _deployment_end_times(r, ps_df):
    """Return ``(deployment numbers, stop dates)`` for the rows of ``ps_df``.

    The deployment number is the integer value of the last four characters
    of each row's ``'deployment'`` entry; the stop date is the
    ``'stop_date'`` of the matching deployment information, converted with
    ``pd.to_datetime``.
    """
    dr_data = cf.refdes_datareview_json(r)
    deployments = []
    end_times = []
    for _, row in ps_df.iterrows():
        deploy_num = int(row['deployment'][-4:])
        deploy_info = cf.get_deployment_information(dr_data, deploy_num)
        deployments.append(deploy_num)
        end_times.append(pd.to_datetime(deploy_info['stop_date']))
    return deployments, end_times


def _title_mark_save(ax, title, end_times, save_dir, sname):
    """Set the plot title, draw a dashed line at each end time, save the figure."""
    ax.set_title(title, fontsize=8)
    for etimes in end_times:
        ax.axvline(x=etimes, color='k', linestyle='--', linewidth=.6)
    pf.save_fig(save_dir, sname)


def main(sDir, plotting_sDir, url_list, sd_calc):
    """Write a data-range summary CSV and review plots per reference designator.

    For every reference designator found in ``url_list`` the preferred
    datasets are combined (time ranges flagged as exclusions in the data
    review notes export are passed on to ``cd.append_science_data``),
    per-variable statistics are collected, plots of the data used for the
    statistics are saved under ``plotting_sDir`` and the summary is written
    to ``<sDir>/<array>/<subsite>/<refdes>_data_ranges.csv``.

    Parameters
    ----------
    sDir : str
        Root directory for the summary CSV files.
    plotting_sDir : str
        Root directory for the plots.
    url_list : list of str
        Dataset URLs whose second-to-last path component is ``-`` delimited
        with the reference designator in fields 1-4. An empty list returns
        immediately, before the notes export is downloaded.
    sd_calc
        Passed to ``cf.variable_statistics`` and, when truthy, used as the
        ``stdev`` argument of an additional "rmoutliers" plot.

    Returns
    -------
    None
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)
    if not rd_list:
        # nothing to analyze - skip the remote download below
        return

    dr = pd.read_csv('https://datareview.marine.rutgers.edu/notes/export')
    drn = dr.loc[dr.type == 'exclusion']

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.append(cf.get_nc_urls([u]))
        datasets = list(itertools.chain(*datasets))

        # get the preferred stream information (fetched once and reused below)
        ps_df, n_streams = cf.get_preferred_stream_info(r)
        fdatasets = []
        pms = []
        for _, row in ps_df.iterrows():
            for ii in range(n_streams):
                try:
                    rms = '-'.join((r, row[ii]))
                    pms.append(row[ii])
                except TypeError:
                    # non-string cell (presumably a missing stream)
                    continue
                for dd in datasets:
                    spl = dd.split('/')[-2].split('-')
                    catalog_rms = '-'.join(
                        (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                    fdeploy = dd.split('/')[-1].split('_')[0]
                    if rms == catalog_rms and fdeploy == row['deployment']:
                        fdatasets.append(dd)

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(
            main_sensor, fdatasets)

        # find time ranges to exclude from analysis for data review database
        subsite = r.split('-')[0]
        subsite_node = '-'.join((subsite, r.split('-')[1]))
        drne = drn.loc[drn.reference_designator.isin(
            [subsite, subsite_node, r])]
        et = []
        for _, note_row in drne.iterrows():
            sdate = cf.format_dates(note_row.start_date)
            edate = cf.format_dates(note_row.end_date)
            et.append([sdate, edate])

        # get science variable long names from the Data Review Database
        stream_sci_vars = cd.sci_var_long_names(r)

        # check if the science variable long names are the same for each stream
        sci_vars_dict = cd.sci_var_long_names_check(stream_sci_vars)

        # build dictionary of science data from the preferred dataset for each deployment
        print('\nAppending data from files')
        sci_vars_dict, _pressure_unit, _pressure_name = cd.append_science_data(
            ps_df, n_streams, r, fdatasets_sel, sci_vars_dict, et)

        # analyze combined dataset
        print('\nAnalyzing combined dataset and writing summary file')

        array = subsite[0:2]
        save_dir = os.path.join(sDir, array, subsite)
        cf.create_dir(save_dir)

        # Flanking Mooring CTDMO stats are calculated based on pressure
        flm_ctdmo = ('FLM' in r) and ('CTDMO' in r)
        rows = []
        if flm_ctdmo:
            headers = [
                'common_stream_name', 'preferred_methods_streams',
                'deployments', 'long_name', 'units', 't0', 't1', 'fill_value',
                'global_ranges', 'n_all', 'press_min_max',
                'n_excluded_forpress', 'n_nans', 'n_fillvalues', 'n_grange',
                'define_stdev', 'n_outliers', 'n_stats', 'mean', 'min', 'max',
                'stdev', 'note'
            ]
        else:
            headers = [
                'common_stream_name', 'preferred_methods_streams',
                'deployments', 'long_name', 'units', 't0', 't1', 'fill_value',
                'global_ranges', 'n_all', 'n_nans', 'n_fillvalues', 'n_grange',
                'define_stdev', 'n_outliers', 'n_stats', 'mean', 'min', 'max',
                'stdev'
            ]

        unique_pms = list(np.unique(pms))
        # (deployment numbers, stop dates) used to annotate plots; looked up
        # lazily so it is fetched at most once per reference designator
        plot_meta = None

        for m, n in sci_vars_dict.items():
            print('\nSTREAM: ', m)
            if m == 'common_stream_placeholder':
                m = 'science_data_stream'
            if m == 'metbk_hourly':  # don't calculate ranges for metbk_hourly
                continue

            if flm_ctdmo:
                # index the pressure variable to filter and calculate stats on the rest of the variables
                sv_press = 'Seawater Pressure'
                vinfo_press = n['vars'][sv_press]

                # first, index where data are nans, fill values, and outside of global ranges
                fv_press = list(np.unique(vinfo_press['fv']))[0]
                pdata = vinfo_press['values']

                [pind, __, __, __, __,
                 __] = index_dataset(r, vinfo_press['var_name'], pdata,
                                     fv_press)

                pdata_filtered = pdata[pind]
                [__, pmean, __, __, psd,
                 __] = cf.variable_statistics(pdata_filtered, None)

                # index of pressure = average of all 'valid' pressure data +/- 1 SD
                ipress_min = pmean - psd
                ipress_max = pmean + psd
                ind_press = (pdata >= ipress_min) & (pdata <= ipress_max)

                # calculate stats for all variables
                print('\nPARAMETERS:')
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    fv_lst = np.unique(vinfo['fv']).tolist()
                    if len(fv_lst) != 1:
                        print('No unique fill value for {}'.format(sv))
                    # With several fill values the first one is used for
                    # filtering (previously a stale or unbound value was
                    # used); every value is still reported in the summary.
                    fill_value = fv_lst[0] if fv_lst else None

                    lunits = np.unique(vinfo['units']).tolist()
                    n_all = len(vinfo['t'])

                    # filter data based on pressure index
                    t_filtered = vinfo['t'][ind_press]
                    data_filtered = vinfo['values'][ind_press]
                    deploy_filtered = vinfo['deployments'][ind_press]

                    n_excluded = n_all - len(t_filtered)

                    [dataind, g_min, g_max, n_nan, n_fv,
                     n_grange] = index_dataset(r, vinfo['var_name'],
                                               data_filtered, fill_value)

                    t_final = t_filtered[dataind]
                    data_final = data_filtered[dataind]
                    deploy_final = deploy_filtered[dataind]

                    # an empty selection has no time range (min() would raise)
                    if len(t_final) > 0:
                        t0, t1 = _time_bounds(t_final)
                    else:
                        t0 = t1 = None
                    deployments = [int(dd) for dd in np.unique(deploy_final)]

                    # defaults reported when there are too few points for stats
                    num_outliers = mean = vmin = vmax = sd = n_stats = None
                    if len(data_final) > 1:
                        [num_outliers, mean, vmin, vmax, sd, n_stats
                         ] = cf.variable_statistics(data_final, sd_calc)

                    note = 'restricted stats calculation to data points where pressure is within defined ranges' \
                           ' (average of all pressure data +/- 1 SD)'
                    rows.append([
                        m, unique_pms, deployments, sv, lunits, t0, t1,
                        fv_lst, [g_min, g_max], n_all,
                        [round(ipress_min, 2), round(ipress_max, 2)],
                        n_excluded, n_nan, n_fv, n_grange, sd_calc,
                        num_outliers, n_stats, mean, vmin, vmax, sd, note
                    ])

                    if len(t_final) == 0:
                        continue  # nothing to plot

                    # plot CTDMO data used for stats
                    psave_dir = os.path.join(plotting_sDir, array, subsite, r,
                                             'timeseries_plots_stats')
                    cf.create_dir(psave_dir)

                    if plot_meta is None:
                        plot_meta = _deployment_end_times(r, ps_df)
                    all_deployments, end_times = plot_meta
                    plot_title = (r + '\nDeployments: ' +
                                  str(sorted(all_deployments)) + '\n' + t0 +
                                  ' - ' + t1)

                    sname = '-'.join((r, sv))
                    fig, ax = pf.plot_timeseries_all(t_final,
                                                     data_final,
                                                     sv,
                                                     lunits[0],
                                                     stdev=None)
                    _title_mark_save(ax, plot_title, end_times, psave_dir,
                                     sname)

                    if sd_calc:
                        sname = '-'.join((r, sv, 'rmoutliers'))
                        fig, ax = pf.plot_timeseries_all(t_final,
                                                         data_final,
                                                         sv,
                                                         lunits[0],
                                                         stdev=sd_calc)
                        _title_mark_save(ax, plot_title, end_times, psave_dir,
                                         sname)

            else:
                print('\nPARAMETERS: ')
                for sv, vinfo in n['vars'].items():
                    print(sv)

                    fv_lst = np.unique(vinfo['fv']).tolist()
                    if len(fv_lst) != 1:
                        print(fv_lst)
                        print('No unique fill value for {}'.format(sv))
                    # With several fill values the first one is used for
                    # filtering (previously a stale or unbound value was
                    # used); every value is still reported in the summary.
                    fill_value = fv_lst[0] if fv_lst else None

                    lunits = np.unique(vinfo['units']).tolist()
                    t = vinfo['t']

                    # Per-variable defaults so that a variable with too little
                    # data is reported with empty fields instead of raising a
                    # NameError or inheriting the previous variable's values.
                    n_all = len(t)
                    g_min = g_max = None
                    n_nan = n_fv = n_grange = None
                    num_outliers = mean = vmin = vmax = sd = n_stats = None
                    deployments = None
                    t0 = t1 = None
                    t_final = []
                    data_final = None
                    dd_data = None
                    is_2d = 'SPKIR' in r or 'presf_abc_wave_burst' in m

                    if len(t) > 1:
                        data = vinfo['values']

                        if is_2d:
                            if 'SPKIR' in r:
                                var_2d = 'spkir_abj_cspp_downwelling_vector'
                            else:
                                var_2d = 'presf_wave_burst_pressure'
                            [dd_data, g_min, g_max, n_nan, n_fv,
                             n_grange] = index_dataset_2d(
                                 r, var_2d, data, fill_value)
                            t_final = t
                            t0, t1 = _time_bounds(t_final)
                            deployments = [
                                int(dd)
                                for dd in np.unique(vinfo['deployments'])
                            ]

                            # one entry per row of the 2D variable
                            num_outliers = []
                            mean = []
                            vmin = []
                            vmax = []
                            sd = []
                            n_stats = []
                            for i in range(len(dd_data)):
                                # NOTE(review): stats are taken from data[i],
                                # not dd_data[i] - confirm index_dataset_2d
                                # filters `data` in place.
                                dd = data[i]
                                # drop nans before calculating stats
                                dd = dd[~np.isnan(dd)]
                                [
                                    num_outliersi, meani, vmini, vmaxi, sdi,
                                    n_statsi
                                ] = cf.variable_statistics(dd, sd_calc)
                                num_outliers.append(num_outliersi)
                                mean.append(meani)
                                vmin.append(vmini)
                                vmax.append(vmaxi)
                                sd.append(sdi)
                                n_stats.append(n_statsi)

                        else:
                            [dataind, g_min, g_max, n_nan, n_fv,
                             n_grange] = index_dataset(r, vinfo['var_name'],
                                                       data, fill_value)
                            t_final = t[dataind]
                            if len(t_final) > 0:
                                t0, t1 = _time_bounds(t_final)
                                data_final = data[dataind]
                                deployments = [
                                    int(dd) for dd in np.unique(
                                        vinfo['deployments'][dataind])
                                ]
                                if len(data_final) > 1:
                                    [
                                        num_outliers, mean, vmin, vmax, sd,
                                        n_stats
                                    ] = cf.variable_statistics(
                                        data_final, sd_calc)

                    # a falsy sd_calc is reported as None
                    print_sd = sd_calc if sd_calc else None

                    rows.append([
                        m, unique_pms, deployments, sv, lunits, t0, t1,
                        fv_lst, [g_min, g_max], n_all, n_nan, n_fv, n_grange,
                        print_sd, num_outliers, n_stats, mean, vmin, vmax, sd
                    ])

                    if len(t_final) == 0:
                        continue  # nothing to plot

                    # plot data used for stats
                    psave_dir = os.path.join(
                        plotting_sDir, array, subsite, r,
                        'timeseries_reviewed_datarange')
                    cf.create_dir(psave_dir)

                    if plot_meta is None:
                        plot_meta = _deployment_end_times(r, ps_df)
                    all_deployments, end_times = plot_meta
                    plot_title = (r + '\nDeployments: ' +
                                  str(sorted(all_deployments)) + '\n' + t0 +
                                  ' - ' + t1)

                    sname = '-'.join((r, sv))

                    # the method/stream of the first stream decides whether
                    # hourly averages are plotted
                    streamed = 'streamed' in next(
                        iter(sci_vars_dict.values()))['ms'][0]

                    # 2D variables have no 1D series to average, so they
                    # always use their dedicated plots below
                    if streamed and not is_2d:
                        # plot hourly averages for streaming data
                        sname = '-'.join((sname, 'hourlyavg'))
                        df = pd.DataFrame({
                            'dfx': t_final,
                            'dfy': data_final
                        })
                        dfr = df.resample('H', on='dfx').mean()

                        # Plot all data
                        fig, ax = pf.plot_timeseries_all(dfr.index,
                                                         dfr['dfy'],
                                                         sv,
                                                         lunits[0],
                                                         stdev=None)
                        _title_mark_save(ax, plot_title, end_times, psave_dir,
                                         sname)

                        if sd_calc:
                            # suffix is appended to the 'hourlyavg' name above
                            sname = '-'.join((sname, 'hourlyavg_rmoutliers'))
                            fig, ax = pf.plot_timeseries_all(dfr.index,
                                                             dfr['dfy'],
                                                             sv,
                                                             lunits[0],
                                                             stdev=sd_calc)
                            _title_mark_save(ax, plot_title, end_times,
                                             psave_dir, sname)

                    elif 'SPKIR' in r:
                        fig, ax = pf.plot_spkir(t_final, dd_data, sv,
                                                lunits[0])
                        _title_mark_save(ax, plot_title, end_times, psave_dir,
                                         sname)

                        # plot each wavelength
                        wavelengths = [
                            '412nm', '443nm', '490nm', '510nm', '555nm',
                            '620nm', '683nm'
                        ]
                        for wvi in range(len(dd_data)):
                            fig, ax = pf.plot_spkir_wv(
                                t_final, dd_data[wvi], sv, lunits[0], wvi)
                            snamewvi = '-'.join((sname, wavelengths[wvi]))
                            _title_mark_save(ax, plot_title, end_times,
                                             psave_dir, snamewvi)

                    elif 'presf_abc_wave_burst' in m:
                        fig, ax = pf.plot_presf_2d(t_final, dd_data, sv,
                                                   lunits[0])
                        snamewave = '-'.join((sname, m))
                        _title_mark_save(ax, plot_title, end_times, psave_dir,
                                         snamewave)

                    else:  # plot all data if not streamed
                        fig, ax = pf.plot_timeseries_all(t_final,
                                                         data_final,
                                                         sv,
                                                         lunits[0],
                                                         stdev=None)
                        _title_mark_save(ax, plot_title, end_times, psave_dir,
                                         sname)

                        if sd_calc:
                            sname = '-'.join((r, sv, 'rmoutliers'))
                            fig, ax = pf.plot_timeseries_all(t_final,
                                                             data_final,
                                                             sv,
                                                             lunits[0],
                                                             stdev=sd_calc)
                            _title_mark_save(ax, plot_title, end_times,
                                             psave_dir, sname)

        fsum = pd.DataFrame(rows, columns=headers)
        fsum.to_csv('{}/{}_data_ranges.csv'.format(save_dir, r), index=False)
Exemplo n.º 3
0
def _refdes_from_url(url):
    """Return the reference designator encoded in a dataset URL.

    The second-to-last path component is split on '-' and fields 1-4 are
    re-joined with '-'. An IndexError propagates if that component has fewer
    than five '-'-separated fields.
    """
    parts = url.split('/')[-2].split('-')
    return '-'.join((parts[1], parts[2], parts[3], parts[4]))


def _preferred_datasets(refdes, ps_df, n_streams, datasets):
    """Return the entries of *datasets* that match a preferred stream/deployment row of *ps_df*.

    For every row, columns ``0 .. n_streams - 1`` are looked up by label and
    combined with *refdes*; a dataset is kept when fields 1-6 of its directory
    name equal that string and the file-name prefix (text before the first
    '_') equals ``row['deployment']``.
    """
    selected = []
    for _, row in ps_df.iterrows():
        for ii in range(n_streams):
            try:
                rms = '-'.join((refdes, row[ii]))
            except TypeError:
                # str.join rejects non-string entries; presumably an empty
                # (NaN/None) slot for this deployment -- nothing to match
                continue
            for dd in datasets:
                spl = dd.split('/')[-2].split('-')
                catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                fdeploy = dd.split('/')[-1].split('_')[0]
                if rms == catalog_rms and fdeploy == row['deployment']:
                    selected.append(dd)
    return selected


def _time_span(times):
    """Return the (earliest, latest) entries of *times* as '%Y-%m-%dT%H:%M:%S' strings."""
    fmt = '%Y-%m-%dT%H:%M:%S'
    return pd.to_datetime(min(times)).strftime(fmt), pd.to_datetime(max(times)).strftime(fmt)


def _draw_vlines(ax, times, color):
    """Draw a thin dashed vertical line on *ax* at every value in *times*."""
    for vtime in times:
        ax.axvline(x=vtime, color=color, linestyle='--', linewidth=.6)


def _end_times_for(deploy_nums, end_time_by_deployment):
    """Return the end times of the deployments in *deploy_nums* that have a known end time."""
    return [end_time_by_deployment[dn] for dn in deploy_nums if dn in end_time_by_deployment]


def _prep_2d_plot(refdes, gr_var_name, sv, vinfo):
    """Prepare the inputs shared by the 2D (SPKIR / PRESF wave burst) plots.

    Returns ``(t, dd_data, units, title)``, or ``None`` when the variable
    cannot be plotted: either it has no single fill value (the candidates and
    a message are printed) or it holds fewer than two time stamps.
    """
    fv_lst = np.unique(vinfo['fv']).tolist()
    sv_units = np.unique(vinfo['units']).tolist()
    if len(fv_lst) != 1:
        # without one unambiguous fill value the data cannot be cleaned, so
        # skip the plot instead of failing on (or reusing) an unset value
        print(fv_lst)
        print('No unique fill value for {}'.format(sv))
        return None

    t = vinfo['t']
    if len(t) <= 1:
        return None

    dd_data, g_min, g_max = index_dataset_2d(refdes, gr_var_name, vinfo['values'], fv_lst[0])
    t0, t1 = _time_span(t)
    plot_deployments = sorted(int(dd) for dd in np.unique(vinfo['deployments']))
    title = (refdes + '\nDeployments: ' + str(plot_deployments) + '\n' + t0 + ' - ' + t1 + '\n'
             + 'removed global ranges +/- [{} - {}]'.format(g_min, g_max))
    return t, dd_data, sv_units[0], title


def main(sDir, url_list, start_time, end_time):
    """Plot timeseries of the preferred-stream science variables for each reference designator.

    For every reference designator found in *url_list* the function collects
    the matching .nc datasets, keeps those listed as preferred for each
    deployment, gathers the science data through ``cd.append_science_data``
    and saves the figures under
    ``<sDir>/<array>/<subsite>/<refdes>/timeseries_plots_preferred_all``.

    :param sDir: root directory under which the plot directories are created
    :param url_list: dataset URLs; the second-to-last path component of each
        must contain the reference designator as '-'-separated fields 1-4
    :param start_time: passed through to ``cd.append_science_data``; when it
        is a ``datetime`` instance, deployment end lines are restricted to the
        deployments actually plotted
    :param end_time: passed through to ``cd.append_science_data``
    """
    # isinstance also accepts datetime subclasses such as pandas.Timestamp
    time_range_requested = isinstance(start_time, dt.datetime)

    # unique reference designators, in first-seen order
    rd_list = []
    for uu in url_list:
        rd = _refdes_from_url(uu)
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))

        # every .nc url that belongs to this reference designator
        datasets = []
        for u in url_list:
            if _refdes_from_url(u) == r:
                datasets.extend(cf.get_nc_urls([u]))

        # get the preferred stream information (reused for the rest of this iteration)
        ps_df, n_streams = cf.get_preferred_stream_info(r)
        fdatasets = _preferred_datasets(r, ps_df, n_streams, datasets)

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        # get science variable long names; SPKIR and PRESF use the cd lookup,
        # everything else the var_long_names function of this file
        if 'SPKIR' in r or 'PRESF' in r:
            stream_vars = cd.sci_var_long_names(r)
        else:
            stream_vars = var_long_names(r)

        # check if the science variable long names are the same for each stream and initialize empty arrays
        sci_vars_dict = cd.sci_var_long_names_check(stream_vars)

        # build dictionary of science data from the preferred dataset for each deployment
        print('\nAppending data from files')
        et = []
        sci_vars_dict, __, __ = cd.append_science_data(ps_df, n_streams, r, fdatasets_sel, sci_vars_dict, et,
                                                       start_time, end_time)

        # get end times of deployments
        dr_data = cf.refdes_datareview_json(r)
        deployments = []
        dend_times = []
        for _, row in ps_df.iterrows():
            deploy_num = int(row['deployment'][-4:])
            deploy_info = get_deployment_information(dr_data, deploy_num)
            deployments.append(deploy_num)
            dend_times.append(pd.to_datetime(deploy_info['stop_date']))
        # keyed lookup: the position in dend_times follows the ps_df row order,
        # which only equals "deployment number - 1" for contiguous deployments
        dend_by_deployment = dict(zip(deployments, dend_times))

        subsite = r.split('-')[0]
        array = subsite[0:2]
        save_dir = os.path.join(sDir, array, subsite, r, 'timeseries_plots_preferred_all')
        cf.create_dir(save_dir)

        print('\nPlotting data')
        for m, n in sci_vars_dict.items():
            for sv, vinfo in n['vars'].items():
                print(sv)
                if 'SPKIR' in r:
                    prepped = _prep_2d_plot(r, 'spkir_abj_cspp_downwelling_vector', sv, vinfo)
                    if prepped is not None:
                        t, dd_data, units, title = prepped
                        sname = '-'.join((r, sv))
                        fig, ax = pf.plot_spkir(t, dd_data, sv, units)
                        ax.set_title(title, fontsize=8)
                        _draw_vlines(ax, dend_times, 'k')
                        pf.save_fig(save_dir, sname)

                        # plot each wavelength
                        wavelengths = ['412nm', '443nm', '490nm', '510nm', '555nm', '620nm', '683nm']
                        for wvi in range(len(dd_data)):
                            fig, ax = pf.plot_spkir_wv(t, dd_data[wvi], sv, units, wvi)
                            ax.set_title(title, fontsize=8)
                            _draw_vlines(ax, dend_times, 'k')
                            snamewvi = '-'.join((sname, wavelengths[wvi]))
                            pf.save_fig(save_dir, snamewvi)

                elif 'presf_abc_wave_burst' in m:
                    prepped = _prep_2d_plot(r, 'presf_wave_burst_pressure', sv, vinfo)
                    if prepped is not None:
                        t, dd_data, units, title = prepped
                        sname = '-'.join((r, sv))
                        fig, ax = pf.plot_presf_2d(t, dd_data, sv, units)
                        ax.set_title(title, fontsize=8)
                        _draw_vlines(ax, dend_times, 'k')
                        pf.save_fig(save_dir, sname)

                else:
                    if not isinstance(vinfo['values'], dict):  # if the variable is not a 2D array
                        if 'Spectra' not in sv:
                            if len(vinfo['t']) < 1:
                                print('no variable data to plot')
                            else:
                                sv_units = vinfo['units'][0]
                                sv_name = vinfo['var_name']
                                t0, t1 = _time_span(vinfo['t'])
                                x = vinfo['t']
                                y = vinfo['values']

                                # reject NaNs and values of 0.0
                                nan_ind = (~np.isnan(y)) & (y != 0.0)
                                x_nonan = x[nan_ind]
                                y_nonan = y[nan_ind]

                                # reject fill values
                                fv_ind = y_nonan != vinfo['fv'][0]
                                x_nonan_nofv = x_nonan[fv_ind]
                                y_nonan_nofv = y_nonan[fv_ind]

                                # reject extreme values
                                Ev_ind = cf.reject_extreme_values(y_nonan_nofv)
                                y_nonan_nofv_nE = y_nonan_nofv[Ev_ind]
                                x_nonan_nofv_nE = x_nonan_nofv[Ev_ind]

                                # reject values outside global ranges (when both limits are available)
                                global_min, global_max = cf.get_global_ranges(r, sv_name)
                                if global_min is None or global_max is None:
                                    y_nonan_nofv_nE_nogr = y_nonan_nofv_nE
                                    x_nonan_nofv_nE_nogr = x_nonan_nofv_nE
                                else:
                                    gr_ind = cf.reject_global_ranges(y_nonan_nofv_nE, global_min, global_max)
                                    y_nonan_nofv_nE_nogr = y_nonan_nofv_nE[gr_ind]
                                    x_nonan_nofv_nE_nogr = x_nonan_nofv_nE[gr_ind]

                                if len(y_nonan_nofv) > 0:
                                    if m == 'common_stream_placeholder':
                                        sname = '-'.join((r, sv))
                                    else:
                                        sname = '-'.join((r, m, sv))

                                    plt_deploy = [int(dn) for dn in np.unique(vinfo['deployments'])]
                                    title = r + '\nDeployments: ' + str(plt_deploy) + '\n' + t0 + ' - ' + t1

                                    # plot hourly averages for cabled and FDCHP data; the check
                                    # looks at the first stream entry of sci_vars_dict
                                    first_ms = next(iter(sci_vars_dict.values()))['ms'][0]
                                    if 'streamed' in first_ms or 'FDCHP' in r:
                                        sname = '-'.join((sname, 'hourlyavg'))
                                        df = pd.DataFrame({'dfx': x_nonan_nofv_nE_nogr, 'dfy': y_nonan_nofv_nE_nogr})
                                        dfr = df.resample('H', on='dfx').mean()

                                        # Plot all data
                                        fig, ax = pf.plot_timeseries_all(dfr.index, dfr['dfy'], sv, sv_units, stdev=None)
                                        ax.set_title(title, fontsize=8)

                                        # if plotting a specific time range, plot deployment lines only for those deployments
                                        if time_range_requested:
                                            _draw_vlines(ax, _end_times_for(plt_deploy, dend_by_deployment), 'b')
                                        else:
                                            _draw_vlines(ax, dend_times, 'b')
                                        pf.save_fig(save_dir, sname)
                                    else:
                                        # if plotting a specific time range, mark only the end of the
                                        # first plotted deployment
                                        # NOTE(review): the hourly-average branch marks every plotted
                                        # deployment instead -- confirm which behaviour is intended
                                        if time_range_requested:
                                            vline_times = _end_times_for(plt_deploy[:1], dend_by_deployment)
                                        else:
                                            vline_times = dend_times

                                        # Plot all data
                                        fig, ax = pf.plot_timeseries_all(x_nonan_nofv, y_nonan_nofv, sv, sv_units, stdev=None)
                                        ax.set_title(title, fontsize=8)
                                        _draw_vlines(ax, vline_times, 'b')
                                        pf.save_fig(save_dir, sname)

                                        # Plot data with outliers removed
                                        fig, ax = pf.plot_timeseries_all(x_nonan_nofv_nE_nogr, y_nonan_nofv_nE_nogr, sv, sv_units,
                                                                         stdev=5)
                                        ax.set_title(title, fontsize=8)
                                        _draw_vlines(ax, vline_times, 'b')

                                        sfile = '_'.join((sname, 'rmoutliers'))
                                        pf.save_fig(save_dir, sfile)