Пример #1
0
def _refdes_from_url(url):
    """Return the reference designator encoded in a data catalog URL.

    The second-to-last path segment is split on '-' and elements 1 through 4
    are joined back together.
    """
    parts = url.split('/')[-2].split('-')
    return '-'.join((parts[1], parts[2], parts[3], parts[4]))


def _select_preferred(datasets, refdes):
    """Return the files in ``datasets`` that match the preferred stream info.

    A file is kept when the refdes-method-stream built from its parent
    directory name and the deployment prefix of its file name both match a
    row of the table returned by ``cf.get_preferred_stream_info``.
    """
    ps_df, n_streams = cf.get_preferred_stream_info(refdes)
    selected = []
    for _, row in ps_df.iterrows():
        for ii in range(n_streams):
            rms = '-'.join((refdes, row[ii]))
            for dd in datasets:
                spl = dd.split('/')[-2].split('-')
                catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                fdeploy = dd.split('/')[-1].split('_')[0]
                if rms == catalog_rms and fdeploy == row['deployment']:
                    selected.append(dd)
    return selected


def _keep(index, *arrays):
    """Apply the same index/mask to every array and return them as a tuple."""
    return tuple(arr[index] for arr in arrays)


def _clean_ts_data(r, xvar, yvar, sal, temp, tme, p, sal_fv, temp_fv):
    """Filter salinity, temperature, time and pressure arrays in lockstep.

    Removes, in order: NaNs/zeros/fill values (salinity, then temperature),
    values outside the global ranges (when both limits are available), and
    outliers as flagged by ``cf.reject_outliers(values, 5)``.
    """
    # get rid of nans, 0.0s, fill values
    good = (~np.isnan(sal)) & (sal != 0.0) & (sal != sal_fv)
    sal, temp, tme, p = _keep(good, sal, temp, tme, p)
    good = (~np.isnan(temp)) & (temp != 0.0) & (temp != temp_fv)
    sal, temp, tme, p = _keep(good, sal, temp, tme, p)

    # reject values outside global ranges (skipped when a limit is unknown)
    global_min, global_max = cf.get_global_ranges(r, xvar)
    if global_min is not None and global_max is not None:
        in_range = cf.reject_global_ranges(sal, global_min, global_max)
        sal, temp, tme, p = _keep(in_range, sal, temp, tme, p)

    global_min, global_max = cf.get_global_ranges(r, yvar)
    if global_min is not None and global_max is not None:
        in_range = cf.reject_global_ranges(temp, global_min, global_max)
        sal, temp, tme, p = _keep(in_range, sal, temp, tme, p)

    # get rid of outliers
    sal, temp, tme, p = _keep(cf.reject_outliers(sal, 5), sal, temp, tme, p)
    sal, temp, tme, p = _keep(cf.reject_outliers(temp, 5), sal, temp, tme, p)
    return sal, temp, tme, p


def _padded_limits(values, fraction):
    """Return (min, max) of ``values`` widened by ``fraction`` of their magnitude.

    The magnitude (``abs``) is used so that the limits always move outwards;
    padding with the signed value would shrink the range for negative data.
    """
    lowest = values.min()
    highest = values.max()
    return lowest - fraction * abs(lowest), highest + fraction * abs(highest)


def _density_grid(sal, temp, p):
    """Build the salinity/temperature axes and the density grid behind the plot.

    Returns ``(si, ti, mdens)`` where ``mdens[j, i]`` is
    ``gsw.density.rho(si[i], ti[j], median(p))``.
    """
    # Figure out boundaries (mins and maxes); padding depends on the data range
    s_range = sal.max() - sal.min()
    smin, smax = _padded_limits(sal, 0.0005 if s_range < 0.2 else 0.001)

    t_range = temp.max() - temp.min()
    if t_range <= 1:
        t_pad = 0.01
    elif t_range < 1.5:
        t_pad = 0.05
    else:
        t_pad = 0.1
    tmin, tmax = _padded_limits(temp, t_pad)

    # Calculate how many gridcells are needed in the x and y directions and
    # create temp and sal vectors of appropriate dimensions
    xdim = int(round((smax - smin) / 0.1 + 1, 0))
    if xdim == 1:
        xdim = 2
    si = np.linspace(0, xdim - 1, xdim) * 0.1 + smin

    if 1.1 <= t_range < 1.7:  # if the diff between min and max temp is small
        t_step = 0.75
    elif t_range < 1.1:
        t_step = 0.1
    else:
        t_step = 1.0
    ydim = int(round((tmax - tmin) / t_step + 1, 0))
    ti = np.linspace(0, ydim - 1, ydim) * t_step + tmin

    # Fill the grid with densities calculated at the median pressure value
    # (computed once instead of once per grid cell)
    p_median = np.median(p)
    mdens = np.zeros((ydim, xdim))
    for j in range(ydim):
        for i in range(xdim):
            mdens[j, i] = gsw.density.rho(si[i], ti[j], p_median)
    return si, ti, mdens


def main(sDir, url_list, start_time, end_time, preferred_only):
    """Create a temperature-salinity plot for every dataset file found.

    :param sDir: root directory; plots are saved under
        ``<sDir>/<array>/<subsite>/<refdes>/ts_plots``
    :param url_list: data catalog URLs; the reference designator is parsed
        from the second-to-last path segment of each URL
    :param start_time: start of the time range to plot, or None
    :param end_time: end of the time range to plot, or None; the time filter
        is applied only when both limits are given
    :param preferred_only: 'yes' to restrict the files to those matching the
        preferred stream information; any other value plots every file
    """
    # unique reference designators, in order of first appearance
    rd_list = []
    for uu in url_list:
        rd = _refdes_from_url(uu)
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = list(itertools.chain.from_iterable(
            cf.get_nc_urls([u]) for u in url_list if _refdes_from_url(u) == r))

        if preferred_only == 'yes':
            fdatasets = _select_preferred(datasets, r)
        else:
            fdatasets = datasets

        for fd in fdatasets:
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                save_dir = os.path.join(sDir, array, subsite, refdes, 'ts_plots')
                cf.create_dir(save_dir)

                # title and file name use the time span of the unfiltered data
                tme = ds['time'].values
                t0 = pd.to_datetime(tme.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tme.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))
                filename = '-'.join(('_'.join(fname.split('_')[:-1]), 'ts', t0[:10]))

                ds_vars = list(ds.data_vars.keys())
                raw_vars = cf.return_raw_vars(ds_vars)

                xvar = return_var(ds, raw_vars, 'salinity', 'Practical Salinity')
                sal = ds[xvar].values
                sal_fv = ds[xvar]._FillValue

                yvar = return_var(ds, raw_vars, 'temp', 'Seawater Temperature')
                temp = ds[yvar].values
                temp_fv = ds[yvar]._FillValue

                # pressure is looked up among the coordinates first, then the data variables
                press = pf.pressure_var(ds, list(ds.coords.keys()))
                if press is None:
                    press = pf.pressure_var(ds, list(ds.data_vars.keys()))
                p = ds[press].values

                sal, temp, tme, p = _clean_ts_data(r, xvar, yvar, sal, temp, tme, p, sal_fv, temp_fv)

                if len(sal) > 0:  # if there are any data to plot
                    colors = cm.rainbow(np.linspace(0, 1, len(tme)))
                    si, ti, mdens = _density_grid(sal, temp, p)

                    fig, ax = pf.plot_ts(si, ti, mdens, sal, temp, colors)

                    ax.set_title((title + '\n' + t0 + ' - ' + t1 + '\ncolors = time (cooler: earlier)'), fontsize=9)
                    leg_text = ('Removed {} values (SD=5)'.format(len(ds[xvar].values) - len(sal)),)
                    ax.legend(leg_text, loc='best', fontsize=6)
                    pf.save_fig(save_dir, filename)
Пример #2
0
def add_pressure_to_dictionary_of_sci_vars(ds):
    """Pick the pressure variable of a dataset and collect its metadata.

    The variable name depends on the platform: 'sci_water_pressure_dbar' when
    'MOAS' is in ``ds.subsite`` and 'CTD' is in ``ds.sensor``,
    'int_ctd_pressure' for other 'MOAS' sensors, and whatever
    ``pf.pressure_var`` finds among the data variables otherwise. When the
    selected values are all NaN, all zero or all fill values, a coordinate
    whose name contains 'pressure' is used instead; if exactly one such
    coordinate cannot be found, the values are replaced by NaNs.

    :param ds: dataset exposing ``subsite``, ``sensor``, ``data_vars``,
        ``coords`` and a 'time' variable (presumably an xarray Dataset)
    :return: tuple ``(pressure, y, y_unit, y_name, y_fillvalue)`` -- the
        variable name (or the list of candidate coordinates when the fallback
        is ambiguous), its values, and lists holding the units, long names
        and fill values found (or a placeholder string when missing)
    """
    y_unit = []
    y_name = []
    y_fillvalue = []

    def nan_profile():
        # 1-D array of NaNs with one entry per timestamp
        return np.full(len(ds['time'].values), np.nan)

    def add_unique(bucket, value):
        if value not in bucket:
            bucket.append(value)

    if 'MOAS' in ds.subsite:
        if 'CTD' in ds.sensor:  # for glider CTDs, pressure is a coordinate
            pressure = 'sci_water_pressure_dbar'
            y = ds[pressure].values
            # A missing attribute is tolerated here; it is reported by the
            # metadata collection at the end of the function.
            for attr, bucket in (('units', y_unit), ('long_name', y_name)):
                value = getattr(ds[pressure], attr, None)
                if value is not None:
                    add_unique(bucket, value)
        else:
            pressure = 'int_ctd_pressure'
            y = ds[pressure].values
            for attr, bucket, fallback in (('units', y_unit, 'no_units'),
                                           ('long_name', y_name, 'pressure')):
                try:
                    value = getattr(ds[pressure], attr)
                except AttributeError:
                    bucket.append(fallback)
                else:
                    add_unique(bucket, value)
    else:
        try:
            pressure = pf.pressure_var(ds, ds.data_vars.keys())
            y = ds[pressure].values
            if len(ds[pressure].dims) > 1:
                print('Pressure has >1 dimension')
                y = nan_profile()
        except KeyError:
            print('no pressure variable in file - replacing by a nan array')
            y = nan_profile()

    # The fill value is only consulted when the array is not already known to
    # be unusable, and a variable without a _FillValue attribute simply skips
    # the fill-value test instead of raising AttributeError.
    unusable = bool(np.isnan(y).all()) or not np.any(y != 0)
    if not unusable:
        fill_value = getattr(ds[pressure], '_FillValue', None)
        unusable = fill_value is not None and not np.any(y != fill_value)

    if unusable:
        print(
            'Pressure array of all  NaNs or zeros or fill values - ... trying to use pressure coordinate'
        )
        candidates = [
            name for name in ds.coords.keys()
            if 'pressure' in ds.coords[name].name
        ]

        if len(candidates) == 1:
            pressure = candidates[0]
            y = ds.coords[pressure].values
        else:
            # keep the (empty or ambiguous) candidate list as the returned name
            pressure = candidates
            print('Missing pressure coordinate: ', pressure)
            y = nan_profile()

    # Collect units, long name and fill value of the variable finally selected
    metadata = (
        ('units', y_unit, 'pressure attributes missing units',
         'pressure unit missing'),
        ('long_name', y_name, 'pressure attributes missing long_name',
         'pressure long name missing'),
        ('_FillValue', y_fillvalue, 'pressure attributes missing _FillValue',
         'pressure Fill Value missing'),
    )
    for attr, bucket, message, placeholder in metadata:
        try:
            value = getattr(ds[pressure], attr)
        except AttributeError:
            print(message)
            add_unique(bucket, placeholder)
        else:
            add_unique(bucket, value)

    return pressure, y, y_unit, y_name, y_fillvalue
def main(sDir, url_list):
    reviewlist = pd.read_csv(
        'https://raw.githubusercontent.com/ooi-data-lab/data-review-prep/master/review_list/data_review_list.csv'
    )

    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    json_file_list = []
    for r in rd_list:
        dependencies = []
        print('\n{}'.format(r))
        data = OrderedDict(deployments=OrderedDict())
        save_dir = os.path.join(sDir, r.split('-')[0], r)
        cf.create_dir(save_dir)

        # Deployment location test
        deploy_loc_test = cf.deploy_location_check(r)
        data['location_comparison'] = deploy_loc_test

        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            catalog_rms = '-'.join((r, splitter[-2], splitter[-1]))

            # complete the analysis by reference designator
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])

                # check for the OOI 1.0 datasets for review
                rl_filtered = reviewlist.loc[
                    (reviewlist['Reference Designator'] == r)
                    & (reviewlist['status'] == 'for review')]
                review_deployments = rl_filtered['deploymentNumber'].tolist()
                review_deployments_int = [
                    'deployment%04d' % int(x) for x in review_deployments
                ]
                for rev_dep in review_deployments_int:
                    rdatasets = [s for s in udatasets if rev_dep in s]
                    if len(rdatasets) > 0:
                        datasets = []
                        for dss in rdatasets:  # filter out collocated data files
                            if catalog_rms == dss.split('/')[-1].split(
                                    '_20')[0][15:]:
                                datasets.append(dss)
                            else:
                                drd = dss.split('/')[-1].split('_20')[0][15:42]
                                if drd not in dependencies and drd != r:
                                    dependencies.append(drd)

                        notes = []
                        time_ascending = ''
                        if len(datasets) == 1:
                            try:
                                ds = xr.open_dataset(datasets[0],
                                                     mask_and_scale=False)
                                ds = ds.swap_dims({'obs': 'time'})
                                fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(
                                    datasets[0])
                            except OSError:
                                print('OSError - skipping file {}'.format(
                                    datasets[0]))
                                continue
                        elif len(datasets) > 1:
                            ds = xr.open_mfdataset(datasets,
                                                   mask_and_scale=False)
                            ds = ds.swap_dims({'obs': 'time'})
                            #ds = ds.chunk({'time': 100})
                            fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(
                                datasets[0])
                            fname = fname.split('_20')[0]
                            notes.append('multiple deployment .nc files')
                            # when opening multiple datasets, don't check that the timestamps are in ascending order
                            time_ascending = 'not_tested'
                        else:
                            continue

                        print('\nAnalyzing file: {}'.format(fname))

                        # Get info from the data review database
                        dr_data = cf.refdes_datareview_json(refdes)
                        stream_vars = cf.return_stream_vars(data_stream)
                        sci_vars = cf.return_science_vars(data_stream)
                        node = refdes.split('-')[1]
                        if 'cspp' in data_stream or 'WFP' in node:
                            sci_vars.append('int_ctd_pressure')

                        # if 'FDCHP' in refdes:
                        #     remove_vars = ['fdchp_wind_x', 'fdchp_wind_y', 'fdchp_wind_z', 'fdchp_speed_of_sound_sonic',
                        #                    'fdchp_x_accel_g', 'fdchp_y_accel_g', 'fdchp_z_accel_g']
                        #     rv_regex = re.compile('|'.join(remove_vars))
                        #     rv_sci_vars = [nn for nn in sci_vars if not rv_regex.search(nn)]
                        #     sci_vars = rv_sci_vars

                        deploy_info = get_deployment_information(
                            dr_data, int(deployment[-4:]))

                        # Grab deployment Variables
                        deploy_start = str(deploy_info['start_date'])
                        deploy_stop = str(deploy_info['stop_date'])
                        deploy_lon = deploy_info['longitude']
                        deploy_lat = deploy_info['latitude']
                        deploy_depth = deploy_info['deployment_depth']

                        # Calculate days deployed
                        if deploy_stop != 'None':
                            r_deploy_start = pd.to_datetime(
                                deploy_start).replace(hour=0,
                                                      minute=0,
                                                      second=0)
                            if deploy_stop.split('T')[1] == '00:00:00':
                                r_deploy_stop = pd.to_datetime(deploy_stop)
                            else:
                                r_deploy_stop = (pd.to_datetime(deploy_stop) +
                                                 timedelta(days=1)).replace(
                                                     hour=0,
                                                     minute=0,
                                                     second=0)
                            n_days_deployed = (r_deploy_stop -
                                               r_deploy_start).days
                        else:
                            n_days_deployed = None

                        # Add reference designator to dictionary
                        try:
                            data['refdes']
                        except KeyError:
                            data['refdes'] = refdes

                        deployments = data['deployments'].keys()
                        data_start = pd.to_datetime(min(
                            ds['time'].values)).strftime('%Y-%m-%dT%H:%M:%S')
                        data_stop = pd.to_datetime(max(
                            ds['time'].values)).strftime('%Y-%m-%dT%H:%M:%S')

                        # Add deployment and info to dictionary and initialize delivery method sub-dictionary
                        if deployment not in deployments:
                            data['deployments'][deployment] = OrderedDict(
                                deploy_start=deploy_start,
                                deploy_stop=deploy_stop,
                                n_days_deployed=n_days_deployed,
                                lon=deploy_lon,
                                lat=deploy_lat,
                                deploy_depth=deploy_depth,
                                method=OrderedDict())

                        # Add delivery methods to dictionary and initialize stream sub-dictionary
                        methods = data['deployments'][deployment][
                            'method'].keys()
                        if method not in methods:
                            data['deployments'][deployment]['method'][
                                method] = OrderedDict(stream=OrderedDict())

                        # Add streams to dictionary and initialize file sub-dictionary
                        streams = data['deployments'][deployment]['method'][
                            method]['stream'].keys()

                        if data_stream not in streams:
                            data['deployments'][deployment]['method'][method][
                                'stream'][data_stream] = OrderedDict(
                                    file=OrderedDict())

                        # Get a list of data gaps >1 day
                        time_df = pd.DataFrame(ds['time'].values,
                                               columns=['time'])
                        gap_list = cf.timestamp_gap_test(time_df)

                        # Calculate the sampling rate to the nearest second
                        time_df['diff'] = time_df['time'].diff().astype(
                            'timedelta64[s]')
                        rates_df = time_df.groupby(['diff']).agg(['count'])
                        n_diff_calc = len(time_df) - 1
                        rates = dict(n_unique_rates=len(rates_df),
                                     common_sampling_rates=dict())
                        for i, row in rates_df.iterrows():
                            percent = (float(row['time']['count']) /
                                       float(n_diff_calc))
                            if percent > 0.1:
                                rates['common_sampling_rates'].update(
                                    {int(i): '{:.2%}'.format(percent)})

                        sampling_rt_sec = None
                        for k, v in rates['common_sampling_rates'].items():
                            if float(v.strip('%')) > 50.00:
                                sampling_rt_sec = k

                        if not sampling_rt_sec:
                            sampling_rt_sec = 'no consistent sampling rate: {}'.format(
                                rates['common_sampling_rates'])

                        # Check that the timestamps in the file are unique
                        time = ds['time']
                        len_time = time.__len__()
                        len_time_unique = np.unique(time).__len__()
                        if len_time == len_time_unique:
                            time_test = 'pass'
                        else:
                            time_test = 'fail'

                        # Check that the timestamps in the file are in ascending order
                        if time_ascending != 'not_tested':
                            # convert time to number
                            time_in = [
                                dt.datetime.utcfromtimestamp(
                                    np.datetime64(x).astype('O') / 1e9)
                                for x in ds['time'].values
                            ]
                            time_data = nc.date2num(
                                time_in, 'seconds since 1900-01-01')

                            # Create a list of True or False by iterating through the array of time and checking
                            # if every time stamp is increasing
                            result = [(time_data[k + 1] - time_data[k]) > 0
                                      for k in range(len(time_data) - 1)]

                            # Print outcome of the iteration with the list of indices when time is not increasing
                            if result.count(True) == len(time) - 1:
                                time_ascending = 'pass'
                            else:
                                ind_fail = {
                                    k: time_in[k]
                                    for k, v in enumerate(result) if v is False
                                }
                                time_ascending = 'fail: {}'.format(ind_fail)

                        # Count the number of days for which there is at least 1 timestamp
                        n_days = len(
                            np.unique(time.values.astype('datetime64[D]')))

                        # Compare variables in file to variables in Data Review Database
                        ds_variables = list(ds.data_vars.keys()) + list(
                            ds.coords.keys())
                        #ds_variables = [k for k in ds]
                        ds_variables = eliminate_common_variables(ds_variables)
                        ds_variables = [
                            x for x in ds_variables if 'qc' not in x
                        ]
                        [_, unmatch1] = compare_lists(stream_vars,
                                                      ds_variables)
                        [_, unmatch2] = compare_lists(ds_variables,
                                                      stream_vars)

                        # Check deployment pressure from asset management against pressure variable in file
                        press = pf.pressure_var(ds, list(ds.coords.keys()))
                        if press is None:
                            press = pf.pressure_var(ds,
                                                    list(ds.data_vars.keys()))

                        # calculate mean pressure from data, excluding outliers +/- 3 SD
                        try:
                            pressure = ds[press]
                            num_dims = len(pressure.dims)
                            if len(pressure) > 1:
                                # if the pressure variable is an array of all zeros (as in the case of pressure_depth
                                # for OPTAAs on surface piercing profilers
                                if (len(np.unique(pressure)) == 1) & (
                                        np.unique(pressure)[0] == 0.0):
                                    try:
                                        pressure = ds['int_ctd_pressure']
                                        press = 'int_ctd_pressure'
                                    except KeyError:
                                        pressure = pressure

                                # reject NaNs
                                p_nonan = pressure.values[~np.isnan(pressure.
                                                                    values)]

                                # reject fill values
                                p_nonan_nofv = p_nonan[
                                    p_nonan != pressure._FillValue]

                                # reject data outside of global ranges
                                [pg_min,
                                 pg_max] = cf.get_global_ranges(r, press)
                                if pg_min is not None and pg_max is not None:
                                    pgr_ind = cf.reject_global_ranges(
                                        p_nonan_nofv, pg_min, pg_max)
                                    p_nonan_nofv_gr = p_nonan_nofv[pgr_ind]
                                else:
                                    p_nonan_nofv_gr = p_nonan_nofv

                                if (len(p_nonan_nofv_gr) > 0) and (num_dims
                                                                   == 1):
                                    [
                                        press_outliers, pressure_mean, _,
                                        pressure_max, _, _
                                    ] = cf.variable_statistics(
                                        p_nonan_nofv_gr, 3)
                                    pressure_mean = round(pressure_mean, 2)
                                    pressure_max = round(pressure_max, 2)
                                elif (len(p_nonan_nofv_gr) > 0) and (num_dims >
                                                                     1):
                                    print('variable has more than 1 dimension')
                                    press_outliers = 'not calculated: variable has more than 1 dimension'
                                    pressure_mean = round(
                                        np.nanmean(p_nonan_nofv_gr), 2)
                                    pressure_max = round(
                                        np.nanmax(p_nonan_nofv_gr), 2)
                                else:
                                    press_outliers = None
                                    pressure_mean = None
                                    pressure_max = None
                                    if len(pressure) > 0 and len(p_nonan) == 0:
                                        notes.append(
                                            'Pressure variable all NaNs')
                                    elif len(pressure) > 0 and len(
                                            p_nonan) > 0 and len(
                                                p_nonan_nofv) == 0:
                                        notes.append(
                                            'Pressure variable all fill values'
                                        )
                                    elif len(pressure) > 0 and len(
                                            p_nonan) > 0 and len(
                                                p_nonan_nofv) > 0 and len(
                                                    p_nonan_nofv_gr) == 0:
                                        notes.append(
                                            'Pressure variable outside of global ranges'
                                        )

                            else:  # if there is only 1 data point
                                press_outliers = 0
                                pressure_mean = round(
                                    ds[press].values.tolist()[0], 2)
                                pressure_max = round(
                                    ds[press].values.tolist()[0], 2)

                            try:
                                pressure_units = pressure.units
                            except AttributeError:
                                pressure_units = 'no units attribute for pressure'

                            if pressure_mean:
                                if ('WFP' in node) or ('MOAS' in subsite) or (
                                        'SP' in node):
                                    pressure_compare = int(round(pressure_max))
                                else:
                                    pressure_compare = int(
                                        round(pressure_mean))

                                if pressure_units == '0.001 dbar':
                                    pressure_max = round((pressure_max / 1000),
                                                         2)
                                    pressure_mean = round(
                                        (pressure_mean / 1000), 2)
                                    pressure_compare = round(
                                        (pressure_compare / 1000), 2)
                                    notes.append(
                                        'Pressure converted from 0.001 dbar to dbar for pressure comparison'
                                    )

                                elif pressure_units == 'daPa':
                                    pressure_max = round((pressure_max / 1000),
                                                         2)
                                    pressure_mean = round(
                                        (pressure_mean / 1000), 2)
                                    pressure_compare = round(
                                        (pressure_compare / 1000), 2)
                                    notes.append(
                                        'Pressure converted from daPa to dbar for pressure comparison'
                                    )

                            else:
                                pressure_compare = None

                            if (not deploy_depth) or (not pressure_mean):
                                pressure_diff = None
                            else:
                                pressure_diff = pressure_compare - deploy_depth

                        except KeyError:
                            press = 'no seawater pressure in file'
                            pressure_diff = None
                            pressure_mean = None
                            pressure_max = None
                            pressure_compare = None
                            press_outliers = None
                            pressure_units = None

                        # Add files and info to dictionary
                        filenames = data['deployments'][deployment]['method'][
                            method]['stream'][data_stream]['file'].keys()

                        if fname not in filenames:
                            data['deployments'][deployment]['method'][method][
                                'stream'][data_stream]['file'][
                                    fname] = OrderedDict(
                                        file_downloaded=pd.to_datetime(
                                            splitter[0][0:15]).strftime(
                                                '%Y-%m-%dT%H:%M:%S'),
                                        file_coordinates=list(
                                            ds.coords.keys()),
                                        sampling_rate_seconds=sampling_rt_sec,
                                        sampling_rate_details=rates,
                                        data_start=data_start,
                                        data_stop=data_stop,
                                        time_gaps=gap_list,
                                        unique_timestamps=time_test,
                                        n_timestamps=len_time,
                                        n_days=n_days,
                                        notes=notes,
                                        ascending_timestamps=time_ascending,
                                        pressure_comparison=dict(
                                            pressure_mean=pressure_mean,
                                            units=pressure_units,
                                            num_outliers=press_outliers,
                                            diff=pressure_diff,
                                            pressure_max=pressure_max,
                                            variable=press,
                                            pressure_compare=pressure_compare),
                                        vars_in_file=ds_variables,
                                        vars_not_in_file=[
                                            x for x in unmatch1
                                            if 'time' not in x
                                        ],
                                        vars_not_in_db=unmatch2,
                                        sci_var_stats=OrderedDict())

                        # calculate statistics for science variables, excluding outliers +/- 5 SD
                        for sv in sci_vars:
                            if sv != 't_max':  # for ADCP
                                if sv != 'wavss_a_buoymotion_time':
                                    print(sv)
                                    try:
                                        var = ds[sv]
                                        # SPKIR values would need to be rounded to 1 decimal place to match the global
                                        # ranges; otherwise, values that round to zero (e.g. 1.55294e-05) will be excluded
                                        # by the global range test. The rounding below is currently disabled (commented out).
                                        # if 'spkir' in sv:
                                        #     vD = np.round(var.values, 1)
                                        # else:
                                        #     vD = var.values
                                        vD = var.values
                                        if 'timedelta' not in str(
                                                var.values.dtype):
                                            # OPTAA wavelengths: when multiple files are opened with xr.open_mfdataset,
                                            # xarray automatically forces all variables to have the same number of
                                            # dimensions. wavelength_a and wavelength_c have 1 dimension in the
                                            # individual files, so the analysis is forced to treat them as
                                            # 1-dimensional (when there are multiple files for 1 deployment).
                                            if sv == 'wavelength_a' or sv == 'wavelength_c':
                                                [g_min,
                                                 g_max] = cf.get_global_ranges(
                                                     r, sv)
                                                vnum_dims = len(var.dims)
                                                if vnum_dims == 1:
                                                    n_all = len(var)
                                                    mean = list(vD)
                                                else:
                                                    vnum_dims = 1
                                                    n_all = len(vD[0])
                                                    mean = list(vD[0])
                                                num_outliers = None
                                                vmin = None
                                                vmax = None
                                                sd = None
                                                n_stats = 'not calculated'
                                                var_units = var.units
                                                n_nan = None
                                                n_fv = None
                                                n_grange = 'no global ranges'
                                                fv = var._FillValue

                                            else:
                                                vnum_dims = len(var.dims)
                                                if vnum_dims > 2:
                                                    print(
                                                        'variable has more than 2 dimensions'
                                                    )
                                                    num_outliers = None
                                                    mean = None
                                                    vmin = None
                                                    vmax = None
                                                    sd = None
                                                    n_stats = 'variable has more than 2 dimensions'
                                                    var_units = var.units
                                                    n_nan = None
                                                    n_fv = None
                                                    n_grange = None
                                                    fv = None
                                                    n_all = None
                                                else:
                                                    if vnum_dims > 1:
                                                        n_all = [
                                                            len(vD),
                                                            len(vD.flatten())
                                                        ]
                                                    else:
                                                        n_all = len(vD)
                                                    n_nan = int(
                                                        np.sum(np.isnan(vD)))
                                                    fv = var._FillValue
                                                    var_nofv = var.where(
                                                        var != fv)
                                                    n_fv = int(
                                                        np.sum(
                                                            np.isnan(
                                                                var_nofv.values
                                                            ))) - n_nan

                                                    try:
                                                        var_units = var.units
                                                    except AttributeError:
                                                        var_units = 'no_units'
                                                    [g_min, g_max
                                                     ] = cf.get_global_ranges(
                                                         r, sv)
                                                    if list(
                                                            np.unique(
                                                                np.isnan(
                                                                    var_nofv))
                                                    ) != [True]:
                                                        # reject data outside of global ranges
                                                        if g_min is not None and g_max is not None:
                                                            var_gr = var_nofv.where(
                                                                (var_nofv >=
                                                                 g_min)
                                                                & (var_nofv <=
                                                                   g_max))
                                                            n_grange = int(
                                                                np.sum(
                                                                    np.isnan(
                                                                        var_gr)
                                                                ) - n_fv -
                                                                n_nan)
                                                        else:
                                                            n_grange = 'no global ranges'
                                                            var_gr = var_nofv

                                                        if list(
                                                                np.unique(
                                                                    np.isnan(
                                                                        var_gr)
                                                                )) != [True]:
                                                            if sv == 'spkir_abj_cspp_downwelling_vector':
                                                                # don't remove outliers from dataset
                                                                [
                                                                    num_outliers,
                                                                    mean, vmin,
                                                                    vmax, sd,
                                                                    n_stats
                                                                ] = cf.variable_statistics_spkir(
                                                                    var_gr)
                                                            else:
                                                                if vnum_dims > 1:
                                                                    var_gr = var_gr.values.flatten(
                                                                    )
                                                                # drop nans before calculating stats
                                                                var_gr = var_gr[
                                                                    ~np.isnan(
                                                                        var_gr
                                                                    )]
                                                                [
                                                                    num_outliers,
                                                                    mean, vmin,
                                                                    vmax, sd,
                                                                    n_stats
                                                                ] = cf.variable_statistics(
                                                                    var_gr, 5)
                                                        else:
                                                            num_outliers = None
                                                            mean = None
                                                            vmin = None
                                                            vmax = None
                                                            sd = None
                                                            n_stats = 0
                                                            n_grange = None
                                                    else:
                                                        num_outliers = None
                                                        mean = None
                                                        vmin = None
                                                        vmax = None
                                                        sd = None
                                                        n_stats = 0
                                                        n_grange = None

                                    except KeyError:
                                        if sv == 'int_ctd_pressure':
                                            continue
                                        else:
                                            num_outliers = None
                                            mean = None
                                            vmin = None
                                            vmax = None
                                            sd = None
                                            n_stats = 'variable not found in file'
                                            var_units = None
                                            n_nan = None
                                            n_fv = None
                                            fv = None
                                            n_grange = None
                                            n_all = None

                                    if vnum_dims > 1:
                                        sv = '{} (dims: {})'.format(
                                            sv, list(var.dims))
                                    else:
                                        sv = sv
                                    if 'timedelta' not in str(
                                            var.values.dtype):
                                        data['deployments'][deployment][
                                            'method'][method]['stream'][
                                                data_stream]['file'][fname][
                                                    'sci_var_stats'][sv] = dict(
                                                        n_outliers=num_outliers,
                                                        mean=mean,
                                                        min=vmin,
                                                        max=vmax,
                                                        stdev=sd,
                                                        n_stats=n_stats,
                                                        units=var_units,
                                                        n_nans=n_nan,
                                                        n_fillvalues=n_fv,
                                                        fill_value=str(fv),
                                                        global_ranges=[
                                                            g_min, g_max
                                                        ],
                                                        n_grange=n_grange,
                                                        n_all=n_all)

        sfile = os.path.join(save_dir, '{}-file_analysis.json'.format(r))
        with open(sfile, 'w') as outfile:
            json.dump(data, outfile)

        depfile = os.path.join(save_dir, '{}-dependencies.txt'.format(r))
        with open(depfile, 'w') as depf:
            depf.write(str(dependencies))

        json_file_list.append(str(sfile))

    return json_file_list
def main(sDir, url_list, deployment_num):
    reviewlist = pd.read_csv(
        'https://raw.githubusercontent.com/ooi-data-lab/data-review-prep/master/review_list/data_review_list.csv')

    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    json_file_list = []
    for r in rd_list:
        dependencies = []
        print('\n{}'.format(r))
        data = OrderedDict(deployments=OrderedDict())
        save_dir = os.path.join(sDir, r.split('-')[0], r)
        cf.create_dir(save_dir)

        # Deployment location test
        deploy_loc_test = cf.deploy_location_check(r)
        data['location_comparison'] = deploy_loc_test

        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            catalog_rms = '-'.join((r, splitter[-2], splitter[-1]))

            # complete the analysis by reference designator
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])

                # check for the OOI 1.0 datasets for review
                rl_filtered = reviewlist.loc[
                    (reviewlist['Reference Designator'] == r) & (reviewlist['status'] == 'for review')]
                review_deployments = rl_filtered['deploymentNumber'].tolist()
                review_deployments_int = ['deployment%04d' % int(x) for x in review_deployments]

                for rev_dep in review_deployments_int:
                    if deployment_num is not None:
                        if int(rev_dep[-4:]) is not deployment_num:
                            print('\nskipping {}'.format(rev_dep))
                            continue

                    rdatasets = [s for s in udatasets if rev_dep in s]
                    rdatasets.sort()
                    if len(rdatasets) > 0:
                        datasets = []
                        for dss in rdatasets:  # filter out collocated data files
                            if catalog_rms == dss.split('/')[-1].split('_20')[0][15:]:
                                datasets.append(dss)
                            else:
                                drd = dss.split('/')[-1].split('_20')[0][15:42]
                                if drd not in dependencies and drd != r:
                                    dependencies.append(drd)

                        notes = []
                        time_ascending = ''
                        sci_vars_dict = {}
                        #datasets = datasets[0:2]  #### for testing
                        for i in range(len(datasets)):
                            ds = xr.open_dataset(datasets[i], mask_and_scale=False)
                            ds = ds.swap_dims({'obs': 'time'})
                            print('\nAppending data from {}: file {} of {}'.format(rev_dep, i+1, len(datasets)))

                            # when opening multiple datasets, don't check that the timestamps are in ascending order
                            time_ascending = 'not_tested'

                            if i == 0:
                                fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(datasets[0])
                                fname = fname.split('_20')[0]

                                # Get info from the data review database
                                dr_data = cf.refdes_datareview_json(refdes)
                                stream_vars = cf.return_stream_vars(data_stream)
                                sci_vars = cf.return_science_vars(data_stream)
                                node = refdes.split('-')[1]
                                if 'cspp' in data_stream or 'WFP' in node:
                                    sci_vars.append('int_ctd_pressure')

                                # Add pressure to the list of science variables
                                press = pf.pressure_var(ds, list(ds.coords.keys()))
                                if press is None:
                                    press = pf.pressure_var(ds, list(ds.data_vars.keys()))
                                if press is not None:
                                    sci_vars.append(press)
                                sci_vars.append('time')
                                sci_vars = list(np.unique(sci_vars))
                                if 'ADCP' in r:
                                    sci_vars = [x for x in sci_vars if 'beam' not in x]

                                for sci_var in sci_vars:
                                    if sci_var == 'time':
                                        sci_vars_dict.update(
                                            {sci_var: dict(values=np.array([], dtype=np.datetime64), units=[], fv=[])})
                                    else:
                                        sci_vars_dict.update({sci_var: dict(values=np.array([]), units=[], fv=[])})

                                deploy_info = get_deployment_information(dr_data, int(deployment[-4:]))

                                # Grab deployment Variables
                                deploy_start = str(deploy_info['start_date'])
                                deploy_stop = str(deploy_info['stop_date'])
                                deploy_lon = deploy_info['longitude']
                                deploy_lat = deploy_info['latitude']
                                deploy_depth = deploy_info['deployment_depth']

                                # Calculate days deployed
                                if deploy_stop != 'None':
                                    r_deploy_start = pd.to_datetime(deploy_start).replace(hour=0, minute=0, second=0)
                                    if deploy_stop.split('T')[1] == '00:00:00':
                                        r_deploy_stop = pd.to_datetime(deploy_stop)
                                    else:
                                        r_deploy_stop = (pd.to_datetime(deploy_stop) + timedelta(days=1)).replace(hour=0, minute=0, second=0)
                                    n_days_deployed = (r_deploy_stop - r_deploy_start).days
                                else:
                                    n_days_deployed = None

                                # Add reference designator to dictionary
                                try:
                                    data['refdes']
                                except KeyError:
                                    data['refdes'] = refdes

                            # append data for the deployment into a dictionary
                            for s_v in sci_vars_dict.keys():
                                vv = ds[s_v]
                                try:
                                    if vv.units not in sci_vars_dict[s_v]['units']:
                                        sci_vars_dict[s_v]['units'].append(vv.units)
                                except AttributeError:
                                    print('')
                                try:
                                    if vv._FillValue not in sci_vars_dict[s_v]['fv']:
                                        sci_vars_dict[s_v]['fv'].append(vv._FillValue)
                                except AttributeError:
                                    print('')
                                if len(vv.dims) == 1:
                                    if s_v in ['wavelength_a', 'wavelength_c']:
                                        # for these two OPTAA variables, append the array only if it is not the same
                                        # as the array that was already appended; if it's already there, don't append
                                        if np.sum(vv.values == sci_vars_dict[s_v]['values']) != len(vv.values):
                                            sci_vars_dict[s_v]['values'] = np.append(sci_vars_dict[s_v]['values'],
                                                                                     vv.values)
                                    else:
                                        sci_vars_dict[s_v]['values'] = np.append(sci_vars_dict[s_v]['values'], vv.values)

                                elif len(vv.dims) == 2:  # appending 2D datasets
                                    vD = vv.values.T
                                    if len(sci_vars_dict[s_v]['values']) == 0:
                                        sci_vars_dict[s_v]['values'] = vD
                                    else:
                                        sci_vars_dict[s_v]['values'] = np.concatenate((sci_vars_dict[s_v]['values'], vD), axis=1)

                        deployments = data['deployments'].keys()
                        data_start = pd.to_datetime(min(sci_vars_dict['time']['values'])).strftime('%Y-%m-%dT%H:%M:%S')
                        data_stop = pd.to_datetime(max(sci_vars_dict['time']['values'])).strftime('%Y-%m-%dT%H:%M:%S')

                        # Add deployment and info to dictionary and initialize delivery method sub-dictionary
                        if deployment not in deployments:
                            data['deployments'][deployment] = OrderedDict(deploy_start=deploy_start,
                                                                          deploy_stop=deploy_stop,
                                                                          n_days_deployed=n_days_deployed,
                                                                          lon=deploy_lon,
                                                                          lat=deploy_lat,
                                                                          deploy_depth=deploy_depth,
                                                                          method=OrderedDict())

                        # Add delivery methods to dictionary and initialize stream sub-dictionary
                        methods = data['deployments'][deployment]['method'].keys()
                        if method not in methods:
                            data['deployments'][deployment]['method'][method] = OrderedDict(
                                stream=OrderedDict())

                        # Add streams to dictionary and initialize file sub-dictionary
                        streams = data['deployments'][deployment]['method'][method]['stream'].keys()

                        if data_stream not in streams:
                            data['deployments'][deployment]['method'][method]['stream'][
                                data_stream] = OrderedDict(file=OrderedDict())

                        # Get a list of data gaps >1 day
                        time_df = pd.DataFrame(sci_vars_dict['time']['values'], columns=['time'])
                        time_df = time_df.sort_values(by=['time'])
                        gap_list = cf.timestamp_gap_test(time_df)

                        # Calculate the sampling rate to the nearest second
                        time_df['diff'] = time_df['time'].diff().astype('timedelta64[s]')
                        rates_df = time_df.groupby(['diff']).agg(['count'])
                        n_diff_calc = len(time_df) - 1
                        rates = dict(n_unique_rates=len(rates_df), common_sampling_rates=dict())
                        for i, row in rates_df.iterrows():
                            percent = (float(row['time']['count']) / float(n_diff_calc))
                            if percent > 0.1:
                                rates['common_sampling_rates'].update({int(i): '{:.2%}'.format(percent)})

                        sampling_rt_sec = None
                        for k, v in rates['common_sampling_rates'].items():
                            if float(v.strip('%')) > 50.00:
                                sampling_rt_sec = k

                        if not sampling_rt_sec:
                            sampling_rt_sec = 'no consistent sampling rate: {}'.format(rates['common_sampling_rates'])

                        # Skipped here: the check that the timestamps in the file are unique is not run, so the result is left blank
                        time_test = ''

                        # Count the number of days for which there is at least 1 timestamp
                        n_days = len(np.unique(sci_vars_dict['time']['values'].astype('datetime64[D]')))

                        # Compare variables in file to variables in Data Review Database
                        ds_variables = list(ds.data_vars.keys()) + list(ds.coords.keys())
                        ds_variables = eliminate_common_variables(ds_variables)
                        ds_variables = [x for x in ds_variables if 'qc' not in x]
                        [_, unmatch1] = compare_lists(stream_vars, ds_variables)
                        [_, unmatch2] = compare_lists(ds_variables, stream_vars)

                        # calculate mean pressure from data, excluding outliers +/- 3 SD
                        try:
                            pressure = sci_vars_dict[press]
                            if len(pressure) > 1:
                                # reject NaNs
                                p_nonan = pressure['values'][~np.isnan(pressure['values'])]

                                # reject fill values
                                p_nonan_nofv = p_nonan[p_nonan != pressure['fv'][0]]

                                # reject data outside of global ranges
                                [pg_min, pg_max] = cf.get_global_ranges(r, press)
                                if pg_min is not None and pg_max is not None:
                                    pgr_ind = cf.reject_global_ranges(p_nonan_nofv, pg_min, pg_max)
                                    p_nonan_nofv_gr = p_nonan_nofv[pgr_ind]
                                else:
                                    p_nonan_nofv_gr = p_nonan_nofv

                                if (len(p_nonan_nofv_gr) > 0):
                                    [press_outliers, pressure_mean, _, pressure_max, _, _] = cf.variable_statistics(p_nonan_nofv_gr, 3)
                                    pressure_mean = round(pressure_mean, 2)
                                    pressure_max = round(pressure_max, 2)
                                else:
                                    press_outliers = None
                                    pressure_mean = None
                                    pressure_max = None
                                    if len(pressure) > 0 and len(p_nonan) == 0:
                                        notes.append('Pressure variable all NaNs')
                                    elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) == 0:
                                        notes.append('Pressure variable all fill values')
                                    elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) > 0 and len(p_nonan_nofv_gr) == 0:
                                        notes.append('Pressure variable outside of global ranges')

                            else:  # if there is only 1 data point
                                press_outliers = 0
                                pressure_mean = round(ds[press].values.tolist()[0], 2)
                                pressure_max = round(ds[press].values.tolist()[0], 2)

                            try:
                                pressure_units = pressure['units'][0]
                            except AttributeError:
                                pressure_units = 'no units attribute for pressure'

                            if pressure_mean:
                                if 'SF' in node:
                                    pressure_compare = int(round(pressure_max))
                                else:
                                    pressure_compare = int(round(pressure_mean))

                                if pressure_units == '0.001 dbar':
                                    pressure_max = round((pressure_max / 1000), 2)
                                    pressure_mean = round((pressure_mean / 1000), 2)
                                    pressure_compare = round((pressure_compare / 1000), 2)
                                    notes.append('Pressure converted from 0.001 dbar to dbar for pressure comparison')

                                elif pressure_units == 'daPa':
                                    pressure_max = round((pressure_max / 1000), 2)
                                    pressure_mean = round((pressure_mean / 1000), 2)
                                    pressure_compare = round((pressure_compare / 1000), 2)
                                    notes.append('Pressure converted from daPa to dbar for pressure comparison')

                            else:
                                pressure_compare = None

                            if (not deploy_depth) or (not pressure_mean):
                                pressure_diff = None
                            else:
                                pressure_diff = pressure_compare - deploy_depth

                        except KeyError:
                            press = 'no seawater pressure in file'
                            pressure_diff = None
                            pressure_mean = None
                            pressure_max = None
                            pressure_compare = None
                            press_outliers = None
                            pressure_units = None

                        # Add files and info to dictionary
                        filenames = data['deployments'][deployment]['method'][method]['stream'][data_stream][
                            'file'].keys()

                        if fname not in filenames:
                            data['deployments'][deployment]['method'][method]['stream'][data_stream]['file'][
                                fname] = OrderedDict(
                                file_downloaded=pd.to_datetime(splitter[0][0:15]).strftime('%Y-%m-%dT%H:%M:%S'),
                                file_coordinates=list(ds.coords.keys()),
                                sampling_rate_seconds=sampling_rt_sec,
                                sampling_rate_details=rates,
                                data_start=data_start,
                                data_stop=data_stop,
                                time_gaps=gap_list,
                                unique_timestamps=time_test,
                                n_timestamps=len(sci_vars_dict['time']['values']),
                                n_days=n_days,
                                notes=notes,
                                ascending_timestamps=time_ascending,
                                pressure_comparison=dict(pressure_mean=pressure_mean, units=pressure_units,
                                                         num_outliers=press_outliers, diff=pressure_diff,
                                                         pressure_max=pressure_max, variable=press,
                                                         pressure_compare=pressure_compare),
                                vars_in_file=ds_variables,
                                vars_not_in_file=[x for x in unmatch1 if 'time' not in x],
                                vars_not_in_db=unmatch2,
                                sci_var_stats=OrderedDict())

                        # calculate statistics for science variables, excluding outliers +/- 5 SD
                        for sv in sci_vars_dict.keys():
                            if sv != 't_max':  # for ADCP
                                if sv != 'time':
                                    print(sv)
                                    var = sci_vars_dict[sv]
                                    vD = var['values']
                                    var_units = var['units']
                                    #if 'timedelta' not in str(vD.dtype):
                                    vnum_dims = len(np.shape(vD))
                                    # for OPTAA wavelengths, print the array
                                    if sv == 'wavelength_a' or sv == 'wavelength_c':
                                        [g_min, g_max] = cf.get_global_ranges(r, sv)
                                        n_all = len(var)
                                        mean = list(vD)
                                        num_outliers = None
                                        vmin = None
                                        vmax = None
                                        sd = None
                                        n_stats = 'not calculated'
                                        n_nan = None
                                        n_fv = None
                                        n_grange = 'no global ranges'
                                        fv = var['fv'][0]
                                    else:
                                        if vnum_dims > 2:
                                            print('variable has more than 2 dimensions')
                                            num_outliers = None
                                            mean = None
                                            vmin = None
                                            vmax = None
                                            sd = None
                                            n_stats = 'variable has more than 2 dimensions'
                                            n_nan = None
                                            n_fv = None
                                            n_grange = None
                                            fv = None
                                            n_all = None
                                        else:
                                            if vnum_dims > 1:
                                                n_all = [len(vD), len(vD.flatten())]
                                            else:
                                                n_all = len(vD)
                                            n_nan = int(np.sum(np.isnan(vD)))
                                            fv = var['fv'][0]
                                            vD[vD == fv] = np.nan  # turn fill values to nans
                                            n_fv = int(np.sum(np.isnan(vD))) - n_nan

                                            [g_min, g_max] = cf.get_global_ranges(r, sv)
                                            if list(np.unique(np.isnan(vD))) != [True]:
                                                # reject data outside of global ranges
                                                if g_min is not None and g_max is not None:
                                                    # turn data outside of global ranges to nans
                                                    #var_gr = var_nofv.where((var_nofv >= g_min) & (var_nofv <= g_max))
                                                    vD[vD < g_min] = np.nan
                                                    vD[vD > g_max] = np.nan
                                                    n_grange = int(np.sum(np.isnan(vD)) - n_fv - n_nan)
                                                else:
                                                    n_grange = 'no global ranges'

                                                if list(np.unique(np.isnan(vD))) != [True]:
                                                    if sv == 'spkir_abj_cspp_downwelling_vector':
                                                        # don't remove outliers from dataset
                                                        [num_outliers, mean, vmin, vmax, sd, n_stats] = cf.variable_statistics_spkir(vD)
                                                    else:
                                                        if vnum_dims > 1:
                                                            var_gr = vD.flatten()
                                                        else:
                                                            var_gr = vD
                                                        # drop nans before calculating stats
                                                        var_gr = var_gr[~np.isnan(var_gr)]
                                                        [num_outliers, mean, vmin, vmax, sd, n_stats] = cf.variable_statistics(var_gr, 5)
                                                else:
                                                    num_outliers = None
                                                    mean = None
                                                    vmin = None
                                                    vmax = None
                                                    sd = None
                                                    n_stats = 0
                                                    n_grange = None
                                            else:
                                                num_outliers = None
                                                mean = None
                                                vmin = None
                                                vmax = None
                                                sd = None
                                                n_stats = 0
                                                n_grange = None

                                    if vnum_dims > 1:
                                        sv = '{} (dims: {})'.format(sv, list(np.shape(var['values'])))
                                    else:
                                        sv = sv
                                    #if 'timedelta' not in str(var.values.dtype):
                                    data['deployments'][deployment]['method'][method]['stream'][data_stream]['file'][
                                        fname]['sci_var_stats'][sv] = dict(n_outliers=num_outliers, mean=mean, min=vmin,
                                                                           max=vmax, stdev=sd, n_stats=n_stats, units=var_units,
                                                                           n_nans=n_nan, n_fillvalues=n_fv, fill_value=str(fv),
                                                                           global_ranges=[g_min, g_max], n_grange=n_grange,
                                                                           n_all=n_all)

                    sfile = os.path.join(save_dir, '{}-{}-file_analysis.json'.format(rev_dep, r))
                    with open(sfile, 'w') as outfile:
                        json.dump(data, outfile)
                    json_file_list.append(str(sfile))

        depfile = os.path.join(save_dir, '{}-dependencies.txt'.format(r))
        with open(depfile, 'w') as depf:
            depf.write(str(dependencies))

    return json_file_list
                            print(
                                'The CTD cast was done {} km from the mooring location'
                                .format(diff_loc))

                            # define pressure from the data file
                            if 'SBD' in ds.node:
                                press = np.empty(np.shape(ds['time']))
                                press[:] = 1
                            elif 'RID' in ds.node:
                                press = np.empty(np.shape(ds['time']))
                                press[:] = 7
                            # elif 'RIS' in ds.node:
                            #     press = np.empty(np.shape(ds['time']))
                            #     press[:] = 30
                            else:
                                press = pf.pressure_var(
                                    ds, list(ds.data_vars.keys()))
                                # press = pf.pressure_var(ds, list(ds.coords.keys()))
                                # if press is None:
                                #     press = pf.pressure_var(ds, list(ds.data_vars.keys()))
                                press = ds[press].values

                            if 'CTD' in ds.sensor:
                                try:
                                    ctd_cond = np.squeeze(np.array(df['CNDC']))
                                except KeyError:
                                    try:
                                        ctd_cond = np.squeeze(
                                            np.array(df['c1mS/cm'])) / 10
                                    except KeyError:
                                        try:
                                            ctd_cond = np.squeeze(
def main(url_list, sDir, plot_type):
    """Plot science variables against pressure for every instrument in url_list.

    For each reference designator and method-stream found in ``url_list`` the
    data files are read, the science variables listed in the data-review
    analysis file are concatenated across files, and two profile figures are
    saved per variable: one with all filtered data (overlaid with a binned
    mean and a +/- 3 standard deviation envelope) and one with outliers
    removed by the plotting helper (``stdev=5``).

    :param url_list: URLs of data directories; the second-to-last path
        component is split on '-' and elements 1-4 form the reference
        designator, the remainder the method-stream.
    :param sDir: path to the directory on your machine to save files
    :param plot_type: folder name for a plot type
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for _, row in ps_df.iterrows():
            # the deployment number is the last four characters of the label
            deploy_num = int(row['deployment'][-4:])
            deploy_info = cf.get_deployment_information(dr_data, deploy_num)
            deployments.append(deploy_num)
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments
        # and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.append(cf.get_nc_urls([u]))

        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # separate the data files by methods
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a folder to save figures
            save_dir = os.path.join(sDir, array, subsite, r, plot_type,
                                    ms.split('-')[0])
            cf.create_dir(save_dir)

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict,
                                                       ms)

            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                # The context manager closes the file once its arrays have
                # been copied into the accumulators (np.append always copies).
                with xr.open_dataset(fd, mask_and_scale=False) as ds:
                    print('\nAppending data file: {}'.format(
                        fd.split('/')[-1]))
                    for var in list(sci_vars_dict[ms]['vars'].keys()):
                        sh = sci_vars_dict[ms]['vars'][var]
                        # only keep data whose units match the analysis file
                        if ds[var].units != sh['db_units']:
                            continue
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time and science variable
                        sh['t'] = np.append(sh['t'], ds['time'].values)
                        sh['values'] = np.append(sh['values'],
                                                 ds[var].values)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:
                                # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                            else:
                                pressure = 'int_ctd_pressure'
                        else:
                            pressure = pf.pressure_var(ds,
                                                       ds.data_vars.keys())

                        if ds[pressure].units not in y_unit:
                            y_unit.append(ds[pressure].units)
                        if ds[pressure].long_name not in y_name:
                            y_name.append(ds[pressure].long_name)

                        sh['pressure'] = np.append(sh['pressure'],
                                                   ds[pressure].values)

            # Collapse the collected units / names to a single label string.
            # When they vary, join them so the axis label can still be built
            # instead of failing on list + str concatenation.
            if len(y_unit) != 1:
                print('pressure unit varies!')
                y_unit = ', '.join(str(u) for u in y_unit)
            else:
                y_unit = y_unit[0]

            if len(y_name) != 1:
                print('pressure long name varies!')
                y_name = ', '.join(str(nm) for nm in y_name)
            else:
                y_name = y_name[0]

            bin_size = 10  # height of each pressure bin for the mean profile
            n_std = 3  # half-width of the shaded envelope, in std deviations

            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print('\nWorking on variable: {}'.format(sv))
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                        continue

                    sv_units = vinfo['units'][0]
                    fv = vinfo['fv'][0]
                    t0 = pd.to_datetime(min(
                        vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(max(
                        vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                    t = vinfo['t']
                    x = vinfo['values']
                    y = vinfo['pressure']

                    # Check if the array is all NaNs
                    if np.isnan(x).all():
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    if len(x[x != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    # reject fill values
                    fv_ind = x != fv
                    y_nofv = y[fv_ind]
                    t_nofv = t[fv_ind]
                    x_nofv = x[fv_ind]
                    print(len(x) - len(x_nofv), ' fill values')

                    # reject NaNs; the mask must come from the already
                    # filtered array so its length matches t_nofv / y_nofv
                    nan_ind = ~np.isnan(x_nofv)
                    t_nofv_nonan = t_nofv[nan_ind]
                    y_nofv_nonan = y_nofv[nan_ind]
                    x_nofv_nonan = x_nofv[nan_ind]
                    print(len(x_nofv) - len(x_nofv_nonan), ' NaNs')

                    # reject extreme values
                    ev_ind = cf.reject_extreme_values(x_nofv_nonan)
                    t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                    y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                    x_nofv_nonan_noev = x_nofv_nonan[ev_ind]
                    print(len(x_nofv_nonan) - len(x_nofv_nonan_noev),
                          ' Extreme Values', '|1e7|')

                    # reject values outside global ranges:
                    global_min, global_max = cf.get_global_ranges(r, sv)
                    print('global ranges for : {}-{}  {} - {}'.format(
                        r, sv, global_min, global_max))
                    if isinstance(global_min, (int, float)) and isinstance(
                            global_max, (int, float)):
                        gr_ind = cf.reject_global_ranges(
                            x_nofv_nonan_noev, global_min, global_max)
                        t_nofv_nonan_noev_nogr = t_nofv_nonan_noev[gr_ind]
                        y_nofv_nonan_noev_nogr = y_nofv_nonan_noev[gr_ind]
                        x_nofv_nonan_noev_nogr = x_nofv_nonan_noev[gr_ind]
                    else:
                        t_nofv_nonan_noev_nogr = t_nofv_nonan_noev
                        y_nofv_nonan_noev_nogr = y_nofv_nonan_noev
                        x_nofv_nonan_noev_nogr = x_nofv_nonan_noev

                    # nothing survived the filters: there is no pressure range
                    # to bin and nothing to draw
                    if len(x_nofv_nonan_noev_nogr) == 0:
                        print('No data remaining after filtering - '
                              'skipping plot.')
                        continue

                    if m == 'common_stream_placeholder':
                        sname = '-'.join((r, sv))
                    else:
                        sname = '-'.join((r, m, sv))

                    # Reset per variable so a 'pressure' variable never reuses
                    # the groups computed for the previous variable.
                    groups, d_groups = [], None
                    if sv != 'pressure':
                        columns = ['tsec', 'dbar', str(sv)]
                        min_r = int(round(
                            np.nanmin(y_nofv_nonan_noev) - bin_size))
                        max_r = int(round(
                            np.nanmax(y_nofv_nonan_noev) + bin_size))
                        ranges = list(range(min_r, max_r, bin_size))
                        groups, d_groups = gt.group_by_depth_range(
                            t_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                            x_nofv_nonan_noev_nogr, columns, ranges)

                    y_avg, n_avg, n_min, n_max = [], [], [], []
                    n0_std, n1_std, l_arr = [], [], []
                    for ii in range(len(groups)):
                        # each group occupies three consecutive columns of
                        # d_groups starting at column 1: time, pressure, value
                        col = 3 * ii + 1
                        nan_ind = d_groups[col].notnull()
                        ypres = d_groups[col + 1][nan_ind]
                        nval = d_groups[col + 2][nan_ind]

                        # count of data to filter out small groups
                        l_arr.append(len(nval))
                        y_avg.append(ypres.mean())
                        n_avg.append(nval.mean())
                        n_min.append(nval.min())
                        n_max.append(nval.max())
                        n0_std.append(nval.mean() + n_std * nval.std())
                        n1_std.append(nval.mean() - n_std * nval.std())

                    # Plot all data
                    ylabel = y_name + " (" + y_unit + ")"
                    xlabel = sv + " (" + sv_units + ")"
                    clabel = 'Time'

                    fig, ax = pf.plot_profiles(x_nofv_nonan_noev_nogr,
                                               y_nofv_nonan_noev_nogr,
                                               t_nofv_nonan_noev_nogr,
                                               ylabel,
                                               xlabel,
                                               clabel,
                                               end_times,
                                               deployments,
                                               stdev=None)

                    title_text = ' '.join((r, ms.split('-')[-1])) + '\n' \
                                 + t0 + ' - ' + t1 + '\n' + str(bin_size) +\
                                 ' m average and ' + str(n_std) + ' std shown'

                    ax.set_title(title_text, fontsize=9)
                    ax.plot(n_avg, y_avg, '-k')

                    ax.fill_betweenx(y_avg,
                                     n0_std,
                                     n1_std,
                                     color='m',
                                     alpha=0.2)
                    pf.save_fig(save_dir, sname)

                    # Plot data with outliers removed
                    fig, ax = pf.plot_profiles(x_nofv_nonan_noev_nogr,
                                               y_nofv_nonan_noev_nogr,
                                               t_nofv_nonan_noev_nogr,
                                               ylabel,
                                               xlabel,
                                               clabel,
                                               end_times,
                                               deployments,
                                               stdev=5)
                    ax.set_title(' '.join((r, ms.split('-')[-1])) + '\n'
                                 + t0 + ' - ' + t1, fontsize=9)
                    sfile = '_'.join((sname, 'rmoutliers'))
                    pf.save_fig(save_dir, sfile)
def main(sDir, f, start_time, end_time):
    """Save cross-section plots for every science variable in each data file.

    Reads the CSV file ``f`` located in ``sDir``, takes the URLs from its
    'outputUrl' column and, for each dataset found under those URLs, saves
    two figures per science variable: one with all data and one produced
    with ``stdev=5`` (outliers removed).

    :param sDir: directory holding the CSV file; figures are saved below it
    :param f: name of the CSV file with an 'outputUrl' column
    :param start_time: start of the time slice, or None for no slicing
    :param end_time: end of the time slice, or None for no slicing
    """
    url_table = pd.read_csv(os.path.join(sDir, f))
    url_list = url_table['outputUrl'].tolist()
    for url_num, u in enumerate(url_list, start=1):
        print('\nUrl {} of {}: {}'.format(url_num, len(url_list), u))
        main_sensor = u.split('/')[-2].split('-')[4]
        datasets_sel = cf.filter_collocated_instruments(
            main_sensor, cf.get_nc_urls([u]))

        for ds_num, d in enumerate(datasets_sel, start=1):
            print('\nDataset {} of {}: {}'.format(ds_num, len(datasets_sel),
                                                  d))
            with xr.open_dataset(d, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                # the time slice is applied only when both limits are given
                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        no_data_msg = 'No data to plot for specified time range: ({} to {})'
                        print(no_data_msg.format(start_time, end_time))
                        continue

                (fname, subsite, refdes,
                 method, stream, deployment) = cf.nc_attributes(d)
                data_var_names = ds.data_vars.keys()

                # for glider CTDs, pressure is a coordinate
                is_glider_ctd = 'MOAS' in subsite and 'CTD' in main_sensor
                if is_glider_ctd:
                    pressure = 'sci_water_pressure_dbar'
                else:
                    pressure = pf.pressure_var(ds, data_var_names)

                # science variables for this stream, with pressure removed
                sci_vars = [
                    s for s in cf.return_science_vars(stream) if s != pressure
                ]

                save_dir = os.path.join(sDir, subsite, refdes,
                                        'xsection_plots', deployment)
                cf.create_dir(save_dir)

                t = ds['time'].values
                time_fmt = '%Y-%m-%dT%H:%M:%S'
                t0 = pd.to_datetime(t.min()).strftime(time_fmt)
                t1 = pd.to_datetime(t.max()).strftime(time_fmt)
                title = ' '.join((deployment, refdes, method))
                full_title = title + '\n' + t0 + ' - ' + t1

                y = ds[pressure]

                print('Plotting variables...')
                for var in sci_vars:
                    print(var)
                    z = ds[var]

                    clabel = var + " (" + z.units + ")"
                    ylabel = pressure + " (" + y.units + ")"
                    # file names drop the last 46 characters of the source
                    # file name -- presumably the time-range suffix; confirm
                    file_stem = fname[0:-46]

                    # first pass plots all data, second pass removes outliers
                    passes = ((None, ()), (5, ('rmoutliers',)))
                    for stdev, suffix in passes:
                        fig, ax = pf.plot_xsection(subsite,
                                                   t,
                                                   y,
                                                   z,
                                                   clabel,
                                                   ylabel,
                                                   stdev=stdev)
                        ax.set_title(full_title, fontsize=9)
                        sfile = '_'.join((file_stem, z.name) + suffix)
                        pf.save_fig(save_dir, sfile)
Пример #8
0
def _pick_pressure_label(found, preferred, warning):
    """Reduce the pressure attribute values collected across files to one label.

    found     : list of distinct attribute values gathered while reading files
    preferred : value to use when several values were found and it is one of them
    warning   : message printed when there is not exactly one value

    Returns a string in every case so it can be concatenated into an axis label.
    """
    if len(found) == 1:
        return found[0]
    print(warning)
    if preferred in found:
        label = preferred
    else:
        label = ', '.join(str(f) for f in found)
    print(label)
    return label


def main(url_list, sDir, plot_type):
    """
    Plot pressure/time cross-sections of science variables, grouped by
    reference designator and by method-stream.

    For every reference designator found in url_list and every method-stream,
    the science variables listed in the data review analysis file are
    concatenated over all matching NetCDF files. Fill values, NaNs and the
    values flagged by cf.reject_extreme_values are removed, then up to three
    figures are saved per variable:
      <name>                : filtered data
      <name>_rmoutliers     : additionally limited to the global ranges
                              (when numeric ranges are available), stdev=5
      <name>_rmsuspectdata  : additionally without the time ranges marked as
                              'exclusion' in the data review notes export

    url_list  : list of URLs pointing to instrument data, one per method
    sDir      : path to the directory on your machine to save files
    plot_type : folder name for a plot type
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    # The notes export is a network download: fetch it lazily, once per call.
    exclusion_notes = None

    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get preferred stream
        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = cf.get_deployment_information(
                dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # separate data files by methods
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a folder to save figures
            save_dir = os.path.join(sDir, array, subsite, r, plot_type,
                                    ms.split('-')[0])
            cf.create_dir(save_dir)

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict,
                                                       ms)

            print('\nAppending data from files: {}'.format(ms))
            # distinct pressure units / long names seen over all files; these
            # stay lists for the whole loop and are reduced to labels afterwards
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                # the context manager closes the file once its arrays are copied
                with xr.open_dataset(fd, mask_and_scale=False) as ds:
                    print('\nAppending data file: {}'.format(
                        fd.split('/')[-1]))
                    for var, sh in sci_vars_dict[ms]['vars'].items():
                        # only use variables whose units agree with the database
                        if ds[var].units != sh['db_units']:
                            continue
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time and science variable
                        sh['t'] = np.append(sh['t'], ds['time'].values)
                        sh['values'] = np.append(sh['values'],
                                                 ds[var].values)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                            else:
                                pressure = 'int_ctd_pressure'
                        else:
                            pressure = pf.pressure_var(ds,
                                                       ds.data_vars.keys())
                        sh['pressure'] = np.append(sh['pressure'],
                                                   ds[pressure].values)

                        # record pressure attributes, tolerating missing ones
                        try:
                            p_unit = ds[pressure].units
                        except AttributeError:
                            print('pressure attributes missing units')
                            p_unit = 'pressure unit missing'
                        if p_unit not in y_unit:
                            y_unit.append(p_unit)

                        try:
                            p_name = ds[pressure].long_name
                        except AttributeError:
                            print('pressure attributes missing long_name')
                            p_name = 'pressure long name missing'
                        if p_name not in y_name:
                            y_name.append(p_name)

            unit_label = _pick_pressure_label(y_unit, 'dbar',
                                              'pressure unit varies')
            name_label = _pick_pressure_label(y_name, 'Seawater Pressure',
                                              'pressure long name varies')
            ylabel = name_label + " (" + unit_label + ")"

            # plot once per method, after every file has been appended
            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                        continue

                    sv_units = vinfo['units'][0]
                    fv = vinfo['fv'][0]
                    t = vinfo['t']
                    z = vinfo['values']
                    y = vinfo['pressure']
                    t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')
                    title = ' '.join((r, ms))

                    # Check if the array is all NaNs
                    if np.isnan(z).all():
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    if not (z != fv).any():
                        print('Array of all fill values - skipping plot.')
                        continue

                    # reject fill values
                    fv_ind = z != fv
                    y_nofv = y[fv_ind]
                    t_nofv = t[fv_ind]
                    z_nofv = z[fv_ind]
                    print(len(z) - len(z_nofv), ' fill values')

                    # reject NaNs
                    nan_ind = ~np.isnan(z_nofv)
                    t_nofv_nonan = t_nofv[nan_ind]
                    y_nofv_nonan = y_nofv[nan_ind]
                    z_nofv_nonan = z_nofv[nan_ind]
                    print(len(z_nofv) - len(z_nofv_nonan), ' NaNs')

                    # reject extreme values
                    ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                    t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                    y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                    z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                    print(
                        len(z_nofv_nonan) - len(z_nofv_nonan_noev),
                        ' Extreme Values', '|1e7|')

                    if len(z_nofv_nonan_noev) == 0:
                        print('No data left after filtering - skipping plot.')
                        continue

                    # reject values outside global ranges; when no numeric
                    # range is returned the data are used as they are
                    global_min, global_max = cf.get_global_ranges(r, sv)
                    print('global ranges for : {}-{}  {} - {}'.format(
                        r, sv, global_min, global_max))
                    if isinstance(global_min, (int, float)) and isinstance(
                            global_max, (int, float)):
                        gr_ind = cf.reject_global_ranges(
                            z_nofv_nonan_noev, global_min, global_max)
                        t_nofv_nonan_noev_nogr = t_nofv_nonan_noev[gr_ind]
                        y_nofv_nonan_noev_nogr = y_nofv_nonan_noev[gr_ind]
                        z_nofv_nonan_noev_nogr = z_nofv_nonan_noev[gr_ind]
                    else:
                        t_nofv_nonan_noev_nogr = t_nofv_nonan_noev
                        y_nofv_nonan_noev_nogr = y_nofv_nonan_noev
                        z_nofv_nonan_noev_nogr = z_nofv_nonan_noev

                    if m == 'common_stream_placeholder':
                        sname = '-'.join((r, sv))
                    else:
                        sname = '-'.join((r, m, sv))
                    sname = '_'.join((sname, sv_units))

                    # Plot all data
                    clabel = sv + " (" + sv_units + ")"

                    fig, ax = pf.plot_xsection(subsite,
                                               t_nofv_nonan_noev,
                                               y_nofv_nonan_noev,
                                               z_nofv_nonan_noev,
                                               clabel,
                                               ylabel,
                                               stdev=None)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                 fontsize=9)
                    pf.save_fig(save_dir, sname)

                    # Plot data with outliers removed
                    fig, ax = pf.plot_xsection(subsite,
                                               t_nofv_nonan_noev_nogr,
                                               y_nofv_nonan_noev_nogr,
                                               z_nofv_nonan_noev_nogr,
                                               clabel,
                                               ylabel,
                                               stdev=5)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                 fontsize=9)
                    sfile = '_'.join((sname, 'rmoutliers'))
                    pf.save_fig(save_dir, sfile)

                    # plot data with excluded time range removed
                    if exclusion_notes is None:
                        notes = pd.read_csv(
                            'https://datareview.marine.rutgers.edu/notes/export'
                        )
                        exclusion_notes = notes.loc[notes.type == 'exclusion']
                    if len(exclusion_notes) != 0:
                        # notes may be attached to the subsite, the node or
                        # the full reference designator
                        subsite_node = '-'.join((subsite, r.split('-')[1]))
                        drne = exclusion_notes.loc[
                            exclusion_notes.reference_designator.isin(
                                [subsite, subsite_node, r])]

                        t_ex = t_nofv_nonan_noev_nogr
                        y_ex = y_nofv_nonan_noev_nogr
                        z_ex = z_nofv_nonan_noev_nogr
                        for i, row in drne.iterrows():
                            ts = np.datetime64(cf.format_dates(row.start_date))
                            te = np.datetime64(cf.format_dates(row.end_date))
                            # keep only samples outside the excluded window
                            keep = (t_ex < ts) | (t_ex > te)
                            if len(keep) != 0:
                                t_ex = t_ex[keep]
                                z_ex = z_ex[keep]
                                y_ex = y_ex[keep]

                        fig, ax = pf.plot_xsection(subsite,
                                                   t_ex,
                                                   y_ex,
                                                   z_ex,
                                                   clabel,
                                                   ylabel,
                                                   stdev=None)
                        ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                     fontsize=9)

                        sfile = '_'.join((sname, 'rmsuspectdata'))
                        pf.save_fig(save_dir, sfile)
Пример #9
0
def _join_pressure_label(found, warning):
    """Return the single collected pressure attribute value as a label.

    found   : list of distinct attribute values gathered while reading files
    warning : message printed when there is not exactly one value

    When several (or no) values were collected they are joined with ', ' so
    the result is always a string that can be concatenated into an axis label.
    """
    if len(found) == 1:
        return found[0]
    print(warning)
    return ', '.join(str(f) for f in found)


def main(url_list, sDir, plot_type, deployment_num, start_time, end_time):
    """
    Plot depth profiles of science variables with a binned mean profile and
    a 3-standard-deviation envelope, and export per-bin statistics to csv.

    url_list       : list of URLs pointing to instrument data, one per method
    sDir           : path to the directory on your machine to save files
    plot_type      : folder name for a plot type
    deployment_num : deployment number to plot; None plots every deployment
    start_time     : start of the time range to plot (used only when
                     end_time is given as well), or None
    end_time       : end of the time range to plot, or None

    Data are accumulated over the files of a method-stream and the figures and
    statistics are rebuilt after each accepted file from everything gathered
    so far, written to the folder of that file's deployment.
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # separate the data files by methods
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict,
                                                       ms)

            print('\nAppending data from files: {}'.format(ms))
            # distinct pressure units / long names seen so far; kept as lists
            # for the whole file loop and reduced to labels separately
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                # the context manager closes the file once its arrays are copied
                with xr.open_dataset(fd, mask_and_scale=False) as ds:
                    print(fd)

                    if start_time is not None and end_time is not None:
                        # label-based slicing needs time as the indexed dimension
                        if 'obs' in ds.dims:
                            ds = ds.swap_dims({'obs': 'time'})
                        ds = ds.sel(time=slice(start_time, end_time))
                        if len(ds['time'].values) == 0:
                            print(
                                'No data to plot for specified time range: ({} to {})'
                                .format(start_time, end_time))
                            continue

                    fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(
                        fd)

                    # the deployment number is the trailing 4 digits, as for
                    # the preferred-stream rows above
                    if deployment_num is not None and int(
                            deployment[-4:]) != int(deployment_num):
                        continue

                    save_dir = os.path.join(sDir, array, subsite, refdes,
                                            plot_type,
                                            ms.split('-')[0], deployment)
                    cf.create_dir(save_dir)

                    for var, sh in sci_vars_dict[ms]['vars'].items():
                        # only use variables whose units agree with the database
                        if ds[var].units != sh['db_units']:
                            continue
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time and science variable
                        sh['t'] = np.append(sh['t'], ds['time'].values)
                        sh['values'] = np.append(sh['values'],
                                                 ds[var].values)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                            else:
                                pressure = 'int_ctd_pressure'
                        else:
                            pressure = pf.pressure_var(ds,
                                                       ds.data_vars.keys())
                        if ds[pressure].units not in y_unit:
                            y_unit.append(ds[pressure].units)
                        if ds[pressure].long_name not in y_name:
                            y_name.append(ds[pressure].long_name)

                        sh['pressure'] = np.append(sh['pressure'],
                                                   ds[pressure].values)

                unit_label = _join_pressure_label(
                    y_unit, 'pressure unit varies UHHHHHHHHH')
                name_label = _join_pressure_label(
                    y_name, 'pressure long name varies UHHHHHHHHH')
                ylabel = name_label + " (" + unit_label + ")"

                for m, n in sci_vars_dict.items():
                    for sv, vinfo in n['vars'].items():
                        print(sv)
                        if len(vinfo['t']) < 1:
                            print('no variable data to plot')
                            continue

                        sv_units = vinfo['units'][0]
                        fv = vinfo['fv'][0]
                        t = vinfo['t']
                        z = vinfo['values']
                        y = vinfo['pressure']
                        t0 = pd.to_datetime(
                            t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(
                            t.max()).strftime('%Y-%m-%dT%H:%M:%S')
                        title = ' '.join((r, ms.split('-')[1]))

                        # Check if the array is all NaNs
                        if np.isnan(z).all():
                            print('Array of all NaNs - skipping plot.')
                            continue

                        # Check if the array is all fill values
                        if not (z != fv).any():
                            print('Array of all fill values - skipping plot.')
                            continue

                        # reject fill values
                        fv_ind = z != fv
                        y_nofv = y[fv_ind]
                        t_nofv = t[fv_ind]
                        z_nofv = z[fv_ind]
                        print(len(z) - len(z_nofv), ' fill values')

                        # reject NaNs (mask built from the already filtered
                        # values so its length matches the arrays it indexes)
                        nan_ind = ~np.isnan(z_nofv)
                        t_nofv_nonan = t_nofv[nan_ind]
                        y_nofv_nonan = y_nofv[nan_ind]
                        z_nofv_nonan = z_nofv[nan_ind]
                        print(len(z_nofv) - len(z_nofv_nonan), ' NaNs')

                        # reject extreme values
                        ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                        t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                        y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                        z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                        print(
                            len(z_nofv_nonan) - len(z_nofv_nonan_noev),
                            ' Extreme Values', '|1e7|')

                        if len(y_nofv_nonan_noev) == 0:
                            print(
                                'No data left after filtering - skipping plot.'
                            )
                            continue

                        colors = cm.rainbow(
                            np.linspace(0, 1, len(t_nofv_nonan_noev)))

                        if m == 'common_stream_placeholder':
                            sname = '-'.join((r, sv))
                        else:
                            sname = '-'.join((r, m, sv))

                        # Plot all data
                        xlabel = sv + " (" + sv_units + ")"

                        fig, ax = pf.plot_profiles(z_nofv_nonan_noev,
                                                   y_nofv_nonan_noev,
                                                   colors,
                                                   xlabel,
                                                   ylabel,
                                                   stdev=None)
                        ax.set_title((
                            title + '\n' + str(deployment_num) + ': ' + t0 +
                            ' - ' + t1 + '\n' +
                            'used bin = 2 dbar to calculate an average profile (black line) and 3-STD envelope (shaded area)'
                        ),
                                     fontsize=9)
                        # NOTE(review): this figure is never saved; it is
                        # closed here so it does not stay open once the
                        # statistics figure below replaces it - confirm
                        # whether it should be written to disk instead.
                        pyplot.close(fig)

                        # group by depth range
                        # NOTE(review): the title above mentions 2 dbar bins
                        # while the bin edges below use a step of 1 - confirm.
                        columns = ['time', 'pressure', str(sv)]
                        ranges = list(
                            range(int(round(y_nofv_nonan_noev.min())),
                                  int(round(y_nofv_nonan_noev.max())), 1))
                        groups, d_groups = gt.group_by_depth_range(
                            t_nofv_nonan_noev, y_nofv_nonan_noev,
                            z_nofv_nonan_noev, columns, ranges)

                        # export the statistics of the bins that hold data
                        stats = groups.describe()[sv]
                        stats[stats['mean'].notnull()].to_csv(
                            '{}/{}_statistics.csv'.format(save_dir, sname),
                            index=True)

                        # number of standard deviations passed to
                        # cf.reject_outliers and reported in the legend
                        stdev = 5
                        outliers = 0

                        # a single Axes, so every ax.* call below targets one
                        # object (a 2x1 grid would return an array of Axes)
                        fig, ax = pyplot.subplots()
                        pyplot.margins(y=.08, x=.02)
                        pyplot.grid()
                        y_avg, n_avg, n0_std, n1_std = [], [], [], []

                        for ii in range(len(groups)):
                            # d_groups is read three columns per bin, starting
                            # at column 1: time, pressure, value
                            first_col = 3 * ii + 1
                            nan_ind = d_groups[first_col].notnull()
                            xtime = d_groups[first_col][nan_ind]
                            colors = cm.rainbow(np.linspace(0, 1, len(xtime)))
                            ypres = d_groups[first_col + 1][nan_ind]
                            nval = d_groups[first_col + 2][nan_ind]

                            ind2 = cf.reject_outliers(nval, stdev)
                            xD = nval[ind2]
                            yD = ypres[ind2]
                            nZ = colors[ind2]
                            outliers += len(nval) - len(xD)

                            ax.scatter(xD, yD, c=nZ, s=2, edgecolor='None')

                            y_avg.append(ypres.mean())
                            n_avg.append(nval.mean())
                            n0_std.append(nval.mean() + 3 * nval.std())
                            n1_std.append(nval.mean() - 3 * nval.std())

                        # axis decoration is applied once: invert_yaxis()
                        # toggles the direction on every call
                        ax.invert_yaxis()
                        ax.set_xlabel(xlabel, fontsize=9)
                        ax.set_ylabel(ylabel, fontsize=9)
                        leg_text = ('removed {} outliers (SD={})'.format(
                            outliers, stdev), )
                        ax.legend(leg_text, loc='best', fontsize=6)
                        ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                     fontsize=9)

                        ax.plot(n_avg, y_avg, '-k')
                        ax.fill_betweenx(y_avg,
                                         n0_std,
                                         n1_std,
                                         color='m',
                                         alpha=0.2)
                        sfile = '_'.join((sname, 'statistics'))
                        pf.save_fig(save_dir, sfile)