Python return_science_vars примеры использования

Язык программирования: Python

Пространство имен/Пакет: functions.common

Метод/Функция: return_science_vars

Примеров на hotexamples.com: 22

Python return_science_vars - найдено 22 примера. Это лучшие примеры кода на Python для functions.common.return_science_vars, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

Файл: ctd_timeseries.py Проект: ooi-data-lab/data-review-tools

def main(sDir, f):
    """Save one multi-axis timeseries figure per NetCDF dataset.

    The CSV file ``f`` inside ``sDir`` is read, its ``outputUrl`` column is
    resolved to NetCDF URLs with ``cf.get_nc_urls`` and, for every dataset,
    all science variables of the stream are drawn against time, each on its
    own y-axis. Figures are saved under
    ``sDir/<subsite>/<refdes>/<deployment>``.

    :param sDir: directory that holds the CSV and receives the figures
    :param f: name of the CSV file containing an ``outputUrl`` column
    """
    url_table = pd.read_csv(os.path.join(sDir, f))
    for nc_url in cf.get_nc_urls(url_table['outputUrl'].tolist()):
        print(nc_url)
        attrs = cf.nc_attributes(nc_url)
        fname, subsite, refdes, method, stream, deployment = attrs
        save_dir = os.path.join(sDir, subsite, refdes, deployment)
        cf.create_dir(save_dir)

        sci_vars = cf.return_science_vars(stream)

        # one colour per science variable, spread across the jet colormap
        colors = cm.jet(np.linspace(0, 1, len(sci_vars)))

        with xr.open_dataset(nc_url, mask_and_scale=False) as ds:
            ds = ds.swap_dims({'obs': 'time'})
            times = ds['time'].data
            stamp = '%Y-%m-%dT%H:%M:%S'
            t0 = pd.to_datetime(times.min()).strftime(stamp)
            t1 = pd.to_datetime(times.max()).strftime(stamp)
            title = ' '.join((deployment, refdes, method))

            fig, ax = plt.subplots()
            # the first variable uses the base axis, every further variable
            # gets a twin of it so that the y-axes are independent
            axes = [ax]
            axes.extend(ax.twinx() for _ in range(1, len(sci_vars)))

            fig.subplots_adjust(right=0.6)
            right_additive = (0.98 - 0.6) / float(5)

            last = len(sci_vars) - 1
            for i, (var_name, c) in enumerate(zip(sci_vars, colors)):
                current = axes[i]
                if i > 0:
                    # shift each extra spine further right so they don't overlap
                    offset = 1. + right_additive * i
                    current.spines['right'].set_position(('axes', offset))
                y = ds[var_name]

                ind = cf.reject_outliers(y, 5)
                yD = y.data[ind]
                x = times[ind]

                current.plot(x, yD, '.', markersize=2, color=c)
                label = y.name + " (" + y.units + ")"
                current.set_ylabel(label, color=c, fontsize=9)
                current.tick_params(axis='y', colors=c)
                if i == last:  # format dates once the last variable is drawn
                    pf.format_date_axis(current, fig)

            axes[0].set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
            sfile = '_'.join((fname, 'timeseries'))
            pf.save_fig(save_dir, sfile)

Пример #2

Показать файл

Файл: nc_file_analysis_cabled.py Проект: ooi-data-lab/data-review-tools

def main(sDir, url_list, deployment_num):
    """Analyze NetCDF files per reference designator and write JSON summaries.

    For every reference designator found in ``url_list`` the deployments that
    are marked 'for review' in the data review list are processed: all files
    of a deployment are appended together, and deployment information,
    sampling rate, time gaps, a pressure/deployment-depth comparison and
    per-variable statistics are collected in a nested dictionary. The
    dictionary is dumped to ``<rev_dep>-<refdes>-file_analysis.json`` and the
    collocated reference designators are written to
    ``<refdes>-dependencies.txt``, both under ``sDir/<subsite>/<refdes>``.

    :param sDir: root directory for the output files
    :param url_list: dataset URLs; the second-to-last path element must
        contain the reference designator as dash-separated fields 1-4
    :param deployment_num: if not None, only this deployment number (int) is
        analyzed and all other deployments are skipped
    :return: list of paths (str) of the JSON files that were written
    """
    reviewlist = pd.read_csv(
        'https://raw.githubusercontent.com/ooi-data-lab/data-review-prep/master/review_list/data_review_list.csv')

    # collect the unique reference designators present in the URLs
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    json_file_list = []
    for r in rd_list:
        dependencies = []
        print('\n{}'.format(r))
        data = OrderedDict(deployments=OrderedDict())
        save_dir = os.path.join(sDir, r.split('-')[0], r)
        cf.create_dir(save_dir)

        # Deployment location test
        deploy_loc_test = cf.deploy_location_check(r)
        data['location_comparison'] = deploy_loc_test

        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            catalog_rms = '-'.join((r, splitter[-2], splitter[-1]))

            # complete the analysis by reference designator
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])

                # check for the OOI 1.0 datasets for review
                rl_filtered = reviewlist.loc[
                    (reviewlist['Reference Designator'] == r) & (reviewlist['status'] == 'for review')]
                review_deployments = rl_filtered['deploymentNumber'].tolist()
                review_deployments_int = ['deployment%04d' % int(x) for x in review_deployments]

                for rev_dep in review_deployments_int:
                    if deployment_num is not None:
                        # compare by value: identity ('is') is only accidentally
                        # true for small integers
                        if int(rev_dep[-4:]) != deployment_num:
                            print('\nskipping {}'.format(rev_dep))
                            continue

                    rdatasets = [s for s in udatasets if rev_dep in s]
                    rdatasets.sort()
                    if len(rdatasets) > 0:
                        datasets = []
                        for dss in rdatasets:  # filter out collocated data files
                            if catalog_rms == dss.split('/')[-1].split('_20')[0][15:]:
                                datasets.append(dss)
                            else:
                                # remember the collocated instrument as a dependency
                                drd = dss.split('/')[-1].split('_20')[0][15:42]
                                if drd not in dependencies and drd != r:
                                    dependencies.append(drd)

                        notes = []
                        time_ascending = ''
                        sci_vars_dict = {}
                        # NOTE(review): if every file was filtered out as collocated,
                        # `datasets` is empty and the code below fails on
                        # sci_vars_dict['time'] - confirm whether that can happen.
                        # NOTE(review): the datasets opened here are never closed; the
                        # last one is still read after the loop.
                        for i, dataset_url in enumerate(datasets):
                            ds = xr.open_dataset(dataset_url, mask_and_scale=False)
                            ds = ds.swap_dims({'obs': 'time'})
                            print('\nAppending data from {}: file {} of {}'.format(rev_dep, i+1, len(datasets)))

                            # when opening multiple datasets, don't check that the timestamps are in ascending order
                            time_ascending = 'not_tested'

                            # metadata, variable lists and deployment info come from the first file only
                            if i == 0:
                                fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(dataset_url)
                                fname = fname.split('_20')[0]

                                # Get info from the data review database
                                dr_data = cf.refdes_datareview_json(refdes)
                                stream_vars = cf.return_stream_vars(data_stream)
                                sci_vars = cf.return_science_vars(data_stream)
                                node = refdes.split('-')[1]
                                if 'cspp' in data_stream or 'WFP' in node:
                                    sci_vars.append('int_ctd_pressure')

                                # Add pressure to the list of science variables
                                press = pf.pressure_var(ds, list(ds.coords.keys()))
                                if press is None:
                                    press = pf.pressure_var(ds, list(ds.data_vars.keys()))
                                if press is not None:
                                    sci_vars.append(press)
                                sci_vars.append('time')
                                sci_vars = list(np.unique(sci_vars))
                                if 'ADCP' in r:
                                    sci_vars = [x for x in sci_vars if 'beam' not in x]

                                # one accumulator (values, units, fill values) per variable
                                for sci_var in sci_vars:
                                    if sci_var == 'time':
                                        sci_vars_dict.update(
                                            {sci_var: dict(values=np.array([], dtype=np.datetime64), units=[], fv=[])})
                                    else:
                                        sci_vars_dict.update({sci_var: dict(values=np.array([]), units=[], fv=[])})

                                deploy_info = get_deployment_information(dr_data, int(deployment[-4:]))

                                # Grab deployment Variables
                                deploy_start = str(deploy_info['start_date'])
                                deploy_stop = str(deploy_info['stop_date'])
                                deploy_lon = deploy_info['longitude']
                                deploy_lat = deploy_info['latitude']
                                deploy_depth = deploy_info['deployment_depth']

                                # Calculate days deployed
                                if deploy_stop != 'None':
                                    r_deploy_start = pd.to_datetime(deploy_start).replace(hour=0, minute=0, second=0)
                                    if deploy_stop.split('T')[1] == '00:00:00':
                                        r_deploy_stop = pd.to_datetime(deploy_stop)
                                    else:
                                        # round a partial last day up to the next midnight
                                        r_deploy_stop = (pd.to_datetime(deploy_stop) + timedelta(days=1)).replace(hour=0, minute=0, second=0)
                                    n_days_deployed = (r_deploy_stop - r_deploy_start).days
                                else:
                                    n_days_deployed = None

                                # Add reference designator to dictionary
                                try:
                                    data['refdes']
                                except KeyError:
                                    data['refdes'] = refdes

                            # append data for the deployment into a dictionary
                            for s_v in sci_vars_dict.keys():
                                vv = ds[s_v]
                                try:
                                    if vv.units not in sci_vars_dict[s_v]['units']:
                                        sci_vars_dict[s_v]['units'].append(vv.units)
                                except AttributeError:
                                    # variable has no units attribute
                                    print('')
                                try:
                                    if vv._FillValue not in sci_vars_dict[s_v]['fv']:
                                        sci_vars_dict[s_v]['fv'].append(vv._FillValue)
                                except AttributeError:
                                    # variable has no _FillValue attribute
                                    print('')
                                if len(vv.dims) == 1:
                                    if s_v in ['wavelength_a', 'wavelength_c']:
                                        # if the array is not same as the array that was already appended for these
                                        # two OPTAA variables, append. if it's already there, don't append
                                        if np.sum(vv.values == sci_vars_dict[s_v]['values']) != len(vv.values):
                                            sci_vars_dict[s_v]['values'] = np.append(sci_vars_dict[s_v]['values'],
                                                                                     vv.values)
                                    else:
                                        sci_vars_dict[s_v]['values'] = np.append(sci_vars_dict[s_v]['values'], vv.values)

                                elif len(vv.dims) == 2:  # appending 2D datasets
                                    vD = vv.values.T
                                    if len(sci_vars_dict[s_v]['values']) == 0:
                                        sci_vars_dict[s_v]['values'] = vD
                                    else:
                                        sci_vars_dict[s_v]['values'] = np.concatenate((sci_vars_dict[s_v]['values'], vD), axis=1)

                        deployments = data['deployments'].keys()
                        data_start = pd.to_datetime(min(sci_vars_dict['time']['values'])).strftime('%Y-%m-%dT%H:%M:%S')
                        data_stop = pd.to_datetime(max(sci_vars_dict['time']['values'])).strftime('%Y-%m-%dT%H:%M:%S')

                        # Add deployment and info to dictionary and initialize delivery method sub-dictionary
                        if deployment not in deployments:
                            data['deployments'][deployment] = OrderedDict(deploy_start=deploy_start,
                                                                          deploy_stop=deploy_stop,
                                                                          n_days_deployed=n_days_deployed,
                                                                          lon=deploy_lon,
                                                                          lat=deploy_lat,
                                                                          deploy_depth=deploy_depth,
                                                                          method=OrderedDict())

                        # Add delivery methods to dictionary and initialize stream sub-dictionary
                        methods = data['deployments'][deployment]['method'].keys()
                        if method not in methods:
                            data['deployments'][deployment]['method'][method] = OrderedDict(
                                stream=OrderedDict())

                        # Add streams to dictionary and initialize file sub-dictionary
                        streams = data['deployments'][deployment]['method'][method]['stream'].keys()

                        if data_stream not in streams:
                            data['deployments'][deployment]['method'][method]['stream'][
                                data_stream] = OrderedDict(file=OrderedDict())

                        # Get a list of data gaps >1 day
                        time_df = pd.DataFrame(sci_vars_dict['time']['values'], columns=['time'])
                        time_df = time_df.sort_values(by=['time'])
                        gap_list = cf.timestamp_gap_test(time_df)

                        # Calculate the sampling rate to the nearest second
                        time_df['diff'] = time_df['time'].diff().astype('timedelta64[s]')
                        rates_df = time_df.groupby(['diff']).agg(['count'])
                        n_diff_calc = len(time_df) - 1
                        rates = dict(n_unique_rates=len(rates_df), common_sampling_rates=dict())
                        for i, row in rates_df.iterrows():
                            percent = (float(row['time']['count']) / float(n_diff_calc))
                            # only rates covering more than 10% of the intervals are reported
                            if percent > 0.1:
                                rates['common_sampling_rates'].update({int(i): '{:.2%}'.format(percent)})

                        # the sampling rate is the one that covers more than half of the intervals
                        sampling_rt_sec = None
                        for k, v in rates['common_sampling_rates'].items():
                            if float(v.strip('%')) > 50.00:
                                sampling_rt_sec = k

                        if not sampling_rt_sec:
                            sampling_rt_sec = 'no consistent sampling rate: {}'.format(rates['common_sampling_rates'])

                        # Don't do : Check that the timestamps in the file are unique
                        time_test = ''

                        # Count the number of days for which there is at least 1 timestamp
                        n_days = len(np.unique(sci_vars_dict['time']['values'].astype('datetime64[D]')))

                        # Compare variables in file to variables in Data Review Database
                        ds_variables = list(ds.data_vars.keys()) + list(ds.coords.keys())
                        ds_variables = eliminate_common_variables(ds_variables)
                        ds_variables = [x for x in ds_variables if 'qc' not in x]
                        [_, unmatch1] = compare_lists(stream_vars, ds_variables)
                        [_, unmatch2] = compare_lists(ds_variables, stream_vars)

                        # calculate mean pressure from data, excluding outliers +/- 3 SD
                        try:
                            # raises KeyError when no pressure variable was found (press is None)
                            pressure = sci_vars_dict[press]
                            # NOTE(review): `pressure` is the accumulator dict (values/units/fv),
                            # so len(pressure) is always 3 and the single-data-point branch below
                            # is never taken - presumably len(pressure['values']) was intended.
                            if len(pressure) > 1:
                                # reject NaNs
                                p_nonan = pressure['values'][~np.isnan(pressure['values'])]

                                # reject fill values
                                p_nonan_nofv = p_nonan[p_nonan != pressure['fv'][0]]

                                # reject data outside of global ranges
                                [pg_min, pg_max] = cf.get_global_ranges(r, press)
                                if pg_min is not None and pg_max is not None:
                                    pgr_ind = cf.reject_global_ranges(p_nonan_nofv, pg_min, pg_max)
                                    p_nonan_nofv_gr = p_nonan_nofv[pgr_ind]
                                else:
                                    p_nonan_nofv_gr = p_nonan_nofv

                                if (len(p_nonan_nofv_gr) > 0):
                                    [press_outliers, pressure_mean, _, pressure_max, _, _] = cf.variable_statistics(p_nonan_nofv_gr, 3)
                                    pressure_mean = round(pressure_mean, 2)
                                    pressure_max = round(pressure_max, 2)
                                else:
                                    press_outliers = None
                                    pressure_mean = None
                                    pressure_max = None
                                    if len(pressure) > 0 and len(p_nonan) == 0:
                                        notes.append('Pressure variable all NaNs')
                                    elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) == 0:
                                        notes.append('Pressure variable all fill values')
                                    elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) > 0 and len(p_nonan_nofv_gr) == 0:
                                        notes.append('Pressure variable outside of global ranges')

                            else:  # if there is only 1 data point
                                press_outliers = 0
                                pressure_mean = round(ds[press].values.tolist()[0], 2)
                                pressure_max = round(ds[press].values.tolist()[0], 2)

                            try:
                                pressure_units = pressure['units'][0]
                            except (AttributeError, IndexError):
                                # an empty units list (no units attribute in any file)
                                # raises IndexError
                                pressure_units = 'no units attribute for pressure'

                            if pressure_mean:
                                # nodes containing 'SF' are compared by maximum pressure, all others by mean
                                if 'SF' in node:
                                    pressure_compare = int(round(pressure_max))
                                else:
                                    pressure_compare = int(round(pressure_mean))

                                if pressure_units == '0.001 dbar':
                                    pressure_max = round((pressure_max / 1000), 2)
                                    pressure_mean = round((pressure_mean / 1000), 2)
                                    pressure_compare = round((pressure_compare / 1000), 2)
                                    notes.append('Pressure converted from 0.001 dbar to dbar for pressure comparison')

                                elif pressure_units == 'daPa':
                                    pressure_max = round((pressure_max / 1000), 2)
                                    pressure_mean = round((pressure_mean / 1000), 2)
                                    pressure_compare = round((pressure_compare / 1000), 2)
                                    notes.append('Pressure converted from daPa to dbar for pressure comparison')

                            else:
                                pressure_compare = None

                            if (not deploy_depth) or (not pressure_mean):
                                pressure_diff = None
                            else:
                                pressure_diff = pressure_compare - deploy_depth

                        except KeyError:
                            press = 'no seawater pressure in file'
                            pressure_diff = None
                            pressure_mean = None
                            pressure_max = None
                            pressure_compare = None
                            press_outliers = None
                            pressure_units = None

                        # Add files and info to dictionary
                        filenames = data['deployments'][deployment]['method'][method]['stream'][data_stream][
                            'file'].keys()

                        if fname not in filenames:
                            data['deployments'][deployment]['method'][method]['stream'][data_stream]['file'][
                                fname] = OrderedDict(
                                file_downloaded=pd.to_datetime(splitter[0][0:15]).strftime('%Y-%m-%dT%H:%M:%S'),
                                file_coordinates=list(ds.coords.keys()),
                                sampling_rate_seconds=sampling_rt_sec,
                                sampling_rate_details=rates,
                                data_start=data_start,
                                data_stop=data_stop,
                                time_gaps=gap_list,
                                unique_timestamps=time_test,
                                n_timestamps=len(sci_vars_dict['time']['values']),
                                n_days=n_days,
                                notes=notes,
                                ascending_timestamps=time_ascending,
                                pressure_comparison=dict(pressure_mean=pressure_mean, units=pressure_units,
                                                         num_outliers=press_outliers, diff=pressure_diff,
                                                         pressure_max=pressure_max, variable=press,
                                                         pressure_compare=pressure_compare),
                                vars_in_file=ds_variables,
                                vars_not_in_file=[x for x in unmatch1 if 'time' not in x],
                                vars_not_in_db=unmatch2,
                                sci_var_stats=OrderedDict())

                        # calculate statistics for science variables, excluding outliers +/- 5 SD
                        for sv in sci_vars_dict.keys():
                            # 't_max' (ADCP) and 'time' get no statistics
                            if sv in ('t_max', 'time'):
                                continue
                            print(sv)
                            var = sci_vars_dict[sv]
                            vD = var['values']
                            var_units = var['units']
                            vnum_dims = len(np.shape(vD))
                            # for OPTAA wavelengths, print the array
                            if sv == 'wavelength_a' or sv == 'wavelength_c':
                                [g_min, g_max] = cf.get_global_ranges(r, sv)
                                # number of wavelength values, not the size of the accumulator dict
                                n_all = len(vD)
                                mean = list(vD)
                                num_outliers = None
                                vmin = None
                                vmax = None
                                sd = None
                                n_stats = 'not calculated'
                                n_nan = None
                                n_fv = None
                                n_grange = 'no global ranges'
                                fv = var['fv'][0]
                            elif vnum_dims > 2:
                                print('variable has more than 2 dimensions')
                                num_outliers = None
                                mean = None
                                vmin = None
                                vmax = None
                                sd = None
                                n_stats = 'variable has more than 2 dimensions'
                                n_nan = None
                                n_fv = None
                                n_grange = None
                                fv = None
                                n_all = None
                                # bind the ranges so the summary below never reads an
                                # undefined name or the previous variable's values
                                g_min = None
                                g_max = None
                            else:
                                if vnum_dims > 1:
                                    n_all = [len(vD), len(vD.flatten())]
                                else:
                                    n_all = len(vD)
                                n_nan = int(np.sum(np.isnan(vD)))
                                fv = var['fv'][0]
                                vD[vD == fv] = np.nan  # turn fill values to nans
                                n_fv = int(np.sum(np.isnan(vD))) - n_nan

                                [g_min, g_max] = cf.get_global_ranges(r, sv)
                                # continue only if at least one value is not NaN
                                if list(np.unique(np.isnan(vD))) != [True]:
                                    # reject data outside of global ranges
                                    if g_min is not None and g_max is not None:
                                        # turn data outside of global ranges to nans
                                        vD[vD < g_min] = np.nan
                                        vD[vD > g_max] = np.nan
                                        n_grange = int(np.sum(np.isnan(vD)) - n_fv - n_nan)
                                    else:
                                        n_grange = 'no global ranges'

                                    if list(np.unique(np.isnan(vD))) != [True]:
                                        if sv == 'spkir_abj_cspp_downwelling_vector':
                                            # don't remove outliers from dataset
                                            [num_outliers, mean, vmin, vmax, sd, n_stats] = cf.variable_statistics_spkir(vD)
                                        else:
                                            if vnum_dims > 1:
                                                var_gr = vD.flatten()
                                            else:
                                                var_gr = vD
                                            # drop nans before calculating stats
                                            var_gr = var_gr[~np.isnan(var_gr)]
                                            [num_outliers, mean, vmin, vmax, sd, n_stats] = cf.variable_statistics(var_gr, 5)
                                    else:
                                        num_outliers = None
                                        mean = None
                                        vmin = None
                                        vmax = None
                                        sd = None
                                        n_stats = 0
                                        n_grange = None
                                else:
                                    num_outliers = None
                                    mean = None
                                    vmin = None
                                    vmax = None
                                    sd = None
                                    n_stats = 0
                                    n_grange = None

                            # multi-dimensional variables carry their shape in the summary key
                            if vnum_dims > 1:
                                stats_key = '{} (dims: {})'.format(sv, list(np.shape(var['values'])))
                            else:
                                stats_key = sv
                            data['deployments'][deployment]['method'][method]['stream'][data_stream]['file'][
                                fname]['sci_var_stats'][stats_key] = dict(n_outliers=num_outliers, mean=mean, min=vmin,
                                                                          max=vmax, stdev=sd, n_stats=n_stats, units=var_units,
                                                                          n_nans=n_nan, n_fillvalues=n_fv, fill_value=str(fv),
                                                                          global_ranges=[g_min, g_max], n_grange=n_grange,
                                                                          n_all=n_all)

                    # the summary is (re)written after every reviewed deployment
                    sfile = os.path.join(save_dir, '{}-{}-file_analysis.json'.format(rev_dep, r))
                    with open(sfile, 'w') as outfile:
                        json.dump(data, outfile)
                    json_file_list.append(str(sfile))

        # collocated instruments found while filtering this reference designator's files
        depfile = os.path.join(save_dir, '{}-dependencies.txt'.format(r))
        with open(depfile, 'w') as depf:
            depf.write(str(dependencies))

    return json_file_list

Пример #3

Показать файл

def main(url_list, sDir, stime, etime):
    """Overlay profile plots (science variable vs. pressure) for two instruments.

    For every deployment found for the first URL, the first matching NetCDF
    file of each instrument is opened, optionally restricted to a time window,
    and one scatter plot per non-pressure science variable is saved with the
    data of both instruments drawn on the same axes.

    Parameters
    ----------
    url_list : list of str
        Exactly two dataset URLs. The reference designator is read from
        characters 20:47 of the second-to-last path component of each URL.
    sDir : str
        Root directory under which
        ``<array>/<subsite>/<inst>/profile_plots/<deployment>`` is created.
    stime, etime : object or None
        Bounds handed to ``slice(stime, etime)`` when selecting on ``time``.
        The selection is applied only when both are not None.

    Returns
    -------
    None
        Figures are written through ``pf.save_fig``; nothing is returned.
    """
    if len(url_list) != 2:
        print('Please provide 2 reference designators for plotting')
        return

    uu0 = url_list[0]
    uu1 = url_list[1]
    rd0 = uu0.split('/')[-2][20:47]
    rd1 = uu1.split('/')[-2][20:47]
    array = rd0[0:2]
    inst = rd0.split('-')[-1]

    datasets0 = list(cf.get_nc_urls([uu0]))
    datasets1 = list(cf.get_nc_urls([uu1]))

    main_sensor0 = rd0.split('-')[-1]
    main_sensor1 = rd1.split('-')[-1]
    fdatasets0_sel = cf.filter_collocated_instruments(main_sensor0, datasets0)
    fdatasets1_sel = cf.filter_collocated_instruments(main_sensor1, datasets1)

    deployments = [dd.split('/')[-1].split('_')[0] for dd in fdatasets0_sel]

    # Values at or below these limits are dropped before plotting ("get rid of
    # 0.0 data"); variables not listed here fall back to a limit of 0.
    min_valid = {
        'salinity': 1,
        'density': 1000,
        'conductivity': 0.1,
        'dissolved_oxygen': 160,
        'estimated_oxygen_concentration': 200,
    }

    for d in deployments:
        fd0 = [x for x in fdatasets0_sel if d in x]
        fd1 = [x for x in fdatasets1_sel if d in x]
        if not fd1:
            # The deployment list comes from the first instrument only, so the
            # second instrument may have nothing to overlay.
            print('No matching file for deployment {} in the second dataset '
                  'list - skipping.'.format(d))
            continue

        # The context managers close both files once the deployment is done,
        # including when the loop body bails out early with ``continue``.
        with xr.open_dataset(fd0[0], mask_and_scale=False) as raw0, \
                xr.open_dataset(fd1[0], mask_and_scale=False) as raw1:
            ds0 = raw0.swap_dims({'obs': 'time'})
            ds1 = raw1.swap_dims({'obs': 'time'})

            if stime is not None and etime is not None:
                ds0 = ds0.sel(time=slice(stime, etime))
                ds1 = ds1.sel(time=slice(stime, etime))
                if len(ds0['time'].values) == 0:
                    print(
                        'No data to plot for specified time range: ({} to {})'.
                        format(stime, etime))
                    continue

            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(
                fd0[0])
            sci_vars = cf.return_science_vars(stream)

            save_dir_profile = os.path.join(sDir, array, subsite, inst,
                                            'profile_plots', deployment)
            cf.create_dir(save_dir_profile)

            # get pressure variable (looked up on the first dataset only; the
            # same variable name is then read from both datasets)
            pvarname, _y1, y_units, press, _y_fillvalue = \
                cf.add_pressure_to_dictionary_of_sci_vars(ds0)

            for sv in sci_vars:
                print('')
                print(sv)
                if 'pressure' in sv:
                    continue

                fig, ax = plt.subplots()
                plt.margins(y=.08, x=.02)
                plt.grid()
                title = ' '.join((deployment, subsite, inst, method))
                sname = '-'.join((subsite, inst, method, sv))

                for ds in (ds0, ds1):
                    zpressure = ds[pvarname].values
                    z1 = ds[sv].values
                    fv = ds[sv]._FillValue
                    sv_units = ds[sv].units

                    # Check if the array is all NaNs
                    if np.isnan(z1).all():
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    if not (z1 != fv).any():
                        print('Array of all fill values - skipping plot.')
                        continue

                    ind = z1 > min_valid.get(sv, 0)
                    zdata = z1[ind]
                    if len(zdata) > 0:
                        ax.scatter(zdata,
                                   zpressure[ind],
                                   s=2,
                                   edgecolor='None')

                # Units on the x label are those of the last dataset read.
                xlabel = sv + " (" + sv_units + ")"
                ylabel = press[0] + " (" + y_units[0] + ")"

                ax.invert_yaxis()
                ax.set_xlabel(xlabel, fontsize=9)
                ax.set_ylabel(ylabel, fontsize=9)
                # NOTE(review): the legend text is hard-coded to WFP02/WFP03
                # regardless of which instruments were passed in.
                ax.set_title(title + '\nWFP02 (blue) & WFP03 (orange)',
                             fontsize=9)
                fig.tight_layout()
                pf.save_fig(save_dir_profile, sname)

Пример #4

Показать файл

def _is_non_numeric(cell):
    """Return True when ``np.isnan`` rejects *cell* with a TypeError.

    Numeric placeholders such as NaN pass through ``np.isnan`` silently and
    mean "nothing to compare". Values numpy cannot treat as numbers (the code
    below indexes them and hands the items to xarray, so presumably lists of
    file paths) raise TypeError and are the ones worth processing.
    """
    try:
        np.isnan(cell)
    except TypeError:
        return True
    return False


def _open_files(files):
    """Open a single file with ``xr.open_dataset``, several with ``xr.open_mfdataset``."""
    if len(files) == 1:
        return xr.open_dataset(files[0])
    return xr.open_mfdataset(files)


def _index_by_time(raw, n_files):
    """Swap the ``obs`` dimension for ``time``; re-chunk multi-file datasets."""
    ds = raw.swap_dims({'obs': 'time'})
    if n_files != 1:
        ds = ds.chunk({'time': 100})
    return ds


def _describe_missing(method, frame, column, hourly):
    """Summarize rows of *frame* in which *column* is null.

    Returns a fixed message for telemetered methods (never checked) or when
    nothing is missing, otherwise the dict produced by ``missing_data_times``.
    With ``hourly=True`` the count keys of that dict are renamed to
    ``n_hours_missing`` / ``n_hours_missing_total``.
    """
    if 'telemetered' in method:
        return 'method not checked for missing data'

    if hourly:
        nothing_missing = 'timestamps rounded to the hour: no missing data'
    else:
        nothing_missing = 'no missing data'

    missing_rows = frame.loc[frame[column].isnull()]
    if len(missing_rows) == 0:
        return nothing_missing

    # What missing_data_times is compared against to detect "no gaps found".
    blank_dict = {
        'missing_data_gaps': [],
        'n_missing': [],
        'n_missing_days_total': 0,
        'n_missing_total': 0
    }
    gaps = missing_data_times(missing_rows)
    if gaps == blank_dict:
        return nothing_missing
    if hourly:
        gaps['n_hours_missing'] = gaps.pop('n_missing')
        gaps['n_hours_missing_total'] = gaps.pop('n_missing_total')
    return gaps


def _compare_variable(ds0, ds1, name_ds0, name_ds1, ds0_method, ds1_method):
    """Compare one variable between two datasets.

    Returns the per-variable result dict, or None when the units of the two
    variables differ (such variables are left out of the summary).
    """
    ds0_rename = '_'.join((str(name_ds0), 'ds0'))
    ds0_df, ds0_units, n0, n0_nan = get_ds_variable_info(
        ds0, name_ds0, ds0_rename)

    ds1_rename = '_'.join((str(name_ds1), 'ds1'))
    ds1_df, ds1_units, n1, n1_nan = get_ds_variable_info(
        ds1, name_ds1, ds1_rename)

    # Compare units
    unit_test = 'pass' if ds0_units == ds1_units else 'fail'
    if unit_test != 'pass':
        return None

    # Defaults used whenever no point-by-point comparison is possible.
    n_comparison = 0
    n_diff_g_zero = None
    min_diff = None
    max_diff = None

    if isinstance(ds0_df, str) or isinstance(ds1_df, str):
        # get_ds_variable_info hands back a string instead of a dataframe;
        # skip if the variables have more than 1 dimension
        ds0_missing_dict = '2D dataset'
        ds1_missing_dict = '2D dataset'
    else:
        # Merge dataframes from both methods
        merged = pd.merge(ds0_df, ds1_df, on='time', how='outer')

        # Drop rows where both variables are NaNs
        merged.dropna(subset=[ds0_rename, ds1_rename],
                      how='all',
                      inplace=True)
        if len(merged) == 0:
            print('No valid data to compare')
            ds0_missing_dict = 'No valid data to compare'
            ds1_missing_dict = 'No valid data to compare'
        else:
            # make sure the timestamps are in order
            merged = merged.sort_values('time').reset_index(drop=True)
            m_intersect = merged[merged[ds0_rename].notnull()
                                 & merged[ds1_rename].notnull()]

            # If the number of data points for comparison is less than 1% of
            # the smaller sample size, compare the timestamps by rounding to
            # the nearest hour
            too_few = (len(m_intersect) == 0
                       or float(len(m_intersect)) / float(min(n0, n1)) * 100
                       < 1.00)
            if too_few:
                utime_df0 = unique_timestamps_hour(ds0)
                utime_df0['ds0'] = 'ds0'
                utime_df1 = unique_timestamps_hour(ds1)
                utime_df1['ds1'] = 'ds1'
                umerged = pd.merge(utime_df0,
                                   utime_df1,
                                   on='time',
                                   how='outer')
                umerged = umerged.sort_values('time').reset_index(drop=True)

                ds0_missing_dict = _describe_missing(
                    ds0_method, umerged, 'ds0', hourly=True)
                ds1_missing_dict = _describe_missing(
                    ds1_method, umerged, 'ds1', hourly=True)
            else:
                # Find where data are available in one dataset and missing in
                # the other if timestamps match exactly. Don't check for
                # missing data in telemetered datasets.
                ds0_missing_dict = _describe_missing(
                    ds0_method, merged, ds0_rename, hourly=False)
                ds1_missing_dict = _describe_missing(
                    ds1_method, merged, ds1_rename, hourly=False)

                # Where the data intersect, calculate the difference between
                # the methods
                diff = m_intersect[ds0_rename] - m_intersect[ds1_rename]
                # NOTE(review): this literal rounds to exactly 1.0 as a
                # double, so this counts |diff| > 1.0 although the result key
                # is called n_diff_greater_zero - confirm intended threshold.
                n_diff_g_zero = sum(abs(diff) > 0.99999999999999999)

                min_diff = round(min(abs(diff)), 10)
                max_diff = round(max(abs(diff)), 10)
                n_comparison = len(diff)

    return dict(
        ds0=dict(name=name_ds0,
                 units=ds0_units,
                 n=n0,
                 n_nan=n0_nan,
                 missing=ds0_missing_dict),
        ds1=dict(name=name_ds1,
                 units=ds1_units,
                 n=n1,
                 n_nan=n1_nan,
                 missing=ds1_missing_dict),
        unit_test=unit_test,
        n_comparison=n_comparison,
        n_diff_greater_zero=n_diff_g_zero,
        min_abs_diff=min_diff,
        max_abs_diff=max_diff)


def _compare_file_sets(d, compare, f0, f1, variables):
    """Compare all science variables shared by two file sets, filling *variables*.

    *compare* is ``'<col0> <col1>'``; each half is split on ``'-'`` into the
    delivery method (first piece) and the name passed to
    ``cf.return_science_vars`` (second piece).
    """
    # Both datasets are closed when the comparison is finished. This assumes
    # the sibling helpers return in-memory values (dataframes, numbers,
    # strings) rather than lazy views of the open files - TODO confirm.
    with _open_files(f0) as raw0:
        ds0 = _index_by_time(raw0, len(f0))
        splt0 = compare.split(' ')[0].split('-')
        ds0_sci_vars = cf.return_science_vars(splt0[1])
        ds0_method = splt0[0]

        with _open_files(f1) as raw1:
            ds1 = _index_by_time(raw1, len(f1))
            splt1 = compare.split(' ')[1].split('-')
            ds1_sci_vars = cf.return_science_vars(splt1[1])
            ds1_method = splt1[0]

            # find where the variable long names are the same
            ds0names = long_names(ds0, ds0_sci_vars)
            ds0names.rename(columns={'name': 'name_ds0'}, inplace=True)
            ds1names = long_names(ds1, ds1_sci_vars)
            ds1names.rename(columns={'name': 'name_ds1'}, inplace=True)
            mapping = pd.merge(ds0names,
                               ds1names,
                               on='long_name',
                               how='inner')
            print('----------------------')
            print('{}: {}'.format(d, compare))
            print('----------------------')

            for rr in mapping.itertuples():
                _index, name_ds0, long_name, name_ds1 = rr
                print(long_name)

                # Compare data from two data streams
                result = _compare_variable(ds0, ds1, name_ds0, name_ds1,
                                           ds0_method, ds1_method)
                if result is not None:
                    variables[str(long_name)] = result


def compare_data(df):
    """Compare science variables between every pair of columns of *df*.

    Each row of *df* is one deployment (the row label is used as the key) and
    each column holds, per deployment, either a numeric placeholder such as
    NaN or a sequence of files to open. For every row, each column is compared
    with every column to its left, provided both cells hold files.

    Parameters
    ----------
    df : pandas.DataFrame
        Column names are expected to look like ``'<method>-<stream>'``.

    Returns
    -------
    dict
        ``summary['deployments'][row_label]['comparison']['<col0> <col1>']
        ['vars'][long_name]`` maps to a dict with keys ``ds0``, ``ds1``,
        ``unit_test``, ``n_comparison``, ``n_diff_greater_zero``,
        ``min_abs_diff`` and ``max_abs_diff``. Variables whose units differ
        between the two datasets are not recorded.
    """
    names = df.columns
    summary = dict(deployments=dict())
    for d, row in df.iterrows():
        for i, n in enumerate(names):
            if i == 0:
                # the first column has nothing to its left to compare with
                continue
            f1 = row[n]
            if not _is_non_numeric(f1):
                continue

            for x in range(i):
                f0 = row[names[x]]
                if not _is_non_numeric(f0):
                    continue

                deployment_entry = summary['deployments'].setdefault(
                    d, dict(comparison=dict()))
                compare = '{} {}'.format(names[x], n)
                comparison_entry = deployment_entry['comparison'].setdefault(
                    compare, dict(vars=dict()))

                _compare_file_sets(d, compare, f0, f1,
                                   comparison_entry['vars'])
    return summary

Пример #5

Показать файл

Файл: nc_file_analysis.py Проект: ooi-data-lab/data-review-tools

def main(sDir, url_list):
    reviewlist = pd.read_csv(
        'https://raw.githubusercontent.com/ooi-data-lab/data-review-prep/master/review_list/data_review_list.csv'
    )

    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    json_file_list = []
    for r in rd_list:
        dependencies = []
        print('\n{}'.format(r))
        data = OrderedDict(deployments=OrderedDict())
        save_dir = os.path.join(sDir, r.split('-')[0], r)
        cf.create_dir(save_dir)

        # Deployment location test
        deploy_loc_test = cf.deploy_location_check(r)
        data['location_comparison'] = deploy_loc_test

        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            catalog_rms = '-'.join((r, splitter[-2], splitter[-1]))

            # complete the analysis by reference designator
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])

                # check for the OOI 1.0 datasets for review
                rl_filtered = reviewlist.loc[
                    (reviewlist['Reference Designator'] == r)
                    & (reviewlist['status'] == 'for review')]
                review_deployments = rl_filtered['deploymentNumber'].tolist()
                review_deployments_int = [
                    'deployment%04d' % int(x) for x in review_deployments
                ]
                for rev_dep in review_deployments_int:
                    rdatasets = [s for s in udatasets if rev_dep in s]
                    if len(rdatasets) > 0:
                        datasets = []
                        for dss in rdatasets:  # filter out collocated data files
                            if catalog_rms == dss.split('/')[-1].split(
                                    '_20')[0][15:]:
                                datasets.append(dss)
                            else:
                                drd = dss.split('/')[-1].split('_20')[0][15:42]
                                if drd not in dependencies and drd != r:
                                    dependencies.append(drd)

                        notes = []
                        time_ascending = ''
                        if len(datasets) == 1:
                            try:
                                ds = xr.open_dataset(datasets[0],
                                                     mask_and_scale=False)
                                ds = ds.swap_dims({'obs': 'time'})
                                fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(
                                    datasets[0])
                            except OSError:
                                print('OSError - skipping file {}'.format(
                                    datasets[0]))
                                continue
                        elif len(datasets) > 1:
                            ds = xr.open_mfdataset(datasets,
                                                   mask_and_scale=False)
                            ds = ds.swap_dims({'obs': 'time'})
                            #ds = ds.chunk({'time': 100})
                            fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(
                                datasets[0])
                            fname = fname.split('_20')[0]
                            notes.append('multiple deployment .nc files')
                            # when opening multiple datasets, don't check that the timestamps are in ascending order
                            time_ascending = 'not_tested'
                        else:
                            continue

                        print('\nAnalyzing file: {}'.format(fname))

                        # Get info from the data review database
                        dr_data = cf.refdes_datareview_json(refdes)
                        stream_vars = cf.return_stream_vars(data_stream)
                        sci_vars = cf.return_science_vars(data_stream)
                        node = refdes.split('-')[1]
                        if 'cspp' in data_stream or 'WFP' in node:
                            sci_vars.append('int_ctd_pressure')

                        # if 'FDCHP' in refdes:
                        #     remove_vars = ['fdchp_wind_x', 'fdchp_wind_y', 'fdchp_wind_z', 'fdchp_speed_of_sound_sonic',
                        #                    'fdchp_x_accel_g', 'fdchp_y_accel_g', 'fdchp_z_accel_g']
                        #     rv_regex = re.compile('|'.join(remove_vars))
                        #     rv_sci_vars = [nn for nn in sci_vars if not rv_regex.search(nn)]
                        #     sci_vars = rv_sci_vars

                        deploy_info = get_deployment_information(
                            dr_data, int(deployment[-4:]))

                        # Grab deployment Variables
                        deploy_start = str(deploy_info['start_date'])
                        deploy_stop = str(deploy_info['stop_date'])
                        deploy_lon = deploy_info['longitude']
                        deploy_lat = deploy_info['latitude']
                        deploy_depth = deploy_info['deployment_depth']

                        # Calculate days deployed
                        if deploy_stop != 'None':
                            r_deploy_start = pd.to_datetime(
                                deploy_start).replace(hour=0,
                                                      minute=0,
                                                      second=0)
                            if deploy_stop.split('T')[1] == '00:00:00':
                                r_deploy_stop = pd.to_datetime(deploy_stop)
                            else:
                                r_deploy_stop = (pd.to_datetime(deploy_stop) +
                                                 timedelta(days=1)).replace(
                                                     hour=0,
                                                     minute=0,
                                                     second=0)
                            n_days_deployed = (r_deploy_stop -
                                               r_deploy_start).days
                        else:
                            n_days_deployed = None

                        # Add reference designator to dictionary
                        try:
                            data['refdes']
                        except KeyError:
                            data['refdes'] = refdes

                        deployments = data['deployments'].keys()
                        data_start = pd.to_datetime(min(
                            ds['time'].values)).strftime('%Y-%m-%dT%H:%M:%S')
                        data_stop = pd.to_datetime(max(
                            ds['time'].values)).strftime('%Y-%m-%dT%H:%M:%S')

                        # Add deployment and info to dictionary and initialize delivery method sub-dictionary
                        if deployment not in deployments:
                            data['deployments'][deployment] = OrderedDict(
                                deploy_start=deploy_start,
                                deploy_stop=deploy_stop,
                                n_days_deployed=n_days_deployed,
                                lon=deploy_lon,
                                lat=deploy_lat,
                                deploy_depth=deploy_depth,
                                method=OrderedDict())

                        # Add delivery methods to dictionary and initialize stream sub-dictionary
                        methods = data['deployments'][deployment][
                            'method'].keys()
                        if method not in methods:
                            data['deployments'][deployment]['method'][
                                method] = OrderedDict(stream=OrderedDict())

                        # Add streams to dictionary and initialize file sub-dictionary
                        streams = data['deployments'][deployment]['method'][
                            method]['stream'].keys()

                        if data_stream not in streams:
                            data['deployments'][deployment]['method'][method][
                                'stream'][data_stream] = OrderedDict(
                                    file=OrderedDict())

                        # Get a list of data gaps >1 day
                        time_df = pd.DataFrame(ds['time'].values,
                                               columns=['time'])
                        gap_list = cf.timestamp_gap_test(time_df)

                        # Calculate the sampling rate to the nearest second
                        time_df['diff'] = time_df['time'].diff().astype(
                            'timedelta64[s]')
                        rates_df = time_df.groupby(['diff']).agg(['count'])
                        n_diff_calc = len(time_df) - 1
                        rates = dict(n_unique_rates=len(rates_df),
                                     common_sampling_rates=dict())
                        for i, row in rates_df.iterrows():
                            percent = (float(row['time']['count']) /
                                       float(n_diff_calc))
                            if percent > 0.1:
                                rates['common_sampling_rates'].update(
                                    {int(i): '{:.2%}'.format(percent)})

                        sampling_rt_sec = None
                        for k, v in rates['common_sampling_rates'].items():
                            if float(v.strip('%')) > 50.00:
                                sampling_rt_sec = k

                        if not sampling_rt_sec:
                            sampling_rt_sec = 'no consistent sampling rate: {}'.format(
                                rates['common_sampling_rates'])

                        # Check that the timestamps in the file are unique
                        time = ds['time']
                        len_time = time.__len__()
                        len_time_unique = np.unique(time).__len__()
                        if len_time == len_time_unique:
                            time_test = 'pass'
                        else:
                            time_test = 'fail'

                        # Check that the timestamps in the file are in ascending order
                        if time_ascending != 'not_tested':
                            # convert time to number
                            time_in = [
                                dt.datetime.utcfromtimestamp(
                                    np.datetime64(x).astype('O') / 1e9)
                                for x in ds['time'].values
                            ]
                            time_data = nc.date2num(
                                time_in, 'seconds since 1900-01-01')

                            # Create a list of True or False by iterating through the array of time and checking
                            # if every time stamp is increasing
                            result = [(time_data[k + 1] - time_data[k]) > 0
                                      for k in range(len(time_data) - 1)]

                            # Record the outcome in time_ascending: 'pass', or 'fail' with the indices where time is not increasing
                            if result.count(True) == len(time) - 1:
                                time_ascending = 'pass'
                            else:
                                ind_fail = {
                                    k: time_in[k]
                                    for k, v in enumerate(result) if v is False
                                }
                                time_ascending = 'fail: {}'.format(ind_fail)

                        # Count the number of days for which there is at least 1 timestamp
                        n_days = len(
                            np.unique(time.values.astype('datetime64[D]')))

                        # Compare variables in file to variables in Data Review Database
                        ds_variables = list(ds.data_vars.keys()) + list(
                            ds.coords.keys())
                        #ds_variables = [k for k in ds]
                        ds_variables = eliminate_common_variables(ds_variables)
                        ds_variables = [
                            x for x in ds_variables if 'qc' not in x
                        ]
                        [_, unmatch1] = compare_lists(stream_vars,
                                                      ds_variables)
                        [_, unmatch2] = compare_lists(ds_variables,
                                                      stream_vars)

                        # Check deployment pressure from asset management against pressure variable in file
                        press = pf.pressure_var(ds, list(ds.coords.keys()))
                        if press is None:
                            press = pf.pressure_var(ds,
                                                    list(ds.data_vars.keys()))

                        # calculate mean pressure from data, excluding outliers +/- 3 SD
                        try:
                            pressure = ds[press]
                            num_dims = len(pressure.dims)
                            if len(pressure) > 1:
                                # if the pressure variable is an array of all zeros (as in the case of pressure_depth
                                # for OPTAAs on surface piercing profilers)
                                if (len(np.unique(pressure)) == 1) & (
                                        np.unique(pressure)[0] == 0.0):
                                    try:
                                        pressure = ds['int_ctd_pressure']
                                        press = 'int_ctd_pressure'
                                    except KeyError:
                                        pressure = pressure

                                # reject NaNs
                                p_nonan = pressure.values[~np.isnan(pressure.
                                                                    values)]

                                # reject fill values
                                p_nonan_nofv = p_nonan[
                                    p_nonan != pressure._FillValue]

                                # reject data outside of global ranges
                                [pg_min,
                                 pg_max] = cf.get_global_ranges(r, press)
                                if pg_min is not None and pg_max is not None:
                                    pgr_ind = cf.reject_global_ranges(
                                        p_nonan_nofv, pg_min, pg_max)
                                    p_nonan_nofv_gr = p_nonan_nofv[pgr_ind]
                                else:
                                    p_nonan_nofv_gr = p_nonan_nofv

                                if (len(p_nonan_nofv_gr) > 0) and (num_dims
                                                                   == 1):
                                    [
                                        press_outliers, pressure_mean, _,
                                        pressure_max, _, _
                                    ] = cf.variable_statistics(
                                        p_nonan_nofv_gr, 3)
                                    pressure_mean = round(pressure_mean, 2)
                                    pressure_max = round(pressure_max, 2)
                                elif (len(p_nonan_nofv_gr) > 0) and (num_dims >
                                                                     1):
                                    print('variable has more than 1 dimension')
                                    press_outliers = 'not calculated: variable has more than 1 dimension'
                                    pressure_mean = round(
                                        np.nanmean(p_nonan_nofv_gr), 2)
                                    pressure_max = round(
                                        np.nanmax(p_nonan_nofv_gr), 2)
                                else:
                                    press_outliers = None
                                    pressure_mean = None
                                    pressure_max = None
                                    if len(pressure) > 0 and len(p_nonan) == 0:
                                        notes.append(
                                            'Pressure variable all NaNs')
                                    elif len(pressure) > 0 and len(
                                            p_nonan) > 0 and len(
                                                p_nonan_nofv) == 0:
                                        notes.append(
                                            'Pressure variable all fill values'
                                        )
                                    elif len(pressure) > 0 and len(
                                            p_nonan) > 0 and len(
                                                p_nonan_nofv) > 0 and len(
                                                    p_nonan_nofv_gr) == 0:
                                        notes.append(
                                            'Pressure variable outside of global ranges'
                                        )

                            else:  # if there is only 1 data point
                                press_outliers = 0
                                pressure_mean = round(
                                    ds[press].values.tolist()[0], 2)
                                pressure_max = round(
                                    ds[press].values.tolist()[0], 2)

                            try:
                                pressure_units = pressure.units
                            except AttributeError:
                                pressure_units = 'no units attribute for pressure'

                            if pressure_mean:
                                if ('WFP' in node) or ('MOAS' in subsite) or (
                                        'SP' in node):
                                    pressure_compare = int(round(pressure_max))
                                else:
                                    pressure_compare = int(
                                        round(pressure_mean))

                                if pressure_units == '0.001 dbar':
                                    pressure_max = round((pressure_max / 1000),
                                                         2)
                                    pressure_mean = round(
                                        (pressure_mean / 1000), 2)
                                    pressure_compare = round(
                                        (pressure_compare / 1000), 2)
                                    notes.append(
                                        'Pressure converted from 0.001 dbar to dbar for pressure comparison'
                                    )

                                elif pressure_units == 'daPa':
                                    pressure_max = round((pressure_max / 1000),
                                                         2)
                                    pressure_mean = round(
                                        (pressure_mean / 1000), 2)
                                    pressure_compare = round(
                                        (pressure_compare / 1000), 2)
                                    notes.append(
                                        'Pressure converted from daPa to dbar for pressure comparison'
                                    )

                            else:
                                pressure_compare = None

                            if (not deploy_depth) or (not pressure_mean):
                                pressure_diff = None
                            else:
                                pressure_diff = pressure_compare - deploy_depth

                        except KeyError:
                            press = 'no seawater pressure in file'
                            pressure_diff = None
                            pressure_mean = None
                            pressure_max = None
                            pressure_compare = None
                            press_outliers = None
                            pressure_units = None

                        # Add files and info to dictionary
                        filenames = data['deployments'][deployment]['method'][
                            method]['stream'][data_stream]['file'].keys()

                        if fname not in filenames:
                            data['deployments'][deployment]['method'][method][
                                'stream'][data_stream]['file'][
                                    fname] = OrderedDict(
                                        file_downloaded=pd.to_datetime(
                                            splitter[0][0:15]).strftime(
                                                '%Y-%m-%dT%H:%M:%S'),
                                        file_coordinates=list(
                                            ds.coords.keys()),
                                        sampling_rate_seconds=sampling_rt_sec,
                                        sampling_rate_details=rates,
                                        data_start=data_start,
                                        data_stop=data_stop,
                                        time_gaps=gap_list,
                                        unique_timestamps=time_test,
                                        n_timestamps=len_time,
                                        n_days=n_days,
                                        notes=notes,
                                        ascending_timestamps=time_ascending,
                                        pressure_comparison=dict(
                                            pressure_mean=pressure_mean,
                                            units=pressure_units,
                                            num_outliers=press_outliers,
                                            diff=pressure_diff,
                                            pressure_max=pressure_max,
                                            variable=press,
                                            pressure_compare=pressure_compare),
                                        vars_in_file=ds_variables,
                                        vars_not_in_file=[
                                            x for x in unmatch1
                                            if 'time' not in x
                                        ],
                                        vars_not_in_db=unmatch2,
                                        sci_var_stats=OrderedDict())

                        # calculate statistics for science variables, excluding outliers +/- 5 SD
                        for sv in sci_vars:
                            if sv != 't_max':  # for ADCP
                                if sv != 'wavss_a_buoymotion_time':
                                    print(sv)
                                    try:
                                        var = ds[sv]
                                        # need to round SPKIR values to 1 decimal place to match the global ranges.
                                        # otherwise, values that round to zero (e.g. 1.55294e-05) will be excluded by
                                        # the global range test
                                        # if 'spkir' in sv:
                                        #     vD = np.round(var.values, 1)
                                        # else:
                                        #     vD = var.values
                                        vD = var.values
                                        if 'timedelta' not in str(
                                                var.values.dtype):
                                            # for OPTAA wavelengths: when multiple files are opened with xr.open_mfdataset
                                            # xarray automatically forces all variables to have the same number of
                                            # dimensions. So in this case wavelength_a and wavelength_c have 1 dimension
                                            # in the individual files, so I'm forcing the analysis to treat them like
                                            # they have 1 dimension (when there are multiple files for 1 deployment)
                                            if sv == 'wavelength_a' or sv == 'wavelength_c':
                                                [g_min,
                                                 g_max] = cf.get_global_ranges(
                                                     r, sv)
                                                vnum_dims = len(var.dims)
                                                if vnum_dims == 1:
                                                    n_all = len(var)
                                                    mean = list(vD)
                                                else:
                                                    vnum_dims = 1
                                                    n_all = len(vD[0])
                                                    mean = list(vD[0])
                                                num_outliers = None
                                                vmin = None
                                                vmax = None
                                                sd = None
                                                n_stats = 'not calculated'
                                                var_units = var.units
                                                n_nan = None
                                                n_fv = None
                                                n_grange = 'no global ranges'
                                                fv = var._FillValue

                                            else:
                                                vnum_dims = len(var.dims)
                                                if vnum_dims > 2:
                                                    print(
                                                        'variable has more than 2 dimensions'
                                                    )
                                                    num_outliers = None
                                                    mean = None
                                                    vmin = None
                                                    vmax = None
                                                    sd = None
                                                    n_stats = 'variable has more than 2 dimensions'
                                                    var_units = var.units
                                                    n_nan = None
                                                    n_fv = None
                                                    n_grange = None
                                                    fv = None
                                                    n_all = None
                                                else:
                                                    if vnum_dims > 1:
                                                        n_all = [
                                                            len(vD),
                                                            len(vD.flatten())
                                                        ]
                                                    else:
                                                        n_all = len(vD)
                                                    n_nan = int(
                                                        np.sum(np.isnan(vD)))
                                                    fv = var._FillValue
                                                    var_nofv = var.where(
                                                        var != fv)
                                                    n_fv = int(
                                                        np.sum(
                                                            np.isnan(
                                                                var_nofv.values
                                                            ))) - n_nan

                                                    try:
                                                        var_units = var.units
                                                    except AttributeError:
                                                        var_units = 'no_units'
                                                    [g_min, g_max
                                                     ] = cf.get_global_ranges(
                                                         r, sv)
                                                    if list(
                                                            np.unique(
                                                                np.isnan(
                                                                    var_nofv))
                                                    ) != [True]:
                                                        # reject data outside of global ranges
                                                        if g_min is not None and g_max is not None:
                                                            var_gr = var_nofv.where(
                                                                (var_nofv >=
                                                                 g_min)
                                                                & (var_nofv <=
                                                                   g_max))
                                                            n_grange = int(
                                                                np.sum(
                                                                    np.isnan(
                                                                        var_gr)
                                                                ) - n_fv -
                                                                n_nan)
                                                        else:
                                                            n_grange = 'no global ranges'
                                                            var_gr = var_nofv

                                                        if list(
                                                                np.unique(
                                                                    np.isnan(
                                                                        var_gr)
                                                                )) != [True]:
                                                            if sv == 'spkir_abj_cspp_downwelling_vector':
                                                                # don't remove outliers from dataset
                                                                [
                                                                    num_outliers,
                                                                    mean, vmin,
                                                                    vmax, sd,
                                                                    n_stats
                                                                ] = cf.variable_statistics_spkir(
                                                                    var_gr)
                                                            else:
                                                                if vnum_dims > 1:
                                                                    var_gr = var_gr.values.flatten(
                                                                    )
                                                                # drop nans before calculating stats
                                                                var_gr = var_gr[
                                                                    ~np.isnan(
                                                                        var_gr
                                                                    )]
                                                                [
                                                                    num_outliers,
                                                                    mean, vmin,
                                                                    vmax, sd,
                                                                    n_stats
                                                                ] = cf.variable_statistics(
                                                                    var_gr, 5)
                                                        else:
                                                            num_outliers = None
                                                            mean = None
                                                            vmin = None
                                                            vmax = None
                                                            sd = None
                                                            n_stats = 0
                                                            n_grange = None
                                                    else:
                                                        num_outliers = None
                                                        mean = None
                                                        vmin = None
                                                        vmax = None
                                                        sd = None
                                                        n_stats = 0
                                                        n_grange = None

                                    except KeyError:
                                        if sv == 'int_ctd_pressure':
                                            continue
                                        else:
                                            num_outliers = None
                                            mean = None
                                            vmin = None
                                            vmax = None
                                            sd = None
                                            n_stats = 'variable not found in file'
                                            var_units = None
                                            n_nan = None
                                            n_fv = None
                                            fv = None
                                            n_grange = None
                                            n_all = None

                                    if vnum_dims > 1:
                                        sv = '{} (dims: {})'.format(
                                            sv, list(var.dims))
                                    else:
                                        sv = sv
                                    if 'timedelta' not in str(
                                            var.values.dtype):
                                        data['deployments'][deployment][
                                            'method'][method]['stream'][
                                                data_stream]['file'][fname][
                                                    'sci_var_stats'][sv] = dict(
                                                        n_outliers=num_outliers,
                                                        mean=mean,
                                                        min=vmin,
                                                        max=vmax,
                                                        stdev=sd,
                                                        n_stats=n_stats,
                                                        units=var_units,
                                                        n_nans=n_nan,
                                                        n_fillvalues=n_fv,
                                                        fill_value=str(fv),
                                                        global_ranges=[
                                                            g_min, g_max
                                                        ],
                                                        n_grange=n_grange,
                                                        n_all=n_all)

        sfile = os.path.join(save_dir, '{}-file_analysis.json'.format(r))
        with open(sfile, 'w') as outfile:
            json.dump(data, outfile)

        depfile = os.path.join(save_dir, '{}-dependencies.txt'.format(r))
        with open(depfile, 'w') as depf:
            depf.write(str(dependencies))

        json_file_list.append(str(sfile))

    return json_file_list

Пример #6

Показать файл

Файл: plot_adcp.py Проект: leilabbb/data-review-tools

def main(sDir, url_list, start_time, end_time, preferred_only):
    """Plot the science variables of ADCP NetCDF files, raw and cleaned.

    For every reference designator found in ``url_list`` the matching NetCDF
    files are collected, optionally restricted to the preferred streams, and
    each science variable is plotted twice: once as-is and once with the
    ``stdev=5`` / ``n_stdev=5`` option of the plotting helpers.

    :param sDir: root directory; figures are written under
        ``sDir/<array>/<subsite>/<refdes>/preferred_method_plots/<deployment>``.
    :param url_list: catalog URLs. The second-to-last path component is split
        on ``-`` and items 1-4 form the reference designator.
    :param start_time: start of the time slice, or None. The slice is only
        applied when both ``start_time`` and ``end_time`` are given.
    :param end_time: end of the time slice, or None.
    :param preferred_only: the string ``'yes'`` keeps only files matching the
        preferred stream information; any other value keeps all files.
    """
    # Unique reference designators, in first-seen order.
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))

        # Gather the NetCDF files of every URL that belongs to this refdes.
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # row[ii] is not a string (nothing to join) - skip it
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join(
                            (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        fdatasets = np.unique(fdatasets).tolist()
        for fd in fdatasets:
            # Context manager: the file handle is released at the end of the
            # iteration, including when the iteration is cut short below.
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'.
                            format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = \
                    cf.nc_attributes(fd)
                sci_vars = cf.return_science_vars(stream)
                # drop the following list of key words from science variables list
                sci_vars = notin_list(
                    sci_vars,
                    ['bin_depths', 'salinity', 'temperature', 'beam'])
                # skip variables reported in mm s-1
                sci_vars = [
                    name for name in sci_vars if ds[name].units != 'mm s-1'
                ]

                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes,
                                        'preferred_method_plots', deployment)
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title_text = ' '.join((deployment, refdes, method))
                # first two title lines shared by every figure of this file
                base_title = title_text + '\n' + t0 + ' - ' + t1

                for var in sci_vars:
                    print(var)
                    v = ds[var]
                    fv = v._FillValue
                    v_name = v.long_name

                    if len(v.dims) == 1:
                        # one-dimensional variable: timeseries plots
                        v, n_nan, n_fv, n_ev, n_grange, g_min, g_max, n_std = reject_err_data_1_dims(
                            v, fv, r, var, n=5)

                        # Plot all data
                        fig, ax = pf.plot_timeseries(tm, v, v_name, stdev=None)
                        ax.set_title(base_title, fontsize=9)
                        sfile = '-'.join((filename, v_name, t0[:10]))
                        pf.save_fig(save_dir, sfile)

                        # Plot data with outliers removed
                        fig, ax = pf.plot_timeseries(tm, v, v_name, stdev=5)
                        title_i = (
                            'removed: {} nans, {} fill values, {} extreme values, {} GR [{}, {}],'
                            ' {} outliers +/- 5 SD'.format(
                                n_nan, n_fv, n_ev, n_grange, g_min, g_max,
                                n_std))

                        ax.set_title(base_title + '\n' + title_i, fontsize=8)
                        sfile = '-'.join(
                            (filename, v_name, t0[:10])) + '_rmoutliers'
                        pf.save_fig(save_dir, sfile)

                    else:
                        # multi-dimensional variable: plotted against bin depth
                        v = v.values.T.astype(float)
                        v_bad_beams = ds[
                            'percent_bad_beams']  # get bad beams percent
                        fv_bad_beam = v_bad_beams._FillValue
                        v_bad_beams = v_bad_beams.values.T.astype(float)
                        v_bad_beams[v_bad_beams ==
                                    fv_bad_beam] = np.nan  # mask fill values

                        v, n_nan, n_fv, n_ev, n_bb, n_grange, g_min, g_max = reject_err_data_2_dims(
                            v, v_bad_beams, fv, r, var)

                        ylabel = 'bin_depths ({})'.format(
                            ds['bin_depths'].units)
                        clabel = '{} ({})'.format(var, ds[var].units)

                        # check bin depths for extreme values
                        y = ds['bin_depths'].values.T
                        y_nan = np.sum(np.isnan(y))
                        y = np.where(y < 6000, y,
                                     np.nan)  # replace extreme bin_depths by nans
                        # NaNs introduced by the 6000 cut-off only
                        bin_nan = np.sum(np.isnan(y)) - y_nan
                        bin_title = 'removed: {} bin depths > 6000'.format(
                            bin_nan)

                        color = 'BuGn' if 'echo' in var else 'RdBu'

                        # dropna is a helper defined elsewhere in this file;
                        # its result is indexed like a DataFrame below.
                        new_y = dropna(y, axis=1)
                        rows = list(new_y.index)
                        cols = list(new_y.columns)
                        y_mask = new_y.loc[rows, cols]
                        v_new = pd.DataFrame(v)
                        # keep the same rows/columns of the data as of the depths
                        v_mask = v_new.loc[rows, cols]
                        tm_mask = tm[new_y.columns]

                        # Plot all data
                        fig, ax, __ = pf.plot_adcp(tm_mask,
                                                   np.array(y_mask),
                                                   np.array(v_mask),
                                                   ylabel,
                                                   clabel,
                                                   color,
                                                   n_stdev=None)

                        if bin_nan > 0:
                            ax.set_title(base_title + '\n' + bin_title,
                                         fontsize=8)
                        else:
                            ax.set_title(base_title, fontsize=8)

                        sfile = '-'.join((filename, var, t0[:10]))
                        pf.save_fig(save_dir, sfile)

                        # Plot data with outliers removed
                        fig, ax, n_nans_all = pf.plot_adcp(tm_mask,
                                                           np.array(y_mask),
                                                           np.array(v_mask),
                                                           ylabel,
                                                           clabel,
                                                           color,
                                                           n_stdev=5)
                        title_i = 'removed: {} nans {} fill values, {} extreme values, {} bad beams, {} GR [{}, {}]'.format(
                            n_nan, n_fv, n_ev, n_bb, n_grange, g_min, g_max)

                        if bin_nan > 0:
                            ax.set_title(base_title + '\n' + title_i + '\n' +
                                         bin_title,
                                         fontsize=8)
                        else:
                            ax.set_title(base_title + '\n' + title_i,
                                         fontsize=8)

                        sfile = '-'.join(
                            (filename, var, t0[:10])) + '_rmoutliers'
                        pf.save_fig(save_dir, sfile)

Пример #7

Показать файл

def main(url_list, sDir, deployment_num, start_time, end_time, preferred_only,
         n_std, surface_params, depth_params):
    """Create profile, cross-section and (for gliders) 4D plots per variable.

    For every non-engineering reference designator in ``url_list`` the
    matching NetCDF files are collected, optionally restricted to the
    preferred streams, and each science variable (other than pressure) is
    plotted before and after erroneous data are rejected.

    :param url_list: catalog URLs. The second-to-last path component is split
        on ``-`` and items 1-4 form the reference designator.
    :param sDir: root directory under which the ``profile_plots``,
        ``xsection_plots`` and ``xsection_plots_4d`` folders are created.
    :param deployment_num: integer deployment number to plot, or None to plot
        every deployment.
    :param start_time: start of the time slice, or None. The slice is only
        applied when both ``start_time`` and ``end_time`` are given; both must
        then support ``strftime``.
    :param end_time: end of the time slice, or None.
    :param preferred_only: the string ``'yes'`` keeps only files matching the
        preferred stream information; any other value keeps all files.
    :param n_std: kept for interface compatibility. The per-group values
        handed to ``reject_timestamps_in_groups`` are built from
        ``surface_params[3]`` and ``depth_params[3]``, so this argument does
        not influence the result.
    :param surface_params: indexable; items 0-2 are the ``range`` start, stop
        and step of the surface pressure groups, item 3 is the value repeated
        for each of those groups.
    :param depth_params: same layout for the deeper groups; the range stop is
        ``depth_params[1] + depth_params[2]`` so the upper bound is included.
    """
    # Unique reference designators, in first-seen order, engineering excluded.
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'ENG' not in rd:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))

        # Gather the NetCDF files of every URL that belongs to this refdes.
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # row[ii] is not a string (nothing to join) - skip it
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join(
                            (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(
            main_sensor, fdatasets)

        for fd in fdatasets_sel:
            part_d = fd.split('/')[-1]
            print('\n{}'.format(part_d))

            fname, subsite, refdes, method, stream, deployment = \
                cf.nc_attributes(fd)

            # Filter on the deployment number before any file is opened.
            # The number is taken from all digits of the deployment string
            # and compared by value: splitting on '0' fails for numbers that
            # end in or contain a zero, and ``is not`` tests object identity,
            # not equality.
            if deployment_num is not None:
                file_deployment = int(
                    ''.join(ch for ch in deployment if ch.isdigit()))
                if file_deployment != deployment_num:
                    continue

            array = subsite[0:2]
            sci_vars = cf.return_science_vars(stream)

            # Bathymetry is only looked up for coastal gliders; everything
            # else plots without it.
            m_water_depth = None
            t_eng = None
            if 'CE05MOAS' in r or 'CP05MOAS' in r:  # for coastal gliders, get m_water_depth for bathymetry
                eng = '-'.join((r.split('-')[0], r.split('-')[1],
                                '00-ENG000000', method, 'glider_eng'))
                eng_url = [s for s in url_list if eng in s]
                if len(eng_url) == 1:
                    eng_datasets = cf.get_nc_urls(eng_url)
                    # filter out collocated datasets
                    eng_dataset = [
                        j for j in eng_datasets
                        if (eng in j.split('/')[-1]
                            and deployment in j.split('/')[-1])
                    ]
                    if len(eng_dataset) > 0:
                        # read what is needed, then release the file handle
                        with xr.open_dataset(eng_dataset[0],
                                             mask_and_scale=False) as ds_eng:
                            t_eng = ds_eng['time'].values
                            m_water_depth = ds_eng['m_water_depth'].values
                            # m_altimeter_status = 0 means a good reading (not nan or -1)
                            eng_ind = ds_eng['m_altimeter_status'].values == 0
                        m_water_depth = m_water_depth[eng_ind]
                        t_eng = t_eng[eng_ind]
                    else:
                        print('No engineering file for deployment {}'.format(
                            deployment))

            # Context manager: the file handle is released at the end of the
            # iteration, including when the iteration is cut short below.
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'.
                            format(start_time, end_time))
                        continue
                    stime = start_time.strftime('%Y-%m-%d')
                    etime = end_time.strftime('%Y-%m-%d')
                    # sub-folder named after the requested time range
                    ext = stime + 'to' + etime
                    save_dir_profile = os.path.join(sDir, array, subsite,
                                                    refdes, 'profile_plots',
                                                    deployment, ext)
                    save_dir_xsection = os.path.join(sDir, array, subsite,
                                                     refdes, 'xsection_plots',
                                                     deployment, ext)
                    save_dir_4d = os.path.join(sDir, array, subsite, refdes,
                                               'xsection_plots_4d',
                                               deployment, ext)
                else:
                    save_dir_profile = os.path.join(sDir, array, subsite,
                                                    refdes, 'profile_plots',
                                                    deployment)
                    save_dir_xsection = os.path.join(sDir, array, subsite,
                                                     refdes, 'xsection_plots',
                                                     deployment)
                    save_dir_4d = os.path.join(sDir, array, subsite, refdes,
                                               'xsection_plots_4d',
                                               deployment)

                tm = ds['time'].values
                try:
                    ds_lat = ds['lat'].values
                except KeyError:
                    ds_lat = None
                    print('No latitude variable in file')
                try:
                    ds_lon = ds['lon'].values
                except KeyError:
                    ds_lon = None
                    print('No longitude variable in file')

                # get pressure variable
                y, y_units, press = cf.add_pressure_to_dictionary_of_sci_vars(
                    ds)

                for sv in sci_vars:
                    print(sv)
                    if 'pressure' in sv:
                        # pressure is the vertical axis, not a plotted variable
                        continue

                    z = ds[sv].values
                    fv = ds[sv]._FillValue
                    sv_units = ds[sv].units

                    # Check if the array is all NaNs
                    if sum(np.isnan(z)) == len(z):
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    if len(z[z != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    # reject erroneous data
                    dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max, lat, lon = \
                        cf.reject_erroneous_data(r, sv, tm, y, z, fv, ds_lat, ds_lon)

                    # get rid of 0.0 data
                    if 'CTD' in r:
                        ind = zpressure > 0.0
                    else:
                        ind = ndata > 0.0

                    lenzero = np.sum(~ind)
                    dtime = dtime[ind]
                    zpressure = zpressure[ind]
                    ndata = ndata[ind]
                    if ds_lat is not None and ds_lon is not None:
                        lat = lat[ind]
                        lon = lon[ind]
                    else:
                        lat = None
                        lon = None

                    t0 = pd.to_datetime(
                        dtime.min()).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(
                        dtime.max()).strftime('%Y-%m-%dT%H:%M:%S')
                    title = ' '.join((deployment, refdes,
                                      method)) + '\n' + t0 + ' to ' + t1

                    # reject time range from data portal file export
                    t_portal, z_portal, y_portal, lat_portal, lon_portal = \
                        cf.reject_timestamps_dataportal(subsite, r, dtime, zpressure, ndata, lat, lon)

                    print(
                        'removed {} data points using visual inspection of data'
                        .format(len(ndata) - len(z_portal)))

                    # create data groups
                    columns = ['tsec', 'dbar', str(sv)]
                    range1 = list(
                        range(surface_params[0], surface_params[1],
                              surface_params[2]))
                    range2 = list(
                        range(depth_params[0],
                              depth_params[1] + depth_params[2],
                              depth_params[2]))
                    ranges = range1 + range2

                    groups, d_groups = gt.group_by_depth_range(
                        t_portal, y_portal, z_portal, columns, ranges)

                    #  get percentile analysis for printing on the profile plot
                    # One value per pressure group. The same values are passed
                    # for both arguments, as separate list objects; a local
                    # name is used so the n_std parameter is not overwritten.
                    inpercentile = [surface_params[3]] * len(
                        range1) + [depth_params[3]] * len(range2)
                    n_std_groups = list(inpercentile)
                    y_plt, n_med, n_min, n_max, n0_std, n1_std, l_arr, time_ex = reject_timestamps_in_groups(
                        groups, d_groups, n_std_groups, inpercentile)

                    # file-name stem shared by the "all data" and "cleaned"
                    # figures; defined here so both sections can rely on it
                    sname = '-'.join((r, method, sv))

                    # ---- Plot all data ----
                    if len(tm) > 0:
                        cf.create_dir(save_dir_profile)
                        cf.create_dir(save_dir_xsection)
                        sfileall = '_'.join(('all_data', sname))

                        # profile plot
                        xlabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"
                        clabel = 'Time'

                        fig, ax = pf.plot_profiles(z,
                                                   y,
                                                   tm,
                                                   ylabel,
                                                   xlabel,
                                                   clabel,
                                                   stdev=None)

                        ax.set_title(title, fontsize=9)
                        fig.tight_layout()
                        pf.save_fig(save_dir_profile, sfileall)

                        # xsection plot
                        clabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"

                        fig, ax, bar = pf.plot_xsection(subsite,
                                                        tm,
                                                        y,
                                                        z,
                                                        clabel,
                                                        ylabel,
                                                        t_eng,
                                                        m_water_depth,
                                                        inpercentile=None,
                                                        stdev=None)

                        ax.set_title(title, fontsize=9)
                        fig.tight_layout()
                        pf.save_fig(save_dir_xsection, sfileall)

                    # ---- Plot cleaned-up data ----
                    if len(dtime) > 0:
                        sfile = '_'.join(('rm_erroneous_data', sname))

                        # legend text shared by the profile and xsection plots
                        removed_text = (
                            'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}], '
                            '{} zeros'.format(lenfv, lennan, lenev, lengr,
                                              global_min, global_max,
                                              lenzero) +
                            '\nexcluded {} suspect data points when inspected visually'
                            .format(len(ndata) - len(z_portal)))

                        # profile plot
                        xlabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"
                        clabel = 'Time'

                        fig, ax = pf.plot_profiles(z_portal,
                                                   y_portal,
                                                   t_portal,
                                                   ylabel,
                                                   xlabel,
                                                   clabel,
                                                   stdev=None)

                        ax.set_title(title, fontsize=9)
                        ax.plot(n_med, y_plt, '.k')
                        ax.fill_betweenx(y_plt,
                                         n0_std,
                                         n1_std,
                                         color='m',
                                         alpha=0.2)
                        # one-element tuple: a single legend label
                        leg_text = (
                            removed_text +
                            '\n(black) data median in {} dbar segments (break at {} dbar)'
                            .format([surface_params[2], depth_params[2]],
                                    depth_params[0]) +
                            '\n(magenta) upper and lower {} percentile envelope in {} dbar segments'
                            .format([surface_params[3], depth_params[3]],
                                    [surface_params[2], depth_params[2]]), )
                        ax.legend(leg_text,
                                  loc='upper center',
                                  bbox_to_anchor=(0.5, -0.17),
                                  fontsize=6)
                        fig.tight_layout()
                        pf.save_fig(save_dir_profile, sfile)

                        # xsection plot
                        clabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"

                        # plot non-erroneous data
                        fig, ax, bar = pf.plot_xsection(subsite,
                                                        t_portal,
                                                        y_portal,
                                                        z_portal,
                                                        clabel,
                                                        ylabel,
                                                        t_eng,
                                                        m_water_depth,
                                                        inpercentile=None,
                                                        stdev=None)

                        ax.set_title(title, fontsize=9)
                        leg_text = (removed_text, )
                        ax.legend(leg_text,
                                  loc='upper center',
                                  bbox_to_anchor=(0.5, -0.17),
                                  fontsize=6)
                        fig.tight_layout()
                        pf.save_fig(save_dir_xsection, sfile)

                        # 4D plot for gliders only
                        if 'MOAS' in r:
                            if ds_lat is not None and ds_lon is not None:
                                cf.create_dir(save_dir_4d)

                                clabel = sv + " (" + sv_units + ")"
                                zlabel = press[0] + " (" + y_units[0] + ")"

                                fig = plt.figure()
                                ax = fig.add_subplot(111, projection='3d')
                                sct = ax.scatter(lon_portal,
                                                 lat_portal,
                                                 y_portal,
                                                 c=z_portal,
                                                 s=2)
                                cbar = plt.colorbar(sct,
                                                    label=clabel,
                                                    extend='both')
                                cbar.ax.tick_params(labelsize=8)
                                ax.invert_zaxis()
                                ax.view_init(25, 32)
                                ax.invert_xaxis()
                                ax.invert_yaxis()
                                ax.set_zlabel(zlabel, fontsize=9)
                                ax.set_ylabel('Latitude', fontsize=9)
                                ax.set_xlabel('Longitude', fontsize=9)

                                ax.set_title(title, fontsize=9)
                                pf.save_fig(save_dir_4d, sfile)

Пример #8

Показать файл

Файл: plot_presf.py Проект: ooi-data-lab/data-review-tools

def main(sDir, url_list, start_time, end_time, preferred_only):
    """Plot the science variables of PRESF NetCDF files.

    One-dimensional variables are drawn as timeseries (all data, then with
    the ``stdev=5`` option of the plotting helper). Other variables are drawn
    with ``pf.plot_presf_2d`` before and, when values fall outside the global
    ranges, after those values are masked.

    :param sDir: root directory; figures are written under
        ``sDir/<array>/<subsite>/<refdes>/timeseries_plots/<deployment>``.
    :param url_list: catalog URLs. The second-to-last path component is split
        on ``-`` and items 1-4 form the reference designator; only those
        containing ``'PRESF'`` are processed.
    :param start_time: start of the time slice, or None. The slice is only
        applied when both ``start_time`` and ``end_time`` are given.
    :param end_time: end of the time slice, or None.
    :param preferred_only: the string ``'yes'`` keeps only files matching the
        preferred stream information; any other value keeps all files.
    """
    # Unique PRESF reference designators, in first-seen order.
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'PRESF' in rd:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))

        # Gather the PRESF NetCDF files of every URL belonging to this refdes.
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                for ud in udatasets:  # filter out collocated data files
                    if 'PRESF' in ud.split('/')[-1]:
                        datasets.append(ud)

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # row[ii] is not a string (nothing to join) - skip it
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join(
                            (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        fdatasets = np.unique(fdatasets).tolist()
        for fd in fdatasets:
            # Context manager: the file handle is released at the end of the
            # iteration, including when the iteration is cut short below.
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'.
                            format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = \
                    cf.nc_attributes(fd)
                sci_vars = cf.return_science_vars(stream)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes,
                                        'timeseries_plots', deployment)
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))
                # first two title lines shared by every figure of this file
                base_title = title + '\n' + t0 + ' - ' + t1

                for var in sci_vars:
                    print(var)
                    if var == 'id':
                        # 'id' is listed as a science variable but not plotted
                        continue

                    y = ds[var]
                    fv = y._FillValue
                    if len(y.dims) == 1:

                        # Check if the array is all NaNs
                        if sum(np.isnan(y.values)) == len(y.values):
                            print('Array of all NaNs - skipping plot.')

                        # Check if the array is all fill values
                        elif len(y[y != fv]) == 0:
                            print('Array of all fill values - skipping plot.')

                        else:
                            # reject fill values
                            ind = y.values != fv
                            t = tm[ind]
                            y = y[ind]

                            # Plot all data
                            fig, ax = pf.plot_timeseries(t,
                                                         y,
                                                         y.name,
                                                         stdev=None)
                            ax.set_title(base_title, fontsize=9)
                            sfile = '-'.join((filename, y.name, t0[:10]))
                            pf.save_fig(save_dir, sfile)

                            # Plot data with outliers removed
                            fig, ax = pf.plot_timeseries(t,
                                                         y,
                                                         y.name,
                                                         stdev=5)
                            ax.set_title(base_title, fontsize=9)
                            sfile = '-'.join(
                                (filename, y.name, t0[:10])) + '_rmoutliers'
                            pf.save_fig(save_dir, sfile)
                    else:
                        v = y.values.T
                        n_nan = np.sum(np.isnan(v))

                        # convert fill values to nans
                        try:
                            v[v == fv] = np.nan
                        except ValueError:
                            # NaN cannot be stored in the array as-is;
                            # retry on a float copy
                            v = v.astype(float)
                            v[v == fv] = np.nan
                        # NaNs introduced by the fill-value masking only
                        n_fv = np.sum(np.isnan(v)) - n_nan

                        # plot before global ranges are removed
                        fig, ax = pf.plot_presf_2d(tm, v, y.name, y.units)
                        ax.set_title(base_title, fontsize=9)
                        sfile = '-'.join((filename, var, t0[:10]))
                        pf.save_fig(save_dir, sfile)

                        # reject data outside of global ranges
                        [g_min, g_max] = cf.get_global_ranges(r, var)
                        if g_min is not None and g_max is not None:
                            v[v < g_min] = np.nan
                            v[v > g_max] = np.nan
                            # NaNs introduced by the global-range masking only
                            n_grange = np.sum(np.isnan(v)) - n_fv - n_nan

                            if n_grange > 0:
                                # don't plot if the array is all nans
                                if np.isnan(v).all():
                                    continue

                                # plot after global ranges are removed
                                fig, ax = pf.plot_presf_2d(
                                    tm, v, y.name, y.units)
                                title2 = 'removed: {} global ranges [{}, {}]'.format(
                                    n_grange, g_min, g_max)
                                ax.set_title(base_title + '\n' + title2,
                                             fontsize=9)
                                sfile = '-'.join(
                                    (filename, var, t0[:10], 'rmgr'))
                                pf.save_fig(save_dir, sfile)

Пример #9

Показать файл

def _spkir_fill_to_nan(vv):
    """Return the transposed values of ``vv`` with fill values replaced by NaN.

    Args:
        vv: data variable exposing ``_FillValue`` and ``values``.

    Returns:
        Tuple ``(v, n_nan, n_fv)``: the transposed value array, the number of
        NaNs present before the fill values were converted, and the number of
        fill values that were converted to NaN.
    """
    fv = vv._FillValue
    # NOTE: rounding SPKIR values to 1 decimal place (np.round(vv.values.T, 1)) was considered so that values
    # that round to zero (e.g. 1.55294e-05) are not excluded by the global range test; it is currently disabled.
    v = vv.values.T  # .T = transpose 2D array
    n_nan = np.sum(np.isnan(v))

    # convert fill values to nans
    v[v == fv] = np.nan
    n_fv = np.sum(np.isnan(v)) - n_nan
    return v, n_nan, n_fv


def _spkir_reject_global_ranges(v, r, var, n_nan, n_fv):
    """Set the values of ``v`` that fall outside the global ranges to NaN (in place).

    Args:
        v: value array, modified in place.
        r: reference designator passed to ``cf.get_global_ranges``.
        var: variable name passed to ``cf.get_global_ranges``.
        n_nan: number of NaNs originally present in ``v``.
        n_fv: number of fill values already converted to NaN in ``v``.

    Returns:
        Tuple ``(g_min, g_max, n_grange)``. ``n_grange`` is the number of values
        removed by the range test, or the string 'no global ranges' when either
        bound is missing.
    """
    [g_min, g_max] = cf.get_global_ranges(r, var)
    if g_min is not None and g_max is not None:
        v[v < g_min] = np.nan
        v[v > g_max] = np.nan
        n_grange = np.sum(np.isnan(v)) - n_fv - n_nan
    else:
        n_grange = 'no global ranges'
    return g_min, g_max, n_grange


def main(sDir, url_list, start_time, end_time, preferred_only):
    """Plot SPKIR science variables for whole deployments and for each calendar month.

    For every reference designator found in ``url_list`` the matching NetCDF
    files are opened one at a time. Each science variable is plotted before and
    after the global range test for the whole file, and after the global range
    test for every month covered by the file.

    Args:
        sDir: base directory under which the plots are saved.
        url_list: list of URLs; the reference designator is parsed from the
            second-to-last path element of each URL.
        start_time: optional lower bound used to slice the ``time`` dimension.
        end_time: optional upper bound used to slice the ``time`` dimension.
            Slicing only happens when both bounds are given.
        preferred_only: the string 'yes' restricts the files to the preferred
            streams reported by ``cf.get_preferred_stream_info``; any other
            value plots every file found.
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # non-string entry (e.g. a missing stream) - nothing to match
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join(
                            (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        fdatasets = np.unique(fdatasets).tolist()
        for fd in fdatasets:
            # the context manager closes the file once all plots for it are done
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'.
                            format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(
                    fd)
                sci_vars = cf.return_science_vars(stream)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes,
                                        'timeseries_plots')
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))

                # -------- plot entire deployment --------

                for var in sci_vars:
                    print(var)
                    vv = ds[var]
                    v, n_nan, n_fv = _spkir_fill_to_nan(vv)

                    # plot before global ranges are removed
                    fig, ax = pf.plot_spkir(tm, v, vv.name, vv.units)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                    sfile = '-'.join((filename, var, t0[:10]))
                    pf.save_fig(save_dir, sfile)

                    # reject data outside of global ranges
                    g_min, g_max, n_grange = _spkir_reject_global_ranges(
                        v, r, var, n_nan, n_fv)

                    # plot after global ranges are removed
                    fig, ax = pf.plot_spkir(tm, v, vv.name, vv.units)
                    title2 = 'removed: {} global ranges [{}, {}]'.format(
                        n_grange, g_min, g_max)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1 + '\n' + title2),
                                 fontsize=9)
                    sfile = '-'.join((filename, var, t0[:10], 'rmgr'))
                    pf.save_fig(save_dir, sfile)

                # -------- break the deployment into months and plot --------

                save_dir = os.path.join(sDir, array, subsite, refdes,
                                        'timeseries_plots', 'monthly')
                cf.create_dir(save_dir)

                # create list of start and end dates: walk the time range one day at a time and
                # record a boundary every time the month changes
                dt_start = dt.datetime.strptime(t0, '%Y-%m-%dT%H:%M:%S')
                dt_end = dt.datetime.strptime(t1, '%Y-%m-%dT%H:%M:%S')
                start_dates = [dt_start.strftime('%m-%d-%YT00:00:00')]
                end_dates = []
                ts1 = dt_start
                while ts1 <= dt_end:
                    ts2 = ts1 + dt.timedelta(days=1)
                    if ts2.month != ts1.month:
                        start_dates.append(ts2.strftime('%m-%d-%YT00:00:00'))
                        end_dates.append(ts1.strftime('%m-%d-%YT23:59:59'))
                    ts1 = ts2
                end_dates.append(dt_end.strftime('%m-%d-%YT23:59:59'))

                for sd, ed in zip(start_dates, end_dates):
                    sd_format = dt.datetime.strptime(sd, '%m-%d-%YT%H:%M:%S')
                    ed_format = dt.datetime.strptime(ed, '%m-%d-%YT%H:%M:%S')
                    ds_month = ds.sel(time=slice(sd_format, ed_format))
                    if len(ds_month['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'.
                            format(sd, ed))
                        continue
                    tm = ds_month['time'].values
                    t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')

                    for var in sci_vars:
                        print(var)
                        vv = ds_month[var]
                        v, n_nan, n_fv = _spkir_fill_to_nan(vv)

                        # reject data outside of global ranges
                        g_min, g_max, n_grange = _spkir_reject_global_ranges(
                            v, r, var, n_nan, n_fv)

                        # plot after global ranges are removed
                        fig, ax = pf.plot_spkir(tm, v, vv.name, vv.units)
                        title2 = 'removed: {} global ranges [{}, {}]'.format(
                            n_grange, g_min, g_max)
                        ax.set_title(
                            (title + '\n' + t0 + ' - ' + t1 + '\n' + title2),
                            fontsize=9)
                        # monthly files are keyed by year-month only
                        sfile = '-'.join((filename, var, t0[:7], 'rmgr'))
                        pf.save_fig(save_dir, sfile)

Пример #10

Показать файл

Файл: plot_timeseries_hpies.py Проект: ooi-data-lab/data-review-tools

def main(sDir, ncdir, start_time, end_time):
    """Plot timeseries for the variables of every NetCDF file found under a directory.

    Each ``.nc`` file below ``ncdir`` (files whose name contains '_blank' are
    skipped) is opened, optionally restricted to a time range, and every
    one-dimensional variable of interest is plotted twice: once with all data
    and once with ``stdev=5`` passed to ``pf.plot_timeseries``.

    Args:
        sDir: base directory under which the plots are saved.
        ncdir: directory searched (recursively) for ``.nc`` files. The
            second-to-last '/'-separated element of this string is printed as
            the reference designator.
        start_time: optional lower bound used to slice the ``time`` dimension.
        end_time: optional upper bound used to slice the ``time`` dimension.
            Slicing only happens when both bounds are given.
    """
    rd_list = [ncdir.split('/')[-2]]

    for r in rd_list:
        print('\n{}'.format(r))
        # store the full path of every file so that files located in
        # sub-directories of ncdir can actually be opened
        datasets = []
        for root, dirs, files in os.walk(ncdir):
            for f in files:
                if f.endswith('.nc'):
                    datasets.append(os.path.join(root, f))

        for fd in datasets:
            # test the file name only, not the directories leading to it
            if '_blank' in os.path.basename(fd):
                continue

            # the context manager closes the file once all plots for it are done
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})
                ds_vars = list(ds.data_vars.keys()) + [
                    x for x in ds.coords.keys() if 'pressure' in x
                ]  # get pressure variable from coordinates

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'
                            .format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(
                    fd)
                # science variables for NUTNR and VEL3D instruments, raw variables otherwise
                # (the original test compared against the literal string 'VEL3D in refdes',
                # which is always truthy, so the raw-variable branch was never taken)
                if 'NUTNR' in refdes or 'VEL3D' in refdes:
                    plot_vars = cf.return_science_vars(stream)
                else:
                    plot_vars = cf.return_raw_vars(ds_vars)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes,
                                        'timeseries_plots', deployment)
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))

                for var in plot_vars:
                    print(var)
                    if var in ['id', 'record_type', 'unique_id']:
                        continue
                    y = ds[var]
                    try:
                        fv = y._FillValue
                    except AttributeError:
                        # variable has no fill value attribute
                        fv = np.nan
                    if len(y.dims) != 1:
                        # only one-dimensional variables are plotted
                        continue

                    y[y == fv] = np.nan  # turn fill values to nans
                    # Check if the array is all NaNs
                    if np.isnan(y.values).all():
                        print(
                            'Array of all NaNs and/or fill values - skipping plot.'
                        )
                        continue

                    # reject fill values
                    ind = y.values != fv
                    t = tm[ind]
                    y = y[ind]

                    # Plot all data
                    fig, ax = pf.plot_timeseries(t,
                                                 y,
                                                 y.name,
                                                 stdev=None)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                 fontsize=9)
                    sfile = '-'.join((filename, y.name, t0[:10]))
                    pf.save_fig(save_dir, sfile)

                    # Plot data with outliers removed
                    fig, ax = pf.plot_timeseries(t,
                                                 y,
                                                 y.name,
                                                 stdev=5)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                 fontsize=9)
                    sfile = '-'.join((filename, y.name,
                                      t0[:10])) + '_rmoutliers'
                    pf.save_fig(save_dir, sfile)

Пример #11

Показать файл

def compare_plot_datasets(df, r, start_time, end_time, sDir, strm=None):
    """Plot matching science variables of every pair of columns in ``df`` against each other.

    Each row of ``df`` is visited in turn; every column after the first is
    paired with each column before it. A pair is processed only when both
    cells hold a list (of dataset files). The two datasets are opened,
    optionally limited to ``start_time``-``end_time``, and the science
    variables that share a ``long_name`` are plotted twice: with all data
    and with ``stdev=5`` passed to ``pf.plot_timeseries_compare``.

    Args:
        df: DataFrame whose index labels are used in titles/file names and
            whose cells hold lists of dataset files (or non-list placeholders).
        r: reference designator; its first '-'-separated element is the subsite.
        start_time: optional lower time bound (must support ``strftime``).
        end_time: optional upper time bound (must support ``strftime``).
        sDir: base directory under which the plots are saved.
        strm: optional string appended to the saved file names.
    """

    def _open_on_time(file_list):
        # one file is opened directly; several are combined and re-chunked along time
        if len(file_list) == 1:
            single = xr.open_dataset(file_list[0])
            return single.swap_dims({'obs': 'time'})
        combined = xr.open_mfdataset(file_list)
        combined = combined.swap_dims({'obs': 'time'})
        return combined.chunk({'time': 100})

    time_limited = start_time is not None and end_time is not None
    columns = df.columns

    for d, row in df.iterrows():
        print('\n{}'.format(d))
        for pos, n in enumerate(columns):
            # the first column has nothing before it to be compared with
            if pos == 0:
                continue
            f1 = row[n]
            if type(f1) != list:
                continue

            for earlier in columns[:pos]:
                f0 = row[earlier]
                if type(f0) != list:
                    continue

                compare = '{} {}'.format(earlier, n)

                # ---- first dataset of the pair ----
                ds0 = _open_on_time(f0)
                splt0 = compare.split(' ')[0].split('-')
                ds0_sci_vars = cf.return_science_vars(splt0[1])
                ds0_method = splt0[0]

                if time_limited:
                    ds0 = ds0.sel(time=slice(start_time, end_time))
                    if len(ds0['time'].values) == 0:
                        print(
                            'No {} data to plot for specified time range: ({} to {})'
                            .format(ds0_method, start_time, end_time))
                        continue

                # ---- second dataset of the pair ----
                ds1 = _open_on_time(f1)
                splt1 = compare.split(' ')[1].split('-')
                ds1_sci_vars = cf.return_science_vars(splt1[1])
                ds1_method = splt1[0]

                if time_limited:
                    ds1 = ds1.sel(time=slice(start_time, end_time))
                    if len(ds1['time'].values) == 0:
                        print(
                            'No {} data to plot for specified time range: ({} to {})'
                            .format(ds1_method, start_time, end_time))
                        continue

                t0 = ds0['time']
                t1 = ds1['time']

                # find where the variable long names are the same
                ds0names = long_names(ds0, ds0_sci_vars)
                ds0names.rename(columns={'name': 'name_ds0'}, inplace=True)
                ds1names = long_names(ds1, ds1_sci_vars)
                ds1names.rename(columns={'name': 'name_ds1'}, inplace=True)
                mapping = pd.merge(ds0names,
                                   ds1names,
                                   on='long_name',
                                   how='inner')
                separator = '----------------------'
                print(separator)
                print('{}: {}'.format(d, compare))
                print(separator)

                # output directory depends on whether a time range was requested
                subsite = r.split('-')[0]
                array = subsite[0:2]
                if time_limited:
                    stime = start_time.strftime('%Y-%m-%d')
                    etime = end_time.strftime('%Y-%m-%d')
                    ext = '-'.join((d, compare)) + '-' + stime + 'to' + etime
                    last_dir = ext
                else:
                    last_dir = '-'.join((ds0_method, ds1_method))
                save_dir = os.path.join(sDir, array, subsite, r,
                                        'method_compare_plots', last_dir)
                cf.create_dir(save_dir)

                for rr in mapping.itertuples():
                    index, name_ds0, long_name, name_ds1 = rr
                    print(long_name)

                    ds0_var = ds0[name_ds0]
                    ds1_var = ds1[name_ds1]

                    # reject NaNs
                    ds0_nonan = ds0_var.data[~np.isnan(ds0_var.data)]
                    ds1_nonan = ds1_var.data[~np.isnan(ds1_var.data)]

                    # only plot if both arrays have data
                    if not (len(ds0_nonan) > 0 and len(ds1_nonan) > 0):
                        continue

                    # first pass plots all data, second pass plots with outliers removed
                    for n_stdev in (None, 5):
                        fig, ax = pf.plot_timeseries_compare(
                            t0,
                            t1,
                            ds0_var,
                            ds1_var,
                            ds0_method,
                            ds1_method,
                            long_name,
                            stdev=n_stdev)

                        method_pair = '{} vs {}'.format(ds0_method, ds1_method)
                        title = ' '.join((d, r, method_pair))
                        ax.set_title(title, fontsize=9)

                        name_parts = [d, r, long_name]
                        if strm:
                            name_parts.append(strm)
                        if n_stdev is not None:
                            name_parts.append('rmoutliers')
                        sfile = '_'.join(name_parts)
                        pf.save_fig(save_dir, sfile)

Пример #12

Показать файл

Файл: plot_adcp_cabled.py Проект: ooi-data-lab/data-review-tools

def main(sDir, url_list, start_time, end_time, deployment_num, interval):
    """Plot cabled ADCP science variables per deployment and time interval.

    For every reference designator found in ``url_list`` the NetCDF files of
    each deployment are opened, sliced to the current time interval, and their
    science variables (plus ``time``, ``bin_depths`` and the four
    ``percent_good_beam*`` variables) are appended into in-memory arrays.
    One-dimensional variables are then plotted as timeseries and
    two-dimensional variables with ``pf.plot_adcp``, each with and without
    outlier removal.

    Args:
        sDir: base directory under which the plots are saved.
        url_list: list of URLs; the reference designator is parsed from the
            second-to-last path element of each URL.
        start_time: optional datetime; together with ``end_time`` it defines
            the single time range to plot.
        end_time: optional datetime; see ``start_time``. When either is None
            the deployment dates from the data review database are split with
            ``date_range(deploy_start, deploy_stop, 4)``.
        deployment_num: optional deployment number; when given, every other
            deployment is skipped.
        interval: optional 1-based index of the single interval to plot; when
            None every interval is plotted.
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        deployments = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
                for ud in udatasets:
                    # the deployment label is the file name up to the first underscore
                    if ud.split('/')[-1].split('_')[0] not in deployments:
                        deployments.append(ud.split('/')[-1].split('_')[0])
        datasets = list(itertools.chain(*datasets))
        datasets = cf.filter_collocated_instruments(r, datasets)
        deployments.sort()

        fdatasets = np.unique(datasets).tolist()
        for deploy in deployments:
            if deployment_num is not None:
                # compare by value: an identity test ("is not") on integers only works by accident
                if int(deploy[-4:]) != deployment_num:
                    print('\nskipping {}'.format(deploy))
                    continue

            rdatasets = [s for s in fdatasets if deploy in s]

            # break deployment into 4 segments or make a list of the time range specified
            if start_time is not None and end_time is not None:
                dt_range = [dt.datetime.strftime(start_time, '%Y-%m-%d'), dt.datetime.strftime(end_time, '%Y-%m-%d')]
            else:
                # Get deployment info from the data review database
                dr_data = cf.refdes_datareview_json(r)
                d_info = [x for x in dr_data['instrument']['deployments'] if x['deployment_number'] == int(deploy[-4:])]
                d_info = d_info[0]
                deploy_start = dt.datetime.strptime(str(d_info['start_date']).split('T')[0], '%Y-%m-%d')
                deploy_stop = dt.datetime.strptime(str(d_info['stop_date']).split('T')[0], '%Y-%m-%d') + dt.timedelta(
                    days=1)
                dt_range = list(date_range(deploy_start, deploy_stop, 4))

            # containers the data of this deployment are appended to
            sci_vars_dict = {'time': dict(values=np.array([], dtype=np.datetime64), fv=[], ln=[]),
                             'bin_depths': dict(values=np.array([]), units=[], fv=[], ln=[])}
            percentgood = {'percent_good_beam1': dict(values=np.array([])),
                           'percent_good_beam2': dict(values=np.array([])),
                           'percent_good_beam3': dict(values=np.array([])),
                           'percent_good_beam4': dict(values=np.array([]))}

            # File attributes and the science variable list are read once per deployment.
            # sci_vars_dict is rebuilt for every deployment, so its science-variable keys
            # must be re-registered each time; otherwise sci_vars_dict[var] below raises
            # KeyError from the second deployment on.
            attrs_loaded = False

            if interval is None:
                toplot = range(len(dt_range) - 1)
            else:
                toplot = [interval - 1]

            for dtri in toplot:
                stime = dt.datetime.strptime(dt_range[dtri], '%Y-%m-%d')
                etime = dt.datetime.strptime(dt_range[dtri + 1], '%Y-%m-%d')
                if len(rdatasets) > 0:
                    for i in range(len(rdatasets)):
                        ds = xr.open_dataset(rdatasets[i], mask_and_scale=False)
                        ds = ds.swap_dims({'obs': 'time'})
                        print('\nAppending data from {}: file {} of {}'.format(deploy, i + 1, len(rdatasets)))

                        ds = ds.sel(time=slice(stime, etime))
                        if len(ds['time'].values) == 0:
                            # report the interval that was actually used for slicing
                            print('No data to plot for specified time range: ({} to {})'.format(stime, etime))
                            continue

                        if attrs_loaded:
                            print(fname)
                        else:
                            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(rdatasets[0])
                            array = subsite[0:2]
                            sci_vars = cf.return_science_vars(stream)
                            # drop the following list of key words from science variables list
                            sci_vars = notin_list(sci_vars, ['salinity', 'temperature', 'bin_depths', 'beam'])
                            sci_vars = [name for name in sci_vars if ds[name].units != 'mm s-1']

                            for sci_var in sci_vars:
                                sci_vars_dict.update({sci_var: dict(values=np.array([]), units=[], fv=[], ln=[])})
                            attrs_loaded = True

                        # append data for the deployment into a dictionary
                        for s_v, info in sci_vars_dict.items():
                            print(s_v)
                            vv = ds[s_v]
                            try:
                                if vv.units not in info['units']:
                                    info['units'].append(vv.units)
                            except AttributeError:
                                print('no units')
                            try:
                                if vv._FillValue not in info['fv']:
                                    info['fv'].append(vv._FillValue)
                            except AttributeError:
                                print('no fill value')

                            try:
                                if vv.long_name not in info['ln']:
                                    info['ln'].append(vv.long_name)
                            except AttributeError:
                                print('no long name')

                            if len(vv.dims) == 1:
                                info['values'] = np.append(info['values'], vv.values)
                            else:
                                # 2D variables are stored transposed and joined along the second axis
                                if len(info['values']) == 0:
                                    info['values'] = vv.values.T
                                else:
                                    info['values'] = np.concatenate((info['values'], vv.values.T), axis=1)

                        # append percent good beams
                        for j, k in percentgood.items():
                            pgvv = ds[j]
                            fv_pgvv = pgvv._FillValue
                            # cast to float so fill values can be replaced by NaN
                            pgvv = pgvv.values.T.astype(float)
                            pgvv[pgvv == fv_pgvv] = np.nan
                            if len(k['values']) == 0:
                                k['values'] = pgvv
                            else:
                                k['values'] = np.concatenate((k['values'], pgvv), axis=1)

                    if len(sci_vars_dict['time']['values']) > 0:
                        filename = '_'.join(fname.split('_')[:-1])
                        save_dir = os.path.join(sDir, array, subsite, refdes, 'plots', deployment)
                        cf.create_dir(save_dir)

                        tm = sci_vars_dict['time']['values']
                        t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                        title_text = ' '.join((deployment, refdes, method))

                        bd = sci_vars_dict['bin_depths']
                        ylabel = 'bin_depths ({})'.format(bd['units'][0])

                        print('\nPlotting interval {}'.format(int(dtri) + 1))
                        for var in sci_vars:
                            print('----{}'.format(var))
                            v = sci_vars_dict[var]
                            fv = v['fv'][0]
                            v_name = v['ln'][0]
                            units = v['units'][0]

                            if len(np.shape(v['values'])) == 1:
                                v, n_nan, n_fv, n_ev, n_grange, g_min, g_max, n_std = reject_err_data_1_dims(v['values'], fv, r, var, n=5)

                                if len(tm) > np.sum(np.isnan(v)):  # only plot if the array contains values
                                    # Plot all data
                                    fig, ax = pf.plot_timeseries(tm, v, v_name, stdev=None)
                                    ax.set_title((title_text + '\n' + t0 + ' - ' + t1), fontsize=9)
                                    sfile = '-'.join((filename, v_name, t0[:10]))
                                    pf.save_fig(save_dir, sfile)

                                    # Plot data with outliers removed
                                    fig, ax = pf.plot_timeseries(tm, v, v_name, stdev=5)
                                    title_i = 'removed: {} nans, {} fill values, {} extreme values, {} GR [{}, {}],' \
                                              ' {} outliers +/- 5 SD'.format(n_nan, n_fv , n_ev, n_grange, g_min, g_max, n_std)

                                    ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + title_i), fontsize=8)
                                    sfile = '-'.join((filename, v_name, t0[:10])) + '_rmoutliers'
                                    pf.save_fig(save_dir, sfile)
                                else:
                                    print('Array of all nans - skipping plot')

                            else:
                                v, n_nan, n_fv, n_ev, n_bb, n_grange, g_min, g_max = reject_err_data_2_dims(v['values'], percentgood, fv, r, var)

                                clabel = '{} ({})'.format(var, units)

                                # check bin depths for extreme values
                                y = bd['values']
                                # if all the values are negative, take the absolute value (cabled data bin depths are negative)
                                if int(np.nanmin(y)) < 0 and int(np.nanmax(y)) < 0:
                                    y = abs(y)
                                y_nan = np.sum(np.isnan(y))
                                y = np.where(y < 6000, y, np.nan)  # replace extreme bin_depths by nans
                                bin_nan = np.sum(np.isnan(y)) - y_nan
                                bin_title = 'removed: {} bin depths > 6000'.format(bin_nan)

                                if 'echo' in var:
                                    color = 'BuGn'
                                else:
                                    color = 'RdBu'

                                new_y = dropna(y, axis=1)  # convert to DataFrame to drop nan
                                # keep only the rows/columns that survived in the bin depth frame
                                y_mask = new_y.loc[list(new_y.index), list(new_y.columns)]
                                v_new = pd.DataFrame(v)
                                v_mask = v_new.loc[list(new_y.index), list(new_y.columns)]
                                tm_mask = tm[new_y.columns]

                                fig, ax, __ = pf.plot_adcp(tm_mask, np.array(y_mask), np.array(v_mask), ylabel, clabel, color,
                                                           n_stdev=None)

                                if bin_nan > 0:
                                    ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + bin_title), fontsize=8)
                                else:
                                    ax.set_title((title_text + '\n' + t0 + ' - ' + t1), fontsize=8)

                                sfile = '-'.join((filename, var, t0[:10]))
                                pf.save_fig(save_dir, sfile)

                                fig, ax, n_nans_all = pf.plot_adcp(tm_mask, np.array(y_mask), np.array(v_mask), ylabel, clabel, color, n_stdev=5)
                                title_i = 'removed: {} nans, {} fill values, {} extreme values, {} bad beams, {} GR [{}, {}]'.format(
                                    n_nan, n_fv, n_ev, n_bb, n_grange, g_min, g_max)

                                if bin_nan > 0:
                                    ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + title_i + '\n' + bin_title), fontsize=8)
                                else:
                                    ax.set_title((title_text + '\n' + t0 + ' - ' + t1 + '\n' + title_i), fontsize=8)

                                sfile = '-'.join((filename, var, t0[:10])) + '_rmoutliers'
                                pf.save_fig(save_dir, sfile)

Пример #13

Показать файл

Файл: plot_timeseries.py Проект: leilabbb/data-review-tools

def main(sDir, url_list, start_time, end_time, preferred_only):
    """Plot a timeseries of each 1-D variable for every dataset in ``url_list``.

    Datasets are grouped by reference designator (fields 1-4 of the
    second-to-last URL path segment, split on '-').  For each variable that
    is not entirely NaN or entirely fill values, two figures are saved: one
    of all data and one produced with ``stdev=5`` passed to
    ``pf.plot_timeseries`` (file name suffixed with ``_rmoutliers``).

    :param sDir: root directory under which the plot directories are created
    :param url_list: list of data catalog URLs
    :param start_time: start of the time range to plot, or None
    :param end_time: end of the time range to plot, or None; the range is
        only applied when both bounds are given
    :param preferred_only: 'yes' to keep only the datasets matching the
        streams/deployments returned by ``cf.get_preferred_stream_info``;
        any other value keeps all datasets
    """
    # collect the unique reference designators, preserving first-seen order
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # non-string cell (e.g. a missing value): nothing to match
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        fdatasets = np.unique(fdatasets).tolist()
        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for fd in fdatasets:
            if '_blank' in fd:
                continue

            # the context manager closes the file on every exit path,
            # including the early 'continue' below
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})
                # data variables plus any pressure variable stored as a coordinate
                ds_vars = list(ds.data_vars.keys()) + [x for x in ds.coords.keys() if 'pressure' in x]

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                if 'NUTNR' in refdes:
                    plot_vars = cf.return_science_vars(stream)
                else:
                    plot_vars = cf.return_raw_vars(ds_vars)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes, 'timeseries_plots', deployment)
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))

                for var in plot_vars:
                    print(var)
                    if var == 'id':
                        continue

                    y = ds[var]
                    # variables without a _FillValue attribute fall back to NaN
                    fv = getattr(y, '_FillValue', np.nan)
                    if len(y.dims) != 1:
                        # only 1-D variables are plotted as timeseries
                        continue

                    values = y.values
                    if np.isnan(values).all():
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # mask of the samples that are not fill values
                    ind = values != fv
                    if not ind.any():
                        print('Array of all fill values - skipping plot.')
                        continue

                    # reject fill values
                    t = tm[ind]
                    y = y[ind]

                    # first all data, then the same data with outliers removed
                    for stdev, suffix in ((None, ''), (5, '_rmoutliers')):
                        fig, ax = pf.plot_timeseries(t, y, y.name, stdev=stdev)
                        ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                        sfile = '-'.join((filename, y.name, t0[:10])) + suffix
                        pf.save_fig(save_dir, sfile)

Пример #14

Показать файл

Файл: plot_xsection.py Проект: ooi-data-lab/data-review-tools

def main(sDir, f, start_time, end_time):
    """Create pressure/time cross-section plots for each dataset listed in a CSV file.

    The CSV file ``f`` (located in ``sDir``) must have an ``outputUrl``
    column.  For every science variable of every dataset two figures are
    saved: one of all data and one produced with ``stdev=5`` passed to
    ``pf.plot_xsection`` (file name suffixed with ``rmoutliers``).

    :param sDir: directory containing ``f``; also the root of the output tree
    :param f: name of the CSV file listing the data URLs
    :param start_time: start of the time range to plot, or None
    :param end_time: end of the time range to plot, or None; the range is
        only applied when both bounds are given
    """
    review_table = pd.read_csv(os.path.join(sDir, f))
    url_list = review_table['outputUrl'].tolist()
    n_urls = len(url_list)

    for url_number, u in enumerate(url_list, start=1):
        print('\nUrl {} of {}: {}'.format(url_number, n_urls, u))
        main_sensor = u.split('/')[-2].split('-')[4]
        datasets_sel = cf.filter_collocated_instruments(main_sensor, cf.get_nc_urls([u]))
        n_datasets = len(datasets_sel)

        for dataset_number, d in enumerate(datasets_sel, start=1):
            print('\nDataset {} of {}: {}'.format(dataset_number, n_datasets, d))
            with xr.open_dataset(d, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                # restrict to the requested window only when both bounds exist
                if not (start_time is None or end_time is None):
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(d)

                # for glider CTDs, pressure is a coordinate
                is_glider_ctd = 'MOAS' in subsite and 'CTD' in main_sensor
                if is_glider_ctd:
                    pressure = 'sci_water_pressure_dbar'
                else:
                    pressure = pf.pressure_var(ds, ds.data_vars.keys())

                # science variables, without the pressure variable itself
                sci_vars = [name for name in cf.return_science_vars(stream) if name != pressure]

                save_dir = os.path.join(sDir, subsite, refdes, 'xsection_plots', deployment)
                cf.create_dir(save_dir)

                t = ds['time'].values
                time_fmt = '%Y-%m-%dT%H:%M:%S'
                t0 = pd.to_datetime(t.min()).strftime(time_fmt)
                t1 = pd.to_datetime(t.max()).strftime(time_fmt)
                plot_title = ' '.join((deployment, refdes, method)) + '\n' + t0 + ' - ' + t1

                y = ds[pressure]
                file_stem = fname[:-46]

                print('Plotting variables...')
                for var in sci_vars:
                    print(var)
                    z = ds[var]

                    clabel = var + " (" + z.units + ")"
                    ylabel = pressure + " (" + y.units + ")"

                    # first pass: all data; second pass: outliers removed
                    passes = (
                        (None, (file_stem, z.name)),
                        (5, (file_stem, z.name, 'rmoutliers')),
                    )
                    for stdev, name_parts in passes:
                        fig, ax = pf.plot_xsection(subsite, t, y, z, clabel, ylabel, stdev=stdev)
                        ax.set_title(plot_title, fontsize=9)
                        pf.save_fig(save_dir, '_'.join(name_parts))

Пример #15

Показать файл

Файл: plot_deployment_3Dcolor_scatter.py Проект: ooi-data-lab/data-review-tools

def main(url_list, sDir, plot_type, deployment_num, start_time, end_time, preferred_only, glider, zdbar, n_std, inpercentile, zcell_size):
    """Plot pressure/time cross-sections of science variables before and after removing suspect data.

    Datasets are grouped by reference designator (fields 1-4 of the
    second-to-last URL path segment, split on '-'; engineering ('ENG')
    designators are skipped).  For each science variable two figures may be
    saved: ``rm_erroneous_data_*`` after ``cf.reject_erroneous_data`` and the
    removal of non-positive values, and ``rm_suspect_data_*`` after the
    additional percentile, data-portal and depth filters (only when those
    filters actually removed something).

    :param url_list: list of data catalog URLs
    :param sDir: root directory under which the plot directories are created
    :param plot_type: name of the sub-directory the plots are saved in
    :param deployment_num: integer deployment number to plot, or None for all
    :param start_time: start of the time range (must provide ``strftime``), or None
    :param end_time: end of the time range (must provide ``strftime``), or None;
        the range is only applied when both bounds are given
    :param preferred_only: 'yes' to keep only the datasets matching the
        streams/deployments returned by ``cf.get_preferred_stream_info``
    :param glider: 'no' disables the bathymetry overlay; 'yes' additionally
        trims the overlay to the time span of the filtered data
    :param zdbar: data at pressures >= this value are excluded from the
        second plot; a falsy value disables the filter
    :param n_std: passed through to ``cf.reject_timestamps_in_groups``
    :param inpercentile: passed through to ``cf.reject_timestamps_in_groups``
        and ``pf.plot_xsection``
    :param zcell_size: integer step used to build the depth ranges for grouping
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'ENG' not in rd:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # non-string cell (e.g. a missing value): nothing to match
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for fd in fdatasets_sel:
            part_d = fd.split('/')[-1]
            print(part_d)

            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
            array = subsite[0:2]

            # Skip unwanted deployments before opening any file.  The number is
            # taken from the trailing digits of the deployment string and
            # compared by value ('is not' on ints tests identity, and
            # splitting on '0' breaks for numbers such as 10 or 101).
            if deployment_num is not None:
                dep_digits = deployment[len(deployment.rstrip('0123456789')):]
                if int(dep_digits) != deployment_num:
                    continue

            sci_vars = cf.return_science_vars(stream)

            # Bathymetry is reset for every dataset so that a missing
            # engineering file cannot reuse values from a previous dataset.
            t_eng = None
            m_water_depth = None
            if 'CE05MOAS' in r or 'CP05MOAS' in r:  # for coastal gliders, get m_water_depth for bathymetry
                eng = '-'.join((r.split('-')[0], r.split('-')[1], '00-ENG000000', method, 'glider_eng'))
                eng_url = [s for s in url_list if eng in s]
                if len(eng_url) == 1:
                    eng_datasets = cf.get_nc_urls(eng_url)
                    # filter out collocated datasets
                    eng_dataset = [j for j in eng_datasets if (eng in j.split('/')[-1] and deployment in j.split('/')[-1])]
                    if len(eng_dataset) > 0:
                        with xr.open_dataset(eng_dataset[0], mask_and_scale=False) as ds_eng:
                            t_eng = ds_eng['time'].values
                            m_water_depth = ds_eng['m_water_depth'].values
                            # m_altimeter_status = 0 means a good reading (not nan or -1)
                            eng_ind = ds_eng['m_altimeter_status'].values == 0
                        m_water_depth = m_water_depth[eng_ind]
                        t_eng = t_eng[eng_ind]
                    else:
                        print('No engineering file for deployment {}'.format(deployment))

            # the context manager closes the file on every exit path
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue
                    stime = start_time.strftime('%Y-%m-%d')
                    etime = end_time.strftime('%Y-%m-%d')
                    ext = stime + 'to' + etime
                    save_dir = os.path.join(sDir, array, subsite, refdes, plot_type, deployment, ext)
                else:
                    save_dir = os.path.join(sDir, array, subsite, refdes, plot_type, deployment)

                cf.create_dir(save_dir)

                tm = ds['time'].values

                # get pressure variable
                y, y_units, press = cf.add_pressure_to_dictionary_of_sci_vars(ds)
                print(y_units, press)

                for sv in sci_vars:
                    print(sv)
                    if 'sci_water_pressure' in sv:
                        continue

                    z = ds[sv].values
                    fv = ds[sv]._FillValue
                    z_units = ds[sv].units

                    # Check if the array is all NaNs
                    if sum(np.isnan(z)) == len(z):
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    if len(z[z != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    # ---- clean up data ----
                    # reject erroneous data
                    dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max = \
                        cf.reject_erroneous_data(r, sv, tm, y, z, fv)

                    # get rid of 0.0 data
                    if 'CTD' in r:
                        ind = zpressure > 0.0
                    else:
                        ind = ndata > 0.0

                    lenzero = np.sum(~ind)
                    dtime = dtime[ind]
                    zpressure = zpressure[ind]
                    ndata = ndata[ind]

                    # min()/max() below raise on empty input, so stop here
                    if len(dtime) == 0:
                        print('No data remaining after removing erroneous data - skipping plot.')
                        continue

                    # creating data groups
                    columns = ['tsec', 'dbar', str(sv)]
                    min_r = int(round(min(zpressure) - zcell_size))
                    max_r = int(round(max(zpressure) + zcell_size))
                    ranges = list(range(min_r, max_r, zcell_size))

                    groups, d_groups = gt.group_by_depth_range(dtime, zpressure, ndata, columns, ranges)

                    # rejecting timestamps from percentile analysis
                    y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr, time_ex = cf.reject_timestamps_in_groups(
                        groups, d_groups, n_std, inpercentile)

                    t_nospct, z_nospct, y_nospct = cf.reject_suspect_data(dtime, zpressure, ndata, time_ex)

                    n_percentile = len(zpressure) - len(z_nospct)
                    print('removed {} data points using {} percentile of data grouped in {} dbar segments'.format(
                        n_percentile, inpercentile, zcell_size))

                    # reject time range from data portal file export
                    t_portal, z_portal, y_portal = cf.reject_timestamps_dataportal(subsite, r,
                                                                                   t_nospct, y_nospct, z_nospct)
                    n_visual = len(z_nospct) - len(z_portal)
                    print('removed {} data points using visual inspection of data'.format(n_visual))

                    # reject data in a depth range
                    if zdbar:
                        y_ind = y_portal < zdbar
                        n_zdbar = np.sum(~y_ind)
                        t_array = t_portal[y_ind]
                        y_array = y_portal[y_ind]
                        z_array = z_portal[y_ind]
                    else:
                        n_zdbar = 0
                        t_array = t_portal
                        y_array = y_portal
                        z_array = z_portal
                    print('{} in water depth > {} dbar'.format(n_zdbar, zdbar))

                    # ---- plot data ----
                    sname = '-'.join((r, method, sv))

                    clabel = sv + " (" + z_units + ")"
                    ylabel = press[0] + " (" + y_units[0] + ")"

                    # Per-variable bathymetry: trimming it below must not
                    # shrink the engineering data seen by later variables.
                    if glider == 'no':
                        t_bathy = None
                        depth_bathy = None
                    else:
                        t_bathy = t_eng
                        depth_bathy = m_water_depth

                    erroneous_text = (
                        'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}], '
                        '{} zeros'
                    ).format(lenfv, lennan, lenev, lengr, global_min, global_max, lenzero)

                    # plot non-erroneous data
                    fig, ax, __ = pf.plot_xsection(subsite, dtime, zpressure, ndata, clabel, ylabel,
                                                   t_bathy, depth_bathy, inpercentile, stdev=None)

                    t0 = pd.to_datetime(dtime.min()).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(dtime.max()).strftime('%Y-%m-%dT%H:%M:%S')
                    title = ' '.join((deployment, refdes, method)) + '\n' + t0 + ' to ' + t1

                    ax.set_title(title, fontsize=9)
                    # one-element tuple: a single legend entry holding the summary
                    leg_text = (erroneous_text,)
                    ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                    fig.tight_layout()
                    sfile = '_'.join(('rm_erroneous_data', sname))
                    pf.save_fig(save_dir, sfile)

                    # plots removing all suspect data (only if the extra filters removed anything)
                    if len(t_array) > 0 and len(t_array) != len(dtime):
                        # plot bathymetry only within data time ranges
                        if glider == 'yes' and t_bathy is not None:
                            eng_ind = (t_bathy >= np.min(t_array)) & (t_bathy <= np.max(t_array))
                            t_bathy = t_bathy[eng_ind]
                            depth_bathy = depth_bathy[eng_ind]

                        fig, ax, __ = pf.plot_xsection(subsite, t_array, y_array, z_array, clabel, ylabel,
                                                       t_bathy, depth_bathy, inpercentile, stdev=None)

                        t0 = pd.to_datetime(t_array.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(t_array.max()).strftime('%Y-%m-%dT%H:%M:%S')
                        title = ' '.join((deployment, refdes, method)) + '\n' + t0 + ' to ' + t1

                        ax.set_title(title, fontsize=9)
                        suspect_text = (
                            erroneous_text
                            + '\nremoved {} in the upper and lower {}th percentile of data grouped in {} dbar segments'.format(
                                n_percentile, inpercentile, zcell_size)
                            + '\nexcluded {} suspect data points when inspected visually'.format(n_visual)
                        )
                        if zdbar:
                            suspect_text += '\nexcluded {} suspect data in water depth greater than {} dbar'.format(
                                n_zdbar, zdbar)
                        leg_text = (suspect_text,)
                        ax.legend(leg_text, loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                        fig.tight_layout()

                        sfile = '_'.join(('rm_suspect_data', sname))
                        pf.save_fig(save_dir, sfile)

Пример #16

Показать файл

Файл: plot_velocity.py Проект: ooi-data-lab/data-review-tools

def plot_velocity_variables(r, fdatasets, num_plots, save_dir):
    """Plot velocity, pressure, roll and pitch for each dataset, flagging tilted samples.

    Five figures are created, each with ``num_plots`` stacked panels (one
    panel per dataset): current vectors (quiver), u/v/w components, pressure,
    roll and pitch.  Samples where pitch or roll exceeds 200 (decidegree
    variables) are over-plotted in red, and the legend label gives the
    percentage of flagged samples.  The figures are saved in ``save_dir``;
    each file name is the figure's base name followed by the deployment
    numbers of all plotted datasets.

    :param r: reference designator, passed to ``reject_err_data_1_dims`` and
        ``prepare_axis``
    :param fdatasets: list of dataset paths/URLs; the deployment number is
        parsed from the file name
    :param num_plots: number of panels per figure (should be >= len(fdatasets))
    :param save_dir: directory in which the figures are saved
    """

    def _new_panel_stack():
        # np.atleast_1d: subplots() returns a bare Axes (not an array) when
        # num_plots == 1, which would break the axes[ii] indexing below.
        figure, axes = pyplot.subplots(nrows=num_plots, ncols=1, sharey=True)
        figure.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
        return figure, np.atleast_1d(axes)

    def _read(ds, var_name):
        # values, units, long name and fill value of one dataset variable
        da = ds[var_name]
        return da.values, da.units, da.long_name, da._FillValue

    fig, ax = _new_panel_stack()
    fig_file = 'calculated_currents_plot'

    fig0, ax0 = _new_panel_stack()
    fig0_file = 'uvw_plots'

    fig1, ax1 = _new_panel_stack()
    fig1_file = 'pressure_plots'

    fig2, ax2 = _new_panel_stack()
    fig2_file = 'roll_plots'

    fig3, ax3 = _new_panel_stack()
    fig3_file = 'pitch_plots'

    n_datasets = len(fdatasets)

    # hide the panels that have no dataset (done once, not once per dataset)
    for jj in range(n_datasets, num_plots):
        for panels in (ax, ax0, ax1, ax2, ax3):
            panels[jj].axis('off')

    for ii, dataset in enumerate(fdatasets):
        print('\n', dataset.split('/')[-1])
        deployment = int(dataset.split('/')[-1].split('_')[0].split('deployment')[-1])

        # everything needed is read into memory, so the file can be closed
        # before plotting
        with xr.open_dataset(dataset, mask_and_scale=False) as ds:
            time = ds['time'].values

            # ---- science variables ----
            # NOTE(review): the variable name is passed to
            # reject_err_data_1_dims (as for roll/pitch below); the previous
            # code passed only the first character of long_name - confirm
            # against that helper's signature.
            sci_var = cf.return_science_vars(ds.stream)

            z_var = [name for name in sci_var if 'pressure' in name][0]
            z, z_unit, z_name, z_fill = _read(ds, z_var)
            z, err_count_z = reject_err_data_1_dims(z, z_fill, r, z_var, n=5)

            w_var = [name for name in sci_var if 'upward_velocity' in name][0]
            w, __, __, w_fill = _read(ds, w_var)
            w, __ = reject_err_data_1_dims(w, w_fill, r, w_var, n=5)

            u_var = [name for name in sci_var if 'eastward_velocity' in name][0]
            u, u_unit, __, u_fill = _read(ds, u_var)
            u, err_count_u = reject_err_data_1_dims(u, u_fill, r, u_var, n=5)

            v_var = [name for name in sci_var if 'northward_velocity' in name][0]
            v, __, __, v_fill = _read(ds, v_var)
            v, __ = reject_err_data_1_dims(v, v_fill, r, v_var, n=5)

            # ---- non science variables ----
            # According to VELPT manufacturer, data are suspect when this
            # instrument is tilted more than 20 degrees
            # redmine ticket: Marine Hardware #12960
            roll, roll_unit, roll_name, roll_fill = _read(ds, 'roll_decidegree')
            roll, err_count_r = reject_err_data_1_dims(roll, roll_fill, r, 'roll_decidegree', n=5)

            pitch, pitch_units, pitch_name, pitch_fill = _read(ds, 'pitch_decidegree')
            pitch, err_count_p = reject_err_data_1_dims(pitch, pitch_fill, r, 'pitch_decidegree', n=5)

        # nanmax: the builtin max() is order-dependent when NaNs are present
        uv_magnitude = np.sqrt(u ** 2 + v ** 2)
        uv_maxmag = np.nanmax(uv_magnitude)

        # NOTE(review): only positive tilt is flagged; tilts below -200 are
        # not - confirm whether an absolute value is intended.
        tilt_ind = np.logical_or(pitch > 200, roll > 200)

        # The same mask is applied to every variable, so the flagged
        # percentage is identical for all panels and is computed once.
        n_total = len(u)
        n_flagged = int(np.count_nonzero(tilt_ind))
        percent_good = ((n_total - n_flagged) / n_total) * 100
        flagged_label = str(round(100 - percent_good, 3)) + '%'

        # ---- pressure, roll and pitch panels ----
        flagged_panels = (
            (ax1[ii], z, z_name, z_unit, err_count_z),
            (ax2[ii], roll, roll_name, roll_unit, err_count_r),
            (ax3[ii], pitch, pitch_name, pitch_units, err_count_p),
        )
        for axis, values, name, unit, err_count in flagged_panels:
            axis.plot(time, values, color='b', linestyle='--', linewidth=.6)
            axis.plot(time[tilt_ind], values[tilt_ind], color='r', linestyle='None', marker='.',
                      markersize=0.5, label=flagged_label)
            prepare_axis(r, time, deployment, axis, ii, n_datasets, name, unit, err_count)

        fig1_file += str(deployment)
        fig2_file += str(deployment)
        fig3_file += str(deployment)

        # ---- 1D quiver plot ----
        quiver_style = dict(units='y', scale_units='y', scale=1, headlength=1,
                            headaxislength=1, width=0.004, alpha=0.5)
        ax[ii].quiver(time, 0, u, v, color='b', **quiver_style)
        ax[ii].quiver(time[tilt_ind], 0, u[tilt_ind], v[tilt_ind], color='r',
                      label=flagged_label, **quiver_style)

        ax[ii].set_ylim(-uv_maxmag, uv_maxmag)
        prepare_axis(r, time, deployment, ax[ii], ii, n_datasets, 'Current Velocity', u_unit, err_count_u)

        fig_file += str(deployment)

        # ---- u, v and w components ----
        ax0[ii].plot(time, v, color='b', linestyle='--', linewidth=.6, label='V')
        ax0[ii].plot(time, u, color='g', linestyle='--', linewidth=.6, label='U')
        ax0[ii].plot(time, w, color='m', linestyle='--', linewidth=.6, label='W')

        prepare_axis(r, time, deployment, ax0[ii], ii, n_datasets, 'Velocity Components', u_unit, err_count_u)

        fig0_file += str(deployment)

    # save, then close each figure so it does not stay alive in pyplot
    outputs = (
        (fig1, fig1_file),
        (fig, fig_file),
        (fig0, fig0_file),
        (fig2, fig2_file),
        (fig3, fig3_file),
    )
    for figure, file_name in outputs:
        save_file = os.path.join(save_dir, file_name)
        figure.savefig(str(save_file), dpi=150, bbox_inches='tight')
        pyplot.close(figure)

Пример #17

Показать файл

Файл: plot_profile_xsection_cabled.py Проект: ooi-data-lab/data-review-tools

def main(url_list, sDir, deployment_num, start_time, end_time, preferred_only,
         n_std, inpercentile, zcell_size, zdbar):
    """Plot profile and cross-section figures per science variable and deployment.

    For every reference designator found in ``url_list`` (entries containing
    'ENG' or 'ADCP' are ignored) the NetCDF files of each deployment are
    concatenated variable by variable, filtered through the ``cf`` helpers and
    plotted against pressure as a profile plot and a cross-section plot.

    Args:
        url_list: Catalog URLs. The second-to-last path element of each URL is
            split on '-' and fields 1-4 are joined to form the reference
            designator.
        sDir: Root directory under which the plot directories are created.
        deployment_num: Deployment number to plot, or None to plot all.
        start_time: Optional start of the time range. The range is only
            applied when both ``start_time`` and ``end_time`` are given; both
            must support ``strftime``.
        end_time: Optional end of the time range (see ``start_time``).
        preferred_only: 'yes' to keep only datasets matching the preferred
            stream information returned by ``cf.get_preferred_stream_info``.
        n_std: Not used by this function; kept for interface compatibility.
        inpercentile: Not used by this function; kept for interface
            compatibility.
        zcell_size: Not used by this function; kept for interface
            compatibility.
        zdbar: When truthy, only samples with 0 < pressure < zdbar are kept.
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'ENG' not in rd and 'ADCP' not in rd:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        deployments = []
        for url in url_list:
            splitter = url.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            catalog_rms = '-'.join((r, splitter[-2], splitter[-1]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([url])
                for u in udatasets:  # filter out collocated data files
                    if catalog_rms == u.split('/')[-1].split('_20')[0][15:]:
                        datasets.append(u)
                        deployments.append(
                            int(u.split('/')[-1].split('_')[0][-4:]))
        deployments = np.unique(deployments).tolist()

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # non-string entry in the preferred-stream table
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join(
                            (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(
            main_sensor, fdatasets)

        for dep in deployments:
            # compare by value: identity ("is not") is unreliable for integers
            if deployment_num is not None and dep != deployment_num:
                print('\nskipping deployment {}'.format(dep))
                continue

            dep_label = 'deployment%04d' % dep
            rdatasets = sorted(s for s in fdatasets_sel if dep_label in s)
            if not rdatasets:
                continue

            sci_vars_dict = {}
            for i, rdataset in enumerate(rdatasets):
                # the context manager closes the file once its data is copied
                with xr.open_dataset(rdataset, mask_and_scale=False) as ds:
                    ds = ds.swap_dims({'obs': 'time'})
                    print('\nAppending data from {}: file {} of {}'.format(
                        dep_label, i + 1, len(rdatasets)))

                    array = r[0:2]
                    subsite = r.split('-')[0]

                    if start_time is not None and end_time is not None:
                        ds = ds.sel(time=slice(start_time, end_time))
                        if len(ds['time'].values) == 0:
                            print(
                                'No data to plot for specified time range: ({} to {})'
                                .format(start_time, end_time))
                            continue
                        stime = start_time.strftime('%Y-%m-%d')
                        etime = end_time.strftime('%Y-%m-%d')
                        ext = stime + 'to' + etime
                        save_dir_profile = os.path.join(
                            sDir, array, subsite, r, 'profile_plots',
                            dep_label, ext)
                        save_dir_xsection = os.path.join(
                            sDir, array, subsite, r, 'xsection_plots',
                            dep_label, ext)
                    else:
                        save_dir_profile = os.path.join(
                            sDir, array, subsite, r, 'profile_plots',
                            dep_label)
                        save_dir_xsection = os.path.join(
                            sDir, array, subsite, r, 'xsection_plots',
                            dep_label)

                    if len(sci_vars_dict) == 0:
                        # the variable list is taken from the first file of
                        # the deployment and reused for the remaining files
                        fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(
                            rdatasets[0])
                        sci_vars = cf.return_science_vars(stream)
                        if 'CTDPF' not in r:
                            sci_vars.append('int_ctd_pressure')
                        sci_vars.append('time')
                        sci_vars = list(np.unique(sci_vars))

                        # initialize the dictionary
                        for sci_var in sci_vars:
                            if sci_var == 'time':
                                empty_values = np.array([],
                                                        dtype=np.datetime64)
                            else:
                                empty_values = np.array([])
                            sci_vars_dict[sci_var] = dict(values=empty_values,
                                                          units=[],
                                                          fv=[])

                    # append data for the deployment into the dictionary
                    for s_v in sci_vars_dict:
                        vv = ds[s_v]
                        try:
                            if vv.units not in sci_vars_dict[s_v]['units']:
                                sci_vars_dict[s_v]['units'].append(vv.units)
                        except AttributeError:
                            # variable carries no units attribute
                            print('')
                        try:
                            # NOTE(review): fill values are only replaced by
                            # NaN the first time a given fill value is seen;
                            # kept as in the original - confirm intent.
                            if vv._FillValue not in sci_vars_dict[s_v]['fv']:
                                sci_vars_dict[s_v]['fv'].append(vv._FillValue)
                                vv_data = vv.values
                                try:
                                    # turn fill values to nans
                                    vv_data[vv_data == vv._FillValue] = np.nan
                                except ValueError:
                                    print('')
                        except AttributeError:
                            # variable carries no _FillValue attribute
                            print('')

                        if len(vv.dims) > 1:
                            print('Skipping plot: variable has >1 dimension')
                        else:
                            sci_vars_dict[s_v]['values'] = np.append(
                                sci_vars_dict[s_v]['values'], vv.values)

            # every file may have been skipped by the time-range filter
            if not sci_vars_dict or len(sci_vars_dict['time']['values']) == 0:
                print('No data to plot for {}'.format(dep_label))
                continue

            # plot after appending all data into one set of arrays
            time1 = sci_vars_dict['time']['values']
            # placeholder (uninitialized) coordinates handed to the cf filters
            ds_lat1 = np.empty(np.shape(time1))
            ds_lon1 = np.empty(np.shape(time1))

            # define pressure variable
            try:
                pname = 'seawater_pressure'
                press = sci_vars_dict[pname]
            except KeyError:
                pname = 'int_ctd_pressure'
                press = sci_vars_dict[pname]
            y1 = press['values']
            try:
                y_units = press['units'][0]
            except IndexError:
                y_units = ''

            for sv in sci_vars_dict:
                print('')
                print(sv)
                if sv in ('seawater_pressure', 'int_ctd_pressure', 'time'):
                    continue

                z1 = sci_vars_dict[sv]['values']
                # raises IndexError when no file provided a _FillValue
                fv = sci_vars_dict[sv]['fv'][0]
                try:
                    sv_units = sci_vars_dict[sv]['units'][0]
                except IndexError:
                    # same fallback as for the pressure units
                    sv_units = ''

                # Check if the array is all NaNs
                if np.isnan(z1).all():
                    print('Array of all NaNs - skipping plot.')
                    continue

                # Check if the array is all fill values
                if len(z1[z1 != fv]) == 0:
                    print('Array of all fill values - skipping plot.')
                    continue

                # remove unreasonable pressure data (e.g. for surface piercing profilers)
                if zdbar:
                    po_ind = (0 < y1) & (y1 < zdbar)
                    tm = time1[po_ind]
                    y = y1[po_ind]
                    z = z1[po_ind]
                    ds_lat = ds_lat1[po_ind]
                    ds_lon = ds_lon1[po_ind]
                else:
                    tm = time1
                    y = y1
                    z = z1
                    ds_lat = ds_lat1
                    ds_lon = ds_lon1

                # reject erroneous data
                dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max, lat, lon = \
                    cf.reject_erroneous_data(r, sv, tm, y, z, fv, ds_lat, ds_lon)

                # drop non-positive values: by pressure for CTDs, otherwise
                # by the science variable itself
                if 'CTD' in r:
                    ind = zpressure > 0.0
                else:
                    ind = ndata > 0.0

                dtime = dtime[ind]
                zpressure = zpressure[ind]
                ndata = ndata[ind]
                lat = lat[ind]
                lon = lon[ind]

                if len(dtime) == 0:
                    # nothing survived the filters; plotting would otherwise
                    # reuse the previous variable's arrays or fail
                    print('No data left after filtering - skipping plot.')
                    continue

                # reject time range from data portal file export
                t_portal, z_portal, y_portal, lat_portal, lon_portal = \
                    cf.reject_timestamps_dataportal(subsite, r, dtime, zpressure, ndata, lat, lon)

                print('removed {} data points using visual inspection of data'
                      .format(len(ndata) - len(z_portal)))

                if len(t_portal) == 0:
                    print('No data left after filtering - skipping plot.')
                    continue

                # Plot all data
                cf.create_dir(save_dir_profile)
                cf.create_dir(save_dir_xsection)
                sname = '-'.join((r, method, sv))
                sfileall = '_'.join(
                    (sname, pd.to_datetime(t_portal.min()).strftime('%Y%m%d')))
                tm0 = pd.to_datetime(
                    t_portal.min()).strftime('%Y-%m-%dT%H:%M:%S')
                tm1 = pd.to_datetime(
                    t_portal.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join(
                    (deployment, refdes, method)) + '\n' + tm0 + ' to ' + tm1
                if 'SPKIR' in r:
                    title = title + '\nWavelength = 510 nm'

                # profile plot
                xlabel = sv + " (" + sv_units + ")"
                ylabel = pname + " (" + y_units + ")"
                clabel = 'Time'

                fig, ax = pf.plot_profiles(z_portal,
                                           y_portal,
                                           t_portal,
                                           ylabel,
                                           xlabel,
                                           clabel,
                                           stdev=None)

                ax.set_title(title, fontsize=9)
                fig.tight_layout()
                pf.save_fig(save_dir_profile, sfileall)

                # xsection plot
                clabel = sv + " (" + sv_units + ")"
                ylabel = pname + " (" + y_units + ")"

                fig, ax, bar = pf.plot_xsection(subsite,
                                                t_portal,
                                                y_portal,
                                                z_portal,
                                                clabel,
                                                ylabel,
                                                t_eng=None,
                                                m_water_depth=None,
                                                inpercentile=None,
                                                stdev=None)

                # plot_xsection may return a falsy figure when nothing is drawn
                if fig:
                    ax.set_title(title, fontsize=9)
                    fig.tight_layout()
                    pf.save_fig(save_dir_xsection, sfileall)

Пример #18

Показать файл

Файл: _plot_velocity.py Проект: ooi-data-lab/data-review-tools

def _style_deployment_axis(axis, deployment, is_last):
    """Apply the y-label, tick and x-label styling shared by every panel."""
    axis.set_ylabel(str(deployment),
                    rotation=0,
                    fontsize=8,
                    color='b',
                    labelpad=11)
    axis.yaxis.set_label_position("right")
    axis.tick_params(which='both',
                     color='r',
                     labelsize=7,
                     labelcolor='m',
                     pad=0.1,
                     length=1,
                     rotation=0)
    if is_last:
        axis.set_xlabel('Time', rotation=0, fontsize=8, color='b')
    else:
        axis.set_xlabel(' ')


def _add_frameless_legend(axis):
    """Draw the expanded three-column legend on ``axis`` without a frame."""
    legend = axis.legend(fontsize=6,
                         bbox_to_anchor=(0., 0.80, 1., .102),
                         loc=3,
                         ncol=3,
                         mode="expand",
                         borderaxespad=0.)
    legend.set_frame_on(False)


def main(sDir, url_list, preferred_only):
    """Plot pressure, roll, pitch, heading and current velocity per deployment.

    For every reference designator found in ``url_list`` one figure per
    quantity is created with one panel per dataset, and the figures are
    saved under ``<sDir>/<array>/<subsite>/<refdes>/preferred_method_plots``.
    Samples whose pitch or roll exceeds 200 (decidegree variables) are
    highlighted, following the note below that data are suspect when the
    instrument is tilted more than 20 degrees.

    Args:
        sDir: Root directory under which the plot directory is created.
        url_list: Catalog URLs. The second-to-last path element of each URL
            is split on '-' and fields 1-4 are joined to form the reference
            designator.
        preferred_only: 'yes' to keep only datasets matching the preferred
            stream information returned by ``cf.get_preferred_stream_info``.
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]

        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)

            fdatasets = []
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # non-string entry in the preferred-stream table
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join(
                            (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, fdatasets)

        save_dir = os.path.join(sDir, array, subsite, r,
                                'preferred_method_plots')
        cf.create_dir(save_dir)

        n_panels = len(fdatasets)
        if n_panels == 0:
            # subplots(nrows=0) is invalid, and there is nothing to draw
            print('No datasets to plot for {}'.format(r))
            continue

        def new_panels():
            # squeeze=False keeps an array of axes even for a single dataset
            figure, axes = pyplot.subplots(nrows=n_panels,
                                           ncols=1,
                                           sharey=True,
                                           squeeze=False)
            return figure, axes[:, 0]

        fig, ax = new_panels()  # current velocity
        fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
        fig1, ax1 = new_panels()  # pressure
        fig1.tight_layout()
        fig2, ax2 = new_panels()  # roll
        fig2.tight_layout()
        fig3, ax3 = new_panels()  # pitch
        fig3.tight_layout()
        fig4, ax4 = new_panels()  # heading
        fig4.tight_layout()

        for ii, dataset in enumerate(fdatasets):
            print('\n', dataset)
            deployment = dataset.split('/')[-1].split('_')[0].split(
                'deployment')[-1]
            deployment = int(deployment)
            is_last = ii == n_panels - 1

            # the context manager closes the file once the panels are drawn
            with xr.open_dataset(dataset, mask_and_scale=False) as ds:
                time = ds['time'].values
                sci_var = cf.return_science_vars(ds.stream)

                # Plot pressure
                z_name = [z_var for z_var in sci_var if 'pressure' in z_var]
                z = ds[z_name[0]].values
                z_unit = ds[z_name[0]].units

                ax1[ii].plot(time,
                             z,
                             color='b',
                             linestyle='--',
                             linewidth=.6,
                             label='Pressure')
                _style_deployment_axis(ax1[ii], deployment, is_last)
                if ii == 0:
                    ax1[ii].set_title(r + ' - Pressure ' + z_unit, fontsize=8)

                fig1.savefig(str(os.path.join(save_dir, 'pressure_plots')),
                             dpi=150)

                # non science variable
                # According to VELPT manufacturer, data are suspect when this instrument is tilted more than 20 degrees
                # redmine ticket: Marine Hardware #12960
                roll = ds['roll_decidegree'].values
                roll_unit = ds['roll_decidegree'].units
                pitch = ds['pitch_decidegree'].values
                pitch_units = ds['pitch_decidegree'].units
                headng = ds['heading_decidegree'].values
                headng_units = ds['heading_decidegree'].units

                # samples flagged as tilted (pitch or roll above 200)
                tilt_ind = np.logical_or(pitch > 200, roll > 200)
                tilt_label = 'Pitch or Roll > 200'

                # plot roll, pitch and heading with tilted samples highlighted
                tilt_panels = (
                    (fig2, ax2[ii], roll, 'Roll', roll_unit, '--', None,
                     'roll_plots'),
                    (fig3, ax3[ii], pitch, 'Pitch', pitch_units, '--', None,
                     'pitch_plots'),
                    (fig4, ax4[ii], headng, 'Heading', headng_units, 'None',
                     '.', 'heading_plots'),
                )
                for (figure, axis, values, name, unit, line_style, marker,
                     sfile) in tilt_panels:
                    if marker is None:
                        axis.plot(time,
                                  values,
                                  color='b',
                                  linestyle=line_style,
                                  linewidth=.6,
                                  label=name)
                    else:
                        axis.plot(time,
                                  values,
                                  color='b',
                                  linestyle=line_style,
                                  marker=marker,
                                  markersize=0.5,
                                  label=name)
                    axis.plot(time[tilt_ind],
                              values[tilt_ind],
                              color='g',
                              linestyle='None',
                              marker='.',
                              markersize=0.5,
                              label=tilt_label)
                    _style_deployment_axis(axis, deployment, is_last)
                    if ii == 0:
                        axis.set_title(r + ' - ' + name + ' ' + unit,
                                       fontsize=8)
                        _add_frameless_legend(axis)
                    figure.savefig(str(os.path.join(save_dir, sfile)),
                                   dpi=150)

                # velocity variable
                u_name = [
                    u_var for u_var in sci_var if 'eastward_velocity' in u_var
                ]
                v_name = [
                    v_var for v_var in sci_var if 'northward_velocity' in v_var
                ]
                w_name = [
                    w_var for w_var in sci_var if 'upward_velocity' in w_var
                ]

                w_unit = ds[w_name[0]].units
                u = ds[u_name[0]].values
                v = ds[v_name[0]].values
                uv_magnitude = np.sqrt(u**2 + v**2)
                uv_maxmag = max(uv_magnitude)

                # 1D Quiver plot: all samples in red, tilted samples in blue
                ax[ii].quiver(time,
                              0,
                              u,
                              v,
                              color='r',
                              units='y',
                              scale_units='y',
                              scale=1,
                              headlength=1,
                              headaxislength=1,
                              width=0.004,
                              alpha=0.5)

                u_fit = u[tilt_ind]
                v_fit = v[tilt_ind]
                ax[ii].quiver(time[tilt_ind],
                              0,
                              u_fit,
                              v_fit,
                              color='b',
                              units='y',
                              scale_units='y',
                              scale=1,
                              headlength=1,
                              headaxislength=1,
                              width=0.004,
                              alpha=0.5)
                # share of samples flagged as tilted, i.e. the suspect ones
                percent_bad = round((len(u_fit) / len(u)) * 100, 2)
                print(len(u_fit), len(u), percent_bad)
                ax[ii].text(time[-1],
                            0,
                            ' ' + str(percent_bad) + '%',
                            fontsize=5,
                            style='italic',
                            color='blue')

                ax[ii].set_ylim(-uv_maxmag, uv_maxmag)
                _style_deployment_axis(ax[ii], deployment, is_last)

                if ii == 0:
                    ax[ii].set_title(
                        r + ' - Current Velocity ' + w_unit + '\n' +
                        ' Currents in blue when pitch or roll are > 20 degrees',
                        fontsize=8)

                fig.savefig(str(os.path.join(save_dir, 'current_plot')),
                            dpi=150,
                            bbox_inches='tight')

        # release the figures so they do not accumulate across instruments
        for figure in (fig, fig1, fig2, fig3, fig4):
            pyplot.close(figure)

Пример #19

Показать файл

Файл: plot_timeseries_panel.py Проект: ooi-data-lab/data-review-tools

def main(sDir, url_list, start_time, end_time, preferred_only):
    """Save timeseries panel plots of the science variables in each file.

    The reference designator of every URL is parsed from its second-to-last
    path element. For each reference designator the matching NetCDF files
    are collected, optionally restricted to the preferred streams, and one
    panel figure per file is saved when the file's stream has more than one
    science variable.

    Args:
        sDir: Root directory under which the plots are saved.
        url_list: Catalog URLs handed to ``cf.get_nc_urls``.
        start_time: Start of the time range to plot, or None.
        end_time: End of the time range to plot, or None. The time range is
            applied only when both bounds are given.
        preferred_only: 'yes' to keep only the files matching the preferred
            stream information returned by ``cf.get_preferred_stream_info``;
            any other value keeps every file.
    """
    # Unique reference designators, in order of first appearance.
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))

        # Gather the NetCDF files of every URL that belongs to this refdes.
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        if preferred_only == 'yes':
            fdatasets = []
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # Non-string stream entry (presumably a missing
                        # value): nothing to match for this slot.
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join(
                            (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(
            main_sensor, fdatasets)

        for fd in fdatasets_sel:
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'
                            .format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = \
                    cf.nc_attributes(fd)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                save_dir = os.path.join(sDir, array, subsite, refdes,
                                        'timeseries_panel_plots')
                filename = '_'.join(fname.split('_')[:-1])
                sci_vars = cf.return_science_vars(stream)

                if len(sci_vars) <= 1:
                    print(
                        'Only one science variable in file, no panel plots necessary'
                    )
                    continue

                cf.create_dir(save_dir)
                colors = cm.jet(np.linspace(0, 1, len(sci_vars)))

                t = ds['time'].values
                t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))

                # Plot data with outliers removed
                fig, ax = pf.plot_timeseries_panel(ds, t, sci_vars, colors, 5)
                plt.xticks(fontsize=7)
                ax[0].set_title((title + '\n' + t0 + ' - ' + t1), fontsize=7)
                # The output file name carries the date of the first sample.
                sfile = '-'.join((filename, 'timeseries_panel', t0[:10]))
                pf.save_fig(save_dir, sfile)

Пример #20

Показать файл

Файл: plot_profile_xsection_rm_suspect_data.py Проект: ooi-data-lab/data-review-tools

def main(url_list, sDir, deployment_num, start_time, end_time, preferred_only,
         zdbar, n_std, inpercentile, zcell_size):
    """Plot profiles and cross-sections with suspect data removed.

    For every reference designator parsed from ``url_list`` (those containing
    'ENG' or 'ADCP' are skipped) each selected NetCDF file is opened and each
    of its science variables is cleaned in stages: optional pressure limit,
    ``cf.reject_erroneous_data``, a per-variable minimum value,
    ``cf.reject_timestamps_dataportal`` and finally the depth-binned analysis
    of ``cf.reject_timestamps_in_groups``. The timestamps flagged by the last
    stage are appended to a per-file CSV in ``time_to_exclude``. Profile and
    cross-section figures are saved only when at least one data point was
    removed after the minimum-value filter.

    Args:
        url_list: Catalog URLs; the reference designator is parsed from the
            second-to-last path element of each URL.
        sDir: Root directory under which plots and CSV files are written.
        deployment_num: Deployment number to process, or None for all.
        start_time: Start of the time range, or None. When both bounds are
            given they are also formatted with ``strftime`` to name the
            output sub-directory.
        end_time: End of the time range, or None.
        preferred_only: 'yes' to keep only files matching the preferred
            stream information; any other value keeps every file.
        zdbar: When truthy, only samples with 0 < pressure < zdbar are kept.
        n_std: Number of standard deviations handed to
            ``cf.reject_timestamps_in_groups``. Variables with 'scatter' in
            their name always get None (percentile method) instead.
        inpercentile: Percentile handed to ``cf.reject_timestamps_in_groups``.
        zcell_size: Step (integer) of the pressure bins used for grouping.
    """
    # Unique reference designators, excluding engineering and ADCP ones.
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'ENG' not in rd and 'ADCP' not in rd:
            rd_list.append(rd)

    # Values at or below these thresholds are dropped as unreasonable;
    # every other variable must simply be greater than zero.
    min_valid = {'salinity': 30, 'density': 1022.5, 'conductivity': 3.45}

    for r in rd_list:
        print('\n{}'.format(r))

        # Gather the NetCDF files of every URL that belongs to this refdes.
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        if preferred_only == 'yes':
            fdatasets = []
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # Non-string stream entry: nothing to match.
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join(
                            (spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(
            main_sensor, fdatasets)

        for fd in fdatasets_sel:
            part_d = fd.split('/')[-1]
            print('\n{}'.format(part_d))

            # The context manager closes the file on every exit path,
            # including the early `continue`s below.
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                fname, subsite, refdes, method, stream, deployment = \
                    cf.nc_attributes(fd)
                array = subsite[0:2]
                sci_vars = cf.return_science_vars(stream)

                # NOTE: bathymetry from the glider engineering stream is not
                # plotted; plot_xsection is called with t_eng=None and
                # m_water_depth=None.

                # Compare by value: `is not` on integers tests identity.
                if (deployment_num is not None
                        and int(deployment[-4:]) != deployment_num):
                    continue

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'.
                            format(start_time, end_time))
                        continue
                    stime = start_time.strftime('%Y-%m-%d')
                    etime = end_time.strftime('%Y-%m-%d')
                    plot_subdirs = (deployment, stime + 'to' + etime)
                else:
                    plot_subdirs = (deployment,)

                refdes_dir = os.path.join(sDir, array, subsite, refdes)
                save_dir_profile = os.path.join(refdes_dir, 'profile_plots',
                                                *plot_subdirs)
                save_dir_xsection = os.path.join(refdes_dir, 'xsection_plots',
                                                 *plot_subdirs)
                texclude_dir = os.path.join(refdes_dir, 'time_to_exclude')
                cf.create_dir(texclude_dir)

                time1 = ds['time'].values
                try:
                    ds_lat1 = ds['lat'].values
                except KeyError:
                    ds_lat1 = None
                    print('No latitude variable in file')
                try:
                    ds_lon1 = ds['lon'].values
                except KeyError:
                    ds_lon1 = None
                    print('No longitude variable in file')

                # get pressure variable
                pvarname, y1, y_units, press, y_fillvalue = \
                    cf.add_pressure_to_dictionary_of_sci_vars(ds)

                # prepare file to list timestamps with suspect data for each
                # data parameter (header row only; rows are appended below)
                stat_data = pd.DataFrame(
                    columns=['deployments', 'time_to_exclude'])
                file_exclude = '{}/{}_{}_{}_excluded_timestamps.csv'.format(
                    texclude_dir, deployment, refdes, method)
                stat_data.to_csv(file_exclude, index=True)

                # loop through sensor-data parameters
                for sv in sci_vars:
                    print(sv)
                    if 'pressure' in sv:
                        continue

                    z1 = ds[sv].values
                    fv = ds[sv]._FillValue
                    sv_units = ds[sv].units

                    # Check if the array is all NaNs
                    if np.isnan(z1).all():
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    if not np.any(z1 != fv):
                        print('Array of all fill values - skipping plot.')
                        continue

                    # remove unreasonable pressure data (e.g. for surface
                    # piercing profilers)
                    if zdbar:
                        po_ind = (0 < y1) & (y1 < zdbar)
                        n_zdbar = np.sum(~po_ind)
                        tm = time1[po_ind]
                        y = y1[po_ind]
                        z = z1[po_ind]
                        # lat/lon are None when the file has no such variable
                        ds_lat = None if ds_lat1 is None else ds_lat1[po_ind]
                        ds_lon = None if ds_lon1 is None else ds_lon1[po_ind]
                        print('{} in water depth > {} dbar'.format(
                            n_zdbar, zdbar))
                    else:
                        tm, y, z = time1, y1, z1
                        ds_lat, ds_lon = ds_lat1, ds_lon1

                    # reject erroneous data
                    dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max, lat, lon = \
                        cf.reject_erroneous_data(r, sv, tm, y, z, fv, ds_lat, ds_lon)

                    # get rid of 0.0 data
                    ind = ndata > min_valid.get(sv, 0)
                    lenzero = np.sum(~ind)
                    dtime = dtime[ind]
                    zpressure = zpressure[ind]
                    ndata = ndata[ind]
                    if ds_lat is not None and ds_lon is not None:
                        lat = lat[ind]
                        lon = lon[ind]
                    else:
                        lat = None
                        lon = None

                    if len(dtime) == 0:
                        continue

                    # reject time range from data portal file export
                    t_portal, z_portal, y_portal, lat_portal, lon_portal = \
                        cf.reject_timestamps_dataportal(subsite, r, dtime, zpressure, ndata, lat, lon)

                    n_visual = len(ndata) - len(z_portal)
                    print(
                        'removed {} data points using visual inspection of data'
                        .format(n_visual))

                    if len(y_portal) == 0:
                        continue

                    # create data groups
                    columns = ['tsec', 'dbar', str(sv)]
                    min_r = int(round(min(y_portal) - zcell_size))
                    max_r = int(round(max(y_portal) + zcell_size))
                    ranges = list(range(min_r, max_r, zcell_size))

                    groups, d_groups = gt.group_by_depth_range(
                        t_portal, y_portal, z_portal, columns, ranges)

                    # 'scatter' variables use the percentile method. A local
                    # name keeps the caller's n_std intact for the remaining
                    # variables and files.
                    sv_n_std = None if 'scatter' in sv else n_std

                    # identifying timestamps from percentile analysis
                    y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr, time_ex = \
                        cf.reject_timestamps_in_groups(
                            groups, d_groups, sv_n_std, inpercentile)

                    if len(time_ex) > 0:
                        # writing timestamps to .csv file to use with
                        # data_range.py script
                        t_exclude = time_ex[0]
                        for t_ex in time_ex[1:]:
                            t_exclude = '{}, {}'.format(t_exclude, t_ex)

                        stat_data = pd.DataFrame(
                            {
                                'deployments': deployment,
                                'time_to_exclude': t_exclude
                            },
                            index=[sv])
                        stat_data.to_csv(file_exclude,
                                         index=True,
                                         mode='a',
                                         header=False)

                        # rejecting timestamps from percentile analysis
                        t_nospct, z_nospct, y_nospct = cf.reject_suspect_data(
                            t_portal, y_portal, z_portal, time_ex)
                    else:
                        t_nospct = t_portal
                        z_nospct = z_portal
                        y_nospct = y_portal

                    # Plot only when data remains and something was removed
                    # after the minimum-value filter.
                    if len(t_nospct) == 0 or len(t_nospct) == len(dtime):
                        continue

                    cf.create_dir(save_dir_profile)
                    cf.create_dir(save_dir_xsection)
                    sname = '-'.join((r, method, sv))
                    sfile = '_'.join(
                        ('rm_suspect_data', sname,
                         pd.to_datetime(t_nospct.min()).strftime('%Y%m%d')))

                    t0 = pd.to_datetime(
                        t_nospct.min()).strftime('%Y-%m-%dT%H:%M:%S')
                    t1 = pd.to_datetime(
                        t_nospct.max()).strftime('%Y-%m-%dT%H:%M:%S')
                    title = ' '.join(
                        (deployment, refdes, method)) + '\n' + t0 + ' to ' + t1

                    # Legend text: one line per cleaning stage. The binned
                    # line names the method actually requested for this
                    # variable (SD when sv_n_std is set, else percentile).
                    n_binned = len(z_portal) - len(z_nospct)
                    leg_lines = [
                        'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}], '
                        '{} unreasonable values'.format(
                            lenfv, lennan, lenev, lengr, global_min,
                            global_max, lenzero)
                    ]
                    if sv_n_std:
                        leg_lines.append(
                            'removed {} data points +/- {} SD of data grouped in {} dbar segments'
                            .format(n_binned, sv_n_std, zcell_size))
                    else:
                        leg_lines.append(
                            'removed {} in the upper and lower {} percentile of data grouped in {} dbar segments'
                            .format(n_binned, inpercentile, zcell_size))
                    leg_lines.append(
                        'excluded {} suspect data points when inspected visually'
                        .format(n_visual))
                    if zdbar:
                        leg_lines.append(
                            'excluded {} suspect data in water depth greater than {} dbar'
                            .format(n_zdbar, zdbar))
                    # One-element tuple: ax.legend takes a sequence of labels.
                    leg_text = ('\n'.join(leg_lines),)

                    # profile plot
                    xlabel = sv + " (" + sv_units + ")"
                    ylabel = press[0] + " (" + y_units[0] + ")"
                    clabel = 'Time'

                    # plot non-erroneous data
                    print('plotting profile')
                    fig, ax = pf.plot_profiles(z_nospct,
                                               y_nospct,
                                               t_nospct,
                                               ylabel,
                                               xlabel,
                                               clabel,
                                               stdev=None)

                    ax.set_title(title, fontsize=9)
                    ax.plot(n_avg, y_avg, '-k')
                    ax.legend(leg_text,
                              loc='upper center',
                              bbox_to_anchor=(0.5, -0.17),
                              fontsize=6)
                    fig.tight_layout()
                    pf.save_fig(save_dir_profile, sfile)

                    # xsection plot
                    print('plotting xsection')
                    clabel = sv + " (" + sv_units + ")"

                    # plot non-erroneous data
                    fig, ax, bar = pf.plot_xsection(subsite,
                                                    t_nospct,
                                                    y_nospct,
                                                    z_nospct,
                                                    clabel,
                                                    ylabel,
                                                    t_eng=None,
                                                    m_water_depth=None,
                                                    inpercentile=inpercentile,
                                                    stdev=None)

                    ax.set_title(title, fontsize=9)
                    ax.legend(leg_text,
                              loc='upper center',
                              bbox_to_anchor=(0.5, -0.17),
                              fontsize=6)
                    fig.tight_layout()
                    pf.save_fig(save_dir_xsection, sfile)

Пример #21

Показать файл

def main(url_list, sDir, deployment_num, start_time, end_time, preferred_only, n_std, inpercentile, zcell_size, zdbar):
    """Create profile, cross-section and (for 'MOAS' platforms) 4D plots of science variables.

    Reference designators are parsed from the second-to-last path element of each
    entry in ``url_list``; entries containing 'ENG' or 'ADCP' are ignored. For each
    remaining reference designator the matching NetCDF datasets are collected,
    optionally restricted to the preferred streams, and every science variable
    (except pressure) is plotted twice: once with all data and once after
    erroneous data have been removed.

    :param url_list: iterable of catalog URLs (strings)
    :param sDir: root directory under which the plot directories are created
    :param deployment_num: deployment number to plot (compared with the integer
        formed by the last four characters of the deployment name), or None for all
    :param start_time: start of the time range passed to ``ds.sel``; must support
        ``strftime``. Only used when ``end_time`` is also given
    :param end_time: end of the time range (see ``start_time``)
    :param preferred_only: 'yes' to keep only the preferred stream of each deployment
    :param n_std: number of standard deviations forwarded to
        ``cf.reject_timestamps_in_groups``; replaced by None for variables whose
        name contains 'scatter'
    :param inpercentile: percentile forwarded to ``cf.reject_timestamps_in_groups``
    :param zcell_size: integer step (dbar) of the depth ranges used to group data
    :param zdbar: if truthy, only data with 0 < pressure < zdbar are kept
    :return: None; figures are written through ``pf.save_fig``
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list and 'ENG' not in rd and 'ADCP' not in rd:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.extend(cf.get_nc_urls([u]))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # the cell does not hold a string, so there is no stream to match
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        main_sensor = r.split('-')[-1]
        fdatasets_sel = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for fd in fdatasets_sel:
            part_d = fd.split('/')[-1]
            print('\n{}'.format(part_d))

            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
            array = subsite[0:2]
            sci_vars = cf.return_science_vars(stream)

            # Compare by value: an identity test ('is not') on integers only works
            # by accident for small numbers that CPython happens to cache.
            if deployment_num is not None and int(deployment[-4:]) != deployment_num:
                continue

            # The context manager closes the file even when a 'continue' skips the dataset.
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                plot_subdirs = [deployment]
                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue
                    plot_subdirs.append(start_time.strftime('%Y-%m-%d') + 'to' + end_time.strftime('%Y-%m-%d'))

                base_dir = os.path.join(sDir, array, subsite, refdes)
                save_dir_profile = os.path.join(base_dir, 'profile_plots', *plot_subdirs)
                save_dir_xsection = os.path.join(base_dir, 'xsection_plots', *plot_subdirs)
                save_dir_4d = os.path.join(base_dir, 'xsection_plots_4d', *plot_subdirs)

                time1 = ds['time'].values
                try:
                    ds_lat1 = ds['lat'].values
                except KeyError:
                    ds_lat1 = None
                    print('No latitude variable in file')
                try:
                    ds_lon1 = ds['lon'].values
                except KeyError:
                    ds_lon1 = None
                    print('No longitude variable in file')

                # get pressure variable
                pvarname, y1, y_units, press, y_fillvalue = cf.add_pressure_to_dictionary_of_sci_vars(ds)

                for sv in sci_vars:
                    print('')
                    print(sv)
                    if 'pressure' in sv:
                        continue

                    if sv == 'spkir_abj_cspp_downwelling_vector':
                        pxso.pf_xs_spkir(ds, sv, time1, y1, ds_lat1, ds_lon1, zcell_size, inpercentile,
                                         save_dir_profile, save_dir_xsection, deployment, press, y_units,
                                         n_std, zdbar)
                        continue

                    if 'OPTAA' in r:
                        if sv not in ['wavelength_a', 'wavelength_c']:
                            pxso.pf_xs_optaa(ds, sv, time1, y1, ds_lat1, ds_lon1, zcell_size, inpercentile,
                                             save_dir_profile, save_dir_xsection, deployment, press, y_units,
                                             n_std, zdbar)
                        continue

                    z1 = ds[sv].values
                    fv = ds[sv]._FillValue
                    sv_units = ds[sv].units

                    # Check if the array is all NaNs
                    if np.isnan(z1).all():
                        print('Array of all NaNs - skipping plot.')
                        continue

                    # Check if the array is all fill values
                    if len(z1[z1 != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    # remove unreasonable pressure data (e.g. for surface piercing profilers)
                    if zdbar:
                        po_ind = (0 < y1) & (y1 < zdbar)
                        tm = time1[po_ind]
                        y = y1[po_ind]
                        z = z1[po_ind]
                        # lat/lon are None when the file has no such variables
                        ds_lat = ds_lat1[po_ind] if ds_lat1 is not None else None
                        ds_lon = ds_lon1[po_ind] if ds_lon1 is not None else None
                    else:
                        tm = time1
                        y = y1
                        z = z1
                        ds_lat = ds_lat1
                        ds_lon = ds_lon1

                    # reject erroneous data
                    dtime, zpressure, ndata, lenfv, lennan, lenev, lengr, global_min, global_max, lat, lon = \
                        cf.reject_erroneous_data(r, sv, tm, y, z, fv, ds_lat, ds_lon)

                    # get rid of 0.0 data (variable-specific lower bounds, 0 otherwise)
                    min_valid = {'salinity': 30, 'density': 1022.5, 'conductivity': 3.45}.get(sv, 0)
                    ind = ndata > min_valid

                    lenzero = np.sum(~ind)
                    dtime = dtime[ind]
                    zpressure = zpressure[ind]
                    ndata = ndata[ind]
                    if ds_lat is not None and ds_lon is not None:
                        lat = lat[ind]
                        lon = lon[ind]
                    else:
                        lat = None
                        lon = None

                    # Keep the caller's n_std untouched: overwriting it here would
                    # disable the SD envelope for every variable processed afterwards.
                    sv_n_std = None if 'scatter' in sv else n_std  # None -> use percentile

                    if len(dtime) > 0:
                        # reject time range from data portal file export
                        t_portal, z_portal, y_portal, lat_portal, lon_portal = \
                            cf.reject_timestamps_dataportal(subsite, r, dtime, zpressure, ndata, lat, lon)

                        print('removed {} data points using visual inspection of data'.format(
                            len(ndata) - len(z_portal)))

                        # create data groups
                        if len(y_portal) > 0:
                            columns = ['tsec', 'dbar', str(sv)]
                            min_r = int(round(np.nanmin(y_portal) - zcell_size))
                            max_r = int(round(np.nanmax(y_portal) + zcell_size))
                            ranges = list(range(min_r, max_r, zcell_size))

                            groups, d_groups = gt.group_by_depth_range(t_portal, y_portal, z_portal, columns, ranges)

                            # get percentile analysis for printing on the profile plot
                            y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr, time_ex = \
                                cf.reject_timestamps_in_groups(groups, d_groups, sv_n_std, inpercentile)

                    sname = '-'.join((r, method, sv))

                    # Plot all data
                    if len(time1) > 0:
                        cf.create_dir(save_dir_profile)
                        cf.create_dir(save_dir_xsection)
                        sfileall = '_'.join(('all_data', sname, pd.to_datetime(time1.min()).strftime('%Y%m%d')))
                        tm0 = pd.to_datetime(time1.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        tm1 = pd.to_datetime(time1.max()).strftime('%Y-%m-%dT%H:%M:%S')
                        title = ' '.join((deployment, refdes, method)) + '\n' + tm0 + ' to ' + tm1
                        if 'SPKIR' in r:
                            title = title + '\nWavelength = 510 nm'

                        # profile plot
                        xlabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"
                        clabel = 'Time'

                        fig, ax = pf.plot_profiles(z1, y1, time1, ylabel, xlabel, clabel, stdev=None)

                        ax.set_title(title, fontsize=9)
                        fig.tight_layout()
                        pf.save_fig(save_dir_profile, sfileall)

                        # xsection plot
                        clabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"

                        fig, ax, bar = pf.plot_xsection(subsite, time1, y1, z1, clabel, ylabel, t_eng=None,
                                                        m_water_depth=None, inpercentile=None, stdev=None)

                        if fig:
                            ax.set_title(title, fontsize=9)
                            fig.tight_layout()
                            pf.save_fig(save_dir_xsection, sfileall)

                    # Plot cleaned-up data
                    if len(dtime) > 0 and len(y_portal) > 0:
                        sfile = '_'.join(('rm_erroneous_data', sname,
                                          pd.to_datetime(t_portal.min()).strftime('%Y%m%d')))
                        t0 = pd.to_datetime(t_portal.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(t_portal.max()).strftime('%Y-%m-%dT%H:%M:%S')
                        title = ' '.join((deployment, refdes, method)) + '\n' + t0 + ' to ' + t1
                        if 'SPKIR' in r:
                            title = title + '\nWavelength = 510 nm'

                        # summary of what was removed, shared by both legends
                        removed_text = (
                            'removed {} fill values, {} NaNs, {} Extreme Values (1e7), {} Global ranges [{} - {}], '
                            '{} unreasonable values'.format(lenfv, lennan, lenev, lengr, global_min, global_max,
                                                            lenzero) +
                            '\nexcluded {} suspect data points when inspected visually'.format(
                                len(ndata) - len(z_portal)))

                        # profile plot
                        xlabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"
                        clabel = 'Time'

                        fig, ax = pf.plot_profiles(z_portal, y_portal, t_portal, ylabel, xlabel, clabel, stdev=None)

                        ax.set_title(title, fontsize=9)
                        ax.plot(n_avg, y_avg, '-k')
                        ax.fill_betweenx(y_avg, n0_std, n1_std, color='m', alpha=0.2)

                        # The legend is always defined, even when neither a percentile
                        # nor a standard-deviation envelope was requested.
                        profile_text = removed_text + '\n(black) data average in {} dbar segments'.format(zcell_size)
                        if inpercentile:
                            profile_text += '\n(magenta) {} percentile envelope in {} dbar segments'.format(
                                int(100 - inpercentile * 2), zcell_size)
                        elif sv_n_std:
                            profile_text += '\n(magenta) +/- {} SD envelope in {} dbar segments'.format(
                                int(sv_n_std), zcell_size)
                        ax.legend((profile_text,), loc='upper center', bbox_to_anchor=(0.5, -0.17), fontsize=6)
                        fig.tight_layout()
                        pf.save_fig(save_dir_profile, sfile)

                        # xsection plot of non-erroneous data
                        clabel = sv + " (" + sv_units + ")"
                        ylabel = press[0] + " (" + y_units[0] + ")"

                        fig, ax, bar = pf.plot_xsection(subsite, t_portal, y_portal, z_portal, clabel, ylabel,
                                                        t_eng=None, m_water_depth=None, inpercentile=None,
                                                        stdev=None)

                        # same guard as for the all-data cross-section above
                        if fig:
                            ax.set_title(title, fontsize=9)
                            ax.legend((removed_text,), loc='upper center', bbox_to_anchor=(0.5, -0.17),
                                      fontsize=6)
                            fig.tight_layout()
                            pf.save_fig(save_dir_xsection, sfile)

                        # 4D plot for gliders only
                        if 'MOAS' in r and ds_lat is not None and ds_lon is not None:
                            cf.create_dir(save_dir_4d)

                            clabel = sv + " (" + sv_units + ")"
                            zlabel = press[0] + " (" + y_units[0] + ")"

                            fig = plt.figure()
                            ax = fig.add_subplot(111, projection='3d')
                            sct = ax.scatter(lon_portal, lat_portal, y_portal, c=z_portal, s=2)
                            cbar = plt.colorbar(sct, label=clabel, extend='both')
                            cbar.ax.tick_params(labelsize=8)
                            ax.invert_zaxis()
                            ax.view_init(25, 32)
                            ax.invert_xaxis()
                            ax.invert_yaxis()
                            ax.set_zlabel(zlabel, fontsize=9)
                            ax.set_ylabel('Latitude', fontsize=9)
                            ax.set_xlabel('Longitude', fontsize=9)

                            ax.set_title(title, fontsize=9)
                            pf.save_fig(save_dir_4d, sfile)

Пример #22

Показать файл

Файл: _plot_velocity_final.py Проект: ooi-data-lab/data-review-tools

def _clean_variable(ds, name, r):
    """Return the values of ``ds[name]`` after filtering them with ``reject_err_data``.

    The variable's ``_FillValue`` attribute, the reference designator ``r`` and
    the variable name are forwarded to ``reject_err_data``, which is not defined
    in this block and must be available at module level.
    """
    var = ds[name]
    return reject_err_data(var.values, var._FillValue, r, name)


def plot_data(fig, ax, fdatasets, save_dir, r):
    """Plot pressure, roll, pitch, heading and current velocity for each dataset.

    One subplot row (index ``ii``) is filled per entry of ``fdatasets`` and every
    figure is re-saved after each dataset.

    NOTE(review): besides ``fig``/``ax`` this function uses ``fig0``..``fig4``,
    ``ax0``..``ax4``, ``num_plots``, ``group_num``, ``prepare_axis`` and
    ``reject_err_data``, none of which are defined here; presumably they are
    module-level names - confirm against the rest of the file.

    :param fig: figure holding the quiver (current velocity) plot
    :param ax: indexable collection of axes of ``fig``, one per dataset
    :param fdatasets: sequence of NetCDF dataset paths/URLs; the file name must
        start with 'deployment<number>_'
    :param save_dir: directory where the figures are saved
    :param r: reference designator, used in plot titles and passed to
        ``reject_err_data``
    :return: None
    """
    # According to the note in the tilt section below, data are suspect above
    # 20 degrees of tilt; the thresholds are applied to *_decidegree variables.
    tilt_label = 'Pitch or roll > 200'
    quiver_style = dict(units='y', scale_units='y', scale=1, headlength=1,
                        headaxislength=1, width=0.004, alpha=0.5)

    # enumerate() instead of range(fdatasets): range() on a sequence raises TypeError
    for ii, fdataset in enumerate(fdatasets):
        print('\n', fdataset)
        deployment = int(fdataset.split('/')[-1].split('_')[0].split('deployment')[-1])

        # Everything needed is read as numpy arrays inside the context manager,
        # so the file can be closed before plotting starts.
        with xr.open_dataset(fdataset, mask_and_scale=False) as ds:
            time = ds['time'].values

            # science variables
            sci_var = cf.return_science_vars(ds.stream)

            z_name = [name for name in sci_var if 'pressure' in name][0]
            z_unit = ds[z_name].units
            z = _clean_variable(ds, z_name, r)

            w_name = [name for name in sci_var if 'upward_velocity' in name][0]
            w_unit = ds[w_name].units
            w = _clean_variable(ds, w_name, r)

            u_name = [name for name in sci_var if 'eastward_velocity' in name][0]
            u = _clean_variable(ds, u_name, r)

            v_name = [name for name in sci_var if 'northward_velocity' in name][0]
            v = _clean_variable(ds, v_name, r)

            # non science variables
            # According to VELPT manufacturer, data are suspect when this instrument
            # is tilted more than 20 degrees (redmine ticket: Marine Hardware #12960)
            roll_unit = ds['roll_decidegree'].units
            roll = _clean_variable(ds, 'roll_decidegree', r)

            pitch_units = ds['pitch_decidegree'].units
            pitch = _clean_variable(ds, 'pitch_decidegree', r)

            headng_units = ds['heading_decidegree'].units
            headng = _clean_variable(ds, 'heading_decidegree', r)

        # nanmax: the builtin max() gives order-dependent results if NaNs are present
        uv_magnitude = np.sqrt(u**2 + v**2)
        uv_maxmag = np.nanmax(uv_magnitude)

        # NOTE(review): only positive tilt is flagged; confirm whether negative
        # pitch/roll beyond -200 should be flagged as well.
        tilt_ind = np.logical_or(pitch > 200, roll > 200)
        time_tilt = time[tilt_ind]

        # Plot pressure
        ax1[ii].plot(time, z, color='b', linestyle='--', linewidth=.6, label='Pressure')

        if ii == 0:
            ax1[ii].set_title(r + ' - Pressure ' + z_unit, fontsize=8)

        prepare_axis(time, deployment, ax1[ii], ii, num_plots)

        save_file = os.path.join(save_dir, 'pressure_plots' + group_num)
        fig1.savefig(str(save_file), dpi=150)

        # Plot roll
        ax2[ii].plot(time, roll, color='b', linestyle='--', linewidth=.6, label='Roll')
        ax2[ii].plot(time_tilt, roll[tilt_ind], color='g', linestyle='None', marker='.',
                     markersize=0.5, label=tilt_label)

        prepare_axis(time, deployment, ax2[ii], ii, num_plots)

        if ii == 0:
            ax2[ii].set_title(r + ' - Roll ' + roll_unit, fontsize=8)
            ax2[ii].legend()

        save_file = os.path.join(save_dir, 'roll_plots' + group_num)
        fig2.savefig(str(save_file), dpi=150)

        # Plot pitch
        ax3[ii].plot(time, pitch, color='b', linestyle='--', linewidth=.6, label='Pitch')
        ax3[ii].plot(time_tilt, pitch[tilt_ind], color='g', linestyle='None', marker='.',
                     markersize=0.5, label=tilt_label)

        prepare_axis(time, deployment, ax3[ii], ii, num_plots)

        if ii == 0:
            ax3[ii].set_title(r + ' - Pitch ' + pitch_units, fontsize=8)
            ax3[ii].legend()

        save_file = os.path.join(save_dir, 'pitch_plots' + group_num)
        fig3.savefig(str(save_file), dpi=150)

        # Plot heading
        ax4[ii].plot(time, headng, color='b', linestyle='None', marker='.',
                     markersize=0.5, label='Heading')
        ax4[ii].plot(time_tilt, headng[tilt_ind], color='g', linestyle='None', marker='.',
                     markersize=0.5, label=tilt_label)
        prepare_axis(time, deployment, ax4[ii], ii, num_plots)

        if ii == 0:
            ax4[ii].set_title(r + ' - Heading ' + headng_units, fontsize=8)
            ax4[ii].legend()

        save_file = os.path.join(save_dir, 'heading_plots' + group_num)
        fig4.savefig(str(save_file), dpi=150)

        # 1D quiver plot: all currents in red, tilted samples over-plotted in blue
        ax[ii].quiver(time, 0, u, v, color='r', **quiver_style)

        u_fit = u[tilt_ind]
        v_fit = v[tilt_ind]
        ax[ii].quiver(time_tilt, 0, u_fit, v_fit, color='b', **quiver_style)

        # Share of samples flagged as tilted (the blue arrows); guarded against
        # an empty record to avoid a ZeroDivisionError.
        percent_bad = round(len(u_fit) / len(u) * 100, 2) if len(u) else 0.0
        print(len(u_fit), len(u), percent_bad)
        ax[ii].text(time[-1],
                    0,
                    ' ' + str(percent_bad) + '%',
                    fontsize=5,
                    style='italic',
                    color='blue')

        ax[ii].set_ylim(-uv_maxmag, uv_maxmag)
        prepare_axis(time, deployment, ax[ii], ii, num_plots)

        if ii == 0:
            ax[ii].set_title(
                r + ' - Current Velocity ' + w_unit + '\n' +
                ' Currents in blue when pitch or roll are > 20 degrees',
                fontsize=8)
            ax[ii].legend()

        save_file = os.path.join(save_dir, 'current_plot' + group_num)
        fig.savefig(str(save_file), dpi=150, bbox_inches='tight')

        # Plot u, v and w components
        ax0[ii].plot(time, v, color='b', linestyle='--', linewidth=.6, label='V')
        ax0[ii].plot(time, u, color='g', linestyle='--', linewidth=.6, label='U')
        ax0[ii].plot(time, w, color='r', linestyle='--', linewidth=.6, label='W')

        prepare_axis(time, deployment, ax0[ii], ii, num_plots)

        if ii == 0:
            ax0[ii].set_title(r + ' - Velocity Components ' + w_unit,
                              fontsize=8)
            # Legend location: http://matplotlib.org/users/legend_guide.html#legend-location
            ax0[ii].legend()

        save_file = os.path.join(save_dir, 'uv_plots' + group_num)
        fig0.savefig(str(save_file), dpi=150)