Python reject_outliers 예제들, functions.common.reject_outliers Python 예제들

예제 #1

0

파일 보기

def depth_glider_cross_section(x, y, z, s=5, title=None, stdev=3, interactive=False):
    """
    Draw a scatter cross-section of z, positioned by x and y, with a colorbar.

    Samples whose y value is 5 or less are dropped first, followed by the
    samples rejected by ``reject_outliers(z['data'], stdev)``. Both filters are
    written back into the 'data' entries of x, y and z, so the dictionaries
    passed in by the caller are modified.

    :param x: dictionary with 'data' and 'info' entries, plotted on the x-axis
    :param y: dictionary like x, plotted on the (inverted) y-axis
    :param z: dictionary like x used for colouring; its 'info' entry must hold
        'label' and 'units' for the colorbar label
    :param s: marker size handed to scatter
    :param title: axes title
    :param stdev: threshold handed to reject_outliers
    :param interactive: forwarded to scatter as ``picker``
    :return: fig, ax
    """
    fig, ax = plt.subplots()
    plt.grid()

    # filtered in the same order every time: y, then x, then z
    series = (y, x, z)

    # drop measurements at the surface (5 meters / dbar or shallower)
    below_surface = y['data'] > 5
    for entry in series:
        entry['data'] = entry['data'][below_surface]

    # drop the z outliers from all three series
    inliers = reject_outliers(z['data'], stdev)
    for entry in series:
        entry['data'] = entry['data'][inliers]

    ax.invert_yaxis()
    points = plt.scatter(
        x['data'],
        y['data'],
        s=s,
        c=z['data'],
        edgecolors='face',
        picker=interactive,
    )

    # colorbar labelled "<label> (<units>)", without offset notation
    colorbar_label = z['info']['label'] + " (" + z['info']['units'] + ")"
    colorbar = fig.colorbar(points, ax=ax, label=colorbar_label)
    colorbar.formatter.set_useOffset(False)
    colorbar.update_ticks()

    ax.set_title(title)
    format_axes(ax, x_data=x['data'])
    set_labels(ax, x['info'], y['info'])
    return fig, ax

예제 #2

0

파일 보기

파일: ctdmo_platform_timeseries.py 프로젝트: ooi-data-lab/data-review-tools

def plot_ctdmo(data_dict, var, stdev=None):
    """
    Scatter every dataset in data_dict on one time axis and save the figure.

    Each value of data_dict must provide the keys 'time', 'yD', 'median',
    'dms' and 'yunits'. The figure is formatted and saved once, after the last
    dataset has been drawn; nothing is saved for an empty data_dict. The save
    location is built from the module-level ``sDir``.

    :param data_dict: mapping of reference designator -> dataset dictionary
    :param var: variable name, used for the y-label and the file name; for
        'ctdmo_seawater_pressure' and 'density' a secondary y-axis labels each
        sensor at its median value
    :param stdev: threshold handed to cf.reject_outliers; None plots all points
    """
    colors10 = [
        'red', 'firebrick', 'orange', 'mediumseagreen', 'blue', 'darkgreen',
        'purple', 'indigo', 'slategray', 'black'
    ]

    colors16 = [
        'red', 'firebrick', 'orange', 'gold', 'mediumseagreen', 'darkcyan',
        'blue', 'darkgreen', 'purple', 'lightgray', 'slategray', 'black',
        'coral', 'gold', 'limegreen', 'midnightblue'
    ]

    # The palette depends only on the number of datasets, so pick it once.
    colors = colors10 if len(data_dict) < 11 else colors16
    reject = stdev is not None
    last_index = len(data_dict) - 1

    fig, ax1 = plt.subplots()
    sensor_list = []
    median_list = []

    for i, (key, value) in enumerate(data_dict.items()):
        t = value['time']
        y = value['yD']
        if reject:
            ind = cf.reject_outliers(y, stdev)
            t = t[ind]
            y = y[ind]

        refdes = str(key)
        sensor_list.append(refdes.split('-')[-1])
        median_list.append(value['median'])

        # wrap around the palette instead of failing beyond 16 datasets
        plt.scatter(t, y, c=colors[i % len(colors)], marker='.', s=.5)

        if i != last_index:
            continue

        # the last dataset has been plotted: format and save the figure
        plt.grid()
        plt.margins(y=.05, x=.05)

        # refdes on secondary y-axis only for pressure and density
        if var in ['ctdmo_seawater_pressure', 'density']:
            ax2 = ax1.twinx()
            ax2.set_ylim(ax1.get_ylim())
            plt.yticks(median_list, sensor_list, fontsize=7.5)
            plt.subplots_adjust(right=.85)

        pf.format_date_axis(ax1, fig)
        pf.y_axis_disable_offset(ax1)

        subsite = refdes.split('-')[0]
        dms_parts = value['dms'].split('-')
        title = subsite + ' ' + '-'.join((dms_parts[0], dms_parts[1]))
        ax1.set_ylabel((var + " (" + value['yunits'] + ")"), fontsize=9)
        ax1.set_title(title, fontsize=10)

        fname = '-'.join((subsite, value['dms'], var))
        if reject:
            fname = '-'.join((fname, 'outliers_rejected'))
        sdir = os.path.join(sDir, subsite, dms_parts[0])
        cf.create_dir(sdir)
        pf.save_fig(sdir, fname)

예제 #3

0

파일 보기

파일: ctd_timeseries.py 프로젝트: ooi-data-lab/data-review-tools

def main(sDir, f):
    """
    Plot all science variables of each dataset listed in a CSV on a shared time axis.

    For every dataset one figure is produced in which each science variable
    gets its own colour-coded y-axis; the figure is saved under
    sDir/subsite/refdes/deployment.

    :param sDir: directory that holds the CSV file and receives the plots
    :param f: name of the CSV file; its 'outputUrl' column is read
    """
    url_table = pd.read_csv(os.path.join(sDir, f))
    for dataset in cf.get_nc_urls(url_table['outputUrl'].tolist()):
        print(dataset)
        fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(dataset)
        save_dir = os.path.join(sDir, subsite, refdes, deployment)
        cf.create_dir(save_dir)

        sci_vars = cf.return_science_vars(stream)
        n_vars = len(sci_vars)

        palette = cm.jet(np.linspace(0, 1, n_vars))

        with xr.open_dataset(dataset, mask_and_scale=False) as ds:
            ds = ds.swap_dims({'obs': 'time'})
            times = ds['time'].data
            stamp = '%Y-%m-%dT%H:%M:%S'
            t0 = pd.to_datetime(times.min()).strftime(stamp)
            t1 = pd.to_datetime(times.max()).strftime(stamp)
            title = ' '.join((deployment, refdes, method))

            fig, ax = plt.subplots()
            # one twinned axis per additional variable gives independent y-axes
            axes = [ax] + [ax.twinx() for _ in range(1, n_vars)]

            fig.subplots_adjust(right=0.6)
            spine_step = (0.98 - 0.6) / float(5)

            for idx in range(n_vars):
                panel = axes[idx]
                if idx > 0:
                    # push each extra spine further to the right
                    panel.spines['right'].set_position(('axes', 1. + spine_step * idx))

                series = ds[sci_vars[idx]]
                keep = cf.reject_outliers(series, 5)
                values = series.data[keep]
                kept_times = times[keep]

                shade = palette[idx]
                panel.plot(kept_times, values, '.', markersize=2, color=shade)
                panel.set_ylabel((series.name + " (" + series.units + ")"), color=shade, fontsize=9)
                panel.tick_params(axis='y', colors=shade)
                if idx == n_vars - 1:
                    # the last variable has been plotted
                    pf.format_date_axis(panel, fig)

            axes[0].set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
            sfile = '_'.join((fname, 'timeseries'))
            pf.save_fig(save_dir, sfile)

예제 #4

0

파일 보기

def plot(x, y, title, stdev=None, line_style='.', g_ranges=False, color=None, interactive=False):
    """
    Plot y against x as a single series, optionally dropping outliers first.

    When stdev is given, the points rejected by reject_outliers are removed
    from both ``x['data']`` and ``y['data']`` in place, so the dictionaries
    passed in by the caller are modified.

    :param x: Dictionary must be in the form:
    {'data': numpy data array ,
    'info': {'label': axis label, 'units': axis units'}}
    :param y: Dictionary in the same form as x
    :param title: axes title
    :param stdev: threshold handed to reject_outliers; None skips the rejection
    :param line_style: matplotlib format string for the series
    :param g_ranges: when truthy, add_global_ranges is called and its result is
        appended to the legend
    :param color: colour forwarded to plt.plot
    :param interactive: when equal to True the plotted line is made pickable
    :return: fig, ax
    """
    if stdev is None:
        outlier_text = ''
    else:
        ind = reject_outliers(y['data'], stdev)
        y['data'] = y['data'][ind]
        x['data'] = x['data'][ind]
        outliers = str(len(ind) - sum(ind))
        # backslashes are doubled so the mathtext commands are not treated as
        # (invalid) Python escape sequences
        outlier_text = 'n removed $\\pm$ {}$\\sigma: $ {}'.format(stdev, outliers)

    fig, ax = plt.subplots()
    plt.grid()

    # Equality (not truthiness) is kept so that non-boolean flags behave as before.
    picker_kwargs = {'picker': True} if interactive == True else {}  # noqa: E712
    plt.plot(x['data'], y['data'], line_style, linewidth=1, markersize=3, color=color, **picker_kwargs)

    ax.set_title(title)

    # Format legend; nanmax/nanmin raise ValueError on empty data
    try:
        leg_text = ('$\\max:$ {:6.4f}\n$\\min:$ {:6.4f}\n{}'.format(
            np.nanmax(y['data']), np.nanmin(y['data']), outlier_text),)
    except ValueError:
        leg_text = ()

    if g_ranges:
        gr = add_global_ranges(ax, y)
        leg_text += ('Global Ranges\n$\\max$: {} \n$\\min$: {}'.format(gr[1], gr[0]),)

    ax.legend(leg_text, loc='best', fontsize=8)
    format_axes(ax)
    set_labels(ax, x['info'], y['info'])
    return fig, ax

예제 #5

0

파일 보기

파일: plotting.py 프로젝트: leilabbb/data-review-tools

def plot_timeseries(x, y, y_name, stdev=None):
    """
    Draw a simple timeseries plot of y against x.

    With stdev set, extreme values are removed first (cf.reject_extreme_values)
    and outliers second (cf.reject_outliers). If nothing is left after the
    first step, an empty gridded figure is returned.

    :param x: array containing data for x-axis (e.g. time); anything that is
        not a numpy array is replaced by its ``.values``
    :param y: .nc data array for plotting on the y-axis, including data values, coordinates, and variable attributes
    :param y_name: name shown in the y-axis label
    :param stdev: desired standard deviation to exclude from plotting
    :return: fig, ax
    """
    yval = y if type(y) is np.ndarray else y.values

    if type(x) is not np.ndarray:
        x = x.values

    if stdev is not None:
        keep = cf.reject_extreme_values(yval)
        y_kept = yval[keep]
        x_kept = x[keep]

        if len(x_kept) > 0:
            inliers = cf.reject_outliers(y_kept, stdev)
            plot_y = y_kept[inliers]
            plot_x = x_kept[inliers]
            n_removed = str(len(y) - len(plot_y))
            leg_text = ('removed {} outliers (SD={})'.format(n_removed, stdev), )
        else:
            # nothing survived the extreme-value filter
            plot_x = []
    else:
        plot_x = x
        plot_y = yval
        leg_text = ()

    fig, ax = plt.subplots()
    plt.grid()
    if len(plot_x) == 0:
        return fig, ax

    plt.plot(plot_x, plot_y, '.', markersize=2)

    y_units = get_units(y)

    ax.set_ylabel((y_name + " (" + y_units + ")"), fontsize=9)
    format_date_axis(ax, fig)
    y_axis_disable_offset(ax)
    ax.legend(leg_text, loc='best', fontsize=6)

    return fig, ax

예제 #6

0

파일 보기

파일: plotting.py 프로젝트: leilabbb/data-review-tools

def plot_profiles(x, y, t, ylabel, xlabel, clabel, stdev=None):
    """
    Create a profile plot for mobile instruments

    Each of x, y and t may be a numpy array, a list, or an object exposing
    ``.values`` (which is then unwrapped). Each input is checked against its
    own type; previously y and x were checked against the type of t, so a list
    passed next to a non-list t raised AttributeError.

    :param x: .nc data array containing data for plotting variable of interest (e.g. density)
    :param y: .nc data array containing data for plotting on the y-axis (e.g. pressure)
    :param t: .nc data array containing time data to be used for coloring (x,y) data pairs
    :param ylabel: label for the y-axis
    :param xlabel: label for the x-axis
    :param clabel: label for the colorbar
    :param stdev: desired standard deviation to exclude from plotting
    :return: fig, ax
    """
    if not isinstance(t, (np.ndarray, list)):
        t = t.values

    if not isinstance(y, (np.ndarray, list)):
        y = y.values

    if not isinstance(x, (np.ndarray, list)):
        x = x.values

    if stdev is None:
        xD = x
        yD = y
        tD = t
        leg_text = ()
    else:
        # lists cannot be indexed with the selection returned below, so make
        # sure all three are arrays (a no-op for inputs that already are)
        x = np.asarray(x)
        y = np.asarray(y)
        t = np.asarray(t)

        ind2 = cf.reject_outliers(x, stdev)
        xD = x[ind2]
        yD = y[ind2]
        tD = t[ind2]
        outliers = str(len(x) - len(xD))
        leg_text = ('removed {} outliers (SD={})'.format(outliers, stdev), )

    fig, ax = plt.subplots()
    plt.margins(y=.08, x=.02)
    plt.grid()
    sct = ax.scatter(xD, yD, c=tD, s=2, edgecolor='None', cmap='rainbow')
    cbar = plt.colorbar(sct, label=clabel)
    # relabel the colorbar ticks as calendar dates
    cbar.ax.set_yticklabels(
        pd.to_datetime(cbar.ax.get_yticks()).strftime(date_format='%Y-%m-%d'))

    ax.invert_yaxis()
    ax.set_xlabel(xlabel, fontsize=9)
    ax.set_ylabel(ylabel, fontsize=9)
    ax.legend(leg_text, loc='best', fontsize=6)

    return fig, ax

예제 #7

0

파일 보기

def plot_outlier_comparison(x, y, title, stdev = 1, line_style='r-o', g_range=False):
    """
    Plot a series twice, stacked: all points on top, outliers removed below.

    The two panels share their x-axis. The legend on the lower panel reports
    the max/min of the retained points and how many points were removed.
    Unlike ``plot``, the input dictionaries are not modified.

    :param x: Dictionary must be in the form:
    {'data': numpy data array ,
    'info': {'label': axis label, 'units': axis units'}}
    :param y: Dictionary in the same form as x
    :param title: title placed on the upper panel
    :param stdev: threshold handed to reject_outliers
    :param line_style: matplotlib format string used in both panels
    :param g_range: when truthy, add_global_ranges is called on the lower panel
        and its result appended to the legend
    :return: ax1, ax2
    """
    ind = reject_outliers(y['data'], stdev)
    outliers = str(len(ind) - sum(ind))
    # backslashes are doubled so the mathtext commands are not treated as
    # (invalid) Python escape sequences
    outlier_text = 'n removed $\\pm$ {}$\\sigma: $ {}'.format(stdev, outliers)

    # upper panel: every point
    ax1 = plt.subplot(211)
    plt.plot(x['data'], y['data'], line_style, linewidth=2, markersize=2)
    plt.grid()
    format_axes(ax1)

    x_kept = x['data'][ind]
    y_kept = y['data'][ind]

    # Format legend
    leg_text = ('$\\max:$ {:6.4f}\n$\\min:$ {:6.4f}\n{}'.format(
        np.nanmax(y_kept), np.nanmin(y_kept), outlier_text),)

    # lower panel: outliers removed
    ax2 = plt.subplot(212, sharex=ax1)
    plt.grid()
    plt.plot(x_kept, y_kept, line_style, linewidth=2, markersize=2)
    format_axes(ax2)

    ax1.set_title(title)
    if g_range:
        gr = add_global_ranges(ax2, y)
        leg_text += ('Global Ranges $\\max$: {} $\\min$: {}'.format(gr[1], gr[0]),)
    ax2.legend(leg_text, loc='best', fontsize=8)

    y_label = y['info']['label'] + " (" + y['info']['units'] + ")"
    ax1.set_ylabel(y_label)
    ax2.set_ylabel(y_label)
    ax2.set_xlabel(x['info']['label'] + " (" + x['info']['units'] + ")")
    return ax1, ax2

예제 #8

0

파일 보기

파일: plotting.py 프로젝트: leilabbb/data-review-tools

def plot_timeseries_panel(ds, x, vars, colors, stdev=None):
    """
    Create a timeseries plot with horizontal panels of each science parameter

    :param ds: dataset (e.g. .nc file opened with xarray) containing data for plotting
    :param x: array containing data for x-axis (e.g. time)
    :param vars: list of science variables to plot
    :param colors: list of colors to be used for plotting
    :param stdev: desired standard deviation to exclude from plotting
    :return: fig, ax -- ax is whatever plt.subplots returned (a single Axes
        when only one variable is plotted, otherwise an array of Axes)
    """
    fig, ax = plt.subplots(len(vars), sharex=True)

    # plt.subplots hands back a bare Axes (not an array) for a single panel;
    # wrap it so the panels can always be indexed.
    panels = [ax] if len(vars) == 1 else ax
    last_index = len(vars) - 1

    for i, var_name in enumerate(vars):
        y = ds[var_name]

        if stdev is None:
            yD = y.values
            xD = x
            leg_text = ()
        else:
            ind = cf.reject_extreme_values(y.values)
            ydata = y[ind]
            xdata = x[ind]

            ind2 = cf.reject_outliers(ydata.values, stdev)
            yD = ydata[ind2].values
            xD = xdata[ind2]
            outliers = str(len(y) - len(yD))
            leg_text = ('{}: rm {} outliers'.format(var_name, outliers), )

        y_units = get_units(y)
        panel = panels[i]
        panel.plot(xD, yD, '.', markersize=2, color=colors[i])
        panel.set_ylabel(('(' + y_units + ')'), fontsize=5)
        panel.tick_params(axis='y', labelsize=6)
        panel.legend(leg_text, loc='best', fontsize=4)
        y_axis_disable_offset(panel)
        if i == last_index:  # if the last variable has been plotted
            format_date_axis(panel, fig)

    return fig, ax

예제 #9

0

파일 보기

파일: plotting.py 프로젝트: leilabbb/data-review-tools

def plot_timeseries_all(x, y, y_name, y_units, stdev=None):
    """
    Draw a simple timeseries plot of y against x.

    With stdev set, extreme values are removed first (cf.reject_extreme_values)
    and outliers second (cf.reject_outliers); the legend then reports how many
    points were dropped in total.

    :param x: array containing data for x-axis (e.g. time)
    :param y: array containing data for y-axis
    :param y_name: name shown in the y-axis label
    :param y_units: units shown in the y-axis label
    :param stdev: desired standard deviation to exclude from plotting
    :return: fig, ax
    """
    # defaults: plot everything, empty legend
    plot_x = x
    plot_y = y
    leg_text = ()

    if stdev is not None:
        keep = cf.reject_extreme_values(y)
        y_kept = y[keep]
        x_kept = x[keep]

        inliers = cf.reject_outliers(y_kept, stdev)
        plot_y = y_kept[inliers]
        plot_x = x_kept[inliers]

        n_removed = str(len(y) - len(plot_y))
        leg_text = ('removed {} outliers (SD={})'.format(n_removed, stdev), )

    fig, ax = plt.subplots()
    plt.grid()
    plt.plot(plot_x, plot_y, '.', markersize=2)

    y_label = y_name + " (" + y_units + ")"
    ax.set_ylabel(y_label, fontsize=9)
    format_date_axis(ax, fig)
    y_axis_disable_offset(ax)
    ax.legend(leg_text, loc='best', fontsize=6)
    return fig, ax

예제 #10

0

파일 보기

파일: plot_ts.py 프로젝트: ooi-data-lab/data-review-tools

def main(sDir, url_list, start_time, end_time, preferred_only):
    """
    Create temperature-salinity (T-S) plots for every dataset behind url_list.

    The URLs are grouped by reference designator. For each dataset file the
    salinity, temperature, time and pressure arrays are filtered in lockstep
    (NaNs / zeros / fill values, then global ranges, then outliers), a grid of
    densities is computed at the median pressure, and the figure produced by
    pf.plot_ts is saved under sDir/<array>/<subsite>/<refdes>/ts_plots.

    :param sDir: root directory under which the plots are saved
    :param url_list: list of URLs; the second-to-last path element of each is
        split on '-' to build the reference designator
    :param start_time: start of the time slice; only applied when end_time is also given
    :param end_time: end of the time slice; only applied when start_time is also given
    :param preferred_only: the string 'yes' restricts plotting to the preferred
        streams/deployments; any other value plots every dataset
    """
    # collect the unique reference designators found in the URLs
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        # gather the dataset files of every URL that belongs to this reference designator
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        # flatten the list of lists
        datasets = list(itertools.chain(*datasets))
        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    rms = '-'.join((r, row[ii]))
                    # keep a file only when both its refdes-method-stream and
                    # its deployment match the preferred row
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        for fd in fdatasets:
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                # optional time slice; skip the file when nothing is left
                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                save_dir = os.path.join(sDir, array, subsite, refdes, 'ts_plots')
                cf.create_dir(save_dir)

                tme = ds['time'].values
                t0 = pd.to_datetime(tme.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tme.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))
                # file name: fname without its last '_' element, plus 'ts' and the start date
                filename = '-'.join(('_'.join(fname.split('_')[:-1]), 'ts', t0[:10]))

                ds_vars = list(ds.data_vars.keys())
                raw_vars = cf.return_raw_vars(ds_vars)

                # salinity variable, values and fill value
                xvar = return_var(ds, raw_vars, 'salinity', 'Practical Salinity')
                sal = ds[xvar].values
                sal_fv = ds[xvar]._FillValue

                # temperature variable, values and fill value
                yvar = return_var(ds, raw_vars, 'temp', 'Seawater Temperature')
                temp = ds[yvar].values
                temp_fv = ds[yvar]._FillValue

                # pressure: look in the coordinates first, then in the data variables
                press = pf.pressure_var(ds, list(ds.coords.keys()))
                if press is None:
                    press = pf.pressure_var(ds, list(ds.data_vars.keys()))
                p = ds[press].values

                # From here on sal, temp, tme and p are always filtered with
                # the same index so the four arrays stay aligned.

                # get rid of nans, 0.0s, fill values
                sind1 = (~np.isnan(sal)) & (sal != 0.0) & (sal != sal_fv)
                sal = sal[sind1]
                temp = temp[sind1]
                tme = tme[sind1]
                p = p[sind1]
                tind1 = (~np.isnan(temp)) & (temp != 0.0) & (temp != temp_fv)
                sal = sal[tind1]
                temp = temp[tind1]
                tme = tme[tind1]
                p = p[tind1]

                # reject values outside global ranges:
                # (salinity first; the self-assignments below are no-ops that
                # leave the data untouched when no global range is available)
                global_min, global_max = cf.get_global_ranges(r, xvar)
                if any(e is None for e in [global_min, global_max]):
                    sal = sal
                    temp = temp
                    tme = tme
                    p = p
                else:
                    sgr_ind = cf.reject_global_ranges(sal, global_min, global_max)
                    sal = sal[sgr_ind]
                    temp = temp[sgr_ind]
                    tme = tme[sgr_ind]
                    p = p[sgr_ind]

                # same global-range check for temperature
                global_min, global_max = cf.get_global_ranges(r, yvar)
                if any(e is None for e in [global_min, global_max]):
                    sal = sal
                    temp = temp
                    tme = tme
                    p = p
                else:
                    tgr_ind = cf.reject_global_ranges(temp, global_min, global_max)
                    sal = sal[tgr_ind]
                    temp = temp[tgr_ind]
                    tme = tme[tgr_ind]
                    p = p[tgr_ind]

                # get rid of outliers
                # (salinity outliers first, then temperature outliers among what remains)
                soind = cf.reject_outliers(sal, 5)
                sal = sal[soind]
                temp = temp[soind]
                tme = tme[soind]
                p = p[soind]

                toind = cf.reject_outliers(temp, 5)
                sal = sal[toind]
                temp = temp[toind]
                tme = tme[toind]
                p = p[toind]

                if len(sal) > 0:  # if there are any data to plot

                    # one colour per remaining sample, in sample order
                    colors = cm.rainbow(np.linspace(0, 1, len(tme)))

                    # Figure out boundaries (mins and maxes)
                    # NOTE(review): the padding is a fraction of the value
                    # itself, so for negative minima/maxima it moves the limit
                    # inward rather than outward -- confirm whether negative
                    # temperatures can reach this code.
                    #smin = sal.min() - (0.01 * sal.min())
                    #smax = sal.max() + (0.01 * sal.max())
                    if sal.max() - sal.min() < 0.2:
                        smin = sal.min() - (0.0005 * sal.min())
                        smax = sal.max() + (0.0005 * sal.max())
                    else:
                        smin = sal.min() - (0.001 * sal.min())
                        smax = sal.max() + (0.001 * sal.max())

                    # temperature padding grows with the temperature spread
                    if temp.max() - temp.min() <= 1:
                        tmin = temp.min() - (0.01 * temp.min())
                        tmax = temp.max() + (0.01 * temp.max())
                    elif 1 < temp.max() - temp.min() < 1.5:
                        tmin = temp.min() - (0.05 * temp.min())
                        tmax = temp.max() + (0.05 * temp.max())
                    else:
                        tmin = temp.min() - (0.1 * temp.min())
                        tmax = temp.max() + (0.1 * temp.max())

                    # Calculate how many gridcells are needed in the x and y directions and
                    # Create temp and sal vectors of appropriate dimensions
                    # salinity axis: 0.1 spacing starting at smin, at least two points
                    xdim = int(round((smax-smin)/0.1 + 1, 0))
                    if xdim == 1:
                        xdim = 2
                    si = np.linspace(0, xdim - 1, xdim) * 0.1 + smin

                    # temperature axis: spacing of 0.75, 0.1 or 1 depending on the spread
                    if 1.1 <= temp.max() - temp.min() < 1.7:  # if the diff between min and max temp is small
                        ydim = int(round((tmax-tmin)/0.75 + 1, 0))
                        ti = np.linspace(0, ydim - 1, ydim) * 0.75 + tmin
                    elif temp.max() - temp.min() < 1.1:
                        ydim = int(round((tmax - tmin) / 0.1 + 1, 0))
                        ti = np.linspace(0, ydim - 1, ydim) * 0.1 + tmin
                    else:
                        ydim = int(round((tmax - tmin) + 1, 0))
                        ti = np.linspace(0, ydim - 1, ydim) + tmin

                    # Create empty grid of zeros
                    mdens = np.zeros((ydim, xdim))

                    # Loop to fill in grid with densities
                    for j in range(0, ydim):
                        for i in range(0, xdim):
                            mdens[j, i] = gsw.density.rho(si[i], ti[j], np.median(p))  # calculate density using median pressure value

                    fig, ax = pf.plot_ts(si, ti, mdens, sal, temp, colors)

                    ax.set_title((title + '\n' + t0 + ' - ' + t1 + '\ncolors = time (cooler: earlier)'), fontsize=9)
                    # NOTE(review): this count covers every value removed above
                    # (NaNs, zeros, fill values, global ranges and outliers),
                    # not only the SD=5 outliers the label mentions.
                    leg_text = ('Removed {} values (SD=5)'.format(len(ds[xvar].values) - len(sal)),)
                    ax.legend(leg_text, loc='best', fontsize=6)
                    pf.save_fig(save_dir, filename)

예제 #11

0

파일 보기

파일: plotting.py 프로젝트: leilabbb/data-review-tools

def plot_xsection(subsite,
                  x,
                  y,
                  z,
                  clabel,
                  ylabel,
                  t_eng=None,
                  m_water_depth=None,
                  inpercentile=None,
                  stdev=None):
    """
    Create a cross-section plot for mobile instruments

    For gliders ('MOAS' in subsite) values of z that are <= 0.0 are replaced
    by NaN directly in the z array, i.e. the array is modified in place.

    :param subsite: subsite part of reference designator to plot
    :param x:  array containing data for x-axis (e.g. time)
    :param y: .nc data array containing data for plotting on the y-axis (e.g. pressure)
    :param z: .nc data array containing data for plotting variable of interest (e.g. density)
    :param clabel: label for the colorbar
    :param ylabel: label for the y-axis
    :param t_eng: .nc data array containing engineering timestamps (to plot water depth)
    :param m_water_depth: .nc data array containing water depth data from the engineering data stream
    :param inpercentile: percentile of data to exclude from plot
    :param stdev: desired standard deviation to exclude from plotting
    :return: fig, ax, bar -- all three are None when plotting raised a ValueError
    """
    if type(z) is not np.ndarray:
        z = z.values

    if type(y) is not np.ndarray:
        y = y.values

    if type(x) is not np.ndarray:
        x = x.values

    # Counts reported in the legend; None means "that filter was not applied".
    zeros = None
    outliers = None

    # when plotting gliders, remove zeros (glider fill values) and negative numbers
    if 'MOAS' in subsite:
        z[z <= 0.0] = np.nan
        zeros = str(len(z) - np.count_nonzero(~np.isnan(z)))

    if stdev is None:
        xD = x
        yD = y
        zD = z
    else:
        ind = cf.reject_extreme_values(z)
        xdata = x[ind]
        ydata = y[ind]
        zdata = z[ind]

        ind2 = cf.reject_outliers(zdata, stdev)
        xD = xdata[ind2]
        yD = ydata[ind2]
        zD = zdata[ind2]
        outliers = str(len(zdata) - len(zD))

    fig, ax = plt.subplots()
    plt.margins(y=.08, x=.02)
    try:
        xc = ax.scatter(xD, yD, c=zD, s=2, edgecolor='None')
        ax.invert_yaxis()

        # add bathymetry for coastal gliders
        if t_eng is not None and m_water_depth is not None:
            if len(t_eng) > 1:
                ax.fill_between(t_eng,
                                m_water_depth,
                                np.max(m_water_depth) + 2,
                                facecolor='k',
                                alpha=0.4)

        # add color bar
        bar = fig.colorbar(xc, ax=ax, label=clabel, extend='both')
        bar.formatter.set_useOffset(False)
        bar.ax.tick_params(labelsize=8)

        if inpercentile is not None:
            # NaN-aware percentiles: glider data may hold the NaNs set above
            upper_lim = np.nanpercentile(zD, 100 - inpercentile)
            lower_lim = np.nanpercentile(zD, inpercentile)
            # Colour limits are set on the scatter itself; Colorbar.set_clim
            # and the update_ticks keyword no longer exist in current Matplotlib.
            xc.set_clim(lower_lim, upper_lim)
            bar.set_ticks([lower_lim, upper_lim])

        ax.set_ylabel(ylabel, fontsize=9)
        format_date_axis(ax, fig)

        # legend summarising whatever was removed (no legend if nothing was)
        removed = []
        if zeros is not None:
            removed.append('rm: {} values <=0.0'.format(zeros))
        if outliers is not None:
            removed.append('rm: {} outliers (SD={})'.format(outliers, stdev))
        if removed:
            ax.legend((', '.join(removed), ), loc=1, fontsize=6)
    except ValueError:
        print("plot can't be generated")
        fig = None
        ax = None
        bar = None

    return fig, ax, bar

예제 #12

0

파일 보기

파일: plotting.py 프로젝트: leilabbb/data-review-tools

def plot_timeseries_compare(t0, t1, var0, var1, m0, m1, long_name, stdev=None):
    """
    Overlay two datasets of the same variable on one timeseries plot.

    Dataset 0 is drawn as open red circles, dataset 1 as small dots. With
    stdev set, each dataset is filtered with cf.reject_extreme_values followed
    by cf.reject_outliers, and its legend entry reports the number removed.

    :param t0: data array of time for dataset 0
    :param t1: data array of time for dataset 1
    :param var0: .nc data array for plotting on the y-axis for dataset 0, including data values and variable attributes
    :param var1: .nc data array for plotting on the y-axis for dataset 1, including data values and variable attributes
    :param m0: legend label for dataset 0
    :param m1: legend label for dataset 1
    :param long_name: variable name used in the y-axis label
    :param stdev: desired standard deviation to exclude from plotting
    :return: fig, ax
    """

    def _prepare(times, var, label):
        # Return (time values, data values, legend entry) for one dataset.
        if stdev is None:
            return times.values, var.values, '{}'.format(label)

        keep = cf.reject_extreme_values(var.values)
        times_kept = times[keep]
        var_kept = var[keep]

        inliers = cf.reject_outliers(var_kept.values, stdev)
        time_vals = times_kept[inliers].values
        data_vals = var_kept[inliers].values

        n_dropped = len(var) - len(data_vals)
        n_nan = len(time_vals) - np.count_nonzero(~np.isnan(data_vals))
        removed = str(n_dropped + n_nan)
        entry = '{}: removed {} outliers (SD={})'.format(label, removed, stdev)
        return time_vals, data_vals, entry

    t0_data, var0_data, entry0 = _prepare(t0, var0, m0)
    t1_data, var1_data, entry1 = _prepare(t1, var1, m1)
    leg_text = (entry0, entry1)

    y_units = get_units(var0)

    fig, ax = plt.subplots()
    plt.grid()

    # dataset 0: open red circles
    ax.plot(t0_data, var0_data, 'o', markerfacecolor='none', markeredgecolor='r', markersize=5, lw=.75)
    # dataset 1: small dots
    ax.plot(t1_data, var1_data, '.', markeredgecolor='b', markersize=2)

    y_label = long_name + " (" + y_units + ")"
    ax.set_ylabel(y_label, fontsize=9)
    format_date_axis(ax, fig)
    y_axis_disable_offset(ax)
    ax.legend(leg_text, loc='best', fontsize=6)
    return fig, ax

예제 #13

0

파일 보기

def main(url_list, sDir, plot_type, deployment_num, start_time, end_time):
    """Plot depth profiles and per-depth-bin statistics for each instrument.

    For every instrument identifier parsed from ``url_list`` and every
    method/stream, the science variables are appended file by file, fill
    values, NaNs and extreme values are rejected, a profile plot is drawn
    and a ``<name>_statistics.csv`` file plus a statistics figure are
    written below ``sDir``.

    url_list : paths to instrument data by methods
    sDir : path to the directory on your machine to save files
    plot_type : folder name for a plot type
    deployment_num : deployment number to keep; None keeps every deployment
    start_time, end_time : bounds of the time slice, applied only when both
        are not None
    """
    # Threshold handed to cf.reject_outliers for every depth bin; the legend
    # reports this same value so the label cannot drift from the filter.
    outlier_sd = 5

    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments
        # and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join(
                (splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)

        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # separate the data files by methods
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update(
                                {y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict,
                                                       ms)

            print('\nAppending data from files: {}'.format(ms))
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print(fd)

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print(
                            'No data to plot for specified time range: ({} to {})'
                            .format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(
                    fd)

                if deployment_num is not None:
                    # Compare by value: `is not` tests object identity and is
                    # not a reliable integer comparison.
                    # NOTE(review): split('0')[-1] is '' when the deployment
                    # string ends in '0' (int('') raises); the deployment
                    # loop above parses with [-4:] instead - confirm format.
                    if int(deployment.split('0')[-1]) != deployment_num:
                        print(type(int(deployment.split('0')[-1])),
                              type(deployment_num))
                        continue

                save_dir = os.path.join(sDir, array, subsite, refdes,
                                        plot_type,
                                        ms.split('-')[0], deployment)
                cf.create_dir(save_dir)

                for var, sh in sci_vars_dict[ms]['vars'].items():
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(
                            t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(
                            t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # pick the pressure variable to add to the dictionary
                        # of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                            else:
                                pressure = 'int_ctd_pressure'
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())

                        y = ds[pressure].values
                        if ds[pressure].units not in y_unit:
                            y_unit.append(ds[pressure].units)
                        if ds[pressure].long_name not in y_name:
                            y_name.append(ds[pressure].long_name)

                        sh['pressure'] = np.append(sh['pressure'], y)

                # NOTE(review): y_unit / y_name are rebound from list to str
                # below while still inside the per-file loop, so a later file
                # carrying a different unit or long name would call .append
                # on a str - confirm whether this section belongs after the
                # file loop.
                if len(y_unit) != 1:
                    print('pressure unit varies UHHHHHHHHH')
                else:
                    y_unit = y_unit[0]

                if len(y_name) != 1:
                    print('pressure long name varies UHHHHHHHHH')
                else:
                    y_name = y_name[0]

                for m, n in sci_vars_dict.items():
                    for sv, vinfo in n['vars'].items():
                        print(sv)
                        if len(vinfo['t']) < 1:
                            print('no variable data to plot')
                            # nothing was appended for this variable, so do
                            # not fall through with stale arrays
                            continue

                        sv_units = vinfo['units'][0]
                        fv = vinfo['fv'][0]
                        t0 = pd.to_datetime(min(
                            vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(
                            vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t = vinfo['t']
                        z = vinfo['values']
                        y = vinfo['pressure']

                        title = ' '.join((r, ms.split('-')[1]))

                        # Check if the array is all NaNs
                        if sum(np.isnan(z)) == len(z):
                            print('Array of all NaNs - skipping plot.')
                            continue

                        # Check if the array is all fill values
                        if len(z[z != fv]) == 0:
                            print('Array of all fill values - skipping plot.')
                            continue

                        # reject fill values
                        fv_ind = z != fv
                        y_nofv = y[fv_ind]
                        t_nofv = t[fv_ind]
                        z_nofv = z[fv_ind]
                        print(len(z) - len(z_nofv), ' fill values')

                        # reject NaNs; the mask must be built from the
                        # already-filtered array so its length matches the
                        # arrays it indexes
                        nan_ind = ~np.isnan(z_nofv)
                        t_nofv_nonan = t_nofv[nan_ind]
                        y_nofv_nonan = y_nofv[nan_ind]
                        z_nofv_nonan = z_nofv[nan_ind]
                        print(len(z_nofv) - len(z_nofv_nonan), ' NaNs')

                        # reject extreme values
                        ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                        t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                        y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                        z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                        colors = cm.rainbow(
                            np.linspace(0, 1, len(t_nofv_nonan_noev)))
                        print(
                            len(z_nofv_nonan) - len(z_nofv_nonan_noev),
                            ' Extreme Values', '|1e7|')

                        if len(y_nofv_nonan_noev) == 0:
                            # min()/max() below would raise on empty data
                            print('No data left after filtering - skipping plot.')
                            continue

                        if m == 'common_stream_placeholder':
                            sname = '-'.join((r, sv))
                        else:
                            sname = '-'.join((r, m, sv))

                        # Plot all data
                        ylabel = y_name + " (" + y_unit + ")"
                        xlabel = sv + " (" + sv_units + ")"
                        clabel = sv + " (" + sv_units + ")"

                        fig, ax = pf.plot_profiles(z_nofv_nonan_noev,
                                                   y_nofv_nonan_noev,
                                                   colors,
                                                   xlabel,
                                                   ylabel,
                                                   stdev=None)
                        ax.set_title((
                            title + '\n' + str(deployment_num) + ': ' + t0 +
                            ' - ' + t1 + '\n' +
                            'used bin = 2 dbar to calculate an average profile (black line) and 3-STD envelope (shaded area)'
                        ),
                                     fontsize=9)

                        # group by depth range
                        columns = ['time', 'pressure', str(sv)]
                        # ranges = [0, 50, 100, 200, 400, 600]
                        ranges = list(
                            range(int(round(min(y_nofv_nonan_noev))),
                                  int(round(max(y_nofv_nonan_noev))), 1))
                        groups, d_groups = gt.group_by_depth_range(
                            t_nofv_nonan_noev, y_nofv_nonan_noev,
                            z_nofv_nonan_noev, columns, ranges)

                        # write the per-group summary, keeping only rows
                        # whose mean is not null
                        sv_stats = groups.describe()[sv]
                        ind = sv_stats['mean'].notnull()
                        sv_stats[ind].to_csv(
                            '{}/{}_statistics.csv'.format(save_dir, sname),
                            index=True)

                        tm = 1
                        # NOTE(review): with nrows=2 pyplot.subplots returns
                        # an array of Axes, so the ax.scatter / ax.plot calls
                        # below would fail on it - confirm which subplot is
                        # meant to receive the data.
                        fig, ax = pyplot.subplots(nrows=2, ncols=1)
                        pyplot.margins(y=.08, x=.02)
                        pyplot.grid()
                        y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr = [], [], [], [], [], [], []

                        for ii in range(len(groups)):

                            # each group occupies three consecutive d_groups
                            # columns; tm advances by 2 on top of ii's 1
                            nan_ind = d_groups[ii + tm].notnull()
                            xtime = d_groups[ii + tm][nan_ind]
                            colors = cm.rainbow(np.linspace(0, 1, len(xtime)))
                            ypres = d_groups[ii + tm + 1][nan_ind]
                            nval = d_groups[ii + tm + 2][nan_ind]
                            tm += 2

                            ind2 = cf.reject_outliers(nval, outlier_sd)
                            xD = nval[ind2]
                            yD = ypres[ind2]
                            nZ = colors[ind2]
                            outliers = str(len(nval) - len(xD))
                            leg_text = ('removed {} outliers (SD={})'.format(
                                outliers, outlier_sd), )

                            ax.scatter(xD, yD, c=nZ, s=2, edgecolor='None')
                            ax.invert_yaxis()
                            ax.set_xlabel(clabel, fontsize=9)
                            ax.set_ylabel(ylabel, fontsize=9)
                            ax.legend(leg_text, loc='best', fontsize=6)
                            ax.set_title((title + '\n' + t0 + ' - ' + t1),
                                         fontsize=9)

                            # statistics use the full bin (before outlier
                            # rejection)
                            l_arr.append(
                                len(nval)
                            )  #  count of data to filter out small groups
                            y_avg.append(ypres.mean())
                            n_avg.append(nval.mean())
                            n_min.append(nval.min())
                            n_max.append(nval.max())
                            n0_std.append(nval.mean() + 3 * nval.std())
                            n1_std.append(nval.mean() - 3 * nval.std())

                        # average profile and +/- 3 standard deviation band
                        ax.plot(n_avg, y_avg, '-k')
                        ax.fill_betweenx(y_avg,
                                         n0_std,
                                         n1_std,
                                         color='m',
                                         alpha=0.2)
                        sfile = '_'.join((sname, 'statistics'))
                        pf.save_fig(save_dir, sfile)