Example #1
0
def align_datetime(data,
                   dim='time',
                   plev='plev',
                   times=(0, 12),
                   span=6,
                   freq='12h',
                   **kwargs):
    """ Standardize datetime to times per date, try to fill gaps

    Snaps each launch time in ``data[dim]`` to the nearest standard
    sounding hour (``times``) within ``+/- span`` hours, resolves
    duplicates that map to the same standard time (preferring the
    sounding with more pressure-level data), and records the applied
    shift and a conflict-resolution flag as new variables.

    Args:
        data (DataArray, Dataset): Input data
        dim (str): datetime dimension
        plev (str): pressure level dimension
        times (tuple): sounding times (hours of day, e.g. (0, 12))
        span (int): plus minus times (smaller than freq/2)
        freq (str): frequency of output times, e.g. '12h'
            # NOTE(review): parsed as int(freq[:-1]) — assumes a
            # single trailing unit character ('12h' works, '12H' or
            # '90min' would not); confirm against callers.
        **kwargs: forwarded to ``ff.message`` (verbosity control,
            presumably — verify against the fun module)

    Returns:
        xarray.DataArray : datetime standardized DataArray
            (a Dataset input returns a Dataset; two variables are
            added: 'delay' — signed hour offset original minus new —
            and 'flag_stdtime' — conflict-resolution flag)

    Raises:
        ValueError: wrong input type, missing datetime dimension, or
            inconsistent ``times`` / ``span`` / ``freq`` combination.
    """
    import numpy as np
    from pandas import DatetimeIndex
    from xarray import DataArray, Dataset
    from .. import fun as ff

    if not isinstance(data, (DataArray, Dataset)):
        raise ValueError('Requires a DataArray or Dataset', type(data))

    if dim not in data.dims:
        raise ValueError('Requires a datetime dimension', dim)

    # Consistency checks between the number of standard times per day,
    # the snapping window (span) and the nominal output frequency.
    if int(24 / (len(times) * 2)) < span:
        raise ValueError("Times and span do not agree!?", times, span)

    if int(24 / int(freq[:-1])) != len(times):
        raise ValueError("Times and freq do not match:", times, freq)

    if span > int(freq[:-1]) // 2:
        raise ValueError(
            "Frequency and Span need to be consistent (span < freq/2): ", freq,
            span)

    dates = data[dim].values.copy()
    #
    # Count levels per date
    #
    # Elementwise snap of each datetime to the nearest standard hour
    # within +/- span (semantics live in ff.cal.fix_datetime).
    _fix_datetime = np.vectorize(ff.cal.fix_datetime)
    newdates = _fix_datetime(dates, span=span)  # (time: 33%)
    # Per-timestep conflict flag: 0 preferred, 1 duplicate shifted by
    # +1h, -1 reverted to its original (unsnapped) time.
    resolution = np.zeros(newdates.size)
    #
    # check for duplicates in standard launch times
    #
    u, c = np.unique(newdates, return_counts=True)
    conflicts = u[c > 1]
    if conflicts.size > 0:
        # Non-missing values per profile, used to rank duplicates.
        counts = _count_data(data, dim=dim, plev=plev)
        ff.message("Conflicts", conflicts.size, newdates.size, **kwargs)
        for i in conflicts:
            indices = np.where(newdates == i)[0]  # numbers  (time: 45%)
            #
            # Count available data (DataArray or Dataset)
            #
            # slow
            # counts = data.isel(**{dim: indices}).count(plev).values
            # counts = _count_data(data.isel(**{dim: indices}), dim=dim, plev=plev)
            # slow end
            icounts = counts[indices]
            #
            # offsets to standard launch time
            #
            # Absolute distance (in hours) of each conflicting launch
            # from the standard time i.
            offset = np.abs((dates[indices] - i) / np.timedelta64(1, 'h'))
            j = np.argsort(offset)  # sort time offsets (first we want)
            jmax = np.argmax(
                icounts[j])  # check if counts from other time is larger
            if jmax != 0:
                #
                # there is a sounding with more level data (+/- 1 hour)
                #
                # NOTE(review): comment says "+/- 1 hour" but the test
                # fires when the data-richer sounding is at least 1h
                # FARTHER than the closest one — possibly inverted
                # relative to intent; confirm with fix_datetime tests.
                if (offset[j][0] + 1) <= offset[j][jmax]:
                    # ok close enough
                    # Swap positions 0 and jmax in the ordering so the
                    # data-richer sounding becomes the preferred one.
                    jj = j.copy()
                    jj[j ==
                       0] = jmax  # first pos is now at the position of the maximum
                    jj[j == jmax] = 0  # maximum is now first
                    j = jj
                #
                # there is a sounding with + 2 more levels
                #
                elif (icounts[j][0] + 2) <= icounts[j][jmax]:
                    # a lot more
                    jj = j.copy()
                    jj[j ==
                       0] = jmax  # first pos is now at the position of the maximum
                    jj[j == jmax] = 0  # maximum is now first
                    j = jj
                else:
                    pass  # keep time sorting

            # First entry of j is the winner; demote all other
            # duplicates so the standard time is used only once.
            for m, k in enumerate(offset[j]):
                if m == 0:
                    continue  # this is the minimum

                # change back the others or add a delay to remove duplicates
                if k == 0:
                    # Loser already sat exactly on the standard time:
                    # push it 1h forward so it no longer collides.
                    newdates[indices[j][m]] += np.timedelta64(
                        1, 'h')  # add offset
                    resolution[indices[j][m]] = 1  # add hour
                else:
                    newdates[indices[j][m]] = dates[indices[j]
                                                    [m]]  # revert back
                    resolution[indices[j][m]] = -1  # revert back
    #
    # recheck for standard times
    #
    # Boolean mask of timestamps that now fall on a standard hour.
    idx_std = DatetimeIndex(newdates).hour.isin(times)
    u, c = np.unique(newdates[idx_std],
                     return_counts=True)  # check only standard times
    conflicts = u[c > 1]
    ff.message("Conflicts remain:", conflicts.size, idx_std.sum(),
               newdates.size)
    #
    # new dates / new object
    #
    data = data.assign_coords({dim: newdates})
    #
    # delay
    #
    nn = (resolution > 0).sum()  # duplicates shifted by +1h
    nx = (~idx_std).sum()  # timestamps not on a standard hour
    # Signed shift in hours (original minus standardized time).
    data['delay'] = (dim,
                     ((dates - newdates) / np.timedelta64(1, 'h')).astype(int)
                     )  # new coordinate for delays
    data.attrs['std_times'] = str(times)
    data['delay'].attrs['updated'] = nn
    # NOTE(review): 'delay' is an int array, so isnull() is always
    # False here — 'missing' is presumably always 0; verify intent.
    data['delay'].attrs['missing'] = data['delay'].isnull().sum().values
    data['delay'].attrs['times'] = str(times)
    data['flag_stdtime'] = (dim, resolution.astype(int))
    data['flag_stdtime'].attrs.update({
        'units':
        '1',
        'standard_name':
        'flag_standard_time_conflict_resolution',
        'info':
        '0: preferred, -1: lesser candidate, 1: duplicate, less data'
    })
    ff.message('Updated [', nn, "] No Standard [", nx, "] [", newdates.size,
               "]", **kwargs)

    # Snapping/reverting can break monotonic time order; restore it.
    if not all(data[dim].values == np.sort(data[dim].values)):
        ff.message("Sorting by", dim, **kwargs)
        data = data.sortby(dim)
    return data