def quantiles_at_breakpoint(data, var, dvar=None, quantilen=None, ibreak=None, sample_size=730, borders=180, verbose=0):
    """Calculate Quantiles at the breakpoints
    """
    from departures import qmap_departure
    from support_functions import sample_indices, qstats
    funcid = '[QAB] '

    if not isinstance(var, str):
        raise ValueError(funcid + "var requires a string")

    if dvar is not None and not isinstance(dvar, str):
        raise ValueError(funcid + "dvar requires a string")

    if dvar is None:
        dvar = var

    print funcid + "Data from Variable: ", dvar
    if not isinstance(data, (pd.DataFrame, pd.Panel)):
        raise ValueError("Require a DataFrame or Panel as input")

    if quantilen is None:
        quantilen = np.arange(0, 101, 10)

    quantilen = quantilen[(quantilen < 100) & (quantilen > 0)]  # drop 0 and 100
    qss = sample_size / (len(quantilen) + 1) / 2  # sample size per quantile
    print funcid + "Quantilen: ", quantilen
    print funcid + "Global Sample size: %d , per quantile(%d): %d" % (sample_size, len(quantilen), qss)
    mlabels = ["Q%d" % i for i in quantilen]
    mlabels.append(">")

    if isinstance(data, pd.DataFrame):
        if not data.columns.isin([var, '%s_breaks' % var]).sum() == 2:
            raise ValueError(funcid + "Variable not found: %s or %s_breaks in %s" % (var, var, str(data.columns)))
        # convert to panel
        if 'p' not in data.columns:

            out = {}
            #  get Breakpoints
            int_breaks = np.where((data['%s_breaks' % var] > 0))[0]
            breaks = data.index[int_breaks]
            nb = len(breaks)
            if nb == 0:
                raise RuntimeError(funcid + "No Breakpoints found in %s and %s_breaks" % (var, var))

            print "Found Breaks: ", nb
            print str(breaks)
            if (int_breaks[-1] + sample_size) > data.shape[0]:
                print funcid + "Reference data set is shorter than 1 year"

            for ib in reversed(range(nb)):
                if ibreak is not None and ibreak != ib:
                    print funcid + "Skipping break ", breaks[ib], " (only processing ", breaks[ibreak], ")"
                    continue
                # ibiased is everything between breakpoints
                # isample is minus the borders -> used to calculate
                ibiased, isample, iref = sample_indices(int_breaks, ib, data.index,
                                                        sample_size=sample_size,
                                                        borders=borders,
                                                        recent=False,
                                                        verbose=verbose - 1)
                # Quantiles at the breakpoint
                b1, c1, quants1 = qstats(data[dvar].values[iref], quantilen, qss)
                b2, c2, quants2 = qstats(data[dvar].values[isample], quantilen, qss)

                if verbose > 0:
                    print funcid + " %s : %s " % (dvar, breaks[ib])
                    print funcid + " Qs(Ref): ", quants1
                    print funcid + " Qs(#): ", c1
                    print funcid + " Qs(Bias): ", quants2
                    print funcid + " Qs(#): ", c2

                out[str(breaks[ib])] = pd.DataFrame({'Ref': quants1.tolist(), 'Bias': quants2.tolist()}, index=mlabels)
            return out

        # when there are pressure levels
        data = data.reset_index().set_index(['date', 'p']).to_panel()

    else:
        if not data.items.isin([var, '%s_breaks' % var]).sum() == 2:
            raise ValueError(funcid + "Variable not found: %s or %s_breaks in %s" % (var, var, str(data.items)))

    # per level
    #  get Breakpoints
    int_breaks = np.where((data['%s_breaks' % var] > 0).any(1))[0]
    breaks = data.major_axis[int_breaks]
    nb = len(breaks)
    if nb == 0:
        raise RuntimeError(funcid + "No Breakpoints found in %s and %s_breaks" % (var, var))

    print "Found Breaks: ", nb
    print str(breaks)
    if (int_breaks[-1] + sample_size) > data.shape[0]:
        print funcid + "Reference data set is shorter than 1 year"

    out = {}

    for ib in reversed(range(nb)):
        if ibreak is not None and ibreak != ib:
            print funcid + "Skipping break ", breaks[ib], " (only processing ", breaks[ibreak], ")"
            continue
        # ibiased is everything between breakpoints
        # isample is minus the borders -> used to calculate
        ibiased, isample, iref = sample_indices(int_breaks, ib, data.major_axis,
                                                sample_size=sample_size,
                                                borders=borders,
                                                recent=False,
                                                verbose=verbose - 1)

        # Quantiles at the breakpoint
        def myqstats(x, quantilen, sample_size):
            # qstats returns (bins, counts, quantile values); keep only the values
            b, c, y = qstats(x, quantilen, sample_size)
            return y

        quants1 = np.apply_along_axis(myqstats,
                                      0,
                                      data[dvar].values[iref],
                                      quantilen,
                                      qss)

        quants2 = np.apply_along_axis(myqstats,
                                      0,
                                      data[dvar].values[isample],
                                      quantilen,
                                      qss)
        out[str(breaks[ib])] = pd.Panel({'Ref': quants1, 'Bias': quants2}, major_axis=mlabels,
                                        minor_axis=data.minor_axis)

    return out
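
# Hypothetical usage sketch for quantiles_at_breakpoint on synthetic data; the
# variable name 't' and the single artificial breakpoint are assumptions, and
# the departures / support_functions helpers imported above must be available.
def _example_quantiles_at_breakpoint():
    dates = pd.date_range('2000-01-01', periods=4 * 365, freq='D')
    example = pd.DataFrame({'t': np.random.normal(0., 1., len(dates))}, index=dates)
    example['t_breaks'] = 0
    example.loc[dates[2 * 365], 't_breaks'] = 1  # mark one break in the middle
    # returns {break date: DataFrame with 'Ref' and 'Bias' quantiles per label}
    return quantiles_at_breakpoint(example, 't', sample_size=730, verbose=1)
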
def quantile_era_correction(data, var, rvar, bvar, quantilen=None, sample_size=730, borders=None,
                            bounded=None, database=False, verbose=0):
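    """Quantile correction of var against a reference series rvar (e.g. from ERA)
    at the breakpoints flagged in bvar; returns breakpoint statistics and the data
    with added '<var>_qecor' and '<rvar>_adj' variables.
    """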
    from departures import qmap_era_departure
    from support_functions import sample_indices

    funcid = '[CQ] '

    if isinstance(var, str):
        var = [var]  # as list

    if isinstance(bvar, str):
        bvar = [bvar] * len(var)  # as list

    if isinstance(rvar, str):
        rvar = [rvar] * len(var)  # as list

    if quantilen is None:
        quantilen = np.arange(0, 101, 10)

    pressure_levels = True
    if isinstance(data, pd.DataFrame):
        if 'p' in data.columns:
            # 2D
            print funcid + " database detected"

            for ivar, jvar in zip(var, bvar):
                if not data.columns.isin([ivar, jvar]).sum() == 2:
                    raise ValueError(funcid + "Variable not found: %s or %s in %s" % (ivar, jvar, str(data.columns)))

            data = data.reset_index().set_index(['date', 'p']).to_panel()
        else:
            # 1D
            pressure_levels = False

    elif isinstance(data, pd.Panel):
        for ivar, jvar in zip(var, bvar):
            if not data.items.isin([ivar, jvar]).sum() == 2:
                raise ValueError(funcid + "Variable not found: %s or %s in %s" % (ivar, jvar, str(data.items)))

    else:
        raise ValueError("Require a DataFrame or Panel as input")

    if pressure_levels:
        data.major_axis.name = 'date'
        dates = data.major_axis

        # pressure levels
        plevels = data.minor_axis.values
        if verbose > 0:
            print funcid + "p-Levels: ", ",".join(["%d" % (ip / 100) for ip in plevels]), ' hPa'

    else:
        dates = data.index

    if bounded is None:
        ubound = None
        lbound = None
    else:
        lbound, ubound = bounded

    # per-quantile sample size
    qss = sample_size / len(quantilen) / 2
    if verbose > 0:
        print funcid + "Sample size per quantile: %d (global: %d)" % (qss, sample_size)
        print funcid + "Quantiles: %d" % len(quantilen)

    breakpoint_stat = {}

    for ivar, zvar, ibvar in zip(var, rvar, bvar):
        # BREAKS
        if pressure_levels:
            int_breaks = np.where((data[ibvar] > 0).any(1))[0]
        else:
            int_breaks = np.where((data[ibvar] > 0))[0]

        breaks = dates[int_breaks]

        if (int_breaks[-1] + sample_size) > dates.shape[0]:
            print funcid + "Reference data set is shorter than 1 year"

        # copy
        data["%s_qecor" % ivar] = data[ivar].copy()
        xdata = data["%s_qecor" % ivar].values  # Numpy Array (time x p-levels)

        nb = len(breaks)
        # Correct zvar to fit var in Reference Period and use
        # zvar to quantile match the rest of the timeseries

        data['%s_adj' % zvar] = data[zvar].copy()
        ydata = data["%s_adj" % zvar].values  # Numpy Array (time x p-levels)
        # jvar = data.items.get_loc("%s_adj" % zvar)

        if pressure_levels:
            for i in range(xdata.shape[1]):
                # data[ivar].values[:, i],
                # data[zvar].values[:, i],
                q_dep = qmap_era_departure(xdata[:, i], ydata[:, i], slice(int_breaks[-1], None), quantilen, qss)
                if bounded is not None:
                    tmp_qad = ydata[:, i] + q_dep  # data.values[jvar, :, i] + q_dep
                    q_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, q_dep)

                # data.iloc[jvar, :, i] += q_dep
                ydata[:, i] += q_dep
        else:
            # data[ivar].values, # data[zvar].values,
            q_dep = qmap_era_departure(xdata, ydata, slice(int_breaks[-1], None), quantilen, qss)
            if bounded is not None:
                tmp_qad = ydata + q_dep  # data.iloc[:, jvar] + q_dep
                q_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, q_dep)

            # data.iloc[:, jvar] += q_dep
            ydata += q_dep

        # Breakpoint Loop
        for ib in reversed(range(nb)):
            # ibiased is everything between breakpoints
            # isample is minus the borders -> used to calculate
            isample, ibiased, iref = sample_indices(int_breaks, ib, dates, sample_size=sample_size, borders=borders,
                                                    recent=False, verbose=verbose - 1)
            #
            if pressure_levels:
                # jvar = data.items.get_loc("%s_qecor" % ivar)
                for i in range(xdata.shape[1]):
                    # data["%s_adj" % zvar].values[ibiased, i],
                    # data[ivar].values[ibiased, i],
                    q_dep = qmap_era_departure(ydata[ibiased, i], xdata[ibiased, i], slice(None, None), quantilen, qss)
                    if bounded is not None:
                        tmp_qad = xdata[ibiased, i] + q_dep  # data.values[jvar, ibiased, i] + q_dep
                        q_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, q_dep)

                    # data.values[jvar, ibiased, i] = (data.iloc[jvar, ibiased, i].values + q_dep)#[:, np.newaxis]
                    # data.iloc[jvar, ibiased, i] += q_dep  #[:, np.newaxis]  # array (time)
                    xdata[ibiased, i] += q_dep
            else:
                # jvar = data.columns.get_loc("%s_qecor" % ivar)
                # data["%s_adj" % zvar].values[ibiased],
                # data[ivar].values[ibiased],
                q_dep = qmap_era_departure(ydata[ibiased], xdata[ibiased], slice(None, None), quantilen, qss)
                if bounded is not None:
                    tmp_qad = xdata[ibiased] + q_dep  # data.values[ibiased, jvar] + q_dep
                    q_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, q_dep)

                # data.iloc[ibiased, jvar] += q_dep  # array ( time )
                xdata[ibiased] += q_dep

            # nsample = data["%s_qecor" % ivar][isample].count().values
            # nref = data["%s_qecor" % ivar][iref].count().values
            nsample = np.isfinite(xdata[isample]).sum()
            nref = np.isfinite(xdata[iref]).sum()

            breakpoint_stat[str(breaks[ib])] = {'i': int_breaks[ib], 'isample': isample, 'ibiased': ibiased,
                                                'iref': iref, 'nref': nref, 'nsamp': nsample}
            # if verbose > 0:
            #     print funcid + " %s : %s  50%%: %9f (L%02d)" % (
            #         ivar, breaks[ib], np.nanmedian(q_dep), np.sum(np.any(q_dep != 0, axis=0)))

        data["%s_qecor" % ivar] = xdata
    if database:
        return data.to_frame(filter_observations=False).reset_index().set_index('date', drop=True)

    return breakpoint_stat, data
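
# Hypothetical usage sketch for quantile_era_correction; the column names 't'
# (observations), 't_era' (reference, e.g. an ERA series) and 't_breaks' are
# assumptions, and data is expected to be date-indexed (optionally with a 'p'
# pressure-level index for the Panel case).
def _example_quantile_era_correction(data):
    stat, adjusted = quantile_era_correction(data, 't', 't_era', 't_breaks',
                                             sample_size=730, borders=180,
                                             bounded=(0., 50.), verbose=1)
    # adjusted contains 't_qecor' (corrected copy of 't') and 't_era_adj'
    return stat, adjusted
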
def surrogate_quantile_correction(data, var, dvar, quantilen=None, sample_size=730, borders=180, database=False,
                                  func='nanmean', verbose=0):
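    """Quantile correction of var using departures derived from a second
    (surrogate) variable dvar; the adjusted series is stored as '<var>_cor_<dvar>'.
    """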
    from departures import qmap_var_departure
    from support_functions import sample_indices

    funcid = '[CS] '

    if not isinstance(var, str) or not isinstance(dvar, str):
        raise ValueError(funcid + "var and dvar must be single variable names (strings)")

    if quantilen is None:
        quantilen = np.arange(0, 100, 10)

    pressure_levels = True
    if isinstance(data, pd.DataFrame):
        if 'p' in data.columns:
            # 2D
            print funcid + " database detected"
            if var not in data.columns:
                raise ValueError(funcid + "Variable not found: %s in %s" % (var, str(data.columns)))
            if '%s_breaks' % var not in data.columns:
                raise ValueError(funcid + "Variable not found: %s_breaks in %s" % (var, str(data.columns)))

            if dvar not in data.columns:
                raise ValueError(funcid + "Variable not found: %s in %s" % (dvar, str(data.columns)))

            data = data.reset_index().set_index(['date', 'p']).to_panel()
        else:
            # 1D
            pressure_levels = False

    elif isinstance(data, pd.Panel):
        if var not in data.items:
            raise ValueError(funcid + "Variable not found: %s in %s" % (var, str(data.items)))

        if dvar not in data.items:
            raise ValueError(funcid + "Variable not found: %s in %s" % (dvar, str(data.items)))

        if '%s_breaks' % var not in data.items:
            raise ValueError(funcid + "Variable not found: %s_breaks in %s" % (var, str(data.items)))
    else:
        raise ValueError("Require a DataFrame or Panel as input")

    # always 2 variables -> dataframe at least

    if pressure_levels:
        data.major_axis.name = 'date'
        dates = data.major_axis

        # pressure levels
        plevels = data.minor_axis.values
        if verbose > 0:
            print funcid + "p-Levels: ", ",".join(["%d" % (ip / 100) for ip in plevels]), ' hPa'
    else:
        dates = data.index

    # reduce to a per-quantile sample size
    sample_size /= len(quantilen)

    # BREAKS
    if pressure_levels:
        int_breaks = np.where((data['%s_breaks' % var] > 0).any(1))[0]
    else:
        int_breaks = np.where((data['%s_breaks' % var] > 0))[0]

    breaks = dates[int_breaks]

    if (int_breaks[-1] + sample_size) > dates.shape[0]:
        print funcid + "Reference data set is shorter than 1 year"

    # copy
    data["%s_cor_%s" % (var, dvar)] = data[var]

    nb = len(breaks)

    for ib in reversed(range(nb)):

        if verbose > 0:
            print funcid + "Break: " + str(breaks[ib])

        # sample_indices returns three index sets as in the correction routines above;
        # only ibiased and iref are needed here
        isample, ibiased, iref = sample_indices(int_breaks, ib, dates,
                                                sample_size=sample_size,
                                                borders=borders,
                                                recent=False,
                                                verbose=verbose)
        if pressure_levels:
            # mean difference per quantile derived from another, dependent variable
            q_dep = np.empty_like(data[var].values[ibiased, :])
            for i in range(data.shape[2]):
                q_dep[:, i] = qmap_var_departure(data["%s_cor_%s" % (var, dvar)].values[:, i],
                                                 data[dvar].values[:, i],
                                                 iref,
                                                 ibiased,
                                                 ibiased,
                                                 quantilen,
                                                 sample_size,
                                                 verbose=verbose)
            data["%s_cor_%s" % (var, dvar)].values[ibiased, :] += q_dep  # one value per level
        else:
            q_dep = qmap_var_departure(data["%s_cor_%s" % (var, dvar)].values,
                                       data[dvar].values,
                                       iref,
                                       ibiased,
                                       ibiased,
                                       quantilen,
                                       sample_size,
                                       verbose=verbose,
                                       func=func)
            data["%s_cor_%s" % (var, dvar)].values[ibiased] += q_dep

        if verbose > 0:
            print funcid + " %s : %s  50%%: %f" % (var, breaks[ib], np.nanmedian(q_dep))

    if database:
        return data.to_frame(filter_observations=False).reset_index().set_index('date', drop=True)
    return data
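
# Hypothetical usage sketch for surrogate_quantile_correction; 'dpd' (the
# variable to correct, with 'dpd_breaks' present) and 't' (the surrogate
# variable) are assumed names in a date-indexed DataFrame or Panel.
def _example_surrogate_quantile_correction(data):
    adjusted = surrogate_quantile_correction(data, 'dpd', 't',
                                             sample_size=730, borders=180,
                                             func='nanmean', verbose=1)
    # the corrected series is stored as 'dpd_cor_t'
    return adjusted
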
def mean_correction(data, var, breakvar, sample_size=730, borders=180, database=False, bounded=None, varcopy=True,
                    verbose=0):
    """ Mean Correction of breakpoints

    Parameters
    ----------
    data : pd.DataFrame or pd.Panel
    var : str or list of str
        variable(s) to correct
    breakvar : str or list of str
        corresponding breakpoint variable(s) (values > 0 mark breaks)
    sample_size : int
        number of values used around each breakpoint
    borders : int
        number of values excluded on either side of each breakpoint
    database : bool
        return a flat DataFrame instead of (stat, data)
    bounded : tuple (lower, upper) or None
        keep adjusted values within these limits
    varcopy : bool
        copy var to '<var>_mcor' even if it already exists
    verbose : int

    Returns
    -------
    stat : dict
        breakpoint statistics (indices and sample counts per breakpoint)
    data : pd.DataFrame or pd.Panel
        input with an added '<var>_mcor' variable
    """
    from departures import mean_departure
    from support_functions import sample_indices, hasnames  # hasnames (used below) is assumed to live in support_functions
    funcid = '[CM] '

    if isinstance(var, str):
        var = [var]  # as list

    if isinstance(breakvar, str):
        breakvar = [breakvar] * len(var)  # as list

    if bounded is None:
        ubound = None
        lbound = None
    else:
        lbound, ubound = bounded

    pressure_levels = True
    if isinstance(data, pd.DataFrame):
        if 'p' in data.columns:
            # 2D
            print funcid + " database detected > conversion to Panel"

            for ivar, jvar in zip(var, breakvar):
                if not data.columns.isin([ivar, jvar]).sum() == 2:
                    raise ValueError(funcid + "Variable not found: %s or %s in %s" % (ivar, jvar, str(data.columns)))

            data.index.name = 'date'
            data = data.reset_index().set_index(['date', 'p']).to_panel()
        else:
            # only 1D
            pressure_levels = False

    elif isinstance(data, pd.Panel):
        for ivar, jvar in zip(var, breakvar):
            if not data.items.isin([ivar, jvar]).sum() == 2:
                raise ValueError(funcid + "Variable not found: %s or %s in %s" % (ivar, jvar, str(data.items)))

    else:
        raise ValueError("Require a DataFrame or Panel as input")

    if pressure_levels:
        data.major_axis.name = 'date'
        dates = data.major_axis

        # pressure levels
        plevels = data.minor_axis.values
        if verbose > 0:
            print funcid + "p-Levels: ", ",".join(["%d" % (ip / 100) for ip in plevels]), ' hPa'

    else:
        dates = data.index

    for ivar, ibvar in zip(var, breakvar):
        # BREAKS
        if pressure_levels:
            int_breaks = np.where((data[ibvar] > 0).any(1))[0]  # breakpoint in all levels
        else:
            int_breaks = np.where((data[ibvar] > 0))[0]
        breaks = dates[int_breaks]

        if (int_breaks[-1] + sample_size) > dates.shape[0]:
            print funcid + "Reference data set is shorter than 1 year"

        # Copy or use existing
        if not hasnames(data, '%s_mcor' % ivar) or varcopy:
            data["%s_mcor" % ivar] = data[ivar].copy()  # Make a copy

        nb = len(breaks)
        if verbose > 0:
            print funcid + " %s Found %d breakpoints" % (ivar, nb)

        breakpoint_stat = {}
        xdata = data["%s_mcor" % ivar].values  # Numpy Array (time x p-levels)

        for ib in reversed(range(nb)):

            # ibiased is everything between breakpoints
            # isample is minus the borders -> used to calculate
            isample, ibiased, iref = sample_indices(int_breaks, ib, dates, sample_size=sample_size, borders=borders,
                                                    recent=False, verbose=verbose - 1)
            if pressure_levels:
                # jvar = data.items.get_loc("%s_mcor" % ivar)  # index of variable
                # data["%s_mcor" % ivar].values,
                m_dep = np.apply_along_axis(mean_departure, 0, xdata, iref, isample, sample_size)
                # setting with ndarray requires precise shape conditions
                if bounded is not None:
                    tmp_qad = xdata[ibiased, :] + m_dep  # data.iloc[jvar, ibiased, :] + m_dep
                    m_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, m_dep)
                    xdata[ibiased, :] += m_dep  # has now the right shape
                else:
                    # data.iloc[jvar, ibiased, :] = (data.iloc[jvar, ibiased, :].values + m_dep)[np.newaxis, ::]
                    xdata[ibiased, :] += m_dep[np.newaxis, ::]
                    # one value per level; this can cause negative DPD values
            else:
                # jvar = data.columns.get_loc("%s_mcor" % ivar)
                # data["%s_mcor" % ivar].values,
                m_dep = mean_departure(xdata, iref, isample, sample_size)
                if bounded is not None:
                    tmp_qad = xdata[ibiased] + m_dep  # data.iloc[ibiased, jvar] + m_dep
                    m_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, m_dep)

                # data.iloc[ibiased, jvar] += m_dep  # one value per time
                xdata[ibiased] += m_dep

            # nsample = data["%s_mcor" % ivar][isample].count()
            # nref = data["%s_mcor" % ivar][iref].count()
            nsample = np.isfinite(xdata[isample]).sum()
            nref = np.isfinite(xdata[iref]).sum()

            breakpoint_stat[str(breaks[ib])] = {'i': int_breaks[ib], 'isample': isample, 'ibiased': ibiased,
                                                'iref': iref, 'mcor': m_dep, 'nref': nref, 'nsamp': nsample}
            if verbose > 0:
                print funcid + " %s : %s  50%%: %9f " % (ivar, breaks[ib], np.nanmedian(m_dep))

        data["%s_mcor" % ivar] = xdata  # fill in
    if database:
        return data.to_frame(filter_observations=False).reset_index().set_index('date', drop=True)

    return breakpoint_stat, data
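
# Hypothetical usage sketch for mean_correction; 't' and 't_breaks' are assumed
# variable names in a date-indexed DataFrame or a date x pressure-level Panel,
# and the bounds are only an illustration of the bounded option.
def _example_mean_correction(data):
    stat, adjusted = mean_correction(data, 't', 't_breaks',
                                     sample_size=730, borders=180,
                                     bounded=(-90., 60.), verbose=1)
    # adjusted contains 't_mcor'; stat maps break dates to indices and counts
    return stat, adjusted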