Example #1
def rw_rake(wh, xmat, targets, options=None):

    a = timer()

    # update options with any user-supplied options
    if options is None:
        options_all = options_defaults.copy()
    else:
        options_all = options_defaults.copy()
        options_all.update(options)
        # options_all = {**options_defaults, **options}

    # convert dict to named tuple for ease of use
    opts = ut.dict_nt(options_all)

    g = raking.rake(wh=wh,
                    xmat=xmat,
                    targets=targets,
                    max_iter=opts.max_rake_iter)

    wh_opt = g * wh
    targets_opt = np.dot(xmat.T, wh_opt)
    b = timer()

    # create a named tuple of items to return
    fields = ('elapsed_seconds', 'wh_opt', 'targets_opt', 'g', 'opts')
    Result = namedtuple('Result', fields, defaults=(None, ) * len(fields))

    res = Result(elapsed_seconds=b - a,
                 wh_opt=wh_opt,
                 targets_opt=targets_opt,
                 g=g,
                 opts=opts)

    return res
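# --- Usage sketch (an illustration added here, not part of the original) ---
# Assumes numpy is available as np and that rw_rake plus its module-level
# dependencies (options_defaults, ut, raking) are importable. The option key
# 'max_rake_iter' matches opts.max_rake_iter used above; the data are made up.
rng = np.random.default_rng(0)
wh = rng.uniform(10, 100, size=1000)       # initial household weights
xmat = rng.uniform(0, 1, size=(1000, 3))   # household characteristics
targets = xmat.T @ wh * 1.05               # ask for 5% larger weighted sums
res = rw_rake(wh, xmat, targets, options={'max_rake_iter': 50})
print(res.targets_opt / targets - 1)       # near zero if raking converged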
Example #2
def gec(wh, xmat, targets, options=None):

    a = timer()

    # update options with any user-supplied options
    if options is None:
        options_all = options_defaults.copy()
    else:
        options_all = options_defaults.copy()
        options_all.update(options)
        # options_all = {**options_defaults, **options}

    if options_all['objective'] == 'ENTROPY':
        options_all['objective'] = ENTROPY
    elif options_all['objective'] == 'QUADRATIC':
        options_all['objective'] = QUADRATIC

    # convert dict to named tuple for ease of use
    opts = ut.dict_nt(options_all)

    # small_positive = np.nextafter(np.float64(0), np.float64(1))
    wh = np.where(wh == 0, SMALL_POSITIVE, wh)  # guard against zero weights
    # note: the next line replaces ALL weights with their mean, so
    # calibration starts from a uniform baseline
    wh = np.full(wh.shape, wh.mean())

    pop = wh.sum()
    tmeans = targets / pop

    # ompw:  optimal means-producing weights
    ompw, l2_norm = ec.maybe_exact_calibrate(
        covariates=xmat,
        target_covariates=tmeans.reshape((1, -1)),
        # baseline_weights=wh,
        # target_weights=np.array([[.25, .75]]), # target priorities
        # target_weights=target_weights,
        autoscale=opts.autoscale,  # doesn't always seem to work well
        # note that QUADRATIC weights often can be zero
        objective=opts.objective,  # ENTROPY or QUADRATIC
        increment=opts.increment)
    # print(l2_norm)

    # g is the ratio of new weights to old; g * wh are weights that hit the targets
    g = ompw * pop / wh
    g = np.array(g, dtype=float).reshape((-1, ))  # djb
    wh_opt = g * wh
    targets_opt = np.dot(xmat.T, wh_opt)
    b = timer()

    # create a named tuple of items to return
    fields = ('elapsed_seconds', 'wh_opt', 'targets_opt', 'g', 'opts',
              'l2_norm')
    Result = namedtuple('Result', fields, defaults=(None, ) * len(fields))

    res = Result(elapsed_seconds=b - a,
                 wh_opt=wh_opt,
                 targets_opt=targets_opt,
                 g=g,
                 opts=opts,
                 l2_norm=l2_norm)

    return res
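# --- Usage sketch (an illustration added here, not part of the original) ---
# Assumes the empirical_calibration package (ec) and its ENTROPY objective
# are importable, as the code above requires. The option keys 'objective',
# 'autoscale', and 'increment' match the opts fields used above.
rng = np.random.default_rng(1)
wh = rng.uniform(10, 100, size=1000)
xmat = rng.uniform(0, 1, size=(1000, 3))
targets = xmat.T @ wh * 1.05
res = gec(wh, xmat, targets, options={'objective': 'ENTROPY'})
print(res.l2_norm)
print(res.targets_opt / targets - 1)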
Example #3
def qmatrix(wh, xmat, geotargets,
            method='raking',
            options=None):
    """Docstring.

    """

    # TODO:

    a = timer()

    # define gfn, the function to use in the qmatrix loop
    # gfn is passed:
    #   a column of the Q matrix, representing an area
    #   the wh-weighted xmat, good columns only
    #   a row of the geotargets matrix (one area), good columns only
    #   an options dict (objective is needed only for empirical calibration)
    # gfn returns g, the ratio of new weights to old weights

    if method == 'raking':
        gfn = g_raking
        solver_defaults = raking_defaults
    elif method == 'empcal':
        gfn = g_ec
        solver_defaults = ec_defaults
    elif method == 'ipopt':
        gfn = g_ipopt
        solver_defaults = ipopt_defaults
    elif method == 'least_squares':
        gfn = g_lsq
        solver_defaults = lsq_defaults
    else:
        raise ValueError(f'unknown method: {method}')

    options_defaults = {**solver_defaults, **user_defaults}

    # update options with any user-supplied options
    # copy seemed safer than ** to me but I am not sure why
    if options is None:
        options_all = options_defaults.copy()
    else:
        options_all = options_defaults.copy()
        options_all.update(options)
        # options = {**options_defaults, **options}

    if method == 'empcal':
        # replace string name for objective with the corresponding empcal object
        if options_all['objective'] == 'ENTROPY':
            options_all['objective'] = ENTROPY
        elif options_all['objective'] == 'QUADRATIC':
            options_all['objective'] = QUADRATIC

    # create a dict that only has solver options, for passing to gfn
    user_keys = user_defaults.keys()
    solver_options = {key: value for key, value in options_all.items() if key not in user_keys}

    # convert options_all dict to named tuple for ease of use
    opts = ut.dict_nt(options_all)

    # unpack selected user options
    Q = opts.Q
    drops = opts.drops
    qmax_iter = opts.qmax_iter
    # independent means we do one iteration and use the UNADJUSTED Q
    if opts.independent:
        if qmax_iter != 1:
            print('Warning: independent option forces qmax_iter to 1')
        qmax_iter = 1
    print(f'max Q iterations: {qmax_iter}')

    # constants
    # EPS = 1e-5  # acceptable weightsum error (tolerance) - 1e-5 in R code
    TOL_WTDIFF = 0.0005  # tolerance for difference between weight sum and 1
    TOL_TARGPCTDIFF = 1.0  # tolerance for geotargets percent difference

    # initialize stopping criteria values
    ediff = 1  # error, called ver in Toky R.'s R code
    iter = 1  # iteration count, called k in Toky R.'s R code
    iter_best = iter

    # difference in weights - Toky R. used the sum of absolute weight
    # differences; here we use the largest absolute weight difference
    max_weight_absdiff = 1e9  # initial max abs difference between a household's weight-share sum and 1
    max_targ_abspctdiff = 1e9  # initial maximum % difference vs geotargets
    max_diff_best = max_targ_abspctdiff

    m = geotargets.shape[0]  # number of states
    n = wh.size  # number of households
    wh = wh.reshape((-1, 1))  # ensure the proper shape

    # if an initial Q was not provided, construct one with equal shares for each area
    if Q is None:
        Q = np.full((n, m), 1 / m)

    # compute xmat_wh before the loop (calib recalculates it inside the loop)
    xmat_wh = xmat * wh  # shape: n x number of targets per area

    # numbers of geotargets
    nt_per_area = geotargets.shape[1]
    nt_possible = nt_per_area * m
    if drops is None:
        drops = np.zeros(geotargets.shape, dtype=bool)  # all False
        nt_dropped = 0
    else:
        # nt_dropped = sum([len(x) for x in drops.values()])
        nt_dropped = drops.sum()
    nt_used = nt_possible - nt_dropped
    good_targets = np.logical_not(drops)


    # Making a copy of Q is crucial. We don't want to change the
    # original Q. Am I sure of this??
    Qmat = Q.copy()
    Q_best = Q.copy()
    Q_unadjusted = Q.copy()  # Q_unadjusted is Q prior to forced summation to 1

    print('')
    print_problem(wh, m, nt_per_area, nt_possible, nt_dropped, nt_used)

    h1 = "                  max weight      max target       p95 target"
    h2 = "   iteration        diff           pct diff         pct diff"
    print('\n')
    print(h1)
    print(h2, '\n')

    while not end_loop(iter, max_targ_abspctdiff, qmax_iter, TOL_TARGPCTDIFF):

        print(' '*3, end='')
        print('{:4d}'.format(iter), end='', flush=True)

        for j in range(m):  # j indexes areas
            # print(f'iter {iter:4d}, area {j:5d}')

            good_cols = good_targets[j, :]

            g = gfn(Qmat[:, j],
                    xmat_wh[:, good_cols],
                    geotargets[j, good_cols],
                    options=solver_options)

            # if method == 'raking' and g is None:
            #     # try to recover by using the alternate method
            #     g = g_ec(xmat_wh[:, good_cols], Q[:, j], geotargets[j, good_cols])
            if g is None:
                g = np.ones(n)

            if np.isnan(g).any() or np.isinf(g).any() or (g == 0).any():
                print('bad g')
                g = np.ones(g.size)
                # we'll need to do this area again

            Qmat[:, j] = Qmat[:, j] * g.reshape(-1)
            # end of loop over this area

        # print(Qmat)

        # when we arrive here we have completed all areas for this iteration
        # calc max weight difference BEFORE recalibrating Q
        abswtdiff = np.abs(Qmat.sum(axis=1) - 1)  # sum of weight-shares for each household
        max_weight_absdiff = abswtdiff.max()  # largest difference from 1 across all households
        print(' '*11, end='')
        print(f'{max_weight_absdiff:8.4f}', end='')
        if np.isinf(abswtdiff).any():
            # these weight shares are not good, do another iteration
            # ediff = EPS
            max_weight_absdiff = TOL_WTDIFF
            print("Existence of infinite coefficients --> non-convergence.")

        #print("Weight sums max percent difference: {}".format(maxadiff))  # ediff
        # if iter == 1:
        # Q_unadjusted = Qmat.copy()  # save for possible postprocessing
        if not opts.independent:  # update matrix to force summation to 1, if NOT independent
            Qmat = Qmat / Qmat.sum(axis=1)[:, None]  # Recalibrate Q. Note None so that we have proper broadcasting

        # calculate geotargets pct diff AFTER recalibrating Q
        # this is simply for interim reporting
        whs = np.multiply(Qmat, wh.reshape((-1, 1)))  # faster
        diff = np.dot(whs.T, xmat) - geotargets
        abspctdiff = np.abs(diff / geotargets * 100)
        max_targ_abspctdiff = abspctdiff[good_targets].max()

        ptile = np.quantile(abspctdiff[good_targets], (.95))
        print(' '*6, end='')
        print(f'{max_targ_abspctdiff:8.2f} %', end='')
        print(' '*7, end='')
        print(f'{ptile:8.2f} %')

        # final processing before next iteration
        if max_targ_abspctdiff < max_diff_best:
            Q_best = Qmat.copy()
            max_diff_best = max_targ_abspctdiff.copy()
            iter_best = iter
        iter = iter + 1
        # end while loop

    # WE ARE NOW DONE WITH ALL LOOPING AND WILL POST-PROCESS RESULTS
    # post-processing after exiting while loop, using Q_best, not Q
    # Q_best = Q_unadjusted  # djb!!
    whs_opt = np.multiply(Q_best, wh.reshape((-1, 1)))  # faster
    geotargets_opt = np.dot(whs_opt.T, xmat)
    diff = geotargets_opt - geotargets
    pctdiff = diff / geotargets * 100
    abspctdiff = np.abs(pctdiff)
    # calculate weight difference AFTER final calibration
    abswtdiff = np.abs(Q_best.sum(axis=1) - 1)  # sum of weight-shares for each household
    max_weight_absdiff = abswtdiff.max()  # largest diff from 1 across all households

    if iter > qmax_iter:
        print('\nMaximum number of iterations exceeded.\n')

    print('\n')
    print_problem(wh, m, nt_per_area, nt_possible, nt_dropped, nt_used)

    print(f'\nPost-calibration max abs diff between sum of household weights and 1, across households: {max_weight_absdiff:9.5f}')
    print()

    # compute and print good and all values for various quantiles
    p100a = abspctdiff.max()
    p100m = abspctdiff[good_targets].max()
    p99a = np.quantile(abspctdiff, (.99))
    p99m = np.quantile(abspctdiff[good_targets], (.99))
    p95a = np.quantile(abspctdiff, (.95))
    p95m = np.quantile(abspctdiff[good_targets], (.95))
    sspd = np.square(pctdiff).sum()
    print('Results for calculated targets versus desired targets:')
    print( '                                                              good             all\n')
    print(f'    Max abs percent difference                           {p100m:9.3f} %     {p100a:9.3f} %')
    print(f'    p99 of abs percent difference                        {p99m:9.3f} %     {p99a:9.3f} %')
    print(f'    p95 of abs percent difference                        {p95m:9.3f} %     {p95a:9.3f} %')
    print('\n')
    print(f'Sum of squared percentage differences:      {sspd:9.3g}')
    print(f'Number of iterations:                       {iter - 1:5d}')
    print(f'Best target difference found at iteration:  {iter_best:5d}')

    b = timer()
    print('\nElapsed time: {:8.1f} seconds'.format(b - a))

    # create a named tuple of items to return
    fields = ('elapsed_seconds',
              'whs_opt',
              'geotargets',
              'geotargets_opt',
              'Q_opt',
              'Q_unadjusted',
              'iter_opt')
    Result = namedtuple('Result', fields, defaults=(None,) * len(fields))

    res = Result(elapsed_seconds = b - a,
                 whs_opt = whs_opt,
                 geotargets = geotargets,
                 geotargets_opt = geotargets_opt,
                 Q_opt = Q_best,
                 Q_unadjusted = Q_unadjusted,
                 iter_opt = iter_best)
    return res
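# --- Usage sketch (an illustration added here, not part of the original) ---
# Builds consistent geotargets by splitting national weighted sums across
# three areas in fixed shares, so a feasible solution exists. The option key
# 'qmax_iter' matches opts.qmax_iter used above; the data are made up.
rng = np.random.default_rng(2)
n, m = 1000, 3                             # households, areas
wh = rng.uniform(10, 100, size=n)
xmat = rng.uniform(0, 1, size=(n, 2))
shares = np.array([[0.5], [0.3], [0.2]])   # area shares, sum to 1
geotargets = shares * (xmat.T @ wh)        # shape m x (targets per area)
res = qmatrix(wh, xmat, geotargets, method='raking',
              options={'qmax_iter': 20})
print(res.geotargets_opt - res.geotargets)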
Example #4
def rw_minNLP(wh, xmat, targets, options=None):
    # minimize the change in the weights, measured by the ratio of new weight
    # to old weight minus 1, squared, subject to:
    # linear inequality constraints (the targets) and
    # bounds on the x variables

    a1 = timer()

    # update options with any user-supplied options
    if options is None:
        options_all = options_defaults.copy()
    else:
        options_all = options_defaults.copy()
        options_all.update(options)
        # options_all = {**options_defaults, **options}

    # rename dict keys as needed to reflect lsq naming
    options_all['maxiter'] = options_all.pop('max_iter')

    # convert dict to named tuple for ease of use
    opts = ut.dict_nt(options_all)

    # create a dict that only has solver options, for passing to minimize
    user_keys = user_defaults.keys()
    solver_options = {
        key: value
        for key, value in options_all.items() if key not in user_keys
    }

    # scale the problem so that each nonzero target becomes 100
    if opts.scaling:
        diff_weights = np.where(targets != 0, 100 / targets, 1)
    else:
        diff_weights = np.ones_like(targets)

    b = targets * diff_weights
    tol = .0001
    clb = b - tol * np.abs(b)  # constraint lower bounds
    cub = b + tol * np.abs(b)  # constraint upper bounds

    wmat = xmat * diff_weights

    At = np.multiply(wh.reshape(-1, 1), wmat)
    A = At.T
    As = scipy.sparse.coo_matrix(A)

    lincon = scipy.optimize.LinearConstraint(As, clb, cub)

    bnds = scipy.optimize.Bounds(0, 1e5)

    x0 = np.ones_like(wh)

    nlp_info = minimize(
        xm1_sq,
        x0,
        method='trust-constr',
        bounds=bnds,
        constraints=lincon,
        jac=xm1_sq_grad,
        # hess='2-point',
        # hess=xm1_sq_hess,
        hessp=xm1_sq_hvp,
        options=solver_options)

    g = nlp_info.x
    wh_opt = wh * g
    targets_opt = np.dot(xmat.T, wh_opt)

    b1 = timer()

    # create a named tuple of items to return
    fields = ('elapsed_seconds', 'wh_opt', 'targets_opt', 'g', 'opts',
              'nlp_info')
    Result = namedtuple('Result', fields, defaults=(None, ) * len(fields))

    res = Result(elapsed_seconds=b1 - a1,
                 wh_opt=wh_opt,
                 targets_opt=targets_opt,
                 g=g,
                 opts=opts,
                 nlp_info=nlp_info)

    return res
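# --- Usage sketch (an illustration added here, not part of the original) ---
# Assumes scipy and the xm1_sq objective helpers above are importable. The
# user-facing key is 'max_iter' (renamed to 'maxiter' above for
# scipy.optimize.minimize); 'scaling' matches opts.scaling used above.
rng = np.random.default_rng(3)
wh = rng.uniform(10, 100, size=500)
xmat = rng.uniform(0, 1, size=(500, 3))
targets = xmat.T @ wh * 1.02
res = rw_minNLP(wh, xmat, targets, options={'max_iter': 200, 'scaling': True})
print(res.nlp_info.status)
print(res.targets_opt / targets - 1)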
Example #5
def rw_ipopt(wh, xmat, targets, options=None):
    r"""
    Build and solve the reweighting NLP.

    Good general settings seem to be:
        get_ccscale - use ccgoal=1, method='mean'
        get_objscale - use xbase=1.2, objgoal=100
        no other options set, besides obvious ones

    Important resources:
        https://pythonhosted.org/ipopt/reference.html#reference
        https://coin-or.github.io/Ipopt/OPTIONS.html
        ..\cyipopt\ipopt\ipopt_wrapper.py to see code from cyipopt author

    Parameters
    ----------
    wh : ndarray
        Household weights.
    xmat : ndarray
        Matrix of household characteristics, n x m.
    targets : ndarray
        Desired weighted sums of the columns of xmat.
    options : dict, optional
        Options merged over options_defaults. Keys include xlb (default 0.1),
        xub (default 100), crange (default .03), max_iter (default 100),
        ccgoal (default 1), objgoal (default 100), and quiet (default True).

    Returns
    -------
    res : namedtuple
        Fields elapsed_seconds, wh_opt, targets_opt, g, opts, ipopt_info.

    """
    a = timer()
    n = xmat.shape[0]
    m = xmat.shape[1]

    # update options with any user-supplied options
    if options is None:
        options_all = options_defaults.copy()
    else:
        options_all = options_defaults.copy()
        options_all.update(options)
        # options_all = {**options_defaults, **options}

    # convert dict to named tuple for ease of use
    opts = ut.dict_nt(options_all)

    # constraint coefficients (constant)
    cc = (xmat.T * wh).T

    # scale constraint coefficients and targets
    ccscale = get_ccscale(cc, ccgoal=opts.ccgoal, method='mean')
    # ccscale = 1
    cc = cc * ccscale  # mult by scale to have avg derivative meet our goal
    targets_scaled = targets * ccscale  # djb do I need to copy?

    # IMPORTANT: define callbacks AFTER we have scaled cc and targets
    # because callbacks must be initialized with scaled cc
    callbacks = Reweight_callbacks(cc, opts.quiet)

    # x vector starting values, and lower and upper bounds
    x0 = np.ones(n)
    lb = np.full(n, opts.xlb)
    ub = np.full(n, opts.xub)

    # constraint lower and upper bounds
    cl = targets_scaled - abs(targets_scaled) * opts.crange
    cu = targets_scaled + abs(targets_scaled) * opts.crange

    nlp = ipopt.problem(n=n,
                        m=m,
                        problem_obj=callbacks,
                        lb=lb,
                        ub=ub,
                        cl=cl,
                        cu=cu)

    # objective function scaling - add to options dict
    # djb should I pass n and callbacks???
    objscale = get_objscale(objgoal=opts.objgoal,
                            xbase=1.2,
                            n=n,
                            callbacks=callbacks)
    options_all['obj_scaling_factor'] = objscale

    # create a dict that only has solver options, for passing to ipopt
    user_keys = user_defaults.keys()
    solver_options = {
        key: value
        for key, value in options_all.items() if key not in user_keys
    }

    for option, value in solver_options.items():
        nlp.addOption(option, value)

    if (not opts.quiet):
        print(f'\n {"":10} Iter {"":25} obj {"":22} infeas')

    # solve the problem
    g, ipopt_info = nlp.solve(x0)

    wh_opt = g * wh
    targets_opt = np.dot(xmat.T, wh_opt)
    b = timer()

    # create a named tuple of items to return
    fields = ('elapsed_seconds', 'wh_opt', 'targets_opt', 'g', 'opts',
              'ipopt_info')
    Result = namedtuple('Result', fields, defaults=(None, ) * len(fields))

    res = Result(elapsed_seconds=b - a,
                 wh_opt=wh_opt,
                 targets_opt=targets_opt,
                 g=g,
                 opts=opts,
                 ipopt_info=ipopt_info)

    return res
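# --- Usage sketch (an illustration added here, not part of the original) ---
# Requires cyipopt (imported above as ipopt). The option keys 'crange' and
# 'quiet' match the opts fields used above; values here are illustrative.
rng = np.random.default_rng(4)
wh = rng.uniform(10, 100, size=500)
xmat = rng.uniform(0, 1, size=(500, 3))
targets = xmat.T @ wh * 1.02
res = rw_ipopt(wh, xmat, targets, options={'crange': .02, 'quiet': True})
print(res.targets_opt / targets - 1)       # within +/- crange if solved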
Example #6
def rw_lsq(wh, xmat, targets,
           options=None):
    # minimize the sum of squared differences from the targets, where x is
    # the ratio of new weight to old weight, subject to bounds on x

    # this appears to be the best of the reweighting approaches

    a1 = timer()

    # update options with any user-supplied options
    if options is None:
        options_all = options_defaults.copy()
    else:
        options_all = options_defaults.copy()
        options_all.update(options)
        # options_all = {**options_defaults, **options}

    # convert dict to named tuple for ease of use
    opts = ut.dict_nt(options_all)

    h = wh.size

    # we are solving Ax = b, where
    #   b are the targets and
    #   A x multiplication gives calculated targets
    # using sparse matrix As instead of A

    # scale the problem so that nonzero targets become roughly 100,000
    # TODO: deal with targets that are zero
    if opts.scaling:
        scale_vector = np.abs(np.where(targets != 0, 100000.0 / targets, 1))
    else:
        scale_vector = np.ones_like(targets)

    b = targets * scale_vector
    wmat = xmat * scale_vector

    At = np.multiply(wh.reshape(-1, 1), wmat)
    A = At.T

    if opts.method != 'bvls':
        As = scipy.sparse.coo_matrix(A)
    else:
        # sparse matrices not allowed with bvls
        As = A

    lb = np.full(h, opts.xlb)
    ub = np.full(h, opts.xub)

    # pass tol only when the user supplied one; otherwise let lsq_linear
    # use its own default
    kwargs = dict(method=opts.method,
                  lsmr_tol=opts.lsmr_tol,
                  max_iter=opts.max_iter,
                  verbose=opts.verbose)
    if opts.tol is not None:
        kwargs['tol'] = opts.tol
    lsq_info = lsq_linear(As, b, bounds=(lb, ub), **kwargs)

    g = lsq_info.x
    wh_opt = wh * g
    targets_opt = np.dot(xmat.T, wh_opt)

    b1 = timer()

    # create a named tuple of items to return
    fields = ('elapsed_seconds',
              'wh_opt',
              'targets_opt',
              'g',
              'opts',
              'lsq_info')
    Result = namedtuple('Result', fields, defaults=(None,) * len(fields))

    res = Result(elapsed_seconds=b1 - a1,
                 wh_opt=wh_opt,
                 targets_opt=targets_opt,
                 g=g,
                 opts=opts,
                 lsq_info=lsq_info)

    return res
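# --- Usage sketch (an illustration added here, not part of the original) ---
# The option keys 'method', 'xlb', 'xub', 'tol', 'lsmr_tol', 'max_iter', and
# 'verbose' match the opts fields used above; 'bvls' forces the dense-matrix
# path, as handled in the function.
rng = np.random.default_rng(5)
wh = rng.uniform(10, 100, size=500)
xmat = rng.uniform(0, 1, size=(500, 3))
targets = xmat.T @ wh * 1.02
res = rw_lsq(wh, xmat, targets, options={'method': 'bvls'})
print(res.lsq_info.cost)
print(res.targets_opt / targets - 1)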