def weighted_percentile(universe, q=0.5):
    """Weighted quantile of MeanCoverage, weighted by interval width (End - Start).

    Args:
        universe: frame-like object with MeanCoverage, Start, and End columns.
        q: quantile level in [0, 1] (default 0.5, i.e. the weighted median).

    Returns:
        The weighted q-th quantile of MeanCoverage.
    """
    intervals = pd.DataFrame({
        'v': universe.MeanCoverage.tolist(),
        'w': (universe.End - universe.Start).tolist(),
    })
    # 'w' designates the weight column for the calculator.
    weighter = wc.Calculator('w')
    return weighter.quantile(intervals, 'v', q)
def weightedcalcs_quantiles(data, labels, weights, return_quantiles=False):
    """Bucket `data` into weighted-quantile categories and return 1-based codes.

    Quantile cut points are taken at evenly spaced levels (one per label).
    Values in the interval (cut[i], cut[i+1]] receive labels[i]; everything at
    or below the first cut point keeps the initial 0. The result is shifted
    by +1 before returning.

    Args:
        data: vector of values to categorize.
        labels: category labels; len(labels) determines the number of buckets.
        weights: observation weights aligned with `data`.
        return_quantiles: if True, also return the computed cut points.

    Returns:
        Coded array (plus the list of quantile cut points when requested).
    """
    calc = wc.Calculator("weights")
    n_categories = len(labels)
    levels = linspace(0, 1, n_categories + 1)
    frame = pd.DataFrame({
        'weights': weights,
        'data': data,
    })
    # Skip level 0: only the upper edge of each bucket is needed.
    cut_points = [calc.quantile(frame, 'data', level) for level in levels[1:]]
    coded = zeros(len(data))
    for idx, (lower, upper) in enumerate(zip(cut_points, cut_points[1:])):
        coded[and_(data > lower, data <= upper)] = labels[idx]
    if return_quantiles:
        return coded + 1, cut_points
    return coded + 1
def top_share(values, rank_from_top, weights=None):
    """Weighted share of the total held at or above a top-rank cutoff.

    Args:
        values(np.array): Vector of values
        rank_from_top(float): Rank from top (bottom is 1 and top is 0)
        weights(np.array): Weights vector (Default value = None)

    Returns:
        Ratio of the weighted sum of values at or above the
        (1 - rank_from_top) weighted quantile to the total weighted sum.
    """
    if weights is None:
        # Unweighted case: treat every observation equally.
        weights = ones(len(values))
    frame = pd.DataFrame({
        'weights': weights,
        'data': values,
    })
    cutoff = wc.Calculator("weights").quantile(frame, 'data', 1 - rank_from_top)
    weighted_values = frame["data"] * frame["weights"]
    # Multiply by the boolean mask (rather than filtering) so values below
    # the cutoff contribute exactly zero to the numerator.
    in_top = frame["data"] >= cutoff
    return (in_top * weighted_values).sum() / weighted_values.sum()
list(itertools.product(*[occupancy_types, income_strata])), list(itertools.product(*[occupancy_types, ["COUNT"], income_strata]))) } long_lead_data.rename(mapper=count_rename_dict, axis=1, inplace=True) long_lead_data.columns = long_lead_data.columns.str.split(' ', expand=True) long_lead_data.columns.rename(names=['OCCUPANCY TYPE', 'INCOME STRATA'], level=[0, 2], inplace=True) wide_lead_data = long_lead_data.stack(level=[0, 2]) wide_lead_data["ENERGY BURDEN"] = 12 * ( wide_lead_data["ELEP"] + wide_lead_data["GASP"] + wide_lead_data["FULP"]) / wide_lead_data["HINCP"] features = [ 'TRACT', 'YBL INDEX', 'BLD INDEX', 'HFL INDEX', 'OCCUPANCY TYPE', 'INCOME STRATA' ] calc = wc.Calculator("COUNT") nicely_grouped = wide_lead_data.groupby(features) print("trying to calc") print(calc.mean(nicely_grouped, "ELEP").head()) #fully_aggregated = tract_aggregated.groupby("TRACT").apply(wtavg) #tract_aggregated.to_csv("tractnc2015_cleaned.csv") #fully_aggregated.to_csv("tractnc2015_aggregated.csv")
# PITFALL: This depends on some variables that were removed from python/build/ # in commit 2c82f5faad432aa1971bd940341adef2bf73ea02 exec(open("tax-proposal/2020-08-21/build.py").read()) from python.common.util import near import numpy as np import weightedcalcs as weightLib wc = weightLib.Calculator('weight') # `qs` is a quantile- (specifically percentile-) level data frame. qs = pd.DataFrame({"income q": np.arange(0, 1, 0.01)}) qs["income"] = (qs["income q"].apply(lambda q: wc.quantile(hh, "income", q))) qs = pd.concat([ qs, pd.DataFrame(data={ "income": [np.inf], "income q": [1] }, index=[100]) ]) qs["income q"] = (qs["income q"].apply(lambda x: round(100 * x))) qs = qs["income"] if True: def income_quantiles(incomes, thresholds): q = 0 acc = incomes.copy() for i in range(0, len(incomes)): if q < thresholds.index.max(): if incomes.iloc[i] >= thresholds.loc[q + 1]:
# Unit tests for the weightedcalcs Calculator (weighted mean / quantile).
import unittest
import weightedcalcs as wc
import pandas as pd
import sys
import os

# Shared calculator: every test weights by the "weights" column/key.
calc = wc.Calculator("weights")


class WCTest(unittest.TestCase):
    """Exercises Calculator.mean and Calculator.quantile on small fixtures."""

    def test_mean(self):
        # Example via https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
        assert (calc.mean(
            pd.DataFrame({
                "values": [80, 90],
                "weights": [20, 30],
            }), "values") == 86)

    def test_mean_non_pandas(self):
        # Same fixture as test_mean, but passed as a plain dict to confirm
        # the calculator accepts non-pandas input.
        # Example via https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
        assert (calc.mean({
            "values": [80, 90],
            "weights": [20, 30],
        }, "values") == 86)

    def test_quantile(self):
        # Example via https://en.wikipedia.org/wiki/Weighted_median
        # NOTE(review): the source chunk appears truncated here — the fixture
        # is built but no assertion is visible in this view.
        df = pd.DataFrame({
            "values": [0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2],
            "weights": [0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2],
        })
"21": "Puno","22":"San Martin","23":"Tacna","24":"Tumbes","25":"Ucayali" }) sumaria.head(2) #Generamos variable "area" geografica sumaria["area"] = sumaria["estrato"].replace({"de 500,000 a más habitantes" : "Urbana", "de 100,000 a 499,999 habitantes" : "Urbana", "de 50,000 a 99,999 habitantes" : "Urbana", "de 20,000 a 49,999 habitantes" : "Urbana", "de 2,000 a 19,999 habitantes" : "Urbana", "de 500 a 1,999 habitantes" : "Rural", "Área de empadronamiento rural (aer) simple" : "Rural", "Área de empadronamiento rural (aer) compuesto" : "Rural" }) #sumaria["area"] = np.where(sumaria["estrato"] <= 5, "Urbana", "Rural") sumaria["area"].value_counts() #Generamos tablas sin ponderador pd.crosstab(sumaria["dpto"], sumaria["pc_pobre"]) #Generamos tablas sin ponderador pd.crosstab([sumaria["area"],sumaria["estrsocial"]], sumaria["pc_pobre"] , margins=True) #Tasa de pobreza usando factor expansión / ponderador sumaria["facpop"] = sumaria["factor07"]*sumaria["mieperho"] calc = wc.Calculator("facpop") #Distribución de variable "pc_pobre" calc.distribution(sumaria, "pc_pobre").round(3).sort_values(ascending=False)