def _build():
    """Utility function for building a Selenium Chrome driver."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    try:
        load_dotenv(Path(get_project_root(), ".env"))
        chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_PATH")
        return webdriver.Chrome(
            executable_path=os.environ.get("CHROMEDRIVER_PATH"),
            options=chrome_options)
    except (WebDriverException, TypeError):
        chromedriver_autoinstaller.install()
        return webdriver.Chrome(options=chrome_options)
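# A minimal usage sketch for _build (a hypothetical call site; assumes Chrome
# and a matching chromedriver are reachable through the GOOGLE_CHROME_PATH and
# CHROMEDRIVER_PATH env vars, or that chromedriver_autoinstaller can fetch one):
#
#     driver = _build()
#     try:
#         driver.get("https://example.com")  # illustrative URL
#     finally:
#         driver.quit()  # always release the headless browser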
def income_household(update_loc: Union[str, PathLike, Engine,
                                       Connection, None] = None,
                     revise_rows: Union[str, int] = "nodup",
                     save_loc: Union[str, PathLike, Engine,
                                     Connection, None] = None,
                     only_get: bool = False) -> pd.DataFrame:
    """Get average household income.

    Parameters
    ----------
    update_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
                 default None
        Either Path or path-like string pointing to a directory where to find
        a CSV for updating, SQLAlchemy connection or engine object, or
        ``None``, don't update.
    revise_rows : {'nodup', 'auto', int}
        Defines how to process data updates. An integer indicates how many
        rows to remove from the tail of the dataframe and replace with new
        data. String can either be ``auto``, which automatically determines
        number of rows to replace from the inferred data frequency, or
        ``nodup``, which replaces existing periods with new data.
    save_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
               default None
        Either Path or path-like string pointing to a directory where to save
        the CSV, SQLAlchemy connection or engine object, or ``None``,
        don't save.
    only_get : bool, default False
        If True, don't download data, retrieve what is available from
        ``update_loc``.

    Returns
    -------
    Monthly average household income : pd.DataFrame

    """
    name = "income_household"

    if only_get is True and update_loc is not None:
        output = ops._io(operation="update", data_loc=update_loc,
                         name=name)
        if not output.equals(pd.DataFrame()):
            return output

    try:
        raw = pd.read_excel(urls[name]["dl"]["main"], sheet_name="Mensual",
                            skiprows=5, index_col=0).dropna(how="all")
    except URLError as err:
        if "SSL: CERTIFICATE_VERIFY_FAILED" in str(err):
            certificate = Path(get_project_root(), "utils", "files",
                               "ine_certs.pem")
            r = requests.get(urls[name]["dl"]["main"], verify=certificate)
            raw = pd.read_excel(BytesIO(r.content), sheet_name="Mensual",
                                skiprows=5, index_col=0).dropna(how="all")
        else:
            raise err
    raw.index = pd.to_datetime(raw.index)
    output = raw.loc[~pd.isna(raw.index)]
    output.index = output.index + MonthEnd(0)
    output.columns = ["Total país", "Montevideo", "Interior: total",
                      "Interior: localidades de más de 5 mil hab.",
                      "Interior: localidades pequeñas y rural"]
    missing = pd.read_excel(urls[name]["dl"]["missing"],
                            index_col=0, header=0).iloc[:, 10:13]
    missing.columns = output.columns[:3]
    output = output.append(missing, sort=False)
    output = output.apply(pd.to_numeric, errors="coerce")

    if update_loc is not None:
        previous_data = ops._io(operation="update",
                                data_loc=update_loc,
                                name=name)
        output = ops._revise(new_data=output, prev_data=previous_data,
                             revise_rows=revise_rows)

    metadata._set(output, area="Ingresos", currency="UYU",
                  inf_adj="No", unit="Pesos", seas_adj="NSA",
                  ts_type="Flujo", cumperiods=1)

    if save_loc is not None:
        ops._io(operation="save", data_loc=save_loc,
                data=output, name=name)

    return output
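# Hypothetical usage sketch for income_household ("data" is an illustrative
# directory; requires network access to the INE spreadsheets):
#
#     household = income_household(update_loc="data", revise_rows="nodup",
#                                  save_loc="data")
#     household.tail()  # monthly averages for Total país, Montevideo, etc.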
def long_rates(update_loc: Union[str, PathLike, Engine,
                                 Connection, None] = None,
               revise_rows: Union[str, int] = "nodup",
               save_loc: Union[str, PathLike, Engine,
                               Connection, None] = None,
               only_get: bool = False) -> pd.DataFrame:
    """Get 10-year government bond interest rates.

    Countries/aggregates selected are US, Germany, France, Italy, Spain,
    United Kingdom, Japan and China.

    Parameters
    ----------
    update_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
                 default None
        Either Path or path-like string pointing to a directory where to find
        a CSV for updating, SQLAlchemy connection or engine object, or
        ``None``, don't update.
    revise_rows : {'nodup', 'auto', int}
        Defines how to process data updates. An integer indicates how many
        rows to remove from the tail of the dataframe and replace with new
        data. String can either be ``auto``, which automatically determines
        number of rows to replace from the inferred data frequency, or
        ``nodup``, which replaces existing periods with new data.
    save_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
               default None
        Either Path or path-like string pointing to a directory where to save
        the CSV, SQLAlchemy connection or engine object, or ``None``,
        don't save.
    only_get : bool, default False
        If True, don't download data, retrieve what is available from
        ``update_loc``.

    Returns
    -------
    Daily 10-year government bond interest rates : pd.DataFrame

    """
    name = "global_long_rates"

    if only_get is True and update_loc is not None:
        output = ops._io(operation="update", data_loc=update_loc,
                         name=name)
        if not output.equals(pd.DataFrame()):
            return output

    bonds = []
    load_dotenv(Path(get_project_root(), ".env"))
    fred_api_key = os.environ.get("FRED_API_KEY")
    r = requests.get(f"{urls[name]['dl']['fred']}DGS10&api_key="
                     f"{fred_api_key}&file_type=json")
    us = pd.DataFrame.from_records(r.json()["observations"])
    us = us[["date", "value"]].set_index("date")
    us.index = pd.to_datetime(us.index)
    us.columns = ["United States"]
    bonds.append(us.apply(pd.to_numeric, errors="coerce").dropna())

    for country, sid in zip(["Germany", "France", "Italy", "Spain",
                             "United Kingdom", "Japan", "China"],
                            ["23693", "23778", "23738", "23806",
                             "23673", "23901", "29227"]):
        end_date_dt = dt.datetime(2000, 1, 1)
        start_date_dt = dt.datetime(2000, 1, 1)
        aux = []
        while end_date_dt < dt.datetime.now():
            end_date_dt = start_date_dt + dt.timedelta(days=5000)
            params = {
                "curr_id": sid,
                "smlID": str(randint(1000000, 99999999)),
                "header": f"{country} 10-Year Bond Yield Historical Data",
                "st_date": start_date_dt.strftime("%m/%d/%Y"),
                "end_date": end_date_dt.strftime("%m/%d/%Y"),
                "interval_sec": "Daily",
                "sort_col": "date",
                "sort_ord": "DESC",
                "action": "historical_data"
            }
            r = requests.post(urls["global_long_rates"]["dl"]["main"],
                              headers=investing_headers, data=params)
            aux.append(pd.read_html(r.content, match="Price",
                                    index_col=0, parse_dates=True)[0])
            start_date_dt = end_date_dt + dt.timedelta(days=1)
        aux = pd.concat(aux, axis=0)[["Price"]].sort_index()
        aux.columns = [country]
        bonds.append(aux)

    output = bonds[0].join(bonds[1:], how="left")
    output = output.interpolate(method="linear", limit_area="inside")
    output.columns = ["Estados Unidos", "Alemania", "Francia", "Italia",
                      "España", "Reino Unido", "Japón", "China"]

    if update_loc is not None:
        previous_data = ops._io(operation="update",
                                data_loc=update_loc,
                                name=name)
        output = ops._revise(new_data=output, prev_data=previous_data,
                             revise_rows=revise_rows)

    metadata._set(output, area="Global", currency="USD",
                  inf_adj="No", seas_adj="NSA", unit="Tasa",
                  ts_type="-", cumperiods=1)
    metadata._modify_multiindex(output, levels=[3],
                                new_arrays=[["USD", "EUR", "EUR", "EUR",
                                             "EUR", "GBP", "JPY", "CNY"]])

    if save_loc is not None:
        ops._io(operation="save", data_loc=save_loc,
                data=output, name=name)

    return output
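# The Investing.com scrape above pages through daily history in 5000-day
# windows, one POST per window. A standalone sketch of just the windowing
# logic (no HTTP, mirrors the loop above):
#
#     start = dt.datetime(2000, 1, 1)
#     while start < dt.datetime.now():
#         end = start + dt.timedelta(days=5000)
#         print(start.date(), "->", end.date())  # request this date range
#         start = end + dt.timedelta(days=1)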
def gdp(update_loc: Union[str, PathLike, Engine,
                          Connection, None] = None,
        revise_rows: Union[str, int] = "nodup",
        save_loc: Union[str, PathLike, Engine,
                        Connection, None] = None,
        only_get: bool = False) -> pd.DataFrame:
    """Get seasonally adjusted real quarterly GDP for select countries.

    Countries/aggregates are US, EU-27, Japan and China.

    Parameters
    ----------
    update_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
                 default None
        Either Path or path-like string pointing to a directory where to find
        a CSV for updating, SQLAlchemy connection or engine object, or
        ``None``, don't update.
    revise_rows : {'nodup', 'auto', int}
        Defines how to process data updates. An integer indicates how many
        rows to remove from the tail of the dataframe and replace with new
        data. String can either be ``auto``, which automatically determines
        number of rows to replace from the inferred data frequency, or
        ``nodup``, which replaces existing periods with new data.
    save_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
               default None
        Either Path or path-like string pointing to a directory where to save
        the CSV, SQLAlchemy connection or engine object, or ``None``,
        don't save.
    only_get : bool, default False
        If True, don't download data, retrieve what is available from
        ``update_loc``.

    Returns
    -------
    Quarterly real GDP in seasonally adjusted terms : pd.DataFrame

    """
    name = "global_gdp"

    if only_get is True and update_loc is not None:
        output = ops._io(operation="update", data_loc=update_loc,
                         name=name)
        if not output.equals(pd.DataFrame()):
            return output

    chn_y = dt.datetime.now().year + 1
    chn_r = requests.get(f"{urls[name]['dl']['chn_oecd']}{chn_y}-Q4")
    chn_json = chn_r.json()
    chn_datasets = []
    for dataset, start in zip(["0", "1"], ["2011-03-31", "1993-03-31"]):
        raw = chn_json["dataSets"][0]["series"][f"0:0:{dataset}:0"][
            "observations"]
        values = [x[0] for x in raw.values()]
        df = pd.DataFrame(data=values,
                          index=pd.date_range(start=start, freq="Q-DEC",
                                              periods=len(values)),
                          columns=["China"])
        chn_datasets.append(df)
    chn_qoq = chn_datasets[0]
    chn_yoy = chn_datasets[1]
    chn_obs = pd.read_excel(urls["global_gdp"]["dl"]["chn_obs"],
                            index_col=0).dropna(how="all",
                                                axis=1).dropna(how="all",
                                                               axis=0)
    chn_obs = chn_obs.loc[(chn_obs.index > "2011-01-01") &
                          (chn_obs.index < "2016-01-01")]
    chn_yoy["volume"] = chn_obs
    for row in reversed(range(len(chn_yoy.loc[chn_yoy.index <
                                              "2011-01-01"]))):
        if pd.isna(chn_yoy.iloc[row, 1]):
            chn_yoy.iloc[row, 1] = (chn_yoy.iloc[row + 4, 1]
                                    / (1 + chn_yoy.iloc[row + 4, 0] / 100))
    chn_yoy = chn_yoy[["volume"]].loc[chn_yoy.index < "2016-01-01"]
    metadata._set(chn_yoy)
    chn_sa = decompose(chn_yoy[["volume"]].loc[chn_yoy.index < "2016-01-01"],
                       component="seas", method="x13")
    chn_sa = pd.concat([chn_sa, chn_qoq], axis=1)
    for row in range(len(chn_sa)):
        if not pd.isna(chn_sa.iloc[row, 1]):
            chn_sa.iloc[row, 0] = (chn_sa.iloc[row - 1, 0]
                                   * (1 + chn_sa.iloc[row, 1] / 100))
    chn = chn_sa.iloc[:, [0]].div(10)

    gdps = []
    load_dotenv(Path(get_project_root(), ".env"))
    fred_api_key = os.environ.get("FRED_API_KEY")
    for series in ["GDPC1", "CLVMNACSCAB1GQEU272020", "JPNRGDPEXP"]:
        r = requests.get(f"{urls[name]['dl']['fred']}{series}&api_key="
                         f"{fred_api_key}&file_type=json")
        aux = pd.DataFrame.from_records(r.json()["observations"])
        aux = aux[["date", "value"]].set_index("date")
        aux.index = pd.to_datetime(aux.index)
        aux.index = aux.index.shift(3, freq="M") + MonthEnd(0)
        aux.columns = [series]
        aux = aux.apply(pd.to_numeric, errors="coerce")
        if series == "GDPC1":
            aux = aux.div(4)
        elif series == "CLVMNACSCAB1GQEU272020":
            aux = aux.div(1000)
        gdps.append(aux)
    gdps = pd.concat(gdps, axis=1)
    output = pd.concat([gdps, chn], axis=1)
    output.columns = ["Estados Unidos", "Unión Europea", "Japón", "China"]

    if update_loc is not None:
        previous_data = ops._io(operation="update",
                                data_loc=update_loc,
                                name=name)
        output = ops._revise(new_data=output, prev_data=previous_data,
                             revise_rows=revise_rows)

    metadata._set(output, area="Global", currency="USD",
                  inf_adj="Const.", unit="Miles de millones", seas_adj="SA",
                  ts_type="Flujo", cumperiods=1)
    metadata._modify_multiindex(output, levels=[3],
                                new_arrays=[["USD", "EUR", "JPY", "CNY"]])

    if save_loc is not None:
        ops._io(operation="save", data_loc=save_loc,
                data=output, name=name)

    return output
def cpi_measures(update_loc: Union[str, PathLike, Engine,
                                   Connection, None] = None,
                 revise_rows: Union[str, int] = "nodup",
                 save_loc: Union[str, PathLike, Engine,
                                 Connection, None] = None,
                 only_get: bool = False) -> pd.DataFrame:
    """
    Get core CPI, Winsorized CPI, tradable CPI, non-tradable CPI and
    residual CPI.

    Parameters
    ----------
    update_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
                 default None
        Either Path or path-like string pointing to a directory where to find
        a CSV for updating, SQLAlchemy connection or engine object, or
        ``None``, don't update.
    revise_rows : {'nodup', 'auto', int}
        Defines how to process data updates. An integer indicates how many
        rows to remove from the tail of the dataframe and replace with new
        data. String can either be ``auto``, which automatically determines
        number of rows to replace from the inferred data frequency, or
        ``nodup``, which replaces existing periods with new data.
    save_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
               default None
        Either Path or path-like string pointing to a directory where to save
        the CSV, SQLAlchemy connection or engine object, or ``None``,
        don't save.
    only_get : bool, default False
        If True, don't download data, retrieve what is available from
        ``update_loc``.

    Returns
    -------
    Monthly CPI measures : pd.DataFrame

    """
    name = "cpi_measures"

    if only_get is True and update_loc is not None:
        output = ops._io(operation="update", data_loc=update_loc,
                         name=name)
        if not output.equals(pd.DataFrame()):
            return output

    try:
        xls_10_14 = pd.ExcelFile(urls[name]["dl"]["2010-14"])
        xls_15 = pd.ExcelFile(urls[name]["dl"]["2015-"])
        prod_97 = (pd.read_excel(urls[name]["dl"]["1997"], skiprows=5)
                   .dropna(how="any")
                   .set_index("Rubros, Agrupaciones, Subrubros, "
                              "Familias y Artículos").T)
    except URLError as err:
        if "SSL: CERTIFICATE_VERIFY_FAILED" in str(err):
            certificate = Path(get_project_root(), "utils", "files",
                               "ine_certs.pem")
            r = requests.get(urls[name]["dl"]["2010-14"],
                             verify=certificate)
            xls_10_14 = pd.ExcelFile(BytesIO(r.content))
            r = requests.get(urls[name]["dl"]["2015-"],
                             verify=certificate)
            xls_15 = pd.ExcelFile(BytesIO(r.content))
            r = requests.get(urls[name]["dl"]["1997"],
                             verify=certificate)
            prod_97 = (pd.read_excel(BytesIO(r.content), skiprows=5)
                       .dropna(how="any")
                       .set_index("Rubros, Agrupaciones, Subrubros, "
                                  "Familias y Artículos").T)
        else:
            raise err
    weights_97 = (pd.read_excel(urls[name]["dl"]["1997_weights"],
                                index_col=0)
                  .drop_duplicates(subset="Descripción", keep="first"))
    weights = pd.read_excel(xls_10_14, sheet_name=xls_10_14.sheet_names[0],
                            usecols="A:C", skiprows=13,
                            index_col=0).dropna(how="any")
    weights.columns = ["Item", "Weight"]
    weights_8 = weights.loc[weights.index.str.len() == 8]

    sheets = []
    for excel_file in [xls_10_14, xls_15]:
        for sheet in excel_file.sheet_names:
            raw = pd.read_excel(excel_file, sheet_name=sheet,
                                usecols="D:IN", skiprows=8).dropna(how="all")
            proc = raw.loc[:, raw.columns.str.
                           contains("Indice|Índice")].dropna(how="all")
            sheets.append(proc.T)
    complete_10 = pd.concat(sheets)
    complete_10 = complete_10.iloc[:, 1:]
    complete_10.columns = [weights["Item"], weights.index]
    complete_10.index = pd.date_range(start="2010-12-31",
                                      periods=len(complete_10), freq="M")
    diff_8 = complete_10.loc[:, complete_10.columns.get_level_values(
        level=1).str.len() == 8].pct_change()
    win = pd.DataFrame(winsorize(diff_8, limits=(0.05, 0.05), axis=1))
    win.index = diff_8.index
    win.columns = diff_8.columns.get_level_values(level=1)
    cpi_win = win.mul(weights_8.loc[:, "Weight"].T)
    cpi_win = cpi_win.sum(axis=1).add(1).cumprod().mul(100)

    weights_97["Weight"] = (weights_97["Rubro"]
                            .fillna(weights_97["Agrupación, subrubro, "
                                               "familia"])
                            .fillna(weights_97["Artículo"])
                            .drop(columns=["Rubro",
                                           "Agrupación, subrubro, familia",
                                           "Artículo"]))
    prod_97 = prod_97.loc[:, list(cpi_details["1997_base"].keys())]
    prod_97.index = pd.date_range(start="1997-03-31",
                                  periods=len(prod_97), freq="M")
    weights_97 = (weights_97[weights_97["Descripción"]
                  .isin(cpi_details["1997_weights"])]
                  .set_index("Descripción")
                  .drop(columns=["Rubro", "Agrupación, subrubro, "
                                 "familia", "Artículo"])).div(100)
    weights_97.index = prod_97.columns

    prod_10 = complete_10.loc[:, list(cpi_details["2010_base"].keys())]
    prod_10 = prod_10.loc[:, ~prod_10.columns.get_level_values(
        level=0).duplicated()]
    prod_10.columns = prod_10.columns.get_level_values(level=0)
    weights_10 = (weights.loc[weights["Item"].isin(
        list(cpi_details["2010_base"].keys()))]
        .drop_duplicates(subset="Item", keep="first")).set_index("Item")

    items = []
    weights = []
    for item, weight, details in zip([prod_10, prod_97],
                                     [weights_10, weights_97],
                                     ["2010_base", "1997_base"]):
        for tradable in [True, False]:
            items.append(item.loc[:, [k for k, v
                                      in cpi_details[details].items()
                                      if v["Tradable"] is tradable]])
            aux = weight.loc[[k for k, v in cpi_details[details].items()
                              if v["Tradable"] is tradable]]
            weights.append(aux.div(aux.sum()))
        for core in [True, False]:
            items.append(item.loc[:, [k for k, v
                                      in cpi_details[details].items()
                                      if v["Core"] is core]])
            aux = weight.loc[[k for k, v in cpi_details[details].items()
                              if v["Core"] is core]]
            weights.append(aux.div(aux.sum()))

    intermediate = []
    for item, weight in zip(items, weights):
        intermediate.append(item.mul(weight.squeeze()).sum(1))

    output = []
    for x, y in zip(intermediate[:4], intermediate[4:]):
        aux = pd.concat([y.pct_change().loc[y.index < "2011-01-01"],
                         x.pct_change().loc[x.index > "2011-01-01"]])
        output.append(aux.fillna(0).add(1).cumprod().mul(100))

    cpi_re = cpi(update_loc=update_loc, save_loc=save_loc, only_get=True)
    cpi_re = cpi_re.loc[cpi_re.index >= "1997-03-31"]
    output = pd.concat([cpi_re] + output + [cpi_win], axis=1)
    output.columns = ["Índice de precios al consumo: total",
                      "Índice de precios al consumo: transables",
                      "Índice de precios al consumo: no transables",
                      "Índice de precios al consumo: subyacente",
                      "Índice de precios al consumo: residual",
                      "Índice de precios al consumo: Winsorized 0.05"]
    output = output.apply(pd.to_numeric, errors="coerce")
    metadata._set(output, area="Precios y salarios", currency="-",
                  inf_adj="No", unit="2010-12=100", seas_adj="NSA",
                  ts_type="-", cumperiods=1)
    output = transform.rebase(output, start_date="2010-12-01",
                              end_date="2010-12-31")

    if update_loc is not None:
        previous_data = ops._io(operation="update",
                                data_loc=update_loc,
                                name=name)
        output = ops._revise(new_data=output, prev_data=previous_data,
                             revise_rows=revise_rows)

    if save_loc is not None:
        ops._io(operation="save", data_loc=save_loc,
                data=output, name=name)

    return output
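# The "Winsorized 0.05" measure above clips the cross-section of item-level
# monthly price changes at the 5th and 95th percentiles before reweighting.
# A standalone sketch of that step (toy numbers, with 20% limits so the
# clipping is visible on five items):
#
#     import numpy as np
#     from scipy.stats.mstats import winsorize
#     changes = np.array([-0.10, 0.00, 0.01, 0.02, 0.50])
#     winsorize(changes, limits=(0.2, 0.2))
#     # -> [0.00, 0.00, 0.01, 0.02, 0.02]: extremes pulled to inner values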
def cpi(update_loc: Union[str, PathLike, Engine, Connection, None] = None,
        revise_rows: Union[str, int] = "nodup",
        save_loc: Union[str, PathLike, Engine, Connection, None] = None,
        only_get: bool = False) -> pd.DataFrame:
    """Get CPI data.

    Parameters
    ----------
    update_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
                 default None
        Either Path or path-like string pointing to a directory where to find
        a CSV for updating, SQLAlchemy connection or engine object, or
        ``None``, don't update.
    revise_rows : {'nodup', 'auto', int}
        Defines how to process data updates. An integer indicates how many
        rows to remove from the tail of the dataframe and replace with new
        data. String can either be ``auto``, which automatically determines
        number of rows to replace from the inferred data frequency, or
        ``nodup``, which replaces existing periods with new data.
    save_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
               default None
        Either Path or path-like string pointing to a directory where to save
        the CSV, SQLAlchemy connection or engine object, or ``None``,
        don't save.
    only_get : bool, default False
        If True, don't download data, retrieve what is available from
        ``update_loc``.

    Returns
    -------
    Monthly CPI index : pd.DataFrame

    """
    name = "cpi"

    if only_get is True and update_loc is not None:
        output = ops._io(operation="update", data_loc=update_loc,
                         name=name)
        if not output.equals(pd.DataFrame()):
            return output

    try:
        cpi = pd.read_excel(urls[name]["dl"]["main"], skiprows=7,
                            usecols="A:B", index_col=0).dropna()
    except URLError as err:
        if "SSL: CERTIFICATE_VERIFY_FAILED" in str(err):
            certificate = Path(get_project_root(), "utils", "files",
                               "ine_certs.pem")
            r = requests.get(urls[name]["dl"]["main"], verify=certificate)
            cpi = pd.read_excel(BytesIO(r.content), skiprows=7,
                                usecols="A:B", index_col=0).dropna()
        else:
            raise err
    cpi.columns = ["Índice de precios al consumo"]
    cpi.rename_axis(None, inplace=True)
    cpi.index = cpi.index + MonthEnd(1)

    if update_loc is not None:
        previous_data = ops._io(operation="update",
                                data_loc=update_loc,
                                name=name)
        cpi = ops._revise(new_data=cpi, prev_data=previous_data,
                          revise_rows=revise_rows)
    cpi = cpi.apply(pd.to_numeric, errors="coerce")
    metadata._set(cpi, area="Precios", currency="-",
                  inf_adj="No", unit="2010-10=100", seas_adj="NSA",
                  ts_type="-", cumperiods=1)

    if save_loc is not None:
        ops._io(operation="save", data_loc=save_loc,
                data=cpi, name=name)

    return cpi
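# Hypothetical usage sketch for cpi (illustrative paths; the INE endpoint is
# taken from the package's ``urls`` registry):
#
#     prices = cpi(update_loc="data", save_loc="data")
#     prices.pct_change(12).tail()  # year-on-year headline inflation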
def nxr_monthly(update_loc: Union[str, PathLike, Engine,
                                  Connection, None] = None,
                revise_rows: Union[str, int] = "nodup",
                save_loc: Union[str, PathLike, Engine,
                                Connection, None] = None,
                only_get: bool = False) -> pd.DataFrame:
    """Get monthly nominal exchange rate data.

    Parameters
    ----------
    update_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
                 default None
        Either Path or path-like string pointing to a directory where to find
        a CSV for updating, SQLAlchemy connection or engine object, or
        ``None``, don't update.
    revise_rows : {'nodup', 'auto', int}
        Defines how to process data updates. An integer indicates how many
        rows to remove from the tail of the dataframe and replace with new
        data. String can either be ``auto``, which automatically determines
        number of rows to replace from the inferred data frequency, or
        ``nodup``, which replaces existing periods with new data.
    save_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
               default None
        Either Path or path-like string pointing to a directory where to save
        the CSV, SQLAlchemy connection or engine object, or ``None``,
        don't save.
    only_get : bool, default False
        If True, don't download data, retrieve what is available from
        ``update_loc``.

    Returns
    -------
    Monthly nominal exchange rates : pd.DataFrame
        Sell rate, monthly average and end of period.

    """
    name = "nxr_monthly"

    if only_get is True and update_loc is not None:
        output = ops._io(operation="update", data_loc=update_loc,
                         name=name)
        if not output.equals(pd.DataFrame()):
            return output

    try:
        nxr_raw = pd.read_excel(urls[name]["dl"]["main"], skiprows=4,
                                index_col=0, usecols="A,C,F")
    except URLError as err:
        if "SSL: CERTIFICATE_VERIFY_FAILED" in str(err):
            certificate = Path(get_project_root(), "utils", "files",
                               "ine_certs.pem")
            r = requests.get(urls[name]["dl"]["main"], verify=certificate)
            nxr_raw = pd.read_excel(BytesIO(r.content), skiprows=4,
                                    index_col=0, usecols="A,C,F")
        else:
            raise err
    nxr = nxr_raw.dropna(how="any", axis=0)
    nxr.columns = ["Tipo de cambio venta, fin de período",
                   "Tipo de cambio venta, promedio"]
    nxr.index = nxr.index + MonthEnd(1)
    nxr = nxr.apply(pd.to_numeric, errors="coerce")

    if update_loc is not None:
        previous_data = ops._io(operation="update",
                                data_loc=update_loc,
                                name=name)
        nxr = ops._revise(new_data=nxr, prev_data=previous_data,
                          revise_rows=revise_rows)

    metadata._set(nxr, area="Precios", currency="UYU/USD",
                  inf_adj="No", unit="-", seas_adj="NSA",
                  ts_type="-", cumperiods=1)

    if save_loc is not None:
        ops._io(operation="save", data_loc=save_loc,
                data=nxr, name=name)

    return nxr
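# Hypothetical usage sketch for nxr_monthly (illustrative path):
#
#     nxr = nxr_monthly(save_loc="data")
#     nxr.tail()  # sell rate: end-of-period and monthly-average columns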
def rates_people(update_loc: Union[str, PathLike, Engine,
                                   Connection, None] = None,
                 save_loc: Union[str, PathLike, Engine,
                                 Connection, None] = None,
                 only_get: bool = True) -> pd.DataFrame:
    """
    Get labor data, both rates and persons. Extends national data between
    1991 and 2005 with data for jurisdictions with more than 5,000
    inhabitants.

    Parameters
    ----------
    update_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
                 default None
        Either Path or path-like string pointing to a directory where to find
        a CSV for updating, SQLAlchemy connection or engine object, or
        ``None``, don't update.
    save_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
               default None
        Either Path or path-like string pointing to a directory where to save
        the CSV, SQLAlchemy connection or engine object, or ``None``,
        don't save.
    only_get : bool, default True
        If True, don't download data, retrieve what is available from
        ``update_loc``.

    Returns
    -------
    Labor market data : pd.DataFrame

    """
    name = "labor_rates_people"
    rates = labor_rates(update_loc=update_loc, only_get=only_get)
    rates = rates.loc[:, ["Tasa de actividad: total",
                          "Tasa de empleo: total",
                          "Tasa de desempleo: total"]]
    try:
        act_5000 = pd.read_excel(urls[name]["dl"]["act_5000"],
                                 sheet_name="Mensual", index_col=0,
                                 skiprows=8, usecols="A:B").dropna(how="any")
        emp_5000 = pd.read_excel(urls[name]["dl"]["emp_5000"],
                                 sheet_name="Mensual", index_col=0,
                                 skiprows=8, usecols="A:B").dropna(how="any")
        des_5000 = pd.read_excel(urls[name]["dl"]["des_5000"],
                                 sheet_name="Mensual", index_col=0,
                                 skiprows=7, usecols="A:B").dropna(how="any")
        working_age = pd.read_excel(urls[name]["dl"]["population"],
                                    skiprows=7, index_col=0,
                                    nrows=92).dropna(how="all")
    except URLError as err:
        if "SSL: CERTIFICATE_VERIFY_FAILED" in str(err):
            certificate = Path(get_project_root(), "utils", "files",
                               "ine_certs.pem")
            r = requests.get(urls[name]["dl"]["act_5000"],
                             verify=certificate)
            act_5000 = pd.read_excel(BytesIO(r.content),
                                     sheet_name="Mensual", index_col=0,
                                     skiprows=8,
                                     usecols="A:B").dropna(how="any")
            r = requests.get(urls[name]["dl"]["emp_5000"],
                             verify=certificate)
            emp_5000 = pd.read_excel(BytesIO(r.content),
                                     sheet_name="Mensual", index_col=0,
                                     skiprows=8,
                                     usecols="A:B").dropna(how="any")
            r = requests.get(urls[name]["dl"]["des_5000"],
                             verify=certificate)
            des_5000 = pd.read_excel(BytesIO(r.content),
                                     sheet_name="Mensual", index_col=0,
                                     skiprows=7,
                                     usecols="A:B").dropna(how="any")
            r = requests.get(urls[name]["dl"]["population"],
                             verify=certificate)
            working_age = pd.read_excel(BytesIO(r.content), skiprows=7,
                                        index_col=0,
                                        nrows=92).dropna(how="all")
        else:
            raise err
    for df in [act_5000, emp_5000, des_5000]:
        df.index = df.index + MonthEnd(0)
    rates_5000 = pd.concat([act_5000, emp_5000, des_5000], axis=1)
    rates_prev = rates_5000.loc[rates_5000.index < "2006-01-31"]
    rates_prev.columns = rates.columns
    rates = pd.concat([rates_prev, rates])
    rates.columns = rates.columns.set_levels(rates.columns.levels[0]
                                             .str.replace(": total", ""),
                                             level=0)

    ages = list(range(14, 90)) + ["90 y más"]
    working_age = working_age.loc[ages].sum()
    working_age.index = pd.date_range(start="1996-06-30",
                                      end="2050-06-30", freq="A-JUN")
    monthly_working_age = working_age.resample("M").interpolate("linear")
    monthly_working_age = monthly_working_age.reindex(rates.index)
    persons = rates.iloc[:, [0, 1]].div(100).mul(monthly_working_age, axis=0)
    persons["Desempleados"] = rates.iloc[:, 2].div(100).mul(persons.iloc[:, 0])
    persons.columns = ["Activos", "Empleados", "Desempleados"]

    metadata._set(persons, area="Mercado laboral", currency="-",
                  inf_adj="No", unit="Personas", seas_adj="NSA",
                  ts_type="-", cumperiods=1)

    output = pd.concat([rates, persons], axis=1)

    if save_loc is not None:
        ops._io(operation="save", data_loc=save_loc,
                data=output, name=name)

    return output
def hours(update_loc: Union[str, PathLike, Engine,
                            Connection, None] = None,
          revise_rows: Union[str, int] = "nodup",
          save_loc: Union[str, PathLike, Engine,
                          Connection, None] = None,
          only_get: bool = False) -> pd.DataFrame:
    """Get average hours worked data.

    Parameters
    ----------
    update_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
                 default None
        Either Path or path-like string pointing to a directory where to find
        a CSV for updating, SQLAlchemy connection or engine object, or
        ``None``, don't update.
    revise_rows : {'nodup', 'auto', int}
        Defines how to process data updates. An integer indicates how many
        rows to remove from the tail of the dataframe and replace with new
        data. String can either be ``auto``, which automatically determines
        number of rows to replace from the inferred data frequency, or
        ``nodup``, which replaces existing periods with new data.
    save_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
               default None
        Either Path or path-like string pointing to a directory where to save
        the CSV, SQLAlchemy connection or engine object, or ``None``,
        don't save.
    only_get : bool, default False
        If True, don't download data, retrieve what is available from
        ``update_loc``.

    Returns
    -------
    Monthly hours worked : pd.DataFrame

    """
    name = "hours_worked"

    if only_get is True and update_loc is not None:
        output = ops._io(operation="update", data_loc=update_loc,
                         name=name)
        if not output.equals(pd.DataFrame()):
            return output

    try:
        raw = pd.read_excel(urls[name]["dl"]["main"], sheet_name="Mensual",
                            skiprows=5, index_col=0).dropna(how="all")
        prev_hours = pd.read_excel(urls[name]["dl"]["historical"],
                                   index_col=0,
                                   skiprows=8).dropna(how="all").iloc[:, [0]]
    except URLError as err:
        if "SSL: CERTIFICATE_VERIFY_FAILED" in str(err):
            certificate = Path(get_project_root(), "utils", "files",
                               "ine_certs.pem")
            r = requests.get(urls[name]["dl"]["main"], verify=certificate)
            raw = pd.read_excel(BytesIO(r.content), sheet_name="Mensual",
                                skiprows=5, index_col=0).dropna(how="all")
            r = requests.get(urls[name]["dl"]["historical"],
                             verify=certificate)
            prev_hours = pd.read_excel(BytesIO(r.content), index_col=0,
                                       skiprows=8).dropna(
                how="all").iloc[:, [0]]
        else:
            raise err
    raw.index = pd.to_datetime(raw.index)
    output = raw.loc[~pd.isna(raw.index)]
    output.index = output.index + MonthEnd(0)
    output.columns = ["Total", "Industrias manufactureras",
                      "Electricidad, gas, agua y saneamiento",
                      "Construcción", "Comercio",
                      "Transporte y almacenamiento",
                      "Alojamiento y servicios de comidas",
                      "Información y comunicación",
                      "Actividades financieras",
                      "Actividades inmobiliarias y administrativas",
                      "Actividades profesionales",
                      "Administración pública y seguridad social",
                      "Enseñanza", "Salud", "Arte y otros servicios",
                      "Act. de hogares como empleadores",
                      "Agro, forestación, pesca y minería"]

    prev_hours = prev_hours.loc[~prev_hours.index.str.contains("-|Total")]
    prev_hours.index = pd.date_range(start="2006-01-31", freq="M",
                                     periods=len(prev_hours))
    prev_hours = prev_hours.loc[prev_hours.index < "2011-01-01"]
    prev_hours.columns = ["Total"]
    output = prev_hours.append(output, sort=False)

    if update_loc is not None:
        previous_data = ops._io(operation="update",
                                data_loc=update_loc,
                                name=name)
        output = ops._revise(new_data=output, prev_data=previous_data,
                             revise_rows=revise_rows)

    metadata._set(output, area="Mercado laboral", currency="-",
                  inf_adj="No", unit="Horas por semana", seas_adj="NSA",
                  ts_type="Stock", cumperiods=1)

    if save_loc is not None:
        ops._io(operation="save", data_loc=save_loc,
                data=output, name=name)

    return output
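# Hypothetical usage sketch for hours (illustrative path; note that before
# 2011 only the "Total" column is available, via the historical spreadsheet):
#
#     worked = hours(save_loc="data")
#     worked["Total"].head()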
def labor_rates(update_loc: Union[str, PathLike, Engine,
                                  Connection, None] = None,
                revise_rows: Union[str, int] = "nodup",
                save_loc: Union[str, PathLike, Engine,
                                Connection, None] = None,
                only_get: bool = False) -> pd.DataFrame:
    """Get labor market data (LFPR, employment and unemployment).

    Parameters
    ----------
    update_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
                 default None
        Either Path or path-like string pointing to a directory where to find
        a CSV for updating, SQLAlchemy connection or engine object, or
        ``None``, don't update.
    revise_rows : {'nodup', 'auto', int}
        Defines how to process data updates. An integer indicates how many
        rows to remove from the tail of the dataframe and replace with new
        data. String can either be ``auto``, which automatically determines
        number of rows to replace from the inferred data frequency, or
        ``nodup``, which replaces existing periods with new data.
    save_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
               default None
        Either Path or path-like string pointing to a directory where to save
        the CSV, SQLAlchemy connection or engine object, or ``None``,
        don't save.
    only_get : bool, default False
        If True, don't download data, retrieve what is available from
        ``update_loc``.

    Returns
    -------
    Monthly participation, employment and unemployment rates : pd.DataFrame

    """
    name = "labor_rates"

    if only_get is True and update_loc is not None:
        output = ops._io(operation="update", data_loc=update_loc,
                         name=name)
        if not output.equals(pd.DataFrame()):
            return output

    try:
        labor_raw = pd.read_excel(urls[name]["dl"]["main"],
                                  skiprows=39).dropna(axis=0, thresh=2)
    except URLError as err:
        if "SSL: CERTIFICATE_VERIFY_FAILED" in str(err):
            certificate = Path(get_project_root(), "utils", "files",
                               "ine_certs.pem")
            r = requests.get(urls[name]["dl"]["main"], verify=certificate)
            labor_raw = pd.read_excel(BytesIO(r.content),
                                      skiprows=39).dropna(axis=0, thresh=2)
        else:
            raise err
    labor = labor_raw[~labor_raw["Unnamed: 0"].str.
                      contains("-|/|Total", regex=True)]
    labor.index = pd.date_range(start="2006-01-01",
                                periods=len(labor), freq="M")
    labor = labor.drop(columns="Unnamed: 0")
    labor.columns = ["Tasa de actividad: total",
                     "Tasa de actividad: hombres",
                     "Tasa de actividad: mujeres",
                     "Tasa de empleo: total",
                     "Tasa de empleo: hombres",
                     "Tasa de empleo: mujeres",
                     "Tasa de desempleo: total",
                     "Tasa de desempleo: hombres",
                     "Tasa de desempleo: mujeres"]
    missing = pd.read_excel(urls[name]["dl"]["missing"],
                            index_col=0, header=0).iloc[:, :9]
    missing.columns = labor.columns
    labor = labor.append(missing)
    labor = labor.loc[~labor.index.duplicated(keep="first")]

    if update_loc is not None:
        previous_data = ops._io(operation="update",
                                data_loc=update_loc,
                                name=name)
        labor = ops._revise(new_data=labor, prev_data=previous_data,
                            revise_rows=revise_rows)

    labor = labor.apply(pd.to_numeric, errors="coerce")
    metadata._set(labor, area="Mercado laboral", currency="-",
                  inf_adj="No", unit="Tasa", seas_adj="NSA",
                  ts_type="-", cumperiods=1)

    if save_loc is not None:
        ops._io(operation="save", data_loc=save_loc,
                data=labor, name=name)

    return labor
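# Hypothetical usage sketch for labor_rates (illustrative path; revise_rows
# accepts "nodup", "auto" or an integer count of tail rows to replace):
#
#     rates = labor_rates(update_loc="data", revise_rows="auto",
#                         save_loc="data")
#     rates["Tasa de desempleo: total"].tail()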
def nominal_wages(update_loc: Union[str, PathLike, Engine,
                                    Connection, None] = None,
                  revise_rows: Union[str, int] = "nodup",
                  save_loc: Union[str, PathLike, Engine,
                                  Connection, None] = None,
                  only_get: bool = False) -> pd.DataFrame:
    """Get nominal general, public and private sector wages data.

    Parameters
    ----------
    update_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
                 default None
        Either Path or path-like string pointing to a directory where to find
        a CSV for updating, SQLAlchemy connection or engine object, or
        ``None``, don't update.
    revise_rows : {'nodup', 'auto', int}
        Defines how to process data updates. An integer indicates how many
        rows to remove from the tail of the dataframe and replace with new
        data. String can either be ``auto``, which automatically determines
        number of rows to replace from the inferred data frequency, or
        ``nodup``, which replaces existing periods with new data.
    save_loc : str, os.PathLike, SQLAlchemy Connection or Engine, or None, \
               default None
        Either Path or path-like string pointing to a directory where to save
        the CSV, SQLAlchemy connection or engine object, or ``None``,
        don't save.
    only_get : bool, default False
        If True, don't download data, retrieve what is available from
        ``update_loc``.

    Returns
    -------
    Monthly wages separated by public and private sector : pd.DataFrame

    """
    name = "nominal_wages"

    if only_get is True and update_loc is not None:
        output = ops._io(operation="update", data_loc=update_loc,
                         name=name)
        if not output.equals(pd.DataFrame()):
            return output

    try:
        historical = pd.read_excel(urls[name]["dl"]["historical"],
                                   skiprows=8, usecols="A:B")
        current = pd.read_excel(urls[name]["dl"]["current"],
                                skiprows=8, usecols="A,C:D")
    except URLError as err:
        if "SSL: CERTIFICATE_VERIFY_FAILED" in str(err):
            certificate = Path(get_project_root(), "utils", "files",
                               "ine_certs.pem")
            r = requests.get(urls[name]["dl"]["historical"],
                             verify=certificate)
            historical = pd.read_excel(BytesIO(r.content), skiprows=8,
                                       usecols="A:B")
            r = requests.get(urls[name]["dl"]["current"],
                             verify=certificate)
            current = pd.read_excel(BytesIO(r.content), skiprows=8,
                                    usecols="A,C:D")
        else:
            raise err
    historical = historical.dropna(how="any").set_index("Unnamed: 0")
    current = current.dropna(how="any").set_index("Unnamed: 0")
    wages = pd.concat([historical, current], axis=1)
    wages.index = wages.index + MonthEnd(1)
    wages.columns = ["Índice medio de salarios",
                     "Índice medio de salarios privados",
                     "Índice medio de salarios públicos"]

    if update_loc is not None:
        previous_data = ops._io(operation="update",
                                data_loc=update_loc,
                                name=name)
        wages = ops._revise(new_data=wages, prev_data=previous_data,
                            revise_rows=revise_rows)

    wages = wages.apply(pd.to_numeric, errors="coerce")
    metadata._set(wages, area="Mercado laboral", currency="UYU",
                  inf_adj="No", unit="2008-07=100", seas_adj="NSA",
                  ts_type="-", cumperiods=1)

    if save_loc is not None:
        ops._io(operation="save", data_loc=save_loc,
                data=wages, name=name)

    return wages
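# Hypothetical usage sketch for nominal_wages (the indexes are based at
# 2008-07=100, so 12-month percent changes give nominal wage growth):
#
#     wages = nominal_wages(save_loc="data")
#     wages.pct_change(12).tail()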