def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse AU vaccination counts into the standard time-series format."""
    data = dataframes[0]
    data["date"] = data.REPORT_DATE.apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Add level1 keys
    subregion1s = country_subregion1s(aux["metadata"], "AU")
    data = table_merge(
        [data, subregion1s], left_on="CODE", right_on="subregion1_code", how="left"
    )

    # Country-level record has CODE AUS
    country_mask = data["CODE"] == "AUS"
    data.loc[country_mask, "key"] = "AU"

    # Only keep country and subregion1 rows.
    # BUGFIX: `data.key != None` does not drop nulls in pandas — the elementwise
    # comparison against None is True for NaN as well — so use notna() instead.
    data = data[data.key.notna()]

    data = table_rename(
        data,
        {
            "date": "date",
            "key": "key",
            "VACC_DOSE_CNT": "total_vaccine_doses_administered",
            "VACC_PEOPLE_CNT": "total_persons_fully_vaccinated",
        },
        drop=True,
    )

    # Remove rows without any vaccination data
    data.dropna(
        subset=["total_vaccine_doses_administered", "total_persons_fully_vaccinated"],
        how="all",
        inplace=True,
    )

    # Based on the assumption two doses = fully vaccinated (since Australia is
    # using Pfizer and AZ)
    data["total_persons_vaccinated"] = estimate_total_persons_vaccinated(data)
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse ES_CN hospitalization series from a GeoJSON-style features list."""
    with open(sources[0], "r") as fd:
        features = json.load(fd)["features"]

    # Map each SERIE code to the statistic it represents
    serie_map = {"HPT": "hospitalized", "CSR": "intensive_care", "CCR": "ventilator"}
    records = {statistic: [] for statistic in serie_map.values()}
    for record in features:
        statistic = serie_map.get(record["SERIE"])
        if statistic is None:
            # BUGFIX: the original logged the local `statistic`, which is unbound
            # (or stale from a prior iteration) in this branch; report the
            # unknown SERIE value instead.
            self.log_error(f"Unknown statistic type: {record['SERIE']}")
            continue
        records[statistic].append(
            {
                # FECHA is a millisecond epoch timestamp
                "date": datetime.fromtimestamp(record["FECHA"] / 1000).date().isoformat(),
                f"current_{statistic}": record["CV19"],
            }
        )

    # Aggregate each statistic by date, then merge all statistics together
    dataframes = []
    for df in records.values():
        dataframes.append(DataFrame.from_records(df).groupby("date").sum().reset_index())
    data = table_merge(dataframes, how="outer")
    data["key"] = "ES_CN"
    return data
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Convert MY case-level tables into time series keyed by state index."""
    case_tables = [df for name, df in dataframes.items() if name != "geo"]
    adapter = dict(_column_adapter, state="idxs", date="date_new_confirmed")
    data = table_rename(concat(case_tables), column_adapter=adapter, drop=True)

    # Normalize types: index as string, negative ages as missing, sex as a label
    data["idxs"] = data["idxs"].astype(str)
    data["age"] = data["age"].apply(lambda value: None if value < 0 else value)
    data["sex"] = data["sex"].apply({0: "female", 1: "male"}.get)

    # Pivot the case records into our preferred time-series format
    data = convert_cases_to_time_series(data, ["idxs"])

    # Build the geo lookup used to resolve location names from the state index
    geo = table_rename(
        dataframes["geo"],
        {"state": "subregion1_name", "district": "subregion2_name"},
        drop=False,
    )
    geo["idxs"] = geo["idxs"].astype(str)
    geo["subregion1_name"] = geo["subregion1_name"].str.replace("W.P. ", "")
    geo = geo.groupby(["subregion1_name", "idxs"]).first().reset_index()
    data = table_merge([data, geo], on=["idxs"], how="inner")

    # Since only the cases have district level data, ignore it
    data["country_code"] = "MY"
    data["subregion2_name"] = None
    return data
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Merge the per-statistic CH tables and assign country-level keys."""
    # Convert the raw data into numeric values
    for table in dataframes.values():
        table["entries"] = table["entries"].apply(safe_int_cast)

    # Each input table carries one statistic in its "entries" column; rename
    # that column to the statistic's name before merging everything together.
    renamed = []
    for name, table in dataframes.items():
        renamed.append(table_rename(table, dict(_column_adapter, entries=name), drop=True))
    data = table_merge(renamed, on=["date", "subregion1_code"], how="outer")

    # Records match subregion1 only, so blank out the other location columns
    data["key"] = None
    data["country_code"] = "CH"
    data["subregion2_code"] = None
    data["locality_code"] = None

    # Country-level records have a known key
    data.loc[data["subregion1_code"] == "CH", "key"] = "CH"

    # Principality of Liechtenstein is not in CH but is in the data as FL
    data.loc[data["subregion1_code"] == "FL", "key"] = "LI"

    return data
def parse_dataframes(self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Split the CH vaccination table by indicator and merge into one frame."""
    data = table_rename(
        dataframes["fullyVaccPersons"],
        {
            "date": "date",
            "geoRegion": "subregion1_code",
            "type": "_statistic",
            "entries": "_new_count",
            "sumTotal": "_total_count",
        },
        drop=True,
    )

    # Each indicator lives in its own subset of rows; pull them apart and
    # rename the generic count columns to the indicator-specific names.
    indicator_map = {
        "COVID19AtLeastOneDosePersons": "persons_vaccinated",
        "COVID19FullyVaccPersons": "persons_fully_vaccinated",
    }
    tables = []
    for statistic, variable in indicator_map.items():
        subset = data.loc[data["_statistic"] == statistic].drop(columns=["_statistic"])
        subset = subset.rename(
            columns={"_new_count": f"new_{variable}", "_total_count": f"total_{variable}"}
        )
        tables.append(subset)
    merged = table_merge(tables, on=["date", "subregion1_code"], how="outer")

    # Output the results
    return _output_ch_data(merged)
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Melt the JP confirmed/deceased tables and split out the country row."""
    data = table_merge(
        [
            melt(dataframes[name], id_vars=["Date"], var_name="match_string", value_name=value)
            for name, value in [("confirmed", "new_confirmed"), ("deceased", "total_deceased")]
        ]
    )
    data["country_code"] = "JP"

    # Get date in ISO format
    data = data.rename(columns={"Date": "date"})
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y/%m/%d"))

    # Country-level uses the label "ALL".
    # BUGFIX: copy the slice before assigning the key — the original assigned a
    # column on a view of `data` (pandas chained-assignment, SettingWithCopyWarning).
    country_mask = data["match_string"] == "ALL"
    country = data.loc[country_mask].copy()
    data = data.loc[~country_mask]
    country["key"] = "JP"

    # Output the results
    return concat([country, data])
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Combine the AT case and testing tables and derive keys from state IDs."""
    cases = dataframes["confirmed_deceased_recovered"].rename(columns=COMMON_COLUMNS)
    tested = dataframes["tested"].rename(
        columns={"TestGesamt": "total_tested", "MeldeDatum": "Time"}
    )
    data = table_merge([cases, tested], how="outer")

    # Convert date to ISO format
    data["date"] = data["Time"].apply(lambda x: datetime_isoformat(x, "%d.%m.%Y %H:%M:%S"))

    # Create the key from the state ID; ID 10 denotes the whole country
    data["key"] = data["BundeslandID"].apply(lambda x: f"AT_{x}")
    data.loc[data["key"] == "AT_10", "key"] = "AT"

    # Output the results
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse ES_CN municipal case data and aggregate island and region totals."""
    with open(sources[0], "r") as fd:
        features = json.load(fd)["features"]

    # Map each TIPO label to the statistic it represents
    tipo_map = {"Casos": "confirmed", "Fallecidos": "deceased", "Recuperados": "recovered"}
    records = {statistic: [] for statistic in tipo_map.values()}
    for record in features:
        statistic = tipo_map.get(record["TIPO"])
        if statistic is None:
            # BUGFIX: the original logged the local `statistic`, which is unbound
            # (or stale from a prior iteration) in this branch; report the
            # unknown TIPO value instead.
            self.log_error(f"Unknown statistic type: {record['TIPO']}")
            continue
        records[statistic].append({
            # FECHA is a millisecond epoch timestamp
            "date": datetime.fromtimestamp(record["FECHA"] / 1000).date().isoformat(),
            "subregion2_code": record["CODMUN"],
            "subregion2_name": record["MUNICIPIO"],
            f"new_{statistic}": record["CV19_DIA"],
            f"total_{statistic}": record["CV19_AC"],
            "_island": record["ISLA"],
        })

    dataframes = [DataFrame.from_records(df) for df in records.values()]
    data = table_merge(dataframes, how="outer")
    data["key"] = "ES_CN_" + data["subregion2_code"].astype(str)

    # Add the country and region code to all records
    data["country_code"] = "ES"
    data["subregion1_code"] = "CN"

    # Aggregate by island and map to known key
    islands = (
        data.drop(columns=["key", "subregion2_code", "subregion2_name"])
        .groupby(["date", "_island"])
        .sum()
        .reset_index()
    )
    islands["key"] = "ES_CN_" + islands["_island"].apply(_island_map.get)

    # Aggregate the entire autonomous community
    l1 = islands.drop(columns=["key", "_island"]).groupby("date").sum().reset_index()
    l1["key"] = "ES_CN"

    # Drop bogus values
    data = data[data["subregion2_code"] != 0]
    islands = islands[~islands["key"].isna()]

    return concat([data, islands, l1])
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Merge CL confirmed/deceased/tested tables and extract city records."""
    # NOTE: the deceased file name indicates the counts are cumulative, but they are not
    column_specs = [
        ("confirmed", {"Fecha": "date", "Total": "new_confirmed", "Region": "match_string"}),
        ("deceased", {"Fecha": "date", "Total": "total_deceased", "Region": "match_string"}),
        ("tested", {"Fecha": "date", "numero": "new_tested", "Region": "match_string"}),
    ]
    data = table_merge(
        [table_rename(dataframes[name], adapter, drop=True) for name, adapter in column_specs],
        how="outer",
    )

    # Convert date to ISO format
    data["date"] = data["date"].astype(str)

    # Extract cities from the regions
    city = _extract_cities(data)

    # Make sure all records have country code and no subregion code or key
    data["country_code"] = "CL"
    data["key"] = None
    data["subregion2_code"] = None

    # Country is reported as "Total"
    data.loc[data["match_string"] == "Total", "key"] = "CL"

    # Drop bogus records from the data
    data.dropna(subset=["date", "match_string"], inplace=True)

    return concat([data, city])
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Load each RO statistic from its JSON source and merge into one table."""
    tables = []
    for statistic, source_file in sources.items():
        with open(source_file, "r") as fd:
            values = json.load(fd)["values"]
        tables.append(table_rename(DataFrame.from_records(values), {"value": statistic}))
    data = table_merge(tables, how="outer")
    data["key"] = "RO"
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Build CL subregion2-level records with keys derived from region codes."""
    adapters = {
        "confirmed": {
            "Fecha": "date",
            "Casos confirmados": "total_confirmed",
            "Codigo region": "subregion1_code",
            "Codigo comuna": "subregion2_code",
        },
        "deceased": {
            "Fecha": "date",
            "Casos fallecidos": "total_deceased",
            "Codigo region": "subregion1_code",
            "Codigo comuna": "subregion2_code",
        },
    }
    tables = [
        table_rename(dataframes[name], adapter, drop=True) for name, adapter in adapters.items()
    ]
    data = table_merge(tables, how="outer")

    # Convert date to ISO format
    data["date"] = data["date"].astype(str)

    # Parse region codes as zero-padded strings
    data["subregion1_code"] = data["subregion1_code"].apply(lambda x: numeric_code_as_string(x, 2))
    data["subregion2_code"] = data["subregion2_code"].apply(lambda x: numeric_code_as_string(x, 5))

    # Use proper ISO codes for the subregion1 level
    data["subregion1_code"] = data["subregion1_code"].apply(_SUBREGION1_CODE_MAP.get)

    # Extract cities from the municipalities
    city = _extract_cities(data)

    # We can build the key for the data directly from the subregion codes
    data["key"] = "CL_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    # Drop bogus records from the data
    data.dropna(subset=["subregion1_code", "subregion2_code"], inplace=True)

    return concat([data, city])
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts):
    """Merge all cached Eurostat CSV files and keep only known location keys."""
    # Read all files in the eurostat folder and merge them together
    csv_files = (SRC / "data" / "eurostat").glob("*.csv")
    data = table_merge([read_file(path) for path in csv_files], how="outer")
    data = data.dropna(subset=["key"])
    # Use only keys available in metadata
    return data.merge(aux["metadata"][["key"]], how="inner")
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Rename IN columns, normalize dates, and attach subregion1 keys."""
    data = table_rename(dataframes[0], _column_adapter, drop=True)
    data.date = data.date.apply(lambda value: datetime_isoformat(value, "%d/%m/%Y"))

    # Add location keys by joining against the country's subregion1 metadata
    subregion1s = country_subregion1s(aux["metadata"], "IN")
    keys = subregion1s[["key", "subregion1_name"]]
    return table_merge([data, keys], on=["subregion1_name"], how="inner")
def _get_data(url_tpl: str, subregion_code_col: str,
              subregion_code_to_api_id_map: Dict[str, int],
              subregions: DataFrame) -> DataFrame:
    """Fetch records for every subregion concurrently and attach location keys.

    Args:
        url_tpl: URL template passed to `_get_records` for each subregion.
        subregion_code_col: Column of `subregions` holding the subregion codes.
        subregion_code_to_api_id_map: Maps subregion codes to API identifiers.
        subregions: Metadata table with one row per subregion.
    Returns:
        DataFrame with renamed columns and ISO-formatted dates.
    """
    subregion_codes = subregions[subregion_code_col].values
    map_func = partial(_get_records, url_tpl, subregion_code_to_api_id_map)
    # Flatten the per-subregion record lists with a comprehension instead of
    # `sum(lists, [])`, which is accidentally quadratic in the total record count.
    records = [record for result in thread_map(map_func, subregion_codes) for record in result]
    data = DataFrame.from_records(records)
    # `tgl` holds a localized (Indonesian) date string — convert to ISO format
    data['date'] = data.apply(lambda r: _indonesian_date_to_isoformat(r.tgl), axis=1)

    # Add location keys by joining against the subregion metadata
    data = table_merge(
        [data, subregions], left_on="subregion_code", right_on=subregion_code_col, how="left")
    data = table_rename(data, _col_name_map, drop=True)
    return data
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Pivot each input table, merge them, and keep only processable columns."""
    pivoted = [_parse_pivot(df, name) for name, df in dataframes.items()]
    data = table_merge(pivoted, how="outer")
    keep_columns = ["date", "country_code", "match_string", "new_confirmed", "new_deceased"]
    return data[keep_columns].fillna(0)
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Merge all San Francisco tables by date and normalize the date format."""
    renamed = [table_rename(df, _column_adapter, drop=True) for df in dataframes.values()]
    data = table_merge(renamed, on="date", how="outer")
    data["date"] = data["date"].apply(lambda value: datetime_isoformat(value, "%Y/%m/%d"))
    data["key"] = "US_CA_SFO"
    return data
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Merge CH dose and fully-vaccinated counts and derive person totals."""
    doses = table_rename(
        dataframes['vaccDosesAdministered'],
        {
            "date": "date",
            "geoRegion": "subregion1_code",
            "sumTotal": "total_vaccine_doses_administered",
        },
        drop=True,
    )
    fully = table_rename(
        dataframes['fullyVaccPersons'],
        {
            "date": "date",
            "geoRegion": "subregion1_code",
            "sumTotal": "total_persons_fully_vaccinated",
        },
        drop=True,
    )
    data = table_merge([doses, fully], on=["date", "subregion1_code"], how="outer")

    # Assuming fully and partially vaccinated persons have 2 and 1 doses respectively,
    # total_persons_partially_vaccinated = total_vaccine_doses_administered - 2 * total_persons_fully_vaccinated
    # Therefore, total_persons_vaccinated = total_persons_partially_vaccinated + total_persons_fully_vaccinated
    #                                     = total_vaccine_doses_administered - total_persons_fully_vaccinated
    data["total_persons_vaccinated"] = (
        data["total_vaccine_doses_administered"] - data["total_persons_fully_vaccinated"]
    )

    # Make sure all records have the country code and match subregion1 only
    data["key"] = None
    data["country_code"] = "CH"
    data["subregion2_code"] = None
    data["locality_code"] = None

    # Country-level records have a known key
    data.loc[data["subregion1_code"] == "CH", "key"] = "CH"

    # Principality of Liechtenstein is not in CH but is in the data as FL
    data.loc[data["subregion1_code"] == "FL", "key"] = "LI"

    # Output the results
    return data
def _process_inputs(dataframes: Dict[Any, DataFrame]) -> DataFrame:
    """Merge, rename, and clean the MY input tables into one frame."""
    # Combine all tables, then map their columns onto our schema
    merged = table_merge(dataframes.values(), how="outer")
    data = table_rename(merged, _column_adapter, drop=True)
    data["country_code"] = "MY"

    # Remove records with no date
    data = data.dropna(subset=["date"])

    # Fix the subregion names to match our index
    if "subregion1_name" in data.columns:
        data["subregion1_name"] = data["subregion1_name"].str.replace("W.P. ", "")

    # Add up the two testing categories when both are present
    if "new_tested_1" in data.columns and "new_tested_2" in data.columns:
        data["new_tested"] = data["new_tested_1"] + data["new_tested_2"]

    return data
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Flatten the IN vaccination table and attach subregion1 keys."""
    # Flatten the wide table: one column per date -> one row per (state, date)
    data = melt(
        dataframes[0],
        id_vars=["State"],
        var_name="date",
        value_name='total_vaccine_doses_administered',
    )
    data.date = data.date.apply(lambda value: datetime_isoformat(value, "%d/%m/%Y"))

    # Add location keys by matching state names against subregion1 metadata
    subregion1s = country_subregion1s(aux["metadata"], "IN")
    return table_merge(
        [data, subregion1s[['key', 'subregion1_name']]],
        left_on="State",
        right_on='subregion1_name',
        how="inner",
    )
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse WA per-sheet age-binned statistics and add a state-level rollup."""
    rename_opts = dict(drop=True, remove_regex=r"[^0-9a-z\s]")
    sheet_to_statistic = {
        "Cases": "confirmed",
        "Deaths": "deceased",
        "Hospitalizations": "hospitalized",
    }
    tables = []
    for sheet_name, statistic in sheet_to_statistic.items():
        # Substitute this sheet's statistic name into the shared column adapter
        placeholder = f"_{statistic}_"
        adapter = {
            key: value.replace("_stat_", placeholder)
            for key, value in _col_adapter_base.items()
        }
        table = table_rename(dataframes[0][sheet_name], adapter, **rename_opts)
        table["date"] = table["date"].apply(lambda value: str(value)[:10])
        tables.append(table)

    data = table_merge(tables, how="outer", on=["date", "subregion2_name"])

    # Aggregate counties into a single state-level record per date
    state = data.drop(columns=["subregion2_name"]).groupby(["date"]).sum().reset_index()
    state["key"] = "US_WA"

    data = data[data["subregion2_name"] != "Unassigned"]
    data["country_code"] = "US"
    data["subregion1_code"] = "WA"

    # Both tables share the same fixed age bins
    age_bins = ("0-11", "12-19", "20-34", "35-49", "50-64", "65-79", "80-")
    for df in (state, data):
        for index, age_bin in enumerate(age_bins):
            df[f"age_bin_{index:02d}"] = age_bin

    return concat([state, data])
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Merge CA confirmed/deceased tables and add a subregion1-level rollup."""
    tables = [
        table_rename(dataframes[name], _column_adapter, drop=True)
        for name in ("confirmed", "deceased")
    ]
    data = table_merge(tables, how="outer")

    # Province names are sometimes codes (but not always compliant with ISO codes)
    data["subregion1_code"] = data["subregion1_name"].apply(_province_map.get)
    data.drop(columns=["subregion1_name"], inplace=True)

    # Convert date to ISO format
    data["date"] = data["date"].apply(lambda value: datetime_isoformat(value, "%d-%m-%Y"))

    # Aggregate subregion1 level
    l1 = (
        data.drop(columns=["match_string"])
        .groupby(["date", "subregion1_code"])
        .sum()
        .reset_index()
    )

    # Make sure all records have the country code and subregion2_name
    l1["country_code"] = "CA"
    l1["subregion2_name"] = None
    data["country_code"] = "CA"
    data["subregion2_name"] = ""

    # Remove bogus data
    data = data[data["match_string"] != "Not Reported"]

    # Output the results
    return concat([l1, data])
def parse_dataframes(self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse SE weekly vaccination counts, split by dose number."""
    data = table_rename(dataframes[0], _column_adapter, drop=True)

    # Derive an ISO date from the year plus the week number
    data["date"] = data["year"].apply(lambda value: datetime.datetime.strptime(str(value), "%Y"))
    data["date"] = data["date"] + data["week"].apply(lambda value: datetime.timedelta(weeks=value))
    data["date"] = data["date"].apply(lambda value: value.date().isoformat())
    data = data.drop(columns=["week", "year"])

    # Process 1-dose and 2-dose rows separately: the last character of the
    # dose type column indicates the dose number
    dose_subsets = {
        dose: data[data["_dose_type"].str.slice(-1) == dose].drop(columns=["_dose_type"])
        for dose in ("1", "2")
    }
    data_1_dose = dose_subsets["1"].rename(columns={"_total_doses": "total_persons_vaccinated"})
    data_2_dose = dose_subsets["2"].rename(columns={"_total_doses": "total_persons_fully_vaccinated"})
    data = table_merge([data_1_dose, data_2_dose], how="outer")

    # Make sure only subregion1 matches
    data["key"] = None
    data["country_code"] = "SE"
    data["subregion2_code"] = None
    data["locality_code"] = None

    # Country totals are reported using a special name
    data.loc[data["match_string"] == "| Sverige |", "key"] = "SE"

    # Estimate the total doses from person counts
    data["total_vaccine_doses_administered"] = (
        data["total_persons_vaccinated"] + data["total_persons_fully_vaccinated"]
    )

    return data
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse JP hospitalized/recovered columns and split out the country row."""
    column_adapter = {
        "Requiring inpatient care": "new_hospitalized",
        "Discharged from hospital or released from treatment": "new_recovered",
    }
    tables = []
    for col_prev, col_value in column_adapter.items():
        keep_cols = [col for col in dataframes[0].columns if col_prev in col]
        df = dataframes[0][["Date"] + keep_cols]
        df = melt(df, id_vars=["Date"], var_name="match_string", value_name=col_value)
        # The first token of each column header carries the region label wrapped
        # in delimiters; strip the surrounding characters to get the bare label
        df["match_string"] = df["match_string"].apply(lambda x: x.split(" ")[0][1:-1])
        tables.append(df)
    data = table_merge(tables)
    data["country_code"] = "JP"

    # Get date in ISO format
    data = data.rename(columns={"Date": "date"})
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y/%m/%d"))

    # Country-level uses the label "ALL".
    # BUGFIX: copy the slice before assigning the key — the original assigned a
    # column on a view of `data` (pandas chained-assignment, SettingWithCopyWarning).
    country_mask = data["match_string"] == "ALL"
    country = data.loc[country_mask].copy()
    data = data.loc[~country_mask]
    country["key"] = "JP"

    # Output the results
    return concat([country, data])
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse ZA cumulative tables and derive keys from subregion1 codes."""
    parsed = [
        Covid19ZaCumulativeDataSource._parse_variable(df, name)
        for name, df in dataframes.items()
    ]
    data = table_merge(parsed, how="outer")

    # Convert date to ISO format
    data["date"] = data["date"].apply(lambda value: datetime_isoformat(value, "%d-%m-%Y"))

    # Country-level records should have "total" region name;
    # all other records can provide their own key directly
    country_mask = data["subregion1_code"] == "total"
    data.loc[country_mask, "key"] = "ZA"
    data.loc[~country_mask, "key"] = "ZA_" + data.subregion1_code

    # Output the results
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the TX spreadsheet: one processor per sheet, merged by date."""
    sheet_processors = {
        "Trends": TexasDataSource._parse_trends,
        "Tests by Day": TexasDataSource._parse_tests,
        "Hospitalization by Day": TexasDataSource._parse_hospitalized,
    }
    sheets = []
    for sheet_name, processor in sheet_processors.items():
        sheet = processor(read_file(sources[0], sheet_name=sheet_name))
        # Normalize the date column and drop rows that fail to parse
        sheet["date"] = sheet["date"].apply(safe_str_cast)
        sheet["date"] = sheet["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
        sheets.append(sheet.dropna(subset=["date"]))

    data = table_merge(sheets, how="outer")

    # Everything other than the date column is a numeric statistic
    for col in data.columns:
        if col != "date":
            data[col] = data[col].apply(safe_float_cast).astype(float)

    data["key"] = "US_TX"
    return data
def parse_dataframes(self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse PE case-level data into time series with manual Lima disambiguation.

    Converts confirmed/deceased case records into time series, normalizes
    department names, separates the Lima region from the Lima metropolitan
    area, aggregates a subregion1 level, and disambiguates district name
    collisions that arise from skipping the province level.
    """
    # Each case table tracks its own date column name for the time-series conversion
    cases_confirmed = table_rename(
        dataframes["confirmed"], _column_adapter,
        drop=True).rename(columns={"date": "date_new_confirmed"})
    cases_deceased = table_rename(
        dataframes["deceased"], _column_adapter,
        drop=True).rename(columns={"date": "date_new_deceased"})

    # Translate sex label
    for df in (cases_confirmed, cases_deceased):
        df["sex"] = df["sex"].apply({
            "MASCULINO": "male",
            "FEMENINO": "female"
        }.get)

    # Convert to time series
    index_columns = ["subregion1_name", "province_name", "subregion2_name"]
    data_confirmed = convert_cases_to_time_series(cases_confirmed, index_columns)
    data_deceased = convert_cases_to_time_series(cases_deceased, index_columns)

    # Join into a single dataset
    data = table_merge([data_confirmed, data_deceased], how="outer")

    # Remove bogus records
    data.dropna(subset=["date"], inplace=True)

    # Set country code and get date in ISO format.
    # Dates arrive as YYYYMMDD numbers (possibly floats), so round-trip through
    # int and str before parsing — presumably to strip a trailing ".0"; confirm
    # against the raw source format.
    data["country_code"] = "PE"
    data["date"] = data["date"].apply(safe_int_cast)
    data["date"] = data["date"].apply(safe_str_cast)
    data["date"] = data["date"].apply(
        lambda x: datetime_isoformat(x, "%Y%m%d"))

    # Properly capitalize department to allow for exact matching
    data["subregion1_name"] = data["subregion1_name"].apply(
        lambda x: _department_map.get(x, x.title()))

    # Lima region and lima department are mixed in data, we can distinguish based on province
    # Sometimes region is something different, so for Lima province we only need `province_name`
    lima_region_mask = data["subregion1_name"].str.lower() == "lima"
    lima_province_mask = data["province_name"].str.lower() == "lima"
    data.loc[lima_province_mask, "subregion1_name"] = "Metropolitan Municipality of Lima"
    data.loc[lima_region_mask & ~lima_province_mask, "subregion1_name"] = "Lima Region"

    # Aggregate by admin level 1
    subregion1 = (data.drop(
        columns=["subregion2_name", "province_name"]).groupby(
            ["date", "country_code", "subregion1_name", "age",
             "sex"]).sum().reset_index())
    subregion1["subregion2_name"] = None

    # Try to match based on subregion2_name using fuzzy matching, and set subregion2_name to
    # an empty string to turn off exact matching
    data = data.rename(columns={"subregion2_name": "match_string"})
    data["subregion2_name"] = ""

    # Convert other text fields to lowercase for consistent processing
    data["match_string"] = data["match_string"].apply(fuzzy_text)
    data["province_name"] = data["province_name"].apply(fuzzy_text)

    # Drop bogus records
    data = data[~data["match_string"].isna()]
    data = data[~data["match_string"].
                isin(["", "eninvestigacion", "extranjero"])]

    # Because we are skipping provinces and going directly from region to district, there are
    # some name collisions which we have to disambiguate manually
    for province1, province2, district in [
            ("lima", "canete", "sanluis"),
            ("lima", "yauyos", "miraflores"),
            ("ica", "chincha", "pueblonuevo"),
            ("canete", "huarochiri", "sanantonio"),
            ("bolognesi", "huaylas", "huallanca"),
            ("lucanas", "huancasancos", "sancos"),
            ("santacruz", "cutervo", "santacruz"),
            ("yauli", "jauja", "yauli"),
            ("yauli", "jauja", "paccha"),
            ("huarochiri", "yauyos", "laraos"),
            ("elcollao", "melgar", "santarosa"),
    ]:
        # Qualify the colliding district with its province in both provinces
        for province in (province1, province2):
            mask = (data["province_name"] == province) & (data["match_string"] == district)
            data.loc[mask, "match_string"] = f"{district}, {province}"

    # Output the results
    return concat([subregion1, data])