Example #1
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename the appropriate columns
        data = dataframes[0].rename(columns={
            "Fecha": "date"
        }).set_index("date")

        # Deceased counts live in the columns suffixed with "_D"; the matching
        # column without the suffix holds the confirmed counts
        deceased_columns = [col for col in data.columns if col.endswith("_D")]
        confirmed_columns = [col[:-2] for col in deceased_columns]

        deceased = data[deceased_columns]
        confirmed = data[confirmed_columns]
        # Align column names so both frames pivot and merge on the same keys
        deceased.columns = confirmed.columns

        deceased = pivot_table(deceased, pivot_name="subregion1_code")
        deceased = deceased.rename(columns={"value": "deceased"})
        confirmed = pivot_table(confirmed, pivot_name="subregion1_code")
        confirmed = confirmed.rename(columns={"value": "confirmed"})

        data = confirmed.merge(deceased).sort_values(["date", "subregion1_code"])

        # Output the results
        data = grouped_diff(data, ["subregion1_code", "date"])
        data["country_code"] = "MX"
        return data
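Example #1 and nearly every example that follows rely on a small set of shared helpers (pivot_table, grouped_diff, grouped_cumsum) that are never shown here. The following is a minimal sketch of their assumed behavior, inferred purely from the call sites; the real implementations may differ in details such as column naming and NA handling.

from typing import List
from pandas import DataFrame


def pivot_table(data: DataFrame, pivot_name: str = "pivot", value_name: str = "value") -> DataFrame:
    # Melt a wide, date-indexed table into long format: one row per (index, column) pair
    index_name = data.index.name or "index"
    return data.reset_index().melt(id_vars=[index_name], var_name=pivot_name, value_name=value_name)


def grouped_diff(data: DataFrame, keys: List[str]) -> DataFrame:
    # Assumed behavior: values are cumulative totals; convert them to daily changes
    # within each group. The last key is taken to be the date column.
    group_keys = keys[:-1]
    value_columns = [col for col in data.columns if col not in keys]
    data = data.sort_values(keys)
    for col in value_columns:
        data[col] = data.groupby(group_keys)[col].diff()
    return data


def grouped_cumsum(data: DataFrame, keys: List[str]) -> DataFrame:
    # Assumed counterpart of grouped_diff: accumulate daily values into running totals
    group_keys = keys[:-1]
    value_columns = [col for col in data.columns if col not in keys]
    data = data.sort_values(keys)
    for col in value_columns:
        data[col] = data.groupby(group_keys)[col].cumsum()
    return data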
Example #2
    def parse_dataframes(
        self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = dataframes[0]
        column_tokens = ["confirmed_", "deaths_", "recovered_"]
        data = data[[col for col in data.columns if any(token in col for token in column_tokens)]]
        data = data.drop(
            columns=["cases_confirmed_new", "cases_unconfirmed_new", "deaths_new", "recovered_new"]
        )
        data["date"] = dataframes[0].date.apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

        subsets = []
        for token in column_tokens:
            df = data[["date"] + [col for col in data.columns if token in col]]
            df = pivot_table(df.set_index("date"), pivot_name="match_string")
            df.match_string = df.match_string.apply(lambda x: x.split("_", 2)[1])
            df = df.rename(columns={"value": token.split("_")[0]})
            subsets.append(df)

        data = subsets[0]
        for df in subsets[1:]:
            data = data.merge(df, how="outer")
        data = data.rename(columns={"deaths": "deceased"})

        data = data[data.match_string != "unconfirmed"]
        data = grouped_diff(data, ["match_string", "date"])
        data["country_code"] = "PT"
        return data
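Example #2 converts dates with datetime_isoformat, and later examples use safe_int_cast / safe_float_cast. Again a hedged sketch inferred from the call sites, not the actual implementations:

from datetime import datetime
from typing import Any, Optional


def datetime_isoformat(value: Any, date_format: str) -> Optional[str]:
    # Assumed behavior: parse `value` with `date_format` and return an ISO 8601
    # date string, or None when parsing fails
    try:
        return datetime.strptime(str(value), date_format).date().isoformat()
    except ValueError:
        return None


def safe_int_cast(value: Any) -> Optional[int]:
    # Assumed behavior: best-effort cast to int, returning None on failure
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return None


def safe_float_cast(value: Any) -> Optional[float]:
    # Assumed behavior: best-effort cast to float, returning None on failure
    try:
        return float(value)
    except (TypeError, ValueError):
        return None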
Example #3
    def _parse(file_path: str, sheet_name: str, value_name: str):
        data = read_file(file_path, sheet_name=sheet_name)

        data.columns = [col.replace("NHS ", "").replace(
            " total", "") for col in data.iloc[1]]
        # Drop Golden Jubilee National Hospital - it has no hospitalizations and does not fit
        # any current matches in metadata.csv.
        data = data.drop(columns=["Golden Jubilee National Hospital"])
        data = data.iloc[2:].rename(columns={"Date": "date"})

        data = pivot_table(data.set_index("date"), pivot_name="match_string")
        data = data.rename(columns={"value": value_name})
        data[value_name] = (
            data[value_name].replace("*", None).apply(safe_float_cast).astype(float)
        )

        # Get date in ISO format
        data.date = data.date.apply(lambda x: x.date().isoformat())

        # Add metadata
        data["key"] = None
        data["country_code"] = "GB"
        data["subregion1_code"] = "SCT"
        l2_mask = data.match_string == "Scotland"
        data.loc[l2_mask, "key"] = "GB_SCT"

        return data
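Example #3 loads its input through a read_file helper. A plausible sketch, assuming it simply dispatches to the matching pandas reader based on the file extension (the real helper likely supports more formats and options):

import pandas
from pandas import DataFrame


def read_file(path: str, sheet_name: str = None, **read_opts) -> DataFrame:
    # Assumed behavior: pick a pandas reader based on the file extension
    if path.endswith((".xlsx", ".xls")):
        return pandas.read_excel(path, sheet_name=sheet_name or 0, **read_opts)
    if path.endswith(".csv"):
        return pandas.read_csv(path, **read_opts)
    raise ValueError(f"Unsupported file type: {path}")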
Example #4
    def _parse(file_path: str, sheet_name: str, value_name: str):
        data = read_file(file_path, sheet_name=sheet_name)
        data.columns = [
            col.replace("NHS ", "").replace(" total", "")
            for col in data.iloc[1]
        ]
        data = data.iloc[2:].rename(columns={"Date": "date"})

        data = pivot_table(data.set_index("date"), pivot_name="match_string")
        data = data.rename(columns={"value": value_name})
        data[value_name] = (
            data[value_name].replace("*", None).apply(safe_float_cast).astype(float)
        )

        # Get date in ISO format
        data.date = data.date.apply(lambda x: x.date().isoformat())

        # Compute cumsum of values
        data = grouped_cumsum(data, ["match_string", "date"])

        # Add metadata
        data["key"] = None
        data["country_code"] = "GB"
        data["subregion1_code"] = "SCT"
        l2_mask = data.match_string == "Scotland"
        data.loc[l2_mask, "key"] = "GB_SCT"

        return data
Example #5
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        deceased = table_rename(dataframes["deceased"],
                                {"FECHA / CCAA": "date"})
        deceased = pivot_table(deceased.set_index("date"),
                               value_name="new_deceased",
                               pivot_name="match_string")

        # Convert dates to ISO format
        deceased["date"] = deceased["date"].apply(lambda x: str(x)[:10])
        deceased["date"] = deceased["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d"))

        # Add the country code to all records and declare matching as subregion1
        deceased["country_code"] = "ES"
        deceased["subregion2_code"] = None
        deceased["locality_code"] = None

        # Country level is declared as "espana"
        deceased["key"] = None
        deceased.loc[deceased["match_string"] == "espana", "key"] = "ES"

        # Output the results
        return deceased.dropna(subset=["date"])
Example #6
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        data = pivot_table(dataframes[0].set_index("date"),
                           pivot_name="match_string").rename(
                               columns={"value": "total_confirmed"})

        # Remove cities from output
        data = data[~data.match_string.isin(["La Guaira", "Los Roques"])]

        # Add country code and return
        data["country_code"] = "VE"
        return data
Example #7
def _parse_pivot(data: DataFrame, name: str):

    # Remove the last four columns, which contain bogus values
    data = data.iloc[:, :-4]

    # Convert date to ISO format
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(str(x), "%Y%m%d"))
    data = pivot_table(data.set_index("date")).rename(
        columns={"value": name, "pivot": "match_string"}
    )

    # Add the country code to all records
    data["country_code"] = "JP"

    # Output the results
    return data
Example #8
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        data = read_file(sources[0], sheet_name="Antal per dag region").rename(
            columns={"Statistikdatum": "date"})

        # Get date in ISO format
        data.date = data.date.astype(str)

        # Unpivot the regions which are columns
        data.columns = [col.replace("_", " ") for col in data.columns]
        data = data.drop(columns=["Totalt antal fall"]).set_index("date")
        data = pivot_table(data, pivot_name="match_string")

        data["country_code"] = "SE"
        return data.rename(columns={"value": "new_confirmed"})
Example #9
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        deceased = table_rename(dataframes["deceased"],
                                {"FECHA / CCAA": "date"})
        deceased = pivot_table(deceased.set_index("date"),
                               value_name="new_deceased",
                               pivot_name="match_string")

        # Convert dates to ISO format
        deceased["date"] = deceased["date"].apply(lambda x: str(x)[:10])

        # Add the country code to all records
        deceased["country_code"] = "ES"

        # Output the results
        return deceased
Example #10
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = None
        for column_name, df in dataframes.items():
            df = df.rename(columns={"Fecha": "date"}).set_index("date")
            df = pivot_table(df, pivot_name="match_string").rename(
                columns={"value": column_name})
            if data is None:
                data = df
            else:
                data = data.merge(df, how="left")

        # Country-level records have a specific label
        data["country_code"] = "MX"
        data["subregion2_code"] = None
        data.loc[data.match_string == "Nacional", "key"] = "MX"

        return data
Example #11
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        df = dataframes[0]
        # The first row holds the real header
        df.columns = df.iloc[0]
        df = df.rename(columns={"Provinsi": "date"})
        df = df.iloc[1:].set_index("date")

        # Drop unnamed columns and transpose before melting into long format
        df = df[df.columns.dropna()]
        df = pivot_table(df.transpose(), pivot_name="match_string")
        df["date"] = df["date"].apply(_parse_date)
        df = df.dropna(subset=["date"])
        df = df.rename(columns={"value": "total_confirmed"})
        df["total_confirmed"] = df["total_confirmed"].apply(
            safe_int_cast).astype("Int64")

        df = df[["date", "match_string", "total_confirmed"]]
        df = df[df["match_string"] != "Total"]
        df = df[df["match_string"] != "Dalam proses investigasi"]
        df["country_code"] = "ID"
        return df
Example #12
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = None
        ordered_columns = [
            "confirmed", "deceased", "tested", "hospitalized", "intensive_care"
        ]
        for column_name, df in zip(ordered_columns, dataframes):
            df = df.rename(columns={"Fecha": "date"}).set_index("date")
            df = pivot_table(df, pivot_name="match_string").rename(
                columns={"value": column_name})
            if data is None:
                data = df
            else:
                data = data.merge(df, how="left")

        # Compute the cumsum of data
        data = grouped_cumsum(data, ["match_string", "date"])
        data["country_code"] = "MX"

        # Country-level records have a specific label
        data.loc[data.match_string == "Nacional", "key"] = "MX"

        return data
Example #13
    def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        if parse_opts.get("debug"):
            print("File name:", sources[0])

        # Get the file contents from source
        html_content = open(sources[0]).read()

        # We need to set locale in order to parse dates properly
        locale.setlocale(locale.LC_TIME, parse_opts.get("locale", "en_US") + ".UTF-8")

        # Tables keep changing order, so iterate through all until one looks good
        table_count = count_html_tables(html_content, selector="table.wikitable")

        data = None
        for table_index in range(table_count):
            data = read_html(
                html_content,
                header=True,
                selector="table.wikitable",
                parser=wiki_html_cell_parser,
                table_index=table_index,
                skiprows=parse_opts.get("skiprows", 0),
            )

            if parse_opts.get("debug"):
                print("\n[%d] Data:" % (table_index + 1))
                print(data.columns)
                print(data.head(50))

            # Some of the tables are in Spanish
            data = data.rename(columns={"Fecha": "Date"})

            # Set first date column as index, drop other date columns
            columns_lowercase = [(col or "").lower() for col in data.columns]
            date_index = columns_lowercase.index("date") if "date" in columns_lowercase else 0
            del_index = [i for i, col in enumerate(columns_lowercase) if col == "date"][1:]
            data = data.iloc[:, [i for i, _ in enumerate(data.columns) if i not in del_index]]
            data = data.set_index(data.columns[date_index])
            if parse_opts.get("droprows") is not None:
                try:
                    data = data.drop(parse_opts["droprows"].split(","))
                except:
                    pass

            # Pivot the table to fit our preferred format
            data = pivot_table(data, pivot_name="subregion")
            data = data[~data["subregion"].isna()]

            if parse_opts.get("debug"):
                print("\n[%d] Pivoted:" % (table_index + 1))
                print(data.head(50))

            # Make sure all dates include year
            date_format = parse_opts["date_format"]
            if "%Y" not in date_format:
                date_format = date_format + "-%Y"
                data["date"] = data["date"].astype(str) + "-%d" % datetime.datetime.now().year

            # Parse into datetime object, drop if not possible
            data["date"] = data["date"].apply(lambda date: safe_datetime_parse(date, date_format))
            data = data[~data["date"].isna()]

            # If the dataframe is not empty, then we found a good one
            if len(data) > 10 and len(data["subregion"].unique()) > 3:
                break

        # Make sure we have *some* data
        assert data is not None and len(data) > 0

        # Convert all dates to ISO format
        data["date"] = data["date"].apply(lambda date: date.date().isoformat())

        # Get the confirmed and deaths data from the table
        data["confirmed"] = data["value"].apply(lambda x: safe_int_cast(self._parenthesis(x)[0]))
        data["deceased"] = data["value"].apply(lambda x: safe_int_cast(self._parenthesis(x)[1]))

        # Add up all the rows with same Date and subregion
        data = data.sort_values(["date", "subregion"])
        data = (
            data.drop(columns=["value"])
            .groupby(["subregion", "date"])
            .agg(self._aggregate_region_values)
        )
        data = data.reset_index().sort_values(["date", "subregion"])

        # Add the appropriate columns to the data
        value_columns = ["confirmed", "deceased"]
        for col in value_columns:
            data["new_" + col] = None
            data["total_" + col] = None

        # Iterate over the individual subregions to process the values per group
        for region in data["subregion"].unique():
            mask = data["subregion"] == region

            for column in value_columns:

                # We can forward-fill values if data is cumsum
                if parse_opts.get("cumsum"):
                    data.loc[mask, column] = data.loc[mask, column].ffill()

                # Fill NA with zero to allow for column-wide operations
                zero_filled = data.loc[mask, column].fillna(0)

                # Only perform operation if the column is not all NaN
                if sum(zero_filled) > 0:
                    # Compute diff of the values region by region if required
                    if parse_opts.get("cumsum"):
                        data.loc[mask, "new_" + column] = zero_filled.diff()
                        data.loc[mask, "total_" + column] = zero_filled
                    # If values are already new daily counts, then empty most likely means zero
                    else:
                        data.loc[mask, "new_" + column] = zero_filled
                        data.loc[mask, "total_" + column] = zero_filled.cumsum()

        # Get rid of rows which have all null values
        data = data.dropna(how="all", subset=value_columns)

        # Add the country code to all records
        data["country_code"] = parse_opts["country"]

        # Labels can be any arbitrary column name
        data = data.rename(columns={"subregion": "match_string"})

        # Drop columns if requested
        if "drop_column" in parse_opts:
            data = data.drop(columns=[parse_opts["drop_column"]])

        # Filter out the appropriate levels
        aggregation_level = parse_opts.get("aggregation_level", 1)
        if aggregation_level == 1:
            null_column = "subregion2_code"
        elif aggregation_level == 2:
            null_column = "subregion1_code"
        data[null_column] = None

        # Remove known values that are just noise
        data["_match_string"] = data["match_string"].apply(lambda x: x.lower())
        data = data[
            ~data["_match_string"].isin(
                [
                    "cml",
                    "new",
                    "total",
                    "tests",
                    "deaths",
                    "abroad",
                    "airport",
                    "current",
                    "newcases",
                    "acumulado",
                    "repatriated",
                    "totaltested",
                    "confirmed cases",
                    "unassigned\ncases",
                    "airport screening",
                ]
            )
        ]
        data = data[~data["_match_string"].apply(lambda x: "princess" in x or "total" in x)]

        # Output the results
        if parse_opts.get("debug"):
            print("\nOutput:")
            print(data.head(50))

        return data
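Example #13 references self._parenthesis and self._aggregate_region_values without showing them. Based on how the values are consumed (a confirmed count with the death count in parentheses, e.g. "123 (45)"), a hypothetical sketch of the former might look like this:

import re
from typing import Optional, Tuple


def _parenthesis(value) -> Tuple[Optional[str], Optional[str]]:
    # Assumed behavior: split a cell like "123 (45)" into ("123", "45");
    # either part may be missing
    match = re.search(r"(\d+)?\s*(?:\((\d+)\))?", str(value))
    return (match.group(1), match.group(2)) if match else (None, None)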
Example #14
    def _parse_variable(data: DataFrame, var_name: str) -> DataFrame:
        data = data.drop(columns=["YYYYMMDD", "UNKNOWN", "source"])
        return pivot_table(
            data.set_index("date"), pivot_name="subregion1_code", value_name=var_name
        )
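For reference, this is the long format that the pivot_table sketch above produces on a toy input; the region codes here are made up for illustration:

from pandas import DataFrame

wide = DataFrame(
    {"date": ["2020-03-01", "2020-03-02"], "AGU": [1, 2], "BCN": [3, 4]}
).set_index("date")
long = pivot_table(wide, pivot_name="subregion1_code", value_name="new_confirmed")
# Expected result: one row per (date, subregion1_code) pair
#   date        subregion1_code  new_confirmed
#   2020-03-01  AGU              1
#   2020-03-02  AGU              2
#   2020-03-01  BCN              3
#   2020-03-02  BCN              4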