Example #1
def test_read_parametric(sample_pridb):
    param = sample_pridb.read_parametric()

    assert len(param) == len(PARAMETRIC_EXPECTED)
    assert param.index.name == "set_id"
    assert param.index.dtype == int64
    assert dict(param.dtypes) == {
        "param_id": Int64Dtype(),
        "time": float64,
        "param_id": Int64Dtype(),
        "pctd": Int64Dtype(),
        "pcta": Int64Dtype(),
    }
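These tests pin Int64Dtype() rather than plain int64 because the extension dtype is nullable: integer columns keep their integer type when values are missing instead of being upcast to float64. A minimal sketch of the difference (plain pandas, not part of the original test suite):

import pandas as pd

# Nullable extension dtype: the missing value becomes pd.NA and the dtype stays Int64.
s = pd.Series([1, None], dtype="Int64")
print(s.dtype)  # Int64

# Plain numpy integers cannot represent missing values, so pandas upcasts to float64.
t = pd.Series([1, None])
print(t.dtype)  # float64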
Example #2
def test_read(sample_pridb):
    df = sample_pridb.read()

    assert df.index.name == "set_id"
    assert df.index.dtype == int64
    assert dict(df.dtypes) == {
        "set_type": int64,
        "time": float64,
        "channel": Int64Dtype(),
        "param_id": Int64Dtype(),
        "threshold": float64,
        "amplitude": float64,
        "rise_time": float64,
        "cascade_counts": Int64Dtype(),
        "cascade_energy": float64,
        "cascade_hits": Int64Dtype(),
        "cascade_signal_strength": float64,
        "counts": Int64Dtype(),
        "duration": float64,
        "energy": float64,
        "rms": float64,
        "signal_strength": float64,
        "trai": Int64Dtype(),
        "pctd": Int64Dtype(),
        "pcta": Int64Dtype(),
    }
Example #3
def test_read_markers(sample_pridb):
    markers = sample_pridb.read_markers()

    assert len(markers) == len(LABELS_EXPECTED)
    assert markers.index.name == "set_id"
    assert markers.index.dtype == int64
    assert dict(markers.dtypes) == {
        "time": float64,
        "set_type": Int64Dtype(),
        "number": Int64Dtype(),
        "data": dtype("O"),
    }

    labels = list(markers["data"])
    assert labels == LABELS_EXPECTED
Example #4
class TemplatePipelineChain(PipelineChain):
    """
    Pipeline chain for `template` data, which will output a table with the schema described below.
    For very simple pipelines (e.g. a single source), this class can be placed in the same file as
    the one defining the pipeline. See [MetadataPipelineChain] for an example of a very simple pipeline.
    """

    schema: Dict[str, type] = {
        "date": str,
        "key": str,
        "column1": Int64Dtype(),
        "column2": str,
    }
    """ Defines the schema of the output table, dtypes str, float and Int64 are supported """

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [
        (
            SourceNamePipeline(),
            {
                "parse_opts": {},
                "merge_opts": {},
                "filter_func": None
            },
        ),
        (RExamplePipeline(), {}),
    ]
    """ Defines the pipelines to be run in order to produce the combined, full output """
Example #5
File: pipeline.py Project: zhanghegui/data
def _parse_dtype(dtype_name: str) -> type:
    if dtype_name == "str":
        return str
    if dtype_name == "int":
        return Int64Dtype()
    if dtype_name == "float":
        return float
    raise TypeError(f"Unsupported dtype: {dtype_name}")
Example #6
class EconomyPipelineChain(PipelineChain):

    schema: Dict[str, type] = {
        "key": str,
        "gdp": Int64Dtype(),
        "gdp_per_capita": Int64Dtype(),
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [(
        WikidataPipeline(),
        {
            "parse_opts": {
                "properties": {
                    "gdp": "P2131",
                    "gdp_per_capita": "P2132",
                }
            }
        },
    )]
Example #7
def integral_extension_dtypes(self):
    return ([
        "Int8",
        "Int16",
        "Int32",
        "Int64",
        Int8Dtype(),
        Int16Dtype(),
        Int32Dtype(),
        Int64Dtype(),
    ] if extension_dtypes_available else [])
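The list mixes string aliases with dtype instances because pandas treats the two interchangeably; a quick check, assuming a pandas version with extension dtypes available:

import pandas as pd

s = pd.Series([1, 2, 3])
# "Int64" (the string alias) and pd.Int64Dtype() name the same extension dtype.
assert s.astype("Int64").dtype == s.astype(pd.Int64Dtype()).dtype == pd.Int64Dtype()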
Example #8
    def test_as_spark_type_extension_dtypes(self):
        from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype

        type_mapper = {
            Int8Dtype(): ByteType(),
            Int16Dtype(): ShortType(),
            Int32Dtype(): IntegerType(),
            Int64Dtype(): LongType(),
        }

        for extension_dtype, spark_type in type_mapper.items():
            self.assertEqual(as_spark_type(extension_dtype), spark_type)
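The pairing follows signed bit width: pandas Int8/Int16/Int32/Int64 line up with Spark's ByteType/ShortType/IntegerType/LongType. A standalone spot-check, assuming as_spark_type is importable from pyspark.pandas.typedef as in recent PySpark releases:

from pandas import Int64Dtype
from pyspark.pandas.typedef import as_spark_type
from pyspark.sql.types import LongType

# Both sides are 64-bit signed integers, hence the mapping.
assert as_spark_type(Int64Dtype()) == LongType()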
Example #9
class GeographyPipelineChain(PipelineChain):

    schema: Dict[str, type] = {
        "key": str,
        "latitude": float,
        "longitude": float,
        "elevation": Int64Dtype(),
        "area": Int64Dtype(),
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [(
        WikidataPipeline(),
        {
            "parse_opts": {
                "properties": {
                    "latitude": "P625",
                    "longitude": "P625",
                    "elevation": "P2044",
                    "area": "P2046",
                }
            }
        },
    )]
Example #10
def spark_type_to_pandas_dtype(
    spark_type: types.DataType, *, use_extension_dtypes: bool = False
) -> Dtype:
    """Return the given Spark DataType to pandas dtype."""

    if use_extension_dtypes and extension_dtypes_available:
        # IntegralType
        if isinstance(spark_type, types.ByteType):
            return Int8Dtype()
        elif isinstance(spark_type, types.ShortType):
            return Int16Dtype()
        elif isinstance(spark_type, types.IntegerType):
            return Int32Dtype()
        elif isinstance(spark_type, types.LongType):
            return Int64Dtype()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(spark_type, types.BooleanType):
                return BooleanDtype()
            # StringType
            elif isinstance(spark_type, types.StringType):
                return StringDtype()

        # FractionalType
        if extension_float_dtypes_available:
            if isinstance(spark_type, types.FloatType):
                return Float32Dtype()
            elif isinstance(spark_type, types.DoubleType):
                return Float64Dtype()

    if isinstance(
        spark_type,
        (
            types.DateType,
            types.NullType,
            types.ArrayType,
            types.MapType,
            types.StructType,
            types.UserDefinedType,
        ),
    ):
        return np.dtype("object")
    elif isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    else:
        return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
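A usage sketch, assuming the surrounding pyspark-pandas module where types is pyspark.sql.types: with use_extension_dtypes=True, integral Spark types map to the nullable pandas extension dtypes instead of their numpy counterparts:

from pyspark.sql import types

# Nullable extension dtype when requested...
spark_type_to_pandas_dtype(types.LongType(), use_extension_dtypes=True)  # Int64Dtype()
# ...otherwise the plain numpy dtype, obtained via the Arrow round-trip.
spark_type_to_pandas_dtype(types.LongType())  # dtype('int64')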
Example #11
class IndexPipelineChain(PipelineChain):

    schema: Dict[str, type] = {
        "key": str,
        "wikidata": str,
        "datacommons": str,
        "country_code": str,
        "country_name": str,
        "subregion1_code": str,
        "subregion1_name": str,
        "subregion2_code": str,
        "subregion2_name": str,
        "3166-1-alpha-2": str,
        "3166-1-alpha-3": str,
        "aggregation_level": Int64Dtype(),
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [(IndexPipeline(), {})]
Example #12
def parse_dtype(dtype_name: str) -> Any:
    """
    Parse a dtype name into its pandas name. Only the following dtypes are supported in
    our table schemas:

    | column type label | pandas dtype |
    | ----------------- | ------------ |
    | str               | str          |
    | int               | Int64        |
    | float             | float        |

    Arguments:
        dtype_name: label of the dtype object
    Returns:
        type: dtype object
    """
    if dtype_name == "str":
        return "str"
    if dtype_name == "int":
        return Int64Dtype()
    if dtype_name == "float":
        return "float"
    raise TypeError(f"Unsupported dtype: {dtype_name}")
Example #13
class DemographicsPipelineChain(PipelineChain):

    schema: Dict[str, type] = {
        "key": str,
        "population": Int64Dtype(),
        "life_expectancy": float,
        "human_development_index": float,
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [
        (
            WikidataPipeline(),
            {
                "parse_opts": {
                    "properties": {
                        "population": "P1082",
                        "life_expectancy": "P2250",
                        "human_development_index": "P1081",
                    }
                }
            },
        )
    ]
Example #14
def test_read_hits(sample_pridb):
    hits = sample_pridb.read_hits()

    assert len(hits) == len(HITS_EXPECTED)
    assert hits.index.name == "set_id"
    assert hits.index.dtype == int64
    assert dict(hits.dtypes) == {
        "time": float64,
        "channel": Int64Dtype(),
        "param_id": Int64Dtype(),
        "threshold": float64,
        "amplitude": float64,
        "rise_time": float64,
        "cascade_counts": Int64Dtype(),
        "cascade_energy": float64,
        "cascade_hits": Int64Dtype(),
        "cascade_signal_strength": float64,
        "counts": Int64Dtype(),
        "duration": float64,
        "energy": float64,
        "rms": float64,
        "signal_strength": float64,
        "trai": Int64Dtype(),
    }
Example #15
def test_london_cleaner():
    unclean_input = pd.DataFrame.from_dict(
        {
            "Place (Overall)": [12547, 34146],
            "Place (Gender)": [9390, 20833],
            "Place (Category)": [4345, 3132],
            "Name": ["»A Smith, Matthew (GBR) \n", "»Aalders, Jennifer (GBR) \n"],
            "Sex": ["M", "W"],
            "Club": ["Lymm Runners", "Tynny Trotters"],
            "Running Number": ["Runner Number40546", "Runner Number23235"],
            "Category": ["18-39", pd.NA],
            "Finish": ["0 days 03:59:33", "0 days 06:22:20"],
            "Year": [2021, 2021],
        }
    )

    exp_output = pd.DataFrame.from_dict(
        {
            "Place (Overall)": [12547, 34146],
            "Place (Gender)": [9390, 20833],
            "Place (Category)": [4345, 3132],
            "Name": ["A Smith Matthew", "Aalders Jennifer"],
            "Sex": ["M", "F"],
            "Club": ["Lymm Runners", "Tynny Trotters"],
            "Running Number": ["40546", "23235"],
            "Category": ["18-39", "Unknown"],
            "Finish": [
                pd.Timedelta("0 days 03:59:33"),
                pd.Timedelta("0 days 06:22:20"),
            ],
            "Year": [2021, 2021],
            "Country": ["GBR", "GBR"],
            "FirstName": ["Matthew", "Jennifer"],
            "LastName": ["A Smith", "Aalders"],
            "DSQ": [False, False],
            "Finish (Total Seconds)": [14373.0, 22940.0],
        }
    ).astype(
        {
            "Place (Overall)": Int64Dtype(),
            "Place (Gender)": Int64Dtype(),
            "Place (Category)": Int64Dtype(),
            "Name": dtype("O"),
            "Sex": dtype("O"),
            "Club": dtype("O"),
            "Running Number": dtype("O"),
            "Category": CategoricalDtype(
                categories=[
                    "18-39",
                    "40-44",
                    "45-49",
                    "50-54",
                    "55-59",
                    "60-64",
                    "65-69",
                    "70+",
                    "70-74",
                    "75-79",
                    "80-84",
                    "85+",
                    "80+",
                    "Unknown",
                ],
                ordered=False,
            ),
            "Finish": dtype("<m8[ns]"),
            "Year": Int64Dtype(),
            "Country": dtype("O"),
            "FirstName": dtype("O"),
            "LastName": dtype("O"),
            "DSQ": dtype("bool"),
            "Finish (Total Seconds)": dtype("float64"),
        }
    )

    actual_output = london_cleaner(unclean_input)

    pd.testing.assert_frame_equal(actual_output, exp_output, check_categorical=False)
Example #16
def test_output_attributes(scraper_output):
    results = scraper_output
    exp_cols = [
        "Place (Overall)",
        "Place (Gender)",
        "Place (Category)",
        "Name",
        "Sex",
        "Club",
        "Running Number",
        "Category",
        "Finish",
        "Year",
        "Country",
        "FirstName",
        "LastName",
        "DSQ",
        "Finish (Total Seconds)",
    ]

    exp_dtypes = pd.Series({
        "Place (Overall)": Int64Dtype(),
        "Place (Gender)": Int64Dtype(),
        "Place (Category)": dtype("float64"),
        "Name": dtype("O"),
        "Sex": dtype("O"),
        "Club": dtype("O"),
        "Running Number": dtype("O"),
        "Category": CategoricalDtype(
            categories=[
                "18-39",
                "40-44",
                "45-49",
                "50-54",
                "55-59",
                "60-64",
                "65-69",
                "70+",
                "70-74",
                "75-79",
                "80-84",
                "85+",
                "80+",
                "Unknown",
            ],
            ordered=False,
        ),
        "Finish": dtype("<m8[ns]"),
        "Year": Int64Dtype(),
        "Country": dtype("O"),
        "FirstName": dtype("O"),
        "LastName": dtype("O"),
        "DSQ": dtype("bool"),
        "Finish (Total Seconds)": dtype("float64"),
    })

    exp_rows_min = 1000  # One sex for one year should give at least this many

    assert exp_cols == list(results.columns), "Expected columns not found"
    assert exp_rows_min <= results.shape[0], "Fewer than the minimum expected number of rows"

    assert exp_dtypes.values.tolist() == results.dtypes.values.tolist()
Example #17
class EpidemiologyPipelineChain(PipelineChain):

    schema: Dict[str, type] = {
        "date": str,
        "key": str,
        "new_confirmed": Int64Dtype(),
        "new_deceased": Int64Dtype(),
        "new_recovered": Int64Dtype(),
        "new_tested": Int64Dtype(),
        "total_confirmed": Int64Dtype(),
        "total_deceased": Int64Dtype(),
        "total_recovered": Int64Dtype(),
        "total_tested": Int64Dtype(),
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [
        # Start with yesterday's data to make sure that we carry over datapoints in case the data
        # source has gone offline or is temporarily unavailable
        # (OpenCovid19Pipeline(), {}),
        # Data sources for all countries level 1
        (OurWorldInDataPipeline(), {}),
        (ECDCPipeline(), {}),
        # Data sources for AR level 2
        (
            WikipediaPipeline(
                "{}/{}/Argentina_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {
                "parse_opts": {
                    "date_format": "%d %b",
                    "country": "AR",
                    "skiprows": 1,
                    "cumsum": True,
                }
            },
        ),
        # Data sources for AT level 2
        (
            Covid19EuDataPipeline("AT"),
            # Remove dates with known bad data
            # TODO: apply patch to make up for missing dates
            {"filter_func": lambda x: not x.date in ["2020-04-14", "2020-04-15"]},
        ),
        # Data sources for AU level 2
        (Covid19AuPipeline(), {}),
        (
            WikipediaPipeline(
                "{}/{}/Australia_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%d %B", "country": "AU", "cumsum": True}},
        ),
        # Data sources for BO level 2
        (
            WikipediaPipeline(
                "{}/{}/Bolivia_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {
                "parse_opts": {
                    "date_format": "%b %d",
                    "country": "BO",
                    "skiprows": 1,
                    "droprows": "Date(2020)",
                }
            },
        ),
        # Data sources for BR level 2
        (Covid19BrazilTimeseriesPipeline(), {}),
        # Data sources for CA level 2
        (CanadaPipeline(), {}),
        # Data sources for CH level 2
        (OpenZHPipeline(), {}),
        # Data sources for CL level 2
        (
            WikipediaPipeline(
                "{}/{}/Chile_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%Y-%m-%d", "country": "CL", "skiprows": 1}},
        ),
        # Data sources for CN level 2
        (DXYPipeline(), {"parse_opts": {"country_name": "China"}}),
        # Data sources for CO levels 2 + 3
        (ColombiaPipeline(), {}),
        # Data sources for CZ level 2
        (Covid19EuDataPipeline("CZ"), {}),
        # Data sources for DE level 2
        (Covid19GermanyPipeline(), {}),
        # Data sources for ES levels 1 + 2
        # (DatadistaPipeline(), {}),
        (ISCIIIPipeline(), {}),
        # Data sources for FR level 2
        (
            WikipediaPipeline(
                "{}/{}/France_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%Y-%m-%d", "country": "FR", "skiprows": 1}},
        ),
        (FranceCovid19Pipeline(), {}),
        # Data sources for GB levels 2 + 3
        (Covid19UkDataL2Pipeline(), {}),
        (Covid19UkDataL3Pipeline(), {}),
        # Data sources for ID level 2
        (CatchmeupPipeline(), {}),
        # Data sources for IN level 2
        (
            WikipediaPipeline("{}/2020_coronavirus_pandemic_in_India".format(_wiki_base_url)),
            {"parse_opts": {"date_format": "%b-%d", "country": "IN", "skiprows": 1}},
        ),
        # Data sources for IT level 2
        (PcmDpcL1Pipeline(), {}),
        (PcmDpcL2Pipeline(), {}),
        # Data sources for JP level 2
        (
            WikipediaPipeline(
                "{}/{}/Japan_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%Y/%m/%d", "country": "JP", "skiprows": 2}},
        ),
        (Jp2019NcovJapanByDate(), {}),
        # Data sources for KR level 2
        (
            WikipediaPipeline(
                "{}/{}/South_Korea_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%Y-%m-%d", "country": "KR", "skiprows": 1}},
        ),
        # Data sources for MY level 2
        (
            WikipediaPipeline("{}/2020_coronavirus_pandemic_in_Malaysia".format(_wiki_base_url)),
            {
                "parse_opts": {
                    "date_format": "%d/%m",
                    "country": "MY",
                    "cumsum": True,
                    "drop_column": "deceased",
                }
            },
        ),
        # Data sources for MX level 2
        (MexicoCovid19Pipeline(), {}),
        # Data sources for NL levels 2 + 3
        (CoronaWatchNlPipeline(), {}),
        # Data sources for NO level 2
        (Covid19EuDataPipeline("NO"), {}),
        # Data sources for PE level 2
        (
            WikipediaPipeline(
                "https://es.wikipedia.org/wiki/Pandemia_de_enfermedad_por_coronavirus_de_2020_en_Per%C3%BA"
            ),
            {
                "parse_opts": {
                    "date_format": "%d de %B",
                    "country": "PE",
                    "locale": "es_ES",
                    "skiprows": 1,
                }
            },
        ),
        # Data sources for PK level 2
        (
            WikipediaPipeline(
                "{}/{}/Pakistan_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {
                "parse_opts": {
                    "date_format": "%b %d",
                    "country": "PK",
                    "skiprows": 1,
                    "cumsum": True,
                }
            },
        ),
        # Data sources for PL level 2
        (Covid19EuDataPipeline("PL"), {}),
        # Data sources for PT level 2
        (Covid19PtPipeline(), {}),
        # Data sources for RU level 2
        (
            WikipediaPipeline(
                "{}/{}/Russia_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%d %b", "country": "RU", "skiprows": 1}},
        ),
        # Data sources for SE level 2
        (Covid19EuDataPipeline("SE"), {}),
        # Data sources for SI level 1
        (SloveniaPipeline(), {}),
        # Data sources for US levels 2 + 3
        (CovidTrackingPipeline(), {}),
        (NytCovidL2Pipeline(), {}),
        (NytCovidL3Pipeline(), {}),
    ]
Example #18
def test_registry_byte_size_dtype(sound_subreg):
    from pandas import Int64Dtype

    assert sound_subreg["byte_size"].dtype == Int64Dtype()
Example #19
class OxfordGovernmentResponsePipelineChain(PipelineChain):

    schema: Dict[str, type] = {
        "date": str,
        "key": str,
        "school_closing": Int64Dtype(),
        "workplace_closing": Int64Dtype(),
        "cancel_public_events": Int64Dtype(),
        "restrictions_on_gatherings": Int64Dtype(),
        "public_transport_closing": Int64Dtype(),
        "stay_at_home_requirements": Int64Dtype(),
        "restrictions_on_internal_movement": Int64Dtype(),
        "international_travel_controls": Int64Dtype(),
        "income_support": Int64Dtype(),
        "debt_relief": Int64Dtype(),
        "fiscal_measures": Int64Dtype(),
        "international_support": Int64Dtype(),
        "public_information_campaigns": Int64Dtype(),
        "testing_policy": Int64Dtype(),
        "contact_tracing": Int64Dtype(),
        "emergency_investment_in_healthcare": Int64Dtype(),
        "investment_in_vaccines": Int64Dtype(),
        "stringency_index": float,
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [
        (OxfordGovernmentResponsePipeline(), {})
    ]