示例#1
0
def test_write_constraint(specimen):
    """'constraint' argument to writer.write_dataset.

    A ContentConstraint built from the DSD limits which series appear in the
    converted pandas object.
    """
    with specimen("ng-ts.xml") as f:
        msg = sdmx.read_sdmx(f)

    # Fetch the message's DSD
    assert msg.structure.is_external_reference
    # NB the specimen included in tests/data has 'ECB_EXR_NG' as the
    #    data structure ID; but a query against the web service gives
    #    'ECB_EXR1' for the same data structure.
    id = "ECB_EXR1"
    dsd = (
        sdmx.Client(msg.structure.maintainer.id).get("datastructure", id).structure[id]
    )

    # Create a ContentConstraint
    cc = dsd.make_constraint({"CURRENCY": "JPY+USD"})

    # Write the message without constraint
    s1 = sdmx.to_pandas(msg)
    assert len(s1) == 12
    assert set(s1.index.to_frame()["CURRENCY"]) == {"CHF", "GBP", "JPY", "USD"}

    # Writing using the constraint produces fewer items; only those matching
    # the constraint
    s2 = sdmx.to_pandas(msg, constraint=cc)
    assert len(s2) == 6
    assert set(s2.index.to_frame()["CURRENCY"]) == {"JPY", "USD"}
示例#2
0
def test_doc_example():
    """Code from example.rst.

    Exercises the full documented workflow: retrieve a DSD, print code lists,
    query data, convert to pandas, and cross-check dimension descriptors.
    NOTE(review): performs live queries against the ESTAT web service.
    """
    import sdmx

    estat = sdmx.Client("ESTAT")

    metadata = estat.datastructure("DSD_une_rt_a")

    for cl in "CL_AGE", "CL_UNIT":
        print(sdmx.to_pandas(metadata.codelist[cl]))

    resp = estat.data("une_rt_a",
                      key={"GEO": "EL+ES+IE"},
                      params={"startPeriod": "2007"})

    data = sdmx.to_pandas(resp).xs("Y15-74", level="AGE", drop_level=False)

    # Selecting by full key must not raise
    data.loc[("A", "Y15-74", "PC_ACT", "T")]

    # Further checks per https://github.com/dr-leo/pandaSDMX/issues/157

    # DimensionDescriptor for the structure message
    dd1 = metadata.structure.DSD_une_rt_a.dimensions

    # DimensionDescriptor retrieved whilst validating the data message
    dd2 = resp.data[0].structured_by.dimensions

    # DimensionDescriptors have same ID, components and order
    assert dd1 == dd2

    # One SeriesKey from the data message
    sk = list(resp.data[0].series.keys())[0]

    # Key values have same order as in the DSD
    assert dd1.order_key(sk) == sk
示例#3
0
 def test_freq_in_series_attribute(self, req):
     """Regression test for issues #39 and #41.

     INSEE time series provide the FREQ value as an attribute on the series
     instead of a dimension. This caused a runtime error when writing as a
     pandas dataframe.
     """
     # Test that we don't have regression on Issues #39 and #41
     # INSEE time series provide the FREQ value as attribute on the series
     # instead of a dimension. This caused a runtime error when writing as
     # pandas dataframe.
     data_response = sdmx.read_sdmx(SERIES["UNEMPLOYMENT_CAT_A_B_C"]["data-fp"])
     # The conversion itself is the assertion: it must not raise
     sdmx.to_pandas(data_response)
示例#4
0
 def test_write2pandas(self, msg):
     """Conversion with and without attributes yields the expected shapes."""
     # Without attributes: a bare pd.Series of observation values
     df = sdmx.to_pandas(msg, attributes="")
     assert isinstance(df, pd.Series)
     assert df.shape == (12, )
     # with metadata: one column per attribute plus the value column
     df = sdmx.to_pandas(msg, attributes="osgd")
     assert df.shape == (12, 8)
     assert df.iloc[1].OBS_STATUS == "A"
示例#5
0
    def test_to_pandas(self, msg):
        """A single DataSet and a length-1 data list convert identically."""
        # Single data series is converted to pd.Series
        data_series = sdmx.to_pandas(msg.data[0])
        assert isinstance(data_series, pd.Series)

        # When len(msg.data) is 1, the data series in a single Dataset are
        # unwrapped automatically
        assert len(msg.data) == 1
        data_series2 = sdmx.to_pandas(msg.data)  # NB no '[0]' index
        pdt.assert_series_equal(data_series, data_series2)
示例#6
0
def test_write_data_arguments():
    """Invalid `attributes` arguments to to_pandas() are rejected."""
    msg = sdmx.read_sdmx(test_files(kind="data")["argvalues"][0])

    # A non-string value for `attributes` raises TypeError
    with raises(TypeError):
        sdmx.to_pandas(msg, attributes=2)

    # Characters outside the set 'dgso' raise ValueError
    with raises(ValueError):
        sdmx.to_pandas(msg, attributes="foobarbaz")
示例#7
0
def test_write_data_arguments(specimen):
    """Invalid `attributes` arguments to to_pandas() are rejected."""
    # The identity here is not important; any non-empty DataMessage will work
    with specimen("INSEE/CNA-2010-CONSO-SI-A17.xml") as f:
        msg = sdmx.read_sdmx(f)

    # Attributes must be a string
    with raises(TypeError):
        sdmx.to_pandas(msg, attributes=2)

    # Attributes must contain only 'dgso'
    with raises(ValueError):
        sdmx.to_pandas(msg, attributes="foobarbaz")
示例#8
0
    def test_endpoint(self, cache_path, client, endpoint, args):
        """Smoke-test one web service endpoint.

        The response is cached to disk and must convert via to_pandas()
        without raising.
        """
        # See sdmx.testing._generate_endpoint_tests() for values of `endpoint`
        cache = cache_path.with_suffix(f".{endpoint}.xml")
        result = client.get(endpoint, tofile=cache, **args)

        # For debugging
        # print(cache, cache.read_text(), result, sep='\n\n')
        # assert False

        # Conversion must not raise
        sdmx.to_pandas(result)

        # Release the (possibly large) message before the next case
        del result
示例#9
0
    def test_endpoints(self, req, endpoint, args):
        """Smoke-test one web service endpoint.

        The response is cached to disk and must convert via to_pandas()
        without raising.
        """
        # See pytest_generate_tests() for values of 'endpoint'
        cache = self._cache_path.with_suffix(f".{endpoint}.xml")
        result = req.get(endpoint, tofile=cache, **args)

        # For debugging
        # print(cache, cache.read_text(), result, sep='\n\n')
        # assert False

        # Conversion must not raise
        sdmx.to_pandas(result)

        # Release the (possibly large) message before the next case
        del result
示例#10
0
def test_exr_constraints():
    """Dimensions, attributes, and (not yet implemented) code constraints.

    Fixes two vacuous assertions: ``assert x, expected`` parses the second
    operand as the assertion *message*, not a comparison, so the checks
    never compared anything. Replaced with ``==``.
    """
    with specimen("1/structure-full.xml") as f:
        m = sdmx.read_sdmx(f)
    ECB_EXR1 = m.structure["ECB_EXR1"]

    # Test DimensionDescriptor
    dd = ECB_EXR1.dimensions

    # Correct order
    assert dd[0].id == "FREQ"

    # Correct number of dimensions
    assert len(dd.components) == 6

    # Dimensions can be retrieved by name; membership can be tested
    assert "W" in dd.get("FREQ")

    # Similar tests for AttributeDescriptor
    ad = ECB_EXR1.attributes
    assert len(ad.components) == 24
    assert ad[-1].id == "UNIT_MULT"
    assert "5" in ad.get("UNIT_MULT")

    # NB pytest.xfail() raises immediately, so everything below is currently
    #    unreachable; it documents the intended behaviour once constrained
    #    codes are implemented.
    pytest.xfail("constrained codes not implemented")
    assert len(m._constrained_codes) == 14

    assert "W" not in m._constrained_codes.FREQ

    key = {"FREQ": ["W"]}

    assert m.in_codes(key)

    assert not m.in_constraints(key, raise_error=False)

    with pytest.raises(ValueError):
        m.in_constraints(key)

    assert m.in_constraints({"CURRENCY": ["CHF"]})

    # test with invalid key
    with pytest.raises(TypeError):
        m._in_constraints({"FREQ": "A"})

    # structure writer with constraints
    out = sdmx.to_pandas(m)
    cl = out.codelist
    assert cl.shape == (3555, 2)

    # unconstrained codelists
    out = sdmx.to_pandas(m, constraint=False)
    cl = out.codelist
    assert cl.shape == (4177, 2)
示例#11
0
def test_flat():
    """A DataMessage built programmatically matches one read from JSON.

    Recreates the content of the 'flat.json' specimen using the information
    model classes, converts both to pandas, and compares the results.
    """
    # Create a bare Message
    msg = DataMessage()

    # Recreate the content from exr-flat.json
    header = Header(
        id="62b5f19d-f1c9-495d-8446-a3661ed24753",
        prepared="2012-11-29T08:40:26Z",
        sender=model.Agency(id="ECB"),
    )
    msg.header = header

    ds = DataSet()

    # Create a Key and attributes
    key = Key(
        FREQ="D",
        CURRENCY="NZD",
        CURRENCY_DENOM="EUR",
        EXR_TYPE="SP00",
        EXR_SUFFIX="A",
        TIME_PERIOD="2013-01-18",
    )
    obs_status = DataAttribute(id="OBS_STATUS")
    attr = {"OBS_STATUS": AttributeValue(value_for=obs_status, value="A")}

    # Four observations: two dates × two currencies, all sharing `attr`
    ds.obs.append(
        Observation(dimension=key, value=1.5931, attached_attribute=attr))

    key = key.copy(TIME_PERIOD="2013-01-21")
    ds.obs.append(
        Observation(dimension=key, value=1.5925, attached_attribute=attr))

    key = key.copy(CURRENCY="RUB", TIME_PERIOD="2013-01-18")
    ds.obs.append(
        Observation(dimension=key, value=40.3426, attached_attribute=attr))

    key = key.copy(TIME_PERIOD="2013-01-21")
    ds.obs.append(
        Observation(dimension=key, value=40.3000, attached_attribute=attr))

    msg.data.append(ds)

    # Write to pd.Dataframe
    df1 = sdmx.to_pandas(msg)

    # The same content read from the specimen file must convert identically
    with specimen("flat.json") as f:
        ref = sdmx.read_sdmx(f)
    df2 = sdmx.to_pandas(ref)

    assert_pd_equal(df1, df2)
示例#12
0
def test_write_conceptscheme(specimen):
    """Concept schemes in a structure message convert to pandas objects."""
    with specimen("common-structure.xml") as f:
        msg = sdmx.read_sdmx(f)
        data = sdmx.to_pandas(msg)

    # The CROSS_DOMAIN_CONCEPTS scheme exposes concept names by ID
    concepts = data["concept_scheme"]["CROSS_DOMAIN_CONCEPTS"]
    assert concepts.loc["UNIT_MEASURE", "name"] == "Unit of Measure"
示例#13
0
def test_doc_howto_timeseries(specimen):
    """Code from the 'howto' documentation on reshaping time series."""
    with specimen("sg-ts.xml") as f:
        ds = sdmx.read_sdmx(f).data[0]

    # Convert to pd.Series and unstack the time dimension to columns
    base = sdmx.to_pandas(ds)
    s1 = base.unstack("TIME_PERIOD")

    # DatetimeIndex on columns
    s1.columns = pd.to_datetime(s1.columns)
    assert isinstance(s1.columns, pd.DatetimeIndex)

    # DatetimeIndex on index
    s2 = base.unstack("TIME_PERIOD").transpose()
    s2.index = pd.to_datetime(s2.index)
    assert isinstance(s2.index, pd.DatetimeIndex)

    # Same with pd.PeriodIndex; monthly frequency is inferred from the data
    s3 = s1.to_period(axis=1)
    assert isinstance(s3.columns, pd.PeriodIndex)
    assert s3.columns.freqstr == "M"

    s4 = s2.to_period(axis=0)
    assert isinstance(s4.index, pd.PeriodIndex)
    assert s4.index.freqstr == "M"
示例#14
0
def test_write_dataflow():
    """Dataflow definitions in a structure message convert to pandas.

    Replaces dated ``str.format`` calls with f-strings, consistent with
    f-string usage elsewhere in the file.
    """
    # Read the INSEE dataflow definition
    with specimen("INSEE/dataflow") as f:
        msg = sdmx.read_sdmx(f)

    # Convert to pandas
    result = sdmx.to_pandas(msg, include="dataflow")

    # Number of Dataflows described in the file
    assert len(result["dataflow"]) == 663

    # ID and names of first Dataflows
    mbop = "Monthly Balance of Payments - "
    expected = pd.Series(
        {
            "ACT-TRIM-ANC": "Activity by sex and age - Quarterly series",
            "BPM6-CCAPITAL": f"{mbop}Capital account",
            "BPM6-CFINANCIER": f"{mbop}Financial account",
            "BPM6-CTRANSACTION": f"{mbop}Current transactions account",
            "BPM6-TOTAL": f"{mbop}Overall total and main headings",
        }
    )
    assert_pd_equal(result["dataflow"].head(), expected)
示例#15
0
def get_sdmx(source=None, **args):
    """Retrieve data from *source* using :mod:`sdmx`.

    Parameters
    ----------
    source : str
        Name of a data source recognized by ``sdmx1``, e.g. 'OECD'.
    args
        Other arguments to :meth:`sdmx.Request.get`.

    Returns
    -------
    pandas.DataFrame
        Data with attributes preserved ("dgso"); the index is reset and every
        former index column is converted to a categorical dtype.
    """
    # SDMX client for the data source
    req = sdmx.Request(source=source)

    # commented: for debugging
    # args.setdefault('tofile', 'debug.json')

    # Retrieve the data
    msg = req.get(resource_type="data", **args)

    # Convert to pd.DataFrame, preserving attributes
    df = sdmx.to_pandas(msg, attributes="dgso")
    # Remember the index level names before resetting
    index_cols = df.index.names

    # Reset index, use categoricals
    return df.reset_index().astype({c: "category" for c in index_cols})
示例#16
0
    def test_pandas(self, msg):
        """Series-level and attribute-level conversion of a data message."""
        data = msg.data[0]

        series_keys = list(data.series.keys())

        # Number of series in dataframe
        assert len(series_keys) == 4

        # Convert the observations for one SeriesKey to a pd.Series
        s3_key = series_keys[3]
        s3 = sdmx.to_pandas(data.series[s3_key])
        assert isinstance(s3, pd.Series)

        # Test a particular value
        assert s3[0] == 1.2894

        # Length of index
        assert len(s3.index.names) == 6

        # Convert again, with attributes
        pd_data = sdmx.to_pandas(data, attributes="osgd")

        # Select one SeriesKey's data out of the DataFrame
        keys, levels = zip(*[(kv.value, kv.id) for kv in s3_key])
        s3 = pd_data.xs(keys, level=levels, drop_level=False)

        # Get the value of the first observation
        assert s3.iloc[0].value == 1.2894

        # Length of index
        assert len(s3.index.names) == 6

        # Number of attributes available
        assert len(set(s3.columns) - {"value"}) == 7

        # Access an attribute of the first value.
        # NB that this uses…
        # 1. the *pandas* attribute access shorthand, NOT DictLike:
        #    "s3.iloc[0]" is a single row of s3, i.e. a pd.Series; and
        #    ".OBS_STATUS" accesses the ps.Series element associated with that
        #    key in the index
        # 2. the AttributeValue.__eq__() comparison operator;
        #    s3.iloc[0].OBS_STATUS is a full AttributeValue, rather than a str.
        assert s3.iloc[0].OBS_STATUS == "A"
        assert s3.iloc[0].OBS_STATUS.value_for == "OBS_STATUS"  # consistency!
示例#17
0
def test_write_categoryscheme(specimen):
    """Category schemes convert with names and parent links intact."""
    with specimen("IPI-2010-A21-structure.xml") as f:
        msg = sdmx.read_sdmx(f)
        data = sdmx.to_pandas(msg)

    scheme = data["category_scheme"]["CLASSEMENT_DATAFLOWS"]

    # Category names are exposed in the 'name' column
    assert scheme.loc["COMPTA-NAT", "name"] == "National accounts (GDP, consumption...)"

    # Child categories record their parent in the 'parent' column
    assert scheme.loc["CNA-PIB-2005", "parent"] == "CNA-PIB"
示例#18
0
def test_write_data(specimen, path):
    """Each data specimen converts; compare to expected output when known."""
    result = sdmx.to_pandas(sdmx.read_sdmx(path))

    expected = specimen.expected_data(path)
    if expected is not None:
        print(expected, result, sep="\n")
    assert_pd_equal(expected, result)

    # TODO incomplete
    assert isinstance(result, (pd.Series, pd.DataFrame, list)), type(result)
示例#19
0
def test_write_codelist(specimen):
    """Code lists convert to pandas with names, items, and hierarchy."""
    # Retrieve codelists from a test specimen and convert to pandas
    with specimen("common-structure.xml") as f:
        dsd_common = sdmx.read_sdmx(f)
    codelists = sdmx.to_pandas(dsd_common)["codelist"]

    # File contains 5 code lists
    assert len(codelists) == 5

    # Code lists have expected number of items
    assert len(codelists["CL_FREQ"]) == 8

    # Items names can be retrieved by ID
    freq = codelists["CL_FREQ"]
    assert freq["A"] == "Annual"

    # Non-hierarchical code list has a string name
    assert freq.name == "Code list for Frequency (FREQ)"

    # Hierarchical code list
    with specimen("codelist_partial.xml") as f:
        msg = sdmx.read_sdmx(f)

    # Convert single codelist
    CL_AREA = sdmx.to_pandas(msg.codelist["CL_AREA"])

    # Hierarchical list has a 'parent' column; parent of Africa is the World
    assert CL_AREA.loc["002", "parent"] == "001"

    # Pandas features can be used to merge parent names
    area_hierarchy = pd.merge(
        CL_AREA,
        CL_AREA,
        how="left",
        left_on="parent",
        right_index=True,
        suffixes=("", "_parent"),
    )
    assert area_hierarchy.loc["002", "name_parent"] == "World"
示例#20
0
    def test_writer(self, msg):
        """Each Codelist in the message converts with the expected length."""
        converted = sdmx.to_pandas(msg.codelist)

        # Number of codes expected in each Codelist
        expected_counts = {
            "CL_FREQ": 6,
            "CL_GEO": 41,
            "CL_OBS_FLAG": 10,
            "CL_OBS_STATUS": 3,
            "CL_PRODMILK": 12,
            "CL_UNIT": 1,
        }

        for cl_id, df in converted.items():
            assert len(df) == expected_counts[cl_id]
示例#21
0
    def test_pandas(self, msg):
        """Single-series conversion, with and without attributes."""
        data = msg.data[0]

        # Expected number of observations and series
        assert len(data.obs) == 12
        assert len(data.series) == 4

        # Single series can be converted to pandas
        s3 = sdmx.to_pandas(data.series[3], attributes="")
        assert isinstance(s3, pd.Series)
        # With expected values
        assert s3[0] == 1.2894

        # Single series can be converted with attributes
        s3_attr = sdmx.to_pandas(data.series[3], attributes="osgd")

        # yields a DataFrame with one column per attribute plus 'value'
        assert isinstance(s3_attr, pd.DataFrame)
        assert s3_attr.shape == (3, 8)

        assert s3_attr.iloc[0].value == 1.2894

        # Attributes of observations can be accessed
        assert s3_attr.iloc[0].OBS_STATUS == "A"
示例#22
0
def test_request_preview_data():
    """Client.preview_data() returns series keys, filterable and convertible.

    NOTE(review): performs live queries against the ECB web service.
    """
    ECB = sdmx.Client("ECB")

    # List of keys can be retrieved
    keys = ECB.preview_data("EXR")
    assert isinstance(keys, list)

    # Count of keys can be determined
    assert len(keys) > 1000

    # A filter can be provided, resulting in fewer keys
    keys = ECB.preview_data("EXR", {"CURRENCY": "CAD+CHF+CNY"})
    assert len(keys) == 24

    # Result can be converted to pandas object
    keys_pd = sdmx.to_pandas(keys)
    assert isinstance(keys_pd, pd.DataFrame)
    assert len(keys_pd) == 24
示例#23
0
def test_write_agencyscheme(specimen):
    """Agency schemes convert; empty message attributes yield no keys."""
    # Convert an agency scheme
    with specimen("ECB/orgscheme.xml") as f:
        msg = sdmx.read_sdmx(f)
        data = sdmx.to_pandas(msg)

    assert data["organisation_scheme"]["AGENCIES"]["ESTAT"] == "Eurostat"

    # to_pandas only returns keys for non-empty attributes of StructureMessage
    # https://github.com/dr-leo/pandaSDMX/issues/90
    assert set(data.keys()) == {"organisation_scheme"}

    # Attribute access works
    assert data.organisation_scheme.AGENCIES.ESTAT == "Eurostat"

    # Absent attributes raise instead of returning empty collections
    with pytest.raises(AttributeError):
        data.codelist
    with pytest.raises(AttributeError):
        data.dataflow
    with pytest.raises(AttributeError):
        data.structure
示例#24
0
def test_doc_usage_data():
    """Code examples in usage.rst.

    NOTE(review): performs live queries against the ECB web service.
    """
    ecb = Client("ECB")

    data_response = ecb.data(
        resource_id="EXR",
        key={"CURRENCY": "USD+JPY"},
        params={
            "startPeriod": "2016",
            "endPeriod": "2016-12-31"
        },
    )
    # # Commented: do the same without triggering requests for validation
    # data_response = ecb.data(resource_id='EXR', key='.JPY+USD...',
    #                          params={'startPeriod': '2016',
    #                                  'endPeriod': '2016-12-31'})
    data = data_response.data[0]

    assert type(data) is GenericDataSet

    # This message doesn't explicitly specify the remaining dimensions; unless
    # they are inferred from the SeriesKeys, then the DimensionDescriptor is
    # not complete
    # assert data.structured_by.dimensions[-1] == 'TIME_PERIOD'
    # data.dim_at_obs

    series_keys = list(data.series)

    assert len(series_keys) == 16

    # Access by position must not raise
    series_keys[5]

    # All five frequencies appear among the series keys
    assert sorted(set(sk.FREQ.value
                      for sk in data.series)) == "A D H M Q".split()

    # Select only the daily-frequency series
    daily = sdmx.to_pandas(data).xs("D", level="FREQ")
    assert len(daily) == 514

    assert_pd_equal(daily.tail().values,
                    np.array([1.0446, 1.0445, 1.0401, 1.0453, 1.0541]))
示例#25
0
def test_doc_index1():
    """First code example in index.rst.

    NOTE(review): performs live queries against the ESTAT web service.
    """
    estat = Client("ESTAT")
    flow_response = estat.dataflow("une_rt_a")

    with pytest.raises(TypeError):
        # This presumes the DataStructureDefinition instance can conduct a
        # network request for its own content
        structure_response = flow_response.dataflow.une_rt_a.structure(
            request=True, target_only=False)

    # Same effect
    structure_response = estat.get(
        "datastructure", flow_response.dataflow.une_rt_a.structure.id)

    # Even better: Client.get(…) should examine the class and ID of the object
    # structure = estat.get(flow_response.dataflow.une_rt_a.structure)

    # Show some codelists
    s = sdmx.to_pandas(structure_response)
    expected = pd.Series(
        {
            "AT": "Austria",
            "BE": "Belgium",
            "BG": "Bulgaria",
            "CH": "Switzerland",
            "CY": "Cyprus",
        },
        name="GEO",
    ).rename_axis("CL_GEO")

    # Codelists are converted to a DictLike
    assert isinstance(s.codelist, DictLike)

    # Same effect
    assert_pd_equal(s.codelist["CL_GEO"].sort_index().head(), expected)
示例#26
0
def test_writer_structure(path):
    """A structure message read from `path` converts without raising."""
    sdmx.to_pandas(sdmx.read_sdmx(path))
示例#27
0
 def test_write2pandas(self, msg):
     """Conversion without attributes yields a pd.Series."""
     data_series = sdmx.to_pandas(msg, attributes="")
     assert isinstance(data_series, pd.Series)
示例#28
0
 def test_dataframe(self, msg):
     """A single DataSet converts to a pd.Series with all 12 observations."""
     data = msg.data[0]
     s = sdmx.to_pandas(data, attributes="")
     assert isinstance(s, pd.Series)
     assert len(s) == 12
示例#29
0
def make_template(output_path: Path = None, verbose: bool = True):
    """Generate a data template.

    Outputs files containing all keys specified for the iTEM ``HISTORICAL`` data
    structure definition. The file is produced in two formats:

    - :file:`*.csv`: comma-separated values
    - :file:`*.xlsx`: Microsoft Excel.

    …and in three variants:

    - :file:`full.*`: with full dimensionality for every concept.
    - :file:`condensed.*`: with a reduced number of dimensions, with labels for some
      dimensions combining labels for others in shorter, conventional, human-readable
      form.
    - :file:`index.*`: an index or map between the two above versions.

    See also
    --------
    .collapse
    """
    # TODO Use SDMX constraints to filter on concepts that are parents of other concepts

    sm = generate()

    ds = merge_dsd(
        sm,
        "HISTORICAL",
        [
            "GDP",
            "POPULATION",
            "PRICE_FUEL",
            "PRICE_POLLUTANT",
            "ACTIVITY_VEHICLE",
            "ACTIVITY",
            "ENERGY",
            "EMISSIONS",
            "ENERGY_INTENSITY",
            "SALES",
            "STOCK",
            "LOAD_FACTOR",
        ],
    )

    # Convert to pd.DataFrame
    df0 = sdmx.to_pandas(ds).reset_index()

    # Save in multiple formats
    output_path = output_path or paths["output"]
    log.info(f"Output to {output_path}/{{index,template}}.{{csv,xlsx}}")

    # "Index" format: only simple replacements, full dimensionality.
    # NB np.nan, not np.NaN: the latter alias was removed in NumPy 2.0.
    df1 = df0.replace({
        "_Z": "",
        np.nan: "",
        "(REF_AREA)": "…",
        "(TIME_PERIOD)": "…"
    })

    df1.to_csv(output_path / "full.csv")
    df1.to_excel(output_path / "full.xlsx")

    # "Template" format: more human-readable

    # Use names instead of IDs for labels in these dimensions
    replacements = name_for_id(
        sm.structure["HISTORICAL"],
        ("AUTOMATION FLEET FUEL MODE OPERATOR POLLUTANT SERVICE TECHNOLOGY VARIABLE "
         "VEHICLE").split(),
    )
    # Rename all columns except "Value" using data structure info
    columns = dict()
    for dim_id in df1.columns:
        try:
            name = (
                sm.structure["HISTORICAL"].dimensions.get(
                    dim_id).concept_identity.name.localized_default(
                    )  # type: ignore [union-attr]
            )
        except (KeyError, AttributeError):
            # Use the dimension ID in title case for VARIABLE and VALUE, which do not
            # have a .concept_identity
            name = dim_id.title()
        # NB assignment moved out of a `finally:` clause: if an *uncaught*
        #    exception occurred, `name` could be unbound there, masking the
        #    original error with a NameError.
        columns[dim_id] = name

    # Apply replacements; use collapse() above to reduce number of columns
    df2 = df1.replace(replacements).apply(collapse,
                                          axis=1).rename(columns=columns)

    df2.to_csv(output_path / "condensed.csv", index=False)
    df2.to_excel(output_path / "condensed.xlsx", index=False)

    # Output the index
    # NOTE(review): labelled "CONDENSED" but built from df1 (full
    # dimensionality), not df2 — confirm this is intentional.
    df3 = pd.concat({"FULL": df0, "CONDENSED": df1}, axis=1)
    df3.to_csv(output_path / "index.csv")
    df3.to_excel(output_path / "index.xlsx")
示例#30
0
def test_write_data_attributes(path):
    """Data messages convert with all attribute groups ('osgd') attached."""
    result = sdmx.to_pandas(sdmx.read_sdmx(path), attributes="osgd")

    # TODO incomplete
    assert isinstance(result, (pd.Series, pd.DataFrame, list)), type(result)