def test_write_constraint(specimen):
    """'constraint' argument to writer.write_dataset."""
    with specimen("ng-ts.xml") as f:
        msg = sdmx.read_sdmx(f)

    # Fetch the message's DSD
    assert msg.structure.is_external_reference
    # NB the specimen included in tests/data has 'ECB_EXR_NG' as the data
    #    structure ID; but a query against the web service gives 'ECB_EXR1'
    #    for the same data structure.
    dsd_id = "ECB_EXR1"
    client = sdmx.Client(msg.structure.maintainer.id)
    dsd = client.get("datastructure", dsd_id).structure[dsd_id]

    # Create a ContentConstraint
    cc = dsd.make_constraint({"CURRENCY": "JPY+USD"})

    # Without the constraint, all four currencies appear
    unconstrained = sdmx.to_pandas(msg)
    assert len(unconstrained) == 12
    assert set(unconstrained.index.to_frame()["CURRENCY"]) == {
        "CHF",
        "GBP",
        "JPY",
        "USD",
    }

    # With the constraint, fewer items: only those matching the constraint
    constrained = sdmx.to_pandas(msg, constraint=cc)
    assert len(constrained) == 6
    assert set(constrained.index.to_frame()["CURRENCY"]) == {"JPY", "USD"}
def test_doc_example():
    """Code from example.rst."""
    import sdmx

    estat = sdmx.Client("ESTAT")
    metadata = estat.datastructure("DSD_une_rt_a")

    for cl_id in ("CL_AGE", "CL_UNIT"):
        print(sdmx.to_pandas(metadata.codelist[cl_id]))

    resp = estat.data(
        "une_rt_a", key={"GEO": "EL+ES+IE"}, params={"startPeriod": "2007"}
    )

    data = sdmx.to_pandas(resp).xs("Y15-74", level="AGE", drop_level=False)

    data.loc[("A", "Y15-74", "PC_ACT", "T")]

    # Further checks per https://github.com/dr-leo/pandaSDMX/issues/157

    # DimensionDescriptor for the structure message
    dd1 = metadata.structure.DSD_une_rt_a.dimensions

    # DimensionDescriptor retrieved whilst validating the data message
    dd2 = resp.data[0].structured_by.dimensions

    # DimensionDescriptors have same ID, components and order
    assert dd1 == dd2

    # One SeriesKey from the data message
    series_key = next(iter(resp.data[0].series.keys()))

    # Key values have same order as in the DSD
    assert dd1.order_key(series_key) == series_key
def test_freq_in_series_attribute(self, req):
    # Regression test for issues #39 and #41: INSEE time series provide the
    # FREQ value as an attribute on the series instead of a dimension, which
    # once caused a runtime error when writing as a pandas DataFrame.
    resp = sdmx.read_sdmx(SERIES["UNEMPLOYMENT_CAT_A_B_C"]["data-fp"])
    sdmx.to_pandas(resp)
def test_write2pandas(self, msg):
    # Without attributes: a bare pd.Series of 12 observations
    series = sdmx.to_pandas(msg, attributes="")
    assert isinstance(series, pd.Series)
    assert series.shape == (12,)

    # With metadata: a DataFrame of 12 rows and 8 columns
    frame = sdmx.to_pandas(msg, attributes="osgd")
    assert frame.shape == (12, 8)
    assert frame.iloc[1].OBS_STATUS == "A"
def test_to_pandas(self, msg):
    # A single data set converts to a pd.Series
    s_single = sdmx.to_pandas(msg.data[0])
    assert isinstance(s_single, pd.Series)

    # When len(msg.data) is 1, the data series in a single Dataset are
    # unwrapped automatically
    assert len(msg.data) == 1
    s_list = sdmx.to_pandas(msg.data)  # NB no '[0]' index
    pdt.assert_series_equal(s_single, s_list)
def test_write_data_arguments():
    msg = sdmx.read_sdmx(test_files(kind="data")["argvalues"][0])

    # 'attributes' must be a string…
    with raises(TypeError):
        sdmx.to_pandas(msg, attributes=2)

    # …containing only the characters 'dgso'
    with raises(ValueError):
        sdmx.to_pandas(msg, attributes="foobarbaz")
def test_write_data_arguments(specimen):
    # The identity here is not important; any non-empty DataMessage will work
    with specimen("INSEE/CNA-2010-CONSO-SI-A17.xml") as f:
        msg = sdmx.read_sdmx(f)

    # 'attributes' must be a string…
    with raises(TypeError):
        sdmx.to_pandas(msg, attributes=2)

    # …containing only the characters 'dgso'
    with raises(ValueError):
        sdmx.to_pandas(msg, attributes="foobarbaz")
def test_endpoint(self, cache_path, client, endpoint, args):
    # See sdmx.testing._generate_endpoint_tests() for values of `endpoint`
    tofile = cache_path.with_suffix(f".{endpoint}.xml")
    message = client.get(endpoint, tofile=tofile, **args)

    # For debugging
    # print(tofile, tofile.read_text(), message, sep='\n\n')
    # assert False

    # The response converts to pandas without error
    sdmx.to_pandas(message)

    del message
def test_endpoints(self, req, endpoint, args):
    # See pytest_generate_tests() for values of 'endpoint'
    tofile = self._cache_path.with_suffix(f".{endpoint}.xml")
    message = req.get(endpoint, tofile=tofile, **args)

    # For debugging
    # print(tofile, tofile.read_text(), message, sep='\n\n')
    # assert False

    # The response converts to pandas without error
    sdmx.to_pandas(message)

    del message
def test_exr_constraints():
    # NOTE(review): `specimen` is used but is not a parameter here;
    # presumably a module-level context manager — confirm in the enclosing
    # file.
    with specimen("1/structure-full.xml") as f:
        m = sdmx.read_sdmx(f)
    ECB_EXR1 = m.structure["ECB_EXR1"]

    # Test DimensionDescriptor
    dd = ECB_EXR1.dimensions

    # Correct order
    assert dd[0].id == "FREQ"

    # Correct number of dimensions
    assert len(dd.components) == 6

    # Dimensions can be retrieved by name; membership can be tested
    assert "W" in dd.get("FREQ")

    # Similar tests for AttributeDescriptor
    ad = ECB_EXR1.attributes
    assert len(ad.components) == 24
    assert ad[-1].id == "UNIT_MULT"
    assert "5" in ad.get("UNIT_MULT")

    # pytest.xfail() raises immediately, so nothing below executes until
    # constrained codes are implemented.
    pytest.xfail("constrained codes not implemented")

    # Fixed: the original read "assert len(m._constrained_codes), 14" — the
    # comma makes 14 an assertion *message*, so the intended "== 14"
    # comparison was never performed.
    assert len(m._constrained_codes) == 14

    assert "W" not in m._constrained_codes.FREQ

    key = {"FREQ": ["W"]}

    assert m.in_codes(key)

    assert not m.in_constraints(key, raise_error=False)

    with pytest.raises(ValueError):
        m.in_constraints(key)

    assert m.in_constraints({"CURRENCY": ["CHF"]})

    # test with invalid key
    with pytest.raises(TypeError):
        m._in_constraints({"FREQ": "A"})

    # structure writer with constraints
    out = sdmx.to_pandas(m)
    cl = out.codelist
    assert cl.shape == (3555, 2)

    # unconstrained codelists
    # Fixed: "assert cl.shape, (4177, 2)" had the same comma bug as above.
    out = sdmx.to_pandas(m, constraint=False)
    cl = out.codelist
    assert cl.shape == (4177, 2)
def test_flat():
    # Create a bare Message
    msg = DataMessage()

    # Recreate the content from exr-flat.json
    msg.header = Header(
        id="62b5f19d-f1c9-495d-8446-a3661ed24753",
        prepared="2012-11-29T08:40:26Z",
        sender=model.Agency(id="ECB"),
    )

    ds = DataSet()

    # Create a Key and attributes
    key = Key(
        FREQ="D",
        CURRENCY="NZD",
        CURRENCY_DENOM="EUR",
        EXR_TYPE="SP00",
        EXR_SUFFIX="A",
        TIME_PERIOD="2013-01-18",
    )
    obs_status = DataAttribute(id="OBS_STATUS")
    attr = {"OBS_STATUS": AttributeValue(value_for=obs_status, value="A")}

    # Build four observations; each key update is cumulative on the previous
    # key, and every observation shares the same `attr` mapping
    for updates, value in (
        ({}, 1.5931),
        ({"TIME_PERIOD": "2013-01-21"}, 1.5925),
        ({"CURRENCY": "RUB", "TIME_PERIOD": "2013-01-18"}, 40.3426),
        ({"TIME_PERIOD": "2013-01-21"}, 40.3000),
    ):
        if updates:
            key = key.copy(**updates)
        ds.obs.append(
            Observation(dimension=key, value=value, attached_attribute=attr)
        )

    msg.data.append(ds)

    # Write to pd.Dataframe
    df1 = sdmx.to_pandas(msg)

    with specimen("flat.json") as f:
        df2 = sdmx.to_pandas(sdmx.read_sdmx(f))

    assert_pd_equal(df1, df2)
def test_write_conceptscheme(specimen):
    with specimen("common-structure.xml") as f:
        result = sdmx.to_pandas(sdmx.read_sdmx(f))

    concepts = result["concept_scheme"]["CROSS_DOMAIN_CONCEPTS"]
    assert concepts.loc["UNIT_MEASURE", "name"] == "Unit of Measure"
def test_doc_howto_timeseries(specimen):
    with specimen("sg-ts.xml") as f:
        ds = sdmx.read_sdmx(f).data[0]

    # Convert to pd.Series and unstack the time dimension to columns
    base = sdmx.to_pandas(ds)
    wide = base.unstack("TIME_PERIOD")

    # DatetimeIndex on columns
    wide.columns = pd.to_datetime(wide.columns)
    assert isinstance(wide.columns, pd.DatetimeIndex)

    # DatetimeIndex on index
    tall = base.unstack("TIME_PERIOD").transpose()
    tall.index = pd.to_datetime(tall.index)
    assert isinstance(tall.index, pd.DatetimeIndex)

    # Same with pd.PeriodIndex
    wide_p = wide.to_period(axis=1)
    assert isinstance(wide_p.columns, pd.PeriodIndex)
    assert wide_p.columns.freqstr == "M"

    tall_p = tall.to_period(axis=0)
    assert isinstance(tall_p.index, pd.PeriodIndex)
    assert tall_p.index.freqstr == "M"
def test_write_dataflow():
    # Read the INSEE dataflow definition
    with specimen("INSEE/dataflow") as f:
        msg = sdmx.read_sdmx(f)

    # Convert to pandas
    result = sdmx.to_pandas(msg, include="dataflow")

    # Number of Dataflows described in the file
    assert len(result["dataflow"]) == 663

    # ID and names of first Dataflows
    mbop = "Monthly Balance of Payments - "
    expected = pd.Series(
        {
            "ACT-TRIM-ANC": "Activity by sex and age - Quarterly series",
            "BPM6-CCAPITAL": mbop + "Capital account",
            "BPM6-CFINANCIER": mbop + "Financial account",
            "BPM6-CTRANSACTION": mbop + "Current transactions account",
            "BPM6-TOTAL": mbop + "Overall total and main headings",
        }
    )
    assert_pd_equal(result["dataflow"].head(), expected)
def get_sdmx(source=None, **args):
    """Retrieve data from *source* using :mod:`sdmx`.

    Arguments
    ---------
    source : str
        Name of a data source recognized by ``sdmx1``, e.g. 'OECD'.
    args
        Other arguments to :meth:`sdmx.Request.get`.

    Returns
    -------
    pandas.DataFrame
    """
    # SDMX client for the data source
    client = sdmx.Request(source=source)

    # commented: for debugging
    # args.setdefault('tofile', 'debug.json')

    # Retrieve the data
    msg = client.get(resource_type="data", **args)

    # Convert to pd.DataFrame, preserving attributes
    df = sdmx.to_pandas(msg, attributes="dgso")

    # Reset the index, converting the former index levels to categoricals
    as_category = {level: "category" for level in df.index.names}
    return df.reset_index().astype(as_category)
def test_pandas(self, msg):
    data = msg.data[0]
    series_keys = list(data.series.keys())

    # Number of series in dataframe
    assert len(series_keys) == 4

    # Convert the observations for one SeriesKey to a pd.Series
    target_key = series_keys[3]
    one_series = sdmx.to_pandas(data.series[target_key])
    assert isinstance(one_series, pd.Series)

    # Test a particular value
    assert one_series[0] == 1.2894

    # Length of index
    assert len(one_series.index.names) == 6

    # Convert again, with attributes
    pd_data = sdmx.to_pandas(data, attributes="osgd")

    # Select one SeriesKey's data out of the DataFrame
    levels = tuple(kv.id for kv in target_key)
    keys = tuple(kv.value for kv in target_key)
    selected = pd_data.xs(keys, level=levels, drop_level=False)

    # Get the value of the first observation
    assert selected.iloc[0].value == 1.2894

    # Length of index
    assert len(selected.index.names) == 6

    # Number of attributes available
    assert len(set(selected.columns) - {"value"}) == 7

    # Access an attribute of the first value.
    # NB that this uses…
    # 1. the *pandas* attribute access shorthand, NOT DictLike:
    #    "selected.iloc[0]" is a single row of the frame, i.e. a pd.Series;
    #    and ".OBS_STATUS" accesses the pd.Series element associated with
    #    that key in the index
    # 2. the AttributeValue.__eq__() comparison operator;
    #    selected.iloc[0].OBS_STATUS is a full AttributeValue, rather than a
    #    str.
    assert selected.iloc[0].OBS_STATUS == "A"
    assert selected.iloc[0].OBS_STATUS.value_for == "OBS_STATUS"  # consistency!
def test_write_categoryscheme(specimen):
    with specimen("IPI-2010-A21-structure.xml") as f:
        result = sdmx.to_pandas(sdmx.read_sdmx(f))

    scheme = result["category_scheme"]["CLASSEMENT_DATAFLOWS"]

    assert scheme.loc["COMPTA-NAT", "name"] == "National accounts (GDP, consumption...)"

    # Children appear
    assert scheme.loc["CNA-PIB-2005", "parent"] == "CNA-PIB"
def test_write_data(specimen, path):
    result = sdmx.to_pandas(sdmx.read_sdmx(path))

    expected = specimen.expected_data(path)
    if expected is not None:
        print(expected, result, sep="\n")
        assert_pd_equal(expected, result)

    # TODO incomplete
    assert isinstance(result, (pd.Series, pd.DataFrame, list)), type(result)
def test_write_codelist(specimen):
    # Retrieve codelists from a test specimen and convert to pandas
    with specimen("common-structure.xml") as f:
        structure_msg = sdmx.read_sdmx(f)
    cls = sdmx.to_pandas(structure_msg)["codelist"]

    # File contains 5 code lists
    assert len(cls) == 5

    # Code lists have expected number of items
    assert len(cls["CL_FREQ"]) == 8

    # Items names can be retrieved by ID
    freq = cls["CL_FREQ"]
    assert freq["A"] == "Annual"

    # Non-hierarchical code list has a string name
    assert freq.name == "Code list for Frequency (FREQ)"

    # Hierarchical code list
    with specimen("codelist_partial.xml") as f:
        msg = sdmx.read_sdmx(f)

    # Convert single codelist
    area = sdmx.to_pandas(msg.codelist["CL_AREA"])

    # Hierarchical list has a 'parent' column; parent of Africa is the World
    assert area.loc["002", "parent"] == "001"

    # Pandas features can be used to merge parent names
    hierarchy = pd.merge(
        area,
        area,
        how="left",
        left_on="parent",
        right_index=True,
        suffixes=("", "_parent"),
    )
    assert hierarchy.loc["002", "name_parent"] == "World"
def test_writer(self, msg):
    frames = sdmx.to_pandas(msg.codelist)

    # Number of codes expected in each Codelist
    expected_counts = {
        "CL_FREQ": 6,
        "CL_GEO": 41,
        "CL_OBS_FLAG": 10,
        "CL_OBS_STATUS": 3,
        "CL_PRODMILK": 12,
        "CL_UNIT": 1,
    }
    for cl_id, df in frames.items():
        assert len(df) == expected_counts[cl_id]
def test_pandas(self, msg):
    data = msg.data[0]

    # Expected number of observations and series
    assert len(data.obs) == 12
    assert len(data.series) == 4

    # Single series can be converted to pandas
    plain = sdmx.to_pandas(data.series[3], attributes="")
    assert isinstance(plain, pd.Series)
    # With expected values
    assert plain[0] == 1.2894

    # Single series can be converted with attributes…
    with_attrs = sdmx.to_pandas(data.series[3], attributes="osgd")
    # …yielding a DataFrame
    assert isinstance(with_attrs, pd.DataFrame)
    assert with_attrs.shape == (3, 8)
    assert with_attrs.iloc[0].value == 1.2894

    # Attributes of observations can be accessed
    assert with_attrs.iloc[0].OBS_STATUS == "A"
def test_request_preview_data():
    client = sdmx.Client("ECB")

    # List of keys can be retrieved
    keys = client.preview_data("EXR")
    assert isinstance(keys, list)

    # Count of keys can be determined
    assert len(keys) > 1000

    # A filter can be provided, resulting in fewer keys
    keys = client.preview_data("EXR", {"CURRENCY": "CAD+CHF+CNY"})
    assert len(keys) == 24

    # Result can be converted to pandas object
    keys_df = sdmx.to_pandas(keys)
    assert isinstance(keys_df, pd.DataFrame)
    assert len(keys_df) == 24
def test_write_agencyscheme(specimen):
    # Convert an agency scheme
    with specimen("ECB/orgscheme.xml") as f:
        data = sdmx.to_pandas(sdmx.read_sdmx(f))

    assert data["organisation_scheme"]["AGENCIES"]["ESTAT"] == "Eurostat"

    # to_pandas only returns keys for non-empty attributes of StructureMessage
    # https://github.com/dr-leo/pandaSDMX/issues/90
    assert set(data.keys()) == {"organisation_scheme"}

    # Attribute access works
    assert data.organisation_scheme.AGENCIES.ESTAT == "Eurostat"

    # Empty attributes are not available
    for name in ("codelist", "dataflow", "structure"):
        with pytest.raises(AttributeError):
            getattr(data, name)
def test_doc_usage_data():
    ecb = Client("ECB")

    resp = ecb.data(
        resource_id="EXR",
        key={"CURRENCY": "USD+JPY"},
        params={"startPeriod": "2016", "endPeriod": "2016-12-31"},
    )
    # # Commented: do the same without triggering requests for validation
    # resp = ecb.data(resource_id='EXR', key='.JPY+USD...',
    #                 params={'startPeriod': '2016',
    #                         'endPeriod': '2016-12-31'})

    data = resp.data[0]
    assert type(data) is GenericDataSet

    # This message doesn't explicitly specify the remaining dimensions; unless
    # they are inferred from the SeriesKeys, then the DimensionDescriptor is
    # not complete
    # assert data.structured_by.dimensions[-1] == 'TIME_PERIOD'
    # data.dim_at_obs

    series_keys = list(data.series)
    assert len(series_keys) == 16
    series_keys[5]

    assert sorted(set(sk.FREQ.value for sk in data.series)) == "A D H M Q".split()

    daily = sdmx.to_pandas(data).xs("D", level="FREQ")
    assert len(daily) == 514

    assert_pd_equal(
        daily.tail().values, np.array([1.0446, 1.0445, 1.0401, 1.0453, 1.0541])
    )
def test_doc_index1():
    """First code example in index.rst."""
    estat = Client("ESTAT")
    flow_response = estat.dataflow("une_rt_a")

    with pytest.raises(TypeError):
        # This presumes the DataStructureDefinition instance can conduct a
        # network request for its own content
        structure_response = flow_response.dataflow.une_rt_a.structure(
            request=True, target_only=False
        )

    # Same effect
    structure_response = estat.get(
        "datastructure", flow_response.dataflow.une_rt_a.structure.id
    )

    # Even better: Client.get(…) should examine the class and ID of the object
    # structure = estat.get(flow_response.dataflow.une_rt_a.structure)

    # Show some codelists
    result = sdmx.to_pandas(structure_response)
    expected = pd.Series(
        {
            "AT": "Austria",
            "BE": "Belgium",
            "BG": "Bulgaria",
            "CH": "Switzerland",
            "CY": "Cyprus",
        },
        name="GEO",
    ).rename_axis("CL_GEO")

    # Codelists are converted to a DictLike
    assert isinstance(result.codelist, DictLike)

    # Same effect
    assert_pd_equal(result.codelist["CL_GEO"].sort_index().head(), expected)
def test_writer_structure(path):
    # The structure message parses and converts without error; the result
    # itself is not inspected further
    sdmx.to_pandas(sdmx.read_sdmx(path))
def test_write2pandas(self, msg):
    # With attributes omitted, the message converts to a bare pd.Series
    result = sdmx.to_pandas(msg, attributes="")
    assert isinstance(result, pd.Series)
def test_dataframe(self, msg):
    dataset = msg.data[0]
    result = sdmx.to_pandas(dataset, attributes="")

    assert isinstance(result, pd.Series)
    assert len(result) == 12
def make_template(output_path: Path = None, verbose: bool = True):
    """Generate a data template.

    Outputs files containing all keys specified for the iTEM ``HISTORICAL``
    data structure definition. The file is produced in two formats:

    - :file:`*.csv`: comma-separated values
    - :file:`*.xlsx`: Microsoft Excel.

    …and in three variants:

    - :file:`full.*`: with full dimensionality for every concept.
    - :file:`condensed.*`: with a reduced number of dimensions, with labels
      for some dimensions combining labels for others in shorter,
      conventional, human-readable form.
    - :file:`index.*`: an index or map between the two above versions.

    See also
    --------
    .collapse
    """
    # NOTE(review): `verbose` is accepted but never used in this body —
    # confirm whether it should control the log output below.
    # TODO Use SDMX constraints to filter on concepts that are parents of
    #      other concepts
    sm = generate()

    ds = merge_dsd(
        sm,
        "HISTORICAL",
        [
            "GDP",
            "POPULATION",
            "PRICE_FUEL",
            "PRICE_POLLUTANT",
            "ACTIVITY_VEHICLE",
            "ACTIVITY",
            "ENERGY",
            "EMISSIONS",
            "ENERGY_INTENSITY",
            "SALES",
            "STOCK",
            "LOAD_FACTOR",
        ],
    )

    # Convert to pd.DataFrame
    df0 = sdmx.to_pandas(ds).reset_index()

    # Save in multiple formats
    output_path = output_path or paths["output"]
    log.info(f"Output to {output_path}/{{index,template}}.{{csv,xlsx}}")

    # "Index" format: only simple replacements, full dimensionality.
    # Fixed: np.nan instead of np.NaN — the upper-case alias was removed in
    # NumPy 2.0; np.nan works on all NumPy versions.
    df1 = df0.replace({"_Z": "", np.nan: "", "(REF_AREA)": "…", "(TIME_PERIOD)": "…"})
    df1.to_csv(output_path / "full.csv")
    df1.to_excel(output_path / "full.xlsx")

    # "Template" format: more human-readable

    # Use names instead of IDs for labels in these dimensions
    replacements = name_for_id(
        sm.structure["HISTORICAL"],
        (
            "AUTOMATION FLEET FUEL MODE OPERATOR POLLUTANT SERVICE TECHNOLOGY "
            "VARIABLE VEHICLE"
        ).split(),
    )

    # Rename all columns except "Value" using data structure info
    columns = dict()
    for dim_id in df1.columns:
        try:
            name = (
                sm.structure["HISTORICAL"]
                .dimensions.get(dim_id)
                .concept_identity.name.localized_default()  # type: ignore [union-attr]
            )
        except (KeyError, AttributeError):
            # VARIABLE and VALUE do not have a .concept_identity; fall back to
            # the dimension ID in title case.
            # Fixed: the original used try/except/finally; on an unexpected
            # exception the `finally` block could read `name` while it was
            # unbound (NameError) or stale from a previous iteration. Plain
            # assignment after the try/except is equivalent for the handled
            # exceptions and safe for unhandled ones.
            name = dim_id.title()
        columns[dim_id] = name

    # Apply replacements; use collapse() above to reduce number of columns
    df2 = df1.replace(replacements).apply(collapse, axis=1).rename(columns=columns)
    df2.to_csv(output_path / "condensed.csv", index=False)
    df2.to_excel(output_path / "condensed.xlsx", index=False)

    # Output the index
    # NOTE(review): the "CONDENSED" member here is df1 (the full variant with
    # simple replacements), not df2 (the actual condensed frame) — confirm
    # whether df2 was intended; behavior preserved as-is.
    df3 = pd.concat({"FULL": df0, "CONDENSED": df1}, axis=1)
    df3.to_csv(output_path / "index.csv")
    df3.to_excel(output_path / "index.xlsx")
def test_write_data_attributes(path):
    result = sdmx.to_pandas(sdmx.read_sdmx(path), attributes="osgd")

    # TODO incomplete
    assert isinstance(result, (pd.Series, pd.DataFrame, list)), type(result)