def test_write_constraint():
    """'constraint' argument to writer.write_dataset."""
    with specimen("ng-ts.xml") as f:
        msg = pandasdmx.read_sdmx(f)

    # Fetch the message's DSD
    assert msg.structure.is_external_reference
    # NB the specimen included in tests/data has 'ECB_EXR_NG' as the
    #    data structure ID; but a query against the web service gives
    #    'ECB_EXR1' for the same data structure.
    id = "ECB_EXR1"
    dsd = (
        pandasdmx.Request(msg.structure.maintainer.id)
        .get("datastructure", id)
        .structure[id]
    )

    # Create a ContentConstraint
    cc = dsd.make_constraint({"CURRENCY": "JPY+USD"})

    # Write the message without constraint
    s1 = pandasdmx.to_pandas(msg)
    assert len(s1) == 12
    assert set(s1.index.to_frame()["CURRENCY"]) == {"CHF", "GBP", "JPY", "USD"}

    # Writing with the constraint produces fewer items: only those matching
    # the constraint
    s2 = pandasdmx.to_pandas(msg, constraint=cc)
    assert len(s2) == 6
    assert set(s2.index.to_frame()["CURRENCY"]) == {"JPY", "USD"}
def test_freq_in_series_attribute(self, req):
    # Test that we don't have regression on Issues #39 and #41
    # INSEE time series provide the FREQ value as attribute on the series
    # instead of a dimension. This caused a runtime error when writing as
    # pandas dataframe.
    data_response = pandasdmx.read_sdmx(
        SERIES["UNEMPLOYMENT_CAT_A_B_C"]["data-fp"])
    pandasdmx.to_pandas(data_response)
def test_write_codelist():
    # Retrieve codelists from a test specimen and convert to pandas
    with specimen('common-structure.xml') as f:
        dsd_common = sdmx.read_sdmx(f)
    codelists = sdmx.to_pandas(dsd_common)['codelist']

    # File contains 5 code lists
    assert len(codelists) == 5

    # Code lists have the expected number of items
    assert len(codelists['CL_FREQ']) == 8

    # Item names can be retrieved by ID
    freq = codelists['CL_FREQ']
    assert freq['A'] == 'Annual'

    # Non-hierarchical code list has a string name
    assert freq.name == 'Code list for Frequency (FREQ)'

    # Hierarchical code list
    with specimen('codelist_partial.xml') as f:
        msg = sdmx.read_sdmx(f)

    # Convert a single codelist
    CL_AREA = sdmx.to_pandas(msg.codelist['CL_AREA'])

    # Hierarchical list has a 'parent' column; parent of Africa is the World
    assert CL_AREA.loc['002', 'parent'] == '001'

    # Pandas features can be used to merge in parent names
    area_hierarchy = pd.merge(
        CL_AREA, CL_AREA,
        how='left', left_on='parent', right_index=True,
        suffixes=('', '_parent'),
    )
    assert area_hierarchy.loc['002', 'name_parent'] == 'World'
def test_write_constraint():
    """'constraint' argument to writer.write_dataset."""
    with specimen('ng-ts.xml') as f:
        msg = sdmx.read_sdmx(f)

    # Fetch the message's DSD
    assert msg.structure.is_external_reference
    # NB the specimen included in tests/data has 'ECB_EXR_NG' as the
    #    data structure ID; but a query against the web service gives
    #    'ECB_EXR1' for the same data structure.
    id = 'ECB_EXR1'
    dsd = sdmx.Request(msg.structure.maintainer.id) \
              .get('datastructure', id) \
              .structure[id]

    # Create a ContentConstraint
    cc = dsd.make_constraint({'CURRENCY': 'JPY+USD'})

    # Write the message without constraint
    s1 = sdmx.to_pandas(msg)
    assert len(s1) == 12
    assert set(s1.index.to_frame()['CURRENCY']) == {'CHF', 'GBP', 'JPY', 'USD'}

    # Writing with the constraint produces fewer items: only those matching
    # the constraint
    s2 = sdmx.to_pandas(msg, constraint=cc)
    assert len(s2) == 6
    assert set(s2.index.to_frame()['CURRENCY']) == {'JPY', 'USD'}
def test_write2pandas(self, msg):
    df = pandasdmx.to_pandas(msg, attributes="")

    assert isinstance(df, pd.Series)
    assert df.shape == (12,)

    # with metadata
    df = pandasdmx.to_pandas(msg, attributes="osgd")
    assert df.shape == (12, 8)
    assert df.iloc[1].OBS_STATUS == "A"
def test_write_data_arguments():
    msg = sdmx.read_sdmx(test_files(kind='data')['argvalues'][0])

    # Attributes must be a string
    with raises(TypeError):
        sdmx.to_pandas(msg, attributes=2)

    # Attributes must contain only 'dgso'
    with raises(ValueError):
        sdmx.to_pandas(msg, attributes='foobarbaz')
def test_to_pandas(self, msg):
    # Single data series is converted to pd.Series
    data_series = sdmx.to_pandas(msg.data[0])
    assert isinstance(data_series, pd.Series)

    # When len(msg.data) is 1, the data series in a single Dataset are
    # unwrapped automatically
    assert len(msg.data) == 1
    data_series2 = sdmx.to_pandas(msg.data)  # NB no '[0]' index

    pdt.assert_series_equal(data_series, data_series2)
def test_write_data_arguments():
    msg = pandasdmx.read_sdmx(test_files(kind="data")["argvalues"][0])

    # Attributes must be a string
    with raises(TypeError):
        pandasdmx.to_pandas(msg, attributes=2)

    # Attributes must contain only 'dgso'
    with raises(ValueError):
        pandasdmx.to_pandas(msg, attributes="foobarbaz")
def test_write2pandas(self, msg):
    df = sdmx.to_pandas(msg, attributes='')

    assert isinstance(df, pd.Series)
    assert df.shape == (12,)

    # with metadata
    df = sdmx.to_pandas(msg, attributes='osgd')
    df, mdf = df.iloc[:, 0], df.iloc[:, 1:]
    assert mdf.shape == (12, 7)
    assert mdf.iloc[1].OBS_STATUS == 'A'
def test_endpoints(self, req, endpoint, args):
    # See pytest_generate_tests() for values of 'endpoint'
    cache = self._cache_path.with_suffix(f".{endpoint}.xml")
    result = req.get(endpoint, tofile=cache, **args)

    # For debugging
    # print(cache, cache.read_text(), result, sep='\n\n')
    # assert False

    pandasdmx.to_pandas(result)

    del result
def test_exr_constraints():
    with specimen("1/structure-full.xml") as f:
        m = pandasdmx.read_sdmx(f)
    ECB_EXR1 = m.structure["ECB_EXR1"]

    # Test DimensionDescriptor
    dd = ECB_EXR1.dimensions

    # Correct order
    assert dd[0].id == "FREQ"

    # Correct number of dimensions
    assert len(dd.components) == 6

    # Dimensions can be retrieved by name; membership can be tested
    assert "W" in dd.get("FREQ")

    # Similar tests for AttributeDescriptor
    ad = ECB_EXR1.attributes
    assert len(ad.components) == 24
    assert ad[-1].id == "UNIT_MULT"
    assert "5" in ad.get("UNIT_MULT")

    pytest.xfail("constrained codes not implemented")

    assert len(m._constrained_codes) == 14
    assert "W" not in m._constrained_codes.FREQ

    key = {"FREQ": ["W"]}
    assert m.in_codes(key)
    assert not m.in_constraints(key, raise_error=False)

    with pytest.raises(ValueError):
        m.in_constraints(key)

    assert m.in_constraints({"CURRENCY": ["CHF"]})

    # test with invalid key
    with pytest.raises(TypeError):
        m._in_constraints({"FREQ": "A"})

    # structure writer with constraints
    out = pandasdmx.to_pandas(m)
    cl = out.codelist
    assert cl.shape == (3555, 2)

    # unconstrained codelists
    out = pandasdmx.to_pandas(m, constraint=False)
    cl = out.codelist
    assert cl.shape == (4177, 2)
def test_exr_constraints():
    with specimen('1/structure-full.xml') as f:
        m = sdmx.read_sdmx(f)
    ECB_EXR1 = m.structure['ECB_EXR1']

    # Test DimensionDescriptor
    dd = ECB_EXR1.dimensions

    # Correct order
    assert dd[0].id == 'FREQ'

    # Correct number of dimensions
    assert len(dd.components) == 6

    # Dimensions can be retrieved by name; membership can be tested
    assert 'W' in dd.get('FREQ')

    # Similar tests for AttributeDescriptor
    ad = ECB_EXR1.attributes
    assert len(ad.components) == 24
    assert ad[-1].id == 'UNIT_MULT'
    assert '5' in ad.get('UNIT_MULT')

    pytest.xfail('constrained codes not implemented')

    # TODO
    assert len(m._constrained_codes) == 14
    assert 'W' not in m._constrained_codes.FREQ

    key = {'FREQ': ['W']}
    assert m.in_codes(key)
    assert not m.in_constraints(key, raise_error=False)

    with pytest.raises(ValueError):
        m.in_constraints(key)

    assert m.in_constraints({'CURRENCY': ['CHF']})

    # test with invalid key
    with pytest.raises(TypeError):
        m._in_constraints({'FREQ': 'A'})

    # structure writer with constraints
    out = sdmx.to_pandas(m)
    cl = out.codelist
    assert cl.shape == (3555, 2)

    # unconstrained codelists
    out = sdmx.to_pandas(m, constraint=False)
    cl = out.codelist
    assert cl.shape == (4177, 2)
def test_flat():
    # Create a bare Message
    msg = DataMessage()

    # Recreate the content from exr-flat.json
    header = Header(
        id="62b5f19d-f1c9-495d-8446-a3661ed24753",
        prepared="2012-11-29T08:40:26Z",
        sender=model.Agency(id="ECB"),
    )
    msg.header = header

    ds = DataSet()

    # Create a Key and attributes
    key = Key(
        FREQ="D",
        CURRENCY="NZD",
        CURRENCY_DENOM="EUR",
        EXR_TYPE="SP00",
        EXR_SUFFIX="A",
        TIME_PERIOD="2013-01-18",
    )
    obs_status = DataAttribute(id="OBS_STATUS")
    attr = {"OBS_STATUS": AttributeValue(value_for=obs_status, value="A")}

    ds.obs.append(
        Observation(dimension=key, value=1.5931, attached_attribute=attr))

    key = key.copy(TIME_PERIOD="2013-01-21")
    ds.obs.append(
        Observation(dimension=key, value=1.5925, attached_attribute=attr))

    key = key.copy(CURRENCY="RUB", TIME_PERIOD="2013-01-18")
    ds.obs.append(
        Observation(dimension=key, value=40.3426, attached_attribute=attr))

    key = key.copy(TIME_PERIOD="2013-01-21")
    ds.obs.append(
        Observation(dimension=key, value=40.3000, attached_attribute=attr))

    msg.data.append(ds)

    # Write to pd.DataFrame
    df1 = pandasdmx.to_pandas(msg)

    with specimen("flat.json") as f:
        ref = pandasdmx.read_sdmx(f)
    df2 = pandasdmx.to_pandas(ref)

    assert_pd_equal(df1, df2)
def test_flat():
    # Create a bare Message
    msg = DataMessage()

    # Recreate the content from exr-flat.json
    header = Header(
        id='62b5f19d-f1c9-495d-8446-a3661ed24753',
        prepared='2012-11-29T08:40:26Z',
        sender='ECB',
    )
    msg.header = header

    ds = DataSet()

    # Create a Key and attributes
    key = Key(FREQ='D', CURRENCY='NZD', CURRENCY_DENOM='EUR',
              EXR_TYPE='SP00', EXR_SUFFIX='A', TIME_PERIOD='2013-01-18')
    obs_status = DataAttribute(id='OBS_STATUS')
    attr = {'OBS_STATUS': AttributeValue(value_for=obs_status, value='A')}

    ds.obs.append(
        Observation(dimension=key, value=1.5931, attached_attribute=attr))

    key = key.copy(TIME_PERIOD='2013-01-21')
    ds.obs.append(
        Observation(dimension=key, value=1.5925, attached_attribute=attr))

    key = key.copy(CURRENCY='RUB', TIME_PERIOD='2013-01-18')
    ds.obs.append(
        Observation(dimension=key, value=40.3426, attached_attribute=attr))

    key = key.copy(TIME_PERIOD='2013-01-21')
    ds.obs.append(
        Observation(dimension=key, value=40.3000, attached_attribute=attr))

    msg.data.append(ds)

    # Write to pd.DataFrame
    df1 = sdmx.to_pandas(msg)

    with specimen('flat.json') as f:
        ref = sdmx.read_sdmx(f)
    df2 = sdmx.to_pandas(ref)

    assert_pd_equal(df1, df2)
def test_doc_index1():
    """First code example in index.rst."""
    estat = Request('ESTAT')
    flow_response = estat.dataflow('une_rt_a')

    with pytest.raises(TypeError):
        # This presumes the DataStructureDefinition instance can conduct a
        # network request for its own content
        structure_response = flow_response.dataflow.une_rt_a.structure(
            request=True, target_only=False)

    # Same effect
    structure_response = estat.get(
        'datastructure', flow_response.dataflow.une_rt_a.structure.id)

    # Even better: Request.get(…) should examine the class and ID of the
    # object
    # structure = estat.get(flow_response.dataflow.une_rt_a.structure)

    # Show some codelists
    s = sdmx.to_pandas(structure_response)
    expected = pd.Series({
        'AT': 'Austria',
        'BE': 'Belgium',
        'BG': 'Bulgaria',
        'CH': 'Switzerland',
        'CY': 'Cyprus',
    }, name='GEO') \
        .rename_axis('CL_GEO')

    # Codelists are converted to a DictLike
    assert isinstance(s.codelist, DictLike)

    # Same effect
    assert_pd_equal(s.codelist['CL_GEO'].sort_index().head(), expected)
def test_write_dataflow():
    # Read the INSEE dataflow definition
    with specimen('INSEE/dataflow') as f:
        msg = sdmx.read_sdmx(f)

    # Convert to pandas
    result = sdmx.to_pandas(msg, include='dataflow')

    # Number of Dataflows described in the file
    assert len(result['dataflow']) == 663

    # ID and names of first Dataflows
    mbop = 'Monthly Balance of Payments - '
    expected = pd.Series({
        'ACT-TRIM-ANC': 'Activity by sex and age - Quarterly series',
        'BPM6-CCAPITAL': '{}Capital account'.format(mbop),
        'BPM6-CFINANCIER': '{}Financial account'.format(mbop),
        'BPM6-CTRANSACTION': '{}Current transactions account'.format(mbop),
        'BPM6-TOTAL': '{}Overall total and main headings'.format(mbop),
    })
    assert_pd_equal(result['dataflow'].head(), expected)
def _get_dimensions_1(structure_message):
    # pandasdmx v1
    (dsd,) = structure_message.structure.values()  # unpack exactly 1 table
    dimension_names = pandasdmx.to_pandas(dsd.dimensions)
    codelist = pandasdmx.to_pandas(structure_message.codelist)
    dimension_items = {
        name.lower(): codelist[f"CL_{name}"]
        for name in dimension_names
        if name not in _EXCLUDED_DIMENSIONS
    }
    dimensions = (
        pd.concat(dimension_items)
        .rename_axis(["dimension", "code"])
        .rename("label")
        .to_frame()
    )
    return dimensions
def test_doc_usage_data():
    """Code examples in usage.rst."""
    ecb = Request('ECB')

    data_response = ecb.data(resource_id='EXR',
                             key={'CURRENCY': 'USD+JPY'},
                             params={'startPeriod': '2016',
                                     'endPeriod': '2016-12-31'})

    # # Commented: do the same without triggering requests for validation
    # data_response = ecb.data(resource_id='EXR', key='.JPY+USD...',
    #                          params={'startPeriod': '2016',
    #                                  'endPeriod': '2016-12-31'})

    data = data_response.data[0]

    assert type(data) is DataSet

    # This message doesn't explicitly specify the remaining dimensions; unless
    # they are inferred from the SeriesKeys, then the DimensionDescriptor is
    # not complete
    # assert data.structured_by.dimensions[-1] == 'TIME_PERIOD'
    # data.dim_at_obs

    series_keys = list(data.series)

    assert len(series_keys) == 16

    series_keys[5]

    assert (sorted(set(sk.FREQ.value for sk in data.series))
            == 'A D H M Q'.split())

    daily = sdmx.to_pandas(data).xs('D', level='FREQ')
    assert len(daily) == 514

    assert_pd_equal(daily.tail().values,
                    np.array([1.0446, 1.0445, 1.0401, 1.0453, 1.0541]))
def test_write_conceptscheme():
    with specimen('common-structure.xml') as f:
        msg = sdmx.read_sdmx(f)
    data = sdmx.to_pandas(msg)

    cdc = data['concept_scheme']['CROSS_DOMAIN_CONCEPTS']
    assert cdc.loc['UNIT_MEASURE', 'name'] == 'Unit of Measure'
def test_doc_howto_timeseries():
    with specimen("sg-ts.xml") as f:
        ds = pandasdmx.read_sdmx(f).data[0]

    # Convert to pd.Series and unstack the time dimension to columns
    base = pandasdmx.to_pandas(ds)
    s1 = base.unstack("TIME_PERIOD")

    # DatetimeIndex on columns
    s1.columns = pd.to_datetime(s1.columns)
    assert isinstance(s1.columns, pd.DatetimeIndex)

    # DatetimeIndex on index
    s2 = base.unstack("TIME_PERIOD").transpose()
    s2.index = pd.to_datetime(s2.index)
    assert isinstance(s2.index, pd.DatetimeIndex)

    # Same with pd.PeriodIndex
    s3 = s1.to_period(axis=1)
    assert isinstance(s3.columns, pd.PeriodIndex)
    assert s3.columns.freqstr == "M"

    s4 = s2.to_period(axis=0)
    assert isinstance(s4.index, pd.PeriodIndex)
    assert s4.index.freqstr == "M"
def dump_dimension(dsd, dim_name: str, all: bool = False):
    if all:
        print(dsd.dimensions.components)
        print(dsd.attributes.components)
    e = dsd.dimensions.get(dim_name).local_representation.enumerated
    df = sdmx.to_pandas(e)
    print(df)
def test_write_conceptscheme():
    with specimen("common-structure.xml") as f:
        msg = pandasdmx.read_sdmx(f)
    data = pandasdmx.to_pandas(msg)

    cdc = data["concept_scheme"]["CROSS_DOMAIN_CONCEPTS"]
    assert cdc.loc["UNIT_MEASURE", "name"] == "Unit of Measure"
def fetch_dataset(db_collection, flow_name: str, parameters):
    # 1. Try a CSV fetch; if that doesn't work, fall back to pandasdmx to get
    #    the dataframe
    data_response = requests.get(url, params=parameters,
                                 headers={"Accept": "text/csv"})
    assert data_response.status_code == 200
    with get_tempfile() as fp:
        fp.write(data_response.text.encode())
        fp.seek(0)
        kwargs = ({} if flow_name not in extra_csv_parms
                  else dict(**extra_csv_parms[flow_name]))
        try:
            df = pd.read_csv(fp, **kwargs)
            save_dataframe(db_collection, {}, df, url, "ECB")
            return
        except pd.errors.EmptyDataError:
            # no data is ignored as far as --fail-fast is concerned
            print(f"No CSV data to save.. now trying {flow_name} using pandasdmx")
            # FALLTHRU...

    # 2. Try pandasdmx if the CSV fetch fails
    ecb = sdmx.Request("ECB", backend="memory")
    data_msg = ecb.data(flow_name, params=parameters)
    df = sdmx.to_pandas(data_msg)
    assert isinstance(df, pd.DataFrame)
    save_dataframe(db_collection, {}, df, url, "ECB")
def get_available_of_dimension(dsd: sdmx.model.DataStructureDefinition,
                               dim_name: str):
    dim = dsd.dimensions.get(dim_name).local_representation.enumerated
    available_results = sdmx.to_pandas(dim)
    # print(type(available_results))
    assert isinstance(available_results, pd.Series)
    return available_results
def get_df(data_response, Logger):
    try:
        oecd_data = data_response
        df = pandasdmx.to_pandas(oecd_data,
                                 datetime=dict(dim='TIME_PERIOD', axis=1))
    except req.exceptions.ConnectionError as err:
        Logger.critical(f'response took too long, this is the error: {err}')
        df = None  # ensure the return below does not raise NameError on failure
    return df
def test_pandas(self, msg):
    data = msg.data[0]

    series_keys = list(data.series.keys())

    # Number of series in dataframe
    assert len(series_keys) == 4

    # Convert the observations for one SeriesKey to a pd.Series
    s3_key = series_keys[3]
    s3 = sdmx.to_pandas(data.series[s3_key])
    assert isinstance(s3, pd.Series)

    # Test a particular value
    assert s3[0] == 1.2894

    # Length of index
    assert len(s3.index.names) == 6

    # Convert again, with attributes
    pd_data = sdmx.to_pandas(data, attributes='osgd')

    # Select one SeriesKey's data out of the DataFrame
    keys, levels = zip(*[(kv.value, kv.id) for kv in s3_key])
    s3 = pd_data.xs(keys, level=levels, drop_level=False)

    # Get the value of the first observation
    assert s3.iloc[0].value == 1.2894

    # Length of index
    assert len(s3.index.names) == 6

    # Number of attributes available
    assert len(set(s3.columns) - {'value'}) == 7

    # Access an attribute of the first value.
    # NB that this uses…
    # 1. the *pandas* attribute access shorthand, NOT DictLike:
    #    "s3.iloc[0]" is a single row of s3, i.e. a pd.Series; and
    #    ".OBS_STATUS" accesses the pd.Series element associated with that
    #    key in the index
    # 2. the AttributeValue.__eq__() comparison operator;
    #    s3.iloc[0].OBS_STATUS is a full AttributeValue, rather than a str.
    assert s3.iloc[0].OBS_STATUS == 'A'
    assert s3.iloc[0].OBS_STATUS.value_for == 'OBS_STATUS'  # consistency!
def test_write_categoryscheme():
    with specimen("IPI-2010-A21-structure.xml") as f:
        msg = pandasdmx.read_sdmx(f)
    data = pandasdmx.to_pandas(msg)

    cs = data["category_scheme"]["CLASSEMENT_DATAFLOWS"]

    assert cs.loc["COMPTA-NAT", "name"] == "National accounts (GDP, consumption...)"

    # Children appear
    assert cs.loc["CNA-PIB-2005", "parent"] == "CNA-PIB"
def test_write_data(data_path):
    msg = sdmx.read_sdmx(data_path)

    result = sdmx.to_pandas(msg)

    expected = expected_data(data_path)
    if expected is not None:
        print(expected, result, sep='\n')
        assert_pd_equal(expected, result)

    # TODO incomplete
    assert isinstance(result, (pd.Series, pd.DataFrame, list)), type(result)
def test_doc_example():
    """Code from example.rst."""
    import pandasdmx

    estat = pandasdmx.Request("ESTAT")

    metadata = estat.datastructure("DSD_une_rt_a")

    for cl in "CL_AGE", "CL_UNIT":
        print(pandasdmx.to_pandas(metadata.codelist[cl]))

    resp = estat.data("une_rt_a", key={"GEO": "EL+ES+IE"},
                      params={"startPeriod": "2007"})

    data = pandasdmx.to_pandas(
        resp, datetime=dict(dim='TIME_PERIOD', freq='FREQ')
    ).xs("Y15-74", axis=1, level="AGE", drop_level=False)

    data.loc[:, ("Y15-74", "PC_ACT", "T")]

    # Further checks per https://github.com/dr-leo/pandaSDMX/issues/157

    # DimensionDescriptor for the structure message
    dd1 = metadata.structure.DSD_une_rt_a.dimensions

    # DimensionDescriptor retrieved whilst validating the data message
    dd2 = resp.data[0].structured_by.dimensions

    # DimensionDescriptors have same ID, components and order
    assert dd1 == dd2

    # One SeriesKey from the data message
    sk = list(resp.data[0].series.keys())[0]

    # Key values have same order as in the DSD
    assert dd1.order_key(sk) == sk
def test_write_categoryscheme():
    with specimen('IPI-2010-A21-structure.xml') as f:
        msg = sdmx.read_sdmx(f)
    print(msg.category_scheme)

    data = sdmx.to_pandas(msg)

    cs = data['category_scheme']['CLASSEMENT_DATAFLOWS']

    assert (cs.loc['COMPTA-NAT', 'name']
            == 'National accounts (GDP, consumption...)')

    # Children appear
    assert cs.loc['CNA-PIB-2005', 'parent'] == 'CNA-PIB'