def test_determine_join_columns():
    """_determine_join_columns returns the columns shared by all frames
    that should be used to join per-statistic frames."""
    # Only "year" is common to all three frames.
    df_1 = pd.DataFrame(
        [], columns=["year", "source_detail1", "source_detail2", "value"])
    df_2 = pd.DataFrame(
        [], columns=["year", "GES", "source_detail1", "source_detail2", "value"])
    df_3 = pd.DataFrame(
        [], columns=["year", "source_detail1", "source_detail2", "value"])
    frames_inp = [df_1, df_2, df_3]
    output = QueryOutputTransformer._determine_join_columns(frames_inp)
    expected_output = {"year"}  # set literal instead of set([...])
    assert output == expected_output

    # "GES" is shared by both frames here, so it joins alongside "year".
    df_1 = pd.DataFrame([], columns=[
        "year", "NAT", "GES", "source_detail1", "source_detail2", "value"
    ])
    df_2 = pd.DataFrame(
        [], columns=["year", "GES", "source_detail1", "source_detail2", "value"])
    frames_inp = [df_1, df_2]
    output = QueryOutputTransformer._determine_join_columns(frames_inp)
    expected_output = {"year", "GES"}
    assert output == expected_output
def test_output_transformer():
    """Integration test: build and run a query, then check that the
    transformed output is a flat DataFrame with the expected columns."""
    # prepare test of output transformer
    testquery = buildQuery()
    query_result = runQuery(testquery)

    # start test of output transformer
    qOutTrans = QueryOutputTransformer(query_result)
    # input data should arrive as a plain dict
    # (isinstance instead of type(...) == dict)
    assert isinstance(query_result, dict), "input data not dict type"
    data_transformed = qOutTrans.transform()
    # transformed output data should be a dataframe
    assert isinstance(
        data_transformed, pandas.DataFrame
    ), "transformed data is not a dataframe"
    assert "id" in data_transformed.columns, "no id colum"
    assert "name" in data_transformed.columns, "no name colum"
    assert "year" in data_transformed.columns, "no year colum"

    # columns of outdata should not contain json format: a "." in a column
    # name means a nested structure was not flattened
    assert not any(
        "." in col for col in data_transformed.columns
    ), "hierarchy not properly transformed"
def test_dumplicate_removal_multi(query_duplicates_for_states_multi_stat):
    """Duplicate rows are kept or dropped according to remove_duplicates."""
    # NOTE(review): "dumplicate" is a typo in the test name; kept as-is to
    # avoid changing the public test identifier.
    transformer = QueryOutputTransformer(query_duplicates_for_states_multi_stat)

    with_dupes = transformer.transform(remove_duplicates=False)
    assert (with_dupes.name.value_counts() == 4).all()

    without_dupes = transformer.transform(remove_duplicates=True)
    assert (without_dupes.name.value_counts() == 1).all()
def test_output_transformer_with_one_statistic_and_units(
        query_results_one_statistic_with_units):
    """Unit column is added with the expected name, position and value."""
    transformer = QueryOutputTransformer(query_results_one_statistic_with_units)
    result = transformer.transform(add_units=True)
    unit_col = "TIE003_unit"
    assert result.loc[0, unit_col] == "Anzahl"
    assert result.columns[5] == unit_col
def test_build_execute_transform_integration_all_regions(query_all_regions):
    """Smoke test covering all_regions."""
    executioner = QueryExecutioner()
    raw_result = executioner.run_query(query_all_regions)
    QueryOutputTransformer(raw_result).transform()
def test_output_transformer_auto_join_enum(
        query_result_with_autojoin_and_one_enum):
    """Auto-joined enum column renders codes or descriptions depending on
    verbose_enum_values."""
    transformer = QueryOutputTransformer(query_result_with_autojoin_and_one_enum)

    terse = transformer.transform(verbose_enum_values=False)
    assert "BEVSTD_GES" in terse
    # the enum column should appear near the front of the frame
    assert terse.columns.get_loc("BEVSTD_GES") <= 8
    assert list(terse.BEVSTD_GES.unique()) == [None]

    verbose = transformer.transform(verbose_enum_values=True)
    assert "BEVSTD_GES" in verbose
    assert list(verbose.BEVSTD_GES.unique()) == ["Gesamt"]
def test_output_transformer_format_options_multi_enum(
        query_results_with_mult_enum):
    """Multiple enum columns render as codes or as verbose descriptions."""
    transformer = QueryOutputTransformer(query_results_with_mult_enum)

    terse = transformer.transform(verbose_enum_values=False)
    assert terse["ADVNW2"].iloc[0] == "ADVTN420"
    assert terse["ADVNW1"].iloc[0] is None

    verbose = transformer.transform(verbose_enum_values=True)
    # debug print of the frame head removed — tests should not write to stdout
    assert verbose["ADVNW2"].iloc[0] == "Grünanlage"
    assert verbose["ADVNW1"].iloc[0] == "Gesamt"
def test_build_execute_transform_integration_multi_region(query_multi_regions):
    """Smoke test covering multiple regions in region query."""
    executioner = QueryExecutioner()
    raw_result = executioner.run_query(query_multi_regions)
    QueryOutputTransformer(raw_result).transform()
def results(self, verbose_statistics: bool = False,
            verbose_enums: bool = False) -> DataFrame:
    """Run the query and return the results as a Pandas DataFrame.

    Also fills the instance variable ``result_meta_data`` with meta data
    specific to this query instance.

    :param verbose_statistics: toggles whether statistic column names are
        displayed with their short description in the result data frame
    :param verbose_enums: toggles whether enum values are displayed with
        their short description in the result data frame
    :raises RuntimeError: if the query returned no results
    :return: a DataFrame with the queried data
    :rtype: DataFrame
    """
    executioner = QueryExecutioner(
        statistics_meta_data_provider=self._stat_meta_data_provider)
    result = executioner.run_query(self)
    if not result:
        raise RuntimeError("No results could be returned for this Query.")
    # It is currently assumed that all graphql queries generated internally
    # for the Query instance at hand yield the same meta data.
    self.result_meta_data = result[0].meta_data
    return QueryOutputTransformer(result).transform(
        verbose_statistic_names=verbose_statistics,
        verbose_enum_values=verbose_enums,
    )
def test_determine_column_order():
    """Join columns come first, then statistic values, then source columns."""
    input_columns = [
        "source_A", "source_B", "stat_A_value", "stat_B_value", "year"
    ]
    input_frame = pd.DataFrame([], columns=input_columns)
    join_columns = {"year"}  # set literal instead of set([...])
    output = QueryOutputTransformer._determine_column_order(
        input_frame, join_columns)
    expected_output = [
        "year", "stat_A_value", "stat_B_value", "source_A", "source_B"
    ]
    assert output == expected_output
def test_prefix_frame_columns():
    """All columns except the listed exceptions receive the prefix."""
    frame = pd.DataFrame([], columns=["year", "stat_value", "source"])
    prefixed = QueryOutputTransformer._prefix_frame_cols(
        frame, prefix="A", exceptions=["year"])
    assert list(prefixed.columns) == ["year", "A_stat_value", "A_source"]
def test_get_general_fields():
    """Keys that are not statistics (per meta_dict) are general fields."""
    meta_dict = {
        "stat_1": "stat_1 description",
        "stat_2": "stat_2 description",
    }
    region_json = {
        "id": "11",
        "stat_1": [],
        "name": "Berlin",
        "stat_2": [{"year": 2000, "value": 1}, {"year": 2001, "value": 2}],
    }
    general = QueryOutputTransformer._get_general_fields(region_json, meta_dict)
    assert general == ["id", "name"]
def results(
    self,
    verbose_statistics: bool = False,
    verbose_enums: bool = False,
    add_units: bool = False,
    remove_duplicates: bool = True,
) -> DataFrame:
    """Run the query and return the results as a Pandas DataFrame.

    Also fills the instance variable ``result_meta_data`` with meta data
    specific to this query instance.

    :param verbose_statistics: Toggles whether statistic column names are
        displayed with their short description in the result data frame
    :param verbose_enums: Toggles whether enum values are displayed with
        their short description in the result data frame
    :param add_units: Adds units available in the metadata to the result
        dataframe. Care should be taken, because not every statistic
        specifies these correctly. When in doubt one should refer to the
        statistic description.
    :param remove_duplicates: Removes duplicates from query results, i.e.
        if the exact same number has been reported for the same statistic,
        year, region etc. from the same source it gets removed. Such
        duplications are sometimes caused on the API side and this is
        convenience functionality to remove them. The removal happens
        before potentially joining several different statistics. Unless
        diagnosing the API the default (True) is generally in the users
        interest.
    :raises RuntimeError: If the query fails raise RuntimeError.
    :return: A DataFrame with the queried data.
    :rtype: DataFrame
    """
    # Guard: a query without any statistic field cannot produce results.
    if not self._contains_statistic_field():
        raise Exception(
            "No statistic field is defined in query, please add statistic field "
            "via method add_field.")
    executioner = QueryExecutioner(
        statistics_meta_data_provider=self._stat_meta_data_provider)
    result = executioner.run_query(self)
    if not result:
        raise RuntimeError("No results could be returned for this Query.")
    if self._query_result_contains_undefined_region(result):
        raise ValueError("Queried region is invalid.")
    # It is currently assumed that all graphql queries generated internally
    # for the Query instance at hand yield the same meta data.
    self.result_meta_data = result[0].meta_data
    return QueryOutputTransformer(result).transform(
        verbose_statistic_names=verbose_statistics,
        verbose_enum_values=verbose_enums,
        add_units=add_units,
        remove_duplicates=remove_duplicates,
    )
def test_output_transformer_defaults(query_result):
    """Default transform yields a flat frame with renamed statistic columns."""
    qOutTrans = QueryOutputTransformer(query_result)
    data_transformed = qOutTrans.transform()
    # transformed output data should be a dataframe
    # (isinstance instead of type(...) == pd.DataFrame)
    assert isinstance(
        data_transformed, pd.DataFrame), "transformed data is not a dataframe"
    assert "id" in data_transformed.columns, "no id colum"
    assert "name" in data_transformed.columns, "no name colum"
    assert "year" in data_transformed.columns, "no year colum"
    assert "BEVMK3" in data_transformed.columns, "statistic values are missing"
    assert (
        "BEVMK3_value" not in data_transformed.columns
    ), "old statistics name still present"
    # columns of outdata should not contain json format: a "." in a column
    # name means a nested structure was not flattened
    assert not any(
        "." in col for col in data_transformed.columns
    ), "hierarchy not properly transformed"
def test_output_transformer_with_multiple_statistics_and_units(
        query_results_multiple_statistics_with_units):
    """Unit columns are interleaved at the expected positions and values."""
    transformer = QueryOutputTransformer(
        query_results_multiple_statistics_with_units)
    result = transformer.transform(add_units=True)
    unit_positions = range(4, 15, 2)
    expected_units = ["Prozent"] * 5 + ["kg"]
    expected_names = [
        "AI0203_unit",
        "AI0204_unit",
        "AI0205_unit",
        "AI0206_unit",
        "AI0207_unit",
        "AI1902_unit",
    ]
    assert result.iloc[1, unit_positions].to_list() == expected_units
    assert result.columns[unit_positions].to_list() == expected_names
def results(self) -> DataFrame:
    """Run the query and return the results as a Pandas DataFrame.

    Raises:
        RuntimeError: If the Query did not return any results.
            E.g. if the Query was ill-formed.

    Returns:
        DataFrame -- A DataFrame with the queried data.
    """
    result = QueryExecutioner().run_query(self)
    if not result:
        raise RuntimeError("No results could be returned for this Query.")
    # TODO: adapt QueryOutputTransformer to process list of results
    return QueryOutputTransformer(result[0].query_results[0]).transform()
def test_output_transformer_format_options(query_result,
                                           query_results_with_enum):
    """verbose_statistic_names and verbose_enum_values control column
    naming and enum rendering."""
    transformed = QueryOutputTransformer(query_result).transform(
        verbose_statistic_names=True)
    # NOTE: once descriptions are delivered this is expected to become
    # "Von der Scheidung betroffene Kinder (BEVMK3)".
    assert (
        "BEVMK3 (BEVMK3)" in transformed.columns
    ), "statistic values are missing"

    enum_values = {
        "AFD",
        "B90_GRUENE",
        "CDU",
        "DIELINKE",
        "FDP",
        "SONSTIGE",
        "SPD",
        "GESAMT",
        None,
    }
    enum_descriptions = {
        "AfD",
        "GRÜNE",
        "CDU/CSU",
        "DIE LINKE",
        "FDP",
        "Sonstige Parteien",
        "SPD",
        "Gesamt",
    }

    codes = QueryOutputTransformer(query_results_with_enum).transform()
    assert set(codes["PART04"]).issubset(enum_values)

    descriptions = QueryOutputTransformer(query_results_with_enum).transform(
        verbose_enum_values=True)
    assert set(descriptions["PART04"]).issubset(enum_descriptions)

    both = QueryOutputTransformer(query_results_with_enum).transform(
        verbose_enum_values=True, verbose_statistic_names=True)
    # NOTE: expected to become "Gültige Zweitstimmen (WAHL09)" eventually.
    assert "WAHL09 (WAHL09)" in both