def test_unpack_column_raises_on_max_value_violation(self):
    """Check that a value above a column's ``max_value`` is rejected.

    The single cell holds 1 while the column description caps the value
    at 0, so ``unpack_table`` is expected to raise ``ValueError``.
    """
    single_cell_table = pd.DataFrame([(1, )])
    capped_column = {
        "index": 0,
        "name": "col0",
        "type": int,
        "max_value": 0
    }
    with self.assertRaises(ValueError):
        sit_parser.unpack_table(
            table=single_cell_table,
            column_descriptions=[capped_column],
            table_name="")
def test_unpack_table_raises_on_duplicate_column(self):
    """Check that two column descriptions sharing a name are rejected.

    Both descriptions use the name "duplicate", so ``unpack_table`` is
    expected to raise ``ValueError``.
    """
    two_column_table = pd.DataFrame([("0", "0")])
    clashing_columns = [
        {"index": position, "name": "duplicate"} for position in (0, 1)
    ]
    with self.assertRaises(ValueError):
        sit_parser.unpack_table(
            table=two_column_table,
            column_descriptions=clashing_columns,
            table_name="")
def parse(yield_table, classifiers, classifier_values, age_classes):
    """Parse and validate a table in the CBM SIT growth and yield format.

    Args:
        yield_table (pandas.DataFrame): SIT formatted growth and yield data
        classifiers (pandas.DataFrame): classifier table (``id`` and
            ``name`` columns are read) used to validate the classifier set
            columns of the yield data. Use the return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_values (pandas.DataFrame): classifier value table
            (``classifier_id`` and ``name`` columns are read) used to
            validate the classifier set columns of the yield data. Use the
            return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        age_classes (pandas.DataFrame): only its length is used, to check
            the number of volume columns. Use the return value of:
            :py:func:`libcbm.input.sit.sit_age_class_parser.parse`

    Raises:
        ValueError: the unpacked table does not have exactly
            ``len(age_classes) + len(classifiers) + 1`` columns.
        ValueError: a classifier column holds a value that is neither a
            defined value of that classifier nor the wildcard keyword.

    Returns:
        pandas.DataFrame: the unpacked yield table, as returned by
        ``sit_parser.unpack_table``.
    """
    yield_format = sit_format.get_yield_format(
        classifiers.name, len(yield_table.columns))
    unpacked_table = sit_parser.unpack_table(
        yield_table, yield_format, "yield")

    # one volume column is expected per age class, after the classifier
    # columns and one additional leading column
    n_leading_columns = len(classifiers) + 1
    n_age_classes = len(age_classes)
    expected_column_count = n_age_classes + n_leading_columns
    if len(unpacked_table.columns) != expected_column_count:
        raise ValueError(
            f"expected {expected_column_count} columns. This is defined as "
            f"{n_leading_columns} classifiers plus {n_age_classes} "
            "age classes")

    # every value found in a classifier column must be either a defined
    # value of that classifier or the wildcard
    for classifier in classifiers.itertuples():
        found_values = unpacked_table[classifier.name].unique()
        belongs_to_classifier = \
            classifier_values["classifier_id"] == classifier.id
        defined_values = \
            classifier_values[belongs_to_classifier]["name"].unique()
        wildcard = np.array([sit_classifier_parser.get_wildcard_keyword()])
        allowed_values = np.concatenate([defined_values, wildcard])
        undefined_values = np.setdiff1d(found_values, allowed_values)
        if len(undefined_values) > 0:
            raise ValueError(
                "Undefined classifier values detected: "
                f"classifier: '{classifier.name}', "
                f"values: {undefined_values}")

    return unpacked_table
def test_unpack_column_raises_on_unconvertable_value(self):
    """Check that values which cannot be converted to the column type
    cause ``unpack_table`` to raise ``ValueError``.
    """
    unconvertable_cases = (
        ("invalid_integer", int),
        ("1.1", int),
        ("invalid_float", float),
    )
    for raw_value, target_type in unconvertable_cases:
        typed_column = {"index": 0, "name": "col0", "type": target_type}
        single_cell_table = pd.DataFrame([(raw_value, )])
        with self.assertRaises(ValueError):
            sit_parser.unpack_table(
                table=single_cell_table,
                column_descriptions=[typed_column],
                table_name="")
def parse(disturbance_types_table):
    """Parse and validate a SIT formatted disturbance type table.

    Args:
        disturbance_types_table (pandas.DataFrame): a table in SIT
            disturbance type format

    Example:

        Input:

            ========  =========
             0         1
            ========  =========
            distid1    fire
            distid2    clearcut
            distid3    clearcut
            ========  =========

        Output:

            =======================  ========  =========
            sit_disturbance_type_id   id        name
            =======================  ========  =========
            1                        distid1    fire
            2                        distid2    clearcut
            3                        distid3    clearcut
            =======================  ========  =========

    Raises:
        ValueError: duplicate ids detected in disturbance data.

    Returns:
        pandas.DataFrame: the unpacked table with standardized column
        names, plus a leading 1-based ``sit_disturbance_type_id`` column.
    """
    column_format = sit_format.get_disturbance_type_format(
        len(disturbance_types_table.columns))
    result = sit_parser.unpack_table(
        disturbance_types_table, column_format, "disturbance types")

    # any id that labels more than one row is a duplicate
    rows_per_id = result.groupby("id").size()
    duplicates = list(rows_per_id[rows_per_id > 1].index)
    if duplicates:
        raise ValueError(
            f"duplicate ids detected in disturbance types {duplicates}")

    # establish a numeric, 1-based identifier for each row of the SIT
    # disturbance types, in table order
    result.insert(
        0, "sit_disturbance_type_id", np.arange(1, len(result) + 1))
    return result
def test_unpack_table_expected_result(self):
    """Check that ``unpack_table`` renames the columns and converts the
    values according to the column descriptions.

    Column 0 is described as ``int``, column 1 as ``float`` and column 2
    has no type, so the string inputs "1", "2", "3" are expected to come
    back as 1, 2.0 and "3" respectively.
    """
    unpacked = sit_parser.unpack_table(
        table=pd.DataFrame([("1", "2", "3")]),
        column_descriptions=[
            {"index": 0, "name": "col0", "type": int},
            {"index": 1, "name": "col1", "type": float},
            {"index": 2, "name": "col2"},
        ],
        table_name="")

    # assertEqual (rather than assertTrue(a == b)) reports both operands
    # when the comparison fails
    self.assertEqual(list(unpacked.columns), ["col0", "col1", "col2"])

    rows = list(unpacked.itertuples())
    self.assertEqual(len(rows), 1)
    first_row = rows[0]
    self.assertEqual(first_row.col0, 1)
    self.assertEqual(first_row.col1, 2.0)
    self.assertEqual(first_row.col2, "3")
def parse(age_class_table):
    """Parse the sit age class table format into a table of age classes
    with fields:

        - name
        - class_size
        - start_year
        - end_year

    Args:
        age_class_table (pandas.DataFrame): a dataframe

    Raises:
        ValueError: the first row does not have a size of 0
        ValueError: a row other than the first has a size that is not
            greater than 0
        ValueError: duplicate values in the first column of the specified
            table were detected

    Example:

        Input:

            ======  ====
             0       1
            ======  ====
            age_0    0
            age_1    10
            age_2    10
            age_3    10
            age_4    10
            age_5    10
            age_6    10
            age_7    10
            age_8    10
            age_9    10
            ======  ====

        Output:

            ======  ===========  ===========  =========
             name   class_size   start_year   end_year
            ======  ===========  ===========  =========
            age_0     0            0            0
            age_1     10           1            10
            age_2     10           11           20
            age_3     10           21           30
            age_4     10           31           40
            age_5     10           41           50
            age_6     10           51           60
            age_7     10           61           70
            age_8     10           71           80
            age_9     10           81           90
            ======  ===========  ===========  =========

    Returns:
        pandas.DataFrame: a dataframe describing the age classes.
    """
    table = sit_parser.unpack_table(
        age_class_table, sit_format.get_age_class_format(), "age classes")

    result = []
    for i, row in enumerate(table.itertuples()):
        size = row.class_size
        if i == 0:
            # the first age class is the zero-width class covering year 0
            if size != 0:
                raise ValueError(
                    "First age class row expected to have 0 size")
            result.append({
                "name": row.id,
                "class_size": 0,
                "start_year": 0,
                "end_year": 0
            })
            continue

        # a zero or negative size would produce an empty or inverted
        # year range (end_year < start_year), so reject both
        if size <= 0:
            raise ValueError(
                "All age class rows other than the "
                "first one must have size > 0")

        # each class starts the year after the previous class ends
        start_year = result[-1]["end_year"] + 1
        result.append({
            "name": row.id,
            "class_size": size,
            "start_year": start_year,
            "end_year": start_year + size - 1
        })

    age_classes = pd.DataFrame(
        result, columns=["name", "class_size", "start_year", "end_year"])

    # any name that labels more than one row is a duplicate
    rows_per_name = age_classes.groupby("name").size()
    duplicates = list(rows_per_name[rows_per_name > 1].index)
    if duplicates:
        raise ValueError(
            f"duplicate names detected in age classes {duplicates}")
    return age_classes
def parse(transition_rules, classifiers, classifier_values,
          classifier_aggregates, disturbance_types, age_classes):
    """Parse and validate a table in the CBM SIT transition rule format.

    Args:
        transition_rules (pandas.DataFrame): CBM SIT transition rule
            formatted data.
        classifiers (pandas.DataFrame): used to validate the classifier set
            columns of the transition rule data. Use the return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_values (pandas.DataFrame): used to validate the
            classifier set columns of the transition rule data. Use the
            return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_aggregates (iterable of dict): aggregate definitions,
            each read through its ``"classifier_id"`` and ``"name"`` keys,
            used to validate the source classifier set columns. Use the
            return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        disturbance_types (pandas.DataFrame): its ``id`` column is used to
            validate the disturbance_type column of the transition rule
            data. Use the return value of:
            :py:func:`libcbm.input.sit.sit_disturbance_types_parser.parse`
        age_classes (pandas.DataFrame): passed to
            ``sit_parser.substitute_using_age_class_rows`` to resolve rows
            that use age classes. Use the return value of:
            :py:func:`libcbm.input.sit.sit_age_class_parser.parse`

    Raises:
        ValueError: undefined classifier values were found in the
            transition rule source or destination classifier sets
        ValueError: undefined disturbance types were found in the
            transition rule disturbance_type column
        ValueError: a grouped set of transition rules has a total percent
            greater than 100 (plus the ``GROUPED_PERCENT_ERR_MAX``
            tolerance).

    Returns:
        pandas.DataFrame: validated transition rules. When the unpacked
        table has no rows it is returned as-is, before any further
        validation or column renaming.
    """
    transition_rule_format = sit_format.get_transition_rules_format(
        classifiers.name, len(transition_rules.columns))
    transitions = sit_parser.unpack_table(
        transition_rules, transition_rule_format, "transitions")

    # nothing further to validate for an empty table
    if len(transitions.index) == 0:
        return transitions

    for classifier in classifiers.itertuples():
        source_values = transitions[classifier.name].unique()

        # destination classifier columns share the classifier name plus a
        # fixed postfix
        dest_postfix = sit_format.get_tr_classifier_set_postfix()
        dest_values = transitions[f"{classifier.name}{dest_postfix}"]

        belongs_to_classifier = \
            classifier_values["classifier_id"] == classifier.id
        defined_values = \
            classifier_values[belongs_to_classifier]["name"].unique()
        aggregate_names = np.array([
            aggregate["name"] for aggregate in classifier_aggregates
            if aggregate["classifier_id"] == classifier.id])
        wildcard = np.array([sit_classifier_parser.get_wildcard_keyword()])

        # source sets may use defined values, aggregates or the wildcard
        allowed_source = np.concatenate(
            [defined_values, aggregate_names, wildcard])
        undefined_source = np.setdiff1d(source_values, allowed_source)
        if len(undefined_source) > 0:
            raise ValueError(
                "Undefined classifier values detected: "
                f"classifier: '{classifier.name}', "
                f"values: {undefined_source}")

        # aggregates may not appear in the destination classifier set
        # (only the defined classifier values, or wildcards)
        allowed_dest = np.concatenate([defined_values, wildcard])
        undefined_dest = np.setdiff1d(dest_values, allowed_dest)
        if len(undefined_dest) > 0:
            raise ValueError(
                "Undefined classifier values detected: "
                f"classifier: '{classifier.name}', "
                f"values: {undefined_dest}")

    parse_bool_func = sit_parser.get_parse_bool_func(
        "transitions", "using_age_class")
    transitions = sit_parser.substitute_using_age_class_rows(
        transitions, parse_bool_func, age_classes)

    # validate the disturbance type ids against the SIT disturbance types
    used_disturbances = transitions.disturbance_type.unique()
    known_disturbances = disturbance_types.id.unique()
    undefined_disturbances = np.setdiff1d(
        used_disturbances, known_disturbances)
    if len(undefined_disturbances) > 0:
        raise ValueError(
            "Undefined disturbance type ids (as defined in sit "
            f"disturbance types) detected: {undefined_disturbances}"
        )

    # keep the softwood age range as the age range, and discard the
    # hardwood range and the age class flag
    age_column_names = {
        "min_softwood_age": "min_age",
        "max_softwood_age": "max_age"}
    transitions = transitions.rename(columns=age_column_names)
    transitions = transitions.drop(
        columns=["using_age_class", "min_hardwood_age", "max_hardwood_age"])

    # if the sum of percent for grouped transition rules exceeds 100% raise
    # an error
    group_cols = list(classifiers.name) + [
        "min_age", "max_age", "disturbance_type"]
    percent_totals = \
        transitions[group_cols + ["percent"]].groupby(group_cols).sum()
    over_limit = percent_totals[
        percent_totals.percent > (100 + GROUPED_PERCENT_ERR_MAX)]
    if len(over_limit) > 0:
        # report only the first few offending groups
        invalid_percents = list(over_limit.head().index)
        raise ValueError(
            "the following groups have a total percent greater than 100%: "
            f"{invalid_percents}")

    return transitions
def parse_eligibilities(disturbance_events, disturbance_eligibilities):
    """Parse and validate disturbance eligibilities, a libcbm-specific
    alternative to the eligibility columns in the cbm-cfs3
    sit_disturbance events input.

    The benefit of this format is that the number of columns in sit_events
    is greatly reduced, and arbitrary boolean expressions of stand pool and
    state values, rather than the min/max ranges supported in the CBM3-SIT
    format, may be used.

    Example disturbance_eligibilities table:

    ==  =====================================  =======================
    id  pool_filter_expression                 state_filter_expression
    ==  =====================================  =======================
    1   (SoftwoodMerch + HardwoodMerch) >= 10  NULL
    2   (SoftwoodMerch + HardwoodMerch) >= 10  (age > 5) & (age < 100)
    3   NULL                                   NULL
    ==  =====================================  =======================

    * The id field in the disturbance_eligibilities corresponds to sit
      events
    * expressions are parsed by the numexpr library
    * note brackets are required around nested boolean expressions joined
      by a boolean operator (eg &)
    * for both pool_filter_expression, and state_filter_expression, the
      expressions must evaluate to a True or False value. False indicates
      that the stand records being evaluated for the corresponding
      disturbance event are deemed ineligible for the disturbance. True
      indicates that the expression does not eliminate the stand from
      eligibility.
    * for pool_filter_expression any CBM pool is acceptable. The pool
      names are defined in the cbm_defaults database tables.
    * for state_filter_expression any of the state values may be used in
      the boolean expression. See:
      :py:func:`libcbm.model.cbm.cbm_variables.initialize_cbm_state_variables`

    The final eligibility is evaluated as follows:

    ====================== ======================= =================
    pool_filter_expression state_filter_expression deemed_ineligible
    ====================== ======================= =================
    NULL or TRUE           NULL or TRUE            FALSE
    NULL or TRUE           FALSE                   TRUE
    FALSE                  NULL or TRUE            TRUE
    FALSE                  FALSE                   TRUE
    ====================== ======================= =================

    Args:
        disturbance_events (pandas.DataFrame): alternate form of CBM-CFS3
            sit_events: the 21 eligibility columns and the using age class
            and min-max columns are omitted. Its
            ``disturbance_eligibility_id`` column is read here.
        disturbance_eligibilities (pandas.DataFrame): table of id (int),
            state_filter expression (str), pool filter expression (str).
            After unpacking, the id is read from the
            ``disturbance_eligibility_id`` column (presumably the
            standardized name given to the id column by the format).

    Raises:
        ValueError: disturbance_eligibility_id values found in the
            specified sit_events were not present in the provided
            disturbance_eligibilities table.
        ValueError: at least one null id value was detected in the id
            column of the specified disturbance_eligibilities table.
        ValueError: a duplicate id value was detected in the id column of
            the specified disturbance_eligibilities table.

    Returns:
        pandas.DataFrame: the validated event eligibilities table, with
        null values replaced by empty strings
    """
    eligibility_format = sit_format.get_disturbance_eligibility_format()
    eligibilities = sit_parser.unpack_table(
        disturbance_eligibilities, eligibility_format,
        "disturbance eligibilities")

    # every eligibility id referenced by the disturbance events must have
    # a corresponding record in the eligibilities table
    referenced_ids = set(disturbance_events["disturbance_eligibility_id"])
    defined_ids = set(eligibilities["disturbance_eligibility_id"])
    missing_ids = referenced_ids - defined_ids
    if missing_ids:
        raise ValueError(
            "disturbance_eligibility_id values found in sit_events "
            f"but not in sit_disturbance_eligibilities {missing_ids}")

    eligibility_ids = eligibilities.disturbance_eligibility_id
    if pd.isnull(eligibility_ids).any():
        raise ValueError(
            "null values detected in eligibilities disturbance_eligibility_id "
            "column")
    if eligibility_ids.duplicated().any():
        raise ValueError(
            "duplicated disturbance_eligibility_id values detected in "
            "eligibilities")

    # null expressions are represented as empty strings in the result
    return eligibilities.fillna("")
def parse(disturbance_events, classifiers, classifier_values,
          classifier_aggregates, disturbance_types, age_classes=None,
          separate_eligibilities=False):
    """Parse and validate the CBM SIT disturbance event format, or
    optionally an extended sit disturbance event format where disturbance
    eligibilities are separate from sit_events and joined by foreign key.

    Args:
        disturbance_events (pandas.DataFrame): CBM SIT disturbance events
            formatted data.
        classifiers (pandas.DataFrame): used to validate the classifier set
            columns of the disturbance event data. Use the return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_values (pandas.DataFrame): used to validate the
            classifier set columns of the disturbance event data. Use the
            return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_aggregates (iterable of dict): aggregate definitions,
            each read through its ``"classifier_id"`` and ``"name"`` keys,
            used to validate the classifier set columns. Use the return
            value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        disturbance_types (pandas.DataFrame): its ``id`` column is used to
            validate the disturbance_type column of the disturbance event
            data. Use the return value of:
            :py:func:`libcbm.input.sit.sit_disturbance_types_parser.parse`
        age_classes (pandas.DataFrame, optional): passed to
            ``sit_parser.substitute_using_age_class_rows`` to resolve the
            age eligibility criteria; only used when
            ``separate_eligibilities`` is False. Use the return value of:
            :py:func:`libcbm.input.sit.sit_age_class_parser.parse`.
        separate_eligibilities (bool, optional): when True the eligibility
            columns are excluded from the expected format, and the age
            class substitution and age column renaming/dropping steps are
            skipped. Defaults to False.

    Raises:
        ValueError: undefined classifier values were found in the
            disturbance event classifier sets
        ValueError: undefined sort types were found in the disturbance
            event sort_type column. See :py:func:`get_sort_types`
        ValueError: undefined target types were found in the disturbance
            event target_type column. See :py:func:`get_target_types`
        ValueError: undefined disturbance types were found in the
            disturbance event disturbance_type column

    Returns:
        pandas.DataFrame: the validated disturbance events
    """
    event_format = sit_format.get_disturbance_event_format(
        classifiers.name, len(disturbance_events.columns),
        include_eligibility_columns=not separate_eligibilities)
    events = sit_parser.unpack_table(
        disturbance_events, event_format, "disturbance events")

    # each value in the event classifier sets must be defined in the
    # classifier values, the classifier aggregates, or be the wildcard
    for classifier in classifiers.itertuples():
        found_values = events[classifier.name].unique()
        belongs_to_classifier = \
            classifier_values["classifier_id"] == classifier.id
        defined_values = \
            classifier_values[belongs_to_classifier]["name"].unique()
        aggregate_names = np.array([
            aggregate["name"] for aggregate in classifier_aggregates
            if aggregate["classifier_id"] == classifier.id])
        wildcard = np.array([sit_classifier_parser.get_wildcard_keyword()])
        allowed_values = np.concatenate(
            [defined_values, aggregate_names, wildcard])
        undefined_values = np.setdiff1d(found_values, allowed_values)
        if len(undefined_values) > 0:
            raise ValueError(
                "Undefined classifier values detected: "
                f"classifier: '{classifier.name}', "
                f"values: {undefined_values}")

    if not separate_eligibilities:
        # resolve rows that use age classes, then keep the softwood age
        # range as the age range and discard the hardwood range and the
        # age class flag
        parse_bool_func = sit_parser.get_parse_bool_func(
            "events", "using_age_class")
        events = sit_parser.substitute_using_age_class_rows(
            events, parse_bool_func, age_classes)
        age_column_names = {
            "min_softwood_age": "min_age",
            "max_softwood_age": "max_age"}
        events = events.rename(columns=age_column_names)
        events = events.drop(
            columns=["using_age_class", "min_hardwood_age",
                     "max_hardwood_age"])

    # validate the sort type, then replace it with its mapped value
    valid_sort_types = get_sort_types().keys()
    int_sort_type = events.sort_type.astype(int)
    sort_type_diff = set(int_sort_type.unique()) - set(valid_sort_types)
    if len(sort_type_diff) > 0:
        raise ValueError(
            f"specified sort types are not valid: {sort_type_diff}")
    events.sort_type = int_sort_type.map(get_sort_types())

    # validate the target type, then replace it with its mapped value
    valid_target_types = get_target_types().keys()
    target_type_diff = \
        set(events.target_type.unique()) - set(valid_target_types)
    if len(target_type_diff) > 0:
        raise ValueError(
            f"specified target types are not valid: {target_type_diff}")
    events.target_type = events.target_type.map(get_target_types())

    # validate the disturbance type ids against the SIT disturbance types
    used_disturbances = events.disturbance_type.unique()
    known_disturbances = disturbance_types.id.unique()
    undefined_disturbances = np.setdiff1d(
        used_disturbances, known_disturbances)
    if len(undefined_disturbances) > 0:
        raise ValueError(
            "Undefined disturbance type ids (as defined in sit "
            f"disturbance types) detected: {undefined_disturbances}"
        )

    return events
def parse(inventory_table, classifiers, classifier_values,
          disturbance_types, age_classes):
    """Parse and validate SIT formatted inventory data.

    The inventory_table parameter is the primary data, and the other args
    act as validation metadata.

    Processing steps, in order:

        1. unpack the table according to the SIT inventory format
        2. check that every classifier column only holds values defined
           for that classifier
        3. if a ``historical_disturbance_type`` column is present, check
           that it and ``last_pass_disturbance_type`` only hold ids
           defined in ``disturbance_types``
        4. convert ``using_age_class`` with the inventory bool parser
        5. for rows where ``using_age_class`` is false, enforce an integer
           ``age`` with a minimum of 0
        6. if any row uses age classes, expand the inventory with
           :py:func:`expand_age_class_inventory`
        7. drop ``using_age_class`` and reset the index
        8. if a ``spatial_reference`` column is present, check that its
           values greater than 0 are unique

    Args:
        inventory_table (pandas.DataFrame): SIT formatted inventory
        classifiers (pandas.DataFrame): table of classifiers as returned
            by the function:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_values (pandas.DataFrame): table of classifier values as
            returned by the function:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        disturbance_types (pandas.DataFrame): table of disturbance types as
            returned by the function:
            :py:func:`libcbm.input.sit.sit_disturbance_type_parser.parse`
        age_classes (pandas.DataFrame): table of age classes as returned
            by the function:
            :py:func:`libcbm.input.sit.sit_age_class_parser.parse`

    Raises:
        ValueError: Undefined classifier values detected in inventory table
        ValueError: Undefined disturbance types detected in inventory table
        ValueError: duplicate value detected in the spatial_reference
            column

    Returns:
        pandas.DataFrame: validated inventory
    """
    inventory_format = sit_format.get_inventory_format(
        classifiers.name, len(inventory_table.columns))
    inventory = sit_parser.unpack_table(
        inventory_table, inventory_format, "inventory")

    # validate the classifier values in the inventory table
    for classifier in classifiers.itertuples():
        found_values = inventory[classifier.name].unique()
        belongs_to_classifier = \
            classifier_values["classifier_id"] == classifier.id
        defined_values = \
            classifier_values[belongs_to_classifier]["name"].unique()
        undefined_values = np.setdiff1d(found_values, defined_values)
        if len(undefined_values) > 0:
            raise ValueError("Undefined classifier values detected: "
                             f"classifier: '{classifier.name}', "
                             f"values: {undefined_values}")

    # if the historical/last pass disturbances are specified, validate
    # them against the specified disturbance type ids
    if "historical_disturbance_type" in inventory:
        known_disturbances = disturbance_types.id.unique()
        # both differences are computed before either is reported; the
        # historical column is reported first
        undefined_by_column = [
            np.setdiff1d(
                inventory.historical_disturbance_type.unique(),
                known_disturbances),
            np.setdiff1d(
                inventory.last_pass_disturbance_type.unique(),
                known_disturbances),
        ]
        for undefined_disturbances in undefined_by_column:
            if len(undefined_disturbances) > 0:
                raise ValueError(
                    "Undefined disturbance type ids (as defined in sit "
                    "disturbance types) detected: "
                    f"{undefined_disturbances}")

    parse_bool_func = sit_parser.get_parse_bool_func(
        "inventory", "using_age_class")
    inventory.using_age_class = inventory.using_age_class.map(
        parse_bool_func)

    # for rows where using_age_class is false, a type of integer and min
    # value of 0 is enforced on the age column; the call is made for its
    # validation only, the return value is not used
    age_column_format = [
        column for column in inventory_format
        if column["name"] == "age"][0].copy()
    age_column_format.update({"type": int, "min_value": 0})
    sit_parser.unpack_column(
        inventory.loc[~inventory.using_age_class],
        age_column_format, "inventory")

    if inventory.using_age_class.any():
        inventory = expand_age_class_inventory(inventory, age_classes)

    inventory = inventory.drop(columns=["using_age_class"])
    inventory = inventory.reset_index(drop=True)

    if "spatial_reference" in inventory:
        # only values greater than 0 are required to be unique
        positive_references = inventory.spatial_reference[
            inventory.spatial_reference > 0]
        if positive_references.duplicated().any():
            raise ValueError(
                "duplicate value detected in spatial_reference column")

    return inventory
def parse(classifiers_table):
    """Parse SIT_Classifiers formatted data.

    Args:
        classifiers_table (pandas.DataFrame): a dataFrame in sit
            classifiers format.

    Raises:
        ValueError: the number of unique ids does not match the number of
            classifier keyword rows
        ValueError: duplicated classifier names, classifier values or
            aggregates were detected
        ValueError: an aggregate repeats one of its own values, or
            references a value that is not defined for its classifier

    Example Input:

        ==  ===========  ===========  ===  ===
        0   1            2            3    4
        ==  ===========  ===========  ===  ===
        1   _CLASSIFIER  classifier1  NaN  NaN
        1   a            a            NaN  NaN
        1   b            b            NaN  NaN
        1   agg1         agg1         a    b
        1   agg2         agg2         a    b
        2   _CLASSIFIER  classifier2  NaN  NaN
        2   a            a            NaN  NaN
        2   agg1         agg1         a    NaN
        ==  ===========  ===========  ===  ===

    Output based on Example input:

        Classifiers:

            ===  ===========
            id   name
            ===  ===========
            1    classifier1
            2    classifier2
            ===  ===========

        Classifier Values:

            ==============  =====  ============
            classifier_id   name   description
            ==============  =====  ============
            1               a      a
            1               b      b
            2               a      a
            ==============  =====  ============

        Classifier Aggregates::

            [{'classifier_id': 1,
              'name': 'agg1',
              'description': 'agg1',
              'classifier_values': ['a', 'b']},
             {'classifier_id': 1,
              'name': 'agg2',
              'description': 'agg2',
              'classifier_values': ['a', 'b']},
             {'classifier_id': 2,
              'name': 'agg1',
              'description': 'agg1',
              'classifier_values': ['a']}]

    Returns:
        tuple:

            - classifiers - a validated table of classifiers
            - classifier_values - a validated table of classifier values
            - aggregate_values - a list of dictionaries describing the
              aggregate values
    """
    classifiers_format = sit_format.get_classifier_format(
        len(classifiers_table.columns))
    unpacked = sit_parser.unpack_table(
        classifiers_table, classifiers_format, "classifiers")

    # rows carrying the classifier keyword in the name column declare a
    # classifier; for those rows the 3rd column is used for the name
    keyword_rows = unpacked.loc[unpacked["name"] == get_classifier_keyword()]
    classifiers = pd.DataFrame(
        data={"id": keyword_rows.id, "name": keyword_rows.description},
        columns=["id", "name"])

    if classifiers.shape[0] != len(unpacked.id.unique()):
        # this can occur if the data isn't formatted correctly
        raise ValueError(
            "number of unique id values must match number of occurrences of "
            f"'{get_classifier_keyword()}'")

    # since the order of classifier ids defines the order of classifier
    # value columns in the other SIT tables, sorting is important
    classifiers = classifiers.sort_values(by="id")

    rows_per_classifier_name = classifiers.groupby("name").size()
    duplicated_classifier_names = list(
        rows_per_classifier_name[rows_per_classifier_name > 1].index)
    if duplicated_classifier_names:
        raise ValueError(
            "The following classifier names appear more than one time:"
            f"{duplicated_classifier_names}")

    # a row with no value in the 4th or later columns lists no aggregate
    # members
    has_no_members = pd.isnull(unpacked.iloc[:, 3:]).all(axis=1)

    # classifier values: rows without members, excluding the rows that
    # carry the classifier keyword
    value_rows = unpacked.loc[
        has_no_members & (unpacked["name"] != get_classifier_keyword())]
    classifier_values = pd.DataFrame({
        "classifier_id": value_rows.id,
        "name": value_rows.name,
        "description": value_rows.description
    })

    rows_per_value = classifier_values.groupby(
        ["classifier_id", "name"]).size()
    duplicate_classifier_values = [
        {"classifier_id": key[0], "classifier_value": key[1]}
        for key in list(rows_per_value[rows_per_value > 1].index)]
    if duplicate_classifier_values:
        raise ValueError(
            "The following classifier values are duplicated for the specified "
            f"classifier ids: {duplicate_classifier_values}")

    # aggregates: rows with at least one member value
    classifier_aggregates = unpacked.loc[~has_no_members]
    aggregate_values = []
    for position in range(classifier_aggregates.shape[0]):
        aggregate_row = classifier_aggregates.iloc[position, :]
        members = classifier_aggregates.iloc[position, 3:]
        members = members[~pd.isnull(members)]
        aggregate_values.append({
            "classifier_id": aggregate_row["id"],
            "name": aggregate_row["name"],
            "description": aggregate_row["description"],
            "classifier_values": list(members)
        })

    seen_aggregates = set()
    referenced_values = set()
    for aggregate in aggregate_values:
        classifier_id = aggregate["classifier_id"]
        name = aggregate["name"]
        members = aggregate["classifier_values"]
        if len(members) > len(set(members)):
            raise ValueError(
                "duplicate classifier values detected in aggregate with "
                f"classifier_id: {classifier_id}, name {name}")
        referenced_values.update(
            (classifier_id, member) for member in members)

        aggregate_key = (classifier_id, name)
        if aggregate_key in seen_aggregates:
            raise ValueError("duplicate classifier aggregate detected: "
                             f"classifier_id: {classifier_id}, name {name}")
        seen_aggregates.add(aggregate_key)

    # every value referenced by an aggregate must be defined as a
    # classifier value of the same classifier
    for classifier_id in classifier_values.classifier_id.unique():
        defined_values = set(classifier_values[
            classifier_values.classifier_id == classifier_id].name)
        aggregate_members = {
            value for owner_id, value in referenced_values
            if owner_id == classifier_id}
        missing_aggregate_values = aggregate_members.difference(
            defined_values)
        if missing_aggregate_values:
            raise ValueError(
                "The following aggregate values that are not defined as "
                f"classifier values in the classifier with id {classifier_id} "
                f"were found: {missing_aggregate_values}.")

    return classifiers, classifier_values, aggregate_values