def _scale_observations_relative_to_interface(self, processor: Processor, interface_name: str,
                                              scale: Union[int, float]):
    for factor in processor.factors:
        for observation in factor.quantitative_observations:
            relative_to_interface = observation.attributes.get("relative_to", None)
            if relative_to_interface and strcmp(relative_to_interface.name, interface_name):
                observation.value = float(observation.value) * scale
                observation.attributes["relative_to"] = None
def _constrains_interface(self,
                          scale: str,
                          invoking_interface_name: str,
                          requested_interface_name: str,
                          parent_processor: Processor,
                          child_processor: Processor):
    for f in parent_processor.factors:
        if strcmp(f.name, invoking_interface_name):
            origin_factor = f
            break
    else:
        raise Exception("Invoking interface name '" + invoking_interface_name +
                        "' not found for processor '" + parent_processor.name + "'")

    for f in child_processor.factors:
        if strcmp(f.name, requested_interface_name):
            destination_factor = f
            break
    else:
        raise Exception("Requested interface name '" + requested_interface_name +
                        "' not found for processor '" + child_processor.name + "'")

    relationship = FactorsRelationScaleObservation.create_and_append(origin=origin_factor,
                                                                     destination=destination_factor,
                                                                     observer=None,
                                                                     quantity=scale)

    # relationship = ProcessorsRelationUpscaleObservation.create_and_append(parent=parent_processor,
    #                                                                       child=child_processor,
    #                                                                       observer=None,
    #                                                                       factor_name=interface_name,
    #                                                                       quantity=scale)

    self._glb_idx.put(relationship.key(), relationship)
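# Illustrative note: the scale relation registered above constrains the requested (destination)
# interface to the invoking (origin) interface value multiplied by "quantity". For example,
# with an origin observation of 100 and a Scale of 0.25 (hypothetical values):
#
#   destination_value = 100 * 0.25   # -> 25.0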
def execute(self, state: "State"):
    issues = []

    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    # List of available dataset names. The newly defined datasets must not be in this list
    ds_names = [ds.code for ds in datasets.values()]

    # Process parsed information
    for r, line in enumerate(self._content["items"]):
        # A dataset
        dataset_name = line["name"]
        # Find it in the already available datasets. MUST EXIST
        for n in ds_names:
            if strcmp(dataset_name, n):
                df = pd.read_json(StringIO(line["values"]), orient="split")
                # Check columns
                ds = datasets[n]
                iss = prepare_dataframe_after_external_read(ds, df)
                for issue in iss:
                    issues.append(
                        Issue(itype=3,
                              description=issue,
                              location=IssueLocation(sheet_name=name, row=-1, column=-1)))
                # Everything ok? Store the dataframe!
                if len(iss) == 0:
                    ds.data = df
                break
        else:
            issues.append(
                Issue(itype=3,
                      description="Metadata for the dataset '" + dataset_name + "' must be defined previously",
                      location=IssueLocation(sheet_name=name, row=-1, column=-1)))

    return issues, None
def process_row(item):
    """
    Process a dictionary representing a row of the data input command. The dictionary can come
    directly from the worksheet or from a dataset.

    Implicitly uses "glb_idx"

    :param item: dictionary
    """
    # Gather variables in one dictionary
    fields_value = {k: item.get(k, v.default_value) for k, v in fields.items()}

    # Check if mandatory fields with no value exist
    for field in [k for k, v in fields.items() if v.mandatory and not fields_value[k]]:
        add_issue(IType.error(), f"Mandatory field '{field}' is empty. Skipped.")
        return

    # Interface
    f_alias = fields_value.get("alias")
    f_processor_name = fields_value.get("processor")
    f_interface_type_name = fields_value.get("interface_type")
    f_interface_name = fields_value.get("interface")  # A "simple_ident", optional
    f_location = fields_value.get("location")
    f_orientation = fields_value.get("orientation")
    # f_roegen_type = fields_value.get("roegen_type")
    # f_sphere = fields_value.get("sphere")
    # f_opposite_processor_type = fields_value.get("opposite_processor_type")
    # f_geolocation_ref = fields_value.get("geolocation_ref")
    # f_geolocation_code = fields_value.get("geolocation_code")

    # Qualified Quantity
    f_value = fields_value.get("value")
    f_unit = fields_value.get("unit")
    f_uncertainty = fields_value.get("uncertainty")
    f_assessment = fields_value.get("assessment")
    f_pedigree_matrix = fields_value.get("pedigree_matrix")
    f_pedigree = fields_value.get("pedigree")
    f_relative_to = fields_value.get("relative_to")
    f_time = fields_value.get("time")
    f_source = fields_value.get("qq_source")
    f_number_attributes = fields_value.get("number_attributes", {})
    f_comments = fields_value.get("comments")

    # Transform text of "interface_attributes" into a dictionary
    field_val = fields_value.get("interface_attributes")
    if field_val:
        try:
            fields_value["interface_attributes"] = dictionary_from_key_value_list(field_val, glb_idx)
        except Exception as e:
            add_issue(IType.error(), str(e))
            return
    else:
        fields_value["interface_attributes"] = {}

    # Transform text of "number_attributes" into a dictionary
    if f_number_attributes:
        try:
            number_attributes = dictionary_from_key_value_list(f_number_attributes, glb_idx)
        except Exception as e:
            add_issue(IType.error(), str(e))
            return
    else:
        number_attributes = {}

    # f_processor_name -> p
    # f_interface_type_name -> it
    # f_interface_name -> i
    #
    # IF NOT i AND it AND p => i_name = it.name => get or create "i"
    # IF i AND it AND p => get or create "i", IF "i" exists, i.it MUST BE equal to "it" (IF NOT, error)
    # IF i AND p AND NOT it => get "i" (MUST EXIST)
    if not f_interface_name:
        if not f_interface_type_name:
            add_issue(IType.error(), "At least one of InterfaceType or Interface must be defined")
            return
        possibly_local_interface_name = None
        f_interface_name = f_interface_type_name
    else:
        possibly_local_interface_name = f_interface_name

    # Check existence of PedigreeMatrix, if used
    if f_pedigree_matrix and f_pedigree:
        pm = glb_idx.get(PedigreeMatrix.partial_key(name=f_pedigree_matrix))
        if len(pm) == 0:
            add_issue(IType.error(), "Could not find Pedigree Matrix '" + f_pedigree_matrix + "'")
            return
        else:
            try:
                lst = pm[0].get_modes_for_code(f_pedigree)
            except:
                add_issue(IType.error(), "Could not decode Pedigree '" + f_pedigree +
                          "' for Pedigree Matrix '" + f_pedigree_matrix + "'")
                return
    elif f_pedigree and not f_pedigree_matrix:
        add_issue(IType.error(), "Pedigree specified without accompanying Pedigree Matrix")
        return

    # Source
    if f_source:
        try:
            ast = parser_field_parsers.string_to_ast(parser_field_parsers.reference, f_source)
            ref_id = ast["ref_id"]
            references = glb_idx.get(ProvenanceReference.partial_key(ref_id))
            if len(references) == 1:
                source = references[0]
            else:
                references = glb_idx.get(BibliographicReference.partial_key(ref_id))
                if len(references) == 1:
                    source = references[0]
                else:
                    add_issue(IType.error(), f"Reference '{f_source}' not found")
        except:  # TODO Change when Ref* are implemented
            source = f_source + " (not found)"
    else:
        source = None

    # Geolocation
    if f_location:
        try:
            # TODO Change to parser for Location (includes references, but also Codes)
            ast = parser_field_parsers.string_to_ast(parser_field_parsers.reference, f_location)
            ref_id = ast["ref_id"]
            references = glb_idx.get(GeographicReference.partial_key(ref_id))
            if len(references) == 1:
                geolocation = references[0]
            else:  # Fall back to the literal location when the reference is not uniquely found
                geolocation = f_location
        except:
            geolocation = f_location
    else:
        geolocation = None

    # Find Processor
    # TODO Allow creating a basic Processor if it is not found?
    p = glb_idx.get(Processor.partial_key(f_processor_name))
    if len(p) == 0:
        add_issue(IType.error(), "Processor '" + f_processor_name + "' not declared previously")
        return
    elif len(p) > 1:
        add_issue(IType.error(), "Processor '" + f_processor_name + "' found " + str(len(p)) +
                  " times. It must be uniquely identified.")
        return
    else:
        p = p[0]

    # Try to find Interface
    ft: FactorType = None
    f = glb_idx.get(Factor.partial_key(processor=p, name=f_interface_name))
    if len(f) == 1:
        f = f[0]
        ft: FactorType = f.taxon
        if f_interface_type_name:
            if not strcmp(ft.name, f_interface_type_name):
                add_issue(IType.warning(),
                          f"The InterfaceType of the Interface, {ft.name} "
                          f"is different from the specified InterfaceType, {f_interface_type_name}. Record skipped.")
                return
    elif len(f) > 1:
        add_issue(IType.error(),
                  f"Interface '{f_interface_name}' found {str(len(f))} times. "
                  f"It must be uniquely identified.")
        return
    elif len(f) == 0:
        f: Factor = None  # Does not exist, create it below
        if not f_orientation:
            add_issue(IType.error(), f"Orientation must be defined for new Interfaces")
            return

    # InterfaceType still not found
    if not ft:
        # Find FactorType
        # TODO Allow creating a basic FactorType if it is not found
        ft = glb_idx.get(FactorType.partial_key(f_interface_type_name))
        if len(ft) == 0:
            add_issue(IType.error(), f"InterfaceType '{f_interface_type_name}' not declared previously")
            return
        elif len(ft) > 1:
            add_issue(IType.error(),
                      f"InterfaceType '{f_interface_type_name}' found {str(len(ft))} times. "
                      f"It must be uniquely identified.")
            return
        else:
            ft = ft[0]

    if not f:
        # Get attributes default values taken from Interface Type or Processor attributes
        default_values = {
            # "orientation": ft.orientation,
            "sphere": ft.sphere,
            "roegen_type": ft.roegen_type,
            "opposite_processor_type": p.subsystem_type
        }

        # Get internal and user-defined attributes in one dictionary
        attributes = {k: ifnull(fields_value[k], default_values.get(k, None))
                      for k, v in fields.items() if v.attribute_of == Factor}
        attributes.update(fields_value["interface_attributes"])

        f = Factor.create_and_append(f_interface_name,
                                     p,
                                     in_processor_type=FactorInProcessorType(external=False, incoming=False),
                                     taxon=ft,
                                     geolocation=f_location,
                                     tags=None,
                                     attributes=attributes)
        glb_idx.put(f.key(), f)

    # Find Observer
    oer = glb_idx.get(Observer.partial_key(f_source))
    if not oer:
        add_issue(IType.warning(), f"Observer '{f_source}' has not been found.")
    else:
        oer = oer[0]

    if f_relative_to:
        ast = parser_field_parsers.string_to_ast(parser_field_parsers.factor_unit, f_relative_to)
        relative_to_interface_name = ast_to_string(ast["factor"])

        rel_unit_name = ast["unparsed_unit"]
        try:
            f_unit = str((ureg(f_unit) / ureg(rel_unit_name)).units)
        except (UndefinedUnitError, AttributeError) as ex:
            add_issue(IType.error(), f"The final unit could not be computed, interface '{f_unit}' / "
                                     f"relative_to '{rel_unit_name}': {str(ex)}")
            return

        f_relative_to = first(f.processor.factors, lambda ifc: strcmp(ifc.name, relative_to_interface_name))

        if not f_relative_to:
            add_issue(IType.error(), f"Interface specified in 'relative_to' column "
                                     f"'{relative_to_interface_name}' has not been found.")
            return

    # Create quantitative observation
    if f_value:
        # If an observation exists then "time" is mandatory
        if not f_time:
            add_issue(IType.error(), f"Field 'time' needs to be specified for the given observation.")
            return

        o = _create_or_append_quantitative_observation(f, f_value, f_unit, f_uncertainty, f_assessment,
                                                       f_pedigree, f_pedigree_matrix, oer, f_relative_to,
                                                       f_time, None, f_comments, None, number_attributes)
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition

    # Obtain the source
    source = obtain_dataset_source(dataset_name)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)

    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(data={k: None for k in dims[d].code_list.keys()})  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            tmp = create_dictionary(data={to["d"]: None for o in mappings[m].map for to in o["to"] if to["d"]})
            cl[mappings[m].destination] = tmp  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    out_dims = []

    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue

        if col_name.lower().strip() in ["resultdimensions", "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(Issue(itype=3,
                                        description="The dimension specified for output, '" + d +
                                                    "' is neither a dataset dimension nor a mapped dimension. [" +
                                                    ', '.join([d2 for d2 in cl]) + "]",
                                        location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures", "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    issues.append(Issue(itype=3,
                                        description="The specified measure, '" + m +
                                                    "' is not a measure available in the dataset. [" +
                                                    ', '.join([m2 for m2 in meas]) + "]",
                                        location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in ["resultmeasuresaggregation", "resultmeasuresaggregator", "aggregation"]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue
                if f.lower() not in ["sum", "avg", "count", "sumna", "countav", "avgna", "pctna"]:
                    issues.append(Issue(itype=3,
                                        description="The specified aggregation function, '" + f +
                                                    "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                                        location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in ["resultmeasurename", "resultmeasuresnames", "resultmeasuresas", "measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    issues.append(Issue(itype=3,
                                        description="The code '" + cd +
                                                    "' is not present in the codes declared for dimension '" +
                                                    col_name + "'. Please, check them.",
                                        location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in ["startperiod", "endperiod"]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["outputdatasetname", "outputdataset", "result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident, result_name)
                except:
                    issues.append(Issue(itype=3,
                                        description="Column '" + col_name + "' has an invalid dataset name '" +
                                                    result_name + "'",
                                        location=IssueLocation(sheet_name=name, row=2, column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(Issue(itype=2,
                            description="No aggregation function specified. Assuming 'average'",
                            location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(Issue(itype=3,
                                description="The aggregation function must be defined in the first row",
                                location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))

    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Uniform rows, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if measure and not agg_func or not measure and agg_func:
            issues.append(Issue(itype=3,
                                description="Each measure must be associated with an aggregation function",
                                location=IssueLocation(sheet_name=name, row=r, column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [v["measure_as"] for v in out_measures.values() if v["measure_as"]]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(Issue(itype=3,
                            description="At least one measure should be specified",
                            location=IssueLocation(sheet_name=name, row=1, column=measure_names_column)))

    # measures != agg_funcs && len(agg_funcs) == 1 --> OK
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(Issue(itype=3,
                            description="There must be one aggregation function (used for all measures) or one aggregation per measure",
                            location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(Issue(itype=2,
                            description="No result name specified. Assuming '" + result_name + "'",
                            location=IssueLocation(sheet_name=name, row=2, column=c + 1)))

    content = {"dataset_source": source,
               "dataset_name": dataset_name,
               "dataset_datetime": available_at_datetime,
               "where": filter_,
               "dimensions": [d for d in dims],
               "group_by": out_dims,
               "measures": measures,
               "agg_funcs": agg_funcs,
               "measures_as": measures_as,
               "result_name": result_name
               }
    return issues, None, content
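# Illustrative example (hypothetical worksheet values): a query selecting one measure, grouped
# by one dimension, parses into a "content" dict shaped like:
#
#   {"dataset_source": "Eurostat", "dataset_name": "nrg_bal_c", "dataset_datetime": None,
#    "where": {"GEO": ["ES", "PT"], "StartPeriod": 2010},
#    "dimensions": [...], "group_by": ["GEO"],
#    "measures": ["OBS_VALUE"], "agg_funcs": ["sum"], "measures_as": ["OBS_VALUE_sum"],
#    "result_name": "Eurostat_nrg_bal_c"}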
def _process_row(self, row: Dict[str, Any]):
    self._current_row_number = row["_row"]
    self._fields_values = self._get_command_fields_values(row)

    self._check_all_mandatory_fields_have_values()

    scaling_type = self._fields_values["scaling_type"]
    scale: str = self._fields_values["scale"]

    # Find processors
    invoking_processor = self._get_processor_from_field("invoking_processor")
    requested_processor = self._get_processor_from_field("requested_processor")

    invoking_interface_name: str = self._fields_values["invoking_interface"]
    requested_interface_name: str = self._fields_values["requested_interface"]
    requested_new_processor_name: str = self._fields_values["new_processor_name"]

    print(f"Invoking: {invoking_processor.name}:{invoking_interface_name}, "
          f"Requested: {requested_processor.name}:{requested_interface_name}")

    if strcmp(scaling_type, "CloneAndScale"):
        # TODO: check "RequestedProcessor" must be an archetype
        # 1. Clones "RequestedProcessor" as a child of "InvokingProcessor"
        requested_processor_clone = self._clone_processor_as_child(processor=requested_processor,
                                                                   parent_processor=invoking_processor,
                                                                   name=requested_new_processor_name)

        # 2. Constrains the value of "RequestedInterface" to the value of "InvokingInterface", scaled by "Scale"
        self._constrains_interface(scale=scale,
                                   invoking_interface_name=invoking_interface_name,
                                   requested_interface_name=requested_interface_name,
                                   parent_processor=invoking_processor,
                                   child_processor=requested_processor_clone)

    elif strcmp(scaling_type, "Scale"):
        # Processors must be of same type (archetype or instance)
        if not strcmp(invoking_processor.instance_or_archetype, requested_processor.instance_or_archetype):
            raise CommandExecutionError("Requested and invoking processors should be of the same type "
                                        "(both instance or archetype)")

        # 1. Constrains the value of "RequestedInterface" to the value of "InvokingInterface", scaled by "Scale"
        self._constrains_interface(scale=scale,
                                   invoking_interface_name=invoking_interface_name,
                                   requested_interface_name=requested_interface_name,
                                   parent_processor=invoking_processor,
                                   child_processor=requested_processor)

    elif strcmp(scaling_type, "CloneScaled"):
        # "RequestedProcessor" must be an archetype
        # if not strcmp(requested_processor.instance_or_archetype, "archetype"):
        #     raise CommandExecutionError(f"Requested processor '{requested_processor.name}' should be of type 'archetype'")

        # "InvokingProcessor" must be an instance
        # if not strcmp(invoking_processor.instance_or_archetype, "instance"):
        #     raise CommandExecutionError(f"Invoking processor '{invoking_processor.name}' should be of type 'instance'")

        # 1. Clones "RequestedProcessor" as a child of "InvokingProcessor"
        # 2. Scales the new processor using "Scale" as the value of "RequestedInterface"
        requested_processor_clone = self._clone_processor_as_child(processor=requested_processor,
                                                                   parent_processor=invoking_processor)

        # The value of Scale, which can be an expression, must be evaluated (AST) into a final float number
        scale_value = self._get_scale_value(scale)

        # In the cloned processor, search all interfaces for Observations relative_to "RequestedInterface"
        # and multiply each observation by the computed scale.
        self._scale_observations_relative_to_interface(processor=requested_processor_clone,
                                                       interface_name=requested_interface_name,
                                                       scale=scale_value)
def execute(self, state: "State"):
    """
    First bring the data considering the filter
    Second, group, third aggregate
    Finally, store the result in State
    """
    issues = []

    # Obtain global variables in state
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # DS Source + DS Name
    source = self._content["dataset_source"]
    dataset_name = self._content["dataset_name"]
    dataset_datetime = self._content["dataset_datetime"]

    # Result name
    result_name = self._content["result_name"]
    if result_name in datasets or state.get(result_name):
        issues.append((2, "A dataset called '" + result_name + "' is already stored in the registry of datasets"))

    # Dataset metadata
    dims, attrs, measures = obtain_dataset_metadata(dataset_name, source)

    # Obtain filter parameters
    params = create_dictionary()  # Native dimension name to list of values the filter will allow to pass
    joined_dimensions = []
    for dim in self._content["where"]:
        lst = self._content["where"][dim]
        native_dim = None
        if dim.lower() in ["startperiod", "endperiod"]:
            native_dim = dim
            lst = [lst]
        elif dim not in dims:
            # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR
            for m in mappings:
                if strcmp(mappings[m].destination, dim) and \
                        strcmp(mappings[m].source, source) and \
                        strcmp(mappings[m].dataset, dataset_name) and \
                        mappings[m].origin in dims:
                    joined_dimensions.append(mappings[m].destination)  # Store dimension in the original case
                    native_dim = mappings[m].origin
                    lst = obtain_reverse_codes(mappings[m].map, lst)
                    break
        else:
            # Get the dimension name with the original case
            native_dim = dims[dim].name
        if native_dim:
            if native_dim not in params:
                f = set()
                params[native_dim] = f
            else:
                f = params[native_dim]
            f.update(lst)

    # Convert param contents from set to list
    for p in params:
        params[p] = [i for i in params[p]]

    # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    ds = backend.data_source_manager.get_dataset_filtered(source, dataset_name, params)
    df = ds.data

    # Join with mapped dimensions (augment it)
    mapping_dict = create_dictionary()
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # mapping_tuples.append((mappings[m].origin, mappings[m].destination, mappings[m].map))
            mapping_dict[mappings[m].origin] = (mappings[m].destination,
                                                {d["o"]: d["to"] for d in mappings[m].map})

    df = augment_dataframe_with_mapped_columns(df, mapping_dict, ["value"])

    # Aggregate (If any dimension has been specified)
    if len(self._content["group_by"]) > 0:
        # Column names where data is
        # HACK: for the case where the measure has been named "obs_value", use "value"
        values = [m.lower() if m.lower() != "obs_value" else "value" for m in self._content["measures"]]
        # TODO: use metadata name (e.g. "OBS_VALUE") instead of hardcoded "value"
        # values = self._content["measures"]
        out_names = self._content["measures_as"]
        group_by_dims = translate_case(self._content["group_by"], params)  # Group by dimension names
        lcase_group_by_dims = [d.lower() for d in group_by_dims]
        # Now joined_dimensions
        for d in joined_dimensions:
            if d.lower() in lcase_group_by_dims:
                # Find and replace
                for i, d2 in enumerate(group_by_dims):
                    if strcmp(d, d2):
                        group_by_dims[i] = d
                        break

        agg_funcs = []  # Aggregation functions
        agg_names = {}
        for f in self._content["agg_funcs"]:
            if f.lower() in ["avg", "average"]:
                agg_funcs.append(np.average)
                agg_names[np.average] = "avg"
            elif f.lower() in ["sum"]:
                agg_funcs.append(np.sum)
                agg_names[np.sum] = "sum"
            elif f.lower() in ["count"]:
                agg_funcs.append(np.size)
                agg_names[np.size] = "count"
            elif f.lower() in ["sumna"]:
                agg_funcs.append(np.nansum)
                agg_names[np.nansum] = "sumna"
            elif f.lower() in ["countav"]:
                agg_funcs.append("count")
                agg_names["count"] = "countav"
            elif f.lower() in ["avgna"]:
                agg_funcs.append(np.nanmean)
                agg_names[np.nanmean] = "avgna"
            elif f.lower() in ["pctna"]:
                agg_funcs.append(pctna)
                agg_names[pctna] = "pctna"

        # Calculate Pivot Table. The columns are a combination of values x aggregation functions
        # For instance, if two values ["v1", "v2"] and two agg. functions ["avg", "sum"] are provided
        # The columns will be: [["average", "v1"], ["average", "v2"], ["sum", "v1"], ["sum", "v2"]]
        try:
            # Check that all "group_by_dims" on which pivot table aggregates are present in the input "df"
            # If not, either synthesize them (only if there is a single filter value) or remove them (if not present)
            for r in group_by_dims.copy():
                df_columns_dict = create_dictionary(data={c: None for c in df.columns})
                if r not in df_columns_dict:
                    found = False
                    for k in params:
                        if strcmp(k, r):
                            found = True
                            if len(params[k]) == 1:
                                df[k] = params[k][0]
                            else:
                                group_by_dims.remove(r)
                                issues.append((2, "Dimension '" + r +
                                               "' removed from the list of dimensions because it is not present in the raw input dataset."))
                            break
                    if not found:
                        group_by_dims.remove(r)
                        issues.append((2, "Dimension '" + r +
                                       "' removed from the list of dimensions because it is not present in the raw input dataset."))

            # Create and register Hierarchy objects from origin Dataset dimensions: state, ds
            ds_columns_dict = create_dictionary(data={c.code: c.code for c in ds.dimensions})
            for r in group_by_dims:
                if r in ds_columns_dict:
                    # Create hierarchy local to the dataset
                    for d in ds.dimensions:
                        if strcmp(r, d.code):
                            if d.code_list:
                                h = convert_code_list_to_hierarchy(d.code_list)
                                h.name = result_name + "_" + r
                                glb_idx.put(h.key(), h)
                                break

            # Pivot table using Group by
            if True:
                groups = df.groupby(by=group_by_dims, as_index=False)  # Split
                d = OrderedDict([])
                lst_names = []
                if len(values) == len(agg_funcs):
                    for i, (value, agg_func) in enumerate(zip(values, agg_funcs)):
                        if len(out_names) == len(values) and out_names[i]:
                            lst_names.append(out_names[i])
                        else:
                            lst_names.append(agg_names[agg_func] + "_" + value)
                        lst = d.get(value, [])
                        lst.append(agg_func)
                        d[value] = lst
                else:
                    for value in values:
                        lst = d.get(value, [])
                        for agg_func in agg_funcs:
                            lst.append(agg_func)
                            lst_names.append(agg_names[agg_func] + "_" + value)
                        d[value] = lst

                # Print NaN values for each value column
                for value in set(values):
                    cnt = df[value].isnull().sum()
                    print("NA count for col '" + value + "': " + str(cnt) + " of " + str(df.shape[0]))

                # AGGREGATE !!
                df2 = groups.agg(d)

                # Rename the aggregated columns
                df2.columns = group_by_dims + lst_names
            else:
                # Pivot table
                df2 = pd.pivot_table(df,
                                     values=values,
                                     index=group_by_dims,
                                     aggfunc=[agg_funcs[0]], fill_value=np.NaN, margins=False,
                                     dropna=True)
                # Remove the multiindex in columns
                df2.columns = [col[-1] for col in df2.columns.values]
                # Remove the index
                df2.reset_index(inplace=True)

            # The result, all columns (no index), is stored for later use
            ds.data = df2
        except Exception as e:
            traceback.print_exc()
            issues.append((3, "There was a problem: " + str(e)))

    # Store the dataset in State
    datasets[result_name] = ds

    return issues, None
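# Illustrative sketch (standalone, hypothetical data): the value x aggregation mapping "d" and
# the flat column renaming used above behave like this with pandas groupby/agg:
#
#   df = pd.DataFrame({"geo": ["ES", "ES", "PT"], "value": [1.0, 2.0, 3.0]})
#   groups = df.groupby(by=["geo"], as_index=False)
#   df2 = groups.agg({"value": [np.average, np.sum]})
#   df2.columns = ["geo"] + ["avg_value", "sum_value"]  # -> columns: geo, avg_value, sum_value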
def process_line(item):
    sc_src_hierarchy = item.get("source_hierarchy")
    sc_src_interface_type = item.get("source_interface_type")
    sc_tgt_hierarchy = item.get("target_hierarchy")
    sc_tgt_interface_type = item.get("target_interface_type")
    sc_scale = item.get("scale")
    sc_src_context = item.get("source_context")
    sc_tgt_context = item.get("target_context")
    sc_src_unit = item.get("source_unit")
    sc_tgt_unit = item.get("target_unit")

    # Check the existence of the interface types
    force_create = True
    if force_create:
        pass

    # Check if FactorTypes exist
    fts = []
    for i, (hierarchy, interface_type) in enumerate([(sc_src_hierarchy, sc_src_interface_type),
                                                     (sc_tgt_hierarchy, sc_tgt_interface_type)]):
        m = "origin" if i == 0 else "destination"
        if not interface_type:
            issues.append(Issue(itype=3,
                                description="The " + m + " interface type name has not been specified",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return

        # Check if FactorType exists
        ft = glb_idx.get(FactorType.partial_key(interface_type))
        if len(ft) > 0:
            if len(ft) == 1:
                fts.append(ft[0])
            else:
                if not hierarchy:
                    issues.append(Issue(itype=3,
                                        description="The hierarchy of the " + m +
                                                    " interface type name has not been specified and the interface type name is not unique",
                                        location=IssueLocation(sheet_name=name, row=r, column=None)))
                    return

                for ft2 in ft:
                    if strcmp(ft2.hierarchy.name, hierarchy):
                        fts.append(ft2)

    if len(fts) != 2:
        issues.append(Issue(itype=3,
                            description="Found " + str(len(fts)) +
                                        " interface types in the specification of a scale change",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))
        return

    # Check that the interface types are from different hierarchies (warn if not; not error)
    if fts[0].hierarchy == fts[1].hierarchy:
        issues.append(Issue(itype=2,
                            description="The interface types '" + fts[0].name + "' and '" + fts[1].name +
                                        "' are in the same hierarchy",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))

    # Create the directed Scale (Linear "Transformation") Relationship
    origin = fts[0]
    destination = fts[1]
    FactorTypesRelationUnidirectionalLinearTransformObservation.\
        create_and_append(origin, destination, sc_scale,
                          sc_src_context, sc_tgt_context,
                          Observer.no_observer_specified)
def parse_scale_conversion_command(sh: Worksheet, area: AreaTupleType, name: str = None) -> IssuesLabelContentTripleType:
    """
    Analyze the input area
    Obtain the numerical part
    Read a row above and a column to the left, looking for source (left col) and target (row above) factor types

    FactorTypes do not need to exist previously, they can be created

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
                 command is present
    :return: list of issues (issue_type, message), command label, command content
    """
    def get_subrow(r, c1, c2):
        lst = []
        # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell
        previous = None
        for c in range(c1, c2):
            v = sh.cell(row=r, column=c).value
            if not v:
                if previous:
                    lst.append(previous)
                else:
                    lst.append("")
            else:
                previous = v
                lst.append(v)
        return lst

    def get_subcolumn(c, r1, r2):
        lst = []
        # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell
        # !!! This may not be correct at all times: when a cell is intentionally left blank
        # To solve this, use "sh.merged_cell_ranges" to check if the current cell (r, c) is inside a range
        previous = None
        for r in range(r1, r2):
            v = sh.cell(row=r, column=c).value
            if not v:
                if previous:
                    lst.append(previous)
                else:
                    lst.append("")
            else:
                previous = v
                lst.append(v)
        return lst

    # ---------------------------------------------

    some_error = False
    issues = []

    # Detect the matrix defining scales
    m = binary_mask_from_worksheet(sh, True)  # "True" is to focus on cells containing numbers
    # Locate the matrix with numbers. Assume this defines the labels to consider, they will be around the matrix
    t = obtain_rectangular_submatrices(m)[0]  # Take just the first tuple: U=t[0], D=t[1], L=t[2], R=t[3]
    t = (t[0] + 1, t[1] + 1, t[2] + 1, t[3] + 1)  # The previous calculation is done using Numpy, so it is Zero based. Correct this

    # Obtain the factor type names in the subrow on top of the matrix
    subrow = get_subrow(t[0] - 1, t[2], t[3])

    # Obtain the factor type names in the subcolumn to the left of the matrix
    subcol = get_subcolumn(t[2] - 1, t[0], t[1])

    # Check that we have valid factor type names
    for ft in subrow + subcol:
        try:
            parser_field_parsers.string_to_ast(parser_field_parsers.simple_h_name, ft)
        except:
            some_error = True
            issues.append((3, "'" + ft + "' is not a valid Factor Type name"))
    if some_error:
        return issues, None, None

    # Scan the matrix, creating scale records
    scales = []
    for i, r in enumerate(range(t[0], t[1])):
        for j, c in enumerate(range(t[2], t[3])):
            v = sh.cell(row=r, column=c).value
            if v:
                if not isinstance(v, str):
                    v = str(v)
                # Origin factor
                origin = subcol[i]
                # Destination factor
                destination = subrow[j]
                if strcmp(origin, destination):
                    issues.append((3, "A change of scale to the same factor type (" + origin + ") is not allowed"))
                else:
                    try:
                        parser_field_parsers.string_to_ast(parser_field_parsers.expression_with_parameters, v)
                        # Add the scale
                        scales.append(dict(origin=origin, destination=destination, scale=v))
                    except:
                        issues.append((3, "The expression '" + v + "' at the intersection of factor types " +
                                       origin + " and " + destination + " is syntactically incorrect"))

    content = {"origin_factor_types": subcol,
               "destination_factor_types": subrow,
               "scales": scales
               }
    return issues, None, content
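# Illustrative sketch (hypothetical helper, mirroring get_subrow/get_subcolumn above): merged
# header cells are forward-filled with the last non-empty value, so one label spanning three
# cells is read as three repeated labels.
def _forward_fill_example(cells):
    previous, out = None, []
    for v in cells:
        if v:
            previous = v
            out.append(v)
        else:
            out.append(previous if previous else "")
    return out

# _forward_fill_example(["Energy", None, None, "Water"]) -> ["Energy", "Energy", "Energy", "Water"]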
def execute(self, state: "State"):
    """
    First bring the data considering the filter
    Second, group, third aggregate
    Finally, store the result in State
    """
    issues = []

    # Obtain global variables in state
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # DS Source + DS Name
    source = self._content["dataset_source"]
    dataset_name = self._content["dataset_name"]

    # Result name
    result_name = self._content["result_name"]
    if result_name in datasets or state.get(result_name):
        issues.append((2, "A dataset called '" + result_name + "' is already stored in the registry of datasets"))

    # Dataset metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)

    # Obtain filter parameters
    params = create_dictionary()  # Native dimension name to list of values the filter will allow to pass
    joined_dimensions = []
    for dim in self._content["where"]:
        lst = self._content["where"][dim]
        native_dim = None
        if dim.lower() in ["startperiod", "endperiod"]:
            native_dim = dim
            lst = [lst]
        elif dim not in dims:
            # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR
            for m in mappings:
                if strcmp(mappings[m].destination, dim) and \
                        strcmp(mappings[m].source, source) and \
                        strcmp(mappings[m].dataset, dataset_name) and \
                        mappings[m].origin in dims:
                    joined_dimensions.append(mappings[m].destination)  # Store dimension in the original case
                    native_dim = mappings[m].origin
                    lst = obtain_reverse_codes(mappings[m].map, lst)
                    break
        else:
            # Get the dimension name with the original case
            native_dim = dims[dim].name
        if native_dim:
            if native_dim not in params:
                f = set()
                params[native_dim] = f
            else:
                f = params[native_dim]
            f.update(lst)

    # Convert param contents from set to list
    for p in params:
        params[p] = [i for i in params[p]]

    # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    ds = backend.data_source_manager.get_dataset_filtered(source, dataset_name, params)
    df = ds.data

    # Join with mapped dimensions (augment it)
    # TODO Prepare an "m" containing ALL the mappings affecting "df"
    # TODO df2 = augment_dataframe_with_mapped_columns(df, m, ["value"])
    # TODO Does it allow adding the new column for the dimension, in case it is requested? Probably yes, but test it
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # TODO Change by many-to-many mapping
            # TODO augment_dataframe_with_mapped_columns(df, maps, measure_columns)
            # Elaborate a many to one mapping
            tmp = []
            for el in mappings[m].map:
                for to in el["to"]:
                    if to["d"]:
                        tmp.append([el["o"], to["d"]])
            df_dst = pd.DataFrame(tmp, columns=['sou_rce', mappings[m].destination])
            for di in df.columns:
                if strcmp(mappings[m].origin, di):
                    d = di
                    if not backend.case_sensitive:
                        df[d + "_l"] = df[d].str.lower()
                        d = d + "_l"
                    break
            df = pd.merge(df, df_dst, how='left', left_on=d, right_on='sou_rce')
            del df['sou_rce']
            if not backend.case_sensitive:
                del df[d]

    # Aggregate (If any dimension has been specified)
    if len(self._content["group_by"]) > 0:
        # Column names where data is
        # HACK: for the case where the measure has been named "obs_value", use "value"
        values = [m.lower() if m.lower() != "obs_value" else "value" for m in self._content["measures"]]
        out_names = self._content["measures_as"]
        rows = translate_case(self._content["group_by"], params)  # Group by dimension names
        lcase_rows = [d.lower() for d in rows]
        # Now joined_dimensions
        for d in joined_dimensions:
            if d.lower() in lcase_rows:
                # Find and replace
                for i, d2 in enumerate(rows):
                    if strcmp(d, d2):
                        rows[i] = d
                        break

        aggs = []  # Aggregation functions
        agg_names = {}
        for f in self._content["agg_funcs"]:
            if f.lower() in ["avg", "average"]:
                aggs.append(np.average)
                agg_names[np.average] = "avg"
            elif f.lower() in ["sum"]:
                aggs.append(np.sum)
                agg_names[np.sum] = "sum"
            elif f.lower() in ["count"]:
                aggs.append(np.size)
                agg_names[np.size] = "count"
            elif f.lower() in ["sumna"]:
                aggs.append(np.nansum)
                agg_names[np.nansum] = "sumna"
            elif f.lower() in ["countav"]:
                aggs.append("count")
                agg_names["count"] = "countav"
            elif f.lower() in ["avgna"]:
                aggs.append(np.nanmean)
                agg_names[np.nanmean] = "avgna"
            elif f.lower() in ["pctna"]:
                aggs.append(pctna)
                agg_names[pctna] = "pctna"

        # Calculate Pivot Table. The columns are a combination of values x aggregation functions
        # For instance, if two values ["v1", "v2"] and two agg. functions ["avg", "sum"] are provided
        # The columns will be: [["average", "v1"], ["average", "v2"], ["sum", "v1"], ["sum", "v2"]]
        try:
            # Check that all "rows" on which pivot table aggregates are present in the input "df"
            # If not, either synthesize them (only if there is a single filter value) or remove them (if not present)
            df_columns_dict = create_dictionary(data={c: c for c in df.columns})
            for r in rows.copy():
                if r not in df_columns_dict:
                    found = False
                    for k in params:
                        if strcmp(k, r):
                            found = True
                            if len(params[k]) == 1:
                                df[r] = params[k][0]
                            else:
                                rows.remove(r)
                                issues.append((2, "Dimension '" + r +
                                               "' removed from the list of dimensions because it is not present in the raw input dataset."))
                            break
                    if not found:
                        rows.remove(r)
                        issues.append((2, "Dimension '" + r +
                                       "' removed from the list of dimensions because it is not present in the raw input dataset."))

            # Put proper DIMENSION names
            for ir, r in enumerate(rows):
                if r in df_columns_dict:
                    rows[ir] = df_columns_dict[r]

            # Create and register Hierarchy objects from origin Dataset dimensions: state, ds
            ds_columns_dict = create_dictionary(data={c.code: c.code for c in ds.dimensions})
            for r in rows:
                if r in ds_columns_dict:
                    # Create hierarchy local to the dataset
                    for d in ds.dimensions:
                        if strcmp(r, d.code):
                            if d.code_list:
                                h = convert_code_list_to_hierarchy(d.code_list)
                                h.name = result_name + "_" + r
                                glb_idx.put(h.key(), h)
                                break

            # Pivot table using Group by
            # if True:
            groups = df.groupby(by=rows, as_index=False)  # Split
            d = OrderedDict([])
            lst_names = []
            if len(values) == len(aggs):
                for i, t in enumerate(zip(values, aggs)):
                    v, agg = t
                    if len(out_names) == len(values):
                        if out_names[i]:
                            lst_names.append(out_names[i])
                        else:
                            lst_names.append(agg_names[agg] + "_" + v)
                    else:
                        lst_names.append(agg_names[agg] + "_" + v)
                    lst = d.get(v, [])
                    lst.append(agg)
                    d[v] = lst
            else:
                for v in values:
                    lst = d.get(v, [])
                    for agg in aggs:
                        lst.append(agg)
                        lst_names.append(agg_names[agg] + "_" + v)
                    d[v] = lst

            # Print NaN values for each value column
            for v in set(values):
                cnt = df[v].isnull().sum()
                print("NA count for col '" + v + "': " + str(cnt) + " of " + str(df.shape[0]))

            # AGGREGATE !!
            df2 = groups.agg(d)

            # Rename the aggregated columns
            df2.columns = rows + lst_names
            # else:
            #     # Pivot table
            #     df2 = pd.pivot_table(df,
            #                          values=values,
            #                          index=rows,
            #                          aggfunc=[aggs[0]], fill_value=np.NaN, margins=False,
            #                          dropna=True)
            #     # Remove the multiindex in columns
            #     df2.columns = [col[-1] for col in df2.columns.values]
            #     # Remove the index
            #     df2.reset_index(inplace=True)

            # The result, all columns (no index), is stored for later use
            ds.data = df2
        except Exception as e:
            issues.append((3, "There was a problem with the grouping: " + repr(e)))

    # Store the dataset in State
    datasets[result_name] = ds

    return issues, None
def parse_etl_external_dataset_command(sh: Worksheet, area: AreaTupleType, dataset_name: str, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # Dataset source
    source = obtain_dataset_source(dataset_name)

    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)

    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = [k.lower() for k in dims[d].code_list.keys()]  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            cl[mappings[m].destination] = set(tmp)  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    measures = []
    out_dims = []
    agg_funcs = []
    measures_as = []
    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["dimensions_kept", "dims", "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for d in lst:
                if not d:
                    continue
                if d not in cl:
                    issues.append((3, "The dimension specified for output, '" + d +
                                   "' is neither a dataset dimension nor a mapped dimension. [" +
                                   ', '.join([d2 for d2 in cl]) + "]"))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["aggregation_function", "aggfunc", "agg_func"]:  # "SELECT AGGREGATORS"
            lst = obtain_column(c, area[0] + 1, area[1])
            for f in lst:
                if f.lower() not in ["sum", "avg", "count", "sumna", "countav", "avgna", "pctna"]:
                    issues.append((3, "The specified aggregation function, '" + f +
                                   "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'"))
                else:
                    agg_funcs.append(f)
        elif col_name.lower().strip() in ["measures"]:  # "SELECT"
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for m in lst:
                if not m:
                    continue
                if m not in meas:
                    issues.append((3, "The specified measure, '" + m +
                                   "' is not a measure available in the dataset. [" +
                                   ', '.join([m2 for m2 in meas]) + "]"))
                else:
                    measures.append(m)
        elif col_name.lower().strip() in ["measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for m in lst:
                measures_as.append(m)
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for cd in lst:
                if not cd:
                    continue
                if str(cd).lower() not in cl[col_name]:
                    issues.append((3, "The code '" + cd + "' is not present in the codes declared for dimension '" +
                                   col_name + "'. Please, check them."))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in ["startperiod", "endperiod"]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident, result_name)
                except:
                    issues.append((3, "Column '" + col_name + "' has an invalid dataset name '" + result_name + "'"))

    if len(measures) == 0:
        issues.append((3, "At least one measure should be specified"))

    if len(agg_funcs) == 0:
        issues.append((2, "No aggregation function specified. Assuming 'average'"))
        agg_funcs.append("average")

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append((2, "No result name specified. Assuming '" + result_name + "'"))

    content = {"dataset_source": source,
               "dataset_name": dataset_name,
               "dataset_datetime": None,
               "where": filter_,
               "dimensions": [d for d in dims],
               "group_by": out_dims,
               "measures": measures,
               "agg_funcs": agg_funcs,
               "measures_as": measures_as,
               "result_name": result_name
               }
    return issues, None, content
def process_line(item):
    # Read variables
    ft_h_name = item.get("interface_type_hierarchy", "_default")  # "_default" InterfaceType Hierarchy NAME <<<<<<
    ft_name = item.get("interface_type", None)
    ft_sphere = item.get("sphere", None)
    ft_roegen_type = item.get("roegen_type", None)
    ft_parent = item.get("parent_interface_type", None)
    ft_formula = item.get("formula", None)
    ft_description = item.get("description", None)
    ft_unit = item.get("unit", None)
    # ft_orientation = item.get("orientation", None)
    ft_attributes = item.get("attributes", {})
    if ft_attributes:
        try:
            attributes = dictionary_from_key_value_list(ft_attributes, glb_idx)
        except Exception as e:
            issues.append(Issue(itype=3,
                                description=str(e),
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return
    else:
        attributes = {}

    # Process
    # Mandatory fields
    if not ft_h_name:
        issues.append(Issue(itype=3,
                            description="Empty interface type hierarchy name. Skipped.",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))
        return

    if not ft_name:
        issues.append(Issue(itype=3,
                            description="Empty interface type name. Skipped.",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))
        return

    # Check if a hierarchy of interface types by the name <ft_h_name> exists, if not, create it and register it
    hie = glb_idx.get(Hierarchy.partial_key(name=ft_h_name))
    if not hie:
        hie = Hierarchy(name=ft_h_name, type_name="interfacetype")
        glb_idx.put(hie.key(), hie)
    else:
        hie = hie[0]

    # If parent defined, check if it exists
    # (it must be registered both in the global registry AND in the hierarchy)
    if ft_parent:
        parent = glb_idx.get(FactorType.partial_key(ft_parent))
        if len(parent) > 0:
            for p in parent:
                if p.hierarchy == hie:
                    parent = p
                    break
            if not isinstance(parent, FactorType):
                issues.append(Issue(itype=3,
                                    description="Parent interface type name '" + ft_parent +
                                                "' not found in hierarchy '" + ft_h_name + "'",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
        else:
            issues.append(Issue(itype=3,
                                description="Parent interface type name '" + ft_parent + "' not found",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return
        # Double check, it must be defined in "hie"
        if ft_parent not in hie.codes:
            issues.append(Issue(itype=3,
                                description="Parent interface type name '" + ft_parent +
                                            "' not registered in the hierarchy '" + ft_h_name + "'",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return
    else:
        parent = None

    # Check if FactorType exists
    ft = glb_idx.get(FactorType.partial_key(ft_name))
    if len(ft) == 0:
        # TODO Compile and CONSIDER attributes (on the FactorType side)
        roegen_type = None
        if ft_roegen_type:
            roegen_type = FlowFundRoegenType.flow if strcmp(ft_roegen_type, "flow") else FlowFundRoegenType.fund

        ft = FactorType(ft_name,
                        parent=parent, hierarchy=hie,
                        roegen_type=roegen_type,
                        tags=None,  # No tags
                        attributes=dict(unit=ft_unit, description=ft_description, **ft_attributes),
                        expression=ft_formula,
                        # orientation=ft_orientation,
                        sphere=ft_sphere)
        # Simple name
        glb_idx.put(FactorType.partial_key(ft_name, ft.ident), ft)
        if not strcmp(ft_name, ft.full_hierarchy_name()):
            glb_idx.put(FactorType.partial_key(ft.full_hierarchy_name(), ft.ident), ft)
    else:
        issues.append(Issue(itype=3,
                            description="Interface type name '" + ft_name + "' already registered",
                            location=IssueLocation(sheet_name=name, row=r + 1, column=None)))
        return
def parse_dataset_data_command(sh: Worksheet, area: AreaTupleType, name: str, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """
    issues: List[Issue] = []

    # Analyze column names
    col_map = create_dictionary()
    for c in range(area[2], area[3]):
        col_name = (sh.cell(row=area[0], column=c).value or "").strip()  # Guard against empty header cells
        # Avoid repetitions
        if col_name in col_map:
            issues.append(Issue(itype=3,
                                description="The column name '" + col_name + "' is repeated",
                                location=IssueLocation(sheet_name=name, row=1, column=c)))

        if strcmp(col_name, "DatasetName") or strcmp(col_name, "Dataset"):
            col_map["dataset"] = c
        elif col_name:  # Concept name
            col_map[col_name] = c

    if "dataset" not in col_map:
        issues.append(Issue(itype=3,
                            description="The column name 'DatasetName' is not defined for command 'DatasetData'",
                            location=IssueLocation(sheet_name=name, row=1, column=c)))

    if any([i.itype == 3 for i in issues]):
        return issues, None, None

    # Read all the content into a list of lists
    lines = []
    for r in range(area[0] + 1, area[1]):
        line = []
        for col_name, c in col_map.items():
            v = sh.cell(row=r, column=c).value
            if isinstance(v, str):
                v = v.strip()
            line.append(v)
        lines.append(line)

    # pd.DataFrame
    df = pd.DataFrame(columns=[col_name for col_name in col_map], data=lines)

    # Find the different datasets
    datasets = df["dataset"].unique()
    datasets = set([d.lower() for d in datasets])

    content = []  # The output JSON
    for dataset in datasets:
        # Obtain filtered
        df2 = df.loc[df['dataset'].str.lower() == dataset].copy()  # ".copy()" avoids mutating a view of "df"
        # Convert to JSON and store in content
        del df2["dataset"]
        s = StringIO()
        df2.to_json(s, orient="split")
        content.append(dict(name=dataset, values=s.getvalue()))

    return issues, None, dict(items=content, command_name=name)
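# Illustrative sketch (hypothetical helper, not part of the command set): the per-dataset payload
# built above round-trips through pandas' "split" JSON orientation, which is exactly how the
# companion execute() reads it back with pd.read_json(StringIO(...), orient="split").
def _split_json_roundtrip_example():
    import pandas as pd
    from io import StringIO
    df = pd.DataFrame({"GEO": ["ES"], "value": [42]})
    s = StringIO()
    df.to_json(s, orient="split")
    return pd.read_json(StringIO(s.getvalue()), orient="split")  # Same columns and data as "df"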
def parse_and_unfold_line(item):
    # Consider multiplicity because of:
    # - A dataset (only one). First a list of dataset concepts used in the line is obtained.
    #   Then the unique tuples formed by them are obtained.
    # - Processor name.
    #   - A set of processors (wildcard or filter by attributes)
    #   - A set of interfaces (according to another filter?)
    # - Multiple types of relation
    # - Both (first each dataset record applied -expanded-, then the name evaluation is applied)
    #   - UNRESOLVED: expressions are resolved partially. Parts where parameters
    #     expressions depending on parameters. Only the part of the expression depending on varying things
    # - The processor name could be a concatenation of multiple literals
    #
    # Look for multiple items in r_source_processor_name, r_source_interface_name,
    #                            r_target_processor_name, r_target_interface_name
    if item["_complex"]:
        asts = parse_line(item, fields)
        if item["_expandable"]:
            # It is an expandable line
            # Look for fields which are specified to be variable in order to originate the expansion
            res = classify_variables(asts, datasets, hh, parameters)
            ds_list = res["datasets"]
            ds_concepts = res["ds_concepts"]
            h_list = res["hierarchies"]
            if len(ds_list) >= 1 and len(h_list) >= 1:
                issues.append(Issue(itype=3,
                                    description="Dataset(s): " + ", ".join([d.name for d in ds_list]) +
                                                ", and hierarchy(ies): " + ", ".join([h.name for h in h_list]) +
                                                ", have been specified. Only a single dataset is supported.",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
            elif len(ds_list) > 1:
                issues.append(Issue(itype=3,
                                    description="More than one dataset has been specified: " +
                                                ", ".join([d.name for d in ds_list]) +
                                                ", just one dataset is supported.",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
            elif len(h_list) > 0:
                issues.append(Issue(itype=3,
                                    description="One or more hierarchies have been specified: " +
                                                ", ".join([h.name for h in h_list]),
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
            const_dict = obtain_dictionary_with_literal_fields(item, asts)
            if len(ds_list) == 1:
                # If a measure is requested and not all dimensions are used, aggregate or
                # issue an error (because it is not possible to reduce without aggregation).
                # If only dimensions are used, then obtain all the unique tuples
                ds = ds_list[0]
                measure_requested = False
                all_dimensions = set([c.code for c in ds.dimensions if not c.is_measure])
                for con in ds_concepts:
                    for c in ds.dimensions:
                        if strcmp(c.code, con):
                            if c.is_measure:
                                measure_requested = True
                            else:  # Dimension
                                all_dimensions.remove(c.code)
                only_dimensions_requested = len(all_dimensions) == 0

                if measure_requested and not only_dimensions_requested:
                    issues.append(Issue(itype=3,
                                        description="It is not possible to use a measure if not all dataset dimensions are used (cannot assume implicit aggregation)",
                                        location=IssueLocation(sheet_name=name, row=r, column=None)))
                    return
                elif not measure_requested and not only_dimensions_requested:
                    # TODO Reduce the dataset to the unique tuples (consider the current case -sensitive or not-sensitive-)
                    data = None
                else:  # Take the dataset as-is!!!
                    data = ds.data

                # Each row
                for _, row in data.iterrows():  # ".iterrows()" yields (index, row) tuples; only the row is needed
                    item2 = const_dict.copy()

                    d = {}
                    for c in ds_concepts:
                        d["{" + ds.code + "." + c + "}"] = row[c]
                    # Expand in all fields
                    for f in fields:
                        if f not in const_dict:
                            # Replace all
                            string = item[f]
                            # TODO Could iterate through the variables in the field (not IN ALL FIELDS of the row)
                            # "var", not "item": avoid shadowing the function parameter "item"
                            for var in sorted(d.keys(), key=len, reverse=True):
                                string = re.sub(var, d[var], string)
                            item2[f] = string
                    print("Multiple by dataset: " + str(item2))
                    yield item2
            else:  # No dataset, no hierarchy of categories, but it could be still complex, because of wildcards
                # For now return just the line
                yield item

                # wildcard_in_source = ".." in item.get("source_processor", "")
                # wildcard_in_target = ".." in item.get("target_processor", "")
                # if wildcard_in_source or wildcard_in_target:
                #     r_source_processor_name = string_to_ast(processor_names, item.get("source_processor", None))
                #     r_target_processor_name = string_to_ast(processor_names, item.get("target_processor", None))
                #     if wildcard_in_source:
                #         source_processor_names = obtain_matching_processors(r_source_processor_name, all_processors)
                #     else:
                #         source_processor_names = [item["source_processor"]]
                #     if wildcard_in_target:
                #         target_processor_names = obtain_matching_processors(r_target_processor_name, all_processors)
                #     else:
                #         target_processor_names = [item["target_processor"]]
                #     for s in source_processor_names:
                #         for t in target_processor_names:
                #             item3 = const_dict.copy()
                #             item3["source_processor"] = s
                #             item3["target_processor"] = t
                #             print("Multiple by wildcard: "+str(item3))
                #             yield item3
                # else:
                #     # yield item
                #     raise Exception("If 'complex' is signaled, it should not pass by this line")
    else:
        # print("Single: "+str(item))
        yield item
def execute(self, state: "State"):
    """
    For each parent processor clone all the child processors.
    The cloning process may carry factor observations along with the clones, which may result in
    new Scale relations between the parent and the cloned child Interfaces
    """
    some_error = False
    issues = []

    parent_processor_type = self._content["parent_processor_type"]
    child_processor_type = self._content["child_processor_type"]
    scaled_factor = self._content["scaled_factor"]
    source = self._content["source"]
    # column_headers = self._content["column_headers"]
    # row_headers = self._content["row_headers"]
    scales = self._content["scales"]

    # Find processor sets, for parent and child
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    if parent_processor_type not in p_sets:
        some_error = True
        issues.append((3, "The processor type '" + parent_processor_type +
                       "' (appointed for parent) has not been found in the commands executed so far"))

    if child_processor_type not in p_sets:
        some_error = True
        issues.append((3, "The processor type '" + child_processor_type +
                       "' (appointed for child) has not been found in the commands executed so far"))

    if some_error:
        return issues, None

    # CREATE the Observer of the Upscaling
    oer = glb_idx.get(Observer.partial_key(source))
    if not oer:
        oer = Observer(source)
        glb_idx.put(oer.key(), oer)
    else:
        oer = oer[0]

    # Processor Sets have associated attributes, and each of them has a code list
    # (NOTE: "parent" and "child" are reused below to hold individual Processors)
    parent = p_sets[parent_processor_type]  # type: ProcessorsSet
    child = p_sets[child_processor_type]  # type: ProcessorsSet

    # Form code lists from the command specification
    code_lists = None
    for sc_dict in scales:
        codes = sc_dict["codes"]
        if not code_lists:
            code_lists = [set() for _ in codes]
        for i, c in enumerate(codes):
            code_lists[i].add(c)

    # Match existing code lists (from Processor attributes) with the ones gathered in the specification of
    # the two (parent and child) processors sets.
    # Form lists of attributes of processors used in the code lists
    parent_attrs = []
    child_attrs = []
    matched = []
    for i, cl in enumerate(code_lists):
        found = False
        for attr, attr_values in parent.attributes.items():
            if set(attr_values).issuperset(cl):
                parent_attrs.append((attr, i))  # (Attribute, code list index)
                found = True
                break
        for attr, attr_values in child.attributes.items():
            if set(attr_values).issuperset(cl):
                child_attrs.append((attr, i))  # (Attribute, code list index)
                found = True
                break
        matched.append(found)
    for i, found in enumerate(matched):
        if not found:
            cl = code_lists[i]
            # TODO Try cl as a list of names of parent or child processors
            issues.append((2, "The code list: " + ", ".join(cl) +
                           " is not contained in the attributes of the parent processors set '" +
                           parent_processor_type + "' nor in the attributes of the child processors set '" +
                           child_processor_type + "'"))

    # Execute the upscale for each scale record
    cached_processors = {}
    for sc_dict in scales:
        try:
            non_zero_weight = math.fabs(float(sc_dict["weight"])) > 1e-6
        except:
            non_zero_weight = True
        if not non_zero_weight:
            continue

        codes = sc_dict["codes"]
        # Find parent processor
        parent_dict = {attr: codes[i] for attr, i in parent_attrs}
        d2s = str(parent_dict)
        if d2s in cached_processors:
            parent = cached_processors[d2s]
            if not parent:
                issues.append((3, "Either the tuple (" + d2s + ") did not match any Processor or matched more than one."))
        else:
            parent_dict.update(Processor.partial_key())
            # Obtain the Processor matching the attributes <<<<<<<<<<
            # Query the PartialRetrievalDictionary by attributes
            parents = glb_idx.get(parent_dict)
            if len(parents) > 1:
                issues.append((3, "The tuple (" + str(parent_dict) + ") matches " + str(len(parents)) +
                               " Processors: " + (", ".join([p.name for p in parents]))))
                parent = None
            elif len(parents) == 0:
                issues.append((3, "The tuple (" + str(parent_dict) + ") did not match any Processor"))
                parent = None
            else:
                parent = parents[0]
            cached_processors[d2s] = parent

        # Find child processor
        child_dict = {attr: codes[i] for attr, i in child_attrs}
        d2s = str(child_dict)
        if d2s in cached_processors:
            child = cached_processors[d2s]
            if not child:
                issues.append((3, "Either the tuple (" + d2s + ") did not match any Processor or matched more than one."))
        else:
            child_dict.update(Processor.partial_key())
            # Obtain the Processors matching the attributes
            # Query the PartialRetrievalDictionary by attributes
            children = glb_idx.get(child_dict)
            if len(children) > 1:
                issues.append((3, "The tuple (" + str(child_dict) + ") matches " + str(len(children)) +
                               " Processors: " + (", ".join([p.name for p in children]))))
                child = None
            elif len(children) == 0:
                issues.append((3, "The tuple (" + str(child_dict) + ") did not match any Processor"))
                child = None
            else:
                child = children[0]  # type: Processor
            cached_processors[d2s] = child

        # Clone child processor (and its descendants) and add an upscale relation between "parent" and the clone
        if parent and child:
            if non_zero_weight:
                # Clone the child processor
                # TODO
                cloned_child = child.clone(state=glb_idx)
                glb_idx.put(cloned_child.key(), cloned_child)

                # Create the new Relation Observations
                # - Part-of Relation
                o1 = ProcessorsRelationPartOfObservation.create_and_append(parent, cloned_child, oer)  # Part-of
                glb_idx.put(o1.key(), o1)
                # - Upscale Relation
                quantity = str(sc_dict["weight"])
                if True:  # Currently a Scale relation between Interfaces is always used; "else" kept for reference
                    # Find Interface named "scaled_factor"
                    for f in parent.factors:
                        if strcmp(f.name, scaled_factor):
                            origin = f
                            break
                    else:
                        origin = None
                    for f in cloned_child.factors:
                        if strcmp(f.name, scaled_factor):
                            destination = f
                            break
                    else:
                        destination = None

                    if origin and destination:
                        o3 = FactorsRelationScaleObservation.create_and_append(origin, destination,
                                                                               observer=None, quantity=quantity)
                        glb_idx.put(o3.key(), o3)
                    else:
                        raise Exception("Could not find Interfaces to define a Scale relation. Processors: " +
                                        parent.name + ", " + cloned_child.name +
                                        "; Interface name: " + scaled_factor)
                else:
                    o3 = ProcessorsRelationUpscaleObservation.create_and_append(parent, cloned_child,
                                                                                observer=None,
                                                                                factor_name=scaled_factor,
                                                                                quantity=quantity)
                    glb_idx.put(o3.key(), o3)
        else:
            # TODO
            parent_dict = str({attr: codes[i] for attr, i in parent_attrs})
            child_dict = str({attr: codes[i] for attr, i in child_attrs})
            if not parent and child:
                issues.append((2, "Could not find parent Processor matching attributes: " + parent_dict))
            elif not child and parent:
                issues.append((2, "Could not find child Processor matching attributes: " + child_dict))
            else:
                issues.append((2, "Could not find parent Processor matching attributes: " + parent_dict +
                               ", nor child Processor matching attributes: " + child_dict))

    return issues, None
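# Illustrative sketch (NOT part of the original module): the "for ... else" lookup
# pattern used above to find an Interface by name. The _Factor stand-in and the
# names are hypothetical.
def _example_for_else_lookup():
    """A minimal sketch of Python's "for ... else": the "else" block runs only
    when the loop finishes without hitting "break", leaving the result as None."""
    class _Factor:
        def __init__(self, name):
            self.name = name

    factors = [_Factor("LandUse"), _Factor("HumanActivity")]
    for f in factors:
        if f.name.lower() == "humanactivity":  # strcmp above compares case-insensitively
            origin = f
            break
    else:
        origin = None
    assert origin is factors[1]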
def execute(self, state: "State"):
    """
    Create a Hierarchy of Taxon. The exact form of this hierarchy depends on the concept:
    * FactorTypes and Categories use Hierarchies, which are intrinsic.
        The hierarchy name is passed to the containing Hierarchy object
    * Processors use Part-Of Relations. In this case, the hierarchy name is lost
    Names of Processor and FactorTypes are built both in hierarchical and simple form
    The hierarchical name is all the ancestors from the root down to the current node, separated by "."
    The simple name is just the current node. If there is already another concept with that name, the
    simple name is not stored (TODO store both concepts under the same name and design a tie-breaking mechanism?)
    """
    issues = []
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    # Process parsed information
    for item in self._content["items"]:
        r = item["_row"]
        # HierarchySource (Optional)
        hsource = item.get("source", None)  # Code of entity defining the Hierarchy
        if hsource:
            tmp = hsource
            hsource = glb_idx.get(HierarchySource.partial_key(name=hsource))
            if len(hsource) == 0:
                hsource = HierarchySource(name=tmp)
                glb_idx.put(hsource.key(), hsource)
            else:
                hsource = hsource[0]

        hname = item.get("hierarchy_name", None)
        if not hname:
            issues.append(Issue(itype=3,
                                description="The name of the Hierarchy has not been defined. Skipped.",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            continue

        # HierarchyGroup (equivalent to Hierarchy of Code Lists, HCL)
        hg = item.get("hierarchy_group", None)
        if hg:
            is_code_list = False  # Hierarchy group
        else:
            is_code_list = True  # Hierarchy group for the Code List, with the same name
            hg = hname

        # Check if the HierarchyGroup is previously defined. YES, use it; NO, create a new HierarchyGroup
        tmp = hg
        hg = glb_idx.get(HierarchyGroup.partial_key(name=hg))
        if len(hg) == 0:
            hg = HierarchyGroup(name=tmp, source=hsource)
            glb_idx.put(hg.key(), hg)
        else:
            hg = hg[0]

        # Check if the Hierarchy is defined. YES, get it; NO, create it
        tmp = hname
        h = glb_idx.get(Hierarchy.partial_key(name=hname))
        if len(h) == 0:
            h = Hierarchy(name=tmp)
            glb_idx.put(h.key(), h)
            glb_idx.put(h.key(hg.name + "." + h.name), h)  # Register with alternative (full) name
        else:
            h = h[0]

        # Add the Hierarchy to the HierarchyGroup (if not already there)
        if h not in hg.hierarchies:
            hg.hierarchies.append(h)

        # Level
        level = item.get("level", None)
        if level:
            # Check if the level is defined. YES, get it; NO, create it
            for l in h.levels:
                if strcmp(l.name, level):
                    level = l
                    break
            else:
                level = HierarchyLevel(name=level, hierarchy=h)
                h.levels.append(level)

        code = item.get("code", None)
        label = item.get("label", None)
        description = item.get("description", None)
        attributes = item.get("attributes", None)
        expression = item.get("expression", None)

        # Parent property (what really defines Hierarchies)
        parent_code = item.get("parent_code", None)
        if parent_code:
            ph = h  # Parent Hierarchy is the same as the current hierarchy
            pcode = ph.codes.get(parent_code, None)
            if not pcode:
                issues.append(Issue(itype=3,
                                    description="Could not find code '" + parent_code + "' in hierarchy '" +
                                                ph.name + "'. Skipped.",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
        else:
            pcode = None

        # ReferredHierarchy. If we are not defining a Code List, the base hierarchy has to be mentioned
        if not is_code_list:
            ref_hierarchy = item.get("referred_hierarchy", None)
            if not ref_hierarchy:
                issues.append(Issue(itype=3,
                                    description="For HCLs, defining ReferredHierarchy is mandatory",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue

            tmp = ref_hierarchy
            ref_hierarchy = glb_idx.get(Hierarchy.partial_key(name=ref_hierarchy))
            if len(ref_hierarchy) == 0:
                issues.append(Issue(itype=3,
                                    description="ReferredHierarchy '" + tmp + "' not defined previously",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
            else:
                ref_hierarchy = ref_hierarchy[0]

            ref_code = ref_hierarchy.codes.get(code, None)
            if not ref_code:
                issues.append(Issue(itype=3,
                                    description="Code '" + code + "' not found in referred hierarchy '" +
                                                ref_hierarchy.name + "'",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue

            # Ignore: LABEL, DESCRIPTION. Copy them from the referred code
            label = ref_code.label
            description = ref_code.description
        else:
            ref_code = None

        c = h.codes.get(code, None)
        if c:
            issues.append(Issue(itype=3,
                                description="Code '" + code + "' in hierarchy '" + h.name + "' redefined.",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            continue

        # Finally, create the HierarchyCode with all the gathered attributes, then weave it into the hierarchy
        # (name, label=None, description=None, referred_node=None, parent=None, parent_weight=1.0, hierarchy=None)
        c = Taxon(name=code,
                  hierarchy=h,
                  level=level,
                  referred_taxon=ref_code,
                  parent=pcode,
                  label=label,
                  description=description,
                  attributes=attributes,
                  expression=expression)

        # Add the code to the hierarchy
        h.codes[code] = c
        if not c.parent:
            h.roots_append(c)
        # Add the code to the level
        if level:
            level.codes.add(c)
        # Add child to parent code
        # (DONE BY THE CONSTRUCTOR!!)
        # if pcode:
        #     pcode.children_codes.append(c)

    return issues, None  # Issues, Output
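# Illustrative sketch (NOT part of the original module): three hypothetical rows of the
# hierarchy command and the parent/child structure they imply. Only the fields read by
# the code above are shown; names are invented.
def _example_hierarchy_rows():
    """A minimal sketch: rows without "parent_code" become hierarchy roots; the
    others hang from the code previously registered under their "parent_code"."""
    items = [
        dict(_row=2, hierarchy_name="LandUses", code="Agricultural"),
        dict(_row=3, hierarchy_name="LandUses", code="Cropland", parent_code="Agricultural"),
        dict(_row=4, hierarchy_name="LandUses", code="Pasture", parent_code="Agricultural"),
    ]
    roots = [i["code"] for i in items if not i.get("parent_code")]
    assert roots == ["Agricultural"]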
def parse_mapping_command(sh: Worksheet, area: AreaTupleType, origin, destination) -> IssuesLabelContentTripleType:
    """
    Map from a set of categories from an external dataset into a set of MuSIASEM categories
    If the categories do not exist, they are created flat. Later they can be turned into a hierarchy and the
    mapping will still hold

    The syntax of the mapping allows expressing MANY to ONE and also MANY to MANY correspondences.
    The mapping has to be complete (all elements from the left side must be covered; if not, "" is assumed on
    the right side)

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param origin:
    :param destination:
    :return: list of issues (issue_type, message), command label, command content
    """
    some_error = False
    issues = []

    # Analyze Origin
    cell = sh.cell(row=area[0], column=area[2])
    col_name = cell.value
    if origin:
        if not strcmp(origin, col_name):
            some_error = True
            issues.append((3, "The Origin name is different in the sheet name and in the worksheet (" +
                           origin + ", " + col_name + ")"))
    else:
        origin = col_name
    # Obtain the source, the dataset and the dimension of "origin"
    spl = origin.split(".")
    if len(spl) == 3:  # Source.Dataset.Dimension
        s, ds, dim = spl
        s = s + "."
        origin_ok = True
    elif len(spl) == 2:  # Dataset.Dimension
        ds, dim = spl
        s = ""
        origin_ok = True
    else:
        origin_ok = False
        some_error = True
        issues.append((3, "Origin must specify a dataset and a dimension name separated by '.'"))

    if origin_ok:
        origin_dataset = s + ds
        origin_dim = dim
        if not check_dataset_exists(origin_dataset):
            some_error = True
            issues.append((3, "The Origin '" + origin_dataset + "' does not match any registered dataset"))
        else:
            dims, attrs, meas = obtain_dataset_metadata(ds)
            if origin_dim not in dims:
                some_error = True
                issues.append((3, "The Origin dataset '" + origin_dataset + "' does not have a dimension '" +
                               origin_dim + "'"))

    # Analyze Destination
    cell = sh.cell(row=area[0], column=area[2] + 1)
    col_name = cell.value
    if destination:
        if not strcmp(destination, col_name):
            some_error = True
            issues.append((3, "The Destination name is different in the sheet name and in the worksheet (" +
                           destination + ", " + col_name + ")"))
    else:
        destination = col_name
    # Destination name must be a simple identifier
    try:
        parser_field_parsers.simple_ident.parseString(destination, parseAll=True)
    except:
        some_error = True
        issues.append((3, "'" + destination + "' category name has to be a simple identifier"))

    if some_error:  # Issues at this point are errors; return if there are any
        return issues, None, None

    # Read mapping Origin to Destination
    o_dict = create_dictionary()
    for r in range(area[0] + 1, area[1]):
        o_value = sh.cell(row=r, column=area[2]).value  # First column -> Origin
        d_value = sh.cell(row=r, column=area[2] + 1).value  # Second column -> Destination
        try:
            exp_value = sh.cell(row=r, column=area[2] + 2).value  # Third column -> Weight (for Many to Many mappings)
            if exp_value:
                try:
                    exp_value = float(exp_value)
                except:  # If not a number, it may be an expression; postpone conversion until usage
                    pass
            else:
                exp_value = 1.0  # If undefined -> Many to One
        except:
            exp_value = 1.0  # If undefined -> Many to One

        if not o_value and not d_value:
            # issues.append((2, "Row " + str(r) + ": Origin and Destination are not defined. Row skipped."))
            continue
        elif not o_value or not d_value:
            if not o_value and d_value:
                issues.append((2, "Row " + str(r) + ": Origin not defined. Row skipped."))
            else:
                issues.append((2, "Row " + str(r) + ": Destination not defined. Row skipped."))
            continue

        o_value = str(o_value).lower()
        d_value = str(d_value).lower()
        if o_value in o_dict:
            lst = o_dict[o_value]
        else:
            lst = []
            o_dict[o_value] = lst
        # Check that "d_value" is not being repeated for "o_value"
        if (len(lst) == 0) or (len(lst) >= 1 and d_value not in [d["d"] for d in lst]):
            lst.append({"d": d_value, "w": exp_value})
        else:
            issues.append((3, "Destination category '" + d_value + "' has been repeated for origin category '" +
                           o_value + "' at row '" + str(r) + "'"))

    # List of dictionaries, where each dictionary contains the specification of an origin "o"
    # For multiple entries (many to many map), the origin maps to a list "to" of dictionaries "d", "w"
    content = {"origin_dataset": origin_dataset,  # Name of the origin dataset (may include the source name)
               "origin_dimension": origin_dim,  # Name of the origin dimension inside the dataset
               "destination": destination,  # Name of the destination hierarchy
               "map": [{"o": k, "to": v} for k, v in o_dict.items()]
               }
    label = ((content["origin_dataset"] + ".") if origin_dataset else "") + \
            content["origin_dimension"] + " -> " + content["destination"]

    return issues, label, content