def get_datasets(self, source: Union[IDataSourceManager, str] = None, database=None, local_datasets=None):
    # Register AdHoc datasets
    self.register_local_datasets(local_datasets)

    if source:
        source = self._get_source_manager(source)

    if source:
        if strcmp(source.get_name(), "AdHoc"):
            lst = [(source.get_name(), source.get_datasets(database))]
        else:
            lst = self.get_external_datasets(source, database)
    else:  # ALL DATASETS
        lst_total = []
        # Because "get_external_datasets" uses "Memoize", DO NOT modify "lst" outside
        lst = self.get_external_datasets(source, database)
        lst_total.extend(lst)
        for s in self.registry:
            if strcmp(s, "AdHoc") and local_datasets:
                lst_total.append((s, [ds for ds in self.registry[s].get_datasets()]))
        lst = lst_total  # Return the extended list, not the memoized external list

    # Unregister AdHoc datasets
    self.unregister_local_datasets(local_datasets)
    return lst
def get_interface_type(attribute, value, prd: PartialRetrievalDictionary = None):
    """
    Obtain the name of an InterfaceType given the value of an attribute
    (Obtain the registry of objects)

    :param attribute:
    :param value:
    :param prd: A PartialRetrievalDictionary, passed in State "_glb_idx" to the AST evaluator by
    :return:
    """
    if not prd:
        raise Exception(f"No Global-Index parameter passed to InterfaceType function")
    else:
        # Obtain ALL InterfaceTypes, then ONE having attribute "attribute" with value <value>
        its = prd.get(FactorType.partial_key())
        ret = None
        for it in its:
            v = vars(it).get(attribute)
            if not v:
                v = it.attributes.get(attribute)
            if v and (strcmp(v, str(value)) or (is_float(value) and float(v) == float(value))):
                ret = it.name
                break
        if ret:
            return ret
        else:
            raise Exception(f"No InterfaceType found having attribute '{attribute}' with value '{value}'")
def get_external_datasets(self, source: Union[IDataSourceManager, str] = None, database=None):
    """
    Obtain a list of tuples (Source, Dataset name)

    :param source: If specified, the name of the source
    :param database: If specified, the name of a database in the source
    :return: List of tuples (Source name, Dataset name)
    """
    if source:
        source = self._get_source_manager(source)

    if source:
        if database:  # SOURCE+DATABASE DATASETS
            return [(source.get_name(), source.get_datasets(database))]
        else:  # ALL SOURCE DATASETS
            lst = []
            for db in source.get_databases():
                lst.extend(source.get_datasets(db))
            return [(source.get_name(), lst)]  # List of tuples (dataset code, description, urn)
    else:  # ALL DATASETS
        lst = []
        for s in self.registry:
            if not strcmp(s, "AdHoc"):
                lst.append((s, [ds for ds in self.registry[s].get_datasets()]))
        return lst  # List of tuples (source, dataset code, description, urn)
def get_processor(attribute, value, prd: PartialRetrievalDictionary = None):
    """
    Obtain the name of a Processor given the value of an attribute
    (Obtain the registry of objects)

    :param attribute:
    :param value:
    :param prd: A PartialRetrievalDictionary, passed in State "_glb_idx" to the AST evaluator by
    :return:
    """
    if not prd:
        raise Exception(f"No Global-Index parameter passed to Processor function")
    else:
        # Obtain ALL Processors, then ONE having attribute "attribute" with value <value>
        procs = prd.get(Processor.partial_key())
        ret = None
        for proc in procs:
            v = vars(proc).get(attribute)
            if not v:
                v = proc.attributes.get(attribute)
            if v and (strcmp(v, str(value)) or (is_float(value) and float(v) == float(value))):
                ret = proc.name
                break
        if ret:
            return ret
        else:
            raise Exception(f"No Processor found having attribute '{attribute}' with value '{value}'")
def _constrains_interface(self, scale: str, invoking_interface_name: str, requested_interface_name: str,
                          parent_processor: Processor, child_processor: Processor):
    origin_factor = first(parent_processor.factors,
                          lambda i: strcmp(i.name, invoking_interface_name))
    if not origin_factor:
        raise Exception("Invoking interface name '" + invoking_interface_name +
                        "' not found for processor '" + parent_processor.name + "'")

    destination_factor = first(child_processor.factors,
                               lambda i: strcmp(i.name, requested_interface_name))
    if not destination_factor:
        raise Exception("Requested interface name '" + requested_interface_name +
                        "' not found for processor '" + child_processor.name + "'")

    if origin_factor.taxon != destination_factor.taxon:
        # Search for an Interface Type Conversion defined in the ScaleChangeMap command
        interface_types_transform = self._get_interface_types_transform(
            origin_factor.taxon, parent_processor, destination_factor.taxon, child_processor)
        scale = FloatOrString.multiply(scale, interface_types_transform.scaled_weight)

    relationship = FactorsRelationScaleObservation.create_and_append(origin=origin_factor,
                                                                     destination=destination_factor,
                                                                     observer=None,
                                                                     quantity=scale)

    # relationship = ProcessorsRelationUpscaleObservation.create_and_append(parent=parent_processor,
    #                                                                       child=child_processor,
    #                                                                       observer=None,
    #                                                                       factor_name=interface_name,
    #                                                                       quantity=scale)

    self._glb_idx.put(relationship.key(), relationship)
def _scale_observations_relative_to_interface(self, processor: Processor, interface_name: str,
                                              scale: Union[int, float]):
    for factor in processor.factors:
        for observation in factor.quantitative_observations:
            relative_to_interface = observation.attributes.get("relative_to", None)
            if relative_to_interface and strcmp(relative_to_interface.name, interface_name):
                observation.value = float(observation.value) * scale
                observation.attributes["relative_to"] = None
def _check_flow_orientation(self, source_processor: Processor, target_processor: Processor,
                            source_interface: Factor, target_interface: Factor, is_direct_flow: bool):
    """Check for correct interfaces orientation (input/output) of source and target"""
    allowed_source_orientation = ("Output" if is_direct_flow else "Input")

    # Are the orientations equal?
    if strcmp(source_interface.orientation, target_interface.orientation):
        if strcmp(source_interface.orientation, allowed_source_orientation):
            # Target processor should be parent of source processor
            parent_processor, child_processor = target_processor, source_processor
        else:
            # Source processor should be parent of target processor
            parent_processor, child_processor = source_processor, target_processor

        if child_processor not in parent_processor.children(self._glb_idx):
            raise CommandExecutionError(
                f"The processor '{child_processor.name}' should be part of the "
                f"processor '{parent_processor.name}' when using the same interface "
                f"orientation '{source_interface.orientation}'.")
    else:  # Orientations are different
        if not strcmp(source_interface.orientation, allowed_source_orientation):
            raise CommandExecutionError(
                f"The source interface '{source_interface.full_name}' has the wrong "
                f"orientation '{source_interface.orientation}'.")
        if strcmp(target_interface.orientation, allowed_source_orientation):
            raise CommandExecutionError(
                f"The target interface '{target_interface.full_name}' has the wrong "
                f"orientation '{target_interface.orientation}'.")
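# Illustrative summary (not part of the original module) of the rule implemented by
# _check_flow_orientation above. For a direct flow the source side must be an "Output";
# for an indirect flow it must be an "Input". Assuming processors P_src and P_tgt:
#
#   direct flow, Output -> Input : regular flow between (possibly unrelated) processors
#   direct flow, Output -> Output: only valid if P_tgt is a parent (part-of) of P_src
#   direct flow, Input  -> Input : only valid if P_src is a parent (part-of) of P_tgt
#   direct flow, Input  -> Output: rejected (wrong source orientation)
#
# For indirect flows the same table applies with "Input" and "Output" swapped.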
def get_parameters_in_state(state: State):
    res = []
    query = BasicQuery(state)
    for p in query.execute([Parameter], filt=""):
        p_name = p.name
        p_type = p.type
        if p.range:
            if strcmp(p_type, "Number"):
                p_range = p.range
            else:
                glb_idx, _, _, _, _ = get_case_study_registry_objects(state)
                h = glb_idx.get(Hierarchy.partial_key(p.range))
                h = h[0]
                p_range = ', '.join(h.codes.keys())
        else:
            p_range = ""
        res.append(dict(name=p_name, type=p_type, range=p_range))
    return res
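# Illustrative usage sketch (not part of the original module), assuming "state" is a State already
# populated by a Parameters command:
#
#   params = get_parameters_in_state(state)
#   # -> [{"name": "Share", "type": "Number", "range": "[0, 1]"},
#   #     {"name": "Scenario", "type": "Category", "range": "Low, Mid, High"}]
#
# The parameter names, types and ranges shown are hypothetical; the function only guarantees the
# keys "name", "type" and "range" in each returned dict.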
def _process_row(self, fields: Dict[str, Any], subrow=None) -> None:
    """
    Create and register Indicator object

    :param fields:
    """
    benchmark_names = fields["benchmarks"]
    benchmarks = []
    if benchmark_names:
        for benchmark_name in benchmark_names.split(","):
            if benchmark_name:
                benchmark = self._glb_idx.get(Benchmark.partial_key(benchmark_name))
                if len(benchmark) == 1:
                    benchmark = benchmark[0]
                elif len(benchmark) == 0:
                    self._add_issue(IType.ERROR,
                                    f"Benchmark {benchmark_name} does not exist (it must be declared previously "
                                    f"in a ScalarBenchmark command worksheet)")
                    return
                elif len(benchmark) > 1:
                    self._add_issue(IType.ERROR,
                                    f"Benchmark {benchmark_name} exists {len(benchmark)} times."
                                    " Only one occurrence is allowed.")
                    return
            else:
                benchmark = None
            if benchmark:
                benchmarks.append(benchmark)

    indicator = Indicator(fields["indicator_name"],
                          fields["formula"],
                          None,
                          fields.get("processors_selector"),
                          benchmarks,
                          IndicatorCategories.factors_expression if strcmp(fields.get("local"), "Yes")
                          else IndicatorCategories.case_study,
                          fields.get("description"),
                          fields["indicators_group"],
                          fields["unit"],
                          fields["unit_label"],
                          fields["source"])
    self._glb_idx.put(indicator.key(), indicator)
def _get_factor_type_from_field(self, hierarchy_field_name: str, interface_type_field_name: str) -> FactorType:
    interface_type_name = self._fields[interface_type_field_name]
    if not interface_type_name:
        raise CommandExecutionError(f"The field '{interface_type_field_name}' has not been specified")

    # Check if FactorType exists
    interface_types = self._glb_idx.get(FactorType.partial_key(interface_type_name))
    if len(interface_types) == 1:
        return interface_types[0]
    elif len(interface_types) == 0:
        raise CommandExecutionError(f"The interface type '{interface_type_name}' has not been found")
    else:
        hierarchy_name = self._fields[hierarchy_field_name]
        if not hierarchy_name:
            raise CommandExecutionError(f"The field '{hierarchy_field_name}' has not been specified and "
                                        f"the interface type '{interface_type_name}' is not unique")

        interface_type = first(interface_types, lambda t: strcmp(t.hierarchy.name, hierarchy_name))
        if not interface_type:
            raise CommandExecutionError(f"The interface type '{interface_type_name}' has not been found in "
                                        f"hierarchy '{hierarchy_name}'")

        return interface_type
def parse_etl_external_dataset_command(sh: Worksheet, area: AreaTupleType, dataset_name: str, state) -> IssuesLabelContentTripleType: """ Check that the syntax of the input spreadsheet is correct Return the analysis in JSON compatible format, for execution :param sh: Input worksheet :param area: Area of the input worksheet to be analysed :return: The command in a dict-list object (JSON ready) """ def obtain_column(cn, r1, r2): """ Obtain a list with the values of a column, in the range of rows [r1, r2) :param cn: Column number :param r1: Starting row :param r2: End+1 row :return: list with the cell values """ lst = [] for row in range(r1, r2): value = sh.cell(row=row, column=cn).value if value is None: continue if isinstance(value, str): lst.append(value.strip()) else: lst.append(value) return lst issues = [] # Global variables (at parse time they may not be defined, so process carefully...) glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects( state) # Dataset source from nexinfosys.ie_imports.data_source_manager import DataSourceManager source = DataSourceManager.obtain_dataset_source(dataset_name, datasets) # Obtain metadata dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets) # Load all code lists in a temporary dictionary of sets # Also check if there is a TIME dimension in the dataset cl = create_dictionary() we_have_time = False for d in dims: if dims[d].code_list: cl[d] = [k.lower() for k in dims[d].code_list.keys()] # Attach the code list else: cl[d] = None # No code list (TIME_PERIOD for instance) if dims[d].istime: we_have_time = True # Add matching mappings as more dimensions for m in mappings: if strcmp(mappings[m].source, source) and \ strcmp(mappings[m].dataset, dataset_name) and \ mappings[m].origin in dims: # Add a dictionary entry for the new dimension, add also the codes present in the map tmp = [ to["d"] for o in mappings[m].map for to in o["to"] if to["d"] ] cl[mappings[m].destination] = set( tmp) # [t[1] for t in mappings[m].map] # Scan columns for Dimensions, Measures and Aggregation. # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside. # TODO The result COULD be an automatic BI cube (with a separate field) # TODO - Write into a set of tables in Mondrian # TODO - Generate Schema for Mondrian # TODO - Write the Schema for Mondrian measures = [] out_dims = [] agg_funcs = [] measures_as = [] filter_ = { } # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement) result_name = None # By default, no name for the result. It will be dynamically obtained for c in range(area[2], area[3]): col_name = sh.cell(row=1, column=c).value if not col_name: continue if col_name.lower().strip() in [ "dimensions_kept", "dims", "dimensions" ]: # "GROUP BY" lst = obtain_column(c, area[0] + 1, area[1]) for d in lst: if not d: continue if d not in cl: issues.append(( 3, "The dimension specified for output, '" + d + "' is neither a dataset dimension nor a mapped dimension. 
[" + ', '.join([d2 for d2 in cl]) + "]")) else: out_dims.append(d) elif col_name.lower().strip() in [ "aggregation_function", "aggfunc", "agg_func" ]: # "SELECT AGGREGATORS" lst = obtain_column(c, area[0] + 1, area[1]) for f in lst: if f.lower() not in [ "sum", "avg", "count", "sumna", "countav", "avgna", "pctna" ]: issues.append(( 3, "The specified aggregation function, '" + f + "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'" )) else: agg_funcs.append(f) elif col_name.lower().strip() in ["measures"]: # "SELECT" lst = obtain_column(c, area[0] + 1, area[1]) # Check for measures # TODO (and attributes?) for m in lst: if not m: continue if m not in meas: issues.append( (3, "The specified measure, '" + m + "' is not a measure available in the dataset. [" + ', '.join([m2 for m2 in measures]) + "]")) else: measures.append(m) elif col_name.lower().strip() in ["measuresas"]: # "AS <name>" lst = obtain_column(c, area[0] + 1, area[1]) for m in lst: measures_as.append(m) elif col_name in cl: # A dimension -> "WHERE" # Check codes, and add them to the "filter" lst = obtain_column(c, area[0] + 1, area[1]) for cd in lst: if not cd: continue if str(cd).lower() not in cl[col_name]: issues.append(( 3, "The code '" + cd + "' is not present in the codes declared for dimension '" + col_name + "'. Please, check them.")) else: if col_name not in filter_: lst2 = [] filter_[col_name] = lst2 else: lst2 = filter_[col_name] lst2.append(cd) elif we_have_time and col_name.lower() in [ "startperiod", "endperiod" ]: # SPECIAL "WHERE" FOR TIME # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command # Interval of time periods lst = obtain_column(c, area[0] + 1, area[1]) if len(lst) > 0: filter_[col_name] = lst[ 0] # In this case it is not a list, but a number or string !!!! elif col_name.lower() in ["result_name", "result name", "resultname"]: lst = obtain_column(c, area[0] + 1, area[1]) if len(lst) > 0: result_name = lst[0] try: parser_field_parsers.string_to_ast(simple_ident, result_name) except: issues.append((3, "Column '" + col_name + "' has an invalid dataset name '" + result_name + "'")) if len(measures) == 0: issues.append((3, "At least one measure should be specified")) if len(agg_funcs) == 0: issues.append( (2, "No aggregation function specified. Assuming 'average'")) agg_funcs.append("average") if not result_name: result_name = source + "_" + dataset_name issues.append( (2, "No result name specified. Assuming '" + result_name + "'")) content = { "dataset_source": source, "dataset_name": dataset_name, "dataset_datetime": None, "where": filter_, "dimensions": [d for d in dims], "group_by": out_dims, "measures": measures, "agg_funcs": agg_funcs, "measures_as": measures_as, "result_name": result_name } return issues, None, content
def prepare_model(state) -> NoReturn: """ Modify the state so that: * Implicit references of Interfaces to subcontexts are materialized * Creating processors * Creating interfaces in these processors * Creating relationships in these processors :param state: """ # TODO: currently when an interface is defined as a Scale from two or more interfaces, the computed values are # added while the intuition tells us that only one scale should be defined. We have to give a warning message # if this situation happens. # Registry and the other objects also glb_idx, _, _, _, _ = get_case_study_registry_objects(state) # Prepare a Query to obtain ALL interfaces query = BasicQuery(state) filt = {} objs = query.execute([Factor], filt) for iface in objs[Factor]: # type: Factor if strcmp( iface.processor.instance_or_archetype, 'Archetype') or strcmp( iface.processor.instance_or_archetype, 'No'): continue # If the Interface is connected to a "Subcontext" different than the owning Processor if iface.opposite_processor_type: if iface.opposite_processor_type.lower( ) != iface.processor.subsystem_type.lower(): # Check if the interface has flow relationships # TODO An alternative is to search "observations" of type FactorsRelationDirectedFlowObservation # in the same "iface" if iface.orientation.lower() == "input": parameter = {"target": iface} else: parameter = {"source": iface} relations = glb_idx.get( FactorsRelationDirectedFlowObservation.partial_key( **parameter)) # If it does not have flow relationships: # * define default Processor name and retrieve it (or if it does not exist, create it) # * create an Interface into that Processor and a Flow Relationship if len(relations) == 0: # Define the name of a Processor in the same context but in different subcontext p_name = iface.processor.processor_system + "_" + iface.opposite_processor_type p = glb_idx.get(Processor.partial_key(p_name)) if len(p) == 0: attributes = { 'subsystem_type': iface.opposite_processor_type, 'processor_system': iface.processor.processor_system, 'functional_or_structural': 'Functional', 'instance_or_archetype': 'Instance' # 'stock': None } p = Processor(p_name, attributes=attributes) glb_idx.put(p.key(), p) else: p = p[0] attributes = { 'sphere': 'Technosphere' if iface.opposite_processor_type.lower() in ["local", "external"] else 'Biosphere', 'roegen_type': iface.roegen_type, 'orientation': "Input" if iface.orientation.lower() == "output" else "Output", 'opposite_processor_type': iface.processor.subsystem_type } # Create Interface (if it does not exist) if not p.factors_find(iface.taxon.name): f = Factor.create_and_append( name=iface.taxon.name, processor=p, in_processor_type=FactorInProcessorType( external=False, incoming=iface.orientation.lower() == "output"), attributes=attributes, taxon=iface.taxon) glb_idx.put(f.key(), f) # Create Flow Relationship if iface.orientation.lower() == "output": source = iface target = f else: source = f target = iface fr = FactorsRelationDirectedFlowObservation.create_and_append( source=source, target=target, observer=None) glb_idx.put(fr.key(), fr)
def _process_row(self, field_values: Dict[str, Any], subrow=None) -> None:
    """
    Process a dictionary representing a row of the InterfaceTypes command. The dictionary can come
    directly from the worksheet or from a dataset.

    :param field_values: dictionary
    """
    # Read variables
    ft_h_name = field_values.get("interface_type_hierarchy", "_default")  # "_default" InterfaceType Hierarchy NAME <<<<<<
    ft_name = field_values.get("interface_type")
    ft_sphere = field_values.get("sphere")
    ft_roegen_type = field_values.get("roegen_type")
    ft_parent = field_values.get("parent_interface_type")
    ft_formula = field_values.get("formula")
    ft_description = field_values.get("description")
    ft_unit = field_values.get("unit")
    ft_opposite_processor_type = field_values.get("opposite_processor_type")
    ft_level = field_values.get("level")
    ft_attributes = field_values.get("attributes", {})
    if ft_attributes:
        try:
            ft_attributes = dictionary_from_key_value_list(ft_attributes, self._glb_idx)
        except Exception as e:
            self._add_issue(IType.ERROR, str(e) + subrow_issue_message(subrow))
            return
    else:
        ft_attributes = {}

    # Process
    # Mandatory fields
    if not ft_h_name:
        self._add_issue(IType.WARNING,
                        "Empty interface type hierarchy name. It is recommended to specify one, assuming '_default'." +
                        subrow_issue_message(subrow))
        ft_h_name = "_default"

    if not ft_name:
        self._add_issue(IType.ERROR, "Empty interface type name. Skipped." + subrow_issue_message(subrow))
        return

    # Check if a hierarchy of interface types by the name <ft_h_name> exists, if not, create it and register it
    hie = self._glb_idx.get(Hierarchy.partial_key(name=ft_h_name))
    if not hie:
        hie = Hierarchy(name=ft_h_name, type_name="interfacetype")
        self._glb_idx.put(hie.key(), hie)
    else:
        hie = hie[0]

    # If parent defined, check if it exists
    # (it must be registered both in the global registry AND in the hierarchy)
    if ft_parent:
        parent = self._glb_idx.get(FactorType.partial_key(ft_parent))
        if len(parent) > 0:
            for p in parent:
                if p.hierarchy == hie:
                    parent = p
                    break
            if not isinstance(parent, FactorType):
                self._add_issue(IType.ERROR,
                                f"Parent interface type name '{ft_parent}' not found in hierarchy '{ft_h_name}'" +
                                subrow_issue_message(subrow))
                return
        else:
            self._add_issue(IType.ERROR,
                            f"Parent interface type name '{ft_parent}' not found" + subrow_issue_message(subrow))
            return
        # Double check, it must be defined in "hie"
        if ft_parent not in hie.codes:
            self._add_issue(IType.ERROR,
                            f"Parent interface type name '{ft_parent}' not registered in the hierarchy '{ft_h_name}'" +
                            subrow_issue_message(subrow))
            return
    else:
        parent = None

    # Check if FactorType exists
    ft = self._glb_idx.get(FactorType.partial_key(ft_name))
    if len(ft) == 0:
        # TODO Compile and CONSIDER attributes (on the FactorType side)
        roegen_type = None
        if ft_roegen_type:
            roegen_type = FlowFundRoegenType.flow if strcmp(ft_roegen_type, "flow") else FlowFundRoegenType.fund

        ft = FactorType(ft_name,
                        parent=parent, hierarchy=hie,
                        roegen_type=roegen_type,
                        tags=None,  # No tags
                        attributes=dict(unit=ft_unit, description=ft_description, level=ft_level, **ft_attributes),
                        expression=ft_formula,
                        sphere=ft_sphere,
                        opposite_processor_type=ft_opposite_processor_type)
        # Simple name
        self._glb_idx.put(FactorType.partial_key(ft_name, ft.ident), ft)
        if not strcmp(ft_name, ft.full_hierarchy_name()):
            self._glb_idx.put(FactorType.partial_key(ft.full_hierarchy_name(), ft.ident), ft)
    else:
        self._add_issue(IType.WARNING,
                        f"Interface type name '{ft_name}' already registered" + subrow_issue_message(subrow))
        return
def execute(self, state: "State"): """ Create a Hierarchy of Taxon. The exact form of this hierarchy is different depending on the concept: * FactorTypes and Categories use Hierarchies, which are intrinsic. The hierarchy name is passed to the containing Hierarchy object * Processors use Part-Of Relations. In this case, the hierarchy name is lost Names of Processor and FactorTypes are built both in hierarchical and simple form The hierarchical is all the ancestors from root down to the current node, separated by "." The simple name is just the current node. If there is already another concept with that name, the simple name is not stored (STORE BOTH CONCEPTS by the same name, and design some tie breaking mechanism??) """ issues = [] glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects( state) name = self._content["command_name"] # Process parsed information for item in self._content["items"]: r = item["_row"] # HierarchySource (Optional) hsource = item.get("source", None) # Code of entity defining the Hierarchy if hsource: tmp = hsource hsource = glb_idx.get( HierarchySource.partial_key(name=hsource)) if len(hsource) == 0: hsource = HierarchySource(name=tmp) glb_idx.put(hsource.key(), hsource) else: hsource = hsource[0] hname = item.get("hierarchy_name", None) if not hname: issues.append( Issue( itype=IType.ERROR, description= "The name of the Hierarchy has not been defined. Skipped.", location=IssueLocation(sheet_name=name, row=r, column=None))) continue # HierarchyGroup (equivalent to Hierarchy of Code Lists, HCL) hg = item.get("hierarchy_group", None) if hg: is_code_list = False # Hierarchy group else: is_code_list = True # Hierarchy group for the Code List, with the same name hg = hname # Check if the HierarchyGroup is previously defined. YES, use it; NO, create new HierarchyGroup tmp = hg hg = glb_idx.get(HierarchyGroup.partial_key(name=hg)) if len(hg) == 0: hg = HierarchyGroup(name=tmp, source=hsource) glb_idx.put(hg.key(), hg) else: hg = hg[0] # Check if the Hierarchy is defined. YES, get it; NO, create it tmp = hname h = glb_idx.get(Hierarchy.partial_key(name=hname)) if len(h) == 0: h = Hierarchy(name=tmp) glb_idx.put(h.key(), h) glb_idx.put(h.key(hg.name + "." + h.name), h) # Register with alternative (full) name else: h = h[0] # Add the Hierarchy to the HierarchyGroup (if not) if h not in hg.hierarchies: hg.hierarchies.append(h) # Level level = item.get("level", None) if level: # Check if the level is defined. YES, get it; NO, create it for l in h.levels: if strcmp(l.name, level): level = l break else: level = HierarchyLevel(name=level, hierarchy=h) h.levels.append(level) code = item.get("code", None) label = item.get("label", None) description = item.get("description", None) attributes = item.get("attributes", None) expression = item.get("expression", None) # Parent property (what really defines Hierarchies) parent_code = item.get("parent_code", None) if parent_code: ph = h # Parent Hierarchy is the same as current hierarchy pcode = ph.codes.get(parent_code, None) if not pcode: issues.append( Issue(itype=IType.ERROR, description="Could not find code '" + parent_code + "' in hierarchy '" + ph.name + "'. Skipped.", location=IssueLocation(sheet_name=name, row=r, column=None))) continue else: pcode = None # ReferredHierarchy. 
If we are not defining a Code List, the base hierarchy has to be mentioned if not is_code_list: ref_hierarchy = item.get("referred_hierarchy", None) if not ref_hierarchy: issues.append( Issue( itype=IType.ERROR, description= "For HCLs, defining ReferredHierarchy is mandatory", location=IssueLocation(sheet_name=name, row=r, column=None))) continue tmp = ref_hierarchy ref_hierarchy = glb_idx.get( Hierarchy.partial_key(name=ref_hierarchy)) if len(ref_hierarchy) == 0: issues.append( Issue(itype=IType.ERROR, description="ReferredHierarchy '" + tmp + "' not defined previously", location=IssueLocation(sheet_name=name, row=r, column=None))) continue else: ref_hierarchy = ref_hierarchy[0] ref_code = ref_hierarchy.codes.get(code, None) if not ref_code: issues.append( Issue(itype=IType.ERROR, description="Code '" + code + "' not found in referred hierarchy '" + ref_hierarchy.name + "'", location=IssueLocation(sheet_name=name, row=r, column=None))) continue # Ignore: LABEL, DESCRIPTION. Copy them from referred code label = ref_code.label description = ref_code.description else: ref_code = None c = h.codes.get(code, None) if c: issues.append( Issue(itype=IType.ERROR, description="Code '" + code + "' in hierarchy '" + h.name + "' redefined.", location=IssueLocation(sheet_name=name, row=r, column=None))) continue # Finally, create the HierarchyCode with all the gathered attributes, then weave it to other # (name, label=None, description=None, referred_node=None, parent=None, parent_weight=1.0, hierarchy=None) c = Taxon(name=code, hierarchy=h, level=level, referred_taxon=ref_code, parent=pcode, label=label, description=description, attributes=attributes, expression=expression) # Add code to hierarchy h.codes[code] = c if not c.parent: h.roots_append(c) # Add code to level if level: level.codes.add(c) # Add child to parent code # (DONE BY THE CONSTRUCTOR!!) # if pcode: # pcode.children_codes.append(c) return issues, None # Issues, Output
def parse_scale_conversion_command(sh: Worksheet, area: AreaTupleType, name: str = None) -> IssuesLabelContentTripleType: """ Analyze the input area Obtain the numerical part Read a row above and a column to the left, looking for source (left col) and target (row above) factor types FactorTypes do not need to exist previously, they can be created :param sh: Input worksheet :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the command is present :return: list of issues (issue_type, message), command label, command content """ def get_subrow(r, c1, c2): lst = [] # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell previous = None for c in range(c1, c2): v = sh.cell(row=r, column=c).value if not v: if previous: lst.append(previous) else: lst.append("") else: previous = v lst.append(v) return lst def get_subcolumn(c, r1, r2): lst = [] # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell # !!! This may not be correct at all times: when a cell is intentionally left blank # To solve this, use "sh.merged_cell_ranges" to check if the current cell (r, c) is inside a range previous = None for r in range(r1, r2): v = sh.cell(row=r, column=c).value if not v: if previous: lst.append(previous) else: lst.append("") else: previous = v lst.append(v) return lst # --------------------------------------------- some_error = False issues = [] # Detect the matrix defining scales m = binary_mask_from_worksheet(sh, True) # "True" is to focus on cells containing numbers # Locate the matrix with numbers. Assume this defines the labels to consider, they will be around the matrix t = obtain_rectangular_submatrices(m)[0] # Take just the first tuple: U=t[0], D=t[1], L=t[2], R=t[3] t = (t[0]+1, t[1]+1, t[2]+1, t[3]+1) # The previous calculation is done using Numpy, so it is Zero based. Correct this # Obtain the factor type names in the subrow on top of the matrix subrow = get_subrow(t[0]-1, t[2], t[3]) # Obtain the factor type names in the subcolumn to the left of the matrix subcol = get_subcolumn(t[2]-1, t[0], t[1]) # Check that we have valid factor type names for ft in subrow+subcol: try: parser_field_parsers.string_to_ast(parser_field_parsers.simple_h_name, ft) except: some_error = True issues.append((3, "'"+ft+"' is not a valid Factor Type name")) if some_error: return issues, None, None # Scan the matrix, creating scale records scales = [] for i, r in enumerate(range(t[0], t[1])): for j, c in enumerate(range(t[2], t[3])): v = sh.cell(row=r, column=c).value if v: if not isinstance(v, str): v = str(v) # Origin factor origin = subcol[i] # Destination factor destination = subrow[j] if strcmp(origin, destination): issues.append((3, "A change of scale to the same factor type ("+origin+") is not allowed")) else: try: parser_field_parsers.string_to_ast(parser_field_parsers.expression_with_parameters, v) # Add the scale scales.append(dict(origin=origin, destination=destination, scale=v)) except: issues.append((3, "The expression '"+v+"' at the intersection of factor types " + origin + " and " + destination + " is syntactically incorrect")) content = {"origin_factor_types": subcol, "destination_factor_types": subrow, "scales": scales } return issues, None, content
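# Illustrative sketch (not part of the original module) of the "content" structure returned by
# parse_scale_conversion_command for a small matrix; the factor type names and expressions are
# hypothetical:
#
#   content = {
#       "origin_factor_types": ["Energy", "Food"],
#       "destination_factor_types": ["Money", "Labour"],
#       "scales": [
#           {"origin": "Energy", "destination": "Money", "scale": "0.12*price_factor"},
#           {"origin": "Food", "destination": "Labour", "scale": "3"}
#       ]
#   }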
def _process_row(self, fields: Dict[str, Any], subrow=None) -> None: scaling_type = fields["scaling_type"] scale: str = fields["scale"] # Find processors invoking_processor = self._get_processor_from_field( "invoking_processor") requested_processor = self._get_processor_from_field( "requested_processor") if invoking_processor == requested_processor: raise CommandExecutionError( f"Invoking and Requested processors cannot be the same '{invoking_processor.name}'. " f"Use the 'relative_to' attribute in 'Interfaces' command instead." ) invoking_interface_name: str = fields["invoking_interface"] requested_interface_name: str = fields["requested_interface"] requested_new_processor_name: str = fields["new_processor_name"] ## # Transform text of "attributes" into a dictionary if fields.get("attributes"): try: fields["attributes"] = dictionary_from_key_value_list( fields["attributes"], self._glb_idx) except Exception as e: self._add_issue(IType.ERROR, str(e) + subrow_issue_message(subrow)) return else: fields["attributes"] = {} # Process specific fields # Obtain the parent: it must exist. It could be created dynamically but it's important to specify attributes if fields.get("parent_processor"): try: parent_processor = self._get_processor_from_field( "parent_processor") except CommandExecutionError: self._add_issue( IType.ERROR, f"Specified parent processor, '{fields.get('parent_processor')}', does not exist" + subrow_issue_message(subrow)) return else: parent_processor = None # Get internal and user-defined attributes in one dictionary attributes = { c.name: fields[c.name] for c in self._command_fields if c.attribute_of == Processor and fields[c.name] is not None } # print(f"Invoking: {invoking_processor.name}:{invoking_interface_name}, Requested: {requested_processor.name}:{requested_interface_name}") requested_processor_clone = None if strcmp(scaling_type, "CloneAndScale") or strcmp( scaling_type, "Clone"): # TODO: check “RequestedProcessor” must be an archetype # 1. Clones “RequestedProcessor” as a child of “InvokingProcessor” requested_processor_clone = self._clone_processor_as_child( processor=requested_processor, parent_processor=invoking_processor if not parent_processor else parent_processor, name=requested_new_processor_name, other_attributes=attributes) if strcmp(scaling_type, "CloneAndScale"): # 2. Constrains the value of “RequestedInterface” to the value of “InvokingInterface”, scaled by “Scale” try: self._constrains_interface( scale=scale, invoking_interface_name=invoking_interface_name, requested_interface_name=requested_interface_name, parent_processor=invoking_processor, child_processor=requested_processor_clone) except Exception as e: self._add_issue(IType.ERROR, str(e) + subrow_issue_message(subrow)) return elif strcmp(scaling_type, "Scale"): # Processors must be of same type (archetype or instance) if not strcmp(invoking_processor.instance_or_archetype, requested_processor.instance_or_archetype): raise CommandExecutionError( "Requested and invoking processors should be of the same type " "(both instance or_archetype)") # 1. 
Constrains the value of “RequestedInterface” to the value of “InvokingInterface”, scaled by “Scale” try: self._constrains_interface( scale=scale, invoking_interface_name=invoking_interface_name, requested_interface_name=requested_interface_name, parent_processor=invoking_processor, child_processor=requested_processor) except Exception as e: self._add_issue(IType.ERROR, str(e) + subrow_issue_message(subrow)) return elif strcmp(scaling_type, "CloneScaled"): # “RequestedProcessor” must be an archetype # if not strcmp(requested_processor.instance_or_archetype, "archetype"): # raise CommandExecutionError(f"Requested processor '{requested_processor.name}' should be of type 'archetype'") # “InvokingProcessor” must be an instance # if not strcmp(invoking_processor.instance_or_archetype, "instance"): # raise CommandExecutionError(f"Invoking processor '{invoking_processor.name}' should be of type 'instance'") # 1. Clones “RequestedProcessor” as a child of “InvokingProcessor” # 2. Scales the new processor using “Scale” as the value of “RequestedInterface” requested_processor_clone = self._clone_processor_as_child( processor=requested_processor, parent_processor=invoking_processor if not parent_processor else parent_processor, other_attributes=attributes) # Value Scale, which can be an expression, should be evaluated (ast) because we need a final float number scale_value = self._get_scale_value(scale) # In the cloned processor search in all interfaces if there are Observations relative_to RequestedInterface # and multiply the observation by the computed scale. self._scale_observations_relative_to_interface( processor=requested_processor_clone, interface_name=requested_interface_name, scale=scale_value) if requested_processor_clone: # Find or create processor and REGISTER it in "glb_idx" # Add to ProcessorsGroup, if specified field_val = fields.get("processor_group") if field_val: p_set = self._p_sets.get(field_val, ProcessorsSet(field_val)) self._p_sets[field_val] = p_set if p_set.append( requested_processor_clone, self._glb_idx ): # Appends codes to the pset if the processor was not member of the pset p_set.append_attributes_codes(fields["attributes"])
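# Summary (not part of the original module) of the scaling types handled by _process_row above,
# as implemented:
#   - "Clone":         clone RequestedProcessor as a child of InvokingProcessor (or of ParentProcessor, if given).
#   - "CloneAndScale": clone as above, then constrain RequestedInterface of the clone to InvokingInterface
#                      of the invoking processor, scaled by Scale.
#   - "Scale":         no cloning; constrain RequestedInterface of RequestedProcessor to InvokingInterface
#                      of InvokingProcessor, scaled by Scale (both processors must be of the same type).
#   - "CloneScaled":   clone RequestedProcessor, then multiply every observation that is "relative_to"
#                      RequestedInterface in the clone by the evaluated value of Scale.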
def execute(self, state: "State"): """ First bring the data considering the filter Second, group, third aggregate Finally, store the result in State """ issues = [] # Obtain global variables in state glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects( state) # DS Source + DS Name source = self._content["dataset_source"] dataset_name = self._content["dataset_name"] dataset_datetime = self._content["dataset_datetime"] # Result name result_name = self._content["result_name"] if result_name in datasets or state.get(result_name): issues.append((2, "A dataset called '" + result_name + "' is already stored in the registry of datasets")) # Dataset metadata dims, attrs, measures = obtain_dataset_metadata( dataset_name, source, datasets) # Obtain filter parameters params = create_dictionary( ) # Native dimension name to list of values the filter will allow to pass joined_dimensions = [] for dim in self._content["where"]: lst = self._content["where"][dim] native_dim = None if dim.lower() in [ "startperiod", "starttime", "endperiod", "endtime" ]: native_dim = dim lst = [lst] elif dim not in dims: # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR for m in mappings: if strcmp(mappings[m].destination, dim) and \ strcmp(mappings[m].source, source) and \ strcmp(mappings[m].dataset, dataset_name) and \ mappings[m].origin in dims: joined_dimensions.append( mappings[m].destination ) # Store dimension in the original case native_dim = mappings[m].origin lst = obtain_reverse_codes(mappings[m].map, lst) break else: # Get the dimension name with the original case native_dim = dims[dim].name if native_dim: if native_dim not in params: f = set() params[native_dim] = f else: f = params[native_dim] f.update(lst) # Convert param contents from set to list for p in params: params[p] = [i for i in params[p]] # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< ds = nexinfosys.data_source_manager.get_dataset_filtered( source, dataset_name, params, datasets) df = ds.data # Join with mapped dimensions (augment it) mapping_dict = create_dictionary() for m in mappings: if strcmp(mappings[m].source, source) and \ strcmp(mappings[m].dataset, dataset_name) and \ mappings[m].origin in dims: # mapping_tuples.append((mappings[m].origin, mappings[m].destination, mappings[m].map)) mapping_dict[mappings[m].origin] = (mappings[m].destination, { d["o"]: d["to"] for d in mappings[m].map }) # If accelerated version not available, use slow version try: if nexinfosys.get_global_configuration_variable( "ENABLE_CYTHON_OPTIMIZATIONS") == "True": from nexinfosys.restful_service.helper_accel import augment_dataframe_with_mapped_columns2 as augment_df else: raise Exception("Just to import the slow version") except: from nexinfosys.common.helper import augment_dataframe_with_mapped_columns as augment_df df = augment_df(df, mapping_dict, ["value"]) # Aggregate (If any dimension has been specified) if len(self._content["group_by"]) > 0: # Column names where data is # HACK: for the case where the measure has been named "obs_value", use "value" values = [ m.lower() if m.lower() != "obs_value" else "value" for m in self._content["measures"] ] v2 = [] for v in values: for c in df.columns: if v.lower() == c.lower(): v2.append(c) break values = v2 # TODO: use metadata name (e.g. 
"OBS_VALUE") instead of hardcoded "value" # values = self._content["measures"] out_names = self._content["measures_as"] group_by_dims = translate_case( self._content["group_by"], df.columns) # Group by dimension names lcase_group_by_dims = [d.lower() for d in group_by_dims] # Now joined_dimensions for d in joined_dimensions: if d.lower() in lcase_group_by_dims: # Find and replace for i, d2 in enumerate(group_by_dims): if strcmp(d, d2): group_by_dims[i] = d break agg_funcs = [] # Aggregation functions agg_names = {} for f in self._content["agg_funcs"]: if f.lower() in ["avg", "average"]: agg_funcs.append(np.average) agg_names[np.average] = "avg" elif f.lower() in ["sum"]: agg_funcs.append(np.sum) agg_names[np.sum] = "sum" elif f.lower() in ["count"]: agg_funcs.append(np.size) agg_names[np.size] = "count" elif f.lower() in ["sumna"]: agg_funcs.append(np.nansum) agg_names[np.nansum] = "sumna" elif f.lower() in ["countav"]: agg_funcs.append("count") agg_names["count"] = "countav" elif f.lower() in ["avgna"]: agg_funcs.append(np.nanmean) agg_names[np.nanmean] = "avgna" elif f.lower() in ["pctna"]: agg_funcs.append(pctna) agg_names[pctna] = "pctna" # Calculate Pivot Table. The columns are a combination of values x aggregation functions # For instance, if two values ["v2", "v2"] and two agg. functions ["avg", "sum"] are provided # The columns will be: [["average", "v2"], ["average", "v2"], ["sum", "v2"], ["sum", "v2"]] try: # Check that all "group_by_dims" on which pivot table aggregates are present in the input "df" # If not either synthesize them (only if there is a single filter value) or remove (if not present for r in group_by_dims.copy(): df_columns_dict = create_dictionary( data={c: None for c in df.columns}) if r not in df_columns_dict: found = False for k in params: if strcmp(k, r): found = True if len(params[k]) == 1: df[k] = params[k][0] else: group_by_dims.remove(r) issues.append(( 2, "Dimension '" + r + "' removed from the list of dimensions because it is not present in the raw input dataset." )) break if not found: group_by_dims.remove(r) issues.append(( 2, "Dimension '" + r + "' removed from the list of dimensions because it is not present in the raw input dataset." )) # Create and register Hierarchy objects from origin Dataset dimensions: state, ds ds_columns_dict = create_dictionary( data={c.code: c.code for c in ds.dimensions}) for r in group_by_dims: if r in ds_columns_dict: # Create hierarchy local to the dataset for d in ds.dimensions: if strcmp(r, d.code): if d.code_list: h = convert_code_list_to_hierarchy( d.code_list) h.name = result_name + "_" + r glb_idx.put(h.key(), h) break # Pivot table using Group by if True: groups = df.groupby(by=group_by_dims, as_index=False) # Split d = OrderedDict([]) lst_names = [] if len(values) == len(agg_funcs): for i, (value, agg_func) in enumerate(zip(values, agg_funcs)): if len(out_names) == len(values) and out_names[i]: lst_names.append(out_names[i]) else: lst_names.append(agg_names[agg_func] + "_" + value) lst = d.get(value, []) lst.append(agg_func) d[value] = lst else: for value in values: lst = d.get(value, []) for agg_func in agg_funcs: lst.append(agg_func) lst_names.append(agg_names[agg_func] + "_" + value) d[value] = lst # Print NaN values for each value column for value in set(values): cnt = df[value].isnull().sum() print("NA count for col '" + value + "': " + str(cnt) + " of " + str(df.shape[0])) # AGGREGATE !! 
df2 = groups.agg(d) # Rename the aggregated columns df2.columns = group_by_dims + lst_names # else: # # Pivot table # df2 = pd.pivot_table(df, # values=values, # index=group_by_dims, # aggfunc=[agg_funcs[0]], fill_value=np.NaN, margins=False, # dropna=True) # # Remove the multiindex in columns # df2.columns = [col[-1] for col in df2.columns.values] # # Remove the index # df2.reset_index(inplace=True) # The result, all columns (no index), is stored for later use ds = self._create_new_dataset(result_name, ds, df2, group_by_dims, out_names) except Exception as e: traceback.print_exc() issues.append((3, "There was a problem: " + str(e))) # Store the dataset in State datasets[result_name] = ds return issues, None
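# Minimal, self-contained sketch (not part of the original module) of the aggregation step used above:
# a dict mapping each measure column to a list of aggregation functions is passed to
# DataFrameGroupBy.agg(), and the resulting columns are renamed to "<agg>_<measure>".
# The data and column names below are hypothetical.
import numpy as np
import pandas as pd

df = pd.DataFrame({"region": ["A", "A", "B", "B"],
                   "value": [1.0, 2.0, 3.0, np.nan]})
d = {"value": [np.nanmean, np.nansum]}          # analogous to the OrderedDict "d" built above
df2 = df.groupby(by=["region"]).agg(d)          # columns become a (measure, function) MultiIndex
df2.columns = ["avgna_value", "sumna_value"]    # flattened names, as done with "lst_names"
df2 = df2.reset_index()                         # plain columns again: region, avgna_value, sumna_value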
def _get_factor_types_from_field(self, hierarchy_field_name: str,
                                 interface_type_field_name: str) -> List[FactorType]:
    """ Possibly obtain not only one but many InterfaceTypes """
    hierarchy_name = self._fields[hierarchy_field_name]
    interface_type_name = self._fields[interface_type_field_name]
    if not interface_type_name and not hierarchy_name:
        raise CommandExecutionError("No hierarchy nor interface type have been specified. "
                                    "At least specify one of them.")
    elif interface_type_name and hierarchy_name:
        interface_types = self._glb_idx.get(FactorType.partial_key(interface_type_name))
        if len(interface_types) == 1:
            return [interface_types[0]]
        elif len(interface_types) == 0:
            raise CommandExecutionError(f"The interface type '{interface_type_name}' has not been found")
        else:
            interface_type = first(interface_types, lambda t: strcmp(t.hierarchy.name, hierarchy_name))
            if not interface_type:
                raise CommandExecutionError(f"The interface type '{interface_type_name}' has not been found in "
                                            f"hierarchy '{hierarchy_name}'")
            return [interface_type]
    elif interface_type_name and not hierarchy_name:
        interface_types = self._glb_idx.get(FactorType.partial_key(interface_type_name))
        if len(interface_types) == 1:
            return [interface_types[0]]
        elif len(interface_types) == 0:
            raise CommandExecutionError(f"The interface type '{interface_type_name}' has not been found")
        else:
            raise CommandExecutionError(f"The field '{hierarchy_field_name}' has not been specified and "
                                        f"the interface type '{interface_type_name}' is not unique")
    else:  # not interface_type_name and hierarchy_name
        hie = self._glb_idx.get(Hierarchy.partial_key(hierarchy_name))
        if len(hie) == 1:
            # All children of "hierarchy_name"
            return [v for v in hie[0].codes.values()]
        elif len(hie) == 0:
            raise CommandExecutionError(f"The InterfaceTypes hierarchy '{hierarchy_name}' has not been found")
        else:
            raise CommandExecutionError(f"The InterfaceTypes hierarchy '{hierarchy_name}' has been found multiple times!!")
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name, state) -> IssuesLabelContentTripleType: """ Check that the syntax of the input spreadsheet is correct Return the analysis in JSON compatible format, for execution :param sh: Input worksheet :param area: Area of the input worksheet to be analysed :return: The command in a dict-list object (JSON ready) """ def obtain_column(cn, r1, r2): """ Obtain a list with the values of a column, in the range of rows [r1, r2) :param cn: Column number :param r1: Starting row :param r2: End+1 row :return: list with the cell values """ lst = [] for row in range(r1, r2): value = sh.cell(row=row, column=cn).value if value is None: continue if isinstance(value, str): lst.append(value.strip()) else: lst.append(value) return lst issues = [] # Global variables (at parse time they may not be defined, so process carefully...) glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects( state) # Look for the name of the input Dataset dataset_name = None available_at_datetime = None for c in range(area[2], area[3]): col_name = sh.cell(row=1, column=c).value if not col_name: continue if col_name.lower().strip() in ["inputdataset"]: lst = obtain_column(c, area[0] + 1, area[1]) for v in lst: if v: dataset_name = v break # Stop on first definition elif col_name.lower().strip() in ["availableatdatetime"]: lst = obtain_column(c, area[0] + 1, area[1]) for v in lst: if v: available_at_datetime = v break # Stop on first definition if dataset_name is None: issues.append( Issue( itype=IType.ERROR, description= f"The name of the input dataset must be specified under column 'InputDataset'. Skipping {name} command", location=IssueLocation(sheet_name=name, row=None, column=None))) return issues, None, None # Obtain the source from nexinfosys.ie_imports.data_source_manager import DataSourceManager source = DataSourceManager.obtain_dataset_source(dataset_name, datasets) # Obtain metadata dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets) # Load all code lists in a temporary dictionary of sets # Also check if there is a TIME dimension in the dataset cl = create_dictionary() we_have_time = False for d in dims: if dims[d].code_list: cl[d] = create_dictionary(data={ k: None for k in dims[d].code_list.keys() }) # Attach the code list else: cl[d] = None # No code list (TIME_PERIOD for instance) if dims[d].istime: we_have_time = True # Add matching mappings as more dimensions for m in mappings: if strcmp(mappings[m].source, source) and \ strcmp(mappings[m].dataset, dataset_name) and \ mappings[m].origin in dims: # Add a dictionary entry for the new dimension, add also the codes present in the map # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]] tmp = create_dictionary( data={ to["d"]: None for o in mappings[m].map for to in o["to"] if to["d"] }) cl[mappings[m]. destination] = tmp # [t[1] for t in mappings[m].map] # Scan columns for Dimensions, Measures and Aggregation. # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside. 
# TODO The result COULD be an automatic BI cube (with a separate field) # TODO - Write into a set of tables in Mondrian # TODO - Generate Schema for Mondrian # TODO - Write the Schema for Mondrian out_dims = [] out_measures = OrderedDict() for r in range(area[0] + 1, area[1] + 1): out_measures[r] = dict(measure=None, agg_func=None, measure_as=None) filter_ = { } # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement) result_name = None # By default, no name for the result. It will be dynamically obtained measure_names_column = None aggregations_column = None for c in range(area[2], area[3]): # Each column col_name = sh.cell(row=1, column=c).value if not col_name: continue if col_name.lower().strip() in ["resultdimensions", "dimensions"]: # "GROUP BY" lst = obtain_column(c, area[0] + 1, area[1]) for r, d in enumerate(lst): if not d: continue if d not in cl: issues.append( Issue( itype=IType.ERROR, description="The dimension specified for output, '" + d + "' is neither a dataset dimension nor a mapped dimension. [" + ', '.join([d2 for d2 in cl]) + "]", location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1))) else: out_dims.append(d) elif col_name.lower().strip() in ["resultmeasures", "measures"]: # "SELECT" measure_names_column = c lst = obtain_column(c, area[0] + 1, area[1]) # Check for measures # TODO (and attributes?) for r, m in enumerate(lst): if not m: continue if m not in meas: issues.append( Issue( itype=IType.ERROR, description="The specified measure, '" + m + "' is not a measure available in the dataset. [" + ', '.join( [m2["measure"] for m2 in out_measures.values]) + "]", location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1))) else: out_measures[r + area[0] + 1]["measure"] = m elif col_name.lower().strip() in [ "resultmeasuresaggregation", "resultmeasuresaggregator", "aggregation" ]: # "SELECT AGGREGATORS" aggregations_column = c lst = obtain_column(c, area[0] + 1, area[1]) for r, f in enumerate(lst): if not f: continue if f.lower() not in [ "sum", "avg", "count", "sumna", "countav", "avgna", "pctna" ]: issues.append( Issue( itype=IType.ERROR, description="The specified aggregation function, '" + f + "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'", location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1))) else: out_measures[r + area[0] + 1]["agg_func"] = f elif col_name.lower().strip() in [ "resultmeasurename", "resultmeasuresnames", "resultmeasuresas", "measuresas" ]: # "AS <name>" lst = obtain_column(c, area[0] + 1, area[1]) for r, m in enumerate(lst): out_measures[r + area[0] + 1]["measure_as"] = m elif col_name in cl: # A dimension -> "WHERE" # Check codes, and add them to the "filter" lst = obtain_column(c, area[0] + 1, area[1]) for r, cd in enumerate(lst): if not cd: continue if str(cd) not in cl[col_name]: issues.append( Issue( itype=IType.ERROR, description="The code '" + cd + "' is not present in the codes declared for dimension '" + col_name + "'. 
Please, check them.", location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1))) else: if col_name not in filter_: lst2 = [] filter_[col_name] = lst2 else: lst2 = filter_[col_name] lst2.append(cd) elif we_have_time and col_name.lower() in [ "startperiod", "starttime", "endperiod", "endtime" ]: # SPECIAL "WHERE" FOR TIME # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command # Interval of time periods lst = obtain_column(c, area[0] + 1, area[1]) if len(lst) > 0: if col_name.lower() == "starttime": col_name = "StartPeriod" elif col_name.lower() == "endtime": col_name = "EndPeriod" filter_[col_name] = lst[ 0] # In this case it is not a list, but a number or string !!!! elif col_name.lower() in [ "outputdatasetname", "outputdataset", "result_name", "result name", "resultname" ]: lst = obtain_column(c, area[0] + 1, area[1]) if len(lst) > 0: result_name = lst[0] try: parser_field_parsers.string_to_ast(simple_ident, result_name) except: issues.append( Issue(itype=IType.ERROR, description="Column '" + col_name + "' has an invalid dataset name '" + result_name + "'", location=IssueLocation(sheet_name=name, row=2, column=c + 1))) # If more than one agg function defined -> all must be defined # If no agg func defined -> assume AVG # If agg func defined only in first row -> extend to other columns agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]] if len(agg_funcs) > 1: first_agg_func = None elif len(agg_funcs) == 0: issues.append( Issue(itype=IType.WARNING, description= "No aggregation function specified. Assuming 'average'", location=IssueLocation(sheet_name=name, row=1, column=aggregations_column))) first_agg_func = "avg" else: # One aggregation function first_agg_func = out_measures[area[0] + 1]["agg_func"] if not first_agg_func: issues.append( Issue( itype=IType.ERROR, description= "The aggregation function must be defined in the first row", location=IssueLocation(sheet_name=name, row=1, column=aggregations_column))) if first_agg_func: for v in out_measures.values(): if v.get("measure", None): v["agg_func"] = first_agg_func # Uniform rows, with the three values defined: measure, aggregation function and "measure as" for r, v in out_measures.items(): measure = v.get("measure", None) agg_func = v.get("agg_func", None) measure_as = v.get("measure_as", None) if measure and not agg_func or not measure and agg_func: issues.append( Issue( itype=IType.ERROR, description= "Each measure must be associated with an aggregation function", location=IssueLocation(sheet_name=name, row=r, column=measure_names_column))) elif measure and not measure_as: v["measure_as"] = measure + "_" + agg_func measures = [v["measure"] for v in out_measures.values() if v["measure"]] measures_as = [ v["measure_as"] for v in out_measures.values() if v["measure_as"] ] agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]] if len(measures) == 0: issues.append( Issue(itype=IType.ERROR, description="At least one measure should be specified", location=IssueLocation(sheet_name=name, row=1, column=measure_names_column))) # measures != agg_funcs && len(agg_funcs) == 1 --> OK if len(measures) != len(agg_funcs) and len(agg_funcs) != 1: issues.append( Issue( itype=IType.ERROR, description= "There must be one aggregation function (used for all measures) or one aggregation per measure", location=IssueLocation(sheet_name=name, row=1, column=aggregations_column))) if not result_name: result_name = source + "_" + dataset_name 
issues.append( Issue(itype=IType.WARNING, description="No result name specified. Assuming '" + result_name + "'", location=IssueLocation(sheet_name=name, row=2, column=c + 1))) content = { "dataset_source": source, "dataset_name": dataset_name, "dataset_datetime": available_at_datetime, "where": filter_, "dimensions": [d for d in dims], "group_by": out_dims, "measures": measures, "agg_funcs": agg_funcs, "measures_as": measures_as, "result_name": result_name } return issues, None, content
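# Illustrative sketch (not part of the original module) of the "content" returned by
# parse_dataset_qry_command; the source, dataset, codes and measures below are hypothetical:
#
#   content = {
#       "dataset_source": "Eurostat",
#       "dataset_name": "nrg_bal_c",
#       "dataset_datetime": None,
#       "where": {"GEO": ["ES", "PT"], "StartPeriod": 2010, "EndPeriod": 2015},
#       "dimensions": ["GEO", "TIME_PERIOD", "NRG_BAL"],
#       "group_by": ["GEO"],
#       "measures": ["OBS_VALUE"],
#       "agg_funcs": ["sum"],
#       "measures_as": ["OBS_VALUE_sum"],
#       "result_name": "eurostat_nrg_bal_c"
#   }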
def execute(self, state: "State"): issues = [] glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects( state) name = self._content["command_name"] # List of available dataset names. The newly defined datasets must not be in this list ds_names = [ds.code for ds in datasets.values()] # List of datasets with local worksheet name external_dataset_names = [] for ds in datasets.values(): if ds.attributes["_location"].lower().startswith("data://#"): worksheet = ds.attributes["_location"][len("data://#"):] if not worksheet.lower().startswith("datasetdata "): worksheet = "DatasetData " + worksheet if strcmp(worksheet, name): external_dataset_names.append(ds.code) # Process parsed information for r, line in enumerate(self._content["items"]): # A dataset dataset_names = line["name"] if dataset_names == "": if external_dataset_names: dataset_names = external_dataset_names else: issues.append( Issue( itype=IType.ERROR, description= "The column name 'DatasetName' was not defined for command 'DatasetData' and there is no 'location' in a DatasetDef command pointing to it", location=IssueLocation(sheet_name=name, row=1, column=None))) else: dataset_names = [dataset_names] # Find it in the already available datasets. MUST EXIST for n in ds_names: for dataset_name in dataset_names: if strcmp(dataset_name, n): df = pd.read_json(StringIO(line["values"]), orient="split") # Check columns ds = datasets[n] iss = prepare_dataframe_after_external_read( ds, df, name) issues.extend(iss) # Everything ok? Store the dataframe! if not any_error_issue(iss): r = ds.attributes["_dataset_first_row"] # Loop over "ds" concepts. # - "dimension" concepts of type "string" generate a CodeHierarchy # - Check that the DataFrame contains ALL declared concepts. If not, generate issue # dims = translate_case([d.code for d in ds.dimensions], df.columns) cid = create_dictionary( data={col: col for col in df.columns}) col_names = list(df.columns) for c in ds.dimensions: if c.code in cid: col_names[df.columns.get_loc( cid[c.code])] = c.code # Rename column dsd_concept_data_type = c.attributes[ "_datatype"] if dsd_concept_data_type.lower( ) == "string" and not c.is_measure: # Freely defined dimension cl = df[cid[c.code]].unique().tolist() c.code_list = CodeList.construct( c.code, c.code, [""], codes=[ CodeImmutable(c, c, "", []) for c in cl ]) else: issues.append( Issue( itype=IType.ERROR, description= f"Concept '{c.code}' not defined for '{ds.code}'", location=IssueLocation( sheet_name=name, row=r, column=None))) df.columns = col_names ds.data = df dataset_names.remove(dataset_name) break if dataset_names: issues.append( Issue( itype=IType.ERROR, description= f"Metadata for the datasets: {','.join(dataset_names)}, must be defined previously", location=IssueLocation(sheet_name=name, row=-1, column=-1))) return issues, None
def process_line(item): # Read variables dsd_dataset_name = item.get("dataset_name", None) dsd_dataset_data_location = item.get("dataset_data_location", None) dsd_concept_type = item.get("concept_type", None) dsd_concept_name = item.get("concept_name", None) dsd_concept_data_type = item.get("concept_data_type", None) dsd_concept_domain = item.get("concept_domain", None) dsd_concept_description = item.get("concept_description", None) dsd_attributes = item.get("concept_attributes", None) if dsd_attributes: try: attributes = dictionary_from_key_value_list( dsd_attributes, glb_idx) except Exception as e: issues.append( Issue(itype=IType.ERROR, description=str(e), location=IssueLocation(sheet_name=name, row=r, column=None))) return else: attributes = {} if dsd_dataset_name in ds_names: issues.append( Issue(itype=IType.ERROR, description="The dataset '" + dsd_dataset_name + "' has been already defined", location=IssueLocation(sheet_name=name, row=r, column=None))) return # Internal dataset definitions cache ds = current_ds.get(dsd_dataset_name, None) if True: # Statistical dataset format if not ds: ds = Dataset() ds.code = dsd_dataset_name # Name ds.database = None ds.attributes = {} current_ds[dsd_dataset_name] = ds if not dsd_concept_type: if ds.attributes.get("_location"): issues.append( Issue( itype=IType.WARNING, description= f"Location of data for dataset {ds.code} previously declared. " f"Former: {attributes.get('_location')}, " f"Current: {dsd_dataset_data_location}", location=IssueLocation(sheet_name=name, row=r, column=None))) attributes = ds.attributes else: attributes["_dataset_first_row"] = r attributes[ "_location"] = dsd_dataset_data_location # Location ds.description = dsd_concept_description ds.attributes = attributes # Set attributes else: # If concept_type is defined => add a concept # Check if the concept name already appears --> Error for d1 in ds.dimensions: if strcmp(d1.code, dsd_concept_name): issues.append( Issue( itype=IType.ERROR, description= f"Concept {dsd_concept_name} already declared for dataset {ds.code}", location=IssueLocation(sheet_name=name, row=r, column=None))) break d = Dimension() d.dataset = ds d.description = dsd_concept_description d.code = dsd_concept_name d.is_measure = False if dsd_concept_type.lower( ) == "dimension" else True if not d.is_measure and dsd_concept_data_type.lower( ) == "time": d.is_time = True else: d.is_time = False if dsd_concept_type.lower() == "attribute": attributes["_attribute"] = True else: attributes["_attribute"] = False if dsd_concept_data_type.lower() == "category": # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or "hierarchy_categories_command.py", no insertion is made) # h = hierarchies.get(dsd_concept_domain, None) h = glb_idx.get( Hierarchy.partial_key(name=dsd_concept_domain)) if len(h) == 0: issues.append( Issue( itype=IType.ERROR, description= "Could not find hierarchy of Categories '" + dsd_concept_domain + "'", location=IssueLocation(sheet_name=name, row=r, column=None))) return elif len(h) > 1: issues.append( Issue( itype=IType.ERROR, description= "Found more than one instance of Categories '" + dsd_concept_domain + "'", location=IssueLocation(sheet_name=name, row=r, column=None))) return else: # len(h) == 1 h = h[0] d.hierarchy = h # Reencode the Hierarchy as a CodeList cl = convert_hierarchy_to_code_list(h) d.code_list = cl attributes["_datatype"] = dsd_concept_data_type attributes["_domain"] = dsd_concept_domain d.attributes = attributes
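# Hypothetical illustration of the two kinds of rows process_line() distinguishes in a DatasetDef
# sheet: a dataset-level row (no ConceptType) carrying the data location, and concept-level rows
# declaring dimensions and measures. All names and values are made up.
example_dsd_rows = [
    {"dataset_name": "ds_households", "dataset_data_location": "data://#DatasetData Households",
     "concept_type": None, "concept_name": None, "concept_data_type": None},
    {"dataset_name": "ds_households", "dataset_data_location": None,
     "concept_type": "Dimension", "concept_name": "Region", "concept_data_type": "String"},
    {"dataset_name": "ds_households", "dataset_data_location": None,
     "concept_type": "Measure", "concept_name": "Population", "concept_data_type": "Number"},
]
# A concept row is treated as a measure whenever its ConceptType is not "dimension"; a "String"
# dimension later receives a code list built from the distinct values found in the data
is_measure = [r["concept_type"] is not None and r["concept_type"].lower() != "dimension"
              for r in example_dsd_rows]
assert is_measure == [False, False, True]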
def parse_dataset_data_command(sh: Worksheet, area: AreaTupleType, name: str, state) -> IssuesLabelContentTripleType: """ Check that the syntax of the input spreadsheet is correct Return the analysis in JSON compatible format, for execution :param sh: Input worksheet :param area: Area of the input worksheet to be analysed :return: The command in a dict-list object (JSON ready) """ issues: List[Issue] = [] # Analyze column names col_map = create_dictionary() for c in range(area[2], area[3]): col_name = sh.cell(row=area[0], column=c).value.strip() # Avoid repetitions if col_name in col_map: issues.append(Issue(itype=IType.ERROR, description="The column name '"+col_name+"' is repeated", location=IssueLocation(sheet_name=name, row=1, column=c))) if strcmp(col_name, "DatasetName") or strcmp(col_name, "Dataset"): col_map["dataset"] = c elif col_name: # Concept name col_map[col_name] = c if any([i.itype == IType.ERROR for i in issues]): return issues, None, None # Read all the content into a list of lists lines = [] for r in range(area[0] + 1, area[1]): line = [] for col_name, c in col_map.items(): v = sh.cell(row=r, column=c).value if isinstance(v, str): v = v.strip() line.append(v) lines.append(line) # pd.DataFrame df = pd.DataFrame(columns=[col_name for col_name in col_map], data=lines) content = [] # The output JSON if "dataset" in df: # Find the different datasets datasets = df["dataset"].unique() datasets = set([d.lower() for d in datasets]) for dataset in datasets: # Obtain filtered df2 = df.loc[df['dataset'].str.lower() == dataset] # Convert to JSON and store in content del df2["dataset"] s = StringIO() df2.to_json(s, orient="split") content.append(dict(name=dataset, values=s.getvalue())) else: s = StringIO() df.to_json(s, orient="split") content.append(dict(name="", values=s.getvalue())) return issues, None, dict(items=content, command_name=name)
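# Sketch of the per-dataset split performed above, with hypothetical data: rows are grouped by the
# lower-cased "dataset" column and each group is serialized independently (the real code keeps the
# worksheet's column order and deletes the "dataset" column in place).
import pandas as pd
from io import StringIO

df_sheet = pd.DataFrame({"dataset": ["DS_A", "ds_a", "DS_B"],
                         "Region": ["ES", "PT", "FR"],
                         "Value": [1, 2, 3]})
items = []
for dataset in set(d.lower() for d in df_sheet["dataset"].unique()):
    df2 = df_sheet.loc[df_sheet["dataset"].str.lower() == dataset].drop(columns=["dataset"])
    buf = StringIO()
    df2.to_json(buf, orient="split")
    items.append(dict(name=dataset, values=buf.getvalue()))
assert {i["name"] for i in items} == {"ds_a", "ds_b"}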
def _process_row(self, field_values: Dict[str, Any], subrow=None) -> None: """ Process a dictionary representing a row of the Interfaces command. The dictionary can come directly from the worksheet or from a dataset. :param field_values: dictionary """ # f_processor_name -> p # f_interface_type_name -> it # f_interface_name -> i # # IF NOT i AND it AND p => i_name = it.name => get or create "i" # IF i AND it AND p => get or create "i", IF "i" exists, i.it MUST BE equal to "it" (IF NOT, error) # IF i AND p AND NOT it => get "i" (MUST EXIST) f_interface_type_name = field_values.get("interface_type") f_interface_name = field_values.get("interface") if not f_interface_name: if not f_interface_type_name: raise CommandExecutionError( "At least one of InterfaceType or Interface must be defined" + subrow_issue_message(subrow)) f_interface_name = f_interface_type_name processor = self.find_processor(field_values.get("processor"), subrow) # Try to find Interface f_orientation = field_values.get("orientation") interface_type: Optional[FactorType] = None interface: Optional[Factor] = None interfaces: Sequence[Factor] = self._glb_idx.get( Factor.partial_key(processor=processor, name=f_interface_name)) if len(interfaces) == 1: interface = interfaces[0] print(f"DEBUG - Interface '{interface.name}' found") interface_type = interface.taxon if f_interface_type_name and not strcmp(interface_type.name, f_interface_type_name): self._add_issue( IType.WARNING, f"The existing Interface '{interface.name}' has the InterfaceType " f"'{interface_type.name}' which is different from the specified " f"InterfaceType '{f_interface_type_name}'. Record skipped." + subrow_issue_message(subrow)) return elif len(interfaces) > 1: raise CommandExecutionError( f"Interface '{f_interface_name}' found {str(len(interfaces))} times. " f"It must be uniquely identified." + subrow_issue_message(subrow)) elif len(interfaces) == 0: # The interface does not exist, create it below if not f_orientation: raise CommandExecutionError( f"Orientation must be defined for new Interfaces." + subrow_issue_message(subrow)) # InterfaceType still not found if not interface_type: interface_type_name = ifnull(f_interface_type_name, f_interface_name) # Find FactorType # TODO Allow creating a basic FactorType if it is not found? interface_types: Sequence[FactorType] = self._glb_idx.get( FactorType.partial_key(interface_type_name)) if len(interface_types) == 0: raise CommandExecutionError( f"InterfaceType '{interface_type_name}' not declared previously" + subrow_issue_message(subrow)) elif len(interface_types) > 1: raise CommandExecutionError( f"InterfaceType '{interface_type_name}' found {str(len(interface_types))} times. " f"It must be uniquely identified." + subrow_issue_message(subrow)) else: interface_type = interface_types[0] # Get attributes default values taken from Interface Type or Processor attributes # Rows : value of (source) "processor.subsystem_type" # Columns: value of (target) "interface_type.opposite_processor_type" # Cells : CORRECTED value of "opposite_processor_type" # +--------+-------+--------+-------+---------+ # | | Local | Env | Ext | ExtEnv | # +--------+-------+--------+-------+---------+ # | Local | Local | Env | Ext | ExtEnv | # | Env | Local | Env | Ext | ExtEnv? | # | Ext | Ext | ExtEnv | Local | Env | # | ExtEnv | Ext | ExtEnv | Local | Env? 
| # +--------+-------+--------+-------+---------+ if interface_type.opposite_processor_type: tmp = interface_type.opposite_processor_type.lower() if processor.subsystem_type.lower() in ["local", "environment" ]: # First two rows opposite_processor_type = tmp else: opposite_processor_type = InterfacesAndQualifiedQuantitiesCommand.invert[ tmp] # TODO in doubt. Maybe these are undefined (values with question mark in the table) # if tmp == "externalenvironment" and processor.subsystem_type.lower() in ["environment", "externalenvironment"]: # pass else: opposite_processor_type = None interface_type_values = { "sphere": interface_type.sphere, "roegen_type": interface_type.roegen_type, "opposite_processor_type": opposite_processor_type } # Get internal and user-defined attributes in one dictionary # Use: value specified in Interfaces ELSE value specified in InterfaceTypes ELSE first value of allowed values attributes = { c.name: ifnull( field_values[c.name], ifnull(interface_type_values.get(c.name), head(c.allowed_values))) for c in self._command_fields if c.attribute_of == Factor } if not interface: # f_list: Sequence[Factor] = self._glb_idx.get( # Factor.partial_key(processor=p, factor_type=ft, orientation=f_orientation)) # # if len(f_list) > 0: # raise CommandExecutionError(f"An interface called '{f_list[0].name}' for Processor '{f_processor_name}'" # f" with InterfaceType '{f_interface_type_name}' and orientation " # f"'{f_orientation}' already exists"+subrow_issue_message(subrow)) # Transform text of "interface_attributes" into a dictionary interface_attributes = self.transform_text_attributes_into_dictionary( field_values.get("interface_attributes"), subrow) attributes.update(interface_attributes) location = self.get_location(field_values.get("location"), subrow) interface = Factor.create_and_append( f_interface_name, processor, in_processor_type=FactorInProcessorType(external=False, incoming=False), taxon=interface_type, geolocation=location, tags=None, attributes=attributes) self._glb_idx.put(interface.key(), interface) print(f"DEBUG - Interface '{interface.name}' created") elif not interface.compare_attributes(attributes): initial = ', '.join( [f"{k}: {interface.get_attribute(k)}" for k in attributes]) new = ', '.join([f"{k}: {attributes[k]}" for k in attributes]) name = interface.processor.full_hierarchy_names( self._glb_idx)[0] + ":" + interface.name raise CommandExecutionError( f"The same interface '{name}', is being redeclared with different properties. " f"INITIAL: {initial}; NEW: {new}." + subrow_issue_message(subrow)) f_unit = field_values.get("unit") if not f_unit: f_unit = interface_type.unit # Unify unit (it must be done before considering RelativeTo -below-, because it adds a transformation to "f_unit") f_value = field_values.get("value") if f_value is not None and f_unit != interface_type.unit: try: f_value = UnitConversion.convert(f_value, f_unit, interface_type.unit) except DimensionalityError: raise CommandExecutionError( f"Dimensions of units in InterfaceType ({interface_type.unit}) and specified ({f_unit}) are not convertible" + subrow_issue_message(subrow)) f_unit = interface_type.unit # Search for a relative_to interface f_relative_to = field_values.get("relative_to") relative_to_interface: Optional[Factor] = None if f_relative_to: try: ast = parser_field_parsers.string_to_ast( parser_field_parsers.factor_unit, f_relative_to) except: raise CommandExecutionError( f"Could not parse the RelativeTo column, value {str(f_relative_to)}. 
" + subrow_issue_message(subrow)) relative_to_interface_name = ast_to_string(ast["factor"]) # rel_unit_name = ast["unparsed_unit"] # try: # f_unit = str((ureg(f_unit) / ureg(rel_unit_name)).units) # except (UndefinedUnitError, AttributeError) as ex: # raise CommandExecutionError(f"The final unit could not be computed, interface '{f_unit}' / " # f"relative_to '{rel_unit_name}': {str(ex)}"+subrow_issue_message(subrow)) relative_to_interface = first( interface.processor.factors, lambda ifc: strcmp(ifc.name, relative_to_interface_name)) if not relative_to_interface: raise CommandExecutionError( f"Interface specified in 'relative_to' column " f"'{relative_to_interface_name}' has not been found." + subrow_issue_message(subrow)) if f_value is None and relative_to_interface is not None: # Search for a Interface Type Conversion defined in the ScaleChangeMap command interface_types_transforms: List[FactorTypesRelationUnidirectionalLinearTransformObservation] = \ find_factor_types_transform_relation(self._glb_idx, relative_to_interface.taxon, interface.taxon, processor, processor) # Overwrite any specified unit, it doesn't make sense without a value, i.e. it cannot be used for conversion f_unit = interface.taxon.unit if len(interface_types_transforms) == 1: f_value = interface_types_transforms[0].scaled_weight else: interface_types_transforms_message = "an interface type conversion doesn't exist" \ if (len(interface_types_transforms) == 0) \ else f"{len(interface_types_transforms)} interface type conversions exist" f_value = "0" self._add_issue( IType.WARNING, f"Field 'value' should be defined for interfaces having a " f"'RelativeTo' interface, and {interface_types_transforms_message}. " f"Using value '0'." + subrow_issue_message(subrow)) # Create quantitative observation if f_value is not None: f_uncertainty = field_values.get("uncertainty") f_assessment = field_values.get("assessment") f_pedigree_matrix = field_values.get("pedigree_matrix") f_pedigree = field_values.get("pedigree") f_time = field_values.get("time") f_comments = field_values.get("comments") f_source = field_values.get("qq_source") # TODO: source is not being used source = self.get_source(f_source, subrow) # Find Observer observer: Optional[Observer] = None if f_source: observer = self._glb_idx.get_one( Observer.partial_key(f_source)) if not observer: self._add_issue( IType.WARNING, f"Observer '{f_source}' has not been found." + subrow_issue_message(subrow)) # If an observation exists then "time" is mandatory if not f_time: raise CommandExecutionError( f"Field 'time' needs to be specified for the given observation." + subrow_issue_message(subrow)) # An interface can have multiple observations if each of them have a different [time, observer] combination for observation in interface.quantitative_observations: observer_name = observation.observer.name if observation.observer else None if strcmp(observation.attributes["time"], f_time) and strcmp( observer_name, f_source): raise CommandExecutionError( f"The interface '{interface.name}' in processor '{interface.processor.name}' already has an " f"observation with time '{f_time}' and source '{f_source}'." 
) self.check_existence_of_pedigree_matrix(f_pedigree_matrix, f_pedigree, subrow) # Transform text of "number_attributes" into a dictionary number_attributes = self.transform_text_attributes_into_dictionary( field_values.get("number_attributes"), subrow) # Create or extend the quantitative observation on the interface, carrying unit, uncertainty, assessment, pedigree, time and comments o = _create_or_append_quantitative_observation( interface, f_value, f_unit, f_uncertainty, f_assessment, f_pedigree, f_pedigree_matrix, observer, relative_to_interface, f_time, None, f_comments, None, number_attributes)
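# Hedged sketch of the unit harmonization step above, using pint directly; the project's
# UnitConversion helper is assumed to wrap something equivalent. Units are hypothetical.
from pint import DimensionalityError, UnitRegistry

ureg = UnitRegistry()

def convert_value(value: float, from_unit: str, to_unit: str) -> float:
    """Convert a magnitude between two units, failing loudly when dimensions differ."""
    try:
        return (value * ureg(from_unit)).to(to_unit).magnitude
    except DimensionalityError:
        raise ValueError(f"Units '{from_unit}' and '{to_unit}' are not convertible")

assert abs(convert_value(1000.0, "kWh", "MWh") - 1.0) < 1e-9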
def export_model_to_xml( registry: PartialRetrievalDictionary ) -> Tuple[str, Dict[str, Processor]]: """ Elaborate an XML string containing the nested processors and their attributes. Also the interfaces inside processors <processors> <root_p1 fullname="" level="" system="" subsystem="" functional="true|false"> <interfaces> <i1 type="" sphere="" roegen_type="" orientation="" opposite_processor_type="" /> ... </interfaces> <child_p2> ... </child_p2> </root_p1> ... </processors> Example (abstract): '/processors//[level="n"]' :param registry: :return: """ def xml_processor(p: Processor, registry: PartialRetrievalDictionary, p_map: Dict[str, Processor]): """ Return the XML of a processor Recursive into children :param p: :return: """ def xml_interface(iface: Factor): """ :param iface: :return: """ s = f'<{iface.name} type="{iface.taxon.name}" sphere="{iface.sphere}" ' \ f'roegen_type="{iface.roegen_type}" orientation="{iface.orientation}" ' \ f'opposite_processor_type="{iface.opposite_processor_type}" />' if case_sensitive: return s else: return s.lower() children = p.children(registry) full_name = p.full_hierarchy_names(registry)[0] if case_sensitive: p_map[full_name] = p else: p_map[full_name.lower()] = p s = f""" <{p.name} fullname="{full_name}" level="{p.level}" system="{p.processor_system}" subsystem="{p.subsystem_type}" functional="{"true" if strcmp(p.functional_or_structural, "Functional") else "false"}" > <interfaces> {chr(10).join([xml_interface(f) for f in p.factors])} </interfaces> {chr(10).join([xml_processor(c, registry, p_map) for c in children])} </{p.name}>""" if case_sensitive: return s else: return s.lower() # Part of relationships por = registry.get(ProcessorsRelationPartOfObservation.partial_key()) # Set of all instance processors NOT touched by part-of relationships unaffected_procs = set([ p for p in registry.get(Processor.partial_key()) if strcmp(p.instance_or_archetype, "Instance") ]) for po in por: try: unaffected_procs.remove(po.parent_processor) except KeyError: pass try: unaffected_procs.remove(po.child_processor) except KeyError: pass # Keep those affecting Instance processors por = [ po for po in por if strcmp(po.parent_processor.instance_or_archetype, "Instance") ] # Get root processors (set of processors not appearing as child_processor) parents = set([po.parent_processor for po in por]) children = set([po.child_processor for po in por]) roots = parents.difference(children).union(unaffected_procs) # leaves = children.difference(parents) result = '<processors>' # <?xml version="1.0" encoding="utf-8"?>\n p_map = {} for p in roots: result += xml_processor(p, registry, p_map) result += "\n</processors>" return result, p_map
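# Usage sketch: the XML string returned above can be queried with simple XPath-like expressions,
# for instance selecting processors by hierarchy level. The XML literal below is a hand-written,
# hypothetical stand-in for what export_model_to_xml would actually produce (most attributes omitted).
import xml.etree.ElementTree as ET

example_xml = ('<processors>'
               '<society fullname="society" level="n">'
               '<crop fullname="society.crop" level="n+1"/>'
               '</society>'
               '</processors>')
root = ET.fromstring(example_xml)
selected = root.findall(".//*[@level='n+1']")
assert [e.get("fullname") for e in selected] == ["society.crop"]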
def _process_row(self, field_values: Dict[str, Any], subrow=None) -> None: # Transform text of "attributes" into a dictionary if field_values.get("attributes"): try: field_values["attributes"] = dictionary_from_key_value_list( field_values["attributes"], self._glb_idx) except Exception as e: self._add_issue(IType.ERROR, str(e) + subrow_issue_message(subrow)) return else: field_values["attributes"] = {} # Process specific fields # Obtain the parent: it must exist. It could be created dynamically but it's important to specify attributes if field_values.get("parent_processor"): try: parent_processor = self._get_processor_from_field( "parent_processor") # parents = find_processors_matching_name(parent_processor) # if len(parents) > 1: # self._add_issue(IType.WARNING, # f"Parent processor '{parent_processor}' not unique. Matches: {', '.join(p.hierarchical_names[0] for p in parents)}. Skipped." + subrow_issue_message(subrow)) # return except CommandExecutionError: self._add_issue( IType.ERROR, f"Specified parent processor, '{field_values.get('parent_processor')}', does not exist" + subrow_issue_message(subrow)) return else: parent_processor = None behave_as_processor: Optional[Processor] = None if field_values.get("behave_as_processor"): try: behave_as_processor = self._get_processor_from_field( "behave_as_processor") except CommandExecutionError: self._add_issue( IType.WARNING, f"Specified 'behave as' processor, '{field_values.get('behave_as_processor')}', does not exist, value ignored" + subrow_issue_message(subrow)) # Find or create processor and REGISTER it in "glb_idx" # TODO Now, only Simple name allowed # TODO Improve allowing hierarchical names, and hierarchical names with wildcards pgroup = field_values.get("processor_group") # Get internal and user-defined attributes in one dictionary attributes = { c.name: field_values[c.name] for c in self._command_fields if c.attribute_of == Processor } attributes.update(field_values["attributes"]) attributes["processor_group"] = pgroup # Needed to support the new name of the field, "Accounted" (previously it was "InstanceOrArchetype") # (internally the values have the same meaning, "Instance" for a processor which has to be accounted, # "Archetype" for a processor which hasn't) v = attributes.get("instance_or_archetype", None) if strcmp(v, "Yes"): v = "Instance" elif strcmp(v, "No"): v = "Archetype" if v: attributes["instance_or_archetype"] = v name = field_values["processor"] p_names, _ = obtain_name_parts(name) geolocation = Geolocation.create(field_values["geolocation_ref"], field_values["geolocation_code"]) ps = find_processors_matching_name(name, self._glb_idx) more_than_one = len(ps) > 1 simple = len(p_names) == 1 exists = True if len(ps) == 1 else False # SIMPLE? EXISTS? PARENT? ACTION: # Yes Yes Yes NEW; HANG FROM PARENT # Yes Yes No Warning: repeated # Yes No Yes NEW; HANG FROM PARENT # Yes No No NEW # No Yes Yes Warning: cannot hang from parent # No Yes No Warning: repeated AND not simple not allowed # No No Yes Warning: cannot create more than one processor AND not simple not allowed # No No No Warning: cannot create more than one processor AND not simple not allowed create_new = False if not simple: if not parent_processor: self._add_issue( IType.WARNING, f"When a processor does not have parent, the name must be simple. Skipped." + subrow_issue_message(subrow)) return else: if exists and not parent_processor: self._add_issue( IType.WARNING, f"Repeated declaration of {name}. Skipped." 
+ subrow_issue_message(subrow)) return create_new = True if create_new: p = find_or_create_processor(state=self._glb_idx, name=name, proc_attributes=attributes, proc_location=geolocation) else: if exists: p = ps[0] # Add to ProcessorsGroup, if specified if pgroup: p_set = self._p_sets.get(pgroup, ProcessorsSet(pgroup)) self._p_sets[pgroup] = p_set if p_set.append( p, self._glb_idx ): # Appends codes to the pset if the processor was not member of the pset p_set.append_attributes_codes(field_values["attributes"]) # If geolocation specified, check if it exists # Inside it, check it the code exists if p.geolocation and p.geolocation.reference: # Geographical reference gr = self._glb_idx.get( GeographicReference.partial_key(name=p.geolocation.reference)) if len(gr) == 0: self._add_issue( IType.ERROR, f"Geographical reference {p.geolocation.reference} not found " + subrow_issue_message(subrow)) return if p.geolocation.reference and not p.geolocation.code: self._add_issue( IType.ERROR, f"Geographical reference was specified but not the code in it " + subrow_issue_message(subrow)) return geo_id = p.geolocation.code try: url = gr[0].attributes["data_location"] except: self._add_issue( IType.ERROR, f"URL not found in geographical reference {p.geolocation.reference} " + subrow_issue_message(subrow)) return try: j, ids = read_geojson( url ) # READ the file!! (or get it from cache). Could take some time... except: self._add_issue( IType.ERROR, f"URL {url} in reference {p.geolocation.reference} could not be read " + subrow_issue_message(subrow)) return if geo_id not in ids: self._add_issue( IType.WARNING, f"Could not find code {geo_id} in file {url}, geographical reference {p.geolocation.reference} " + subrow_issue_message(subrow)) # Add Relationship "part-of" if parent was specified # The processor may have previously other parent processors that will keep its parentship if parent_processor: # Create "part-of" relationship if len( self._glb_idx.get( ProcessorsRelationPartOfObservation.partial_key( parent_processor, p))) > 0: self._add_issue( IType.WARNING, f"{p.name} is already part-of {parent_processor.name}. Skipped." + subrow_issue_message(subrow)) return o1 = ProcessorsRelationPartOfObservation.create_and_append( parent_processor, p, None, behave_as=behave_as_processor, weight=field_values.get("parent_processor_weight")) # Part-of self._glb_idx.put(o1.key(), o1) for hname in parent_processor.full_hierarchy_names(self._glb_idx): p_key = Processor.partial_key(f"{hname}.{p.name}", p.ident) if attributes: p_key.update({ k: ("" if v is None else v) for k, v in attributes.items() }) self._glb_idx.put(p_key, p)
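# Standalone restatement of the decision table documented inside _process_row above
# (simple name?, already exists?, parent given? -> action). Purely illustrative; the command's
# real control flow is in the method itself.
PROCESSOR_DECISION_TABLE = {
    # (simple, exists, parent): action
    (True,  True,  True):  "new processor; hang from parent",
    (True,  True,  False): "warning: repeated declaration",
    (True,  False, True):  "new processor; hang from parent",
    (True,  False, False): "new processor",
    (False, True,  True):  "warning: cannot hang from parent",
    (False, True,  False): "warning: repeated, and non-simple names not allowed",
    (False, False, True):  "warning: cannot create more than one processor; non-simple names not allowed",
    (False, False, False): "warning: cannot create more than one processor; non-simple names not allowed",
}
assert PROCESSOR_DECISION_TABLE[(True, False, False)] == "new processor"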
def parse_mapping_command(sh: Worksheet, area: AreaTupleType, origin, destination) -> IssuesLabelContentTripleType: """ Map from a set of categories from an external dataset into a set of MuSIASEM categories If the categories do not exist, they are created flat. Later they can be turned into a hierarchy and the mapping will still hold The syntax of the mapping allows expressing MANY to ONE and also MANY to MANY correspondence. The mapping has to be complete (all elements from left side must be covered, if not "" is assumed on the right side) :param sh: Input worksheet :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the command is present :param origin: :param destination: :return: list of issues (issue_type, message), command label, command content """ some_error = False issues = [] # Analyze Origin cell = sh.cell(row=area[0], column=area[2]) col_name = cell.value if origin: if not strcmp(origin, col_name): some_error = True issues.append(( 3, "The Origin name is different in the sheet name and in the worksheet (" + origin + ", " + col_name + ")")) else: origin = col_name # Obtain the source, the dataset and the dimension of "origin" spl = origin.split(".") if len(spl) == 3: # Source.Dataset.Dimension s, ds, dim = spl s = s + "." origin_ok = True elif len(spl) == 2: # Dataset.Dimension ds, dim = spl s = "" origin_ok = True else: origin_ok = False some_error = True issues.append(( 3, "Origin must specify a dataset and a dimension name separated by '.'" )) if origin_ok: origin_dataset = s + ds origin_dim = dim if not check_dataset_exists(origin_dataset): some_error = True issues.append((3, "The Origin '" + origin_dataset + "' does not match any registered dataset")) else: dims, attrs, meas = obtain_dataset_metadata(ds) if origin_dim not in dims: some_error = True issues.append( (3, "The Origin dataset '" + origin_dataset + "' does not have a dimension '" + origin_dim + "'")) # Analyze Destination cell = sh.cell(row=area[0], column=area[2] + 1) col_name = cell.value if destination: if not strcmp(destination, col_name): some_error = True issues.append(( 3, "The Destination name is different in the sheet name and in the worksheet (" + destination + ", " + col_name + ")")) else: destination = col_name # Destination name must be a simple identity try: parser_field_parsers.simple_ident.parseString(destination, parseAll=True) except: some_error = True issues.append((3, "'" + destination + "' category name has to be a simple identifier")) if some_error: # Issues at this point are errors, return if there are any return issues, None, None # Read mapping Origin to Destination o_dict = create_dictionary() for r in range(area[0] + 1, area[1]): o_value = sh.cell(row=r, column=area[2]).value # First column -> Origin d_value = sh.cell(row=r, column=area[2] + 1).value # Second column -> Destination try: exp_value = sh.cell( row=r, column=area[2] + 2).value # Third column -> Weight (for Many to Many mappings) if exp_value: try: exp_value = float(exp_value) except: # If it is not possible, it maybe an expression, postpone conversion until usage pass else: exp_value = 1.0 # If undefined -> Many to One except: exp_value = 1.0 # If undefined -> Many to One if not o_value and not d_value: # issues.append((2, "Row " + str(r) + ": Origin and Destination are not defined. Row skipped.")) continue elif not o_value or not d_value: if not o_value and d_value: issues.append( (2, "Row " + str(r) + ": Origin not defined. 
Row skipped.")) else: issues.append((2, "Row " + str(r) + ": Destination not defined. Row skipped.")) continue o_value = str(o_value).lower() d_value = str(d_value).lower() if o_value in o_dict: lst = o_dict[o_value] else: lst = [] o_dict[o_value] = lst # Check "d_value" is not being repeated for "o_value" if (len(lst) == 0) or (len(lst) >= 1 and d_value not in [d["d"] for d in lst]): lst.append({"d": d_value, "w": exp_value}) else: issues.append((3, "Destination category '" + destination + "' has been repeated for origin category '" + o_value + "' at row '" + str(r) + "'")) # List of dictionaries, where each dictionary contains the specification of an origin "o" # For multiple entries (many to many map), the origin maps a list "to" of dictionaries "d", "e" content = { "origin_dataset": origin_dataset, # Name of the origin dataset (may include the source name) "origin_dimension": origin_dim, # Name of the origin dimension inside the dataset "destination": destination, # Name of the destination hierarchy "map": [{ "o": k, "to": v } for k, v in o_dict.items()] } label = ((content["origin_dataset"] + ".") if origin_dataset else "" ) + content["origin_dimension"] + " -> " + content["destination"] return issues, label, content
def _init_and_process_row(self, row: Dict[str, Any]) -> None: def obtain_dictionary_with_not_expandable_fields(d): output = {} for k, v in d.items(): if v is None or "{" not in v: output[k] = v return output self._current_row_number = row["_row"] self._fields = self._get_command_fields_values(row) tmp_fields = self._fields self._check_all_mandatory_fields_have_values() # If expandable, do it now expandable = row["_expandable"] if expandable: # Extract variables state = State() issues = [] asts = {} referenced_variables = create_dictionary() for e in expandable: ast = parser_field_parsers.string_to_ast( arith_boolean_expression, e) c_name = f"{{{e}}}" asts[c_name] = ast res, vars = ast_evaluator(ast, state, None, issues, atomic_h_names=True) for v in vars: referenced_variables[v] = None res = classify_variables2(referenced_variables.keys(), self._datasets, self._hierarchies, self._parameters) ds_list = res["datasets"] ds_concepts = res["ds_concepts"] h_list = res["hierarchies"] if len(ds_list) >= 1 and len(h_list) >= 1: self._add_issue( itype=IType.ERROR, description="Dataset(s): " + ", ".join([d.name for d in ds_list]) + ", and hierarchy(ies): " + ", ".join([h.name for h in h_list]) + ", have been specified. Only a single dataset is supported." ) return elif len(ds_list) > 1: self._add_issue( itype=IType.ERROR, description="More than one dataset has been specified: " + ", ".join([d.name for d in ds_list]) + ", just one dataset is supported.") return elif len(h_list) > 0: self._add_issue( itype=IType.ERROR, description="One or more hierarchies have been specified: " + ", ".join([h.name for h in h_list])) return if len(ds_list) == 1: # Expand dataset ds = ds_list[0] measure_requested = False all_dimensions = set( [c.code for c in ds.dimensions if not c.is_measure]) requested_dimensions = set() requested_measures = set() for con in ds_concepts: found = False for c in ds.dimensions: if strcmp(c.code, con): found = True if c.is_measure: measure_requested = True requested_measures.add(c.code) else: # Dimension all_dimensions.remove(c.code) requested_dimensions.add(c.code) if not found: self._add_issue( itype=IType.ERROR, description= f"The concept '{{{ds.code}.{con}}}' is not in the dataset '{ds.code}'" ) return ds_concepts = list(requested_measures) ds_concepts.extend(list(requested_dimensions)) all_dimensions_requested = len(all_dimensions) == 0 if measure_requested and not all_dimensions_requested: self._add_issue( IType.ERROR, f"It is not possible to use a measure ({', '.join(requested_measures)}), if not all dimensions are used " f"(cannot assume implicit aggregation). Dimensions not used: {', '.join(all_dimensions)}" ) return elif not measure_requested and not all_dimensions_requested: # Reduce the Dataframe to unique tuples of the specified dimensions # TODO Consider the current case -sensitive or not-sensitive- data = ds.data[list( requested_dimensions)].drop_duplicates() else: # Take the dataset as-is data = ds.data # Remove Index, and do it NOT-INPLACE data = data.reset_index() # Drop rows with empty dimension value import numpy as np data = data.replace(r'^\s*$', np.NaN, regex=True) data.dropna(subset=requested_dimensions, inplace=True) const_dict = obtain_dictionary_with_not_expandable_fields( self._fields) # row? 
var_dict = set( [f for f in self._fields.keys() if f not in const_dict]) re_concepts = {} for c in ds_concepts: c_name = f"{{{ds.code}.{c}}}" if case_sensitive: re_concepts[c_name] = re.compile(c_name) else: re_concepts[c_name] = re.compile(c_name, re.IGNORECASE) location = IssueLocation(sheet_name=self._command_name, row=self._current_row_number, column=None) already_parsed_fields = set(const_dict.keys()) for ds_row, row2 in enumerate( data.iterrows()): # Each row in the dataset # Initialize constant values (those with no "{..}" expressions) row3 = const_dict.copy() # Prepare state to evaluate functions state = State() for c in ds_concepts: state.set(f"{ds.code}.{c}", str(row2[1][c])) state.set( "_glb_idx", self._glb_idx ) # Pass PartialRetrievalDictionary to the evaluator. For functions needing it # Evaluate all functions expressions = {} for e, ast in asts.items(): res, vars = ast_evaluator(ast, state, None, issues, atomic_h_names=True) expressions[e] = res # Expansion into var_dict for f in var_dict: v = self._fields[f] # Initial value for item in sorted(expressions.keys(), key=len, reverse=True): v = v.replace(item, expressions[item]) row3[f] = v # # Concepts change dictionary # concepts = {} # for c in ds_concepts: # concepts[f"{{{ds.code}.{c}}}"] = str(row2[1][c]) # # Expansion into var_dict # for f in var_dict: # v = self._fields[f] # Initial value # for item in sorted(concepts.keys(), key=len, reverse=True): # v = re_concepts[item].sub(concepts[item], v) # row3[f] = v # Syntactic verification of the resulting expansion processable, tmp_issues = parse_cmd_row_dict( self._serialization_type, row3, already_parsed_fields, location) if len(tmp_issues) > 0: self._issues.extend(tmp_issues) # Process row if processable: self._fields = row3 self._process_row(row3, ds_row) self._fields = tmp_fields elif len(h_list) == 1: # Expand hierarchy pass else: self._process_row(self._fields) # Process row
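# Minimal sketch of the per-row expansion done above: every "{<dataset>.<concept>}" placeholder in
# a field value is replaced by that concept's value in the current dataset row (the real code
# additionally evaluates arithmetic/boolean expressions with ast_evaluator). Names are hypothetical.
def expand_fields(fields: dict, ds_code: str, ds_row: dict) -> dict:
    expanded = {}
    for field_name, value in fields.items():
        if value is None or "{" not in value:
            expanded[field_name] = value          # constant field, copied as-is
            continue
        for concept, concept_value in ds_row.items():
            value = value.replace(f"{{{ds_code}.{concept}}}", str(concept_value))
        expanded[field_name] = value
    return expanded

assert expand_fields({"processor": "Farm_{ds1.Region}", "geolocation_code": None},
                     "ds1", {"Region": "ES"}) == {"processor": "Farm_ES", "geolocation_code": None}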
def execute(self, state: "State"): """ For each parent processor clone all the child processors. The cloning process may pass some factor observation, that may result in """ some_error = False issues = [] parent_processor_type = self._content["parent_processor_type"] child_processor_type = self._content["child_processor_type"] scaled_factor = self._content["scaled_factor"] source = self._content["source"] # column_headers = self._content["column_headers"] # row_headers = self._content["row_headers"] scales = self._content["scales"] # Find processor sets, for parent and child glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects( state) if parent_processor_type not in p_sets: some_error = True issues.append(( 3, "The processor type '" + parent_processor_type + "' (appointed for parent) has not been found in the commands execute so far" )) if child_processor_type not in p_sets: some_error = True issues.append(( 3, "The processor type '" + child_processor_type + "' (should be child processor) has not been found in the commands execute so far" )) if some_error: return issues, None # CREATE the Observer of the Upscaling oer = glb_idx.get(Observer.partial_key(source)) if not oer: oer = Observer(source) glb_idx.put(oer.key(), oer) else: oer = oer[0] # Processor Sets have associated attributes, and each of them has a code list parent = p_sets[parent_processor_type] # type: ProcessorsSet child = p_sets[child_processor_type] # type: ProcessorsSet # Form code lists from the command specification code_lists = None for sc_dict in scales: codes = sc_dict["codes"] if not code_lists: code_lists = [set() for _ in codes] for i, c in enumerate(codes): code_lists[i].add(c) # Match existing code lists (from Processor attributes) with the ones gathered in the specification of # the two (parent and child) processors sets. # Form lists of attributes of processors used in the code lists parent_attrs = [] child_attrs = [] matched = [] for i, cl in enumerate(code_lists): found = False for attr, attr_values in parent.attributes.items(): if set(attr_values).issuperset(cl): parent_attrs.append( (attr, i)) # (Attribute, code list index) found = True break for attr, attr_values in child.attributes.items(): if set(attr_values).issuperset(cl): child_attrs.append( (attr, i)) # (Attribute, code list index) found = True break matched.append(found) for i, found in enumerate(matched): if not found: cl = code_lists[i] # TODO Try cl as a list of names of parent or child processors if not found: issues.append(( 2, "The code list: " + ", ".join(cl) + " is not contained in the attributes of the parent processors set '" + parent_processor_type + "' nor in the attributes of the child processors set '" + child_processor_type + "'")) # Execute the upscale for each cached_processors = {} for sc_dict in scales: try: non_zero_weight = math.fabs(float(sc_dict["weight"])) > 1e-6 except: non_zero_weight = True if not non_zero_weight: continue codes = sc_dict["codes"] # Find parent processor parent_dict = {attr: codes[i] for attr, i in parent_attrs} d2s = str(parent_dict) if d2s in cached_processors: parent = cached_processors[d2s] if not parent: issues.append(( 3, "Either the tuple (" + d2s + ") did not match any Processor or matched more than one." 
)) else: parent_dict.update(Processor.partial_key()) # Obtain Processor matching the attributes <<<<<<<<<< # Query the PartialRetrievalDictionary by attributes parents = glb_idx.get(parent_dict) if len(parents) > 1: issues.append( (3, "The tuple (" + str(parent_dict) + ") matches " + str(len(parents)) + " Processors: " + (", ".join([p.name for p in parents])))) parent = None elif len(parents) == 0: issues.append((3, "The tuple (" + str(parent_dict) + ") did not match any Processor")) parent = None else: parent = parents[0] cached_processors[d2s] = parent # Find child processor child_dict = {attr: codes[i] for attr, i in child_attrs} d2s = str(child_dict) if d2s in cached_processors: child = cached_processors[d2s] if not child: issues.append(( 3, "Either the tuple (" + d2s + ") did not match any Processor or matched more than one." )) else: child_dict.update(Processor.partial_key()) # Obtain Processors matching the attributes # Query the PartialRetrievalDictionary by attributes children = glb_idx.get(child_dict) if len(children) > 1: issues.append( (3, "The tuple (" + str(child_dict) + ") matches " + str(len(parents)) + " Processors: " + (", ".join([p.name for p in children])))) child = None elif len(children) == 0: issues.append((3, "The tuple (" + str(child_dict) + ") did not match any Processor")) child = None else: child = children[0] # type: Processor cached_processors[d2s] = child # Clone child processor (and its descendants) and add an upscale relation between "parent" and the clone if parent and child: if non_zero_weight: # Clone the child processor # TODO cloned_child, cloned_children = child.clone(state=glb_idx) Processor.register([cloned_child] + list(cloned_children), glb_idx) # Create the new Relation Observations # - Part-of Relation o1 = ProcessorsRelationPartOfObservation.create_and_append( parent, cloned_child, oer) # Part-of glb_idx.put(o1.key(), o1) # - Upscale Relation quantity = str(sc_dict["weight"]) if True: # Find Interface named "scaled_factor" for f in parent.factors: if strcmp(f.name, scaled_factor): origin = f break else: origin = None for f in cloned_child.factors: if strcmp(f.name, scaled_factor): destination = f break else: destination = None if origin and destination: o3 = FactorsRelationScaleObservation.create_and_append( origin, destination, observer=None, quantity=quantity) glb_idx.put(o3.key(), o3) else: raise Exception( "Could not find Interfaces to define a Scale relation. Processors: " + parent.name + ", " + cloned_child.name + "; Interface name: " + scaled_factor) else: o3 = ProcessorsRelationUpscaleObservation.create_and_append( parent, cloned_child, observer=None, factor_name=scaled_factor, quantity=quantity) glb_idx.put(o3.key(), o3) else: # TODO parent_dict = str({attr: codes[i] for attr, i in parent_attrs}) child_dict = str({attr: codes[i] for attr, i in child_attrs}) if not parent and child: issues.append(( 2, "Could not find parent Processor matching attributes: " + parent_dict)) elif not child and parent: issues.append( (2, "Could not find child Processor matching attributes: " + child_dict)) else: issues.append(( 2, "Could not find parent Processor matching attributes: " + parent_dict + ", nor child Processor matching attributes: " + child_dict)) return issues, None
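# Sketch of the code-list / attribute matching used above: a code list gathered from the command is
# assigned to the processor-set attribute whose admitted values contain all of its codes. Attribute
# names and codes are hypothetical.
parent_set_attributes = {"region": {"ES", "PT", "FR"}, "sector": {"agriculture", "industry"}}
code_list = {"ES", "PT"}
matched_attr = next((attr for attr, values in parent_set_attributes.items()
                     if set(values).issuperset(code_list)), None)
assert matched_attr == "region"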
def execute(self, state: "State"): """ First bring the data considering the filter Second, group, third aggregate Finally, store the result in State """ issues = [] # Obtain global variables in state glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects( state) # DS Source + DS Name source = self._content["dataset_source"] dataset_name = self._content["dataset_name"] # Result name result_name = self._content["result_name"] if result_name in datasets or state.get(result_name): issues.append((2, "A dataset called '" + result_name + "' is already stored in the registry of datasets")) # Dataset metadata dims, attrs, meas = obtain_dataset_metadata(dataset_name, source) # Obtain filter parameters params = create_dictionary( ) # Native dimension name to list of values the filter will allow to pass joined_dimensions = [] for dim in self._content["where"]: lst = self._content["where"][dim] native_dim = None if dim.lower() in ["startperiod", "endperiod"]: native_dim = dim lst = [lst] elif dim not in dims: # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR for m in mappings: if strcmp(mappings[m].destination, dim) and \ strcmp(mappings[m].source, source) and \ strcmp(mappings[m].dataset, dataset_name) and \ mappings[m].origin in dims: joined_dimensions.append( mappings[m].destination ) # Store dimension in the original case native_dim = mappings[m].origin lst = obtain_reverse_codes(mappings[m].map, lst) break else: # Get the dimension name with the original case native_dim = dims[dim].name if native_dim: if native_dim not in params: f = set() params[native_dim] = f else: f = params[native_dim] f.update(lst) # Convert param contents from set to list for p in params: params[p] = [i for i in params[p]] # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< ds = nexinfosys.data_source_manager.get_dataset_filtered( source, dataset_name, params) df = ds.data # Join with mapped dimensions (augment it) # TODO Prepare an "m" containing ALL the mappings affecting "df" # TODO df2 = augment_dataframe_with_mapped_columns(df, m, ["value"]) # TODO Does it allow adding the new column for the dimension, in case it is requested? 
Probably yes, but test it for m in mappings: if strcmp(mappings[m].source, source) and \ strcmp(mappings[m].dataset, dataset_name) and \ mappings[m].origin in dims: # TODO Change by many-to-many mapping # TODO augment_dataframe_with_mapped_columns(df, maps, measure_columns) # Elaborate a many to one mapping tmp = [] for el in mappings[m].map: for to in el["to"]: if to["d"]: tmp.append([el["o"], to["d"]]) df_dst = pd.DataFrame( tmp, columns=['sou_rce', mappings[m].destination]) for di in df.columns: if strcmp(mappings[m].origin, di): d = di if not nexinfosys.case_sensitive: df[d + "_l"] = df[d].str.lower() d = d + "_l" break df = pd.merge(df, df_dst, how='left', left_on=d, right_on='sou_rce') del df['sou_rce'] if not nexinfosys.case_sensitive: del df[d] # Aggregate (If any dimension has been specified) if len(self._content["group_by"]) > 0: # Column names where data is # HACK: for the case where the measure has been named "obs_value", use "value" values = [ m.lower() if m.lower() != "obs_value" else "value" for m in self._content["measures"] ] out_names = self._content["measures_as"] rows = translate_case(self._content["group_by"], params) # Group by dimension names lcase_rows = [d.lower() for d in rows] # Now joined_dimensions for d in joined_dimensions: if d.lower() in lcase_rows: # Find and replace for i, d2 in enumerate(rows): if strcmp(d, d2): rows[i] = d break aggs = [] # Aggregation functions agg_names = {} for f in self._content["agg_funcs"]: if f.lower() in ["avg", "average"]: aggs.append(np.average) agg_names[np.average] = "avg" elif f.lower() in ["sum"]: aggs.append(np.sum) agg_names[np.sum] = "sum" elif f.lower() in ["count"]: aggs.append(np.size) agg_names[np.size] = "count" elif f.lower() in ["sumna"]: aggs.append(np.nansum) agg_names[np.nansum] = "sumna" elif f.lower() in ["countav"]: # countav=="Count Available" aggs.append("count") # Count number of non-NaN elements agg_names["count"] = "countav" elif f.lower() in ["avgav", "avgna"]: # avgna=="Average without aggs.append(np.nanmean) agg_names[np.nanmean] = "avgna" elif f.lower() in ["pctna"]: # % of NaN vs total elements aggs.append(pctna) agg_names[pctna] = "pctna" # Calculate Pivot Table. The columns are a combination of values x aggregation functions # For instance, if two values ["v2", "v2"] and two agg. functions ["avg", "sum"] are provided # The columns will be: [["average", "v2"], ["average", "v2"], ["sum", "v2"], ["sum", "v2"]] try: # Check that all "rows" on which pivot table aggregates are present in the input "df" # If not either synthesize them (only if there is a single filter value) or remove (if not present df_columns_dict = create_dictionary( data={c: c for c in df.columns}) for r in rows.copy(): if r not in df_columns_dict: found = False for k in params: if strcmp(k, r): found = True if len(params[k]) == 1: df[r] = params[k][0] else: rows.remove(r) issues.append(( 2, "Dimension '" + r + "' removed from the list of dimensions because it is not present in the raw input dataset." )) break if not found: rows.remove(r) issues.append(( 2, "Dimension '" + r + "' removed from the list of dimensions because it is not present in the raw input dataset." 
)) # Put proper DIMENSION names for ir, r in enumerate(rows): if r in df_columns_dict: rows[ir] = df_columns_dict[r] # Create and register Hierarchy objects from origin Dataset dimensions: state, ds ds_columns_dict = create_dictionary( data={c.code: c.code for c in ds.dimensions}) for r in rows: if r in ds_columns_dict: # Create hierarchy local to the dataset for d in ds.dimensions: if strcmp(r, d.code): if d.code_list: h = convert_code_list_to_hierarchy( d.code_list) h.name = result_name + "_" + r glb_idx.put(h.key(), h) break # Pivot table using Group by # if True: groups = df.groupby(by=rows, as_index=False) # Split d = OrderedDict([]) lst_names = [] if len(values) == len(aggs): for i, t in enumerate(zip(values, aggs)): v, agg = t if len(out_names) == len(values): if out_names[i]: lst_names.append(out_names[i]) else: lst_names.append(agg_names[agg] + "_" + v) else: lst_names.append(agg_names[agg] + "_" + v) lst = d.get(v, []) lst.append(agg) d[v] = lst else: for v in values: lst = d.get(v, []) for agg in aggs: lst.append(agg) lst_names.append(agg_names[agg] + "_" + v) d[v] = lst # Print NaN values for each value column for v in set(values): cnt = df[v].isnull().sum() print("NA count for col '" + v + "': " + str(cnt) + " of " + str(df.shape[0])) # AGGREGATE !! df2 = groups.agg(d) # Rename the aggregated columns df2.columns = rows + lst_names # else: # # Pivot table # df2 = pd.pivot_table(df, # values=values, # index=rows, # aggfunc=[aggs[0]], fill_value=np.NaN, margins=False, # dropna=True) # # Remove the multiindex in columns # df2.columns = [col[-1] for col in df2.columns.values] # # Remove the index # df2.reset_index(inplace=True) # The result, all columns (no index), is stored for later use ds.data = df2 except Exception as e: issues.append( (3, "There was a problem with the grouping: " + repr(e))) # Store the dataset in State datasets[result_name] = ds return issues, None
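# Minimal sketch of the group-by/aggregate step above, on hypothetical data: group by the requested
# dimensions, aggregate each measure with the requested functions, then flatten the resulting column
# names the way the command does (the command itself uses as_index=False and numpy aggregation functions).
import pandas as pd

df_q = pd.DataFrame({"GEO": ["ES", "ES", "PT"], "value": [1.0, 3.0, 5.0]})
df2 = df_q.groupby(by=["GEO"]).agg({"value": ["mean", "sum"]})
df2.columns = ["avg_value", "sum_value"]   # flatten the (measure, function) MultiIndex
df2 = df2.reset_index()                    # turn the grouping dimension back into a column
assert df2.loc[df2["GEO"] == "ES", "avg_value"].iloc[0] == 2.0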