def check_expandable(v, location):
    """
    Check if curly braces match, that what is inside is syntactically correct, (and that the value exists)

    :param v: String potentially containing one or more "{...}" macro expansions
    :param location: IssueLocation attached to any Issue generated here
    :return: Set of the hierarchical names found inside curly braces
    """
    import re
    reg = re.compile(r"{.*?}")  # Non-greedy, so each "{...}" group matches separately
    matches = reg.findall(v)
    output = set()
    if len(matches) == 0:
        # NOTE(review): "issues" is not defined in this function nor received as a
        # parameter -- presumably a module-level list; confirm, otherwise this is a NameError
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=f"Incorrect syntax, no macro expansion found",
                location=location))
    else:
        for m in matches:
            h_name = m[1:-1]  # Strip the surrounding curly braces
            try:
                # Validate the brace contents as an arithmetic-boolean expression
                parser_field_parsers.string_to_ast(
                    arith_boolean_expression,
                    h_name)  # simple_h_name
                output.add(h_name)
            except:
                issues.append(
                    Issue(
                        itype=IType.ERROR,
                        description=
                        f"The value {m[1:-1]} is not a valid hierarchical name",
                        location=location))
    return output
def check_parameter_value(glb_idx, p, value, issues, sheet_name, row):
    """
    Check that "value" is admissible for parameter "p".

    If "p.range" parses as a numeric interval, "value" is evaluated as an arithmetic
    expression and checked against the interval bounds. If "p.range" does not parse
    as an interval, it is interpreted as the name of a Hierarchy and "value" must be
    one of the hierarchy's codes.

    :param glb_idx: Global registry, used to look up hierarchies by name
    :param p: Parameter object (provides .name and .range)
    :param value: The value (literal or expression) to validate
    :param issues: List where Issue objects are appended on failure
    :param sheet_name: Worksheet name, for issue locations
    :param row: Row number, for issue locations
    :return: True if the value is valid (or depends on unresolved parameters), False otherwise
    """
    retval = True
    if p.range:
        try:  # Try "numeric interval"
            ast = string_to_ast(number_interval, p.range)
            # try Convert value to float
            ast2 = string_to_ast(expression_with_parameters, value)
            evaluation_issues: List[Tuple[int, str]] = []
            s = State()
            value, unresolved_vars = ast_evaluator(exp=ast2, state=s, obj=None, issue_lst=evaluation_issues)
            if value is not None:
                try:
                    value = float(value)
                    # Interval delimiters decide open/closed bounds: "[" and "]" include the endpoint
                    left = ast["left"]
                    right = ast["right"]
                    left_number = ast["number_left"]
                    right_number = ast["number_right"]
                    if left == "[":
                        value_meets_left = value >= left_number
                    else:
                        value_meets_left = value > left_number
                    if right == "]":
                        value_meets_right = value <= right_number
                    else:
                        value_meets_right = value < right_number
                    if not value_meets_left or not value_meets_right:
                        issues.append(Issue(itype=IType.ERROR,
                                            description=f"The value {value} specified for the parameter '{p.name}' is out of the range {p.range}",
                                            location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                        retval = False
                except:
                    # float() failed: the evaluated value is not numeric
                    issues.append(Issue(itype=IType.ERROR,
                                        description=f"The parameter '{p.name}' has a non numeric value '{value}', and has been constrained with a numeric range. Please, either change the Value or the Range",
                                        location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                    retval = False
            else:
                pass  # The parameter depends on other parameters, a valid situation
        except:  # A hierarchy name
            # "p.range" did not parse as a numeric interval: interpret it as a hierarchy name
            h = glb_idx.get(Hierarchy.partial_key(p.range))
            h = h[0]  # NOTE(review): assumes at least one hierarchy matches -- confirm against callers
            if value not in h.codes.keys():
                issues.append(Issue(itype=IType.ERROR,
                                    description=f"The value '{value}' specified for the parameter '{p.name}' is not in the codes of the hierarchy '{p.range}': {', '.join(h.codes.keys())}",
                                    location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                retval = False
    return retval
def get_source(self, reference_name, subrow) -> Any:
    """
    Resolve a reference name to a Provenance or Bibliographic reference object.

    Provenance references are tried first, then bibliographic ones. On any
    failure (parse error, no unique match) the literal name, marked as
    "(not found)", is returned instead.

    :param reference_name: Name of the reference to resolve (may be empty/None)
    :param subrow: Sub-row information, used to enrich error messages
    :return: The reference object, the marked name, or None if no name was given
    """
    if not reference_name:
        return None
    try:
        parsed = parser_field_parsers.string_to_ast(
            parser_field_parsers.reference, reference_name)
        ref_id = parsed["ref_id"]
        # Try each reference kind in order; accept only a unique match
        for make_key in (ProvenanceReference.partial_key,
                         BibliographicReference.partial_key):
            candidates = self._glb_idx.get(make_key(ref_id))
            if len(candidates) == 1:
                return candidates[0]
        raise CommandExecutionError(
            f"Reference '{reference_name}' not found" +
            subrow_issue_message(subrow))
    except:
        # TODO Change when Ref* are implemented
        return reference_name + " (not found)"
def _get_scale_value(self, scale: str):
    """
    Convert a scale specification into a number.

    "scale" is first tried as a plain float literal; failing that, it is parsed
    and evaluated as an arithmetic expression (which may refer to parameters).

    :param scale: Numeric literal or arithmetic expression
    :return: The numeric value of the scale
    :raises CommandExecutionError: if evaluation produced issues, left variables
            unresolved, or yielded a falsy result
    """
    try:
        value = float(scale)
    except ValueError:
        # Not a literal number: parse and evaluate as an expression
        ast = string_to_ast(expression_with_parameters, scale)
        evaluation_issues: List[Tuple[int, str]] = []
        s = State()
        value, unresolved_vars = ast_evaluator(exp=ast, state=s, obj=None, issue_lst=evaluation_issues)
        if len(evaluation_issues) > 0:
            evaluation_issues_str = [i[1] for i in evaluation_issues]
            raise CommandExecutionError(
                f"Problems evaluating scale expression '{scale}': "
                f"{', '.join(evaluation_issues_str)}")
        elif len(unresolved_vars) > 0:
            raise CommandExecutionError(
                f"Unresolved variables evaluating the scale expression '{scale}':"
                f" {', '.join(unresolved_vars)}")
        elif not value:
            # NOTE(review): "not value" also rejects a legitimate result of 0 / 0.0;
            # confirm whether only None (failed evaluation) should be rejected here
            raise CommandExecutionError(
                f"The scale expression '{scale}' could not be evaluated.")
    return value
def dictionary_from_key_value_list(kvl, state: State = None):
    """
    From a string containing a list of keys and values, return a dictionary

    Keys must be literals, values can be expressions, to be evaluated at a later moment
    (syntactic validity of expressions is not checked here)

    :param kvl: String containing the list of keys and values, e.g. "a=1, b='text', c=p1*2"
    :param state: Evaluation State used to resolve expression values (may be None)
    :except If syntactic problems occur
    :return: A dictionary
    """
    pairs = kvl.split(",")
    d = create_dictionary()
    for p in pairs:
        # BUG FIX: the previous "k, v = p.split('=', maxsplit=1)" raised an
        # unhandled ValueError when a pair contained no "=". Using partition()
        # lets the intended, descriptive Exception below be raised instead.
        k, sep, v = p.partition("=")
        if not sep or not k:
            raise Exception(
                "Each key-value pair must be separated by '=' and key has to be defined, value can be empty: "
                + kvl)
        else:
            try:
                k = k.strip()
                v = v.strip()
                string_to_ast(simple_ident, k)  # The key must be a simple identifier
                try:
                    # Simplest case: a quoted string literal; strip the quotes
                    string_to_ast(quotedString, v)
                    v = v[1:-1]
                except:
                    # Otherwise: an expression; evaluate now only if fully resolvable
                    issues = []
                    ast = string_to_ast(expression_with_parameters, v)
                    res, unres = ast_evaluator(ast, state, None, issues)
                    if len(unres) == 0:
                        v = res
                d[k] = v
            except:
                raise Exception("Key must be a string: " + k +
                                " in key-value list: " + kvl)
    return d
def parse_line(item, fields):
    """
    Convert the fields of a line into their AST form.

    Keys starting with "_" are internal bookkeeping and are skipped. Parsing is
    guaranteed to succeed because a first pass dedicated to parsing already ran.

    :param item: Mapping of field name to raw cell value
    :param fields: Mapping of field name to field descriptor (providing .parser)
    :return: Mapping of field name to parsed AST
    """
    return {
        field_name: parser_field_parsers.string_to_ast(fields[field_name].parser, raw_value)
        for field_name, raw_value in item.items()
        if not field_name.startswith("_")
    }
def get_location(self, reference_name, subrow) -> Any:
    """
    Resolve a reference name to a GeographicReference object.

    On any failure (parse error, no unique match) the literal name itself is
    returned, so callers always get something usable.

    :param reference_name: Name of the geographic reference (may be empty/None)
    :param subrow: Sub-row information, used to enrich error messages
    :return: The reference object, the raw name, or None if no name was given
    """
    if not reference_name:
        return None
    try:
        # TODO Change to parser for Location (includes references, but also Codes)
        parsed = parser_field_parsers.string_to_ast(
            parser_field_parsers.reference, reference_name)
        matches = self._glb_idx.get(
            GeographicReference.partial_key(parsed["ref_id"]))
        if len(matches) != 1:
            raise CommandExecutionError(
                f"Reference '{reference_name}' not found" +
                subrow_issue_message(subrow))
        return matches[0]
    except:
        # Fall back to the literal name on any failure
        return reference_name
def parse_hierarchy_command(sh: Worksheet, area: AreaTupleType, name: str, n_type: str) -> IssuesLabelContentTripleType:
    """
    Analyze a "hierarchy" command expressed in a worksheet of a spreadsheet

    The resulting JSON will be:
    {
    "name": <hierarchy name>,
    "type": ("Category", "FactorType", "Processor"),
      (this determines if the hierarchy is "is-a" -categories or factor types- or "part-of" -processors-)
    "h": [{"name": ..., "description": ..., "expression": ..., children: []},
         ]
    }
    In a hierarchy only simple names (not hierarchic) are allowed. The full name is determined by its position in the tree
    At execution time, if the elements already exist, their location in the hierarchy is updated (and the description, if present, is added)

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the command is present
    :param name: Name of the hierarchy
    :param n_type: Type of hierarchy node: "C" (Category), "I" (InterfaceType) or "P" (Processor)
    :return: list of issues [(issue_type, message)], command label, command content
    """
    some_error = False  # NOTE(review): never updated nor read afterwards -- apparently vestigial
    issues = []
    # Maps the accepted header aliases to the canonical column role
    col_names = {
        ("expression", "formula"): "expression",
        ("code", "name"): "code",
        ("description", ): "description"
    }
    # Scan columns to prepare:
    # * "expression_column". The column that can contain an expression (it is Optional)
    # * "levels". List of Levels, formed by pairs "code, description", where "description" is optional
    expression_column = None
    levels = []
    for c in range(area[2], area[3]):  # Scan all columns
        col_name = sh.cell(row=area[0], column=c).value
        if not col_name:
            continue
        for k in col_names:
            col_name = col_name.lower()
            if col_name in k:
                if col_name == "expression":
                    expression_column = c
                elif col_name == "code":
                    # A new Level, for now only the code column is known
                    levels.append(tuple([c]))
                elif col_name == "description":
                    # Description if there is an active CODE. If the description for the active CODE was
                    # already satified, replace it...
                    if len(levels) > 0:
                        tmp = levels[-1]
                        levels[-1] = (tmp[0], c)  # Code, Description
                break
    # Now, scan rows.
    # Only one Level can be active at a time.
    # Current level starts in zero, and is updated in each row.
    # Level can increase by one with regard to the previous level, or freely decrease
    nodes = {}  # Store nodes to check expressions later. Row number is key of the dictionary, the node is the value
    nodes_stack = []  # One list per open level; the top of the stack collects siblings of the current level
    current_level = -1
    for r in range(area[0] + 1, area[1]):
        # Find in which Level (code column) this row has a value
        found = False
        for level, t in enumerate(levels):
            code_column = t[0]
            value = sh.cell(row=r, column=code_column).value
            if value:
                found = True
                break
        if found:
            # Value syntax. A simple identity name
            try:
                parser_field_parsers.string_to_ast(
                    parser_field_parsers.simple_ident, value)
            except:
                issues.append(
                    (3, "The name of the category must be a simple name. Row " + str(r)))
            # Description
            if len(t) > 1:
                description_column = t[1]
                description = sh.cell(row=r, column=description_column).value
            else:
                description = None
            # Expression
            if expression_column:
                expression = sh.cell(row=r, column=expression_column).value
            else:
                expression = None
            # Create the hierarchy node; empty optional fields are removed
            n = dict(code=value, description=description, expression=expression, children=[])
            if not n["expression"]:
                del n["expression"]
            if not n["description"]:
                del n["description"]
            # Store the node
            nodes[r] = n
            # Process hierarchical information
            add_node = True
            if level == current_level + 1:
                # Descend one level: new (empty) list of siblings
                nodes_stack.append([])
                current_level = level
            elif level <= current_level:
                # Ascend: close the deeper levels, attaching each as children of its parent
                while current_level > level:
                    lst = nodes_stack.pop()  # Take and remove last element of the stack
                    current_level -= 1
                    if current_level >= 0:
                        # From the current level, children of the last node of the list are defined in "lst"
                        nodes_stack[current_level][-1]["children"] = lst
            else:
                # Jumping down more than one level at once is not allowed
                issues.append((
                    3,
                    "Hierarchical level must increase by one, not more. Previous level was "
                    + str(current_level) + ", current is " + str(level) + ". Row " + str(r)))
                add_node = False
            # Append the new node to the current level
            if add_node:
                nodes_stack[current_level].append(n)
    # Close: attach the still-open deeper levels to their parents
    while current_level > 0:
        lst = nodes_stack.pop()  # Take and remove last element of the stack
        current_level -= 1
        if current_level >= 0:
            # From the current level, children of the last node of the list are defined in "lst"
            nodes_stack[current_level][-1]["children"] = lst
    # Check that expressions are correct and that they refer to existing codes
    # TODO Check that expressions are not circular
    codes = set([n["code"].lower() for n in nodes.values()])
    for r, n in nodes.items():
        code = n["code"]
        if "expression" in n:
            expression = n["expression"]
            ast = parser_field_parsers.string_to_ast(
                parser_field_parsers.hierarchy_expression, expression)
            for p in ast["terms"]:
                if isinstance(p, str):
                    if p.lower() not in codes:
                        issues.append(
                            (3, "The code '" + p + "' in the expression '" +
                             expression + "' (declaration of code '" + code +
                             "') was not defined. Row: " + str(r)))
    content = {"name": name, "type": n_type, "h": nodes_stack[0]}
    return issues, None, content
def construct_flow_graph_2(state: State, query: IQueryObjects, filt: Union[str, dict], format: str = "visjs"):
    """
    Prepare a graph from which conclusions about factors can be extracted

    Example:
    1) Obtain "s", the serialized state from Redis or from a test file
    2) state = deserialize_state(s)
    3) query = BasicQuery(state)  # Create a Query and execute a query
    4) construct_solve_graph(state, query, None)

    :param state: State
    :param query: A IQueryObjects instance (which has been already injected the state)
    :param filt: A filter to be passed to the query instance
    :param format: VisJS, GML, ...
    :return: The graph serialized in the requested format (dict for "visjs", bytes for "gml")
    """
    include_processors = False  # For completeness (not clarity...), include processors nodes, as a way to visualize grouped factors
    will_write = True  # For debugging purposes, affects how the properties attached to nodes and edges are elaborated
    expand_factors_graph = False  # Expand transformation between FactorTypes into instances of Factors
    # Format for different node types
    stated_factor_no_observation = dict(graphics={'fill': "#999900"})  # Golden
    stated_factor_some_observation = dict(graphics={'fill': "#ffff00"})  # Yellow
    qq_attached_to_factor = dict(graphics={
        'fill': "#eeee00",
        "type": "ellipse"
    })  # Less bright Yellow
    non_stated_factor = dict(graphics={'fill': "#999999"})
    a_processor = dict(graphics={"type": "hexagon", "color": "#aa2211"})
    # Format for different edge types
    edge_from_factor_type = dict(graphics={
        "fill": "#ff0000",
        "width": 1,
        "targetArrow": "standard"
    })
    edge_processor_to_factor = dict(graphics={
        "fill": "#ff00ff",
        "width": 3,
        "targetArrow": "standard"
    })
    edge_factors_flow = dict(graphics={
        "fill": "#000000",
        "width": 5,
        "targetArrow": "standard"
    })
    edge_factors_scale = dict(graphics={
        "fill": "#333333",
        "width": 3,
        "targetArrow": "standard"
    })
    edge_factors_relative_to = dict(graphics={
        "fill": "#00ffff",
        "width": 3,
        "targetArrow": "standard"
    })
    edge_factor_value = dict(graphics={
        "fill": "#aaaaaa",
        "width": 1,
        "targetArrow": "standard"
    })
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)
    # Obtain the information needed to elaborate the graph
    objs = query.execute([
        Processor, Factor, FactorType,
        FactorTypesRelationUnidirectionalLinearTransformObservation,
        FactorsRelationScaleObservation,
        FactorsRelationDirectedFlowObservation
    ], filt)
    # 1) Graphical Representation: BOX -- BOX
    #
    # 2) Internal (not for end-users), pseudo-code:
    #
    # Processor1 <- Factor1 -> FactorType0
    # Processor2 <- Factor2 -> FactorType0
    # Processor3 <- Factor3 -> FactorType1
    # Processor3 <- Factor4 -> FactorType0
    # Factor1 <- FactorsRelationDirectedFlowObservation(0.4) -> Factor2
    # Factor1 <- FactorsRelationDirectedFlowObservation(0.6) -> Factor4
    # Factor1 <- FactorQuantitativeObservation(5.3 m²)
    # FactorType0 -> FactorTypesRelationUnidirectionalLinearTransformObservation(ctx) -> FactorType1
    # Factor4 -> w1 -> Factor3
    # Factor5 -> w2 -> Factor3
    #
    # Index quantitative observations.
    # Also, mark Factors having QQs (later this will serve to color differently these nodes)
    qqs = {}
    qq_cont = 0
    factors_with_some_observation = set()
    for o in find_quantitative_observations(glb_idx):
        # Index quantitative observations.
        if "relative_to" in o.attributes and o.attributes["relative_to"]:
            continue  # Do not index intensive quantities, because they are translated as edges in the graph
        if o.factor in qqs:
            lst = qqs[o.factor]
        else:
            lst = []
            qqs[o.factor] = lst
        lst.append(o)
        # Mark Factors having QQs (later this will serve to color differently these nodes)
        factors_with_some_observation.add(o.factor)

    # ---- MAIN GRAPH: Factors and relations between them --------------------------------------------------------------
    the_node_names_set = set()
    # -- Nodes: "Factor"s passing the filter, and QQs associated to some of the Factors
    n = []
    e = []
    f_types = {}  # Contains a list of Factors for each FactorType
    p_factors = {}  # Contains a list of Factors per Processor7
    rel_to_observations = set(
    )  # Set of FactorObservation having "relative_to" property defined
    factors = create_dictionary()  # Factor_ID -> Factor
    for f in objs[Factor]:
        f_id = get_factor_id(f, prd=glb_idx)
        factors[f_id] = f  # Dictionary Factor_ID -> Factor
        # f_types
        if f.taxon in f_types:
            lst = f_types[f.taxon]
        else:
            lst = []
            f_types[f.taxon] = lst
        lst.append(f)
        # p_factors
        if f.processor in p_factors:
            lst = p_factors[f.processor]
        else:
            lst = []
            p_factors[f.processor] = lst
        lst.append(f)
        # Add Node to graph
        the_node_names_set.add(f_id)
        if will_write:
            # Color depends on whether the Factor has some quantitative observation
            n.append((f_id, stated_factor_some_observation
                      if f in factors_with_some_observation else
                      stated_factor_no_observation))
            if f in qqs:
                for qq in qqs[f]:
                    if not ("relative_to" in qq.attributes
                            and qq.attributes["relative_to"]):
                        # value = str(qq.value)  # str(qq_cont) + ": " + str(qq.value)
                        value_node_name = f_id + " " + str(qq.value)
                        n.append((value_node_name, qq_attached_to_factor))
                        e.append((value_node_name, f_id, {
                            "w": "",
                            "label": "",
                            **edge_factor_value
                        }))
                        qq_cont += 1
                    else:
                        rel_to_observations.add(qq)
        else:
            # NOTE(review): "qqs2" holds Factor keys but is tested against "f_id" (a string)
            # below -- this branch (will_write == False) looks broken; confirm before enabling
            qqs2 = [
                qq for qq in qqs
                if not ("relative_to" in qq.attributes
                        and qq.attributes["relative_to"])
            ]
            d = dict(factor=factor_to_dict(f),
                     observations=qqs[f_id] if f_id in qqs2 else [])
            n.append((f_id, d))
    # -- Edges
    # "Relative to" relation (internal to the Processor) -> Intensive to Extensive
    for o in rel_to_observations:
        if "relative_to" in o.attributes and o.attributes["relative_to"]:
            # Parse "defining_factor", it can be composed of the factor name AND the unit
            defining_factor = o.attributes["relative_to"]
            ast = parser_field_parsers.string_to_ast(
                parser_field_parsers.factor_unit, defining_factor)
            factor_type = ast_to_string(ast["factor"])
            unit_name = ast["unparsed_unit"]
            ureg(unit_name)  # Validates the unit name against the unit registry
            f_id = get_factor_id(o.factor, prd=glb_idx)
            # Check that "f_id" exists in the nodes list (using "factors")
            factors[f_id]
            # If "defining_factor" exists in the processor, ok. If not, create it.
            # Find factor_type in the processor
            factor_name = get_processor_id(
                o.factor.processor) + ":" + factor_type
            factors[factor_name]
            e.append((factor_name, f_id, {
                "w": o.value.expression,
                "label": o.value.expression,
                **edge_factors_relative_to
            }))
    # Directed Flows between Factors
    for df in objs[FactorsRelationDirectedFlowObservation]:
        sf = get_factor_id(df.source_factor, prd=glb_idx)
        tf = get_factor_id(df.target_factor, prd=glb_idx)
        # Check that both "sf" and "tf" exist in the nodes list (using "factors")
        factors[sf]
        factors[tf]
        weight = df.weight if df.weight else "1"
        e.append((sf, tf, {"w": weight, "label": weight, **edge_factors_flow}))
    # Scale Flows between Factors
    for df in objs[FactorsRelationScaleObservation]:
        sf = get_factor_id(df.origin, prd=glb_idx)
        tf = get_factor_id(df.destination, prd=glb_idx)
        # Check that both "sf" and "tf" exist in the nodes list (using "factors")
        factors[sf]
        factors[tf]
        weight = str(df.quantity) if df.quantity else "1"
        e.append((sf, tf, {
            "w": weight,
            "label": weight,
            **edge_factors_scale
        }))
    # TODO Consider Upscale relations
    # e.append((..., ..., {"w": upscale_weight, "label": upscale_weight, **edge_factors_upscale}))
    # -- Create the graph
    factors_graph = nx.DiGraph()
    factors_graph.add_nodes_from(n)
    factors_graph.add_edges_from(e)
    # nx.write_gml(factors_graph, "/home/rnebot/IntermediateGraph.gml")

    # ---- AUXILIARY GRAPH: FACTOR TYPES AND THEIR INTERRELATIONS ----
    n = []
    e = []
    # -- Nodes: "FactorType"s passing the filter
    for ft in objs[FactorType]:
        n.append((get_factor_type_id(ft), dict(factor_type=ft)))
    # -- Edges
    # Hierarchy and expressions stated in the hierarchy
    ft_in = {
    }  # Because FactorTypes cannot be both in hierarchy AND expression, marks if it has been specified one was, to raise an error if it is specified also the other way
    for ft in objs[FactorType]:
        ft_id = get_factor_type_id(ft)
        if ft.expression:
            if ft not in ft_in:
                # TODO Create one or more relations, from other FactorTypes (same Hierarchy) to this one
                # TODO The expression can only be a sum of FactorTypes (same Hierarchy)
                ft_in[ft] = "expression"
                # TODO Check that both "ft-id" and "..." exist in the nodes list (keep a temporary set)
                # weight = ...
                # e.append((ft_id, ..., {"w": weight, "label": weight, "origin": ft, "destination": ...}))
        if ft.parent:
            if ft.parent not in ft_in or (ft.parent in ft_in
                                          and ft_in[ft.parent] == "hierarchy"):
                # Create an edge from this FactorType
                ft_in[ft.parent] = "hierarchy"
                parent_ft_id = get_factor_type_id(ft.parent)
                # TODO Check that both "ft-id" and "parent_ft_id" exist in the nodes list (keep a temporary set)
                # Add the edge
                e.append((ft_id, parent_ft_id, {
                    "w": "1",
                    "origin": ft,
                    "destination": ft.parent
                }))
            else:
                raise Exception(
                    "The FactorType '" + ft_id +
                    "' has been specified by an expression, it cannot be parent."
                )
    # Linear transformations
    for f_rel in objs[
            FactorTypesRelationUnidirectionalLinearTransformObservation]:
        origin = get_factor_type_id(f_rel.origin)
        destination = get_factor_type_id(f_rel.destination)
        e.append((origin, destination, {
            "w": f_rel.weight,
            "label": f_rel.weight,
            "origin": f_rel.origin,
            "destination": f_rel.destination
        }))
    # ---- Create FACTOR TYPES graph ----
    factor_types_graph = nx.DiGraph()
    factor_types_graph.add_nodes_from(n)
    factor_types_graph.add_edges_from(e)

    # ---- EXPAND "FACTORS GRAPH" with "FACTOR TYPE" RELATIONS ----
    sg_list = []  # List of modified (augmented) subgraphs
    if expand_factors_graph:
        # The idea is: clone a FactorTypes subgraph if a Factor instances some of its member nodes
        # This cloning process can imply creating NEW Factors
        the_new_node_names_set = set()
        # Obtain weak components of the main graph. Each can be considered separately
        # for sg in nx.weakly_connected_component_subgraphs(factors_graph):  # For each subgraph
        #     print("--------------------------------")
        #     for n in sg.nodes():
        #         print(n)
        # ---- Weakly connected components of "factor_types_graph" ----
        factor_types_subgraphs = list(
            nx.weakly_connected_component_subgraphs(factor_types_graph))
        for sg in nx.weakly_connected_component_subgraphs(
                factors_graph):  # For each subgraph
            sg_list.append(sg)
            # Consider each Factor of the subgraph
            unprocessed_factors = set(sg.nodes())
            while unprocessed_factors:  # For each UNPROCESSED Factor
                tmp = unprocessed_factors.pop(
                )  # Get next unprocessed "factor name"
                if tmp not in factors:  # QQ Observations are in the graph and not in "factors". The same with Processors
                    continue
                f_ = factors[tmp]  # Obtain Factor from "factor name"
                ft_id = get_factor_type_id(
                    f_)  # Obtain FactorType name from Factor
                # Iterate through FactorTypes and check if the Factor appears
                for sg2 in factor_types_subgraphs:  # Each FactorTypes subgraph
                    if ft_id in sg2:  # If the current Factor is in the subgraph
                        if len(
                                sg2.nodes()
                        ) > 1:  # If the FactorType subgraph has at least two nodes
                            # CLONE FACTOR TYPES SUBGRAPH
                            # Nodes. Create if not present already
                            n = []
                            e = []
                            for n2, attrs in sg2.nodes().items(
                            ):  # Each node in the FactorTypes subgraph
                                ft_ = attrs["factor_type"]
                                f_id = get_factor_id(f_.processor,
                                                     ft_,
                                                     prd=glb_idx)
                                if f_id not in sg:  # If the FactorType is not
                                    # Create Factor, from processor and ft_ -> f_new
                                    _, _, f_new = find_or_create_observable(
                                        state, name=f_id, source="solver")
                                    factors[f_id] = f_new
                                    if f_id not in the_node_names_set:
                                        if will_write:
                                            n.append((f_id, non_stated_factor))
                                        else:
                                            d = dict(
                                                factor=factor_to_dict(f_new),
                                                observations=[])
                                            n.append((f_id, d))
                                    if f_id not in the_node_names_set:
                                        the_new_node_names_set.add(f_id)
                                        the_node_names_set.add(f_id)
                                else:
                                    unprocessed_factors.discard(f_id)
                            # Edges. Create relations between factors
                            for r2, w_ in sg2.edges().items():
                                # Find origin and destination nodes. Copy weight. Adapt weight? If it refers to a FactorType, instance it?
                                origin = get_factor_id(f_.processor,
                                                       w_["origin"],
                                                       prd=glb_idx)
                                destination = get_factor_id(f_.processor,
                                                            w_["destination"],
                                                            prd=glb_idx)
                                if origin in the_new_node_names_set or destination in the_new_node_names_set:
                                    graphics = edge_from_factor_type
                                else:
                                    graphics = {}
                                e.append((origin, destination, {
                                    "w": w_["w"],
                                    "label": w_["w"],
                                    **graphics
                                }))
                            sg.add_nodes_from(n)
                            sg.add_edges_from(e)
                        break
        # for sg in sg_list:
        #     print("--------------------------------")
        #     for n in sg.nodes():
        #         print(n)
        # Recompose the original graph
        if sg_list:
            factors_graph = nx.compose_all(sg_list)
        else:
            pass  ##factors_graph = nx.DiGraph()
    # ----
    # Add "Processor"s just as a way to visualize grouping of factors (they do not influence relations between factors)
    # -
    if include_processors:
        n = []
        e = []
        for p in objs[Processor]:
            p_id = get_processor_id(p)
            if will_write:
                n.append((p_id, a_processor))
            else:
                n.append((p_id, processor_to_dict(p)))
            # Edges between Processors and Factors
            for f in p_factors[p]:
                f_id = get_factor_id(f, prd=glb_idx)
                e.append((p_id, f_id, edge_processor_to_factor))
        factors_graph.add_nodes_from(n)
        factors_graph.add_edges_from(e)
    #
    # for ft in objs[FactorType]:
    #     if ft.parent:
    #         # Check which Factors are instances of this FactorType
    #         if ft in f_types:
    #             for f in f_types[ft]:
    #                 # Check if the processor contains the parent Factor
    #                 processor_factors = p_factors[f.processor]
    #                 if ft.parent not in processor_factors:
    #                     factor_data = (f.processor, ft)
    #                 else:
    #                     factor_data = None
    #                 create_factor = f in qqs  # If there is some Observation
    #                 create_factor = True  # Force creation
    #
    #         # Consider the creation of a relation
    #         # Consider also the creation of a new Factor (a new Node for now): if the child has some observation for sure (maybe a child of the child had an observation, so it is the same)
    #         ft_id =
    #         ft_id =
    # Plot graph to file
    # import matplotlib.pyplot as plt
    # ax = plt.subplot(111)
    # ax.set_title('Soslaires Graph', fontsize=10)
    # nx.draw(factors_graph, with_labels=True)
    # plt.savefig("/home/rnebot/Graph.png", format="PNG")
    # GML File
    # nx.write_gml(factors_graph, "/home/rnebot/Graph.gml")
    ret = None
    if format == "visjs":
        # Assign IDs to nodes. Change edges "from" and "to" accordingly
        ids_map = create_dictionary()
        id_count = 0
        for node in factors_graph.nodes(data=True):
            sid = str(id_count)
            node[1]["id"] = sid
            ids_map[node[0]] = sid
            id_count += 1
        vis_nodes = []
        vis_edges = []
        for node in factors_graph.nodes(data=True):
            d = dict(id=node[1]["id"], label=node[0])
            if "shape" in node[1]:
                # circle, ellipse, database, box, diamond, dot, square, triangle, triangleDown, text, star
                d["shape"] = node[1]["shape"]
            else:
                d["shape"] = "box"
            if "color" in node[1]:
                d["color"] = node[1]["color"]
            vis_nodes.append(d)
        for edge in factors_graph.edges(data=True):
            f = ids_map[edge[0]]
            t = ids_map[edge[1]]
            d = {"from": f, "to": t, "arrows": "to"}
            data = edge[2]
            if "w" in data:
                d["label"] = data["w"]
                d["font"] = {"align": "horizontal"}
            vis_edges.append(d)
        ret = {"nodes": vis_nodes, "edges": vis_edges}
    elif format == "gml":
        ret1 = io.BytesIO()
        nx.write_gml(factors_graph, ret1)
        ret = ret1.getvalue()
        ret1.close()
    return ret
    # NOTE(review): everything below is UNREACHABLE (after the return). It builds a
    # "legend" graph, and references "edge_factors_upscale", which is not defined
    # anywhere in this function -- it would raise NameError if ever executed.
    # #########################################################################3
    # GEXF File
    # nx.write_gexf(factors_graph, "/home/rnebot/Graph.gexf")
    # Legend graph
    n = []
    e = []
    n.append(("Factor with Observation", stated_factor_some_observation))
    n.append(("Factor with No Observation", stated_factor_no_observation))
    if include_processors:
        n.append(("Processor", a_processor))
    n.append(("Factor from FactorType", non_stated_factor))
    n.append(("QQ Observation", qq_attached_to_factor))
    n.append(("QQ Intensive Observation", qq_attached_to_factor))
    e.append(("A Factor", "Another Factor", {
        "label": "Flow between Factors, attaching the weight",
        **edge_factors_flow
    }))
    e.append(("Factor #1", "Factor #2", {
        "label": "Relation from a FactorType",
        **edge_from_factor_type
    }))
    if include_processors:
        e.append(("Processor", "A Factor", {
            "label": "Link from Processor to Factor",
            **edge_processor_to_factor
        }))
    e.append(("A Factor", "Same Factor in another processor", {
        "label": "Upscale a Factor in two processors",
        **edge_factors_upscale
    }))
    e.append(("Factor with Observation", "QQ Intensive Observation", {
        "label":
        "Observation proportional to extensive value of factor same processor",
        **edge_factors_relative_to
    }))
    e.append(("QQ Observation", "A Factor", {
        "label": "A QQ Observation",
        **edge_factor_value
    }))
    factors_graph = nx.DiGraph()
    factors_graph.add_nodes_from(n)
    factors_graph.add_edges_from(e)
def parse_pedigree_matrix_command(sh: Worksheet, area: AreaTupleType, name: str) -> IssuesLabelContentTripleType:
    """
    A pedigree matrix is formed by several columns, with a header naming a phase, and below a list of modes,
    normally in ascending qualitative order.
    Modes can be referred later by the order number specified in the "Code" column (mandatory).
    The order of the columns serves also to sequence the codes of the matrix, from left to right.
    Columns can be accompanied by a description column, to the right

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param name: Name of the Pedigree Matrix
    :return: list of issues (issue_type, message), command label, command content
    """
    issues = []
    # Analyze columns
    phases = []  # A phase per column
    codes = None  # Column with codes
    max_len = 0  # Column with max length
    for c in range(area[2], area[3]):
        phase_modes = []  # phase_modes[0] is the header (the phase name); the rest are modes
        current_phase = None
        for r in range(area[0], area[1]):
            value = sh.cell(row=r, column=c).value
            # First row has to be defined. If not, skip to the next column
            if r == area[0] and not value:
                break
            if value is None:
                continue
            if r == area[0]:
                current_phase = value
            try:
                if current_phase.lower() != "code":
                    # Phase names and modes must be simple identifiers
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.simple_ident, value)
                else:
                    if r != area[0]:
                        # An Integer
                        try:
                            int(value)
                        except:
                            issues.append((3, "The code must be an integer"))
            except:
                if r == area[0]:
                    issues.append((
                        3, "Phase '" + value +
                        "' of the Pedigree Matrix must be a simple identity (alphabet letter followed by either alphabet letters or numbers"
                    ))
                else:
                    issues.append((
                        3, "A mode (" + value + ") in phase '" + current_phase +
                        "' of the Pedigree Matrix must be a simple identity (alphabet letter followed by either alphabet letters or numbers"
                    ))
            # Append mode to the current phase
            phase_modes.append(dict(mode=value, description=""))
        # BUG FIX: if the header cell of the column was empty, the inner loop broke
        # with current_phase == None, and the checks below then crashed with a
        # TypeError on string concatenation. Skip such empty columns entirely.
        if current_phase is None:
            continue
        # Check: at least one element (phase_modes[0] is the header itself)
        if len(phase_modes) < 2:
            issues.append((3, "Phase '" + current_phase +
                           "' should have at least one mode"))
        # Check: no repetitions
        if len(phase_modes) != len(set([mode["mode"] for mode in phase_modes])):
            if current_phase.lower() != "code":
                issues.append(
                    (3, "There is at least a repeated mode in phase '" +
                     current_phase + "'"))
            else:
                issues.append(
                    (3, "There is at least a repeated code in the list of codes"))
        # Update max column length
        if len(phase_modes) > max_len:
            max_len = len(phase_modes)
        if current_phase.lower() != "code":
            phases.append(phase_modes)
        else:
            codes = phase_modes[1:]  # Drop the header row
    # If not codes: default to descending integers matching the longest phase
    if not codes:
        codes = [str(i) for i in range(max_len - 2, -1, -1)]
    return issues, None, dict(name=name, codes=codes, phases=phases)
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh: Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param name: Name of the sheet (used for issue locations)
    :param state: Case-study state, source of registry objects
    :return: (issues, label, content) — content is the command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values (strings stripped; empty cells skipped)
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition
    if dataset_name is None:
        issues.append(
            Issue(itype=IType.ERROR,
                  description=f"The name of the input dataset must be specified under column 'InputDataset'. Skipping {name} command",
                  location=IssueLocation(sheet_name=name, row=None, column=None)))
        return issues, None, None

    # Obtain the source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(data={k: None for k in dims[d].code_list.keys()})  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            tmp = create_dictionary(
                data={to["d"]: None for o in mappings[m].map for to in o["to"] if to["d"]})
            cl[mappings[m].destination] = tmp

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    out_dims = []
    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions", "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The dimension specified for output, '" + d + "' is neither a dataset dimension nor a mapped dimension. [" + ', '.join([d2 for d2 in cl]) + "]",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures", "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    # FIX: original joined "out_measures.values" (a bound method, not called ->
                    # TypeError when this error path fired; also semantically wrong list).
                    # List the measures actually available in the dataset.
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The specified measure, '" + m + "' is not a measure available in the dataset. [" + ', '.join([m2 for m2 in meas]) + "]",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in ["resultmeasuresaggregation", "resultmeasuresaggregator", "aggregation"]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue
                if f.lower() not in ["sum", "avg", "count", "sumna", "countav", "avgna", "pctna"]:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The specified aggregation function, '" + f + "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in ["resultmeasurename", "resultmeasuresnames", "resultmeasuresas", "measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The code '" + cd + "' is not present in the codes declared for dimension '" + col_name + "'. Please, check them.",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in ["startperiod", "starttime", "endperiod", "endtime"]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                if col_name.lower() == "starttime":
                    col_name = "StartPeriod"
                elif col_name.lower() == "endtime":
                    col_name = "EndPeriod"
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["outputdatasetname", "outputdataset", "result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident, result_name)
                except:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Column '" + col_name + "' has an invalid dataset name '" + result_name + "'",
                              location=IssueLocation(sheet_name=name, row=2, column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(
            Issue(itype=IType.WARNING,
                  description="No aggregation function specified. Assuming 'average'",
                  location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The aggregation function must be defined in the first row",
                      location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))
    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Uniform rows, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if measure and not agg_func or not measure and agg_func:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Each measure must be associated with an aggregation function",
                      location=IssueLocation(sheet_name=name, row=r, column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [v["measure_as"] for v in out_measures.values() if v["measure_as"]]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="At least one measure should be specified",
                  location=IssueLocation(sheet_name=name, row=1, column=measure_names_column)))

    # measures != agg_funcs && len(agg_funcs) == 1 --> OK
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="There must be one aggregation function (used for all measures) or one aggregation per measure",
                  location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        # FIX: the original reported "column=c + 1" with "c" stale from the column loop above;
        # no single column is responsible for a missing name
        issues.append(
            Issue(itype=IType.WARNING,
                  description="No result name specified. Assuming '" + result_name + "'",
                  location=IssueLocation(sheet_name=name, row=2, column=None)))

    content = {"dataset_source": source,
               "dataset_name": dataset_name,
               "dataset_datetime": available_at_datetime,
               "where": filter_,
               "dimensions": [d for d in dims],
               "group_by": out_dims,
               "measures": measures,
               "agg_funcs": agg_funcs,
               "measures_as": measures_as,
               "result_name": result_name
               }
    return issues, None, content
def execute(self, state: "State"):
    """
    Process the parsed "DatasetDef" items: register dataset definitions (name, data location)
    and their concepts (dimensions, measures, attributes), then load external data and publish
    the new Datasets into the global registry.

    :param state: Case-study State, source/target of registry objects
    :return: (issues, None)
    """
    def process_line(item):
        """Process one parsed row: either a dataset declaration (no concept type) or a concept definition."""
        # Read variables
        dsd_dataset_name = item.get("dataset_name", None)
        dsd_dataset_data_location = item.get("dataset_data_location", None)
        dsd_concept_type = item.get("concept_type", None)
        dsd_concept_name = item.get("concept_name", None)
        dsd_concept_data_type = item.get("concept_data_type", None)
        dsd_concept_domain = item.get("concept_domain", None)
        dsd_concept_description = item.get("concept_description", None)
        dsd_attributes = item.get("concept_attributes", None)
        if dsd_attributes:
            try:
                attributes = dictionary_from_key_value_list(dsd_attributes, glb_idx)
            except Exception as e:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description=str(e),
                          location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
        else:
            attributes = {}

        if dsd_dataset_name in ds_names:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The dataset '" + dsd_dataset_name + "' has been already defined",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
            return

        # Internal dataset definitions cache
        ds = current_ds.get(dsd_dataset_name, None)
        if True:  # Statistical dataset format
            if not ds:
                ds = Dataset()
                ds.code = dsd_dataset_name  # Name
                ds.database = None
                ds.attributes = {}
                current_ds[dsd_dataset_name] = ds
            if not dsd_concept_type:  # Dataset declaration row (name + data location)
                if ds.attributes.get("_location"):
                    # FIX: "Former" must come from the previously stored ds.attributes, not the
                    # freshly parsed row attributes (which never hold "_location" at this point)
                    issues.append(
                        Issue(itype=IType.WARNING,
                              description=f"Location of data for dataset {ds.code} previously declared. "
                                          f"Former: {ds.attributes.get('_location')}, "
                                          f"Current: {dsd_dataset_data_location}",
                              location=IssueLocation(sheet_name=name, row=r, column=None)))
                    attributes = ds.attributes  # Keep the former attributes
                else:
                    attributes["_dataset_first_row"] = r
                    attributes["_location"] = dsd_dataset_data_location  # Location
                ds.description = dsd_concept_description
                ds.attributes = attributes  # Set attributes
            else:  # If concept_type is defined => add a concept
                # Check if the concept name already appears --> Error
                for d1 in ds.dimensions:
                    if strcmp(d1.code, dsd_concept_name):
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description=f"Concept {dsd_concept_name} already declared for dataset {ds.code}",
                                  location=IssueLocation(sheet_name=name, row=r, column=None)))
                        break
                d = Dimension()
                d.dataset = ds
                d.description = dsd_concept_description
                d.code = dsd_concept_name
                d.is_measure = False if dsd_concept_type.lower() == "dimension" else True
                if not d.is_measure and dsd_concept_data_type.lower() == "time":
                    d.is_time = True
                else:
                    d.is_time = False
                if dsd_concept_type.lower() == "attribute":
                    attributes["_attribute"] = True
                else:
                    attributes["_attribute"] = False
                if dsd_concept_data_type.lower() == "category":
                    # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or
                    #  "hierarchy_categories_command.py", no insertion is made)
                    # h = hierarchies.get(dsd_concept_domain, None)
                    h = glb_idx.get(Hierarchy.partial_key(name=dsd_concept_domain))
                    if len(h) == 0:
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description="Could not find hierarchy of Categories '" + dsd_concept_domain + "'",
                                  location=IssueLocation(sheet_name=name, row=r, column=None)))
                        return
                    elif len(h) > 1:
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description="Found more than one instance of Categories '" + dsd_concept_domain + "'",
                                  location=IssueLocation(sheet_name=name, row=r, column=None)))
                        return
                    else:  # len(h) == 1
                        h = h[0]
                    d.hierarchy = h
                    # Reencode the Hierarchy as a CodeList
                    cl = convert_hierarchy_to_code_list(h)
                    d.code_list = cl
                attributes["_datatype"] = dsd_concept_data_type
                attributes["_domain"] = dsd_concept_domain
                d.attributes = attributes

    # -------------------------------------------------------------------------------------------------------------
    issues = []
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    # List of available dataset names. The newly defined datasets must not be in this list
    ds_names = [ds.name for ds in datasets]

    # List of available Category hierarchies
    hierarchies = create_dictionary()
    for h in hh:
        # FIX: original stored the whole list ("hh") under every name; store the element
        hierarchies[h.name] = h

    # Datasets being defined in this Worksheet
    current_ds = create_dictionary()  # type: Dict[str, Dataset]

    # Process parsed information
    for line in self._content["items"]:
        r = line["_row"]
        # If the line contains a reference to a dataset or hierarchy, expand it
        # If not, process it directly
        is_expansion = False
        if is_expansion:
            pass
        else:
            process_line(line)

    # Any error?
    error = any_error_issue(issues)

    # Load the data for those datasets that are not local (data defined later in the same spreadsheet)
    for ds in current_ds.values():
        if "_location" not in ds.attributes:
            error = True
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Location of data not specified, for dataset '" + ds.code + "'",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
        else:
            loc = ds.attributes["_location"]
            ast = parser_field_parsers.string_to_ast(url_parser, loc)
            if ast["scheme"] != "data":
                df = load_dataset(loc)
                if df is None:
                    error = True
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=f"Could not obtain data for dataset '{ds.code}' at '{loc}'",
                              location=IssueLocation(sheet_name=name, row=r, column=None)))
                else:
                    iss = prepare_dataframe_after_external_read(ds, df, name)
                    issues.extend(iss)
                    # Everything ok? Store the dataframe!
                    if len(iss) == 0:
                        ds.data = df

    if not error:
        # If no error happened, add the new Datasets to the Datasets in the "global" state
        for ds in current_ds:
            r = current_ds[ds].attributes["_dataset_first_row"]
            df = current_ds[ds].data
            if df is not None:
                # FIX: the error message below used "loc", stale (or unbound) from the previous
                # loop; use this dataset's own declared location
                ds_loc = current_ds[ds].attributes.get("_location")
                # Loop over "ds" concepts.
                # - "dimension" concepts of type "string" generate a CodeHierarchy
                # - Check that the DataFrame contains ALL declared concepts. If not, generate issue
                cid = create_dictionary(data={col: col for col in df.columns})
                col_names = list(df.columns)
                for c in current_ds[ds].dimensions:
                    if c.code in df.columns:
                        col_names[df.columns.get_loc(cid[c.code])] = c.code  # Rename column
                        dsd_concept_data_type = c.attributes["_datatype"]
                        if dsd_concept_data_type.lower() == "string" and not c.is_measure:  # Freely defined dimension
                            cl = df[cid[c.code]].unique().tolist()
                            c.code_list = CodeList.construct(
                                c.code, c.code, [""],
                                codes=[CodeImmutable(cd, cd, "", []) for cd in cl])
                    else:
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description=f"Concept '{c.code}' not defined for '{ds}' in {ds_loc}",
                                  location=IssueLocation(sheet_name=name, row=r, column=None)))
                df.columns = col_names
            datasets[ds] = current_ds[ds]

    return issues, None
issues.append(Issue(itype=IType.ERROR, description=f"The value '{value}' specified for the parameter '{p.name}' is not in the codes of the hierarchy '{p.range}': {', '.join(h.codes.keys())}", location=IssueLocation(sheet_name=sheet_name, row=row, column=None))) retval = False return retval if __name__ == '__main__': from nexinfosys.model_services import State from dotted.collection import DottedDict issues = [] s = State() ex = "level =”N - 1”, farm_type =”GH”, efficiency = 0.3" ast = string_to_ast(key_value_list, ex) res, unres = ast_evaluator(ast, s, None, issues) s.set("Param1", 2.1) # s.set("Param", 0.1) s.set("Param2", 0.2) s.set("p1", 2.3) ej = "level='n+1', r=[Ref2019], a=5*p1, c=?p1>3 -> 'T1', p1<=2 -> 'T2', 'T3'?" ast = string_to_ast(key_value_list, ej) res, unres = ast_evaluator(ast, s, None, issues) examples = ["?Param1 > 3 -> 5, Param1 <=3 -> 2?", "(Param1 * 3 >= 0.3) AND (Param2 * 2 <= 0.345)", "cos(Param*3.1415)", "{Param} * 3 >= 0.3", "'Hola'",
def _init_and_process_row(self, row: Dict[str, Any]) -> None:
    """
    Initialize per-row state, then process the row, expanding "{...}" macro expressions first
    when the row is marked expandable.

    When expansion references exactly one dataset, each row of (a projection of) that dataset's
    DataFrame produces one expanded copy of the command row, which is re-parsed syntactically
    and processed. Hierarchy expansion is not implemented yet.

    :param row: Parsed command row; expects keys "_row" (worksheet row number) and
                "_expandable" (list of macro expression strings) — assumed from the parser. 
    """
    def obtain_dictionary_with_not_expandable_fields(d):
        # Keep only fields whose value contains no "{" (i.e. no macro to expand)
        output = {}
        for k, v in d.items():
            if v is None or "{" not in v:
                output[k] = v
        return output

    self._current_row_number = row["_row"]
    self._fields = self._get_command_fields_values(row)
    tmp_fields = self._fields  # Saved so the expansion loop can restore the originals
    self._check_all_mandatory_fields_have_values()
    # If expandable, do it now
    expandable = row["_expandable"]
    if expandable:
        # Extract variables referenced by each macro expression (dry-run evaluation on empty state)
        state = State()
        issues = []
        asts = {}  # "{expr}" -> parsed AST, reused for every dataset row below
        referenced_variables = create_dictionary()
        for e in expandable:
            ast = parser_field_parsers.string_to_ast(arith_boolean_expression, e)
            c_name = f"{{{e}}}"
            asts[c_name] = ast
            res, vars = ast_evaluator(ast, state, None, issues, atomic_h_names=True)
            for v in vars:
                referenced_variables[v] = None
        # Classify referenced names into datasets / dataset concepts / hierarchies
        res = classify_variables2(referenced_variables.keys(), self._datasets, self._hierarchies, self._parameters)
        ds_list = res["datasets"]
        ds_concepts = res["ds_concepts"]
        h_list = res["hierarchies"]
        # Validity: at most one dataset OR one hierarchy, never both
        if len(ds_list) >= 1 and len(h_list) >= 1:
            self._add_issue(itype=IType.ERROR,
                            description="Dataset(s): " + ", ".join([d.name for d in ds_list]) +
                                        ", and hierarchy(ies): " + ", ".join([h.name for h in h_list]) +
                                        ", have been specified. Only a single dataset is supported.")
            return
        elif len(ds_list) > 1:
            self._add_issue(itype=IType.ERROR,
                            description="More than one dataset has been specified: " +
                                        ", ".join([d.name for d in ds_list]) +
                                        ", just one dataset is supported.")
            return
        elif len(h_list) > 0:
            self._add_issue(itype=IType.ERROR,
                            description="One or more hierarchies have been specified: " +
                                        ", ".join([h.name for h in h_list]))
            return
        if len(ds_list) == 1:
            # Expand dataset
            ds = ds_list[0]
            measure_requested = False
            all_dimensions = set([c.code for c in ds.dimensions if not c.is_measure])
            requested_dimensions = set()
            requested_measures = set()
            # Partition requested concepts into measures and dimensions
            for con in ds_concepts:
                found = False
                for c in ds.dimensions:
                    if strcmp(c.code, con):
                        found = True
                        if c.is_measure:
                            measure_requested = True
                            requested_measures.add(c.code)
                        else:  # Dimension
                            all_dimensions.remove(c.code)
                            requested_dimensions.add(c.code)
                if not found:
                    self._add_issue(itype=IType.ERROR,
                                    description=f"The concept '{{{ds.code}.{con}}}' is not in the dataset '{ds.code}'")
                    return
            ds_concepts = list(requested_measures)
            ds_concepts.extend(list(requested_dimensions))
            all_dimensions_requested = len(all_dimensions) == 0
            if measure_requested and not all_dimensions_requested:
                # A measure with a partial set of dimensions would need an implicit aggregation
                self._add_issue(IType.ERROR,
                                f"It is not possible to use a measure ({', '.join(requested_measures)}), if not all dimensions are used "
                                f"(cannot assume implicit aggregation). Dimensions not used: {', '.join(all_dimensions)}")
                return
            elif not measure_requested and not all_dimensions_requested:
                # Reduce the Dataframe to unique tuples of the specified dimensions
                # TODO Consider the current case -sensitive or not-sensitive-
                data = ds.data[list(requested_dimensions)].drop_duplicates()
            else:  # Take the dataset as-is
                data = ds.data
            # Remove Index, and do it NOT-INPLACE
            data = data.reset_index()
            # Drop rows with empty dimension value
            import numpy as np
            # NOTE(review): np.NaN was removed in NumPy 2.0 (use np.nan) — confirm pinned NumPy version
            data = data.replace(r'^\s*$', np.NaN, regex=True)
            # NOTE(review): passing a set as "subset" may be rejected by newer pandas — confirm
            data.dropna(subset=requested_dimensions, inplace=True)
            const_dict = obtain_dictionary_with_not_expandable_fields(self._fields)  # row?
            # Fields still containing macros (a set, despite the name)
            var_dict = set([f for f in self._fields.keys() if f not in const_dict])
            re_concepts = {}
            for c in ds_concepts:
                c_name = f"{{{ds.code}.{c}}}"
                if case_sensitive:
                    re_concepts[c_name] = re.compile(c_name)
                else:
                    re_concepts[c_name] = re.compile(c_name, re.IGNORECASE)
            location = IssueLocation(sheet_name=self._command_name,
                                     row=self._current_row_number, column=None)
            already_parsed_fields = set(const_dict.keys())
            for ds_row, row2 in enumerate(data.iterrows()):  # Each row in the dataset
                # Initialize constant values (those with no "{..}" expressions)
                row3 = const_dict.copy()
                # Prepare state to evaluate functions
                state = State()
                for c in ds_concepts:
                    state.set(f"{ds.code}.{c}", str(row2[1][c]))
                # Pass PartialRetrievalDictionary to the evaluator. For functions needing it
                state.set("_glb_idx", self._glb_idx)
                # Evaluate all functions
                expressions = {}
                for e, ast in asts.items():
                    res, vars = ast_evaluator(ast, state, None, issues, atomic_h_names=True)
                    expressions[e] = res
                # Expansion into var_dict; longest macro first to avoid partial overlaps
                for f in var_dict:
                    v = self._fields[f]  # Initial value
                    for item in sorted(expressions.keys(), key=len, reverse=True):
                        v = v.replace(item, expressions[item])
                    row3[f] = v
                # Syntactic verification of the resulting expansion
                processable, tmp_issues = parse_cmd_row_dict(self._serialization_type, row3,
                                                             already_parsed_fields, location)
                if len(tmp_issues) > 0:
                    self._issues.extend(tmp_issues)
                # Process row
                if processable:
                    self._fields = row3
                    self._process_row(row3, ds_row)
            self._fields = tmp_fields  # Restore the un-expanded fields
        elif len(h_list) == 1:
            # Expand hierarchy
            pass
    else:
        self._process_row(self._fields)  # Process row
def parse_cmd_row_dict(cmd_name: str, row: Dict[str, str], already_parsed_fields: Set[str],
                       location: IssueLocation) -> Tuple[bool, List[Issue]]:
    """
    Parse a row (as a dictionary) from a command
    It is used after expansion of "macros"

    :param cmd_name: Name of command
    :param row: A dictionary containing the values to parse syntactically. Keys are field names, Values are field values
    :param already_parsed_fields: Set of fields already known to be syntactically valid
    :param location: IssueLocation object to use when creating Issues
    :return: A tuple: a boolean (True if the row can be used, otherwise False) and a list of Issues
    :raises ParseException: if a field in "row" is not defined for "cmd_name"
    """
    issues: List[Issue] = []
    from nexinfosys.command_field_definitions import command_fields
    field_defs_dict = {f.name: f for f in command_fields[cmd_name]}
    # Unconditionally mandatory fields; names are discarded as they are found in the row
    mandatory_not_found = set([
        c.name for c in command_fields[cmd_name]
        if c.mandatory and isinstance(c.mandatory, bool)
    ])
    # Fields whose "mandatory" is an expression over other field values
    complex_mandatory_cols = [
        c for c in command_fields[cmd_name] if isinstance(c.mandatory, str)
    ]
    may_append = True
    complex_row = False
    for field_name, field_value in row.items():
        field_def = field_defs_dict.get(field_name)
        if not field_def:
            # FIX: original "return ParseException(...)" returned an exception instance
            # (truthy, and violating the declared Tuple return type) instead of raising it
            raise ParseException(f"Field {field_name} not found for command {cmd_name}")
        if field_value is not None:
            if not isinstance(field_value, str):
                field_value = str(field_value)
            field_value = field_value.strip()
        else:
            continue
        # Parse the field
        if field_def.allowed_values:
            if field_value.lower() not in [v.lower() for v in field_def.allowed_values]:  # Case insensitive
                issues.append(
                    Issue(itype=IType.ERROR,
                          description=f"Field '{field_name}' of command '{cmd_name}' has invalid value '{field_value}'."
                                      f" Allowed values are: {', '.join(field_def.allowed_values)}.",
                          location=location))
                may_append = False
        else:  # Instead of a list of values, check if a syntactic rule is met by the value
            if field_def.parser:  # Parse, just check syntax (do not store the AST)
                try:
                    if field_name not in already_parsed_fields:
                        ast = parser_field_parsers.string_to_ast(field_def.parser, field_value)
                        # Rules are in charge of informing if the result is expandable and if it complex
                        if "expandable" in ast and ast["expandable"]:
                            issues.append(
                                Issue(itype=IType.ERROR,
                                      description=f"Field '{field_name}' of command '{cmd_name}' cannot be expandable again.",
                                      location=location))
                            may_append = False
                        if "complex" in ast and ast["complex"]:
                            complex_row = True
                except Exception:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=f"The value in field '{field_name}' of command '{cmd_name}' "
                                          f"is not syntactically correct. Entered: {field_value}",
                              location=location))
                    may_append = False
        if field_def.name in mandatory_not_found:
            mandatory_not_found.discard(field_def.name)

    # MODIFY INPUT Dictionary with this new Key
    if complex_row:
        row["_complex"] = complex_row

    # Append if all mandatory fields have been filled
    if len(mandatory_not_found) > 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description=f"Mandatory columns: {', '.join(mandatory_not_found)} have not been specified",
                  location=location))
        may_append = False

    # Check varying mandatory fields (fields depending on the value of other fields)
    for c in complex_mandatory_cols:
        if isinstance(c.mandatory, str):
            # Evaluate; the expression comes from the command field definitions (code), not user input
            mandatory = eval(c.mandatory, None, row)
            # FIX: original unconditionally reassigned "may_append", which could reset a False
            # produced by earlier field-validation errors back to True; only ever lower it here
            if mandatory and c.name not in row:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="Mandatory column: " + c.name + " has not been specified",
                          location=location))
                may_append = False

    return may_append, issues
def validate_command(command_content_to_validate):
    """
    The input comes in a JSON field "content":
    {"command": "<command name", "fields": {"<field name>": "<value", ...} }

    :param command_content_to_validate: dict with "command", "fields" and optionally
           "alternative_command_names" (a dict of worksheet-name -> command-name aliases)
    :return: (result, status) — "result" maps each field name to None (ok) or a diagnostic
             message; "status" is True only if every field validated
    """
    def split_expansion_expressions(f, content):
        """Split "content" into (text, is_expansion) pieces; return (pieces, error_message_or_None)."""
        pieces = []
        offset = 0
        look_for = "{"
        open_brace = False
        s = None
        while offset < len(content):
            pos = content[offset:].find(look_for)
            if pos >= 0:
                if look_for == "{":
                    if pos > 0:
                        pieces.append((content[offset:offset + pos], False))  # Literal
                    look_for = "}"
                    open_brace = True
                else:
                    if pos > 0:
                        pieces.append((content[offset:offset + pos], True))  # Expansion
                    else:
                        s = f"Invalid syntax in field '{f}' with value: " + content + ". No expression between curly braces."
                        break
                    look_for = "{"
                    open_brace = False
                offset += pos + 1
            else:  # Character not found
                if open_brace:
                    s = f"Invalid syntax in field '{f}' with value: " + content + ". Curly brace not closed."
                    break
                else:
                    # Add the rest
                    pieces.append((content[offset:], False))
                    offset = len(content)
        return pieces, s

    if "command" in command_content_to_validate:
        command = command_content_to_validate["command"]
    else:
        raise Exception("Must specify 'command'")
    if "fields" in command_content_to_validate:
        fields = command_content_to_validate["fields"]
    else:
        raise Exception("Must specify 'fields'")
    alternative_command_names = command_content_to_validate.get("alternative_command_names", {})
    result = {}

    # Find command from the worksheet name ("command"): the longest allowed name contained wins
    match = None
    for cmd in commands:
        for cmd_name in cmd.allowed_names:
            if cmd_name.lower() in command.lower():
                if not match or match[1] < len(cmd_name):
                    match = (cmd.name, len(cmd_name))
    if not match:
        # FIX: original iterated the dict directly ("for k, v in alternative_command_names:"),
        # which yields keys only and fails to unpack for any non-empty dict; use .items()
        for k, v in alternative_command_names.items():
            if k.lower() in command.lower():
                for cmd in commands:
                    for cmd_name in cmd.allowed_names:
                        if cmd_name.lower() in v.lower():
                            match = (cmd.name, 0)
                            break
                    if match:
                        break
            if match:
                break

    # Fields in the command
    status = True
    if match:
        for f in fields:  # Validate field by field
            # Find corresponding field in the command
            for f2 in command_fields[match[0]]:
                if f.lower() in [f3.lower() for f3 in f2.allowed_names]:
                    fld = f2
                    break
            else:
                fld = None
            if fld:  # If found, can validate syntax
                # Validate Syntax
                content = fields[f]
                content_msg = content  # Original "content", to show in case of error
                if isinstance(content, (int, float)):
                    content = str(content)
                valid = True
                # Check if it is an expansion expression
                if "{" in content or "}" in content:
                    # Is expansion allowed in this command?
                    expansion_allowed = True
                    if expansion_allowed:
                        pieces, s = split_expansion_expressions(f, content)
                        if s is None:
                            c = ""
                            for p in pieces:
                                if p[1]:  # Expansion expression
                                    try:
                                        string_to_ast(arith_boolean_expression, p[0])
                                        c += "expand"
                                    except:
                                        s = f"Invalid syntax in field '{f}' with value: {content}, expansion expression '{p[0]}' invalid"
                                        result[f] = s
                                        valid = False
                                        break
                                else:
                                    c += p[0]
                            if valid:
                                content = c
                        else:
                            valid = False
                if not valid:
                    result[f] = s
                    status = False
                else:
                    if fld.allowed_values:
                        if content != content_msg:
                            # It was an expansion expression, cannot check it now, assume it is good
                            result[f] = None
                        else:
                            # Case insensitive comparison
                            if content.lower().strip() in [av.lower().strip() for av in fld.allowed_values]:
                                result[f] = None
                            else:
                                result[f] = "'" + content + "' in field '" + f + "' must be one of: " + ", ".join(fld.allowed_values)
                                status = False
                    else:
                        try:
                            string_to_ast(fld.parser, content)
                            result[f] = None
                        except:
                            s = f"Invalid syntax in field '{f}' with value: '{content_msg}'"
                            if fld.examples:
                                s += ". Examples: " + ", ".join(fld.examples)
                            result[f] = s
                            status = False
            else:
                # FIX: original indexed command_fields[command] (keyed by internal command name,
                # not the worksheet name) which raised KeyError; use the matched command name
                result[f] = "Field '" + f + "' not found in command '" + command + "'. Possible field names: " + ", ".join(
                    [item for f2 in command_fields[match[0]] for item in f2.allowed_names])
                status = False
    else:
        for f in fields:  # Validate field by field
            result[f] = "Command '" + command + "' not found in the list of command names: " + ", ".join(
                [n for c in commands for n in c.allowed_names])
            status = False

    return result, status
def parse_scale_conversion_command(sh: Worksheet, area: AreaTupleType, name: str = None) -> IssuesLabelContentTripleType:
    """
    Analyze the input area
    Obtain the numerical part
    Read a row above and a column to the left, looking for source (left col) and target (row above) factor types
    FactorTypes do not need to exist previously, they can be created

    The numerical rectangle is auto-detected (binary_mask_from_worksheet + obtain_rectangular_submatrices),
    so the command does not need explicit row/column headers beyond the labels adjacent to the matrix.

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
                 command is present
    :param name: Optional command name (currently unused)
    :return: list of issues (issue_type, message), command label (always None here), command content
    """
    def get_subrow(r, c1, c2):
        # Return the cell values of row "r" in the column range [c1, c2)
        lst = []
        # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell
        previous = None
        for c in range(c1, c2):
            v = sh.cell(row=r, column=c).value
            if not v:
                if previous:
                    lst.append(previous)
                else:
                    lst.append("")
            else:
                previous = v
                lst.append(v)
        return lst

    def get_subcolumn(c, r1, r2):
        # Return the cell values of column "c" in the row range [r1, r2)
        lst = []
        # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell
        # !!! This may not be correct at all times: when a cell is intentionally left blank
        # To solve this, use "sh.merged_cell_ranges" to check if the current cell (r, c) is inside a range
        previous = None
        for r in range(r1, r2):
            v = sh.cell(row=r, column=c).value
            if not v:
                if previous:
                    lst.append(previous)
                else:
                    lst.append("")
            else:
                previous = v
                lst.append(v)
        return lst

    # ---------------------------------------------

    some_error = False
    issues = []  # List of (issue_type, message); issue_type 3 means ERROR

    # Detect the matrix defining scales
    m = binary_mask_from_worksheet(sh, True)  # "True" is to focus on cells containing numbers

    # Locate the matrix with numbers. Assume this defines the labels to consider, they will be around the matrix
    t = obtain_rectangular_submatrices(m)[0]  # Take just the first tuple: U=t[0], D=t[1], L=t[2], R=t[3]
    t = (t[0]+1, t[1]+1, t[2]+1, t[3]+1)  # The previous calculation is done using Numpy, so it is Zero based. Correct this (worksheet cells are 1-based)

    # Obtain the factor type names in the subrow on top of the matrix
    subrow = get_subrow(t[0]-1, t[2], t[3])
    # Obtain the factor type names in the subcolumn to the left of the matrix
    subcol = get_subcolumn(t[2]-1, t[0], t[1])

    # Check that we have valid factor type names (both origins and destinations)
    for ft in subrow+subcol:
        try:
            parser_field_parsers.string_to_ast(parser_field_parsers.simple_h_name, ft)
        except:
            some_error = True
            issues.append((3, "'"+ft+"' is not a valid Factor Type name"))

    if some_error:
        # Do not attempt to parse the matrix if any label is invalid
        return issues, None, None

    # Scan the matrix, creating scale records
    # Rows are origin factor types, columns are destination factor types; each non-empty cell is a scale expression
    scales = []
    for i, r in enumerate(range(t[0], t[1])):
        for j, c in enumerate(range(t[2], t[3])):
            v = sh.cell(row=r, column=c).value
            if v:
                if not isinstance(v, str):
                    v = str(v)
                # Origin factor
                origin = subcol[i]
                # Destination factor
                destination = subrow[j]
                if strcmp(origin, destination):
                    # Self-conversion makes no sense; flag it but keep scanning the rest of the matrix
                    issues.append((3, "A change of scale to the same factor type ("+origin+") is not allowed"))
                else:
                    try:
                        # Validate the scale expression syntax only; evaluation happens at execution time
                        parser_field_parsers.string_to_ast(parser_field_parsers.expression_with_parameters, v)
                        # Add the scale
                        scales.append(dict(origin=origin, destination=destination, scale=v))
                    except:
                        issues.append((3, "The expression '"+v+"' at the intersection of factor types " + origin + " and " + destination + " is syntactically incorrect"))

    content = {"origin_factor_types": subcol,
               "destination_factor_types": subrow,
               "scales": scales
               }
    return issues, None, content
def parse_structure_command(sh: Worksheet, area: AreaTupleType, name: str = None) -> IssuesLabelContentTripleType:
    """
    Analyze the input to produce a JSON object with a list of Observables and relations to other Observables

    Result:[
            {"origin": <processor or factor>,
             "description": <label describing origin>,
             "attributes": {"<attr>": "value"},
             "default_relation": <default relation type>,
             "dests": [
                {"name": <processor or factor>,
                 Optional("relation": <relation type>,)
                 "weight": <expression resulting in a numeric value>
                }
             }
            ]

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
                 command is present
    :param name: Optional command name (currently unused)
    :return: list of issues (issue_type, message), command label (always None), command content
    """
    some_error = False
    issues = []

    # Scan the sheet, the first column must be one of the keys of "k_list", following
    # columns can contain repeating values
    # Map of allowed (case-insensitive) header names -> internal column key
    col_names = {("origin", "name"): "origin",
                 ("relation", "default relation"): "default_relation",
                 ("destination", "destinations"): "destinations",
                 ("origin label", "label"): "description"}

    # Check columns: build worksheet column index -> internal key
    col_map = collections.OrderedDict()
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=area[0], column=c).value
        if not col_name:
            continue
        for k in col_names:
            if col_name.lower() in k:
                col_map[c] = col_names[k]
                break

    # Map key to a list of values
    content = []  # One dict per data row
    for r in range(area[0] + 1, area[1]):
        item = {}
        for c in col_map:
            value = sh.cell(row=r, column=c).value
            if not value:
                continue
            k = col_map[c]
            if k == "origin":  # Mandatory
                # Check syntax
                try:
                    parser_field_parsers.string_to_ast(parser_field_parsers.factor_name, value)
                    item[k] = value
                except:
                    some_error = True
                    issues.append((3, "The name specified for the origin element, '" + value + "', is not valid, in row " + str(r) + ". It must be either a processor or a factor name."))
            elif k == "default_relation":  # Optional (if not specified, all destinations must specify it)
                # Check syntax
                allowed_relations = ('|', '>', '<', '<>', '><', '||')
                if value in allowed_relations:
                    item[k] = value
                else:
                    some_error = True
                    issues.append((3, "The Default relation type specified for the origin element, '" + value + "', is not valid, in row " + str(r) + ". It must be one of " + ', '.join(allowed_relations) + "."))
            elif k == "destinations":  # Mandatory
                # BUGFIX: "dummy" must be reset for every cell. Previously it was only assigned inside the
                # "try" blocks, so on the first failing cell it was referenced unbound (NameError), and on
                # later failing cells it kept the stale truthy AST of a previous successful parse, causing
                # invalid destination values to be silently accepted.
                dummy = None
                # Because the expression (weight relation p_f_name) and the simple p_f_name can collide syntactically,
                # first try the simpler expression then the complex one
                try:
                    dummy = parser_field_parsers.string_to_ast(parser_field_parsers.factor_name, value)
                except:
                    try:
                        dummy = parser_field_parsers.string_to_ast(parser_field_parsers.relation_expression, value)
                    except:
                        traceback.print_exc()
                        some_error = True
                        issues.append((3, "The specification of destination, '" + value + "', is not valid, in row " + str(r) + ". It is a sequence of weight (optional) relation (optional) destination element (mandatory)"))
                # Check syntax. It can contain: a weight, a relation type, a processor or factor name.
                if dummy:
                    # Accumulate destinations of the row in a list
                    lst = item.setdefault(k, [])
                    lst.append(value)
            elif k == "description":  # Optional
                item[k] = value

        # Check parameter completeness before adding it to the list of parameters
        if "origin" not in item:
            issues.append((3, "The element must have an Origin, row " + str(r)))
            continue
        if "destinations" not in item:
            issues.append((3, "The element must have at least one Destination, row " + str(r)))
            continue

        content.append(item)

    return issues, None, dict(structure=content)
def parse_etl_external_dataset_command(sh: Worksheet, area: AreaTupleType, dataset_name: str, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh: Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param dataset_name: Name of the external dataset to query
    :param state: Transient state, used to obtain the registry of datasets and mappings
    :return: The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # Dataset source (local import to avoid a circular dependency at module load time)
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)

    # Obtain metadata: dimensions, attributes and measures declared by the dataset
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)

    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = [k.lower() for k in dims[d].code_list.keys()]  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            # NOTE(review): mapped codes are stored as-is while dataset codes are lowered; the
            # membership test below lowers the queried code — confirm mapped codes are lowercase
            tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            cl[mappings[m].destination] = set(tmp)  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    measures = []
    out_dims = []
    agg_funcs = []
    measures_as = []
    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["dimensions_kept", "dims", "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for d in lst:
                if not d:
                    continue
                if d not in cl:
                    issues.append((3, "The dimension specified for output, '" + d + "' is neither a dataset dimension nor a mapped dimension. [" + ', '.join([d2 for d2 in cl]) + "]"))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["aggregation_function", "aggfunc", "agg_func"]:  # "SELECT AGGREGATORS"
            lst = obtain_column(c, area[0] + 1, area[1])
            for f in lst:
                if f.lower() not in ["sum", "avg", "count", "sumna", "countav", "avgna", "pctna"]:
                    issues.append((3, "The specified aggregation function, '" + f + "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'"))
                else:
                    agg_funcs.append(f)
        elif col_name.lower().strip() in ["measures"]:  # "SELECT"
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for m in lst:
                if not m:
                    continue
                if m not in meas:
                    # BUGFIX: list the AVAILABLE measures ("meas"), not the already-selected ones
                    # ("measures", which is empty at this point and made the message useless)
                    issues.append((3, "The specified measure, '" + m + "' is not a measure available in the dataset. [" + ', '.join([m2 for m2 in meas]) + "]"))
                else:
                    measures.append(m)
        elif col_name.lower().strip() in ["measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for m in lst:
                measures_as.append(m)
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for cd in lst:
                if not cd:
                    continue
                if str(cd).lower() not in cl[col_name]:
                    # BUGFIX: "cd" may be numeric (the membership test already coerces with str());
                    # concatenating it directly raised TypeError instead of reporting the issue
                    issues.append((3, "The code '" + str(cd) + "' is not present in the codes declared for dimension '" + col_name + "'. Please, check them."))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in ["startperiod", "endperiod"]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident, result_name)
                except:
                    issues.append((3, "Column '" + col_name + "' has an invalid dataset name '" + result_name + "'"))

    if len(measures) == 0:
        issues.append((3, "At least one measure should be specified"))

    if len(agg_funcs) == 0:
        issues.append((2, "No aggregation function specified. Assuming 'average'"))
        # NOTE(review): the validation above accepts "avg" (not "average") — confirm the executor
        # understands "average" as the default aggregator
        agg_funcs.append("average")

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append((2, "No result name specified. Assuming '" + result_name + "'"))

    content = {"dataset_source": source,
               "dataset_name": dataset_name,
               "dataset_datetime": None,
               "where": filter_,
               "dimensions": [d for d in dims],
               "group_by": out_dims,
               "measures": measures,
               "agg_funcs": agg_funcs,
               "measures_as": measures_as,
               "result_name": result_name
               }
    return issues, None, content
def _process_row(self, field_values: Dict[str, Any], subrow=None) -> None:
    """
    Process a dictionary representing a row of the Interfaces command. The dictionary can come directly from
    the worksheet or from a dataset.

    Resolves/creates the Interface (Factor) for the row's Processor, merges attributes, normalizes units,
    resolves an optional "RelativeTo" interface, and registers a quantitative observation when a value is
    available.

    :param field_values: dictionary of field name -> cell value for the row
    :param subrow: identifier of the sub-row (used only to enrich issue messages) -- TODO confirm type
    :raises CommandExecutionError: on any unrecoverable inconsistency in the row
    """
    # f_processor_name -> p
    # f_interface_type_name -> it
    # f_interface_name -> i
    #
    # IF NOT i AND it AND p => i_name = it.name => get or create "i"
    # IF i AND it AND p => get or create "i", IF "i" exists, i.it MUST BE equal to "it" (IF NOT, error)
    # IF i AND p AND NOT it => get "i" (MUST EXIST)
    f_interface_type_name = field_values.get("interface_type")
    f_interface_name = field_values.get("interface")

    if not f_interface_name:
        if not f_interface_type_name:
            raise CommandExecutionError("At least one of InterfaceType or Interface must be defined" + subrow_issue_message(subrow))
        # No explicit Interface name: use the InterfaceType name as the Interface name
        f_interface_name = f_interface_type_name

    processor = self.find_processor(field_values.get("processor"), subrow)

    # Try to find Interface
    f_orientation = field_values.get("orientation")
    interface_type: Optional[FactorType] = None
    interface: Optional[Factor] = None
    interfaces: Sequence[Factor] = self._glb_idx.get(Factor.partial_key(processor=processor, name=f_interface_name))
    if len(interfaces) == 1:
        # Interface already declared: reuse it, but its type must match the one specified (if any)
        interface = interfaces[0]
        print(f"DEBUG - Interface '{interface.name}' found")
        interface_type = interface.taxon
        if f_interface_type_name and not strcmp(interface_type.name, f_interface_type_name):
            self._add_issue(IType.WARNING, f"The existing Interface '{interface.name}' has the InterfaceType "
                                           f"'{interface_type.name}' which is different from the specified "
                                           f"InterfaceType '{f_interface_type_name}'. Record skipped." + subrow_issue_message(subrow))
            return
    elif len(interfaces) > 1:
        raise CommandExecutionError(f"Interface '{f_interface_name}' found {str(len(interfaces))} times. "
                                    f"It must be uniquely identified." + subrow_issue_message(subrow))
    elif len(interfaces) == 0:
        # The interface does not exist, create it below
        if not f_orientation:
            raise CommandExecutionError(f"Orientation must be defined for new Interfaces." + subrow_issue_message(subrow))

    # InterfaceType still not found (i.e. the Interface did not pre-exist): look it up by name
    if not interface_type:
        interface_type_name = ifnull(f_interface_type_name, f_interface_name)

        # Find FactorType
        # TODO Allow creating a basic FactorType if it is not found?
        interface_types: Sequence[FactorType] = self._glb_idx.get(FactorType.partial_key(interface_type_name))
        if len(interface_types) == 0:
            raise CommandExecutionError(f"InterfaceType '{interface_type_name}' not declared previously" + subrow_issue_message(subrow))
        elif len(interface_types) > 1:
            raise CommandExecutionError(f"InterfaceType '{interface_type_name}' found {str(len(interface_types))} times. "
                                        f"It must be uniquely identified." + subrow_issue_message(subrow))
        else:
            interface_type = interface_types[0]

    # Get attributes default values taken from Interface Type or Processor attributes
    # Rows   : value of (source) "processor.subsystem_type"
    # Columns: value of (target) "interface_type.opposite_processor_type"
    # Cells  : CORRECTED value of "opposite_processor_type"
    # +--------+-------+--------+-------+---------+
    # |        | Local | Env    | Ext   | ExtEnv  |
    # +--------+-------+--------+-------+---------+
    # | Local  | Local | Env    | Ext   | ExtEnv  |
    # | Env    | Local | Env    | Ext   | ExtEnv? |
    # | Ext    | Ext   | ExtEnv | Local | Env     |
    # | ExtEnv | Ext   | ExtEnv | Local | Env?    |
    # +--------+-------+--------+-------+---------+
    if interface_type.opposite_processor_type:
        tmp = interface_type.opposite_processor_type.lower()
        if processor.subsystem_type.lower() in ["local", "environment"]:  # First two rows
            opposite_processor_type = tmp
        else:
            # Last two rows: invert the declared opposite type via the class-level lookup table
            opposite_processor_type = InterfacesAndQualifiedQuantitiesCommand.invert[tmp]
            # TODO in doubt. Maybe these are undefined (values with question mark in the table)
            # if tmp == "externalenvironment" and processor.subsystem_type.lower() in ["environment", "externalenvironment"]:
            #     pass
    else:
        opposite_processor_type = None

    interface_type_values = {
        "sphere": interface_type.sphere,
        "roegen_type": interface_type.roegen_type,
        "opposite_processor_type": opposite_processor_type
    }

    # Get internal and user-defined attributes in one dictionary
    # Use: value specified in Interfaces ELSE value specified in InterfaceTypes ELSE first value of allowed values
    attributes = {c.name: ifnull(field_values[c.name], ifnull(interface_type_values.get(c.name), head(c.allowed_values)))
                  for c in self._command_fields if c.attribute_of == Factor}

    if not interface:
        # f_list: Sequence[Factor] = self._glb_idx.get(
        #     Factor.partial_key(processor=p, factor_type=ft, orientation=f_orientation))
        #
        # if len(f_list) > 0:
        #     raise CommandExecutionError(f"An interface called '{f_list[0].name}' for Processor '{f_processor_name}'"
        #                                 f" with InterfaceType '{f_interface_type_name}' and orientation "
        #                                 f"'{f_orientation}' already exists"+subrow_issue_message(subrow))

        # Transform text of "interface_attributes" into a dictionary
        interface_attributes = self.transform_text_attributes_into_dictionary(field_values.get("interface_attributes"), subrow)
        attributes.update(interface_attributes)

        location = self.get_location(field_values.get("location"), subrow)

        # Create the new Interface attached to the Processor and register it in the global index
        interface = Factor.create_and_append(f_interface_name,
                                             processor,
                                             in_processor_type=FactorInProcessorType(external=False, incoming=False),
                                             taxon=interface_type,
                                             geolocation=location,
                                             tags=None,
                                             attributes=attributes)
        self._glb_idx.put(interface.key(), interface)
        print(f"DEBUG - Interface '{interface.name}' created")
    elif not interface.compare_attributes(attributes):
        # Re-declaring an existing Interface with different attributes is an error
        initial = ', '.join([f"{k}: {interface.get_attribute(k)}" for k in attributes])
        new = ', '.join([f"{k}: {attributes[k]}" for k in attributes])
        name = interface.processor.full_hierarchy_names(self._glb_idx)[0] + ":" + interface.name
        raise CommandExecutionError(f"The same interface '{name}', is being redeclared with different properties. "
                                    f"INITIAL: {initial}; NEW: {new}." + subrow_issue_message(subrow))

    f_unit = field_values.get("unit")
    if not f_unit:
        # Default the unit to the InterfaceType's unit
        f_unit = interface_type.unit

    # Unify unit (it must be done before considering RelativeTo -below-, because it adds a transformation to "f_unit")
    f_value = field_values.get("value")
    if f_value is not None and f_unit != interface_type.unit:
        try:
            f_value = UnitConversion.convert(f_value, f_unit, interface_type.unit)
        except DimensionalityError:
            raise CommandExecutionError(f"Dimensions of units in InterfaceType ({interface_type.unit}) and specified ({f_unit}) are not convertible" + subrow_issue_message(subrow))
        f_unit = interface_type.unit

    # Search for a relative_to interface
    # f_relative_to is a string like "interface_name unit_name", where "unit_name" is optional
    f_relative_to = field_values.get("relative_to")
    relative_to_interface: Optional[Factor] = None
    if f_relative_to:
        try:
            ast = parser_field_parsers.string_to_ast(parser_field_parsers.factor_unit, f_relative_to)
        except:
            raise CommandExecutionError(f"Could not parse the RelativeTo column, value {str(f_relative_to)}. " + subrow_issue_message(subrow))

        relative_to_interface_name = ast_to_string(ast["factor"])

        # rel_unit_name = ast["unparsed_unit"]
        # try:
        #     f_unit = str((ureg(f_unit) / ureg(rel_unit_name)).units)
        # except (UndefinedUnitError, AttributeError) as ex:
        #     raise CommandExecutionError(f"The final unit could not be computed, interface '{f_unit}' / "
        #                                 f"relative_to '{rel_unit_name}': {str(ex)}"+subrow_issue_message(subrow))

        # The RelativeTo interface must live in the same Processor
        relative_to_interface = first(interface.processor.factors, lambda ifc: strcmp(ifc.name, relative_to_interface_name))

        if not relative_to_interface:
            raise CommandExecutionError(f"Interface specified in 'relative_to' column "
                                        f"'{relative_to_interface_name}' has not been found." + subrow_issue_message(subrow))

    if f_value is None and relative_to_interface is not None:
        # Search for a Interface Type Conversion defined in the ScaleChangeMap command
        interface_types_transforms: List[FactorTypesRelationUnidirectionalLinearTransformObservation] = \
            find_factor_types_transform_relation(self._glb_idx, relative_to_interface.taxon, interface.taxon, processor, processor)

        # Overwrite any specified unit, it doesn't make sense without a value, i.e. it cannot be used for conversion
        f_unit = interface.taxon.unit

        if len(interface_types_transforms) == 1:
            # Exactly one conversion: use its scaled weight as the value
            f_value = interface_types_transforms[0].scaled_weight
        else:
            # Zero or ambiguous conversions: fall back to "0" and warn
            interface_types_transforms_message = "an interface type conversion doesn't exist" \
                if (len(interface_types_transforms) == 0) \
                else f"{len(interface_types_transforms)} interface type conversions exist"

            f_value = "0"
            self._add_issue(IType.WARNING, f"Field 'value' should be defined for interfaces having a "
                                           f"'RelativeTo' interface, and {interface_types_transforms_message}. "
                                           f"Using value '0'." + subrow_issue_message(subrow))

    # Create quantitative observation
    if f_value is not None:
        # Collect the NUSAP-style qualifiers and other observation metadata
        f_uncertainty = field_values.get("uncertainty")
        f_assessment = field_values.get("assessment")
        f_pedigree_matrix = field_values.get("pedigree_matrix")
        f_pedigree = field_values.get("pedigree")
        f_time = field_values.get("time")
        f_comments = field_values.get("comments")

        f_source = field_values.get("qq_source")
        # TODO: source is not being used
        source = self.get_source(f_source, subrow)

        # Find Observer
        observer: Optional[Observer] = None
        if f_source:
            observer = self._glb_idx.get_one(Observer.partial_key(f_source))
            if not observer:
                self._add_issue(IType.WARNING, f"Observer '{f_source}' has not been found." + subrow_issue_message(subrow))

        # If an observation exists then "time" is mandatory
        if not f_time:
            raise CommandExecutionError(f"Field 'time' needs to be specified for the given observation." + subrow_issue_message(subrow))

        # An interface can have multiple observations if each of them have a different [time, observer] combination
        for observation in interface.quantitative_observations:
            observer_name = observation.observer.name if observation.observer else None
            if strcmp(observation.attributes["time"], f_time) and strcmp(observer_name, f_source):
                raise CommandExecutionError(f"The interface '{interface.name}' in processor '{interface.processor.name}' already has an "
                                            f"observation with time '{f_time}' and source '{f_source}'.")

        self.check_existence_of_pedigree_matrix(f_pedigree_matrix, f_pedigree, subrow)

        # Transform text of "number_attributes" into a dictionary
        number_attributes = self.transform_text_attributes_into_dictionary(field_values.get("number_attributes"), subrow)

        o = _create_or_append_quantitative_observation(interface,
                                                       f_value, f_unit, f_uncertainty, f_assessment, f_pedigree, f_pedigree_matrix,
                                                       observer,
                                                       relative_to_interface,
                                                       f_time,
                                                       None,
                                                       f_comments,
                                                       None, number_attributes)
def parse_data_input_command(sh: Worksheet, area: AreaTupleType, processors_type: str, state=None) -> IssuesLabelContentTripleType: """ Scans the "area" of input worksheet "sh" where it is assumed a "data input" command is present. It obtains a list of observations, a list of processors, a list of observables, a list of tags All those are represented in JSON format :param sh: Input worksheet :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the command is present :param processors_type: Name for the type of processors. Also label of the command :param state: Transient state useful for checking existence of variables :return: DataInputCommand, list of issues (issue_type, message) """ some_error = False issues = [] # Define a set of observations (qualified quantities) of observables # This set can be replicated. So, ?how to refer to each replica? # Regular expression, internal name, Mandatory (True|False) known_columns = [ (r"Name|Processor[_ ]name", "processor", False), (r"Level", "level", False), (r"Parent", "parent", False), (r"FF[_ ]type", "ff_type", True), (r"Var|Variable", "factor", True), (r"Value|NUSAP\.N", "value", False), # If value is not specified, then just declare the Factor (r"Unit|NUSAP\.U", "unit", True), # If blank, a dimensionless amount is assumed (r"Relative[_ ]to", "relative_to", False), (r"Uncertainty|Spread|NUSAP\.S", "uncertainty", False), (r"Assessment|NUSAP\.A", "assessment", False), (r"Pedigree[_ ]matrix|NUSAP\.PM", "pedigree_matrix", False), (r"Pedigree|NUSAP\.P", "pedigree", False), (r"Time|Date", "time", False), (r"Geo|Geolocation", "geolocation", False), (r"Source", "source", False), (r"Comment|Comments", "comments", False) ] label = "Processors " + processors_type # First, examine columns, to know which fields are being specified # Special cases: # Open columns: the field is specified in the cell togheter with the value. 
Like "attr1=whatever", instead of a header "attr1" and in a row below, a value "whatever" # Complex values: the value has syntactic rules. Like expressions for both quantities AND qualities (like NUSAP) # References: the field refers to additional information in another worksheet. Unique names or ref holder (worksheet name) plus ref inside the worksheet, would be allowed. Also ref type can disambiguate mandatory = {t[1]: t[2] for t in known_columns} cre = { } # Column Regular Expression dictionary (K: regular expression; V: RegularExpression object) if not case_sensitive: flags = re.IGNORECASE else: flags = 0 for kc in known_columns: cre[kc[0]] = re.compile(kc[0], flags=flags) col_names = {} standard_cols = { } # Internal (standardized) column name to column index in the worksheet (freedom in the order of columns) attribute_cols = create_dictionary( ) # Not recognized columns are considered freely named categories, attributes or tags attributes = [ ] # List of attributes or tags (keys of the previous dictionary) col_allows_dataset = create_dictionary( ) # If the column allows the reference to a dataset dimension for c in range(area[2], area[3]): col_name = sh.cell(row=area[0], column=c).value if not col_name: continue col_name = col_name.replace("\n", " ") col_names[c] = col_name # Match found = False for kc in known_columns: res = cre[kc[0]].search(col_name) if res: if kc[1] in standard_cols: issues.append( (2, "Cannot repeat column name '" + col_name + "' (" + kc[0] + ") in data input command '" + processors_type + "'")) else: standard_cols[kc[1]] = c col_names[c] = kc[ 1] # Override column name with pseudo column name for standard columns if col_names[c].lower() in [ "factor", "value", "time", "geolocation" ]: col_allows_dataset[col_names[c]] = True else: col_allows_dataset[col_names[c]] = False found = True break if not found: if col_name not in attribute_cols: # TODO Check valid col_names. 
It must be a valid Variable Name attribute_cols[col_name] = c attributes.append(col_name) col_allows_dataset[col_name] = True else: issues.append( (2, "Cannot repeat column name '" + col_name + "' in data input command '" + processors_type + "'")) del cre # Check if there are mandatory columns missing # TODO There could be combinations of columns which change the character of mandatory of some columns # TODO For instance, if we are only specifying structure, Value would not be needed print("BORRAME - " + str(known_columns)) print("BORRAME 2 - " + str(standard_cols)) for kc in known_columns: # "kc[2]" is the flag indicating if the column is mandatory or not # col_map contains standard column names present in the worksheet if kc[2] and kc[1] not in standard_cols: some_error = True issues.append((3, "Column name '" + kc[0] + "' must be specified in data input command '" + processors_type + "'")) # If there are errors, do not continue if some_error: return issues, label, None processor_attribute_exclusions = create_dictionary() processor_attribute_exclusions[ "scale"] = None # Exclude these attributes when characterizing the processor processor_attributes = [ t for t in attributes if t not in processor_attribute_exclusions ] # SCAN rows lst_observations = [ ] # List of ALL observations. -- Main outcome of the parse operation -- set_pedigree_matrices = create_dictionary() # List of pedigree templates set_processors = create_dictionary() # List of processor names set_factors = create_dictionary() # List of factors set_taxa = create_dictionary( ) # Dictionary of taxa with their lists of values. 
Useful to return CODE LISTS set_referenced_datasets = create_dictionary( ) # Dictionary of datasets to be embedded into the result (it is a job of the execution part) processors_taxa = create_dictionary( ) # Correspondence "processor" -> taxa (to avoid changes in this correspondence) dataset_column_rule = parser_field_parsers.dataset_with_column values = [None] * area[3] # LOOP OVER EACH ROW for r in range(area[0] + 1, area[1]): # Scan rows (observations) # Each row can specify: the processor, the factor, the quantity and qualities about the factor in the processor # It can also specify a "flow+containment hierarchy" relation row = {} # Store parsed values of the row taxa = create_dictionary() # Store attributes or taxa of the row referenced_dataset = None # Once defined in a row, it cannot change!! # Scan the row first, looking for the dataset. The specification is allowed in certain columns: # attribute_cols and some standard_cols already_processed = create_dictionary() for c in range(area[2], area[3]): if c in col_names: value = sh.cell(row=r, column=c).value if isinstance(value, str) and value.startswith("#"): col_name = col_names[c] if col_allows_dataset[col_name]: if not referenced_dataset: try: ast = parser_field_parsers.string_to_ast( dataset_column_rule, value[1:]) if len(ast["parts"]) == 2: referenced_dataset = ast["parts"][0] # Remove the dataset variable. 
It will be stored in "_referenced_dataset" value = "#" + ast["parts"][1] else: some_error = True issues.append(( 3, "The first dataset reference of the row must contain the " "dataset variable name and the dimension name, row " + str(r))) # Mark as processed already_processed[col_name] = None except: some_error = True issues.append( (3, "Column '" + col_name + "' has an invalid dataset reference '" + value + "', in row " + str(r))) else: try: ast = parser_field_parsers.string_to_ast( simple_ident, value[1:]) # Mark as processed already_processed[col_name] = None except: some_error = True issues.append( (3, "Column '" + col_name + "' has an invalid dataset reference '" + value + "', in row " + str(r))) if col_name in standard_cols: row[col_name] = value else: taxa[col_name] = value values[c] = value # TODO If the flow type is decomposed, compose it first for c in standard_cols: if c in already_processed: continue value = values[standard_cols[c]] # != "" or not if value is None or (value is not None and value == ""): if c == "unit": value = "-" if not value: if mandatory[c]: some_error = True issues.append( (3, "Column '" + c + "' is mandatory, row " + str(r))) continue # Skip the rest of the iteration! 
# Parse the value if c in ["processor", "factor"]: # Check that it is a variable name, and allow hierarchical names parser_field_parsers.string_to_ast( parser_field_parsers.simple_h_name, value) elif c == "pedigree_matrix": parser_field_parsers.string_to_ast( parser_field_parsers.simple_ident, value) elif c == "relative_to": # Two elements, the first a hierarchical name, the second a unit name s = value.split(" ") if len(s) != 2: some_error = True issues.append(( 3, "The Relative To value has to have two parts, factor name and unit, separated by a whitespace (specified '" + value + "'), in row " + str(r))) else: try: parser_field_parsers.string_to_ast( parser_field_parsers.simple_h_name, s[0]) except: some_error = True issues.append(( 3, "The name specified for the relative to factor '" + s[0] + "' is not valid, in row " + str(r))) # It must be a recognized unit. Check with Pint try: ureg(s[1]) ureg.parse_unit_name(s[1], case_sensitive) except UndefinedUnitError: some_error = True issues.append(( 3, "The unit name '" + s[1] + "' is not registered in the units processing package, in row " + str(r))) elif c == "level": # A valid level name try: parser_field_parsers.string_to_ast( parser_field_parsers.level_name, value) except: some_error = True issues.append((3, "The level '" + value + "' syntax is not valid, in row " + str(r))) elif c == "parent": # Check that value is a valid parent name. It can be either a list of tags OR # a processor name, something defining a single processor try: parser_field_parsers.string_to_ast( parser_field_parsers.simple_h_name, value) except: try: parser_field_parsers.string_to_ast( parser_field_parsers.named_parameters_list, value) except: some_error = True issues.append((3, "Could not parse '" + value + "' as 'parent' in row " + str(r))) elif c == "ff_type": # The type of flow/fund must be one of a set of possible values. 
DEFINE THE LIST if value.lower() not in allowed_ff_types: some_error = True issues.append( (3, "ff_type must be one of :" + ', '.join(allowed_ff_types) + ", in row " + str(r))) elif c == "value": if not isinstance(value, str): value = str(value) # Expression allowed. Check syntax only. It can refer to parameters. ast = parser_field_parsers.string_to_ast( parser_field_parsers.expression, value) # TODO Check existence of used variables # TODO basic_elements_parser.ast_evaluator(ast, state, None, issues, "static") elif c == "unit": # It must be a recognized unit. Check with Pint try: value = value.replace("€", "Euro").replace("$", "Dollar") if value == "-": value = "" # Dimensionless ureg(value) ureg.parse_unit_name(value, case_sensitive) except: some_error = True issues.append(( 3, "The unit name '" + value + "' is not registered in the units processing package, in row " + str(r))) elif c == "uncertainty": # TODO It must be a valid uncertainty specifier pass elif c == "assessment": # See page 135 of Funtowicz S., Ravetz J., "Uncertainty and Quality in Science for Policy" # "c" is "cognitive" assessment, "p" is pragmatic assessment. allowed = [ "nil", "low", "medium", "high", "total", "nil_c", "low_c", "medium_c", "high_c", "total_c", "nil_p", "low_p", "medium_p", "high_p", "total_p" ] if value and value.lower() not in allowed: issues.append((3, "Assessment must be empty or one of: " + ", ".join(allowed))) elif c == "pedigree": # A valid pedigree specification is just an integer try: int(value) except: issues.append((3, "The pedigree specification '" + value + "' must be an integer")) elif c == "time": # A valid time specification. 
Possibilities: Year, Month-Year / Year-Month, Time span (two dates) if not isinstance(value, str): value = str(value) ast = parser_field_parsers.string_to_ast( parser_field_parsers.time_expression, value) elif c == "geolocation": # A reference to a geolocation try: parser_field_parsers.string_to_ast( parser_field_parsers.reference, value) except: some_error = True issues.append((3, "The geolocation must be a reference")) elif c == "source": # Who or what provided the information. It can be formal or informal. Formal can be references (but evaluated later) pass elif c == "comments": # Free text pass # Store the parsed value row[c] = value for c in attribute_cols: if c in already_processed: continue value = values[attribute_cols[c]] # != "" or not if not value: taxa[c] = None continue # Skip the rest of the iteration! # TODO Check value. Valid identifier, no whitespace # Validate "value", it has to be a simple ID try: if not isinstance(value, str): value = str(value) parser_field_parsers.simple_ident.parseString(value, parseAll=True) except: value = None some_error = True issues.append(( 3, "The value in column '" + c + "' has to be a simple identifier: start with letter, then letters, numbers and '_', no whitespace, in row " + str(r))) taxa[c] = value # Disable the registration of taxa. If a Dataset reference is used, there is no way to register # taxa at parse time (the dataset is still not obtained). 
Leave it for the execution if c not in set_taxa: set_taxa[c] = create_dictionary() if value is not None: set_taxa[c][value] = None # Now that individual columns have been parsed, do other things if referenced_dataset: row["_referenced_dataset"] = referenced_dataset # If "processor" not specified, concatenate taxa columns in order to generate an automatic name # (excluding the processor type) p_taxa = taxa.copy() for k in processor_attribute_exclusions: if k in p_taxa: del p_taxa[k] if "processor" not in row: row["processor"] = "_".join( [str(taxa[t]) for t in processor_attributes] ) # TODO Which order? (the current is "order of appearance"; maybe "alphabetical order" would be better option) # Add as "taxa" the processor type (which is an optional input parameter to this function) if processors_type: taxa["_processors_type"] = processors_type # Store taxa (attributes and taxa) row["taxa"] = taxa # Store taxa if the processor still does not have it if row["processor"] not in processors_taxa: processors_taxa[row[ "processor"]] = p_taxa # "::".join([taxa[t] for t in lst_taxa_cols]) else: # Taxa should be the same for each "processor". 
Error if different t = processors_taxa[row["processor"]] if t != p_taxa: issues.append( (3, "The processor '" + row["processor"] + "' has different taxa assigned, in row " + str(r))) # Register new processor names, pedigree templates, and variable names if "processor" in row: set_processors[row["processor"]] = None if "pedigree_matrix" in row: set_pedigree_matrices[row["pedigree_matrix"]] = None if "factor" in row: set_factors[row["factor"]] = None if referenced_dataset: set_referenced_datasets[referenced_dataset] = None lst_observations.append(row) content = { "factor_observations": lst_observations, "processor_attributes": processor_attributes, "processors": [k for k in set_processors], "pedigree_matrices": [k for k in set_pedigree_matrices], "factors": [k for k in set_factors], "referenced_datasets": [ds for ds in set_referenced_datasets], "code_lists": {k: [k2 for k2 in set_taxa[k]] for k in set_taxa} } return issues, label, content
def _process_row(self, fields: Dict[str, Any], subrow=None) -> None:
    """
    Process one row of a relationships command: resolve source/target processors
    (possibly expanding ".." wildcard processor specifications) and create the
    corresponding relation observation(s) in the global registry.

    :param fields: parsed field values of the row (note: parts of the body read
                   "fields" and other parts read "self._fields" — presumably they
                   hold the same row; TODO confirm)
    :param subrow: sub-row index, forwarded to check helpers for issue reporting
    """
    def process_relation(relation_class):
        # Create one relation (processor-processor or interface-interface) for the
        # currently selected source/target processor fields.
        source_processor = self._get_processor_from_field(
            "source_processor")
        target_processor = self._get_processor_from_field(
            "target_processor")

        self._check_fields(relation_class, source_processor,
                           target_processor, subrow)

        if relation_class.is_between_processors:
            # Processor-to-processor relation: no interfaces involved
            create_relation_observations(
                self._glb_idx,
                source_processor, [(target_processor, relation_class)],
                relation_class,
                None,
                attributes=attributes)
        elif relation_class.is_between_interfaces:
            # Resolve the source interface; when "source_interface" is not specified,
            # fall back to the "target_interface" name looked up on the SOURCE processor
            # (i.e., both endpoints share the interface name)
            try:
                source_interface = self._get_interface_from_field(
                    "source_interface", source_processor) if self._fields.get(
                        "source_interface"
                    ) else self._get_interface_from_field(
                        "target_interface", source_processor)
            except CommandExecutionError as e:
                source_interface = None
                # Only "The interface..." (not found) errors are downgraded to warnings
                if not str(e).startswith("The interface"):
                    raise e
                else:
                    self._add_issue(IType.WARNING, str(e))
            # Symmetric resolution for the target interface
            try:
                target_interface = self._get_interface_from_field(
                    "target_interface", target_processor) if self._fields.get(
                        "target_interface"
                    ) else self._get_interface_from_field(
                        "source_interface", target_processor)
            except CommandExecutionError as e:
                target_interface = None
                if not str(e).startswith("The interface"):
                    raise e
                else:
                    self._add_issue(IType.WARNING, str(e))

            if not source_interface or not target_interface:
                # One endpoint missing (already warned): silently skip the relation
                return

            # A "back interface" turns the relation into a directed flow with return
            if fields["back_interface"]:
                relation_class = RelationClassType.ff_directed_flow_back

            if relation_class == RelationClassType.ff_directed_flow_back:
                back_interface = self._get_interface_from_field(
                    "back_interface", source_processor)
                self._check_flow_back_interface_types(
                    source_interface, target_interface, back_interface)
                attributes.update(dict(back_interface=back_interface))

            if relation_class.is_flow:
                self._check_flow_orientation(
                    source_processor,
                    target_processor,
                    source_interface,
                    target_interface,
                    is_direct_flow=(
                        relation_class == RelationClassType.ff_directed_flow))

            if source_interface.taxon != target_interface.taxon:
                # Interfaces of different types: a scale-change transform is needed
                interface_types_transforms = find_factor_types_transform_relation(
                    self._glb_idx, source_interface.taxon,
                    target_interface.taxon, source_processor, target_processor)
                # ChangeOfTypeScale
                if self._fields.get("change_type_scale"):
                    # An ad-hoc scale change for this source-target pair, registered
                    # with the default (anonymous) observer
                    o = FactorTypesRelationUnidirectionalLinearTransformObservation.create_and_append(
                        source_interface.taxon,
                        target_interface.taxon,
                        self._fields.get("change_type_scale"),
                        source_interface.processor,
                        target_interface.
                        processor,  # AdHoc source-target Context
                        None,
                        None,  # No unit conversion
                        find_or_create_observer(
                            Observer.no_observer_specified, self._glb_idx))
                    self._glb_idx.put(o.key(), o)
                    if len(interface_types_transforms) > 0:
                        self._add_issue(
                            IType.WARNING,
                            f"Preexisting matching ScaleChangeMap entry found. Overriding with "
                            f"{self._fields.get('change_type_scale')}")

                interface_types_transform = self._get_interface_types_transform(
                    source_interface.taxon, source_processor,
                    target_interface.taxon, target_processor, subrow)
                attributes.update(
                    dict(scale_change_weight=interface_types_transform.
                         scaled_weight))

            create_relation_observations(
                self._glb_idx,
                source_interface,
                [(target_interface, relation_class, fields["flow_weight"])
                 ],
                relation_class,
                None,
                attributes=attributes)

    # Lazily build the name -> Processor dictionary used for wildcard expansion
    if not self._all_processors:
        self._all_processors = get_processor_names_to_processors_dictionary(
            self._glb_idx)

    # source_cardinality = fields["source_cardinality"]
    # target_cardinality = fields["target_cardinality"]

    source_processors = self._fields["source_processor"]
    target_processors = self._fields["target_processor"]
    attributes = self._get_attributes_from_field("attributes")

    try:
        # Get relation class type
        relation_class = RelationClassType.from_str(
            fields["relation_type"])
    except NotImplementedError as e:
        raise CommandExecutionError(str(e))

    if ".." in source_processors or ".." in target_processors:
        # ".." marks a wildcard processor specification: expand each side to the
        # matching processor names and create one relation per (source, target) pair
        if ".." in source_processors:
            source_processor_names = obtain_matching_processors(
                string_to_ast(processor_names,
                              self._fields["source_processor"]),
                self._all_processors)
        else:
            source_processor_names = [source_processors]
        if ".." in target_processors:
            target_processor_names = obtain_matching_processors(
                string_to_ast(processor_names,
                              self._fields["target_processor"]),
                self._all_processors)
        else:
            target_processor_names = [target_processors]
        for s in source_processor_names:
            for t in target_processor_names:
                # NOTE(review): self._fields is mutated and not restored afterwards —
                # presumably rows are independent; confirm if _fields is reused later
                self._fields["source_processor"] = s
                self._fields["target_processor"] = t
                process_relation(relation_class)
    else:
        process_relation(relation_class)
def execute(self, state: "State"):
    """
    Process each of the specified relations, creating the endpoints if they do not exist already

    Expected "structure" entries:
    {"origin": <processor or factor>,
     ["source": <observer>,]
     ["default_relation": <relation type>,]
     "destinations": ["<weight (optional)> <relation (optional)> <destination (mandatory)>", ...]
    }

    :param state: case study State, providing the registry of objects
    :return: (issues, None)
    """
    issues = []
    glb_idx, _, _, _, _ = get_case_study_registry_objects(state)

    def qualified_name(part):
        # Build "<ns>::a.b.c" from a parsed hierarchical-name part
        return ((part["ns"] + "::") if "ns" in part and part["ns"] else '') + '.'.join(part["parts"])

    # Process each record: origin processor[+factor] -> relation (weight) -> destination processor[+factor]
    for i, o in enumerate(self._content["structure"]):
        origin_name = o["origin"]
        source = o.get("source")
        default_relation = o.get("default_relation")
        destinations = []
        for r in o["destinations"]:
            # FIX: reset per destination; previously a failed parse reused the previous
            # iteration's AST (or raised NameError on the very first iteration)
            result = None
            try:
                # Plain factor name first...
                result = parser_field_parsers.string_to_ast(
                    parser_field_parsers.factor_name, r)
            except:
                try:
                    # ...then full "weight relation destination" expression
                    result = parser_field_parsers.string_to_ast(
                        parser_field_parsers.relation_expression, r)
                except:
                    traceback.print_exc()
                    issues.append((
                        3, "The specification of destination, '" + r +
                        "', is not valid, in element " + str(r) +
                        ". It is a sequence of weight (optional) relation (optional) destination element (mandatory)"
                    ))
                    continue  # Skip this destination, keep processing the rest
            if result:
                # "pf_name" ASTs carry the parts directly; expressions nest them under "name"
                if result["type"] == "pf_name":
                    base = result
                else:
                    base = result["name"]
                destination_name = qualified_name(base["processor"])
                if "factor" in base and base["factor"]:
                    destination_name += ':' + qualified_name(base["factor"])
                rel_type = result["relation_type"] \
                    if "relation_type" in result and result["relation_type"] else None
                weight = ast_to_string(result["weight"]) \
                    if "weight" in result and result["weight"] else None  # For flow relations
                if rel_type and weight:
                    t = (destination_name, rel_type, weight)
                elif rel_type:
                    t = (destination_name, rel_type)
                elif weight:
                    # FIX: a weight without an explicit relation previously left "t"
                    # unbound (NameError). Pass None so the default relation applies.
                    t = (destination_name, None, weight)
                else:
                    # Force a tuple (create_relation_observations expects that)
                    t = (destination_name,)
                destinations.append(t)
        create_relation_observations(glb_idx, origin_name, destinations,
                                     default_relation, source)
    return issues, None
def get_interfaces(glb_idx: PartialRetrievalDictionary) -> pd.DataFrame:
    """
    Elaborate a DataFrame of all interfaces (and their quantitative observations),
    visiting processors and interfaces in dependency (topological) order.

    :param glb_idx: registry of model objects
    :return: DataFrame with one row per observation (or one empty-valued row per
             interface without observations), 21 columns
    """
    # Used to examine "value" as expression, and find variables that are interface names vs parameter names
    params = create_dictionary(
        data={p.name: None for p in glb_idx.get(Parameter.partial_key())})
    s = State()
    procs = glb_idx.get(Processor.partial_key())

    # Build the processor DAG (child -> set of parent idents) for topological sorting
    proc_deps = {}
    for proc in procs:
        parent_relations = glb_idx.get(
            ProcessorsRelationPartOfObservation.partial_key(child=proc))
        proc_deps[proc.ident] = {rel.parent_processor.ident for rel in parent_relations}

    header = [
        "Processor", "InterfaceType", "Interface", "Sphere", "RoegenType",
        "Orientation", "OppositeSubsystemType", "GeolocationRef",
        "GeolocationCode", "InterfaceAttributes", "Value", "Unit",
        "RelativeTo", "Uncertainty", "Assessment", "PedigreeMatrix",
        "Pedigree", "Time", "Source", "NumberAttributes", "Comments"
    ]
    lst = [header]

    # Iterate processors in dependency order
    for ident in list(toposort.toposort_flatten(proc_deps)):
        p = glb_idx.get(Processor.partial_key(ident=ident))[0]
        ifaces = glb_idx.get((Factor.partial_key(processor=p)))
        iface_names = create_dictionary(
            data={iface.name: iface for iface in ifaces})

        # Elaborate a DAG of the processor's interfaces, induced by observations:
        # an observation relative to another factor, or referencing other interface
        # names inside its value expression, creates a dependency
        iface_deps = {}
        for iface in ifaces:
            deps = iface_deps.setdefault(iface.ident, set())
            for obs in iface.quantitative_observations:
                if obs.relative_factor:
                    deps.add(obs.relative_factor.ident)
                # Consider obs.value and non linear dependencies
                if isinstance(obs.value, str):
                    ast = string_to_ast(expression_with_parameters, obs.value)
                    evaluation_issues = []
                    _, unresolved_vars = ast_evaluator(
                        exp=ast, state=s, obj=None,
                        issue_lst=evaluation_issues)
                    for unresolved in unresolved_vars:
                        # Unresolved names that are not parameters must be interfaces
                        if unresolved not in params:
                            deps.add(iface_names[unresolved].ident)

        for ident2 in list(toposort.toposort_flatten(iface_deps)):
            iface = glb_idx.get(Factor.partial_key(ident=ident2))[0]
            lst1 = [
                iface.processor.name, iface.taxon.name, iface.name,
                iface.sphere, iface.roegen_type.name, iface.orientation,
                iface.opposite_processor_type, "", "", ""
            ]
            observations = iface.quantitative_observations
            if len(observations) > 0:
                for obs in observations:
                    lst2 = [
                        obs.value,
                        obs.attributes.get("unit", ""),
                        obs.relative_factor.name if obs.relative_factor else "",
                        obs.attributes.get("spread", ""),
                        obs.attributes.get("assessment", ""),
                        obs.attributes.get("pedigree_template", ""),
                        obs.attributes.get("pedigree", ""),
                        obs.attributes.get("time", ""),
                        obs.observer.name if obs.observer else "", "",
                        obs.attributes.get("comments", "")
                    ]
                    lst.append(lst1 + lst2)
            else:
                # FIX: was 10 empty cells, one short of the 11 observation columns,
                # which misaligned every interface row without observations
                lst.append(lst1 + [""] * 11)

    return list_to_dataframe(lst)
def parse_command_in_worksheet(sh: Worksheet, area: AreaTupleType,
                               name: Optional[str],
                               cmd_name: str) -> IssuesLabelContentTripleType:
    """
    Parse command in general
    Generate a JSON
    Generate a list of issues

    :param sh: Worksheet to read
    :param area: Area of the worksheet
    :param name: Name of the worksheet
    :param cmd_name: Name of the command. Key to access "command_fields" variable. Also, shown in issue descriptions
    :return: issues List, None, content (JSON)
    """
    def check_expandable(v, location):
        """
        Check if curly braces match, that what is inside is syntactically correct,
        (and that the value exists)

        :param v: value possibly containing "{...}" macro expansions
        :param location: IssueLocation to attach to any issue found
        :return: set of the names found inside the braces
        """
        import re
        reg = re.compile(r"{.*?}")
        matches = reg.findall(v)
        output = set()
        if len(matches) == 0:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description="Incorrect syntax, no macro expansion found",
                    location=location))
        else:
            for m in matches:
                h_name = m[1:-1]
                try:
                    parser_field_parsers.string_to_ast(
                        arith_boolean_expression, h_name)  # simple_h_name
                    output.add(h_name)
                except:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            f"The value {m[1:-1]} is not a valid hierarchical name",
                            location=location))
        return output

    def commented_row(rn):
        # A row is commented if the value in its first column starts with "#"
        # FIX: previously read the enclosing "r" instead of the parameter "rn"
        v = sh.cell(row=rn, column=1).value
        return v is not None and str(v).startswith("#")

    issues: List[Issue] = []

    from nexinfosys.command_field_definitions import command_fields

    cols = command_fields[
        cmd_name]  # List of CommandField that will guide the parsing
    col_map, local_issues = check_columns(sh, name, area, cols, cmd_name)
    if any([i.itype == IType.ERROR for i in local_issues]):
        return local_issues, None, None
    issues.extend(local_issues)

    # The "mandatoriness" of a field may depend on values in other fields (like in RefBibliographic command fields)
    # Elaborate a list of fields having this "complex" mandatory property
    complex_mandatory_cols = [c for c in cols if isinstance(c.mandatory, str)]

    content = []  # The output JSON
    # Parse each Row
    for r in range(area[0] + 1, area[1]):
        line = {}
        # A set of variables to be expanded. If empty, it is a literal line (not expandable)
        expandable = set()
        # The line contains at least one field with a complex rule (which cannot be evaluated with a simple cast)
        is_complex = False  # FIX: renamed from "complex" (shadowed the builtin)

        # A row is commented if the value in the first column starts with "#" (a first empty column could be inserted
        # to ease this, just to signal commented rows)
        if commented_row(r):
            continue

        # Constant mandatory values
        mandatory_not_found = set([
            c.name for c in cols
            if c.mandatory and isinstance(c.mandatory, bool)
        ])

        # Each "field"
        for field_def in col_map.keys():
            field_name = field_def.name
            field_defined = False

            # Appearances of field (normally just once, there are attributes allowing more than one appearance)
            for col_name, col_idx in col_map[field_def]:
                # Read and prepare "value"
                value = sh.cell(row=r, column=col_idx).value
                if value is not None:
                    if isinstance(value, float):
                        # Render integral floats without the trailing ".0"
                        if value == int(value):
                            value = str(int(value))
                        else:
                            value = str(value)
                    elif not isinstance(value, str):
                        value = str(value)
                    value = value.strip()
                    field_defined = True
                else:
                    continue

                if "{" in value:
                    # Expandable. Do not parse now. Check: curly pairs, and that what is between is a
                    # simple_h_name and that it exists: as dataset
                    expandable.update(
                        check_expandable(
                            value,
                            IssueLocation(sheet_name=name, row=r,
                                          column=col_idx)))
                    # With many appearances, just a "Key-Value list" syntax is permitted
                    if field_def.many_appearances:
                        if field_name in line:
                            line[field_name] += ", " + col_name + "='" + value + "'"
                        else:
                            line[field_name] = col_name + "='" + value + "'"
                    else:
                        if field_name in line:
                            line[field_name] += ", " + value
                        else:
                            line[field_name] = value  # Store the value
                else:
                    if field_def.allowed_values:  # If the CommandField checks for a list of allowed values
                        allowed_values_dict: Dict[str, str] = {
                            v.lower(): v
                            for v in field_def.allowed_values
                        }
                        if value.lower(
                        ) not in allowed_values_dict:  # TODO Case insensitive CI
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Field '{col_name}' of command '{cmd_name}' has invalid category "
                                    f"'{value}'. Allowed values are: {', '.join(field_def.allowed_values)}.",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=col_idx)))
                        else:
                            # Use case from allowed values
                            line[field_name] = allowed_values_dict[
                                value.lower()]
                    else:  # Instead of a list of values, check if a syntactic rule is met by the value
                        if field_def.parser:  # Parse, just check syntax (do not store the AST)
                            try:
                                standalone_attribute_value = "@" in field_def.allowed_names[
                                    0]
                                if not standalone_attribute_value:
                                    ast = parser_field_parsers.string_to_ast(
                                        field_def.parser, value)
                                else:
                                    # "@..." attributes may fall back to an unquoted string
                                    try:
                                        ast = parser_field_parsers.string_to_ast(
                                            field_def.parser, value)
                                    except:
                                        ast = parser_field_parsers.string_to_ast(
                                            unquoted_string, value)

                                # Rules are in charge of informing if the result is expandable and if it complex
                                if "expandable" in ast and ast["expandable"]:
                                    # FIX: "col_header" was referenced before assignment here
                                    # (it was only set in the except handler below)
                                    col_header = sh.cell(row=1,
                                                         column=col_idx).value
                                    issues.append(
                                        Issue(
                                            itype=IType.ERROR,
                                            description=
                                            f"The value in field '{col_header}' of command "
                                            f"'{cmd_name}' should not be expandable. Entered: {value}",
                                            location=IssueLocation(
                                                sheet_name=name,
                                                row=r,
                                                column=col_idx)))
                                if "complex" in ast and ast["complex"]:
                                    is_complex = True

                                # With many appearances, just a "Key-Value list" syntax is permitted
                                if field_def.many_appearances:
                                    if field_name in line:
                                        line[field_name] += ", " + col_name + "='" + value + "'"
                                    else:
                                        line[field_name] = col_name + "='" + value + "'"
                                else:
                                    if field_name in line:
                                        line[field_name] += ", " + value
                                    else:
                                        line[field_name] = value  # Store the value
                            except:
                                import traceback
                                traceback.print_exc()
                                col_header = sh.cell(row=1,
                                                     column=col_idx).value
                                issues.append(
                                    Issue(
                                        itype=IType.ERROR,
                                        description=
                                        f"The value in field '{col_header}' of command "
                                        f"'{cmd_name}' is not syntactically correct. Entered: {value}",
                                        location=IssueLocation(
                                            sheet_name=name,
                                            row=r,
                                            column=col_idx)))
                        else:
                            # No parser, just store blindly the value
                            line[field_name] = value

            if field_defined and field_def.name in mandatory_not_found:
                mandatory_not_found.discard(field_def.name)

        if len(line) == 0:
            continue  # Empty line (allowed)

        # Flags to accelerate the second evaluation, during execution
        line["_row"] = r
        line["_expandable"] = list(expandable)
        line["_complex"] = is_complex

        # Append if all mandatory fields have been filled
        may_append = True
        if len(mandatory_not_found) > 0:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Mandatory columns: " +
                      ", ".join(mandatory_not_found) +
                      " have not been specified",
                      location=IssueLocation(sheet_name=name,
                                             row=r,
                                             column=None)))
            may_append = False

        # Check varying mandatory fields (fields depending on the value of other fields)
        for c in complex_mandatory_cols:
            c_name = c.name  # next(c2 for c2 in col_map if strcmp(c.name, c2.name))
            if isinstance(c.mandatory, str):
                # Evaluate the mandatory-ness rule. NOTE(review): the expression comes from
                # the command definition (not from the spreadsheet), so eval is trusted here
                mandatory = eval(c.mandatory, None, line)
                if mandatory and c_name not in line:
                    # FIX: previously "may_append" was overwritten here, which could
                    # re-enable a row already rejected by an earlier check
                    may_append = False
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Mandatory column: " + c_name +
                              " has not been specified",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))

        if may_append:
            content.append(line)

    return issues, None, {"items": content, "command_name": name}
def process_row(row):
    """
    Process a dictionary representing a row of the data input command. The dictionary can come directly from
    the worksheet or from a dataset.

    Implicitly uses "glb_idx", "issues" and "p_set" from the enclosing scope.

    :param row: dictionary with the row's fields ("ff_type", "taxa", "processor", "factor", ...)
    """
    # From "ff_type" extract: flow/fund, external/internal, incoming/outgoing
    # (roegen_type, internal, incoming)
    # FIX: the previous if-chain omitted the "*_out_fund" combinations, so those
    # ff_types crashed later with NameError; the table fills the gaps consistently
    ff_types = {
        "int_in_flow": (FlowFundRoegenType.flow, True, True),
        "int_in_fund": (FlowFundRoegenType.fund, True, True),
        "int_out_flow": (FlowFundRoegenType.flow, True, False),
        "int_out_fund": (FlowFundRoegenType.fund, True, False),
        "ext_in_flow": (FlowFundRoegenType.flow, False, True),
        "ext_in_fund": (FlowFundRoegenType.fund, False, True),
        "ext_out_flow": (FlowFundRoegenType.flow, False, False),
        "ext_out_fund": (FlowFundRoegenType.fund, False, False),
        "env_in_flow": (FlowFundRoegenType.flow, False, True),
        "env_in_fund": (FlowFundRoegenType.fund, False, True),
        "env_out_flow": (FlowFundRoegenType.flow, False, False),
        "env_out_fund": (FlowFundRoegenType.fund, False, False),
    }
    ft = row["ff_type"].lower()
    if ft not in ff_types:
        # FIX: an unknown ff_type previously left roegen_type/internal/incoming
        # unbound and raised NameError below
        issues.append((3, "Invalid ff_type '" + row["ff_type"] + "'"))
        return
    roegen_type, internal, incoming = ff_types[ft]

    # Split "taxa" attributes. "scale" corresponds to the observation
    p_attributes = row["taxa"].copy()
    if "scale" in p_attributes:
        other_attrs = create_dictionary()
        other_attrs["scale"] = p_attributes["scale"]
        del p_attributes["scale"]
    else:
        other_attrs = None

    # Check existence of PedigreeMatrix, if used
    if "pedigree_matrix" in row:
        pm = glb_idx.get(
            PedigreeMatrix.partial_key(name=row["pedigree_matrix"]))
        if len(pm) != 1:
            issues.append((3, "Could not find Pedigree Matrix '" +
                           row["pedigree_matrix"] + "'"))
            del row["pedigree_matrix"]
        else:
            try:
                # Validation only; the decoded modes are not used here
                pm[0].get_modes_for_code(row["pedigree"])
            except:
                issues.append(
                    (3, "Could not decode Pedigree '" +
                     str(row.get("pedigree")) + "' for Pedigree Matrix '" +
                     row["pedigree_matrix"] + "'"))
                # FIX: plain "del" raised KeyError when "pedigree" was absent
                row.pop("pedigree", None)
                del row["pedigree_matrix"]
    else:
        if "pedigree" in row:
            issues.append((
                3,
                "Pedigree specified without accompanying Pedigree Matrix"
            ))
            del row["pedigree"]

    # Source: a provenance Reference if exactly one is found, else the raw string
    if "source" in row:
        # FIX: default to the raw string (previously "source" was left unbound
        # when the reference lookup returned a number of matches != 1)
        source = row["source"]
        try:
            ast = parser_field_parsers.string_to_ast(
                parser_field_parsers.reference, row["source"])
            ref_id = ast["ref_id"]
            references = glb_idx.get(Reference.partial_key(ref_id),
                                     ref_type="provenance")
            if len(references) == 1:
                source = references[0]
        except:
            pass
    else:
        source = None

    # Geolocation: a geographic Reference if exactly one is found, else the raw string
    if "geolocation" in row:
        geolocation = row["geolocation"]  # FIX: same unbound-variable fix as "source"
        try:
            ast = parser_field_parsers.string_to_ast(
                parser_field_parsers.reference, row["geolocation"])
            ref_id = ast["ref_id"]
            references = glb_idx.get(Reference.partial_key(ref_id),
                                     ref_type="geographic")
            if len(references) == 1:
                geolocation = references[0]
        except:
            pass
    else:
        geolocation = None

    # CREATE FactorType, A Type of Observable, IF it does not exist
    # AND ADD Quantitative Observation
    p, ft, f, o = create_or_append_quantitative_observation(
        glb_idx,
        factor=row["processor"] + ":" + row["factor"],
        value=row.get("value"),
        unit=row["unit"],
        observer=source,
        spread=row.get("uncertainty"),
        assessment=row.get("assessment"),
        pedigree=row.get("pedigree"),
        pedigree_template=row.get("pedigree_matrix"),
        relative_to=row.get("relative_to"),
        time=row.get("time"),
        geolocation=None,
        comments=row.get("comments"),
        tags=None,
        other_attributes=other_attrs,
        proc_aliases=None,
        proc_external=False,  # TODO
        proc_attributes=p_attributes,
        proc_location=None,
        ftype_roegen_type=roegen_type,
        ftype_attributes=None,
        fact_external=not internal,
        fact_incoming=incoming,
        fact_location=geolocation)

    # Appends codes to the pset if the processor was not member of the pset
    if p_set.append(p, glb_idx):
        p_set.append_attributes_codes(row["taxa"])
def parse_indicators_command(sh, area):
    """
    Parse an "Indicators" worksheet area into a list of indicator dictionaries.

    Recognized columns: name (mandatory), formula/expression (mandatory),
    benchmark (optional), description/label/desc (optional).

    :param sh: worksheet to read
    :param area: (first_row, last_row, first_col, last_col) tuple delimiting the area
    :return: (issues, None, content) where content is the list of indicators
    """
    some_error = False
    issues = []

    # Scan the sheet, the first column must be one of the keys
    col_names = {
        ("name", ): "name",  # Name of the indicator
        (
            "formula",
            "expression",
        ): "formula",  # Expression to compute the indicator
        ("benchmark", ):
        "benchmark",  # Once calculated, a frame to qualify the goodness of the indicator
        ("description", "label", "desc"): "description"
    }

    # Check columns
    col_map = {}
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=area[0], column=c).value
        if col_name is None:
            # FIX: empty header cells previously raised AttributeError on .lower()
            continue
        for k in col_names:
            if col_name.lower() in k:
                col_map[col_names[k]] = c
                break

    # Map key to a list of values
    content = []  # Dictionary of lists, one per metadata key
    for r in range(area[0] + 1, area[1]):
        indicator = {}
        for k in col_names.values():
            if k not in col_map:
                continue
            value = sh.cell(row=r, column=col_map[k]).value
            if not value:
                continue
            if k == "name":  # Mandatory
                # Check syntax
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.simple_ident, value)
                    indicator[k] = value
                except:
                    some_error = True
                    issues.append(
                        (3, "The name specified for the indicator, '" +
                         value + "', is not valid, in row " + str(r) +
                         ". It must be a simple identifier."))
            elif k == "formula":  # Mandatory
                # Check syntax
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.indicator_expression, value)
                    indicator[k] = value
                except:
                    some_error = True
                    issues.append(
                        (3, "The Formula specified for the indicator, '" +
                         value + "', is not valid, in row " + str(r) + "."))
            elif k == "benchmark":  # Optional. This column can appear multiple times
                # FIX: the previous validation tested membership in an EMPTY tuple
                # (always False) — dead code apparently copied from the Parameters
                # command — so benchmark values were silently discarded.
                # TODO(review): add real benchmark syntax validation
                indicator[k] = value
            elif k == "description":  # Optional
                indicator[k] = value

        # Check indicator completeness before adding it to the list of indicators
        if "name" not in indicator:
            issues.append((3, "The indicator must have a Name, row " + str(r)))
            continue
        if "formula" not in indicator:
            issues.append(
                (3, "The indicator must have a Formula, row " + str(r)))
            continue

        content.append(indicator)

    return issues, None, content