Example No. 1
    def check_expandable(v, location):
        """
        Check if curly braces match, that what is inside is syntactically correct, (and that the value exists)

        :param v:
        :return:
        """
        import re
        reg = re.compile(r"{.*?}")
        matches = reg.findall(v)
        output = set()
        if len(matches) == 0:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=f"Incorrect syntax, no macro expansion found",
                    location=location))
        else:
            for m in matches:
                h_name = m[1:-1]
                try:
                    parser_field_parsers.string_to_ast(
                        arith_boolean_expression, h_name)  # simple_h_name
                    output.add(h_name)
                except:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            f"The value {m[1:-1]} is not a valid hierarchical name",
                            location=location))
        return output
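
A minimal, self-contained sketch of the brace-matching step in the example above, keeping only the regular-expression part (the Issue reporting and the hierarchical-name parsing are left out):

import re

def find_macros(text: str) -> set:
    """Return the names found inside non-greedy {...} groups."""
    return {m[1:-1] for m in re.findall(r"{.*?}", text)}

# find_macros("flow_{level}_{year}") -> {"level", "year"}
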
def check_parameter_value(glb_idx, p, value, issues, sheet_name, row):
    retval = True
    if p.range:
        try:  # Try "numeric interval"
            ast = string_to_ast(number_interval, p.range)
            # Try to convert the value to float
            ast2 = string_to_ast(expression_with_parameters, value)
            evaluation_issues: List[Tuple[int, str]] = []
            s = State()
            value, unresolved_vars = ast_evaluator(exp=ast2, state=s, obj=None, issue_lst=evaluation_issues)
            if value is not None:
                try:
                    value = float(value)
                    left = ast["left"]
                    right = ast["right"]
                    left_number = ast["number_left"]
                    right_number = ast["number_right"]
                    if left == "[":
                        value_meets_left = value >= left_number
                    else:
                        value_meets_left = value > left_number
                    if right == "]":
                        value_meets_right = value <= right_number
                    else:
                        value_meets_right = value < right_number
                    if not value_meets_left or not value_meets_right:
                        issues.append(Issue(itype=IType.ERROR,
                                            description=f"The value {value} specified for the parameter '{p.name}' is out of the range {p.range}",
                                            location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                        retval = False
                except:
                    issues.append(Issue(itype=IType.ERROR,
                                        description=f"The parameter '{p.name}' has a non numeric value '{value}', and has been constrained with a numeric range. Please, either change the Value or the Range",
                                        location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                    retval = False
            else:
                pass  # The parameter depends on other parameters, a valid situation

        except:  # A hierarchy name
            h = glb_idx.get(Hierarchy.partial_key(p.range))
            h = h[0]
            if value not in h.codes.keys():
                issues.append(Issue(itype=IType.ERROR,
                                    description=f"The value '{value}' specified for the parameter '{p.name}' is not in the codes of the hierarchy '{p.range}': {', '.join(h.codes.keys())}",
                                    location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                retval = False

    return retval
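
A standalone sketch of the interval check performed above, assuming the range is already split into its four parts (opening bracket, lower bound, closing bracket, upper bound) instead of coming from the number_interval AST:

def value_in_range(value: float, left: str, left_number: float,
                   right: str, right_number: float) -> bool:
    """Check a value against an interval such as "[0, 1)" given its parts."""
    meets_left = value >= left_number if left == "[" else value > left_number
    meets_right = value <= right_number if right == "]" else value < right_number
    return meets_left and meets_right

# value_in_range(0.5, "[", 0.0, ")", 1.0) -> True
# value_in_range(1.0, "[", 0.0, ")", 1.0) -> False
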
Example No. 3
    def get_source(self, reference_name, subrow) -> Any:
        reference = None

        if reference_name:
            try:
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.reference, reference_name)
                ref_id = ast["ref_id"]
                references = self._glb_idx.get(
                    ProvenanceReference.partial_key(ref_id))
                if len(references) == 1:
                    reference = references[0]
                else:
                    references = self._glb_idx.get(
                        BibliographicReference.partial_key(ref_id))
                    if len(references) == 1:
                        reference = references[0]
                    else:
                        raise CommandExecutionError(
                            f"Reference '{reference_name}' not found" +
                            subrow_issue_message(subrow))
            except:
                # TODO Change when Ref* are implemented
                reference = reference_name + " (not found)"

        return reference
Example No. 4
    def _get_scale_value(self, scale: str):
        try:
            value = float(scale)
        except ValueError:
            ast = string_to_ast(expression_with_parameters, scale)

            evaluation_issues: List[Tuple[int, str]] = []
            s = State()
            value, unresolved_vars = ast_evaluator(exp=ast,
                                                   state=s,
                                                   obj=None,
                                                   issue_lst=evaluation_issues)

            if len(evaluation_issues) > 0:
                evaluation_issues_str = [i[1] for i in evaluation_issues]
                raise CommandExecutionError(
                    f"Problems evaluating scale expression '{scale}': "
                    f"{', '.join(evaluation_issues_str)}")
            elif len(unresolved_vars) > 0:
                raise CommandExecutionError(
                    f"Unresolved variables evaluating the scale expression '{scale}':"
                    f" {', '.join(unresolved_vars)}")

            elif not value:
                raise CommandExecutionError(
                    f"The scale expression '{scale}' could not be evaluated.")

        return value
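
The same "number first, expression as fallback" pattern in a self-contained form; Python's ast module plays the role of the NIS expression parser/evaluator here, restricted to +, -, * and / on numeric literals (an assumption: the real ast_evaluator also resolves parameters and reports unresolved variables):

import ast
import operator

_OPS = {ast.Add: operator.add, ast.Sub: operator.sub,
        ast.Mult: operator.mul, ast.Div: operator.truediv}

def _eval(node):
    # Recursively evaluate a restricted arithmetic expression tree
    if isinstance(node, ast.Expression):
        return _eval(node.body)
    if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
        return float(node.value)
    if isinstance(node, ast.BinOp) and type(node.op) in _OPS:
        return _OPS[type(node.op)](_eval(node.left), _eval(node.right))
    if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub):
        return -_eval(node.operand)
    raise ValueError("Unsupported expression element")

def get_scale_value(scale: str) -> float:
    try:
        return float(scale)  # Simple numeric scale
    except ValueError:
        return _eval(ast.parse(scale, mode="eval"))  # Arithmetic expression

# get_scale_value("3") -> 3.0
# get_scale_value("2*3 + 1") -> 7.0
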
def dictionary_from_key_value_list(kvl, state: State = None):
    """
    From a string containing a list of keys and values, return a dictionary
    Keys must be literals, values can be expressions, to be evaluated at a later moment

    (syntactic validity of expressions is not checked here)

    :param kvl: String containing the list of keys and values
    :raise Exception: if syntactic problems occur
    :return: A dictionary
    """
    pairs = kvl.split(",")
    d = create_dictionary()
    for p in pairs:
        k, v = p.split("=", maxsplit=1)
        if not k:
            raise Exception(
                "Each key-value pair must be separated by '=' and key has to be defined, value can be empty: " + kvl)
        else:
            try:
                k = k.strip()
                v = v.strip()
                string_to_ast(simple_ident, k)
                try:
                    # Simplest: string
                    string_to_ast(quotedString, v)
                    v = v[1:-1]
                except:
                    issues = []
                    ast = string_to_ast(expression_with_parameters, v)
                    res, unres = ast_evaluator(ast, state, None, issues)
                    if len(unres) == 0:
                        v = res

                d[k] = v
            except:
                raise Exception("Key must be a string: " + k + " in key-value list: " + kvl)
    return d
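
A stripped-down, dependency-free variant of the same key-value parsing idea (no identifier validation and no expression evaluation, which the function above delegates to string_to_ast and ast_evaluator):

def simple_key_value_dict(kvl: str) -> dict:
    """Parse "k1=v1, k2=v2" into a dict; quoted values are unquoted, everything else stays as raw text."""
    d = {}
    for pair in kvl.split(","):
        k, v = pair.split("=", maxsplit=1)
        k, v = k.strip(), v.strip()
        if not k:
            raise ValueError(f"Missing key in pair '{pair}' of list: {kvl}")
        if len(v) >= 2 and v[0] == v[-1] and v[0] in ("'", '"'):
            v = v[1:-1]  # Quoted string -> plain string
        d[k] = v  # Expressions are left unevaluated here
    return d

# simple_key_value_dict("unit='kg', factor=2*p1") -> {"unit": "kg", "factor": "2*p1"}
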
Example No. 6
def parse_line(item, fields):
    """
    Convert fields from a line to AST

    :param item: Dictionary of field name -> value for one line
    :param fields: Dictionary of field name -> field specification (providing the parser to apply)
    :return: Dictionary of field name -> AST
    """
    asts = {}
    for f, v in item.items():
        if not f.startswith("_"):
            field = fields[f]
            # Parse (success is guaranteed because of the first pass dedicated to parsing)
            asts[f] = parser_field_parsers.string_to_ast(field.parser, v)
    return asts
Example No. 7
    def get_location(self, reference_name, subrow) -> Any:
        reference = None

        if reference_name:
            try:
                # TODO Change to parser for Location (includes references, but also Codes)
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.reference, reference_name)
                ref_id = ast["ref_id"]
                references = self._glb_idx.get(
                    GeographicReference.partial_key(ref_id))
                if len(references) == 1:
                    reference = references[0]
                else:
                    raise CommandExecutionError(
                        f"Reference '{reference_name}' not found" +
                        subrow_issue_message(subrow))
            except:
                reference = reference_name

        return reference
def parse_hierarchy_command(sh: Worksheet, area: AreaTupleType, name: str,
                            n_type: str) -> IssuesLabelContentTripleType:
    """
    Analyze a "hierarchy" command expressed in a worksheet of a spreadsheet

    The resulting JSON will be:
    {
    "name": <hierarchy name>,
    "type": ("Category", "FactorType", "Processor"), (this determines if the hierarchy is "is-a" -categories or factor types- or "part-of" -processors-)
    "h": [{"name": ..., "description": ..., "expression": ..., children: []},
         ]
    }

    In a hierarchy only simple names (not hierarchic) are allowed. The full name is determined by its position in the tree
    At execution time, if the elements already exist, their location in the hierarchy is updated (and the description, if present, is added)

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param name: Name of the hierarchy
    :param n_type: Type of hierarchy node: "C" (Category), "I" (InterfaceType) or "P" (Processor)
    :return: list of issues [(issue_type, message)], command label, command content
    """
    some_error = False
    issues = []

    col_names = {
        ("expression", "formula"): "expression",
        ("code", "name"): "code",
        ("description", ): "description"
    }

    # Scan columns to prepare:
    # * "expression_column". The column that can contain an expression (it is Optional)
    # * "levels". List of Levels, formed by pairs "code, description", where "description" is optional
    expression_column = None
    levels = []
    for c in range(area[2], area[3]):  # Scan all columns
        col_name = sh.cell(row=area[0], column=c).value
        if not col_name:
            continue

        for k in col_names:
            col_name = col_name.lower()
            if col_name in k:
                if col_name == "expression":
                    expression_column = c
                elif col_name == "code":
                    levels.append(tuple([c]))
                elif col_name == "description":
                    # Description only applies if there is an active CODE. If the description for the active CODE
                    # was already specified, replace it...
                    if len(levels) > 0:
                        tmp = levels[-1]
                        levels[-1] = (tmp[0], c)  # Code, Description
                break

    # Now, scan rows.
    # Only one Level can be active at a time.
    # Current level starts in zero, and is updated in each row.
    # Level can increase by one with regard to the previous level, or freely decrease
    nodes = {
    }  # Store nodes to check expressions later. Row number is key of the dictionary, the node is the value
    nodes_stack = []
    current_level = -1
    for r in range(area[0] + 1, area[1]):
        found = False
        for level, t in enumerate(levels):
            code_column = t[0]

            value = sh.cell(row=r, column=code_column).value
            if value:
                found = True
                break
        if found:
            # Value syntax. A simple identifier
            try:
                parser_field_parsers.string_to_ast(
                    parser_field_parsers.simple_ident, value)
            except:
                issues.append(
                    (3,
                     "The name of the category must be a simple name. Row " +
                     str(r)))
            # Description
            if len(t) > 1:
                description_column = t[1]
                description = sh.cell(row=r, column=description_column).value
            else:
                description = None
            # Expression
            if expression_column:
                expression = sh.cell(row=r, column=expression_column).value
            else:
                expression = None
            # Create the hierarchy node
            n = dict(code=value,
                     description=description,
                     expression=expression,
                     children=[])
            if not n["expression"]:
                del n["expression"]
            if not n["description"]:
                del n["description"]
            # Store the node
            nodes[r] = n

            # Process hierarchical information
            add_node = True
            if level == current_level + 1:
                # New (empty) list
                nodes_stack.append([])
                current_level = level
            elif level <= current_level:
                while current_level > level:
                    lst = nodes_stack.pop(
                    )  # Take and remove last element of the stack
                    current_level -= 1
                    if current_level >= 0:
                        # From the current level, children of the last node of the list are defined in "lst"
                        nodes_stack[current_level][-1]["children"] = lst
            else:
                issues.append((
                    3,
                    "Hierarchical level must increase by one, not more. Previous level was "
                    + str(current_level) + ", current is " + str(level) +
                    ". Row " + str(r)))
                add_node = False
            # Append the new node to the current level
            if add_node:
                nodes_stack[current_level].append(n)

    # Close
    while current_level > 0:
        lst = nodes_stack.pop()  # Take and remove last element of the stack
        current_level -= 1
        if current_level >= 0:
            # From the current level, children of the last node of the list are defined in "lst"
            nodes_stack[current_level][-1]["children"] = lst
    # Check that expressions are correct and that they refer to existing codes
    # TODO Check that expressions are not circular
    codes = set([n["code"].lower() for n in nodes.values()])
    for r, n in nodes.items():
        code = n["code"]
        if "expression" in n:
            expression = n["expression"]
            ast = parser_field_parsers.string_to_ast(
                parser_field_parsers.hierarchy_expression, expression)
            for p in ast["terms"]:
                if isinstance(p, str):
                    if p.lower() not in codes:
                        issues.append(
                            (3, "The code '" + p + "' in the expression '" +
                             expression + "' (declaration of code '" + code +
                             "') was not defined. Row: " + str(r)))

    content = {"name": name, "type": n_type, "h": nodes_stack[0]}

    return issues, None, content
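
The level-tracking logic above (one active level at a time; the level may grow by one or drop freely) can be isolated into a small sketch that turns (level, code) rows into the nested structure stored under "h":

def build_hierarchy(rows):
    """rows: iterable of (level, code) pairs, levels starting at 0.
    Returns the top-level list of nodes, each node being {"code": ..., "children": [...]}."""
    nodes_stack = []  # One list of sibling nodes per open level
    current_level = -1
    for level, code in rows:
        node = {"code": code, "children": []}
        if level == current_level + 1:  # Go one level deeper
            nodes_stack.append([])
            current_level = level
        elif level <= current_level:  # Close deeper levels first
            while current_level > level:
                children = nodes_stack.pop()
                current_level -= 1
                nodes_stack[current_level][-1]["children"] = children
        else:
            raise ValueError("Hierarchical level can only increase by one")
        nodes_stack[current_level].append(node)
    while current_level > 0:  # Close any levels still open
        children = nodes_stack.pop()
        current_level -= 1
        nodes_stack[current_level][-1]["children"] = children
    return nodes_stack[0]

# build_hierarchy([(0, "A"), (1, "A1"), (1, "A2"), (0, "B")])
#   -> [{"code": "A", "children": [{"code": "A1", ...}, {"code": "A2", ...}]},
#       {"code": "B", "children": []}]
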
Example No. 9
def construct_flow_graph_2(state: State,
                           query: IQueryObjects,
                           filt: Union[str, dict],
                           format: str = "visjs"):
    """
    Prepare a graph from which conclusions about factors can be extracted

    Example:
        1) Obtain "s", the serialized state from Redis or from a test file
        2) state = deserialize_state(s)
        3) query = BasicQuery(state) # Create a Query and execute a query
        4) construct_solve_graph(state, query, None)

    :param state: State
    :param query: A IQueryObjects instance (which has been already injected the state)
    :param filt: A filter to be passed to the query instance
    :param format: VisJS, GML, ...
    :return:
    """
    include_processors = False  # For completeness (not clarity...), include processor nodes, as a way to visualize grouped factors
    will_write = True  # For debugging purposes, affects how the properties attached to nodes and edges are elaborated
    expand_factors_graph = False  # Expand transformation between FactorTypes into instances of Factors

    # Format for different node types
    stated_factor_no_observation = dict(graphics={'fill': "#999900"})  # Golden
    stated_factor_some_observation = dict(graphics={'fill':
                                                    "#ffff00"})  # Yellow
    qq_attached_to_factor = dict(graphics={
        'fill': "#eeee00",
        "type": "ellipse"
    })  # Less bright Yellow
    non_stated_factor = dict(graphics={'fill': "#999999"})
    a_processor = dict(graphics={"type": "hexagon", "color": "#aa2211"})

    # Format for different edge types
    edge_from_factor_type = dict(graphics={
        "fill": "#ff0000",
        "width": 1,
        "targetArrow": "standard"
    })
    edge_processor_to_factor = dict(graphics={
        "fill": "#ff00ff",
        "width": 3,
        "targetArrow": "standard"
    })
    edge_factors_flow = dict(graphics={
        "fill": "#000000",
        "width": 5,
        "targetArrow": "standard"
    })
    edge_factors_scale = dict(graphics={
        "fill": "#333333",
        "width": 3,
        "targetArrow": "standard"
    })
    edge_factors_relative_to = dict(graphics={
        "fill": "#00ffff",
        "width": 3,
        "targetArrow": "standard"
    })
    edge_factor_value = dict(graphics={
        "fill": "#aaaaaa",
        "width": 1,
        "targetArrow": "standard"
    })

    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)

    # Obtain the information needed to elaborate the graph
    objs = query.execute([
        Processor, Factor, FactorType,
        FactorTypesRelationUnidirectionalLinearTransformObservation,
        FactorsRelationScaleObservation, FactorsRelationDirectedFlowObservation
    ], filt)

    # 1) Graphical Representation: BOX -- BOX
    #
    # 2) Internal (not for end-users), pseudo-code:
    #
    # Processor1 <- Factor1 -> FactorType0
    # Processor2 <- Factor2 -> FactorType0
    # Processor3 <- Factor3 -> FactorType1
    # Processor3 <- Factor4 -> FactorType0
    # Factor1 <- FactorsRelationDirectedFlowObservation(0.4) -> Factor2
    # Factor1 <- FactorsRelationDirectedFlowObservation(0.6) -> Factor4
    # Factor1 <- FactorQuantitativeObservation(5.3 m²)
    # FactorType0 -> FactorTypesRelationUnidirectionalLinearTransformObservation(ctx) -> FactorType1
    # Factor4 -> w1 -> Factor3
    # Factor5 -> w2 -> Factor3
    #

    # Index quantitative observations.
    # Also, mark Factors having QQs (later this will serve to color differently these nodes)
    qqs = {}
    qq_cont = 0
    factors_with_some_observation = set()
    for o in find_quantitative_observations(glb_idx):
        # Index quantitative observations.
        if "relative_to" in o.attributes and o.attributes["relative_to"]:
            continue  # Do not index intensive quantities, because they are translated as edges in the graph
        if o.factor in qqs:
            lst = qqs[o.factor]
        else:
            lst = []
            qqs[o.factor] = lst
        lst.append(o)
        # Mark Factors having QQs (later this will serve to color differently these nodes)
        factors_with_some_observation.add(o.factor)

    # ---- MAIN GRAPH: Factors and relations between them --------------------------------------------------------------
    the_node_names_set = set()

    # --   Nodes: "Factor"s passing the filter, and QQs associated to some of the Factors
    n = []
    e = []
    f_types = {}  # Contains a list of Factors for each FactorType
    p_factors = {}  # Contains a list of Factors per Processor
    rel_to_observations = set(
    )  # Set of FactorObservation having "relative_to" property defined
    factors = create_dictionary()  # Factor_ID -> Factor
    for f in objs[Factor]:
        f_id = get_factor_id(f, prd=glb_idx)
        factors[f_id] = f  # Dictionary Factor_ID -> Factor
        # f_types
        if f.taxon in f_types:
            lst = f_types[f.taxon]
        else:
            lst = []
            f_types[f.taxon] = lst
        lst.append(f)
        # p_factors
        if f.processor in p_factors:
            lst = p_factors[f.processor]
        else:
            lst = []
            p_factors[f.processor] = lst
        lst.append(f)

        # Add Node to graph
        the_node_names_set.add(f_id)
        if will_write:
            n.append((f_id, stated_factor_some_observation
                      if f in factors_with_some_observation else
                      stated_factor_no_observation))
            if f in qqs:
                for qq in qqs[f]:
                    if not ("relative_to" in qq.attributes
                            and qq.attributes["relative_to"]):
                        # value = str(qq.value)  # str(qq_cont) + ": " + str(qq.value)
                        value_node_name = f_id + " " + str(qq.value)
                        n.append((value_node_name, qq_attached_to_factor))
                        e.append((value_node_name, f_id, {
                            "w": "",
                            "label": "",
                            **edge_factor_value
                        }))
                        qq_cont += 1
                    else:
                        rel_to_observations.add(qq)
        else:
            qqs2 = [
                qq for qq in qqs if not ("relative_to" in qq.attributes
                                         and qq.attributes["relative_to"])
            ]
            d = dict(factor=factor_to_dict(f),
                     observations=qqs[f_id] if f_id in qqs2 else [])
            n.append((f_id, d))

    # --   Edges
    # "Relative to" relation (internal to the Processor) -> Intensive to Extensive
    for o in rel_to_observations:
        if "relative_to" in o.attributes and o.attributes["relative_to"]:
            # Parse "defining_factor", it can be composed of the factor name AND the unit
            defining_factor = o.attributes["relative_to"]
            ast = parser_field_parsers.string_to_ast(
                parser_field_parsers.factor_unit, defining_factor)
            factor_type = ast_to_string(ast["factor"])
            unit_name = ast["unparsed_unit"]
            ureg(unit_name)
            f_id = get_factor_id(o.factor, prd=glb_idx)
            # Check that "f_id" exists in the nodes list (using "factors")
            factors[f_id]
            # If "defining_factor" exists in the processor, ok. If not, create it.
            # Find factor_type in the processor
            factor_name = get_processor_id(
                o.factor.processor) + ":" + factor_type
            factors[factor_name]
            e.append((factor_name, f_id, {
                "w": o.value.expression,
                "label": o.value.expression,
                **edge_factors_relative_to
            }))

    # Directed Flows between Factors
    for df in objs[FactorsRelationDirectedFlowObservation]:
        sf = get_factor_id(df.source_factor, prd=glb_idx)
        tf = get_factor_id(df.target_factor, prd=glb_idx)
        # Check that both "sf" and "tf" exist in the nodes list (using "factors")
        factors[sf]
        factors[tf]
        weight = df.weight if df.weight else "1"
        e.append((sf, tf, {"w": weight, "label": weight, **edge_factors_flow}))

    # Scale Flows between Factors
    for df in objs[FactorsRelationScaleObservation]:
        sf = get_factor_id(df.origin, prd=glb_idx)
        tf = get_factor_id(df.destination, prd=glb_idx)
        # Check that both "sf" and "tf" exist in the nodes list (using "factors")
        factors[sf]
        factors[tf]
        weight = str(df.quantity) if df.quantity else "1"
        e.append((sf, tf, {
            "w": weight,
            "label": weight,
            **edge_factors_scale
        }))

    # TODO Consider Upscale relations
    # e.append((..., ..., {"w": upscale_weight, "label": upscale_weight, **edge_factors_upscale}))

    # -- Create the graph
    factors_graph = nx.DiGraph()
    factors_graph.add_nodes_from(n)
    factors_graph.add_edges_from(e)

    # nx.write_gml(factors_graph, "/home/rnebot/IntermediateGraph.gml")

    # ---- AUXILIARY GRAPH: FACTOR TYPES AND THEIR INTERRELATIONS ----
    n = []
    e = []
    # --   Nodes: "FactorType"s passing the filter
    for ft in objs[FactorType]:
        n.append((get_factor_type_id(ft), dict(factor_type=ft)))

    # --   Edges
    # Hierarchy and expressions stated in the hierarchy
    ft_in = {}  # Because a FactorType cannot be both in a hierarchy AND defined by an expression, mark which way each one was specified, to raise an error if the other way is also used
    for ft in objs[FactorType]:
        ft_id = get_factor_type_id(ft)
        if ft.expression:
            if ft not in ft_in:
                # TODO Create one or more relations, from other FactorTypes (same Hierarchy) to this one
                # TODO The expression can only be a sum of FactorTypes (same Hierarchy)
                ft_in[ft] = "expression"
                # TODO Check that both "ft-id" and "..." exist in the nodes list (keep a temporary set)
                # weight = ...
                # e.append((ft_id, ..., {"w": weight, "label": weight, "origin": ft, "destination": ...}))

        if ft.parent:
            if ft.parent not in ft_in or (ft.parent in ft_in
                                          and ft_in[ft.parent] == "hierarchy"):
                # Create an edge from this FactorType
                ft_in[ft.parent] = "hierarchy"
                parent_ft_id = get_factor_type_id(ft.parent)
                # TODO Check that both "ft-id" and "parent_ft_id" exist in the nodes list (keep a temporary set)
                # Add the edge
                e.append((ft_id, parent_ft_id, {
                    "w": "1",
                    "origin": ft,
                    "destination": ft.parent
                }))
            else:
                raise Exception(
                    "The FactorType '" + ft_id +
                    "' has been specified by an expression, it cannot be parent."
                )
    # Linear transformations
    for f_rel in objs[
            FactorTypesRelationUnidirectionalLinearTransformObservation]:
        origin = get_factor_type_id(f_rel.origin)
        destination = get_factor_type_id(f_rel.destination)
        e.append((origin, destination, {
            "w": f_rel.weight,
            "label": f_rel.weight,
            "origin": f_rel.origin,
            "destination": f_rel.destination
        }))

    # ---- Create FACTOR TYPES graph ----

    factor_types_graph = nx.DiGraph()
    factor_types_graph.add_nodes_from(n)
    factor_types_graph.add_edges_from(e)

    # ---- EXPAND "FACTORS GRAPH" with "FACTOR TYPE" RELATIONS ----
    sg_list = []  # List of modified (augmented) subgraphs
    if expand_factors_graph:
        # The idea is: clone a FactorTypes subgraph if a Factor instances some of its member nodes
        # This cloning process can imply creating NEW Factors

        the_new_node_names_set = set()

        # Obtain weak components of the main graph. Each can be considered separately

        # for sg in nx.weakly_connected_component_subgraphs(factors_graph):  # For each subgraph
        #     print("--------------------------------")
        #     for n in sg.nodes():
        #         print(n)

        # ---- Weakly connected components of "factor_types_graph" ----
        factor_types_subgraphs = list(
            nx.weakly_connected_component_subgraphs(factor_types_graph))

        for sg in nx.weakly_connected_component_subgraphs(
                factors_graph):  # For each subgraph
            sg_list.append(sg)
            # Consider each Factor of the subgraph
            unprocessed_factors = set(sg.nodes())
            while unprocessed_factors:  # For each UNPROCESSED Factor
                tmp = unprocessed_factors.pop(
                )  # Get next unprocessed "factor name"
                if tmp not in factors:  # QQ Observations are in the graph and not in "factors". The same with Processors
                    continue
                f_ = factors[tmp]  # Obtain Factor from "factor name"
                ft_id = get_factor_type_id(
                    f_)  # Obtain FactorType name from Factor
                # Iterate through FactorTypes and check if the Factor appears
                for sg2 in factor_types_subgraphs:  # Each FactorTypes subgraph
                    if ft_id in sg2:  # If the current Factor is in the subgraph
                        if len(
                                sg2.nodes()
                        ) > 1:  # If the FactorType subgraph has at least two nodes
                            # CLONE FACTOR TYPES SUBGRAPH
                            # Nodes. Create if not present already
                            n = []
                            e = []
                            for n2, attrs in sg2.nodes().items(
                            ):  # Each node in the FactorTypes subgraph
                                ft_ = attrs["factor_type"]
                                f_id = get_factor_id(f_.processor,
                                                     ft_,
                                                     prd=glb_idx)
                                if f_id not in sg:  # If the corresponding Factor is not already in the subgraph
                                    # Create Factor, from processor and ft_ -> f_new
                                    _, _, f_new = find_or_create_observable(
                                        state, name=f_id, source="solver")
                                    factors[f_id] = f_new
                                    if f_id not in the_node_names_set:
                                        if will_write:
                                            n.append((f_id, non_stated_factor))
                                        else:
                                            d = dict(
                                                factor=factor_to_dict(f_new),
                                                observations=[])
                                            n.append((f_id, d))
                                    if f_id not in the_node_names_set:
                                        the_new_node_names_set.add(f_id)
                                    the_node_names_set.add(f_id)
                                else:
                                    unprocessed_factors.discard(f_id)
                            # Edges. Create relations between factors
                            for r2, w_ in sg2.edges().items():
                                # Find origin and destination nodes. Copy weight. Adapt weight? If it refers to a FactorType, instance it?
                                origin = get_factor_id(f_.processor,
                                                       w_["origin"],
                                                       prd=glb_idx)
                                destination = get_factor_id(f_.processor,
                                                            w_["destination"],
                                                            prd=glb_idx)
                                if origin in the_new_node_names_set or destination in the_new_node_names_set:
                                    graphics = edge_from_factor_type
                                else:
                                    graphics = {}
                                e.append((origin, destination, {
                                    "w": w_["w"],
                                    "label": w_["w"],
                                    **graphics
                                }))
                            sg.add_nodes_from(n)
                            sg.add_edges_from(e)
                            break

        # for sg in sg_list:
        #     print("--------------------------------")
        #     for n in sg.nodes():
        #         print(n)

    # Recompose the original graph
    if sg_list:
        factors_graph = nx.compose_all(sg_list)
    else:
        pass
        ##factors_graph = nx.DiGraph()

    # ----
    # Add "Processor"s just as a way to visualize grouping of factors (they do not influence relations between factors)
    # -
    if include_processors:
        n = []
        e = []
        for p in objs[Processor]:
            p_id = get_processor_id(p)
            if will_write:
                n.append((p_id, a_processor))
            else:
                n.append((p_id, processor_to_dict(p)))
            # Edges between Processors and Factors
            for f in p_factors[p]:
                f_id = get_factor_id(f, prd=glb_idx)
                e.append((p_id, f_id, edge_processor_to_factor))
        factors_graph.add_nodes_from(n)
        factors_graph.add_edges_from(e)

    #
    # for ft in objs[FactorType]:
    #     if ft.parent:
    #         # Check which Factors are instances of this FactorType
    #         if ft in f_types:
    #             for f in f_types[ft]:
    #                 # Check if the processor contains the parent Factor
    #                 processor_factors = p_factors[f.processor]
    #                 if ft.parent not in processor_factors:
    #                     factor_data = (f.processor, ft)
    #                 else:
    #                     factor_data = None
    #                 create_factor = f in qqs  # If there is some Observation
    #                 create_factor = True # Force creation
    #
    #
    #         # Consider the creation of a relation
    #         # Consider also the creation of a new Factor (a new Node for now): if the child has some observation for sure (maybe a child of the child had an observation, so it is the same)
    #         ft_id =
    #     ft_id =

    # Plot graph to file
    # import matplotlib.pyplot as plt
    # ax = plt.subplot(111)
    # ax.set_title('Soslaires Graph', fontsize=10)
    # nx.draw(factors_graph, with_labels=True)
    # plt.savefig("/home/rnebot/Graph.png", format="PNG")

    # GML File
    # nx.write_gml(factors_graph, "/home/rnebot/Graph.gml")

    ret = None
    if format == "visjs":
        # Assign IDs to nodes. Change edges "from" and "to" accordingly
        ids_map = create_dictionary()
        id_count = 0
        for node in factors_graph.nodes(data=True):
            sid = str(id_count)
            node[1]["id"] = sid
            ids_map[node[0]] = sid
            id_count += 1

        vis_nodes = []
        vis_edges = []
        for node in factors_graph.nodes(data=True):
            d = dict(id=node[1]["id"], label=node[0])
            if "shape" in node[1]:
                # circle, ellipse, database, box, diamond, dot, square, triangle, triangleDown, text, star
                d["shape"] = node[1]["shape"]
            else:
                d["shape"] = "box"
            if "color" in node[1]:
                d["color"] = node[1]["color"]
            vis_nodes.append(d)
        for edge in factors_graph.edges(data=True):
            f = ids_map[edge[0]]
            t = ids_map[edge[1]]
            d = {"from": f, "to": t, "arrows": "to"}
            data = edge[2]
            if "w" in data:
                d["label"] = data["w"]
                d["font"] = {"align": "horizontal"}

            vis_edges.append(d)
        ret = {"nodes": vis_nodes, "edges": vis_edges}
    elif format == "gml":
        ret1 = io.BytesIO()
        nx.write_gml(factors_graph, ret1)
        ret = ret1.getvalue()
        ret1.close()

    return ret

    # ########################################################################
    # NOTE: the code below is unreachable (it comes after the "return" above); it builds a small legend graph and is kept only for reference.

    # GEXF File
    # nx.write_gexf(factors_graph, "/home/rnebot/Graph.gexf")

    # Legend graph
    n = []
    e = []
    n.append(("Factor with Observation", stated_factor_some_observation))
    n.append(("Factor with No Observation", stated_factor_no_observation))
    if include_processors:
        n.append(("Processor", a_processor))
    n.append(("Factor from FactorType", non_stated_factor))
    n.append(("QQ Observation", qq_attached_to_factor))
    n.append(("QQ Intensive Observation", qq_attached_to_factor))

    e.append(("A Factor", "Another Factor", {
        "label": "Flow between Factors, attaching the weight",
        **edge_factors_flow
    }))
    e.append(("Factor #1", "Factor #2", {
        "label": "Relation from a FactorType",
        **edge_from_factor_type
    }))
    if include_processors:
        e.append(("Processor", "A Factor", {
            "label": "Link from Processor to Factor",
            **edge_processor_to_factor
        }))
    e.append(("A Factor", "Same Factor in another processor", {
        "label": "Upscale a Factor in two processors",
        **edge_factors_upscale
    }))
    e.append(("Factor with Observation", "QQ Intensive Observation", {
        "label":
        "Observation proportional to extensive value of factor same processor",
        **edge_factors_relative_to
    }))
    e.append(("QQ Observation", "A Factor", {
        "label": "A QQ Observation",
        **edge_factor_value
    }))
    factors_graph = nx.DiGraph()
    factors_graph.add_nodes_from(n)
    factors_graph.add_edges_from(e)
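
The vis.js export in the function above boils down to numbering the nodes and remapping edge endpoints; a minimal standalone version (networkx only, without the per-node graphics attributes):

import networkx as nx

def graph_to_visjs(g: nx.DiGraph) -> dict:
    """Convert a directed graph into the {"nodes": [...], "edges": [...]} lists expected by vis.js."""
    ids = {node: str(i) for i, node in enumerate(g.nodes())}  # Sequential string ids
    nodes = [{"id": ids[n], "label": str(n), "shape": "box"} for n in g.nodes()]
    edges = []
    for u, v, data in g.edges(data=True):
        d = {"from": ids[u], "to": ids[v], "arrows": "to"}
        if "w" in data:  # Weight becomes the edge label
            d["label"] = data["w"]
            d["font"] = {"align": "horizontal"}
        edges.append(d)
    return {"nodes": nodes, "edges": edges}

# g = nx.DiGraph(); g.add_edge("f1", "f2", w="0.4")
# graph_to_visjs(g) -> two nodes and one labelled edge
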
Example No. 10
def parse_pedigree_matrix_command(sh: Worksheet, area: AreaTupleType,
                                  name: str) -> IssuesLabelContentTripleType:
    """
    A pedigree matrix is formed by several columns, with a header naming a phase, and below a list of modes, normally in
    ascending qualitative order.

    Modes can be referred to later by the order number specified in the "Code" column (mandatory). The order of the columns
    also serves to sequence the codes of the matrix, from left to right.

    Columns can be accompanied by a description column, to the right

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param name: Name of the Pedigree Matrix
    :return: list of issues (issue_type, message), command label, command content
    """

    issues = []

    # Analyze columns
    phases = []  # A phase per column
    codes = None  # Column with codes
    max_len = 0  # Maximum column length
    for c in range(area[2], area[3]):
        phase_modes = []
        current_phase = None
        for r in range(area[0], area[1]):
            value = sh.cell(row=r, column=c).value
            # First row has to be defined. If not, skip to the next column
            if r == area[0] and not value:
                break
            if value is None:
                continue

            if r == area[0]:
                current_phase = value

            try:
                if current_phase.lower() != "code":
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.simple_ident, value)
                else:
                    if r != area[0]:
                        # An Integer
                        try:
                            int(value)
                        except:
                            issues.append((3, "The code must be an integer"))
            except:
                if r == area[0]:
                    issues.append((
                        3, "Phase '" + value +
                        "' of the Pedigree Matrix must be a simple identifier (a letter followed by letters or digits)"
                    ))
                else:
                    issues.append((
                        3,
                        "A mode (" + value + ") in phase '" + current_phase +
                        "' of the Pedigree Matrix must be a simple identifier (a letter followed by letters or digits)"
                    ))

            # Append mode to the current phase
            phase_modes.append(dict(mode=value, description=""))

        # Check: at least one mode besides the phase header
        if len(phase_modes) < 2:
            issues.append((3, "Phase '" + current_phase +
                           "' should have at least one mode"))

        # Check: no repetitions
        if len(phase_modes) != len(set([mode["mode"]
                                        for mode in phase_modes])):
            if current_phase.lower() != "code":
                issues.append(
                    (3, "There is at least a repeated mode in phase '" +
                     current_phase + "'"))
            else:
                issues.append(
                    (3,
                     "There is at least a repeated code in the list of codes"))

        # Update max column length
        if len(phase_modes) > max_len:
            max_len = len(phase_modes)

        if current_phase.lower() != "code":
            phases.append(phase_modes)
        else:
            codes = phase_modes[1:]

    # If no Code column was found, generate default codes (descending integers)
    if not codes:
        codes = [str(i) for i in range(max_len - 2, -1, -1)]

    return issues, None, dict(name=name, codes=codes, phases=phases)
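
Assuming the matrix has already been read into memory as one list of cell values per column (header first), the column scan above reduces to this sketch:

def split_pedigree_columns(columns):
    """columns: list of lists, each [phase_name, mode1, mode2, ...].
    The "Code" column (if any) becomes the codes; every other column becomes a phase."""
    phases, codes, max_len = [], None, 0
    for col in columns:
        header, modes = col[0], [dict(mode=v, description="") for v in col[1:]]
        max_len = max(max_len, len(col))
        if str(header).lower() == "code":
            codes = modes
        else:
            phases.append(modes)
    if not codes:  # Default codes: descending integers, as in the command above
        codes = [str(i) for i in range(max_len - 2, -1, -1)]
    return phases, codes

# split_pedigree_columns([["Reliability", "low", "medium", "high"],
#                         ["Code", "0", "1", "2"]])
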
Example No. 11
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name,
                              state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:    Input worksheet
    :param area:  Area of the input worksheet to be analysed
    :param name:  Name of the command (used as sheet name in reported issues)
    :param state: Current state, from which the case study registry objects are obtained
    :return:      The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition

    if dataset_name is None:
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=
                f"The name of the input dataset must be specified under column 'InputDataset'. Skipping {name} command",
                location=IssueLocation(sheet_name=name, row=None,
                                       column=None)))
        return issues, None, None

    # Obtain the source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(data={
                k: None
                for k in dims[d].code_list.keys()
            })  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            tmp = create_dictionary(
                data={
                    to["d"]: None
                    for o in mappings[m].map for to in o["to"] if to["d"]
                })
            cl[mappings[m].
               destination] = tmp  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.

    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian

    out_dims = []

    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    filter_ = {
    }  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions",
                                        "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The dimension specified for output, '"
                            + d +
                            "' is neither a dataset dimension nor a mapped dimension. ["
                            + ', '.join([d2 for d2 in cl]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures",
                                          "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The specified measure, '" + m +
                            "' is not a measure available in the dataset. [" +
                            ', '.join([str(m2) for m2 in meas]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in [
                "resultmeasuresaggregation", "resultmeasuresaggregator",
                "aggregation"
        ]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue

                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The specified aggregation function, '"
                            + f +
                            "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in [
                "resultmeasurename", "resultmeasuresnames", "resultmeasuresas",
                "measuresas"
        ]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The code '" + cd +
                            "' is not present in the codes declared for dimension '"
                            + col_name + "'. Please, check them.",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "starttime", "endperiod", "endtime"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                if col_name.lower() == "starttime":
                    col_name = "StartPeriod"
                elif col_name.lower() == "endtime":
                    col_name = "EndPeriod"
                filter_[col_name] = lst[
                    0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in [
                "outputdatasetname", "outputdataset", "result_name",
                "result name", "resultname"
        ]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Column '" + col_name +
                              "' has an invalid dataset name '" + result_name +
                              "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=2,
                                                     column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(
            Issue(itype=IType.WARNING,
                  description=
                  "No aggregation function specified. Assuming 'average'",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=
                    "The aggregation function must be defined in the first row",
                    location=IssueLocation(sheet_name=name,
                                           row=1,
                                           column=aggregations_column)))

    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Uniform rows, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if measure and not agg_func or not measure and agg_func:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=
                    "Each measure must be associated with an aggregation function",
                    location=IssueLocation(sheet_name=name,
                                           row=r,
                                           column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [
        v["measure_as"] for v in out_measures.values() if v["measure_as"]
    ]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="At least one measure should be specified",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=measure_names_column)))

    # measures != agg_funcs && len(agg_funcs) == 1 --> OK
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=
                "There must be one aggregation function (used for all measures) or one aggregation per measure",
                location=IssueLocation(sheet_name=name,
                                       row=1,
                                       column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            Issue(itype=IType.WARNING,
                  description="No result name specified. Assuming '" +
                  result_name + "'",
                  location=IssueLocation(sheet_name=name, row=2,
                                         column=c + 1)))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": available_at_datetime,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
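
# Illustrative outcome (hypothetical worksheet values, not part of the original source) of the
# normalisation performed above: with two measures "flow" and "stock" and a single aggregation
# function "sum" in the first row, every row of out_measures is completed as
#   {"measure": "flow",  "agg_func": "sum", "measure_as": "flow_sum"}
#   {"measure": "stock", "agg_func": "sum", "measure_as": "stock_sum"}
# i.e. either one aggregation function is given (and applied to all measures) or one per measure.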
    def execute(self, state: "State"):
        def process_line(item):
            # Read variables
            dsd_dataset_name = item.get("dataset_name", None)
            dsd_dataset_data_location = item.get("dataset_data_location", None)
            dsd_concept_type = item.get("concept_type", None)
            dsd_concept_name = item.get("concept_name", None)
            dsd_concept_data_type = item.get("concept_data_type", None)
            dsd_concept_domain = item.get("concept_domain", None)
            dsd_concept_description = item.get("concept_description", None)
            dsd_attributes = item.get("concept_attributes", None)
            if dsd_attributes:
                try:
                    attributes = dictionary_from_key_value_list(
                        dsd_attributes, glb_idx)
                except Exception as e:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=str(e),
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
            else:
                attributes = {}

            if dsd_dataset_name in ds_names:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The dataset '" + dsd_dataset_name +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            # Internal dataset definitions cache
            ds = current_ds.get(dsd_dataset_name, None)
            if True:  # Statistical dataset format
                if not ds:
                    ds = Dataset()
                    ds.code = dsd_dataset_name  # Name
                    ds.database = None
                    ds.attributes = {}
                    current_ds[dsd_dataset_name] = ds
                if not dsd_concept_type:
                    if ds.attributes.get("_location"):
                        issues.append(
                            Issue(
                                itype=IType.WARNING,
                                description=
                                f"Location of data for dataset {ds.code} previously declared. "
                                f"Former: {attributes.get('_location')}, "
                                f"Current: {dsd_dataset_data_location}",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                        attributes = ds.attributes
                    else:
                        attributes["_dataset_first_row"] = r
                    attributes[
                        "_location"] = dsd_dataset_data_location  # Location
                    ds.description = dsd_concept_description
                    ds.attributes = attributes  # Set attributes
                else:  # If concept_type is defined => add a concept
                    # Check if the concept name already appears --> Error
                    for d1 in ds.dimensions:
                        if strcmp(d1.code, dsd_concept_name):
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Concept {dsd_concept_name} already declared for dataset {ds.code}",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            break

                    d = Dimension()
                    d.dataset = ds
                    d.description = dsd_concept_description
                    d.code = dsd_concept_name
                    d.is_measure = dsd_concept_type.lower() != "dimension"
                    d.is_time = (not d.is_measure and
                                 dsd_concept_data_type.lower() == "time")
                    if dsd_concept_type.lower() == "attribute":
                        attributes["_attribute"] = True
                    else:
                        attributes["_attribute"] = False
                    if dsd_concept_data_type.lower() == "category":
                        # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or "hierarchy_categories_command.py", no insertion is made)
                        # h = hierarchies.get(dsd_concept_domain, None)
                        h = glb_idx.get(
                            Hierarchy.partial_key(name=dsd_concept_domain))
                        if len(h) == 0:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    "Could not find hierarchy of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        elif len(h) > 1:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    "Found more than one instance of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        else:  # len(h) == 1
                            h = h[0]
                        d.hierarchy = h
                        # Reencode the Hierarchy as a CodeList
                        cl = convert_hierarchy_to_code_list(h)
                        d.code_list = cl

                    attributes["_datatype"] = dsd_concept_data_type
                    attributes["_domain"] = dsd_concept_domain
                    d.attributes = attributes

        # -------------------------------------------------------------------------------------------------------------
        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        # List of available dataset names. The newly defined datasets must not be in this list
        ds_names = [ds.name for ds in datasets]

        # List of available Category hierarchies
        hierarchies = create_dictionary()
        for h in hh:
            hierarchies[h.name] = h

        # Datasets being defined in this Worksheet
        current_ds = create_dictionary()  # type: Dict[str, Dataset]

        # Process parsed information
        for line in self._content["items"]:
            r = line["_row"]
            # If the line contains a reference to a dataset or hierarchy, expand it
            # If not, process it directly
            is_expansion = False
            if is_expansion:
                pass
            else:
                process_line(line)

        # Any error?
        error = any_error_issue(issues)

        # Load the data for those datasets that are not local (data defined later in the same spreadsheet)
        for ds in current_ds.values():
            if "_location" not in ds.attributes:
                error = True
                issues.append(
                    Issue(itype=IType.ERROR,
                          description=
                          "Location of data not specified, for dataset '" +
                          ds.code + "'",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
            else:
                loc = ds.attributes["_location"]
                ast = parser_field_parsers.string_to_ast(url_parser, loc)
                if ast["scheme"] != "data":
                    df = load_dataset(loc)
                    if df is None:
                        error = True
                        issues.append(
                            Issue(
                                itype=IType.ERROR,
                                description=
                                f"Could not obtain data for dataset '{ds.code}' at '{loc}'",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                    else:
                        iss = prepare_dataframe_after_external_read(
                            ds, df, name)
                        issues.extend(iss)
                        # Everything ok? Store the dataframe!
                        if len(iss) == 0:
                            ds.data = df

        if not error:
            # If no error happened, add the new Datasets to the Datasets in the "global" state
            for ds in current_ds:
                r = current_ds[ds].attributes["_dataset_first_row"]
                df = current_ds[ds].data
                if df is not None:
                    # Loop over "ds" concepts.
                    # - "dimension" concepts of type "string" generate a CodeHierarchy
                    # - Check that the DataFrame contains ALL declared concepts. If not, generate issue
                    cid = create_dictionary(
                        data={col: col
                              for col in df.columns})
                    col_names = list(df.columns)
                    for c in current_ds[ds].dimensions:
                        if c.code in df.columns:
                            col_names[df.columns.get_loc(
                                cid[c.code])] = c.code  # Rename column
                            dsd_concept_data_type = c.attributes["_datatype"]
                            if dsd_concept_data_type.lower(
                            ) == "string" and not c.is_measure:  # Freely defined dimension
                                cl = df[cid[c.code]].unique().tolist()
                                c.code_list = CodeList.construct(
                                    c.code,
                                    c.code, [""],
                                    codes=[
                                        CodeImmutable(c, c, "", []) for c in cl
                                    ])
                        else:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Concept '{c.code}' not defined for '{ds}' in {loc}",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                    df.columns = col_names
                datasets[ds] = current_ds[ds]

        return issues, None
                issues.append(Issue(itype=IType.ERROR,
                                    description=f"The value '{value}' specified for the parameter '{p.name}' is not in the codes of the hierarchy '{p.range}': {', '.join(h.codes.keys())}",
                                    location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                retval = False

    return retval


if __name__ == '__main__':
    from nexinfosys.model_services import State
    from dotted.collection import DottedDict

    issues = []
    s = State()
    ex = "level =”N - 1”, farm_type =”GH”, efficiency = 0.3"
    ast = string_to_ast(key_value_list, ex)
    res, unres = ast_evaluator(ast, s, None, issues)
    s.set("Param1", 2.1)
    # s.set("Param", 0.1)
    s.set("Param2", 0.2)
    s.set("p1", 2.3)

    ej = "level='n+1', r=[Ref2019], a=5*p1, c=?p1>3 -> 'T1', p1<=2 -> 'T2', 'T3'?"
    ast = string_to_ast(key_value_list, ej)
    res, unres = ast_evaluator(ast, s, None, issues)

    examples = ["?Param1 > 3 -> 5, Param1 <=3 -> 2?",
                "(Param1 * 3 >= 0.3) AND (Param2 * 2 <= 0.345)",
                "cos(Param*3.1415)",
                "{Param} * 3 >= 0.3",
                "'Hola'",
Exemplo n.º 14
0
    def _init_and_process_row(self, row: Dict[str, Any]) -> None:
        def obtain_dictionary_with_not_expandable_fields(d):
            output = {}
            for k, v in d.items():
                if v is None or "{" not in v:
                    output[k] = v
            return output
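
        # For illustration (hypothetical field values): given {"Processor": "P1", "Value": "{ds.flow}"},
        # the helper above keeps only {"Processor": "P1"}; fields containing "{...}" are expanded later.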

        self._current_row_number = row["_row"]
        self._fields = self._get_command_fields_values(row)
        tmp_fields = self._fields
        self._check_all_mandatory_fields_have_values()
        # If expandable, do it now
        expandable = row["_expandable"]
        if expandable:
            # Extract variables
            state = State()
            issues = []
            asts = {}
            referenced_variables = create_dictionary()
            for e in expandable:
                ast = parser_field_parsers.string_to_ast(
                    arith_boolean_expression, e)
                c_name = f"{{{e}}}"
                asts[c_name] = ast
                res, vars = ast_evaluator(ast,
                                          state,
                                          None,
                                          issues,
                                          atomic_h_names=True)
                for v in vars:
                    referenced_variables[v] = None

            res = classify_variables2(referenced_variables.keys(),
                                      self._datasets, self._hierarchies,
                                      self._parameters)
            ds_list = res["datasets"]
            ds_concepts = res["ds_concepts"]
            h_list = res["hierarchies"]
            if len(ds_list) >= 1 and len(h_list) >= 1:
                self._add_issue(
                    itype=IType.ERROR,
                    description="Dataset(s): " +
                    ", ".join([d.name
                               for d in ds_list]) + ", and hierarchy(ies): " +
                    ", ".join([h.name for h in h_list]) +
                    ", have been specified. Only a single dataset is supported."
                )
                return
            elif len(ds_list) > 1:
                self._add_issue(
                    itype=IType.ERROR,
                    description="More than one dataset has been specified: " +
                    ", ".join([d.name for d in ds_list]) +
                    ", just one dataset is supported.")
                return
            elif len(h_list) > 0:
                self._add_issue(
                    itype=IType.ERROR,
                    description="One or more hierarchies have been specified: "
                    + ", ".join([h.name for h in h_list]))
                return
            if len(ds_list) == 1:  # Expand dataset
                ds = ds_list[0]
                measure_requested = False
                all_dimensions = set(
                    [c.code for c in ds.dimensions if not c.is_measure])
                requested_dimensions = set()
                requested_measures = set()
                for con in ds_concepts:
                    found = False
                    for c in ds.dimensions:
                        if strcmp(c.code, con):
                            found = True
                            if c.is_measure:
                                measure_requested = True
                                requested_measures.add(c.code)
                            else:  # Dimension
                                all_dimensions.remove(c.code)
                                requested_dimensions.add(c.code)
                    if not found:
                        self._add_issue(
                            itype=IType.ERROR,
                            description=
                            f"The concept '{{{ds.code}.{con}}}' is not in the dataset '{ds.code}'"
                        )
                        return
                ds_concepts = list(requested_measures)
                ds_concepts.extend(list(requested_dimensions))
                all_dimensions_requested = len(all_dimensions) == 0

                if measure_requested and not all_dimensions_requested:
                    self._add_issue(
                        IType.ERROR,
                        f"It is not possible to use a measure ({', '.join(requested_measures)}), if not all dimensions are used "
                        f"(cannot assume implicit aggregation). Dimensions not used: {', '.join(all_dimensions)}"
                    )
                    return
                elif not measure_requested and not all_dimensions_requested:
                    # Reduce the Dataframe to unique tuples of the specified dimensions
                    # TODO Consider whether the comparison should be case sensitive or not
                    data = ds.data[list(
                        requested_dimensions)].drop_duplicates()
                else:  # Take the dataset as-is
                    data = ds.data

                # Remove Index, and do it NOT-INPLACE
                data = data.reset_index()

                # Drop rows with empty dimension value
                import numpy as np
                data = data.replace(r'^\s*$', np.NaN, regex=True)
                data.dropna(subset=requested_dimensions, inplace=True)
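
                # Illustration (hypothetical dataset): if the dataset has dimensions Region and Year plus
                # a measure Flow, and only "{ds.Region}" appears in the command row, "data" now holds one
                # row per distinct non-empty Region value, so the command row is expanded once per value.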

                const_dict = obtain_dictionary_with_not_expandable_fields(
                    self._fields)  # row?
                var_dict = set(
                    [f for f in self._fields.keys() if f not in const_dict])

                re_concepts = {}
                for c in ds_concepts:
                    c_name = f"{{{ds.code}.{c}}}"
                    if case_sensitive:
                        re_concepts[c_name] = re.compile(c_name)
                    else:
                        re_concepts[c_name] = re.compile(c_name, re.IGNORECASE)

                location = IssueLocation(sheet_name=self._command_name,
                                         row=self._current_row_number,
                                         column=None)
                already_parsed_fields = set(const_dict.keys())
                for ds_row, row2 in enumerate(
                        data.iterrows()):  # Each row in the dataset
                    # Initialize constant values (those with no "{..}" expressions)
                    row3 = const_dict.copy()
                    # Prepare state to evaluate functions
                    state = State()
                    for c in ds_concepts:
                        state.set(f"{ds.code}.{c}", str(row2[1][c]))
                    state.set(
                        "_glb_idx", self._glb_idx
                    )  # Pass PartialRetrievalDictionary to the evaluator. For functions needing it

                    # Evaluate all functions
                    expressions = {}
                    for e, ast in asts.items():
                        res, vars = ast_evaluator(ast,
                                                  state,
                                                  None,
                                                  issues,
                                                  atomic_h_names=True)
                        expressions[e] = res
                    # Expansion into var_dict
                    for f in var_dict:
                        v = self._fields[f]  # Initial value
                        for item in sorted(expressions.keys(),
                                           key=len,
                                           reverse=True):
                            v = v.replace(item, expressions[item])
                        row3[f] = v

                    # # Concepts change dictionary
                    # concepts = {}
                    # for c in ds_concepts:
                    #     concepts[f"{{{ds.code}.{c}}}"] = str(row2[1][c])
                    # # Expansion into var_dict
                    # for f in var_dict:
                    #     v = self._fields[f]  # Initial value
                    #     for item in sorted(concepts.keys(), key=len, reverse=True):
                    #         v = re_concepts[item].sub(concepts[item], v)
                    #     row3[f] = v

                    # Syntactic verification of the resulting expansion
                    processable, tmp_issues = parse_cmd_row_dict(
                        self._serialization_type, row3, already_parsed_fields,
                        location)
                    if len(tmp_issues) > 0:
                        self._issues.extend(tmp_issues)
                    # Process row
                    if processable:
                        self._fields = row3
                        self._process_row(row3, ds_row)
                        self._fields = tmp_fields
            elif len(h_list) == 1:  # Expand hierarchy
                pass
        else:
            self._process_row(self._fields)  # Process row
Exemplo n.º 15
0
def parse_cmd_row_dict(cmd_name: str, row: Dict[str, str],
                       already_parsed_fields: Set[str],
                       location: IssueLocation) -> Tuple[bool, List[Issue]]:
    """
    Parse a row (as a dictionary) from a command
    It is used after expansion of "macros"

    :param cmd_name: Name of command
    :param row: A dictionary containing the values to parse syntactically. Keys are field names, Values are field values
    :param already_parsed_fields: Set of fields already known to be syntactically valid
    :param location: IssueLocation object to use when creating Issues
    :return: A tuple: a boolean (True if the row can be used, otherwise False) and a list of Issues
    """

    issues: List[Issue] = []

    from nexinfosys.command_field_definitions import command_fields
    field_defs_dict = {f.name: f for f in command_fields[cmd_name]}
    mandatory_not_found = set([
        c.name for c in command_fields[cmd_name]
        if c.mandatory and isinstance(c.mandatory, bool)
    ])
    print(mandatory_not_found)
    complex_mandatory_cols = [
        c for c in command_fields[cmd_name] if isinstance(c.mandatory, str)
    ]
    may_append = True
    complex_row = False
    for field_name, field_value in row.items():
        field_def = field_defs_dict.get(field_name)
        if not field_def:
            raise ParseException(
                f"Field {field_name} not found for command {cmd_name}")

        if field_value is not None:
            if not isinstance(field_value, str):
                field_value = str(field_value)
            field_value = field_value.strip()
        else:
            continue

        # Parse the field
        if field_def.allowed_values:
            if field_value.lower() not in [
                    v.lower() for v in field_def.allowed_values
            ]:  # TODO Case insensitive CI
                issues.append(
                    Issue(
                        itype=IType.ERROR,
                        description=
                        f"Field '{field_name}' of command '{cmd_name}' has invalid value '{field_value}'."
                        f" Allowed values are: {', '.join(field_def.allowed_values)}.",
                        location=location))
                may_append = False
            else:
                pass  # OK
        else:  # Instead of a list of values, check if a syntactic rule is met by the value
            if field_def.parser:  # Parse, just check syntax (do not store the AST)
                try:
                    if field_name not in already_parsed_fields:
                        ast = parser_field_parsers.string_to_ast(
                            field_def.parser, field_value)
                        # Rules are in charge of informing if the result is expandable and if it complex
                        if "expandable" in ast and ast["expandable"]:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Field '{field_name}' of command '{cmd_name}' cannot be expandable again.",
                                    location=location))
                            may_append = False
                        if "complex" in ast and ast["complex"]:
                            complex_row = True
                except:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            f"The value in field '{field_name}' of command '{cmd_name}' "
                            f"is not syntactically correct. Entered: {field_value}",
                            location=location))
                    may_append = False
            else:
                pass  # Valid

        if field_def.name in mandatory_not_found:
            mandatory_not_found.discard(field_def.name)

    # MODIFY INPUT Dictionary with this new Key
    if complex_row:
        row["_complex"] = complex_row

    # Append if all mandatory fields have been filled
    if len(mandatory_not_found) > 0:
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=
                f"Mandatory columns: {', '.join(mandatory_not_found)} have not been specified",
                location=location))
        may_append = False

    # Check varying mandatory fields (fields depending on the value of other fields)
    for c in complex_mandatory_cols:
        field_def = c.name  # next(c2 for c2 in col_map if strcmp(c.name, c2.name))
        if isinstance(c.mandatory, str):
            # Evaluate
            mandatory = eval(c.mandatory, None, row)
            may_append = (mandatory and field_def in row) or (not mandatory)
            if mandatory and field_def not in row:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="Mandatory column: " + field_def +
                          " has not been specified",
                          location=location))
                may_append = False

    return may_append, issues
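
# A hedged usage sketch (not part of the original listing; the command name "interfaces" and the field
# values are hypothetical) showing how the function above is typically invoked after macro expansion:
#
#   location = IssueLocation(sheet_name="Interfaces", row=12, column=None)
#   ok, row_issues = parse_cmd_row_dict("interfaces", {"processor": "P1", "interface": "water"},
#                                       already_parsed_fields=set(), location=location)
#   if ok:
#       ...  # the expanded row is syntactically valid and can be processed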
Exemplo n.º 16
0
def validate_command(command_content_to_validate):
    """
    The input comes in a JSON field "content":
    {"command": "<command name",
     "fields": {"<field name>": "<value", ...}
    }
    :param command_content_to_validate:
    :return: A dictionary with the same fields of the input dictionary, whose values are the diagnosis, None being
            everything-ok, and a string being a message describing the problem.
    """
    def split_expansion_expressions(f, content):
        # Dataset expansion. Isolate each expression
        pieces = []
        offset = 0
        look_for = "{"
        open_brace = False
        s = None
        while offset < len(content):
            pos = content[offset:].find(look_for)
            if pos >= 0:
                if look_for == "{":
                    if pos > 0:
                        pieces.append(
                            (content[offset:offset + pos], False))  # Literal
                    look_for = "}"
                    open_brace = True
                else:
                    if pos > 0:
                        pieces.append(
                            (content[offset:offset + pos], True))  # Expansion
                    else:
                        s = f"Invalid syntax in field '{f}' with value: " + content + ". No expression between curly braces."
                        valid = False
                        break
                    look_for = "{"
                    open_brace = False
                offset += pos + 1
            else:  # Character not found
                if open_brace:
                    s = f"Invalid syntax in field '{f}' with value: " + content + ". Curly brace not closed."
                    valid = False
                    break
                else:  # Add the rest
                    pieces.append((content[offset:], False))
                    offset = len(content)

        return pieces, s
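
    # Illustration (hypothetical content): split_expansion_expressions("Processor", "p_{ds.code}_x")
    # returns ([("p_", False), ("ds.code", True), ("_x", False)], None); literal pieces are flagged
    # False and expressions found between curly braces are flagged True.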

    if "command" in command_content_to_validate:
        command = command_content_to_validate["command"]
    else:
        raise Exception("Must specify 'command'")

    if "fields" in command_content_to_validate:
        fields = command_content_to_validate["fields"]
    else:
        raise Exception("Must specify 'fields'")

    alternative_command_names = command_content_to_validate.get(
        "alternative_command_names", {})

    result = {}
    # Find command from the worksheet name ("command")
    match = None
    for cmd in commands:
        for cmd_name in cmd.allowed_names:
            if cmd_name.lower() in command.lower():
                if match:
                    if match[1] < len(cmd_name):
                        match = (cmd.name, len(cmd_name))
                else:
                    match = (cmd.name, len(cmd_name))
    if not match:
        for k, v in alternative_command_names.items():
            if k.lower() in command.lower():
                for cmd in commands:
                    for cmd_name in cmd.allowed_names:
                        if cmd_name.lower() in v.lower():
                            match = (cmd.name, 0)
                            break
                    if match:
                        break
                if match:
                    break

    # Fields in the command
    status = True
    if match:
        for f in fields:  # Validate field by field
            for f2 in command_fields[
                    match[0]]:  # Find corresponding field in the command
                if f.lower() in [f3.lower() for f3 in f2.allowed_names]:
                    fld = f2
                    break
            else:
                fld = None
            if fld:  # If found, can validate syntax
                # Validate Syntax
                content = fields[f]
                content_msg = content  # Original "content", to show in case of error
                if isinstance(content, (int, float)):
                    content = str(content)

                # Check if it is an expansion expression
                valid = True
                if "{" in content or "}" in content:
                    # Is expansion allowed in this command?
                    expansion_allowed = True
                    if expansion_allowed:
                        pieces, s = split_expansion_expressions(f, content)
                        if s is None:
                            c = ""
                            for p in pieces:
                                if p[1]:  # Expansion expression
                                    try:
                                        string_to_ast(arith_boolean_expression,
                                                      p[0])
                                        c += "expand"
                                    except:
                                        s = f"Invalid syntax in field '{f}' with value: {content}, expansion expression '{p[0]}' invalid"
                                        result[f] = s
                                        valid = False
                                        break
                                else:
                                    c += p[0]
                            if valid:
                                content = c
                        else:
                            valid = False

                if not valid:
                    result[f] = s
                    status = False
                else:
                    if fld.allowed_values:
                        if content != content_msg:  # It was an expansion expression, cannot check it now, assume it is good
                            result[f] = None
                        else:
                            # Case insensitive comparison
                            if content.lower().strip() in [
                                    f.lower().strip()
                                    for f in fld.allowed_values
                            ]:
                                result[f] = None
                            else:
                                result[
                                    f] = "'" + content + "' in field '" + f + "' must be one of: " + ", ".join(
                                        fld.allowed_values)
                                status = False
                    else:
                        try:
                            string_to_ast(fld.parser, content)
                            result[f] = None
                        except:
                            s = f"Invalid syntax in field '{f}' with value: '{content_msg}'"
                            if fld.examples:
                                s += ". Examples: " + ", ".join(fld.examples)
                            result[f] = s
                            status = False

            else:
                result[
                    f] = "Field '" + f + "' not found in command '" + command + "'. Possible field names: " + ", ".join(
                        [
                            item for f2 in command_fields[match[0]]
                            for item in f2.allowed_names
                        ])
                status = False
    else:
        for f in fields:  # Validate field by field
            result[
                f] = "Command '" + command + "' not found in the list of command names: " + ", ".join(
                    [n for c in commands for n in c.allowed_names])
        status = False

    return result, status
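
# A hedged input/output sketch (hypothetical command and field names, not from the original source):
#
#   content = {"command": "InterfaceTypes",
#              "fields": {"InterfaceType": "Water", "Sphere": "Biosphere"}}
#   result, status = validate_command(content)
#   # "result" maps each field to None (valid) or to an error message; "status" is the overall boolean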
Exemplo n.º 17
0
def parse_scale_conversion_command(sh: Worksheet, area: AreaTupleType, name: str = None) -> IssuesLabelContentTripleType:
    """
    Analyze the input area
    Obtain the numerical part
    Read a row above and a column to the left, looking for source (left col) and target (row above) factor types

    FactorTypes do not need to exist previously, they can be created

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :return: list of issues (issue_type, message), command label, command content
    """

    def get_subrow(r, c1, c2):
        lst = []
        # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell
        previous = None
        for c in range(c1, c2):
            v = sh.cell(row=r, column=c).value
            if not v:
                if previous:
                    lst.append(previous)
                else:
                    lst.append("")
            else:
                previous = v
                lst.append(v)

        return lst

    def get_subcolumn(c, r1, r2):
        lst = []
        # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell
        # !!! This may not be correct at all times: when a cell is intentionally left blank
        # To solve this, use "sh.merged_cell_ranges" to check if the current cell (r, c) is inside a range
        previous = None
        for r in range(r1, r2):
            v = sh.cell(row=r, column=c).value
            if not v:
                if previous:
                    lst.append(previous)
                else:
                    lst.append("")
            else:
                previous = v
                lst.append(v)
        return lst
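
    # A hedged sketch of the merged-range check suggested in the comment above, assuming the openpyxl
    # "merged_cells.ranges" API is available in the version used (not part of the original source):
    # def cell_in_merged_range(r, c):
    #     return any(rng.min_row <= r <= rng.max_row and rng.min_col <= c <= rng.max_col
    #                for rng in sh.merged_cells.ranges)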

    # ---------------------------------------------

    some_error = False
    issues = []

    # Detect the matrix defining scales
    m = binary_mask_from_worksheet(sh, True)  # "True" is to focus on cells containing numbers
    # Locate the matrix with numbers. Assume this defines the labels to consider, they will be around the matrix
    t = obtain_rectangular_submatrices(m)[0]  # Take just the first tuple: U=t[0], D=t[1], L=t[2], R=t[3]
    t = (t[0]+1, t[1]+1, t[2]+1, t[3]+1)  # The previous calculation is done using Numpy, so it is Zero based. Correct this

    # Obtain the factor type names in the subrow on top of the matrix
    subrow = get_subrow(t[0]-1, t[2], t[3])
    # Obtain the factor type names in the subcolumn to the left of the matrix
    subcol = get_subcolumn(t[2]-1, t[0], t[1])

    # Check that we have valid factor type names
    for ft in subrow+subcol:
        try:
            parser_field_parsers.string_to_ast(parser_field_parsers.simple_h_name, ft)
        except:
            some_error = True
            issues.append((3, "'"+ft+"' is not a valid Factor Type name"))
    if some_error:
        return issues, None, None

    # Scan the matrix, creating scale records
    scales = []
    for i, r in enumerate(range(t[0], t[1])):
        for j, c in enumerate(range(t[2], t[3])):
            v = sh.cell(row=r, column=c).value
            if v:
                if not isinstance(v, str):
                    v = str(v)
                # Origin factor
                origin = subcol[i]
                # Destination factor
                destination = subrow[j]
                if strcmp(origin, destination):
                    issues.append((3, "A change of scale to the same factor type ("+origin+") is not allowed"))
                else:
                    try:
                        parser_field_parsers.string_to_ast(parser_field_parsers.expression_with_parameters, v)
                        # Add the scale
                        scales.append(dict(origin=origin, destination=destination, scale=v))
                    except:
                        issues.append((3, "The expression '"+v+"' at the intersection of factor types " + origin + " and " + destination + " is syntactically incorrect"))

    content = {"origin_factor_types": subcol,
               "destination_factor_types": subrow,
               "scales": scales
               }

    return issues, None, content
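
# Illustrative layout (hypothetical factor type names, not from the original source) of the worksheet
# area scanned by the command above:
#
#                 WaterUse    EnergyUse
#   Cropland      0.7         1.2*p1
#   Pasture       0.4
#
# The numeric block is located first; the row above it provides the target factor types, the column to
# its left provides the source factor types, and each non-empty cell becomes one scale record.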
def parse_structure_command(sh: Worksheet,
                            area: AreaTupleType,
                            name: str = None) -> IssuesLabelContentTripleType:
    """
    Analyze the input to produce a JSON object with a list of Observables and relations to other Observables

    Result:[
            {"origin": <processor or factor>,
             "description": <label describing origin>,
             "attributes": {"<attr>": "value"},
             "default_relation": <default relation type>,
             "dests": [
                {"name": <processor or factor>,
                 Optional("relation": <relation type>,)
                 "weight": <expression resulting in a numeric value>
                }
             ]
            }
           ]
    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :return: list of issues (issue_type, message), command label, command content
    """
    some_error = False
    issues = []

    # Scan the sheet, the first column must be one of the keys of "k_list", following
    # columns can contain repeating values
    col_names = {
        ("origin", "name"): "origin",
        ("relation", "default relation"): "default_relation",
        ("destination", "destinations"): "destinations",
        ("origin label", "label"): "description"
    }
    # Check columns
    col_map = collections.OrderedDict()
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=area[0], column=c).value
        if not col_name:
            continue

        for k in col_names:
            if col_name.lower() in k:
                col_map[c] = col_names[k]
                break

    # Map key to a list of values
    content = []  # Dictionary of lists, one per metadata key
    for r in range(area[0] + 1, area[1]):
        item = {}
        for c in col_map:
            value = sh.cell(row=r, column=c).value
            if not value:
                continue

            k = col_map[c]
            if k == "origin":  # Mandatory
                # Check syntax
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.factor_name, value)
                    item[k] = value
                except:
                    some_error = True
                    issues.append(
                        (3, "The name specified for the origin element, '" +
                         value + "', is not valid, in row " + str(r) +
                         ". It must be either a processor or a factor name."))
            elif k == "default_relation":  # Optional (if not specified, all destinations must specify it)
                # Check syntax
                allowed_relations = ('|', '>', '<', '<>', '><', '||')
                if value in allowed_relations:
                    item[k] = value
                else:
                    some_error = True
                    issues.append((
                        3,
                        "The Default relation type specified for the origin element, '"
                        + value + "', is not valid, in row " + str(r) +
                        ". It must be one of " + ', '.join(allowed_relations) +
                        "."))
            elif k == "destinations":  # Mandatory
                # Because the expression (weight relation p_f_name) and the simple p_f_name can collide syntactically,
                # first try the simpler expression then the complex one
                dummy = None  # Will hold the parsed AST if either parser succeeds
                try:
                    dummy = parser_field_parsers.string_to_ast(
                        parser_field_parsers.factor_name, value)
                except:
                    try:
                        dummy = parser_field_parsers.string_to_ast(
                            parser_field_parsers.relation_expression, value)
                    except:
                        traceback.print_exc()
                        some_error = True
                        issues.append((
                            3, "The specification of destination, '" + value +
                            "', is not valid, in row " + str(r) +
                            ". It is a sequence of weight (optional) relation (optional) destination element (mandatory)"
                        ))

                    # Check syntax. It can contain: a weight, a relation type, a processor or factor name.

                if dummy:
                    if k not in item:
                        lst = []
                        item[k] = lst
                    else:
                        lst = item[k]
                    lst.append(value)
            elif k == "description":  # Optional
                item[k] = value

        # Check parameter completeness before adding it to the list of parameters
        if "origin" not in item:
            issues.append(
                (3, "The element must have an Origin, row " + str(r)))
            continue
        if "destinations" not in item:
            issues.append(
                (3, "The element must have at least one Destination, row " +
                 str(r)))
            continue

        content.append(item)

    return issues, None, dict(structure=content)
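
# Illustrative parsed item (hypothetical names, not from the original source) produced by the command
# above for a row with Origin "Crop", Relation ">" and a single Destination "0.7 > Society":
#
#   {"origin": "Crop", "default_relation": ">", "destinations": ["0.7 > Society"]}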
def parse_etl_external_dataset_command(sh: Worksheet, area: AreaTupleType,
                                       dataset_name: str,
                                       state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)
    # Dataset source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)

    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)

    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = [k.lower()
                     for k in dims[d].code_list.keys()]  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True
    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            tmp = [
                to["d"] for o in mappings[m].map for to in o["to"] if to["d"]
            ]
            cl[mappings[m].destination] = set(
                tmp)  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    measures = []
    out_dims = []
    agg_funcs = []
    measures_as = []
    filter_ = {
    }  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue

        if col_name.lower().strip() in [
                "dimensions_kept", "dims", "dimensions"
        ]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for d in lst:
                if not d:
                    continue
                if d not in cl:
                    issues.append((
                        3, "The dimension specified for output, '" + d +
                        "' is neither a dataset dimension nor a mapped dimension. ["
                        + ', '.join([d2 for d2 in cl]) + "]"))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in [
                "aggregation_function", "aggfunc", "agg_func"
        ]:  # "SELECT AGGREGATORS"
            lst = obtain_column(c, area[0] + 1, area[1])
            for f in lst:
                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append((
                        3, "The specified aggregation function, '" + f +
                        "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'"
                    ))
                else:
                    agg_funcs.append(f)
        elif col_name.lower().strip() in ["measures"]:  # "SELECT"
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for m in lst:
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        (3, "The specified measure, '" + m +
                         "' is not a measure available in the dataset. [" +
                         ', '.join([m2 for m2 in meas]) + "]"))
                else:
                    measures.append(m)
        elif col_name.lower().strip() in ["measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for m in lst:
                measures_as.append(m)
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for cd in lst:
                if not cd:
                    continue
                if str(cd).lower() not in cl[col_name]:
                    issues.append((
                        3, "The code '" + cd +
                        "' is not present in the codes declared for dimension '"
                        + col_name + "'. Please, check them."))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "endperiod"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[
                    0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except:
                    issues.append((3, "Column '" + col_name +
                                   "' has an invalid dataset name '" +
                                   result_name + "'"))

    if len(measures) == 0:
        issues.append((3, "At least one measure should be specified"))

    if len(agg_funcs) == 0:
        issues.append(
            (2, "No aggregation function specified. Assuming 'average'"))
        agg_funcs.append("average")

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            (2, "No result name specified. Assuming '" + result_name + "'"))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": None,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
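
# Illustrative worksheet layout (hypothetical dataset, dimension and code names) accepted by the command
# above. Columns named after dataset dimensions (here GEO) act as filters ("WHERE"), while the
# Dimensions column lists the dimensions kept for grouping ("GROUP BY"):
#
#   Dimensions  | AggFunc | Measures  | MeasuresAs | GEO | StartPeriod | ResultName
#   GEO         | sum     | OBS_VALUE | Total      | ES  | 2010        | ds_energy
#   TIME_PERIOD |         |           |            | PT  |             |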
Exemplo n.º 20
0
    def _process_row(self, field_values: Dict[str, Any], subrow=None) -> None:
        """
        Process a dictionary representing a row of the Interfaces command. The dictionary can come directly from
        the worksheet or from a dataset.

        :param field_values: dictionary
        """
        # f_processor_name -> p
        # f_interface_type_name -> it
        # f_interface_name -> i
        #
        # IF NOT i AND it AND p => i_name = it.name => get or create "i"
        # IF i AND it AND p => get or create "i", IF "i" exists, i.it MUST BE equal to "it" (IF NOT, error)
        # IF i AND p AND NOT it => get "i" (MUST EXIST)
        f_interface_type_name = field_values.get("interface_type")
        f_interface_name = field_values.get("interface")

        if not f_interface_name:
            if not f_interface_type_name:
                raise CommandExecutionError(
                    "At least one of InterfaceType or Interface must be defined"
                    + subrow_issue_message(subrow))

            f_interface_name = f_interface_type_name

        processor = self.find_processor(field_values.get("processor"), subrow)

        # Try to find Interface
        f_orientation = field_values.get("orientation")
        interface_type: Optional[FactorType] = None
        interface: Optional[Factor] = None
        interfaces: Sequence[Factor] = self._glb_idx.get(
            Factor.partial_key(processor=processor, name=f_interface_name))
        if len(interfaces) == 1:
            interface = interfaces[0]
            print(f"DEBUG - Interface '{interface.name}' found")
            interface_type = interface.taxon
            if f_interface_type_name and not strcmp(interface_type.name,
                                                    f_interface_type_name):
                self._add_issue(
                    IType.WARNING,
                    f"The existing Interface '{interface.name}' has the InterfaceType "
                    f"'{interface_type.name}' which is different from the specified "
                    f"InterfaceType '{f_interface_type_name}'. Record skipped."
                    + subrow_issue_message(subrow))
                return
        elif len(interfaces) > 1:
            raise CommandExecutionError(
                f"Interface '{f_interface_name}' found {str(len(interfaces))} times. "
                f"It must be uniquely identified." +
                subrow_issue_message(subrow))
        elif len(interfaces) == 0:
            # The interface does not exist, create it below
            if not f_orientation:
                raise CommandExecutionError(
                    f"Orientation must be defined for new Interfaces." +
                    subrow_issue_message(subrow))

        # InterfaceType still not found
        if not interface_type:
            interface_type_name = ifnull(f_interface_type_name,
                                         f_interface_name)

            # Find FactorType
            # TODO Allow creating a basic FactorType if it is not found?
            interface_types: Sequence[FactorType] = self._glb_idx.get(
                FactorType.partial_key(interface_type_name))
            if len(interface_types) == 0:
                raise CommandExecutionError(
                    f"InterfaceType '{interface_type_name}' not declared previously"
                    + subrow_issue_message(subrow))
            elif len(interface_types) > 1:
                raise CommandExecutionError(
                    f"InterfaceType '{interface_type_name}' found {str(len(interface_types))} times. "
                    f"It must be uniquely identified." +
                    subrow_issue_message(subrow))
            else:
                interface_type = interface_types[0]

        # Get attributes default values taken from Interface Type or Processor attributes
        # Rows   : value of (source) "processor.subsystem_type"
        # Columns: value of (target) "interface_type.opposite_processor_type"
        # Cells  : CORRECTED value of "opposite_processor_type"
        # +--------+-------+--------+-------+---------+
        # |        | Local | Env    | Ext   | ExtEnv  |
        # +--------+-------+--------+-------+---------+
        # | Local  | Local | Env    | Ext   | ExtEnv  |
        # | Env    | Local | Env    | Ext   | ExtEnv? |
        # | Ext    | Ext   | ExtEnv | Local | Env     |
        # | ExtEnv | Ext   | ExtEnv | Local | Env?    |
        # +--------+-------+--------+-------+---------+
        if interface_type.opposite_processor_type:
            tmp = interface_type.opposite_processor_type.lower()
            if processor.subsystem_type.lower() in ["local", "environment"
                                                    ]:  # First two rows
                opposite_processor_type = tmp
            else:
                opposite_processor_type = InterfacesAndQualifiedQuantitiesCommand.invert[
                    tmp]
            # TODO in doubt. Maybe these are undefined (values with question mark in the table)
            #  if tmp == "externalenvironment" and processor.subsystem_type.lower() in ["environment", "externalenvironment"]:
            #      pass
        else:
            opposite_processor_type = None
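        # Illustrative reading of the table above (derived from it, names are examples): a "Local"
        # or "Env" processor keeps the InterfaceType's opposite_processor_type as-is, while an
        # "Ext" or "ExtEnv" processor maps it through the class-level "invert" dictionary, so
        # "Local" would become "Ext" and "Ext" would become "Local".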

        interface_type_values = {
            "sphere": interface_type.sphere,
            "roegen_type": interface_type.roegen_type,
            "opposite_processor_type": opposite_processor_type
        }

        # Get internal and user-defined attributes in one dictionary
        # Use: value specified in Interfaces ELSE value specified in InterfaceTypes ELSE first value of allowed values
        attributes = {
            c.name: ifnull(
                field_values[c.name],
                ifnull(interface_type_values.get(c.name),
                       head(c.allowed_values)))
            for c in self._command_fields if c.attribute_of == Factor
        }
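        # Illustrative fallback (assumption, values are hypothetical): if the row leaves "sphere"
        # empty, the InterfaceType's "sphere" is used; if that is also undefined, the first entry
        # of the field's allowed_values (if any) is taken.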

        if not interface:
            # f_list: Sequence[Factor] = self._glb_idx.get(
            #     Factor.partial_key(processor=p, factor_type=ft, orientation=f_orientation))
            #
            # if len(f_list) > 0:
            #     raise CommandExecutionError(f"An interface called '{f_list[0].name}' for Processor '{f_processor_name}'"
            #                                  f" with InterfaceType '{f_interface_type_name}' and orientation "
            #                                  f"'{f_orientation}' already exists"+subrow_issue_message(subrow))

            # Transform text of "interface_attributes" into a dictionary
            interface_attributes = self.transform_text_attributes_into_dictionary(
                field_values.get("interface_attributes"), subrow)
            attributes.update(interface_attributes)

            location = self.get_location(field_values.get("location"), subrow)

            interface = Factor.create_and_append(
                f_interface_name,
                processor,
                in_processor_type=FactorInProcessorType(external=False,
                                                        incoming=False),
                taxon=interface_type,
                geolocation=location,
                tags=None,
                attributes=attributes)
            self._glb_idx.put(interface.key(), interface)
            print(f"DEBUG - Interface '{interface.name}' created")
        elif not interface.compare_attributes(attributes):
            initial = ', '.join(
                [f"{k}: {interface.get_attribute(k)}" for k in attributes])
            new = ', '.join([f"{k}: {attributes[k]}" for k in attributes])
            name = interface.processor.full_hierarchy_names(
                self._glb_idx)[0] + ":" + interface.name
            raise CommandExecutionError(
                f"The same interface '{name}', is being redeclared with different properties. "
                f"INITIAL: {initial}; NEW: {new}." +
                subrow_issue_message(subrow))

        f_unit = field_values.get("unit")
        if not f_unit:
            f_unit = interface_type.unit

        # Unify unit (it must be done before considering RelativeTo -below-, because it adds a transformation to "f_unit")
        f_value = field_values.get("value")
        if f_value is not None and f_unit != interface_type.unit:
            try:
                f_value = UnitConversion.convert(f_value, f_unit,
                                                 interface_type.unit)
            except DimensionalityError:
                raise CommandExecutionError(
                    f"Dimensions of units in InterfaceType ({interface_type.unit}) and specified ({f_unit}) are not convertible"
                    + subrow_issue_message(subrow))

            f_unit = interface_type.unit
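        # Illustrative (hypothetical units): a value entered in "kJ" against an InterfaceType
        # declared in "MJ" would be converted here; non-convertible dimensions such as "kg" vs
        # "MJ" would raise the DimensionalityError handled above.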

        # Search for a relative_to interface
        f_relative_to = field_values.get("relative_to")
        relative_to_interface: Optional[Factor] = None
        if f_relative_to:
            try:
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.factor_unit, f_relative_to)
            except:
                raise CommandExecutionError(
                    f"Could not parse the RelativeTo column, value {str(f_relative_to)}. "
                    + subrow_issue_message(subrow))

            relative_to_interface_name = ast_to_string(ast["factor"])

            # rel_unit_name = ast["unparsed_unit"]
            # try:
            #     f_unit = str((ureg(f_unit) / ureg(rel_unit_name)).units)
            # except (UndefinedUnitError, AttributeError) as ex:
            #     raise CommandExecutionError(f"The final unit could not be computed, interface '{f_unit}' / "
            #                                  f"relative_to '{rel_unit_name}': {str(ex)}"+subrow_issue_message(subrow))

            relative_to_interface = first(
                interface.processor.factors,
                lambda ifc: strcmp(ifc.name, relative_to_interface_name))

            if not relative_to_interface:
                raise CommandExecutionError(
                    f"Interface specified in 'relative_to' column "
                    f"'{relative_to_interface_name}' has not been found." +
                    subrow_issue_message(subrow))

        if f_value is None and relative_to_interface is not None:
            # Search for an Interface Type Conversion defined in the ScaleChangeMap command
            interface_types_transforms: List[FactorTypesRelationUnidirectionalLinearTransformObservation] = \
                find_factor_types_transform_relation(self._glb_idx, relative_to_interface.taxon, interface.taxon, processor, processor)

            # Overwrite any specified unit; it makes no sense without a value, i.e. it cannot be used for conversion
            f_unit = interface.taxon.unit
            if len(interface_types_transforms) == 1:
                f_value = interface_types_transforms[0].scaled_weight
            else:
                interface_types_transforms_message = "an interface type conversion doesn't exist" \
                    if (len(interface_types_transforms) == 0) \
                    else f"{len(interface_types_transforms)} interface type conversions exist"

                f_value = "0"
                self._add_issue(
                    IType.WARNING,
                    f"Field 'value' should be defined for interfaces having a "
                    f"'RelativeTo' interface, and {interface_types_transforms_message}. "
                    f"Using value '0'." + subrow_issue_message(subrow))

        # Create quantitative observation
        if f_value is not None:
            f_uncertainty = field_values.get("uncertainty")
            f_assessment = field_values.get("assessment")
            f_pedigree_matrix = field_values.get("pedigree_matrix")
            f_pedigree = field_values.get("pedigree")
            f_time = field_values.get("time")
            f_comments = field_values.get("comments")

            f_source = field_values.get("qq_source")
            # TODO: source is not being used
            source = self.get_source(f_source, subrow)

            # Find Observer
            observer: Optional[Observer] = None
            if f_source:
                observer = self._glb_idx.get_one(
                    Observer.partial_key(f_source))
                if not observer:
                    self._add_issue(
                        IType.WARNING,
                        f"Observer '{f_source}' has not been found." +
                        subrow_issue_message(subrow))

            # If an observation exists then "time" is mandatory
            if not f_time:
                raise CommandExecutionError(
                    f"Field 'time' needs to be specified for the given observation."
                    + subrow_issue_message(subrow))

            # An interface can have multiple observations as long as each has a different [time, observer] combination
            for observation in interface.quantitative_observations:
                observer_name = observation.observer.name if observation.observer else None
                if strcmp(observation.attributes["time"], f_time) and strcmp(
                        observer_name, f_source):
                    raise CommandExecutionError(
                        f"The interface '{interface.name}' in processor '{interface.processor.name}' already has an "
                        f"observation with time '{f_time}' and source '{f_source}'."
                    )

            self.check_existence_of_pedigree_matrix(f_pedigree_matrix,
                                                    f_pedigree, subrow)

            # Transform text of "number_attributes" into a dictionary
            number_attributes = self.transform_text_attributes_into_dictionary(
                field_values.get("number_attributes"), subrow)

            o = _create_or_append_quantitative_observation(
                interface, f_value, f_unit, f_uncertainty, f_assessment,
                f_pedigree, f_pedigree_matrix, observer, relative_to_interface,
                f_time, None, f_comments, None, number_attributes)

def parse_data_input_command(sh: Worksheet,
                             area: AreaTupleType,
                             processors_type: str,
                             state=None) -> IssuesLabelContentTripleType:
    """
    Scans the "area" of input worksheet "sh" where it is assumed a "data input" command
    is present.

    It obtains a list of observations, a list of processors, a list of observables, a list of tags
    All those are represented in JSON format

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param processors_type: Name for the type of processors. Also label of the command
    :param state: Transient state useful for checking existence of variables
    :return: list of issues (issue_type, message), label, content (JSON-ready dict)
    """
    some_error = False
    issues = []
    # Define a set of observations (qualified quantities) of observables
    # This set can be replicated, so how do we refer to each replica?
    # Regular expression, internal name, Mandatory (True|False)
    known_columns = [
        (r"Name|Processor[_ ]name", "processor", False),
        (r"Level", "level", False),
        (r"Parent", "parent", False),
        (r"FF[_ ]type", "ff_type", True),
        (r"Var|Variable", "factor", True),
        (r"Value|NUSAP\.N", "value",
         False),  # If value is not specified, then just declare the Factor
        (r"Unit|NUSAP\.U", "unit",
         True),  # If blank, a dimensionless amount is assumed
        (r"Relative[_ ]to", "relative_to", False),
        (r"Uncertainty|Spread|NUSAP\.S", "uncertainty", False),
        (r"Assessment|NUSAP\.A", "assessment", False),
        (r"Pedigree[_ ]matrix|NUSAP\.PM", "pedigree_matrix", False),
        (r"Pedigree|NUSAP\.P", "pedigree", False),
        (r"Time|Date", "time", False),
        (r"Geo|Geolocation", "geolocation", False),
        (r"Source", "source", False),
        (r"Comment|Comments", "comments", False)
    ]

    label = "Processors " + processors_type

    # First, examine columns, to know which fields are being specified
    # Special cases:
    #   Open columns: the field is specified in the cell together with the value, like "attr1=whatever", instead of a header "attr1" and, in a row below, a value "whatever"
    #   Complex values: the value has syntactic rules. Like expressions for both quantities AND qualities (like NUSAP)
    #   References: the field refers to additional information in another worksheet. Unique names or ref holder (worksheet name) plus ref inside the worksheet, would be allowed. Also ref type can disambiguate
    mandatory = {t[1]: t[2] for t in known_columns}
    cre = {
    }  # Column Regular Expression dictionary (K: regular expression; V: RegularExpression object)
    if not case_sensitive:
        flags = re.IGNORECASE
    else:
        flags = 0
    for kc in known_columns:
        cre[kc[0]] = re.compile(kc[0], flags=flags)
    col_names = {}
    standard_cols = {
    }  # Internal (standardized) column name to column index in the worksheet (freedom in the order of columns)
    attribute_cols = create_dictionary(
    )  # Not recognized columns are considered freely named categories, attributes or tags
    attributes = [
    ]  # List of attributes or tags (keys of the previous dictionary)
    col_allows_dataset = create_dictionary(
    )  # If the column allows the reference to a dataset dimension
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=area[0], column=c).value
        if not col_name:
            continue

        col_name = col_name.replace("\n", " ")
        col_names[c] = col_name

        # Match
        found = False
        for kc in known_columns:
            res = cre[kc[0]].search(col_name)
            if res:
                if kc[1] in standard_cols:
                    issues.append(
                        (2, "Cannot repeat column name '" + col_name + "' (" +
                         kc[0] + ") in data input command '" +
                         processors_type + "'"))
                else:
                    standard_cols[kc[1]] = c
                    col_names[c] = kc[
                        1]  # Override column name with pseudo column name for standard columns
                    if col_names[c].lower() in [
                            "factor", "value", "time", "geolocation"
                    ]:
                        col_allows_dataset[col_names[c]] = True
                    else:
                        col_allows_dataset[col_names[c]] = False
                    found = True
                break
        if not found:
            if col_name not in attribute_cols:
                # TODO Check valid col_names. It must be a valid Variable Name
                attribute_cols[col_name] = c
                attributes.append(col_name)
                col_allows_dataset[col_name] = True
            else:
                issues.append(
                    (2, "Cannot repeat column name '" + col_name +
                     "' in data input command '" + processors_type + "'"))

    del cre

    # Check if there are mandatory columns missing

    # TODO There could be combinations of columns which change the character of mandatory of some columns
    # TODO For instance, if we are only specifying structure, Value would not be needed
    print("BORRAME - " + str(known_columns))
    print("BORRAME 2 - " + str(standard_cols))
    for kc in known_columns:
        # "kc[2]" is the flag indicating if the column is mandatory or not
        # standard_cols maps standard column names present in the worksheet to their column indices
        if kc[2] and kc[1] not in standard_cols:
            some_error = True
            issues.append((3, "Column name '" + kc[0] +
                           "' must be specified in data input command '" +
                           processors_type + "'"))

    # If there are errors, do not continue
    if some_error:
        return issues, label, None

    processor_attribute_exclusions = create_dictionary()
    processor_attribute_exclusions[
        "scale"] = None  # Exclude these attributes when characterizing the processor
    processor_attributes = [
        t for t in attributes if t not in processor_attribute_exclusions
    ]

    # SCAN rows
    lst_observations = [
    ]  # List of ALL observations. -- Main outcome of the parse operation --

    set_pedigree_matrices = create_dictionary()  # List of pedigree templates
    set_processors = create_dictionary()  # List of processor names
    set_factors = create_dictionary()  # List of factors
    set_taxa = create_dictionary(
    )  # Dictionary of taxa with their lists of values. Useful to return CODE LISTS
    set_referenced_datasets = create_dictionary(
    )  # Dictionary of datasets to be embedded into the result (it is a job of the execution part)
    processors_taxa = create_dictionary(
    )  # Correspondence "processor" -> taxa (to avoid changes in this correspondence)

    dataset_column_rule = parser_field_parsers.dataset_with_column
    values = [None] * area[3]
    # LOOP OVER EACH ROW
    for r in range(area[0] + 1, area[1]):  # Scan rows (observations)
        # Each row can specify: the processor, the factor, the quantity and qualities about the factor in the processor
        #                       It can also specify a "flow+containment hierarchy" relation

        row = {}  # Store parsed values of the row

        taxa = create_dictionary()  # Store attributes or taxa of the row

        referenced_dataset = None  # Once defined in a row, it cannot change!!
        # Scan the row first, looking for the dataset. The specification is allowed in certain columns:
        # attribute_cols and some standard_cols
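        # Illustrative (hypothetical names): the first "#" reference in the row, e.g. "#ds1.Year",
        # fixes the dataset variable ("ds1") and a dimension ("Year"); later "#" cells in the same
        # row may name only a dimension, e.g. "#Year".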
        already_processed = create_dictionary()
        for c in range(area[2], area[3]):
            if c in col_names:
                value = sh.cell(row=r, column=c).value
                if isinstance(value, str) and value.startswith("#"):
                    col_name = col_names[c]
                    if col_allows_dataset[col_name]:
                        if not referenced_dataset:
                            try:
                                ast = parser_field_parsers.string_to_ast(
                                    dataset_column_rule, value[1:])
                                if len(ast["parts"]) == 2:
                                    referenced_dataset = ast["parts"][0]
                                    # Remove the dataset variable. It will be stored in "_referenced_dataset"
                                    value = "#" + ast["parts"][1]
                                else:
                                    some_error = True
                                    issues.append((
                                        3,
                                        "The first dataset reference of the row must contain the "
                                        "dataset variable name and the dimension name, row "
                                        + str(r)))

                                # Mark as processed
                                already_processed[col_name] = None
                            except:
                                some_error = True
                                issues.append(
                                    (3, "Column '" + col_name +
                                     "' has an invalid dataset reference '" +
                                     value + "', in row " + str(r)))
                        else:
                            try:
                                ast = parser_field_parsers.string_to_ast(
                                    simple_ident, value[1:])
                                # Mark as processed
                                already_processed[col_name] = None
                            except:
                                some_error = True
                                issues.append(
                                    (3, "Column '" + col_name +
                                     "' has an invalid dataset reference '" +
                                     value + "', in row " + str(r)))
                        if col_name in standard_cols:
                            row[col_name] = value
                        else:
                            taxa[col_name] = value

                values[c] = value

        # TODO If the flow type is decomposed, compose it first
        for c in standard_cols:
            if c in already_processed:
                continue

            value = values[standard_cols[c]]

            # != "" or not
            if value is None or (value is not None and value == ""):
                if c == "unit":
                    value = "-"
                if not value:
                    if mandatory[c]:
                        some_error = True
                        issues.append(
                            (3,
                             "Column '" + c + "' is mandatory, row " + str(r)))
                    continue  # Skip the rest of the iteration!

            # Parse the value
            if c in ["processor", "factor"]:
                # Check that it is a variable name, and allow hierarchical names
                parser_field_parsers.string_to_ast(
                    parser_field_parsers.simple_h_name, value)
            elif c == "pedigree_matrix":
                parser_field_parsers.string_to_ast(
                    parser_field_parsers.simple_ident, value)
            elif c == "relative_to":
                # Two elements, the first a hierarchical name, the second a unit name
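                # e.g. (illustrative, hypothetical names) a cell such as "Energy.Fuel kWh":
                # a hierarchical factor name followed by a Pint-recognized unit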
                s = value.split(" ")
                if len(s) != 2:
                    some_error = True
                    issues.append((
                        3,
                        "The Relative To value has to have two parts, factor name and unit, separated by a whitespace (specified '"
                        + value + "'), in row " + str(r)))
                else:
                    try:
                        parser_field_parsers.string_to_ast(
                            parser_field_parsers.simple_h_name, s[0])
                    except:
                        some_error = True
                        issues.append((
                            3,
                            "The name specified for the relative to factor '" +
                            s[0] + "' is not valid, in row " + str(r)))

                    # It must be a recognized unit. Check with Pint
                    try:
                        ureg(s[1])
                        ureg.parse_unit_name(s[1], case_sensitive)
                    except UndefinedUnitError:
                        some_error = True
                        issues.append((
                            3, "The unit name '" + s[1] +
                            "' is not registered in the units processing package, in row "
                            + str(r)))
            elif c == "level":
                # A valid level name
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.level_name, value)
                except:
                    some_error = True
                    issues.append((3, "The level '" + value +
                                   "' syntax is not valid, in row " + str(r)))

            elif c == "parent":
                # Check that value is a valid parent name. It can be either a list of tags OR
                # a processor name, something defining a single processor
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.simple_h_name, value)
                except:
                    try:
                        parser_field_parsers.string_to_ast(
                            parser_field_parsers.named_parameters_list, value)
                    except:
                        some_error = True
                        issues.append((3, "Could not parse '" + value +
                                       "' as 'parent' in row " + str(r)))
            elif c == "ff_type":
                # The type of flow/fund must be one of a set of possible values. DEFINE THE LIST
                if value.lower() not in allowed_ff_types:
                    some_error = True
                    issues.append(
                        (3, "ff_type must be one of :" +
                         ', '.join(allowed_ff_types) + ", in row " + str(r)))
            elif c == "value":
                if not isinstance(value, str):
                    value = str(value)
                # Expression allowed. Check syntax only. It can refer to parameters.
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.expression, value)
                # TODO Check existence of used variables
                # TODO basic_elements_parser.ast_evaluator(ast, state, None, issues, "static")
            elif c == "unit":
                # It must be a recognized unit. Check with Pint
                try:
                    value = value.replace("€", "Euro").replace("$", "Dollar")
                    if value == "-":
                        value = ""  # Dimensionless
                    ureg(value)
                    ureg.parse_unit_name(value, case_sensitive)
                except:
                    some_error = True
                    issues.append((
                        3, "The unit name '" + value +
                        "' is not registered in the units processing package, in row "
                        + str(r)))
            elif c == "uncertainty":
                # TODO It must be a valid uncertainty specifier
                pass
            elif c == "assessment":
                # See page 135 of Funtowicz S., Ravetz J., "Uncertainty and Quality in Science for Policy"
                # "c" is "cognitive" assessment, "p" is pragmatic assessment.
                allowed = [
                    "nil", "low", "medium", "high", "total", "nil_c", "low_c",
                    "medium_c", "high_c", "total_c", "nil_p", "low_p",
                    "medium_p", "high_p", "total_p"
                ]
                if value and value.lower() not in allowed:
                    issues.append((3, "Assessment must be empty or one of: " +
                                   ", ".join(allowed)))
            elif c == "pedigree":
                # A valid pedigree specification is just an integer
                try:
                    int(value)
                except:
                    issues.append((3, "The pedigree specification '" + value +
                                   "' must be an integer"))
            elif c == "time":
                # A valid time specification. Possibilities: Year, Month-Year / Year-Month, Time span (two dates)
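                # e.g. (illustrative; exact separators depend on the "time_expression" grammar)
                # a year such as "2017", a month-year such as "2017-05", or a span of two dates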
                if not isinstance(value, str):
                    value = str(value)
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.time_expression, value)
            elif c == "geolocation":
                # A reference to a geolocation
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.reference, value)
                except:
                    some_error = True
                    issues.append((3, "The geolocation must be a reference"))
            elif c == "source":
                # Who or what provided the information. It can be formal or informal. Formal can be references (but evaluated later)
                pass
            elif c == "comments":
                # Free text
                pass

            # Store the parsed value
            row[c] = value

        for c in attribute_cols:
            if c in already_processed:
                continue

            value = values[attribute_cols[c]]

            # != "" or not
            if not value:
                taxa[c] = None
                continue  # Skip the rest of the iteration!

            # TODO Check value. Valid identifier, no whitespace
            # Validate "value", it has to be a simple ID
            try:
                if not isinstance(value, str):
                    value = str(value)
                parser_field_parsers.simple_ident.parseString(value,
                                                              parseAll=True)
            except:
                value = None
                some_error = True
                issues.append((
                    3, "The value in column '" + c +
                    "' has to be a simple identifier: start with letter, then letters, numbers and '_', no whitespace, in row "
                    + str(r)))

            taxa[c] = value

            # Disable the registration of taxa. If a Dataset reference is used, there is no way to register
            # taxa at parse time (the dataset is still not obtained). Leave it for the execution
            if c not in set_taxa:
                set_taxa[c] = create_dictionary()
            if value is not None:
                set_taxa[c][value] = None

        # Now that individual columns have been parsed, do other things

        if referenced_dataset:
            row["_referenced_dataset"] = referenced_dataset

        # If "processor" not specified, concatenate taxa columns in order to generate an automatic name
        # (excluding the processor type)
        p_taxa = taxa.copy()
        for k in processor_attribute_exclusions:
            if k in p_taxa: del p_taxa[k]

        if "processor" not in row:
            row["processor"] = "_".join(
                [str(taxa[t]) for t in processor_attributes]
            )  # TODO Which order? (the current is "order of appearance"; maybe "alphabetical order" would be better option)
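        # Illustrative (hypothetical values): attribute values "Spain" and "Wheat" would yield
        # the automatic processor name "Spain_Wheat"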
        # Add as "taxa" the processor type (which is an optional input parameter to this function)
        if processors_type:
            taxa["_processors_type"] = processors_type
        # Store taxa (attributes and taxa)
        row["taxa"] = taxa
        # Store taxa if the processor still does not have it
        if row["processor"] not in processors_taxa:
            processors_taxa[row[
                "processor"]] = p_taxa  # "::".join([taxa[t] for t in lst_taxa_cols])
        else:
            # Taxa should be the same for each "processor". Error if different
            t = processors_taxa[row["processor"]]
            if t != p_taxa:
                issues.append(
                    (3, "The processor '" + row["processor"] +
                     "' has different taxa assigned, in row " + str(r)))

        # Register new processor names, pedigree templates, and variable names
        if "processor" in row:
            set_processors[row["processor"]] = None
        if "pedigree_matrix" in row:
            set_pedigree_matrices[row["pedigree_matrix"]] = None
        if "factor" in row:
            set_factors[row["factor"]] = None
        if referenced_dataset:
            set_referenced_datasets[referenced_dataset] = None

        lst_observations.append(row)

    content = {
        "factor_observations": lst_observations,
        "processor_attributes": processor_attributes,
        "processors": [k for k in set_processors],
        "pedigree_matrices": [k for k in set_pedigree_matrices],
        "factors": [k for k in set_factors],
        "referenced_datasets": [ds for ds in set_referenced_datasets],
        "code_lists": {k: [k2 for k2 in set_taxa[k]]
                       for k in set_taxa}
    }
    return issues, label, content

    def _process_row(self, fields: Dict[str, Any], subrow=None) -> None:
        def process_relation(relation_class):
            source_processor = self._get_processor_from_field(
                "source_processor")
            target_processor = self._get_processor_from_field(
                "target_processor")

            self._check_fields(relation_class, source_processor,
                               target_processor, subrow)

            if relation_class.is_between_processors:
                create_relation_observations(
                    self._glb_idx,
                    source_processor, [(target_processor, relation_class)],
                    relation_class,
                    None,
                    attributes=attributes)

            elif relation_class.is_between_interfaces:
                try:
                    source_interface = self._get_interface_from_field(
                        "source_interface",
                        source_processor) if self._fields.get(
                            "source_interface"
                        ) else self._get_interface_from_field(
                            "target_interface", source_processor)
                except CommandExecutionError as e:
                    source_interface = None
                    if not str(e).startswith("The interface"):
                        raise e
                    else:
                        self._add_issue(IType.WARNING, str(e))

                try:
                    target_interface = self._get_interface_from_field(
                        "target_interface",
                        target_processor) if self._fields.get(
                            "target_interface"
                        ) else self._get_interface_from_field(
                            "source_interface", target_processor)
                except CommandExecutionError as e:
                    target_interface = None
                    if not str(e).startswith("The interface"):
                        raise e
                    else:
                        self._add_issue(IType.WARNING, str(e))

                if not source_interface or not target_interface:
                    return

                if fields["back_interface"]:
                    relation_class = RelationClassType.ff_directed_flow_back

                if relation_class == RelationClassType.ff_directed_flow_back:
                    back_interface = self._get_interface_from_field(
                        "back_interface", source_processor)
                    self._check_flow_back_interface_types(
                        source_interface, target_interface, back_interface)
                    attributes.update(dict(back_interface=back_interface))

                if relation_class.is_flow:
                    self._check_flow_orientation(
                        source_processor,
                        target_processor,
                        source_interface,
                        target_interface,
                        is_direct_flow=(relation_class ==
                                        RelationClassType.ff_directed_flow))

                if source_interface.taxon != target_interface.taxon:
                    interface_types_transforms = find_factor_types_transform_relation(
                        self._glb_idx, source_interface.taxon,
                        target_interface.taxon, source_processor,
                        target_processor)

                    # ChangeOfTypeScale
                    if self._fields.get("change_type_scale"):
                        o = FactorTypesRelationUnidirectionalLinearTransformObservation.create_and_append(
                            source_interface.taxon,
                            target_interface.taxon,
                            self._fields.get("change_type_scale"),
                            source_interface.processor,
                            target_interface.
                            processor,  # AdHoc source-target Context
                            None,
                            None,  # No unit conversion
                            find_or_create_observer(
                                Observer.no_observer_specified, self._glb_idx))
                        self._glb_idx.put(o.key(), o)
                        if len(interface_types_transforms) > 0:
                            self._add_issue(
                                IType.WARNING,
                                f"Preexisting matching ScaleChangeMap entry found. Overriding with "
                                f"{self._fields.get('change_type_scale')}")

                    interface_types_transform = self._get_interface_types_transform(
                        source_interface.taxon, source_processor,
                        target_interface.taxon, target_processor, subrow)
                    attributes.update(
                        dict(scale_change_weight=interface_types_transform.
                             scaled_weight))

                create_relation_observations(
                    self._glb_idx,
                    source_interface,
                    [(target_interface, relation_class, fields["flow_weight"])
                     ],
                    relation_class,
                    None,
                    attributes=attributes)

        if not self._all_processors:
            self._all_processors = get_processor_names_to_processors_dictionary(
                self._glb_idx)
        # source_cardinality = fields["source_cardinality"]
        # target_cardinality = fields["target_cardinality"]
        source_processors = self._fields["source_processor"]
        target_processors = self._fields["target_processor"]
        attributes = self._get_attributes_from_field("attributes")

        try:  # Get relation class type
            relation_class = RelationClassType.from_str(
                fields["relation_type"])
        except NotImplementedError as e:
            raise CommandExecutionError(str(e))

        if ".." in source_processors or ".." in target_processors:
            if ".." in source_processors:
                source_processor_names = obtain_matching_processors(
                    string_to_ast(processor_names,
                                  self._fields["source_processor"]),
                    self._all_processors)
            else:
                source_processor_names = [source_processors]
            if ".." in target_processors:
                target_processor_names = obtain_matching_processors(
                    string_to_ast(processor_names,
                                  self._fields["target_processor"]),
                    self._all_processors)
            else:
                target_processor_names = [target_processors]
            for s in source_processor_names:
                for t in target_processor_names:
                    self._fields["source_processor"] = s
                    self._fields["target_processor"] = t
                    process_relation(relation_class)
        else:
            process_relation(relation_class)
Example no. 23
    def execute(self, state: "State"):
        """
            Process each of the specified relations, creating the endpoints if they do not exist already
            {"name": <processor or factor>,
             "attributes": {"<attr>": "value"},
             "type": <default relation type>,
             "dests": [
                {"name": <processor or factor>,
                 ["type": <relation type>,]
                 "weight": <expression resulting in a numeric value>
                }
             }
        """
        some_error = False
        issues = []
        glb_idx, _, _, _, _ = get_case_study_registry_objects(state)

        # Process each record
        for i, o in enumerate(self._content["structure"]):
            # origin processor[+factor] -> relation (weight) -> destination processor[+factor]
            origin_name = o["origin"]
            if "source" in o:
                source = o["source"]
            else:
                source = None
            if "default_relation" in o:
                default_relation = o["default_relation"]
            else:
                default_relation = None

            destinations = []
            for r in o["destinations"]:
                try:
                    result = parser_field_parsers.string_to_ast(
                        parser_field_parsers.factor_name, r)
                except:
                    try:
                        result = parser_field_parsers.string_to_ast(
                            parser_field_parsers.relation_expression, r)
                    except:
                        traceback.print_exc()
                        some_error = True
                        issues.append((
                            3, "The specification of destination, '" + r +
                            "', is not valid, in element " + str(r) +
                            ". It is a sequence of weight (optional) relation (optional) destination element (mandatory)"
                        ))

                if result:
                    if result["type"] == "pf_name":
                        base = result
                    else:
                        base = result["name"]
                    tmp = base["processor"]
                    destination_name = (
                        (tmp["ns"] + "::") if "ns" in tmp and tmp["ns"] else
                        '') + '.'.join(tmp["parts"])
                    if "factor" in base and base["factor"]:
                        tmp = base["factor"]
                        destination_name += ':' + (
                            (tmp["ns"] + "::") if "ns" in tmp and tmp["ns"]
                            else '') + '.'.join(tmp["parts"])
                    if "relation_type" in result and result["relation_type"]:
                        rel_type = result["relation_type"]
                    else:
                        rel_type = None
                    if "weight" in result and result["weight"]:
                        weight = ast_to_string(
                            result["weight"])  # For flow relations
                    else:
                        weight = None
                    if rel_type and weight:
                        t = (destination_name, rel_type, weight)
                    elif rel_type and not weight:
                        t = (destination_name, rel_type)
                    elif not rel_type and not weight:
                        t = tuple(
                            [destination_name]
                        )  # Force it to be a tuple (create_relation_observations expects that)

                    destinations.append(t)

            rels = create_relation_observations(glb_idx, origin_name,
                                                destinations, default_relation,
                                                source)

        return issues, None
Example no. 24
def get_interfaces(glb_idx: PartialRetrievalDictionary) -> pd.DataFrame:
    # Used to examine "value" as expression, and find variables that are interface names vs parameter names
    params = create_dictionary(
        data={p.name: None
              for p in glb_idx.get(Parameter.partial_key())})
    s = State()
    procs = glb_idx.get(Processor.partial_key())
    d = {}
    for p in procs:
        parent_relations = glb_idx.get(
            ProcessorsRelationPartOfObservation.partial_key(child=p))
        d[p.ident] = set([p.parent_processor.ident for p in parent_relations])

    lst = [[
        "Processor", "InterfaceType", "Interface", "Sphere", "RoegenType",
        "Orientation", "OppositeSubsystemType", "GeolocationRef",
        "GeolocationCode", "InterfaceAttributes", "Value", "Unit",
        "RelativeTo", "Uncertainty", "Assessment", "PedigreeMatrix",
        "Pedigree", "Time", "Source", "NumberAttributes", "Comments"
    ]]
    # Elaborate a DAG, then iterate over it
    for ident in list(toposort.toposort_flatten(d)):
        p = glb_idx.get(Processor.partial_key(ident=ident))[0]
        ifaces = glb_idx.get((Factor.partial_key(processor=p)))
        iface_names = create_dictionary(
            data={iface.name: iface
                  for iface in ifaces})
        # Elaborate DAG of Interfaces because of Observations
        d = {}
        for iface in ifaces:
            if iface.ident not in d:
                d[iface.ident] = set()
            for obs in iface.quantitative_observations:
                if obs.relative_factor:
                    d[iface.ident].add(obs.relative_factor.ident)
                # Consider obs.value and non linear dependencies
                if isinstance(obs.value, str):
                    ast = string_to_ast(expression_with_parameters, obs.value)
                    evaluation_issues = []
                    value, unresolved_vars = ast_evaluator(
                        exp=ast,
                        state=s,
                        obj=None,
                        issue_lst=evaluation_issues)
                    for unresolved in unresolved_vars:
                        if unresolved not in params:
                            d[iface.ident].add(iface_names[unresolved].ident)

        for ident2 in list(toposort.toposort_flatten(d)):
            iface = glb_idx.get(Factor.partial_key(ident=ident2))[0]
            lst1 = [
                iface.processor.name, iface.taxon.name, iface.name,
                iface.sphere, iface.roegen_type.name, iface.orientation,
                iface.opposite_processor_type, "", "", ""
            ]
            observations = iface.quantitative_observations
            if len(observations) > 0:
                for obs in observations:
                    lst2 = [
                        obs.value,
                        obs.attributes.get("unit",
                                           ""), obs.relative_factor.name
                        if obs.relative_factor else "",
                        obs.attributes.get("spread", ""),
                        obs.attributes.get("assessment", ""),
                        obs.attributes.get("pedigree_template", ""),
                        obs.attributes.get("pedigree", ""),
                        obs.attributes.get("time", ""),
                        obs.observer.name if obs.observer else "", "",
                        obs.attributes.get("comments", "")
                    ]
                    lst.append(lst1 + lst2)
            else:
                lst.append(lst1 + ["", "", "", "", "", "", "", "", "", ""])

    return list_to_dataframe(lst)
Example no. 25
def parse_command_in_worksheet(sh: Worksheet, area: AreaTupleType,
                               name: Optional[str],
                               cmd_name: str) -> IssuesLabelContentTripleType:
    """
    Parse command in general
    Generate a JSON
    Generate a list of issues

    :param sh: Worksheet to read
    :param area: Area of the worksheet
    :param name: Name of the worksheet
    :param cmd_name: Name of the command. Key to access "command_fields" variable. Also, shown in issue descriptions
    :return: issues List, None, content (JSON)
    """
    def check_expandable(v, location):
        """
        Check if curly braces match, that what is inside is syntactically correct, (and that the value exists)

        :param v:
        :return:
        """
        import re
        reg = re.compile(r"{.*?}")
        matches = reg.findall(v)
        output = set()
        if len(matches) == 0:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=f"Incorrect syntax, no macro expansion found",
                    location=location))
        else:
            for m in matches:
                h_name = m[1:-1]
                try:
                    parser_field_parsers.string_to_ast(
                        arith_boolean_expression, h_name)  # simple_h_name
                    output.add(h_name)
                except:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            f"The value {m[1:-1]} is not a valid hierarchical name",
                            location=location))
        return output

    def commented_row(rn):
        commented = False
        v = sh.cell(row=rn, column=1).value
        if v is not None:
            if str(v).startswith("#"):
                commented = True
        return commented

    issues: List[Issue] = []

    from nexinfosys.command_field_definitions import command_fields

    cols = command_fields[
        cmd_name]  # List of CommandField that will guide the parsing
    col_map, local_issues = check_columns(sh, name, area, cols, cmd_name)

    if any([i.itype == IType.ERROR for i in local_issues]):
        return local_issues, None, None

    issues.extend(local_issues)

    # The "mandatoriness" of a field may depend on values in other fields (like in RefBibliographic command fields)
    # Elaborate a list of fields having this "complex" mandatory property
    complex_mandatory_cols = [c for c in cols if isinstance(c.mandatory, str)]

    content = []  # The output JSON
    # Parse each Row
    for r in range(area[0] + 1, area[1]):
        line = {}
        expandable = set(
        )  # A set of variables to be expanded. If empty, it is a literal line (not expandable)
        complex = False  # The line contains at least one field with a complex rule (which cannot be evaluated with a simple cast)

        # A row is commented if the value in the first column starts with "#" (a first empty column could be inserted
        # to ease this, just to signal commented rows)
        if commented_row(r):
            continue

        # Constant mandatory values
        mandatory_not_found = set([
            c.name for c in cols
            if c.mandatory and isinstance(c.mandatory, bool)
        ])

        # Each "field"
        for field_def in col_map.keys():
            field_name = field_def.name
            field_defined = False
            # Appearances of field (normally just once, there are attributes allowing more than one appearance)
            for col_name, col_idx in col_map[field_def]:
                # Read and prepare "value"
                value = sh.cell(row=r, column=col_idx).value
                if value is not None:
                    if isinstance(value, float):
                        if value == int(value):
                            value = str(int(value))
                        else:
                            value = str(value)
                    elif not isinstance(value, str):
                        value = str(value)
                    value = value.strip()
                    field_defined = True
                else:
                    continue

                # Check if value contains "{", expansion
                if "{" in value:
                    # Expandable. Do not parse now. Check: curly pairs, and that what is between is a
                    #  simple_h_name and that it exists: as dataset
                    expandable.update(
                        check_expandable(
                            value,
                            IssueLocation(sheet_name=name,
                                          row=r,
                                          column=col_idx)))
                    # With many appearances, just a "Key-Value list" syntax is permitted
                    if field_def.many_appearances:
                        if field_name in line:
                            line[
                                field_name] += ", " + col_name + "='" + value + "'"
                        else:
                            line[field_name] = col_name + "='" + value + "'"
                    else:
                        if field_name in line:
                            line[field_name] += ", " + value
                        else:
                            line[field_name] = value  # Store the value
                else:
                    if field_def.allowed_values:  # If the CommandField checks for a list of allowed values
                        allowed_values_dict: Dict[str, str] = {
                            v.lower(): v
                            for v in field_def.allowed_values
                        }
                        if value.lower(
                        ) not in allowed_values_dict:  # TODO Case insensitive CI
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Field '{col_name}' of command '{cmd_name}' has invalid category "
                                    f"'{value}'. Allowed values are: {', '.join(field_def.allowed_values)}.",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=col_idx)))
                        else:
                            # Use case from allowed values
                            line[field_name] = allowed_values_dict[
                                value.lower()]
                    else:  # Instead of a list of values, check if a syntactic rule is met by the value
                        if field_def.parser:  # Parse, just check syntax (do not store the AST)
                            try:
                                standalone_attribute_value = "@" in field_def.allowed_names[
                                    0]
                                if not standalone_attribute_value:
                                    ast = parser_field_parsers.string_to_ast(
                                        field_def.parser, value)
                                else:
                                    try:
                                        ast = parser_field_parsers.string_to_ast(
                                            field_def.parser, value)
                                    except:
                                        ast = parser_field_parsers.string_to_ast(
                                            unquoted_string, value)

                                # Rules are in charge of informing whether the result is expandable and whether it is complex
                                if "expandable" in ast and ast["expandable"]:
                                    issues.append(
                                        Issue(
                                            itype=IType.ERROR,
                                            description=
                                            f"The value in field '{col_header}' of command "
                                            f"'{cmd_name}' should not be expandable. Entered: {value}",
                                            location=IssueLocation(
                                                sheet_name=name,
                                                row=r,
                                                column=col_idx)))
                                if "complex" in ast and ast["complex"]:
                                    complex = True

                                # With many appearances, just a "Key-Value list" syntax is permitted
                                if field_def.many_appearances:
                                    if field_name in line:
                                        line[
                                            field_name] += ", " + col_name + "='" + value + "'"
                                    else:
                                        line[
                                            field_name] = col_name + "='" + value + "'"
                                else:
                                    if field_name in line:
                                        line[field_name] += ", " + value
                                    else:
                                        line[
                                            field_name] = value  # Store the value
                            except:
                                import traceback
                                traceback.print_exc()
                                col_header = sh.cell(row=1,
                                                     column=col_idx).value
                                issues.append(
                                    Issue(
                                        itype=IType.ERROR,
                                        description=
                                        f"The value in field '{col_header}' of command "
                                        f"'{cmd_name}' is not syntactically correct. Entered: {value}",
                                        location=IssueLocation(
                                            sheet_name=name,
                                            row=r,
                                            column=col_idx)))
                        else:
                            line[field_name] = value  # No parser defined: store the value as-is

            if field_defined and field_def.name in mandatory_not_found:
                mandatory_not_found.discard(field_def.name)

        if len(line) == 0:
            continue  # Empty line (allowed)

        # Flags to accelerate the second evaluation, during execution
        line["_row"] = r
        line["_expandable"] = list(expandable)
        line["_complex"] = complex

        # Append if all mandatory fields have been filled
        may_append = True
        if len(mandatory_not_found) > 0:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Mandatory columns: " +
                      ", ".join(mandatory_not_found) +
                      " have not been specified",
                      location=IssueLocation(sheet_name=name,
                                             row=r,
                                             column=None)))
            may_append = False

        # Check varying mandatory fields (fields depending on the value of other fields)
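        # (Hypothetical illustration) "c.mandatory" could be a Python boolean expression over other fields
        # of the row, e.g. "scale == 'Total'", evaluated below with the "line" dictionary as local namespace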
        for c in complex_mandatory_cols:
            field_def = c.name  # next(c2 for c2 in col_map if strcmp(c.name, c2.name))
            if isinstance(c.mandatory, str):
                # Evaluate the condition using the values collected in "line" as local variables
                mandatory = eval(c.mandatory, None, line)
                if mandatory and field_def not in line:
                    may_append = False
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Mandatory column: " + field_def +
                              " has not been specified",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))

        if may_append:
            content.append(line)

    return issues, None, {"items": content, "command_name": name}
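A minimal sketch of how the tuple returned above could be consumed. The function name parse_command_in_worksheet and its signature are assumptions; Issue and IType are the classes used throughout this listing:

issues, label, content = parse_command_in_worksheet(sh, name, cmd_name)  # hypothetical entry point
errors = [i for i in issues if i.itype == IType.ERROR]
if not errors:
    for row in content["items"]:
        # "_row", "_expandable" and "_complex" are the per-line flags set by the parser above
        print(row["_row"], row["_expandable"], row["_complex"])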
Example No. 26
        def process_row(row):
            """
            Process a dictionary representing a row of the data input command. The dictionary can come directly from
            the worksheet or from a dataset.

            Implicitly uses "glb_idx"

            :param row: dictionary
            """
            # From "ff_type" extract: flow/fund, external/internal, incoming/outgoing
            # ecosystem/society?
            ft = row["ff_type"].lower()
            if ft == "int_in_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = True
                incoming = True
            elif ft == "int_in_fund":
                roegen_type = FlowFundRoegenType.fund
                internal = True
                incoming = True
            elif ft == "ext_in_fund":
                roegen_type = FlowFundRoegenType.fund
                internal = False
                incoming = True
            elif ft == "int_out_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = True
                incoming = False
            elif ft == "ext_in_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = False
                incoming = True
            elif ft == "ext_out_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = False
                incoming = False
            elif ft == "env_out_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = False
                incoming = False
            elif ft == "env_in_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = False
                incoming = True
            elif ft == "env_in_fund":
                roegen_type = FlowFundRoegenType.fund
                internal = False
                incoming = True

            # Split "taxa" attributes. "scale" corresponds to the observation
            p_attributes = row["taxa"].copy()
            if "scale" in p_attributes:
                other_attrs = create_dictionary()
                other_attrs["scale"] = p_attributes["scale"]
                del p_attributes["scale"]
            else:
                other_attrs = None

            # Check existence of PedigreeMatrix, if used
            if "pedigree_matrix" in row:
                pm = glb_idx.get(
                    PedigreeMatrix.partial_key(name=row["pedigree_matrix"]))
                if len(pm) != 1:
                    issues.append((3, "Could not find Pedigree Matrix '" +
                                   row["pedigree_matrix"] + "'"))
                    del row["pedigree_matrix"]
                else:
                    try:
                        lst = pm[0].get_modes_for_code(row["pedigree"])
                    except:
                        issues.append(
                            (3, "Could not decode Pedigree '" +
                             row["pedigree"] + "' for Pedigree Matrix '" +
                             row["pedigree_matrix"] + "'"))
                        del row["pedigree"]
                        del row["pedigree_matrix"]
            else:
                if "pedigree" in row:
                    issues.append((
                        3,
                        "Pedigree specified without accompanying Pedigree Matrix"
                    ))
                    del row["pedigree"]

            # Source
            if "source" in row:
                try:
                    ast = parser_field_parsers.string_to_ast(
                        parser_field_parsers.reference, row["source"])
                    ref_id = ast["ref_id"]
                    references = glb_idx.get(Reference.partial_key(ref_id),
                                             ref_type="provenance")
                    if len(references) == 1:
                        source = references[0]
                    else:
                        source = row["source"]  # Fall back to the raw text if the reference cannot be resolved
                except:
                    source = row["source"]
            else:
                source = None

            # Geolocation
            if "geolocation" in row:
                try:
                    ast = parser_field_parsers.string_to_ast(
                        parser_field_parsers.reference, row["geolocation"])
                    ref_id = ast["ref_id"]
                    references = glb_idx.get(Reference.partial_key(ref_id),
                                             ref_type="geographic")
                    if len(references) == 1:
                        geolocation = references[0]
                    else:
                        geolocation = row["geolocation"]  # Fall back to the raw text
                except:
                    geolocation = row["geolocation"]
            else:
                geolocation = None

            # CREATE FactorType, A Type of Observable, IF it does not exist
            # AND ADD Quantitative Observation
            p, ft, f, o = create_or_append_quantitative_observation(
                glb_idx,
                factor=row["processor"] + ":" + row["factor"],
                value=row["value"] if "value" in row else None,
                unit=row["unit"],
                observer=source,
                spread=row["uncertainty"] if "uncertainty" in row else None,
                assessment=row["assessment"] if "assessment" in row else None,
                pedigree=row["pedigree"] if "pedigree" in row else None,
                pedigree_template=row["pedigree_matrix"]
                if "pedigree_matrix" in row else None,
                relative_to=row["relative_to"]
                if "relative_to" in row else None,
                time=row["time"] if "time" in row else None,
                geolocation=None,
                comments=row["comments"] if "comments" in row else None,
                tags=None,
                other_attributes=other_attrs,
                proc_aliases=None,
                proc_external=False,  # TODO
                proc_attributes=p_attributes,
                proc_location=None,
                ftype_roegen_type=roegen_type,
                ftype_attributes=None,
                fact_external=not internal,
                fact_incoming=incoming,
                fact_location=geolocation)
            # p_set.append returns True if the processor was not yet a member of the pset; then append its codes
            if p_set.append(p, glb_idx):
                p_set.append_attributes_codes(row["taxa"])
Example No. 27
def parse_indicators_command(sh, area):
    """

    :param sh:
    :param area:
    :return:
    """

    some_error = False
    issues = []
    """
        self._name = name
        self._formula = formula
        self._from_indicator = from_indicator
        self._benchmark = benchmark
        self._indicator_category = indicator_category
    
    """
    # Scan the sheet; the header row cells are matched against these column-name synonyms
    col_names = {
        ("name",): "name",  # Name of the indicator
        ("formula", "expression"): "formula",  # Expression to compute the indicator
        ("benchmark",): "benchmark",  # Once calculated, a frame to qualify the goodness of the indicator
        ("description", "label", "desc"): "description"
    }

    # Check columns
    col_map = {}
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=area[0], column=c).value
        if not col_name:
            continue  # Skip empty header cells
        for k in col_names:
            if col_name.lower() in k:
                col_map[col_names[k]] = c
                break

    # Read each row and build an indicator dictionary
    content = []  # List of indicator dictionaries
    for r in range(area[0] + 1, area[1]):
        indicator = {}
        for k in col_names.values():
            if k not in col_map:
                continue

            value = sh.cell(row=r, column=col_map[k]).value

            if not value:
                continue

            if k == "name":  # Mandatory
                # Check syntax
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.simple_ident, value)
                    indicator[k] = value
                except:
                    some_error = True
                    issues.append(
                        (3, "The name specified for the indicator, '" + value +
                         "', is not valid, in row " + str(r) +
                         ". It must be a simple identifier."))
            elif k == "formula":  # Mandatory
                # Check syntax
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.indicator_expression, value)
                    indicator[k] = value
                except:
                    some_error = True
                    issues.append(
                        (3, "The Formula specified for the indicator, '" +
                         value + "', is not valid, in row " + str(r) + "."))
            elif k == "benchmark":  # Optional
                # This column can appear multiple times.
                # Check syntax
                if value.lower().strip() in ():
                    if value.lower().strip() in ("number", "float"):
                        value = "number"  # "float" --> "number"
                    indicator[k] = value
                else:
                    some_error = True
                    issues.append(
                        (3, "The Type specified for the parameter, '" + value +
                         "', is not valid, in row " + str(r) +
                         ". It must be one of 'category', 'integer', 'number'."
                         ))
            elif k == "description":  # Optional
                indicator[k] = value

        # Check indicator completeness before adding it to the list of indicators
        if "name" not in indicator:
            issues.append((3, "The indicator must have a Name, row " + str(r)))
            continue
        if "formula" not in indicator:
            issues.append(
                (3, "The indicator must have a Formula, row " + str(r)))
            continue

        content.append(indicator)

    return issues, None, content
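A minimal usage sketch for parse_indicators_command, assuming an openpyxl-style worksheet, the (first row, last row, first column, last column) area convention used above, and that the module's own imports (parser_field_parsers, etc.) are available; the sheet contents and the formula syntax are illustrative only:

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.append(["Name", "Formula", "Description"])  # header row (row 1)
ws.append(["food_per_capita", "food / population", "Per-capita food intake"])  # data row (row 2)

# Rows 2..2 and columns 1..3 are scanned, so the exclusive upper bounds are 3 and 4
issues, _, indicators = parse_indicators_command(ws, (1, 3, 1, 4))
for severity, message in issues:
    print(severity, message)
print(indicators)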