    def get_datasets(self,
                     source: Union[IDataSourceManager, str] = None,
                     database=None,
                     local_datasets=None):
        # Register AdHoc datasets
        self.register_local_datasets(local_datasets)

        if source:
            source = self._get_source_manager(source)

        lst = []
        if source:
            if strcmp(source.get_name(), "AdHoc"):
                lst = [(source.get_name(), source.get_datasets(database))]
            else:
                lst = self.get_external_datasets(source, database)
        else:  # ALL DATASETS
            lst_total = []
            lst = self.get_external_datasets(
                source, database
            )  # Because "get_external_datasets" uses "Memoize", DO NOT modify "lst" outside
            lst_total.extend(lst)
            for s in self.registry:
                if strcmp(s, "AdHoc") and local_datasets:
                    lst_total.append(
                        (s, [ds for ds in self.registry[s].get_datasets()]))
            lst = lst_total

        # Unregister AdHoc datasets
        self.unregister_local_datasets(local_datasets)

        return lst
def get_interface_type(attribute, value, prd: PartialRetrievalDictionary = None):
    """
    Obtain the name of an InterfaceType given the value of an attribute
    (Obtain the registry of objects)

    :param attribute:
    :param value:
    :param prd: A PartialRetrievalDictionary, passed in State "_glb_idx" to the AST evaluator
    :return:
    """

    if not prd:
        raise Exception(f"No Global-Index parameter passed to InterfaceType function")
    else:
        # Obtain ALL InterfaceTypes, then ONE having attribute "attribute" with value <value>
        its = prd.get(FactorType.partial_key())
        ret = None
        for it in its:
            v = vars(it).get(attribute)
            if not v:
                v = it.attributes.get(attribute)
            if v and (strcmp(v, str(value)) or (is_float(value) and float(v) == float(value))):
                ret = it.name
                break
        if ret:
            return ret
        else:
            raise Exception(f"No InterfaceType found having attribute '{attribute}' with value '{value}'")
    def get_external_datasets(self,
                              source: Union[IDataSourceManager, str] = None,
                              database=None):
        """
        Obtain a list of tuples (Source, Dataset name)

        :param source: If specified, the name of the source
        :param database: If specified, the name of a database in the source
        :return: List of tuples (Source name, Dataset name)
        """

        if source:
            source = self._get_source_manager(source)

        if source:
            if database:  # SOURCE+DATABASE DATASETS
                return [(source.get_name(), source.get_datasets(database))]
            else:  # ALL SOURCE DATASETS
                lst = []
                for db in source.get_databases():
                    lst.extend(source.get_datasets(db))
                return [(source.get_name(), lst)
                        ]  # List of tuples (dataset code, description, urn)
        else:  # ALL DATASETS
            lst = []
            for s in self.registry:
                if not strcmp(s, "AdHoc"):
                    lst.append(
                        (s, [ds for ds in self.registry[s].get_datasets()]))
            return lst  # List of tuples (source, dataset code, description, urn)
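# Illustrative only: the value returned by "get_external_datasets" is a list of
# (source name, list of dataset descriptors) tuples. The source and dataset
# names below are made up for this sketch; they are not real registry entries.
example_result = [
    ("SourceA", [("ds_code_1", "First dataset", "urn:example:1")]),
    ("SourceB", [("ds_code_2", "Second dataset", "urn:example:2")]),
]
for source_name, dataset_list in example_result:
    for dataset in dataset_list:
        print(source_name, dataset)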
def get_processor(attribute, value, prd: PartialRetrievalDictionary = None):
    """
    Obtain the name of a Processor given the value of an attribute
    (Obtain the registry of objects)

    :param attribute:
    :param value:
    :param prd: A PartialRetrievalDictionary, passed in State "_glb_idx" to the AST evaluator
    :return:
    """

    if not prd:
        raise Exception(f"No Global-Index parameter passed to Processor function")
    else:
        # Obtain ALL Processors, then ONE having attribute "attribute" with value <value>
        procs = prd.get(Processor.partial_key())
        ret = None
        for proc in procs:
            v = vars(proc).get(attribute)
            if not v:
                v = proc.attributes.get(attribute)
            if v and (strcmp(v, str(value)) or (is_float(value) and float(v) == float(value))):
                ret = proc.name
                break
        if ret:
            return ret
        else:
            raise Exception(f"No Processor found having attribute '{attribute}' with value '{value}'")
Example #5
    def _constrains_interface(self, scale: str, invoking_interface_name: str,
                              requested_interface_name: str,
                              parent_processor: Processor,
                              child_processor: Processor):

        origin_factor = first(
            parent_processor.factors,
            lambda i: strcmp(i.name, invoking_interface_name))

        if not origin_factor:
            raise Exception("Invoking interface name '" +
                            invoking_interface_name +
                            "' not found for processor '" +
                            parent_processor.name + "'")

        destination_factor = first(
            child_processor.factors,
            lambda i: strcmp(i.name, requested_interface_name))

        if not destination_factor:
            raise Exception("Requested interface name '" +
                            requested_interface_name +
                            "' not found for processor '" +
                            child_processor.name + "'")

        if origin_factor.taxon != destination_factor.taxon:
            # Search for an Interface Type Conversion defined in the ScaleChangeMap command
            interface_types_transform = self._get_interface_types_transform(
                origin_factor.taxon, parent_processor,
                destination_factor.taxon, child_processor)
            scale = FloatOrString.multiply(
                scale, interface_types_transform.scaled_weight)

        relationship = FactorsRelationScaleObservation.create_and_append(
            origin=origin_factor,
            destination=destination_factor,
            observer=None,
            quantity=scale)

        # relationship = ProcessorsRelationUpscaleObservation.create_and_append(parent=parent_processor,
        #                                                                       child=child_processor,
        #                                                                       observer=None,
        #                                                                       factor_name=interface_name,
        #                                                                       quantity=scale)

        self._glb_idx.put(relationship.key(), relationship)
Example #6
    def _scale_observations_relative_to_interface(self, processor: Processor,
                                                  interface_name: str,
                                                  scale: Union[int, float]):
        for factor in processor.factors:
            for observation in factor.quantitative_observations:
                relative_to_interface = observation.attributes.get(
                    "relative_to", None)
                if relative_to_interface and strcmp(relative_to_interface.name,
                                                    interface_name):
                    observation.value = float(observation.value) * scale
                    observation.attributes["relative_to"] = None
    def _check_flow_orientation(self, source_processor: Processor,
                                target_processor: Processor,
                                source_interface: Factor,
                                target_interface: Factor,
                                is_direct_flow: bool):
        """Check for correct interfaces orientation (input/output) of source and target"""
        allowed_source_orientation = ("Output" if is_direct_flow else "Input")

        # Are the orientations equal?
        if strcmp(source_interface.orientation, target_interface.orientation):
            if strcmp(source_interface.orientation,
                      allowed_source_orientation):
                # Target processor should be parent of source processor
                parent_processor, child_processor = target_processor, source_processor
            else:
                # Source processor should be parent of target processor
                parent_processor, child_processor = source_processor, target_processor

            if child_processor not in parent_processor.children(self._glb_idx):
                raise CommandExecutionError(
                    f"The processor '{child_processor.name}' should be part of the "
                    f"processor '{parent_processor.name}' when using the same interface "
                    f"orientation '{source_interface.orientation}'.")

        else:  # Orientations are different
            if not strcmp(source_interface.orientation,
                          allowed_source_orientation):
                raise CommandExecutionError(
                    f"The source interface '{source_interface.full_name}' has the wrong "
                    f"orientation '{source_interface.orientation}'.")

            if strcmp(target_interface.orientation,
                      allowed_source_orientation):
                raise CommandExecutionError(
                    f"The target interface '{target_interface.full_name}' has the wrong "
                    f"orientation '{target_interface.orientation}'.")
Example #8
def get_parameters_in_state(state: State):
    res = []
    query = BasicQuery(state)
    for p in query.execute([Parameter], filt=""):
        p_name = p.name
        p_type = p.type
        if p.range:
            if strcmp(p_type, "Number"):
                p_range = p.range
            else:
                glb_idx, _, _, _, _ = get_case_study_registry_objects(state)
                h = glb_idx.get(Hierarchy.partial_key(p.range))
                h = h[0]
                p_range = ', '.join(h.codes.keys())
        else:
            p_range = ""
        res.append(dict(name=p_name, type=p_type, range=p_range))
    return res
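# Illustrative only: each entry returned by "get_parameters_in_state" is a plain
# dict with the keys built above; the parameter names and ranges are made up.
example_parameters = [
    {"name": "NumberParam", "type": "Number", "range": "[0, 100]"},
    {"name": "CategoryParam", "type": "Category", "range": "code1, code2"},
]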
    def _process_row(self, fields: Dict[str, Any], subrow=None) -> None:
        """
        Create and register Indicator object

        :param fields:
        """
        benchmark_names = fields["benchmarks"]

        benchmarks = []
        if benchmark_names:
            for benchmark_name in benchmark_names.split(","):
                if benchmark_name:
                    benchmark = self._glb_idx.get(Benchmark.partial_key(benchmark_name))
                    if len(benchmark) == 1:
                        benchmark = benchmark[0]
                    elif len(benchmark) == 0:
                        self._add_issue(IType.ERROR,
                                        f"Benchmark {benchmark_name} does not exist (it must be declared previously in a "
                                        "ScalarBenchmark command worksheet")
                        return
                    elif len(benchmark) > 1:
                        self._add_issue(IType.ERROR,
                                        f"Benchmark {benchmark_name} exists {len(benchmark)} times."
                                        " Only one occurrence is allowed.")
                        return
                else:
                    benchmark = None
                if benchmark:
                    benchmarks.append(benchmark)

        indicator = Indicator(fields["indicator_name"],
                              fields["formula"],
                              None,
                              fields.get("processors_selector"),
                              benchmarks,
                              IndicatorCategories.factors_expression if strcmp(fields.get("local"), "Yes")
                              else IndicatorCategories.case_study,
                              fields.get("description"),
                              fields["indicators_group"],
                              fields["unit"],
                              fields["unit_label"],
                              fields["source"])
        self._glb_idx.put(indicator.key(), indicator)
Example #10
    def _get_factor_type_from_field(
            self, hierarchy_field_name: str,
            interface_type_field_name: str) -> FactorType:
        interface_type_name = self._fields[interface_type_field_name]
        if not interface_type_name:
            raise CommandExecutionError(
                f"The field '{interface_type_field_name}' has not been specified"
            )

        # Check if FactorType exists
        interface_types = self._glb_idx.get(
            FactorType.partial_key(interface_type_name))

        if len(interface_types) == 1:
            return interface_types[0]
        elif len(interface_types) == 0:
            raise CommandExecutionError(
                f"The interface type '{interface_type_name}' has not been found"
            )
        else:
            hierarchy_name = self._fields[hierarchy_field_name]
            if not hierarchy_name:
                raise CommandExecutionError(
                    f"The field '{hierarchy_field_name}' has not been specified and "
                    f"the interface type '{interface_type_name}' is not unique"
                )

            interface_type = first(
                interface_types,
                lambda t: strcmp(t.hierarchy.name, hierarchy_name))
            if not interface_type:
                raise CommandExecutionError(
                    f"The interface type '{interface_type_name}' has not been found in "
                    f"hierarchy '{hierarchy_name}'")

            return interface_type
def parse_etl_external_dataset_command(sh: Worksheet, area: AreaTupleType,
                                       dataset_name: str,
                                       state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param dataset_name: Name of the dataset
    :param state: Current case study State
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)
    # Dataset source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)

    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)

    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = [k.lower()
                     for k in dims[d].code_list.keys()]  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True
    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            tmp = [
                to["d"] for o in mappings[m].map for to in o["to"] if to["d"]
            ]
            cl[mappings[m].destination] = set(
                tmp)  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    measures = []
    out_dims = []
    agg_funcs = []
    measures_as = []
    filter_ = {
    }  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue

        if col_name.lower().strip() in [
                "dimensions_kept", "dims", "dimensions"
        ]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for d in lst:
                if not d:
                    continue
                if d not in cl:
                    issues.append((
                        3, "The dimension specified for output, '" + d +
                        "' is neither a dataset dimension nor a mapped dimension. ["
                        + ', '.join([d2 for d2 in cl]) + "]"))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in [
                "aggregation_function", "aggfunc", "agg_func"
        ]:  # "SELECT AGGREGATORS"
            lst = obtain_column(c, area[0] + 1, area[1])
            for f in lst:
                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append((
                        3, "The specified aggregation function, '" + f +
                        "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'"
                    ))
                else:
                    agg_funcs.append(f)
        elif col_name.lower().strip() in ["measures"]:  # "SELECT"
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for m in lst:
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        (3, "The specified measure, '" + m +
                         "' is not a measure available in the dataset. [" +
                         ', '.join([m2 for m2 in meas]) + "]"))
                else:
                    measures.append(m)
        elif col_name.lower().strip() in ["measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for m in lst:
                measures_as.append(m)
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for cd in lst:
                if not cd:
                    continue
                if str(cd).lower() not in cl[col_name]:
                    issues.append((
                        3, "The code '" + cd +
                        "' is not present in the codes declared for dimension '"
                        + col_name + "'. Please, check them."))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "endperiod"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[
                    0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except:
                    issues.append((3, "Column '" + col_name +
                                   "' has an invalid dataset name '" +
                                   result_name + "'"))

    if len(measures) == 0:
        issues.append((3, "At least one measure should be specified"))

    if len(agg_funcs) == 0:
        issues.append(
            (2, "No aggregation function specified. Assuming 'average'"))
        agg_funcs.append("average")

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            (2, "No result name specified. Assuming '" + result_name + "'"))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": None,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
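# Illustrative only: a possible "content" dict returned by this parser, for a
# made-up dataset; the keys match the dict built just above, the values do not
# come from any real worksheet.
example_content = {
    "dataset_source": "SourceA",
    "dataset_name": "example_dataset",
    "dataset_datetime": None,
    "where": {"GEO": ["ES", "PT"], "StartPeriod": "2010"},
    "dimensions": ["GEO", "TIME_PERIOD"],
    "group_by": ["GEO"],
    "measures": ["OBS_VALUE"],
    "agg_funcs": ["sum"],
    "measures_as": ["Total"],
    "result_name": "SourceA_example_dataset",
}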
Example #12
def prepare_model(state) -> NoReturn:
    """
    Modify the state so that:
    * Implicit references of Interfaces to subcontexts are materialized
      * Creating processors
      * Creating interfaces in these processors
      * Creating relationships in these processors

    :param state:
    """

    # TODO: currently when an interface is defined as a Scale from two or more interfaces, the computed values are
    #  added while the intuition tells us that only one scale should be defined. We have to give a warning message
    #  if this situation happens.

    # Registry and the other objects also
    glb_idx, _, _, _, _ = get_case_study_registry_objects(state)
    # Prepare a Query to obtain ALL interfaces
    query = BasicQuery(state)
    filt = {}
    objs = query.execute([Factor], filt)
    for iface in objs[Factor]:  # type: Factor
        if strcmp(
                iface.processor.instance_or_archetype, 'Archetype') or strcmp(
                    iface.processor.instance_or_archetype, 'No'):
            continue

        # If the Interface is connected to a "Subcontext" different than the owning Processor
        if iface.opposite_processor_type:
            if iface.opposite_processor_type.lower(
            ) != iface.processor.subsystem_type.lower():
                # Check if the interface has flow relationships
                # TODO An alternative is to search "observations" of type FactorsRelationDirectedFlowObservation
                #      in the same "iface"

                if iface.orientation.lower() == "input":
                    parameter = {"target": iface}
                else:
                    parameter = {"source": iface}

                relations = glb_idx.get(
                    FactorsRelationDirectedFlowObservation.partial_key(
                        **parameter))

                # If it does not have flow relationships:
                #  * define default Processor name and retrieve it (or if it does not exist, create it)
                #  * create an Interface into that Processor and a Flow Relationship
                if len(relations) == 0:
                    # Define the name of a Processor in the same context but in different subcontext
                    p_name = iface.processor.processor_system + "_" + iface.opposite_processor_type
                    p = glb_idx.get(Processor.partial_key(p_name))
                    if len(p) == 0:
                        attributes = {
                            'subsystem_type': iface.opposite_processor_type,
                            'processor_system':
                            iface.processor.processor_system,
                            'functional_or_structural': 'Functional',
                            'instance_or_archetype': 'Instance'
                            # 'stock': None
                        }

                        p = Processor(p_name, attributes=attributes)
                        glb_idx.put(p.key(), p)
                    else:
                        p = p[0]

                    attributes = {
                        'sphere':
                        'Technosphere'
                        if iface.opposite_processor_type.lower()
                        in ["local", "external"] else 'Biosphere',
                        'roegen_type':
                        iface.roegen_type,
                        'orientation':
                        "Input"
                        if iface.orientation.lower() == "output" else "Output",
                        'opposite_processor_type':
                        iface.processor.subsystem_type
                    }

                    # Create Interface (if it does not exist); reuse the existing one otherwise
                    f = p.factors_find(iface.taxon.name)
                    if not f:
                        f = Factor.create_and_append(
                            name=iface.taxon.name,
                            processor=p,
                            in_processor_type=FactorInProcessorType(
                                external=False,
                                incoming=iface.orientation.lower() ==
                                "output"),
                            attributes=attributes,
                            taxon=iface.taxon)

                        glb_idx.put(f.key(), f)

                    # Create Flow Relationship
                    if iface.orientation.lower() == "output":
                        source = iface
                        target = f
                    else:
                        source = f
                        target = iface

                    fr = FactorsRelationDirectedFlowObservation.create_and_append(
                        source=source, target=target, observer=None)
                    glb_idx.put(fr.key(), fr)
Example #13
    def _process_row(self, field_values: Dict[str, Any], subrow=None) -> None:
        """
        Process a dictionary representing a row of the InterfaceTypes command. The dictionary can come directly from
        the worksheet or from a dataset.

        :param field_values: dictionary
        """

        # Read variables
        ft_h_name = field_values.get(
            "interface_type_hierarchy",
            "_default")  # "_default" InterfaceType Hierarchy NAME <<<<<<
        ft_name = field_values.get("interface_type")
        ft_sphere = field_values.get("sphere")
        ft_roegen_type = field_values.get("roegen_type")
        ft_parent = field_values.get("parent_interface_type")
        ft_formula = field_values.get("formula")
        ft_description = field_values.get("description")
        ft_unit = field_values.get("unit")
        ft_opposite_processor_type = field_values.get(
            "opposite_processor_type")
        ft_level = field_values.get("level")
        ft_attributes = field_values.get("attributes", {})
        if ft_attributes:
            try:
                ft_attributes = dictionary_from_key_value_list(
                    ft_attributes, self._glb_idx)
            except Exception as e:
                self._add_issue(IType.ERROR,
                                str(e) + subrow_issue_message(subrow))
                return
        else:
            ft_attributes = {}

        # Process
        # Mandatory fields
        if not ft_h_name:
            self._add_issue(
                IType.WARNING,
                "Empty interface type hierarchy name. It is recommended to specify one, assuming '_default'."
                + subrow_issue_message(subrow))
            ft_h_name = "_default"

        if not ft_name:
            self._add_issue(
                IType.ERROR, "Empty interface type name. Skipped." +
                subrow_issue_message(subrow))
            return

        # Check if a hierarchy of interface types by the name <ft_h_name> exists, if not, create it and register it
        hie = self._glb_idx.get(Hierarchy.partial_key(name=ft_h_name))
        if not hie:
            hie = Hierarchy(name=ft_h_name, type_name="interfacetype")
            self._glb_idx.put(hie.key(), hie)
        else:
            hie = hie[0]

        # If parent defined, check if it exists
        # (it must be registered both in the global registry AND in the hierarchy)
        if ft_parent:
            parent = self._glb_idx.get(FactorType.partial_key(ft_parent))
            if len(parent) > 0:
                for p in parent:
                    if p.hierarchy == hie:
                        parent = p
                        break
                if not isinstance(parent, FactorType):
                    self._add_issue(
                        IType.ERROR,
                        f"Parent interface type name '{ft_parent}' not found in hierarchy '{ft_h_name}"
                        + subrow_issue_message(subrow))
                    return
            else:
                self._add_issue(
                    IType.ERROR,
                    f"Parent interface type name '{ft_parent}' not found" +
                    subrow_issue_message(subrow))
                return
            # Double check, it must be defined in "hie"
            if ft_parent not in hie.codes:
                self._add_issue(
                    IType.ERROR,
                    f"Parent interface type name '{ft_parent}' not registered in the hierarchy '{ft_h_name}'"
                    + subrow_issue_message(subrow))
                return
        else:
            parent = None

        # Check if FactorType exists
        ft = self._glb_idx.get(FactorType.partial_key(ft_name))
        if len(ft) == 0:
            # TODO Compile and CONSIDER attributes (on the FactorType side)
            roegen_type = None
            if ft_roegen_type:
                roegen_type = FlowFundRoegenType.flow if strcmp(
                    ft_roegen_type, "flow") else FlowFundRoegenType.fund

            ft = FactorType(
                ft_name,
                parent=parent,
                hierarchy=hie,
                roegen_type=roegen_type,
                tags=None,  # No tags
                attributes=dict(unit=ft_unit,
                                description=ft_description,
                                level=ft_level,
                                **ft_attributes),
                expression=ft_formula,
                sphere=ft_sphere,
                opposite_processor_type=ft_opposite_processor_type)
            # Simple name
            self._glb_idx.put(FactorType.partial_key(ft_name, ft.ident), ft)
            if not strcmp(ft_name, ft.full_hierarchy_name()):
                self._glb_idx.put(
                    FactorType.partial_key(ft.full_hierarchy_name(), ft.ident),
                    ft)
        else:
            self._add_issue(
                IType.WARNING,
                f"Interface type name '{ft_name}' already registered" +
                subrow_issue_message(subrow))
            return
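# Illustrative only: a possible "field_values" row consumed by this method, with
# made-up values; the keys are the ones read via field_values.get(...) above.
example_field_values = {
    "interface_type_hierarchy": "_default",
    "interface_type": "WaterWithdrawal",
    "sphere": "Biosphere",
    "roegen_type": "Flow",
    "parent_interface_type": None,
    "formula": None,
    "description": "Water withdrawn from the environment",
    "unit": "m3",
    "opposite_processor_type": "Environment",
    "level": None,
    "attributes": None,
}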
Example #14
    def execute(self, state: "State"):
        """
        Create a Hierarchy of Taxon. The exact form of this hierarchy is different depending on the concept:
        * FactorTypes and Categories use Hierarchies, which are intrinsic.
            The hierarchy name is passed to the containing Hierarchy object
        * Processors use Part-Of Relations. In this case, the hierarchy name is lost
        Names of Processor and FactorTypes are built both in hierarchical and simple form
        The hierarchical is all the ancestors from root down to the current node, separated by "."
        The simple name is just the current node. If there is already another concept with that name, the simple name
        is not stored (STORE BOTH CONCEPTS by the same name, and design some tie breaking mechanism??)
        """
        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        # Process parsed information
        for item in self._content["items"]:
            r = item["_row"]
            # HierarchySource (Optional)
            hsource = item.get("source",
                               None)  # Code of entity defining the Hierarchy
            if hsource:
                tmp = hsource
                hsource = glb_idx.get(
                    HierarchySource.partial_key(name=hsource))
                if len(hsource) == 0:
                    hsource = HierarchySource(name=tmp)
                    glb_idx.put(hsource.key(), hsource)
                else:
                    hsource = hsource[0]

            hname = item.get("hierarchy_name", None)
            if not hname:
                issues.append(
                    Issue(
                        itype=IType.ERROR,
                        description=
                        "The name of the Hierarchy has not been defined. Skipped.",
                        location=IssueLocation(sheet_name=name,
                                               row=r,
                                               column=None)))
                continue

            # HierarchyGroup (equivalent to Hierarchy of Code Lists, HCL)
            hg = item.get("hierarchy_group", None)
            if hg:
                is_code_list = False  # Hierarchy group
            else:
                is_code_list = True  # Hierarchy group for the Code List, with the same name
                hg = hname

            # Check if the HierarchyGroup is previously defined. YES, use it; NO, create new HierarchyGroup
            tmp = hg
            hg = glb_idx.get(HierarchyGroup.partial_key(name=hg))
            if len(hg) == 0:
                hg = HierarchyGroup(name=tmp, source=hsource)
                glb_idx.put(hg.key(), hg)
            else:
                hg = hg[0]

            # Check if the Hierarchy is defined. YES, get it; NO, create it
            tmp = hname
            h = glb_idx.get(Hierarchy.partial_key(name=hname))
            if len(h) == 0:
                h = Hierarchy(name=tmp)
                glb_idx.put(h.key(), h)
                glb_idx.put(h.key(hg.name + "." + h.name),
                            h)  # Register with alternative (full) name
            else:
                h = h[0]

            # Add the Hierarchy to the HierarchyGroup (if not)
            if h not in hg.hierarchies:
                hg.hierarchies.append(h)

            # Level
            level = item.get("level", None)
            if level:
                # Check if the level is defined. YES, get it; NO, create it
                for l in h.levels:
                    if strcmp(l.name, level):
                        level = l
                        break
                else:
                    level = HierarchyLevel(name=level, hierarchy=h)
                    h.levels.append(level)

            code = item.get("code", None)
            label = item.get("label", None)
            description = item.get("description", None)
            attributes = item.get("attributes", None)
            expression = item.get("expression", None)

            # Parent property (what really defines Hierarchies)
            parent_code = item.get("parent_code", None)
            if parent_code:
                ph = h  # Parent Hierarchy is the same as current hierarchy
                pcode = ph.codes.get(parent_code, None)
                if not pcode:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Could not find code '" +
                              parent_code + "' in hierarchy '" + ph.name +
                              "'. Skipped.",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
            else:
                pcode = None

            # ReferredHierarchy. If we are not defining a Code List, the base hierarchy has to be mentioned
            if not is_code_list:
                ref_hierarchy = item.get("referred_hierarchy", None)
                if not ref_hierarchy:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            "For HCLs, defining ReferredHierarchy is mandatory",
                            location=IssueLocation(sheet_name=name,
                                                   row=r,
                                                   column=None)))
                    continue

                tmp = ref_hierarchy
                ref_hierarchy = glb_idx.get(
                    Hierarchy.partial_key(name=ref_hierarchy))
                if len(ref_hierarchy) == 0:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="ReferredHierarchy '" + tmp +
                              "' not defined previously",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                else:
                    ref_hierarchy = ref_hierarchy[0]

                ref_code = ref_hierarchy.codes.get(code, None)
                if not ref_code:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Code '" + code +
                              "' not found in referred hierarchy '" +
                              ref_hierarchy.name + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue

                # Ignore: LABEL, DESCRIPTION. Copy them from referred code
                label = ref_code.label
                description = ref_code.description
            else:
                ref_code = None

            c = h.codes.get(code, None)
            if c:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="Code '" + code + "' in hierarchy '" +
                          h.name + "' redefined.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                continue

            # Finally, create the HierarchyCode with all the gathered attributes, then weave it to other
            # (name, label=None, description=None, referred_node=None, parent=None, parent_weight=1.0, hierarchy=None)
            c = Taxon(name=code,
                      hierarchy=h,
                      level=level,
                      referred_taxon=ref_code,
                      parent=pcode,
                      label=label,
                      description=description,
                      attributes=attributes,
                      expression=expression)
            # Add code to hierarchy
            h.codes[code] = c
            if not c.parent:
                h.roots_append(c)
            # Add code to level
            if level:
                level.codes.add(c)
            # Add child to parent code
            # (DONE BY THE CONSTRUCTOR!!)
            # if pcode:
            #     pcode.children_codes.append(c)

        return issues, None  # Issues, Output
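# Illustrative only: a possible parsed "item" consumed by the loop above, with
# made-up values; the keys are the ones read via item.get(...) in this method.
example_item = {
    "_row": 2,
    "source": None,
    "hierarchy_name": "LandUse",
    "hierarchy_group": None,   # Empty, so the hierarchy is treated as a code list
    "level": "1",
    "code": "Cropland",
    "label": "Cropland",
    "description": None,
    "attributes": None,
    "expression": None,
    "parent_code": None,
    "referred_hierarchy": None,
}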
Example #15
def parse_scale_conversion_command(sh: Worksheet, area: AreaTupleType, name: str = None) -> IssuesLabelContentTripleType:
    """
    Analyze the input area
    Obtain the numerical part
    Read a row above and a column to the left, looking for source (left col) and target (row above) factor types

    FactorTypes do not need to exist previously, they can be created

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :return: list of issues (issue_type, message), command label, command content
    """

    def get_subrow(r, c1, c2):
        lst = []
        # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell
        previous = None
        for c in range(c1, c2):
            v = sh.cell(row=r, column=c).value
            if not v:
                if previous:
                    lst.append(previous)
                else:
                    lst.append("")
            else:
                previous = v
                lst.append(v)

        return lst

    def get_subcolumn(c, r1, r2):
        lst = []
        # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell
        # !!! This may not be correct at all times: when a cell is intentionally left blank
        # To solve this, use "sh.merged_cell_ranges" to check if the current cell (r, c) is inside a range
        previous = None
        for r in range(r1, r2):
            v = sh.cell(row=r, column=c).value
            if not v:
                if previous:
                    lst.append(previous)
                else:
                    lst.append("")
            else:
                previous = v
                lst.append(v)
        return lst

    # ---------------------------------------------

    some_error = False
    issues = []

    # Detect the matrix defining scales
    m = binary_mask_from_worksheet(sh, True)  # "True" is to focus on cells containing numbers
    # Locate the matrix with numbers. Assume this defines the labels to consider, they will be around the matrix
    t = obtain_rectangular_submatrices(m)[0]  # Take just the first tuple: U=t[0], D=t[1], L=t[2], R=t[3]
    t = (t[0]+1, t[1]+1, t[2]+1, t[3]+1)  # The previous calculation is done using Numpy, so it is Zero based. Correct this

    # Obtain the factor type names in the subrow on top of the matrix
    subrow = get_subrow(t[0]-1, t[2], t[3])
    # Obtain the factor type names in the subcolumn to the left of the matrix
    subcol = get_subcolumn(t[2]-1, t[0], t[1])

    # Check that we have valid factor type names
    for ft in subrow+subcol:
        try:
            parser_field_parsers.string_to_ast(parser_field_parsers.simple_h_name, ft)
        except:
            some_error = True
            issues.append((3, "'"+ft+"' is not a valid Factor Type name"))
    if some_error:
        return issues, None, None

    # Scan the matrix, creating scale records
    scales = []
    for i, r in enumerate(range(t[0], t[1])):
        for j, c in enumerate(range(t[2], t[3])):
            v = sh.cell(row=r, column=c).value
            if v:
                if not isinstance(v, str):
                    v = str(v)
                # Origin factor
                origin = subcol[i]
                # Destination factor
                destination = subrow[j]
                if strcmp(origin, destination):
                    issues.append((3, "A change of scale to the same factor type ("+origin+") is not allowed"))
                else:
                    try:
                        parser_field_parsers.string_to_ast(parser_field_parsers.expression_with_parameters, v)
                        # Add the scale
                        scales.append(dict(origin=origin, destination=destination, scale=v))
                    except:
                        issues.append((3, "The expression '"+v+"' at the intersection of factor types " + origin + " and " + destination + " is syntactically incorrect"))

    content = {"origin_factor_types": subcol,
               "destination_factor_types": subrow,
               "scales": scales
               }

    return issues, None, content
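# Standalone sketch of the "carry the previous value forward" rule used by
# get_subrow/get_subcolumn above to cope with merged cells; the function name
# is made up and it works on a plain list instead of worksheet cells.
def carry_forward(values):
    out, previous = [], None
    for v in values:
        if not v:
            out.append(previous if previous else "")
        else:
            previous = v
            out.append(v)
    return out


assert carry_forward(["A", "", "", "B", ""]) == ["A", "A", "A", "B", "B"]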
Example #16
    def _process_row(self, fields: Dict[str, Any], subrow=None) -> None:
        scaling_type = fields["scaling_type"]
        scale: str = fields["scale"]

        # Find processors
        invoking_processor = self._get_processor_from_field(
            "invoking_processor")
        requested_processor = self._get_processor_from_field(
            "requested_processor")

        if invoking_processor == requested_processor:
            raise CommandExecutionError(
                f"Invoking and Requested processors cannot be the same '{invoking_processor.name}'. "
                f"Use the 'relative_to' attribute in 'Interfaces' command instead."
            )

        invoking_interface_name: str = fields["invoking_interface"]
        requested_interface_name: str = fields["requested_interface"]

        requested_new_processor_name: str = fields["new_processor_name"]

        ##
        # Transform text of "attributes" into a dictionary
        if fields.get("attributes"):
            try:
                fields["attributes"] = dictionary_from_key_value_list(
                    fields["attributes"], self._glb_idx)
            except Exception as e:
                self._add_issue(IType.ERROR,
                                str(e) + subrow_issue_message(subrow))
                return
        else:
            fields["attributes"] = {}

        # Process specific fields

        # Obtain the parent: it must exist. It could be created dynamically but it's important to specify attributes
        if fields.get("parent_processor"):
            try:
                parent_processor = self._get_processor_from_field(
                    "parent_processor")
            except CommandExecutionError:
                self._add_issue(
                    IType.ERROR,
                    f"Specified parent processor, '{fields.get('parent_processor')}', does not exist"
                    + subrow_issue_message(subrow))
                return
        else:
            parent_processor = None

        # Get internal and user-defined attributes in one dictionary
        attributes = {
            c.name: fields[c.name]
            for c in self._command_fields
            if c.attribute_of == Processor and fields[c.name] is not None
        }

        # print(f"Invoking: {invoking_processor.name}:{invoking_interface_name}, Requested: {requested_processor.name}:{requested_interface_name}")

        requested_processor_clone = None
        if strcmp(scaling_type, "CloneAndScale") or strcmp(
                scaling_type, "Clone"):
            # TODO: check “RequestedProcessor” must be an archetype
            # 1. Clones “RequestedProcessor” as a child of “InvokingProcessor”
            requested_processor_clone = self._clone_processor_as_child(
                processor=requested_processor,
                parent_processor=invoking_processor
                if not parent_processor else parent_processor,
                name=requested_new_processor_name,
                other_attributes=attributes)

            if strcmp(scaling_type, "CloneAndScale"):
                # 2. Constrains the value of “RequestedInterface” to the value of “InvokingInterface”, scaled by “Scale”
                try:
                    self._constrains_interface(
                        scale=scale,
                        invoking_interface_name=invoking_interface_name,
                        requested_interface_name=requested_interface_name,
                        parent_processor=invoking_processor,
                        child_processor=requested_processor_clone)
                except Exception as e:
                    self._add_issue(IType.ERROR,
                                    str(e) + subrow_issue_message(subrow))
                    return
        elif strcmp(scaling_type, "Scale"):
            # Processors must be of same type (archetype or instance)
            if not strcmp(invoking_processor.instance_or_archetype,
                          requested_processor.instance_or_archetype):
                raise CommandExecutionError(
                    "Requested and invoking processors should be of the same type "
                    "(both instance or_archetype)")

            # 1. Constrains the value of “RequestedInterface” to the value of “InvokingInterface”, scaled by “Scale”
            try:
                self._constrains_interface(
                    scale=scale,
                    invoking_interface_name=invoking_interface_name,
                    requested_interface_name=requested_interface_name,
                    parent_processor=invoking_processor,
                    child_processor=requested_processor)
            except Exception as e:
                self._add_issue(IType.ERROR,
                                str(e) + subrow_issue_message(subrow))
                return

        elif strcmp(scaling_type, "CloneScaled"):
            # “RequestedProcessor” must be an archetype
            # if not strcmp(requested_processor.instance_or_archetype, "archetype"):
            #     raise CommandExecutionError(f"Requested processor '{requested_processor.name}' should be of type 'archetype'")

            # “InvokingProcessor” must be an instance
            # if not strcmp(invoking_processor.instance_or_archetype, "instance"):
            #     raise CommandExecutionError(f"Invoking processor '{invoking_processor.name}' should be of type 'instance'")

            # 1. Clones “RequestedProcessor” as a child of “InvokingProcessor”
            # 2. Scales the new processor using “Scale” as the value of “RequestedInterface”
            requested_processor_clone = self._clone_processor_as_child(
                processor=requested_processor,
                parent_processor=invoking_processor
                if not parent_processor else parent_processor,
                other_attributes=attributes)

            # Value Scale, which can be an expression, should be evaluated (ast) because we need a final float number
            scale_value = self._get_scale_value(scale)

            # In the cloned processor search in all interfaces if there are Observations relative_to RequestedInterface
            # and multiply the observation by the computed scale.
            self._scale_observations_relative_to_interface(
                processor=requested_processor_clone,
                interface_name=requested_interface_name,
                scale=scale_value)

        if requested_processor_clone:
            # Find or create processor and REGISTER it in "glb_idx"
            # Add to ProcessorsGroup, if specified
            field_val = fields.get("processor_group")
            if field_val:
                p_set = self._p_sets.get(field_val, ProcessorsSet(field_val))
                self._p_sets[field_val] = p_set
                if p_set.append(
                        requested_processor_clone, self._glb_idx
                ):  # Appends codes to the pset if the processor was not member of the pset
                    p_set.append_attributes_codes(fields["attributes"])
Example #17
    def execute(self, state: "State"):
        """
        First bring the data considering the filter
        Second, group, third aggregate
        Finally, store the result in State
        """
        issues = []
        # Obtain global variables in state
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)

        # DS Source + DS Name
        source = self._content["dataset_source"]
        dataset_name = self._content["dataset_name"]
        dataset_datetime = self._content["dataset_datetime"]

        # Result name
        result_name = self._content["result_name"]
        if result_name in datasets or state.get(result_name):
            issues.append((2, "A dataset called '" + result_name +
                           "' is already stored in the registry of datasets"))

        # Dataset metadata
        dims, attrs, measures = obtain_dataset_metadata(
            dataset_name, source, datasets)

        # Obtain filter parameters
        params = create_dictionary(
        )  # Native dimension name to list of values the filter will allow to pass
        joined_dimensions = []
        for dim in self._content["where"]:
            lst = self._content["where"][dim]
            native_dim = None
            if dim.lower() in [
                    "startperiod", "starttime", "endperiod", "endtime"
            ]:
                native_dim = dim
                lst = [lst]
            elif dim not in dims:
                # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR
                for m in mappings:
                    if strcmp(mappings[m].destination, dim) and \
                            strcmp(mappings[m].source, source) and \
                            strcmp(mappings[m].dataset, dataset_name) and \
                            mappings[m].origin in dims:
                        joined_dimensions.append(
                            mappings[m].destination
                        )  # Store dimension in the original case
                        native_dim = mappings[m].origin
                        lst = obtain_reverse_codes(mappings[m].map, lst)
                        break
            else:
                # Get the dimension name with the original case
                native_dim = dims[dim].name
            if native_dim:
                if native_dim not in params:
                    f = set()
                    params[native_dim] = f
                else:
                    f = params[native_dim]
                f.update(lst)

        # Convert param contents from set to list
        for p in params:
            params[p] = [i for i in params[p]]

        # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        ds = nexinfosys.data_source_manager.get_dataset_filtered(
            source, dataset_name, params, datasets)
        df = ds.data

        # Join with mapped dimensions (augment it)
        mapping_dict = create_dictionary()
        for m in mappings:
            if strcmp(mappings[m].source, source) and \
                    strcmp(mappings[m].dataset, dataset_name) and \
                    mappings[m].origin in dims:
                # mapping_tuples.append((mappings[m].origin, mappings[m].destination, mappings[m].map))
                mapping_dict[mappings[m].origin] = (mappings[m].destination, {
                    d["o"]: d["to"]
                    for d in mappings[m].map
                })

        # If accelerated version not available, use slow version
        try:
            if nexinfosys.get_global_configuration_variable(
                    "ENABLE_CYTHON_OPTIMIZATIONS") == "True":
                from nexinfosys.restful_service.helper_accel import augment_dataframe_with_mapped_columns2 as augment_df
            else:
                raise Exception("Just to import the slow version")
        except:
            from nexinfosys.common.helper import augment_dataframe_with_mapped_columns as augment_df

        df = augment_df(df, mapping_dict, ["value"])

        # Aggregate (If any dimension has been specified)
        if len(self._content["group_by"]) > 0:
            # Column names where data is
            # HACK: for the case where the measure has been named "obs_value", use "value"
            values = [
                m.lower() if m.lower() != "obs_value" else "value"
                for m in self._content["measures"]
            ]
            v2 = []
            for v in values:
                for c in df.columns:
                    if v.lower() == c.lower():
                        v2.append(c)
                        break
            values = v2

            # TODO: use metadata name (e.g. "OBS_VALUE") instead of hardcoded "value"
            # values = self._content["measures"]
            out_names = self._content["measures_as"]
            group_by_dims = translate_case(
                self._content["group_by"],
                df.columns)  # Group by dimension names
            lcase_group_by_dims = [d.lower() for d in group_by_dims]
            # Replace joined (mapped) dimensions in the group-by list with their original-case names
            for d in joined_dimensions:
                if d.lower() in lcase_group_by_dims:
                    # Find and replace
                    for i, d2 in enumerate(group_by_dims):
                        if strcmp(d, d2):
                            group_by_dims[i] = d
                            break

            agg_funcs = []  # Aggregation functions
            agg_names = {}
            for f in self._content["agg_funcs"]:
                if f.lower() in ["avg", "average"]:
                    agg_funcs.append(np.average)
                    agg_names[np.average] = "avg"
                elif f.lower() in ["sum"]:
                    agg_funcs.append(np.sum)
                    agg_names[np.sum] = "sum"
                elif f.lower() in ["count"]:
                    agg_funcs.append(np.size)
                    agg_names[np.size] = "count"
                elif f.lower() in ["sumna"]:
                    agg_funcs.append(np.nansum)
                    agg_names[np.nansum] = "sumna"
                elif f.lower() in ["countav"]:
                    agg_funcs.append("count")
                    agg_names["count"] = "countav"
                elif f.lower() in ["avgna"]:
                    agg_funcs.append(np.nanmean)
                    agg_names[np.nanmean] = "avgna"
                elif f.lower() in ["pctna"]:
                    agg_funcs.append(pctna)
                    agg_names[pctna] = "pctna"

            # Calculate Pivot Table. The columns are a combination of values x aggregation functions
            # For instance, if two values ["v1", "v2"] and two agg. functions ["avg", "sum"] are provided,
            # the columns will be: [["average", "v1"], ["average", "v2"], ["sum", "v1"], ["sum", "v2"]]
            try:
                # Check that all "group_by_dims" on which pivot table aggregates are present in the input "df"
                # If not, either synthesize them (only if there is a single filter value) or remove them from the grouping
                for r in group_by_dims.copy():
                    df_columns_dict = create_dictionary(
                        data={c: None
                              for c in df.columns})
                    if r not in df_columns_dict:
                        found = False
                        for k in params:
                            if strcmp(k, r):
                                found = True
                                if len(params[k]) == 1:
                                    df[k] = params[k][0]
                                else:
                                    group_by_dims.remove(r)
                                    issues.append((
                                        2, "Dimension '" + r +
                                        "' removed from the list of dimensions because it is not present in the raw input dataset."
                                    ))
                                break
                        if not found:
                            group_by_dims.remove(r)
                            issues.append((
                                2, "Dimension '" + r +
                                "' removed from the list of dimensions because it is not present in the raw input dataset."
                            ))

                # Create and register Hierarchy objects from origin Dataset dimensions: state, ds
                ds_columns_dict = create_dictionary(
                    data={c.code: c.code
                          for c in ds.dimensions})
                for r in group_by_dims:
                    if r in ds_columns_dict:
                        # Create hierarchy local to the dataset
                        for d in ds.dimensions:
                            if strcmp(r, d.code):
                                if d.code_list:
                                    h = convert_code_list_to_hierarchy(
                                        d.code_list)
                                    h.name = result_name + "_" + r
                                    glb_idx.put(h.key(), h)
                                    break

                # Pivot table using Group by
                if True:
                    groups = df.groupby(by=group_by_dims,
                                        as_index=False)  # Split
                    d = OrderedDict([])
                    lst_names = []
                    if len(values) == len(agg_funcs):
                        for i, (value,
                                agg_func) in enumerate(zip(values, agg_funcs)):
                            if len(out_names) == len(values) and out_names[i]:
                                lst_names.append(out_names[i])
                            else:
                                lst_names.append(agg_names[agg_func] + "_" +
                                                 value)
                            lst = d.get(value, [])
                            lst.append(agg_func)
                            d[value] = lst
                    else:
                        for value in values:
                            lst = d.get(value, [])
                            for agg_func in agg_funcs:
                                lst.append(agg_func)
                                lst_names.append(agg_names[agg_func] + "_" +
                                                 value)
                            d[value] = lst
                    # Print the NaN count for each value column
                    for value in set(values):
                        cnt = df[value].isnull().sum()
                        print("NA count for col '" + value + "': " + str(cnt) +
                              " of " + str(df.shape[0]))
                    # AGGREGATE !!
                    df2 = groups.agg(d)

                    # Rename the aggregated columns
                    df2.columns = group_by_dims + lst_names
                # else:
                #     # Pivot table
                #     df2 = pd.pivot_table(df,
                #                          values=values,
                #                          index=group_by_dims,
                #                          aggfunc=[agg_funcs[0]], fill_value=np.NaN, margins=False,
                #                          dropna=True)
                #     # Remove the multiindex in columns
                #     df2.columns = [col[-1] for col in df2.columns.values]
                #     # Remove the index
                #     df2.reset_index(inplace=True)
                # The result, all columns (no index), is stored for later use
                ds = self._create_new_dataset(result_name, ds, df2,
                                              group_by_dims, out_names)
            except Exception as e:
                traceback.print_exc()
                issues.append((3, "There was a problem: " + str(e)))

        # Store the dataset in State
        datasets[result_name] = ds

        return issues, None
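
The aggregation above splits the filtered DataFrame with groupby, applies a dict that maps each value column to a list of aggregation functions, and then flattens the resulting MultiIndex columns by assigning a plain list of names. A minimal, self-contained pandas sketch of that pattern follows; the data and column names are purely illustrative, not taken from any real dataset.

import numpy as np
import pandas as pd

# Illustrative stand-in for the filtered dataset "df" used above
df = pd.DataFrame({"region": ["a", "a", "b", "b"],
                   "year": [2020, 2020, 2021, 2021],
                   "value": [1.0, 2.0, 3.0, np.nan]})

group_by_dims = ["region"]
d = {"value": [np.nanmean, np.size]}        # value column -> list of aggregation functions
lst_names = ["avgna_value", "count_value"]  # flat names for the aggregated columns

groups = df.groupby(by=group_by_dims, as_index=False)  # Split
df2 = groups.agg(d)                                    # Aggregate
df2.columns = group_by_dims + lst_names                # Flatten the MultiIndex columns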
Example #18
    def _get_factor_types_from_field(
            self, hierarchy_field_name: str,
            interface_type_field_name: str) -> List[FactorType]:
        """ Possibly obtain not only one but many InterfaceTypes """

        hierarchy_name = self._fields[hierarchy_field_name]
        interface_type_name = self._fields[interface_type_field_name]

        if not interface_type_name and not hierarchy_name:
            raise CommandExecutionError(
                f"No hierarchy nor interface type have been specified. At least specify one of them."
            )
        elif interface_type_name and hierarchy_name:
            interface_types = self._glb_idx.get(
                FactorType.partial_key(interface_type_name))
            if len(interface_types) == 1:
                return [interface_types[0]]
            elif len(interface_types) == 0:
                raise CommandExecutionError(
                    f"The interface type '{interface_type_name}' has not been found"
                )
            else:
                hierarchy_name = self._fields[hierarchy_field_name]
                if not hierarchy_name:
                    raise CommandExecutionError(
                        f"The field '{hierarchy_field_name}' has not been specified and "
                        f"the interface type '{interface_type_name}' is not unique"
                    )

                interface_type = first(
                    interface_types,
                    lambda t: strcmp(t.hierarchy.name, hierarchy_name))
                if not interface_type:
                    raise CommandExecutionError(
                        f"The interface type '{interface_type_name}' has not been found in "
                        f"hierarchy '{hierarchy_name}'")

                return [interface_type]
        elif interface_type_name and not hierarchy_name:
            interface_types = self._glb_idx.get(
                FactorType.partial_key(interface_type_name))
            if len(interface_types) == 1:
                return [interface_types[0]]
            elif len(interface_types) == 0:
                raise CommandExecutionError(
                    f"The interface type '{interface_type_name}' has not been found"
                )
            else:
                raise CommandExecutionError(
                    f"The field '{hierarchy_field_name}' has not been specified and "
                    f"the interface type '{interface_type_name}' is not unique"
                )
        elif not interface_type_name and hierarchy_name:
            hie = self._glb_idx.get(Hierarchy.partial_key(hierarchy_name))
            if len(hie) == 1:
                # All children of "hierarchy_name"
                return [v for v in hie[0].codes.values()]
            elif len(hie) == 0:
                raise CommandExecutionError(
                    f"The InterfaceTypes hierarchy '{hierarchy_name}' has not been found"
                )
            else:
                raise CommandExecutionError(
                    f"The InterfaceTypes hierarchy '{hierarchy_name}' has been found multiple times!!"
                )

        # Check if FactorType exists
        interface_types = self._glb_idx.get(
            FactorType.partial_key(interface_type_name))

        if len(interface_types) == 1:
            return [interface_types[0]]
        elif len(interface_types) == 0:
            raise CommandExecutionError(
                f"The interface type '{interface_type_name}' has not been found"
            )
        else:
            hierarchy_name = self._fields[hierarchy_field_name]
            if not hierarchy_name:
                raise CommandExecutionError(
                    f"The field '{hierarchy_field_name}' has not been specified and "
                    f"the interface type '{interface_type_name}' is not unique"
                )

            interface_type = first(
                interface_types,
                lambda t: strcmp(t.hierarchy.name, hierarchy_name))
            if not interface_type:
                raise CommandExecutionError(
                    f"The interface type '{interface_type_name}' has not been found in "
                    f"hierarchy '{hierarchy_name}'")

            return [interface_type]
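
The disambiguation above relies on a first(iterable, condition) helper from the project's utilities. Judging only from how it is called here, a plausible stand-in (an assumption, not the project's actual implementation) is:

from typing import Callable, Iterable, Optional, TypeVar

T = TypeVar("T")

def first(iterable: Iterable[T], condition: Callable[[T], bool] = lambda x: True) -> Optional[T]:
    # Return the first element satisfying "condition", or None if no element matches
    return next((x for x in iterable if condition(x)), None)

first([1, 3, 4, 7], lambda n: n % 2 == 0)  # -> 4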
Example #19
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name,
                              state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition

    if dataset_name is None:
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=
                f"The name of the input dataset must be specified under column 'InputDataset'. Skipping {name} command",
                location=IssueLocation(sheet_name=name, row=None,
                                       column=None)))
        return issues, None, None

    # Obtain the source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(data={
                k: None
                for k in dims[d].code_list.keys()
            })  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            tmp = create_dictionary(
                data={
                    to["d"]: None
                    for o in mappings[m].map for to in o["to"] if to["d"]
                })
            cl[mappings[m].
               destination] = tmp  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.

    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian

    out_dims = []

    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    filter_ = {
    }  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions",
                                        "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The dimension specified for output, '"
                            + d +
                            "' is neither a dataset dimension nor a mapped dimension. ["
                            + ', '.join([d2 for d2 in cl]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures",
                                          "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The specified measure, '" + m +
                            "' is not a measure available in the dataset. [" +
                            ', '.join([m2 for m2 in meas]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in [
                "resultmeasuresaggregation", "resultmeasuresaggregator",
                "aggregation"
        ]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue

                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The specified aggregation function, '"
                            + f +
                            "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in [
                "resultmeasurename", "resultmeasuresnames", "resultmeasuresas",
                "measuresas"
        ]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if cl[col_name] is not None and str(cd) not in cl[col_name]:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The code '" + cd +
                            "' is not present in the codes declared for dimension '"
                            + col_name + "'. Please, check them.",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "starttime", "endperiod", "endtime"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                if col_name.lower() == "starttime":
                    col_name = "StartPeriod"
                elif col_name.lower() == "endtime":
                    col_name = "EndPeriod"
                filter_[col_name] = lst[
                    0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in [
                "outputdatasetname", "outputdataset", "result_name",
                "result name", "resultname"
        ]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Column '" + col_name +
                              "' has an invalid dataset name '" + result_name +
                              "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=2,
                                                     column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(
            Issue(itype=IType.WARNING,
                  description=
                  "No aggregation function specified. Assuming 'average'",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=
                    "The aggregation function must be defined in the first row",
                    location=IssueLocation(sheet_name=name,
                                           row=1,
                                           column=aggregations_column)))

    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Uniform rows, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if measure and not agg_func or not measure and agg_func:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=
                    "Each measure must be associated with an aggregation function",
                    location=IssueLocation(sheet_name=name,
                                           row=r,
                                           column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [
        v["measure_as"] for v in out_measures.values() if v["measure_as"]
    ]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="At least one measure should be specified",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=measure_names_column)))

    # len(measures) != len(agg_funcs) is acceptable only when len(agg_funcs) == 1
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=
                "There must be one aggregation function (used for all measures) or one aggregation per measure",
                location=IssueLocation(sheet_name=name,
                                       row=1,
                                       column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            Issue(itype=IType.WARNING,
                  description="No result name specified. Assuming '" +
                  result_name + "'",
                  location=IssueLocation(sheet_name=name, row=2,
                                         column=c + 1)))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": available_at_datetime,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
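
For orientation, the content dictionary returned above has roughly the following shape; every value below is purely illustrative (hypothetical source, dataset and codes), not taken from a real case study.

content = {
    "dataset_source": "SourceX",            # hypothetical source name
    "dataset_name": "dataset_y",            # hypothetical dataset code
    "dataset_datetime": None,
    "where": {"GEO": ["ES", "PT"], "StartPeriod": "2015"},
    "dimensions": ["GEO", "TIME_PERIOD", "UNIT"],
    "group_by": ["GEO"],
    "measures": ["OBS_VALUE"],
    "agg_funcs": ["sum"],
    "measures_as": ["OBS_VALUE_sum"],
    "result_name": "sourcex_dataset_y"
}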
Example #20
    def execute(self, state: "State"):
        issues = []

        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        # List of available dataset names. The newly defined datasets must not be in this list
        ds_names = [ds.code for ds in datasets.values()]

        # List of datasets with local worksheet name
        external_dataset_names = []
        for ds in datasets.values():
            if ds.attributes["_location"].lower().startswith("data://#"):
                worksheet = ds.attributes["_location"][len("data://#"):]
                if not worksheet.lower().startswith("datasetdata "):
                    worksheet = "DatasetData " + worksheet

                if strcmp(worksheet, name):
                    external_dataset_names.append(ds.code)

        # Process parsed information
        for r, line in enumerate(self._content["items"]):
            # A dataset
            dataset_names = line["name"]
            if dataset_names == "":
                if external_dataset_names:
                    dataset_names = external_dataset_names
                else:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            "The column name 'DatasetName' was not defined for command 'DatasetData' and there is no 'location' in a DatasetDef command pointing to it",
                            location=IssueLocation(sheet_name=name,
                                                   row=1,
                                                   column=None)))
            else:
                dataset_names = [dataset_names]

            # Find it in the already available datasets. MUST EXIST
            for n in ds_names:
                for dataset_name in dataset_names:
                    if strcmp(dataset_name, n):
                        df = pd.read_json(StringIO(line["values"]),
                                          orient="split")
                        # Check columns
                        ds = datasets[n]
                        iss = prepare_dataframe_after_external_read(
                            ds, df, name)
                        issues.extend(iss)
                        # Everything ok? Store the dataframe!
                        if not any_error_issue(iss):
                            r = ds.attributes["_dataset_first_row"]
                            # Loop over "ds" concepts.
                            # - "dimension" concepts of type "string" generate a CodeHierarchy
                            # - Check that the DataFrame contains ALL declared concepts. If not, generate issue
                            # dims = translate_case([d.code for d in ds.dimensions], df.columns)
                            cid = create_dictionary(
                                data={col: col
                                      for col in df.columns})
                            col_names = list(df.columns)
                            for c in ds.dimensions:
                                if c.code in cid:
                                    col_names[df.columns.get_loc(
                                        cid[c.code])] = c.code  # Rename column
                                    dsd_concept_data_type = c.attributes[
                                        "_datatype"]
                                    if dsd_concept_data_type.lower(
                                    ) == "string" and not c.is_measure:  # Freely defined dimension
                                        cl = df[cid[c.code]].unique().tolist()
                                        c.code_list = CodeList.construct(
                                            c.code,
                                            c.code, [""],
                                            codes=[
                                                CodeImmutable(c, c, "", [])
                                                for c in cl
                                            ])
                                else:
                                    issues.append(
                                        Issue(
                                            itype=IType.ERROR,
                                            description=
                                            f"Concept '{c.code}' not defined for '{ds.code}'",
                                            location=IssueLocation(
                                                sheet_name=name,
                                                row=r,
                                                column=None)))
                            df.columns = col_names
                            ds.data = df
                        dataset_names.remove(dataset_name)
                        break

            if dataset_names:
                issues.append(
                    Issue(
                        itype=IType.ERROR,
                        description=
                        f"Metadata for the datasets: {','.join(dataset_names)}, must be defined previously",
                        location=IssueLocation(sheet_name=name,
                                               row=-1,
                                               column=-1)))

        return issues, None
        def process_line(item):
            # Read variables
            dsd_dataset_name = item.get("dataset_name", None)
            dsd_dataset_data_location = item.get("dataset_data_location", None)
            dsd_concept_type = item.get("concept_type", None)
            dsd_concept_name = item.get("concept_name", None)
            dsd_concept_data_type = item.get("concept_data_type", None)
            dsd_concept_domain = item.get("concept_domain", None)
            dsd_concept_description = item.get("concept_description", None)
            dsd_attributes = item.get("concept_attributes", None)
            if dsd_attributes:
                try:
                    attributes = dictionary_from_key_value_list(
                        dsd_attributes, glb_idx)
                except Exception as e:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=str(e),
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
            else:
                attributes = {}

            if dsd_dataset_name in ds_names:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The dataset '" + dsd_dataset_name +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            # Internal dataset definitions cache
            ds = current_ds.get(dsd_dataset_name, None)
            if True:  # Statistical dataset format
                if not ds:
                    ds = Dataset()
                    ds.code = dsd_dataset_name  # Name
                    ds.database = None
                    ds.attributes = {}
                    current_ds[dsd_dataset_name] = ds
                if not dsd_concept_type:
                    if ds.attributes.get("_location"):
                        issues.append(
                            Issue(
                                itype=IType.WARNING,
                                description=
                                f"Location of data for dataset {ds.code} previously declared. "
                                f"Former: {attributes.get('_location')}, "
                                f"Current: {dsd_dataset_data_location}",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                        attributes = ds.attributes
                    else:
                        attributes["_dataset_first_row"] = r
                    attributes[
                        "_location"] = dsd_dataset_data_location  # Location
                    ds.description = dsd_concept_description
                    ds.attributes = attributes  # Set attributes
                else:  # If concept_type is defined => add a concept
                    # Check if the concept name already appears --> Error
                    for d1 in ds.dimensions:
                        if strcmp(d1.code, dsd_concept_name):
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Concept {dsd_concept_name} already declared for dataset {ds.code}",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            break

                    d = Dimension()
                    d.dataset = ds
                    d.description = dsd_concept_description
                    d.code = dsd_concept_name
                    d.is_measure = False if dsd_concept_type.lower(
                    ) == "dimension" else True
                    if not d.is_measure and dsd_concept_data_type.lower(
                    ) == "time":
                        d.is_time = True
                    else:
                        d.is_time = False
                    if dsd_concept_type.lower() == "attribute":
                        attributes["_attribute"] = True
                    else:
                        attributes["_attribute"] = False
                    if dsd_concept_data_type.lower() == "category":
                        # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or "hierarchy_categories_command.py", no insertion is made)
                        # h = hierarchies.get(dsd_concept_domain, None)
                        h = glb_idx.get(
                            Hierarchy.partial_key(name=dsd_concept_domain))
                        if len(h) == 0:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    "Could not find hierarchy of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        elif len(h) > 1:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    "Found more than one instance of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        else:  # len(h) == 1
                            h = h[0]
                        d.hierarchy = h
                        # Reencode the Hierarchy as a CodeList
                        cl = convert_hierarchy_to_code_list(h)
                        d.code_list = cl

                    attributes["_datatype"] = dsd_concept_data_type
                    attributes["_domain"] = dsd_concept_domain
                    d.attributes = attributes
def parse_dataset_data_command(sh: Worksheet, area: AreaTupleType, name: str, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """

    issues: List[Issue] = []

    # Analyze column names
    col_map = create_dictionary()
    for c in range(area[2], area[3]):
        col_name = (sh.cell(row=area[0], column=c).value or "").strip()
        # Avoid repetitions
        if col_name in col_map:
            issues.append(Issue(itype=IType.ERROR,
                                description="The column name '"+col_name+"' is repeated",
                                location=IssueLocation(sheet_name=name, row=1, column=c)))

        if strcmp(col_name, "DatasetName") or strcmp(col_name, "Dataset"):
            col_map["dataset"] = c
        elif col_name:
            # Concept name
            col_map[col_name] = c

    if any([i.itype == IType.ERROR for i in issues]):
        return issues, None, None

    # Read all the content into a list of lists
    lines = []
    for r in range(area[0] + 1, area[1]):
        line = []
        for col_name, c in col_map.items():
            v = sh.cell(row=r, column=c).value
            if isinstance(v, str):
                v = v.strip()
            line.append(v)
        lines.append(line)

    # pd.DataFrame
    df = pd.DataFrame(columns=[col_name for col_name in col_map], data=lines)

    content = []  # The output JSON

    if "dataset" in df:
        # Find the different datasets
        datasets = df["dataset"].unique()
        datasets = set([d.lower() for d in datasets])

        for dataset in datasets:
            # Obtain the rows of this dataset, dropping the "dataset" column (avoids modifying a slice)
            df2 = df.loc[df['dataset'].str.lower() == dataset].drop(columns=["dataset"])
            # Convert to JSON and store in content
            s = StringIO()
            df2.to_json(s, orient="split")
            content.append(dict(name=dataset, values=s.getvalue()))
    else:
        s = StringIO()
        df.to_json(s, orient="split")
        content.append(dict(name="", values=s.getvalue()))

    return issues, None, dict(items=content, command_name=name)
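
Each block of rows is serialized with orient="split" and later deserialized by the DatasetData execute method shown earlier (pd.read_json(StringIO(line["values"]), orient="split")). A minimal round-trip sketch with made-up rows:

from io import StringIO
import pandas as pd

df = pd.DataFrame({"Region": ["a", "b"], "Value": [1.0, 2.0]})  # illustrative rows

s = StringIO()
df.to_json(s, orient="split")        # what the parser stores in "values"
serialized = s.getvalue()

df_back = pd.read_json(StringIO(serialized), orient="split")  # what execute() reads back
# df_back now holds the same columns and rows as df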
Example #23
    def _process_row(self, field_values: Dict[str, Any], subrow=None) -> None:
        """
        Process a dictionary representing a row of the Interfaces command. The dictionary can come directly from
        the worksheet or from a dataset.

        :param field_values: dictionary
        """
        # f_processor_name -> p
        # f_interface_type_name -> it
        # f_interface_name -> i
        #
        # IF NOT i AND it AND p => i_name = it.name => get or create "i"
        # IF i AND it AND p => get or create "i", IF "i" exists, i.it MUST BE equal to "it" (IF NOT, error)
        # IF i AND p AND NOT it => get "i" (MUST EXIST)
        f_interface_type_name = field_values.get("interface_type")
        f_interface_name = field_values.get("interface")

        if not f_interface_name:
            if not f_interface_type_name:
                raise CommandExecutionError(
                    "At least one of InterfaceType or Interface must be defined"
                    + subrow_issue_message(subrow))

            f_interface_name = f_interface_type_name

        processor = self.find_processor(field_values.get("processor"), subrow)

        # Try to find Interface
        f_orientation = field_values.get("orientation")
        interface_type: Optional[FactorType] = None
        interface: Optional[Factor] = None
        interfaces: Sequence[Factor] = self._glb_idx.get(
            Factor.partial_key(processor=processor, name=f_interface_name))
        if len(interfaces) == 1:
            interface = interfaces[0]
            print(f"DEBUG - Interface '{interface.name}' found")
            interface_type = interface.taxon
            if f_interface_type_name and not strcmp(interface_type.name,
                                                    f_interface_type_name):
                self._add_issue(
                    IType.WARNING,
                    f"The existing Interface '{interface.name}' has the InterfaceType "
                    f"'{interface_type.name}' which is different from the specified "
                    f"InterfaceType '{f_interface_type_name}'. Record skipped."
                    + subrow_issue_message(subrow))
                return
        elif len(interfaces) > 1:
            raise CommandExecutionError(
                f"Interface '{f_interface_name}' found {str(len(interfaces))} times. "
                f"It must be uniquely identified." +
                subrow_issue_message(subrow))
        elif len(interfaces) == 0:
            # The interface does not exist, create it below
            if not f_orientation:
                raise CommandExecutionError(
                    f"Orientation must be defined for new Interfaces." +
                    subrow_issue_message(subrow))

        # InterfaceType still not found
        if not interface_type:
            interface_type_name = ifnull(f_interface_type_name,
                                         f_interface_name)

            # Find FactorType
            # TODO Allow creating a basic FactorType if it is not found?
            interface_types: Sequence[FactorType] = self._glb_idx.get(
                FactorType.partial_key(interface_type_name))
            if len(interface_types) == 0:
                raise CommandExecutionError(
                    f"InterfaceType '{interface_type_name}' not declared previously"
                    + subrow_issue_message(subrow))
            elif len(interface_types) > 1:
                raise CommandExecutionError(
                    f"InterfaceType '{interface_type_name}' found {str(len(interface_types))} times. "
                    f"It must be uniquely identified." +
                    subrow_issue_message(subrow))
            else:
                interface_type = interface_types[0]

        # Get attributes default values taken from Interface Type or Processor attributes
        # Rows   : value of (source) "processor.subsystem_type"
        # Columns: value of (target) "interface_type.opposite_processor_type"
        # Cells  : CORRECTED value of "opposite_processor_type"
        # +--------+-------+--------+-------+---------+
        # |        | Local | Env    | Ext   | ExtEnv  |
        # +--------+-------+--------+-------+---------+
        # | Local  | Local | Env    | Ext   | ExtEnv  |
        # | Env    | Local | Env    | Ext   | ExtEnv? |
        # | Ext    | Ext   | ExtEnv | Local | Env     |
        # | ExtEnv | Ext   | ExtEnv | Local | Env?    |
        # +--------+-------+--------+-------+---------+
        if interface_type.opposite_processor_type:
            tmp = interface_type.opposite_processor_type.lower()
            if processor.subsystem_type.lower() in ["local", "environment"
                                                    ]:  # First two rows
                opposite_processor_type = tmp
            else:
                opposite_processor_type = InterfacesAndQualifiedQuantitiesCommand.invert[
                    tmp]
            # TODO in doubt. Maybe these are undefined (values with question mark in the table)
            #  if tmp == "externalenvironment" and processor.subsystem_type.lower() in ["environment", "externalenvironment"]:
            #      pass
        else:
            opposite_processor_type = None

        interface_type_values = {
            "sphere": interface_type.sphere,
            "roegen_type": interface_type.roegen_type,
            "opposite_processor_type": opposite_processor_type
        }

        # Get internal and user-defined attributes in one dictionary
        # Use: value specified in Interfaces ELSE value specified in InterfaceTypes ELSE first value of allowed values
        attributes = {
            c.name: ifnull(
                field_values[c.name],
                ifnull(interface_type_values.get(c.name),
                       head(c.allowed_values)))
            for c in self._command_fields if c.attribute_of == Factor
        }

        if not interface:
            # f_list: Sequence[Factor] = self._glb_idx.get(
            #     Factor.partial_key(processor=p, factor_type=ft, orientation=f_orientation))
            #
            # if len(f_list) > 0:
            #     raise CommandExecutionError(f"An interface called '{f_list[0].name}' for Processor '{f_processor_name}'"
            #                                  f" with InterfaceType '{f_interface_type_name}' and orientation "
            #                                  f"'{f_orientation}' already exists"+subrow_issue_message(subrow))

            # Transform text of "interface_attributes" into a dictionary
            interface_attributes = self.transform_text_attributes_into_dictionary(
                field_values.get("interface_attributes"), subrow)
            attributes.update(interface_attributes)

            location = self.get_location(field_values.get("location"), subrow)

            interface = Factor.create_and_append(
                f_interface_name,
                processor,
                in_processor_type=FactorInProcessorType(external=False,
                                                        incoming=False),
                taxon=interface_type,
                geolocation=location,
                tags=None,
                attributes=attributes)
            self._glb_idx.put(interface.key(), interface)
            print(f"DEBUG - Interface '{interface.name}' created")
        elif not interface.compare_attributes(attributes):
            initial = ', '.join(
                [f"{k}: {interface.get_attribute(k)}" for k in attributes])
            new = ', '.join([f"{k}: {attributes[k]}" for k in attributes])
            name = interface.processor.full_hierarchy_names(
                self._glb_idx)[0] + ":" + interface.name
            raise CommandExecutionError(
                f"The same interface '{name}', is being redeclared with different properties. "
                f"INITIAL: {initial}; NEW: {new}." +
                subrow_issue_message(subrow))

        f_unit = field_values.get("unit")
        if not f_unit:
            f_unit = interface_type.unit

        # Unify unit (it must be done before considering RelativeTo -below-, because it adds a transformation to "f_unit")
        f_value = field_values.get("value")
        if f_value is not None and f_unit != interface_type.unit:
            try:
                f_value = UnitConversion.convert(f_value, f_unit,
                                                 interface_type.unit)
            except DimensionalityError:
                raise CommandExecutionError(
                    f"Dimensions of units in InterfaceType ({interface_type.unit}) and specified ({f_unit}) are not convertible"
                    + subrow_issue_message(subrow))

            f_unit = interface_type.unit

        # Search for a relative_to interface
        f_relative_to = field_values.get("relative_to")
        relative_to_interface: Optional[Factor] = None
        if f_relative_to:
            try:
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.factor_unit, f_relative_to)
            except:
                raise CommandExecutionError(
                    f"Could not parse the RelativeTo column, value {str(f_relative_to)}. "
                    + subrow_issue_message(subrow))

            relative_to_interface_name = ast_to_string(ast["factor"])

            # rel_unit_name = ast["unparsed_unit"]
            # try:
            #     f_unit = str((ureg(f_unit) / ureg(rel_unit_name)).units)
            # except (UndefinedUnitError, AttributeError) as ex:
            #     raise CommandExecutionError(f"The final unit could not be computed, interface '{f_unit}' / "
            #                                  f"relative_to '{rel_unit_name}': {str(ex)}"+subrow_issue_message(subrow))

            relative_to_interface = first(
                interface.processor.factors,
                lambda ifc: strcmp(ifc.name, relative_to_interface_name))

            if not relative_to_interface:
                raise CommandExecutionError(
                    f"Interface specified in 'relative_to' column "
                    f"'{relative_to_interface_name}' has not been found." +
                    subrow_issue_message(subrow))

        if f_value is None and relative_to_interface is not None:
            # Search for a Interface Type Conversion defined in the ScaleChangeMap command
            interface_types_transforms: List[FactorTypesRelationUnidirectionalLinearTransformObservation] = \
                find_factor_types_transform_relation(self._glb_idx, relative_to_interface.taxon, interface.taxon, processor, processor)

            # Overwrite any specified unit, it doesn't make sense without a value, i.e. it cannot be used for conversion
            f_unit = interface.taxon.unit
            if len(interface_types_transforms) == 1:
                f_value = interface_types_transforms[0].scaled_weight
            else:
                interface_types_transforms_message = "an interface type conversion doesn't exist" \
                    if (len(interface_types_transforms) == 0) \
                    else f"{len(interface_types_transforms)} interface type conversions exist"

                f_value = "0"
                self._add_issue(
                    IType.WARNING,
                    f"Field 'value' should be defined for interfaces having a "
                    f"'RelativeTo' interface, and {interface_types_transforms_message}. "
                    f"Using value '0'." + subrow_issue_message(subrow))

        # Create quantitative observation
        if f_value is not None:
            f_uncertainty = field_values.get("uncertainty")
            f_assessment = field_values.get("assessment")
            f_pedigree_matrix = field_values.get("pedigree_matrix")
            f_pedigree = field_values.get("pedigree")
            f_time = field_values.get("time")
            f_comments = field_values.get("comments")

            f_source = field_values.get("qq_source")
            # TODO: source is not being used
            source = self.get_source(f_source, subrow)

            # Find Observer
            observer: Optional[Observer] = None
            if f_source:
                observer = self._glb_idx.get_one(
                    Observer.partial_key(f_source))
                if not observer:
                    self._add_issue(
                        IType.WARNING,
                        f"Observer '{f_source}' has not been found." +
                        subrow_issue_message(subrow))

            # If an observation exists then "time" is mandatory
            if not f_time:
                raise CommandExecutionError(
                    f"Field 'time' needs to be specified for the given observation."
                    + subrow_issue_message(subrow))

            # An interface can have multiple observations if each of them have a different [time, observer] combination
            for observation in interface.quantitative_observations:
                observer_name = observation.observer.name if observation.observer else None
                if strcmp(observation.attributes["time"], f_time) and strcmp(
                        observer_name, f_source):
                    raise CommandExecutionError(
                        f"The interface '{interface.name}' in processor '{interface.processor.name}' already has an "
                        f"observation with time '{f_time}' and source '{f_source}'."
                    )

            self.check_existence_of_pedigree_matrix(f_pedigree_matrix,
                                                    f_pedigree, subrow)

            # Transform text of "number_attributes" into a dictionary
            number_attributes = self.transform_text_attributes_into_dictionary(
                field_values.get("number_attributes"), subrow)

            o = _create_or_append_quantitative_observation(
                interface, f_value, f_unit, f_uncertainty, f_assessment,
                f_pedigree, f_pedigree_matrix, observer, relative_to_interface,
                f_time, None, f_comments, None, number_attributes)
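
The unit unification above converts the declared value into the InterfaceType's unit, turning pint's DimensionalityError into a command error. UnitConversion.convert is a project helper; assuming it wraps pint (as the DimensionalityError import suggests), the conversion it performs is essentially the following sketch:

import pint

ureg = pint.UnitRegistry()

def convert(value, from_unit, to_unit):
    # Raises pint.DimensionalityError if the two units are not commensurable
    return (float(value) * ureg(from_unit)).to(to_unit).magnitude

convert("3.6", "MJ", "kWh")   # -> 1.0
# convert("5", "kg", "kWh")   # would raise DimensionalityError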
Example #24
def export_model_to_xml(
        registry: PartialRetrievalDictionary
) -> Tuple[str, Dict[str, Processor]]:
    """
    Elaborate an XML string containing the nested processors and their attributes.
    Also the interfaces inside processors
    <processors>
      <root_p1 fullname="" level="" system="" subsystem="" functional="true|false">
        <interfaces>
          <i1 type="" sphere="" roegen_type="" orientation="" opposite_processor_type="" />
          ...
        </interfaces>
        <child_p2>
          ...
        </child_p2>
      </root_p1>
      ...
    </processors>

    Example (abstract):

    '/processors//[level="n"]'

    :param registry:
    :return:
    """
    def xml_processor(p: Processor, registry: PartialRetrievalDictionary,
                      p_map: Dict[str, Processor]):
        """
        Return the XML of a processor
        Recursive into children

        :param p:
        :return:
        """
        def xml_interface(iface: Factor):
            """

            :param iface:
            :return:
            """
            s = f'<{iface.name} type="{iface.taxon.name}" sphere="{iface.sphere}" ' \
                f'roegen_type="{iface.roegen_type}" orientation="{iface.orientation}" ' \
                f'opposite_processor_type="{iface.opposite_processor_type}" />'
            if case_sensitive:
                return s
            else:
                return s.lower()

        children = p.children(registry)
        full_name = p.full_hierarchy_names(registry)[0]
        if case_sensitive:
            p_map[full_name] = p
        else:
            p_map[full_name.lower()] = p

        s = f"""
    <{p.name} fullname="{full_name}" level="{p.level}" system="{p.processor_system}" subsystem="{p.subsystem_type}" functional="{"true" if strcmp(p.functional_or_structural, "Functional") else "false"}" >
        <interfaces>
        {chr(10).join([xml_interface(f) for f in p.factors])}
        </interfaces>
        {chr(10).join([xml_processor(c, registry, p_map) for c in children])}    
    </{p.name}>"""
        if case_sensitive:
            return s
        else:
            return s.lower()

    # Part of relationships
    por = registry.get(ProcessorsRelationPartOfObservation.partial_key())

    # Set of all instance processors NOT touched by part-of relationships
    unaffected_procs = set([
        p for p in registry.get(Processor.partial_key())
        if strcmp(p.instance_or_archetype, "Instance")
    ])
    for po in por:
        try:
            unaffected_procs.remove(po.parent_processor)
        except KeyError:
            pass
        try:
            unaffected_procs.remove(po.child_processor)
        except KeyError:
            pass

    # Keep those affecting Instance processors
    por = [
        po for po in por
        if strcmp(po.parent_processor.instance_or_archetype, "Instance")
    ]

    # Get root processors (set of processors not appearing as child_processor)
    parents = set([po.parent_processor for po in por])
    children = set([po.child_processor for po in por])
    roots = parents.difference(children).union(unaffected_procs)
    # leaves = children.difference(parents)
    result = '<processors>'  # <?xml version="1.0" encoding="utf-8"?>\n
    p_map = {}
    for p in roots:
        result += xml_processor(p, registry, p_map)
    result += "\n</processors>"

    return result, p_map
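
# Hedged usage sketch (not part of the original listing; the helper name below is an
# assumption). It shows how the XML string returned by export_model_to_xml could be
# queried with the standard library, turning the abstract '/processors//[level="n"]'
# example of the docstring into a valid XPath, and how "p_map" resolves the matched
# elements back to Processor objects.
def find_processors_by_level(registry, level):
    import xml.etree.ElementTree as ET

    xml_string, p_map = export_model_to_xml(registry)
    root = ET.fromstring(xml_string)  # The <processors> root element
    matches = root.findall(f".//*[@level='{level}']")  # Any descendant with level="n"
    full_names = [e.attrib.get("fullname", "") for e in matches]
    # "p_map" is keyed by full hierarchical name (lowercased when not case sensitive)
    return [p_map.get(n) or p_map.get(n.lower()) for n in full_names]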
    def _process_row(self, field_values: Dict[str, Any], subrow=None) -> None:

        # Transform text of "attributes" into a dictionary
        if field_values.get("attributes"):
            try:
                field_values["attributes"] = dictionary_from_key_value_list(
                    field_values["attributes"], self._glb_idx)
            except Exception as e:
                self._add_issue(IType.ERROR,
                                str(e) + subrow_issue_message(subrow))
                return
        else:
            field_values["attributes"] = {}

        # Process specific fields

        # Obtain the parent: it must exist. It could be created dynamically but it's important to specify attributes
        if field_values.get("parent_processor"):
            try:
                parent_processor = self._get_processor_from_field(
                    "parent_processor")
                # parents = find_processors_matching_name(parent_processor)
                # if len(parents) > 1:
                #     self._add_issue(IType.WARNING,
                #                     f"Parent processor '{parent_processor}' not unique. Matches: {', '.join(p.hierarchical_names[0] for p in parents)}. Skipped." + subrow_issue_message(subrow))
                #     return
            except CommandExecutionError:
                self._add_issue(
                    IType.ERROR,
                    f"Specified parent processor, '{field_values.get('parent_processor')}', does not exist"
                    + subrow_issue_message(subrow))
                return
        else:
            parent_processor = None

        behave_as_processor: Optional[Processor] = None
        if field_values.get("behave_as_processor"):
            try:
                behave_as_processor = self._get_processor_from_field(
                    "behave_as_processor")
            except CommandExecutionError:
                self._add_issue(
                    IType.WARNING,
                    f"Specified 'behave as' processor, '{field_values.get('behave_as_processor')}', does not exist, value ignored"
                    + subrow_issue_message(subrow))

        # Find or create processor and REGISTER it in "glb_idx"
        # TODO Now, only Simple name allowed
        # TODO Improve allowing hierarchical names, and hierarchical names with wildcards
        pgroup = field_values.get("processor_group")

        # Get internal and user-defined attributes in one dictionary
        attributes = {
            c.name: field_values[c.name]
            for c in self._command_fields if c.attribute_of == Processor
        }
        attributes.update(field_values["attributes"])
        attributes["processor_group"] = pgroup

        # Needed to support the new name of the field, "Accounted" (previously it was "InstanceOrArchetype")
        # (internally the values have the same meaning, "Instance" for a processor which has to be accounted,
        # "Archetype" for a processor which hasn't)
        v = attributes.get("instance_or_archetype", None)
        if strcmp(v, "Yes"):
            v = "Instance"
        elif strcmp(v, "No"):
            v = "Archetype"
        if v:
            attributes["instance_or_archetype"] = v

        name = field_values["processor"]
        p_names, _ = obtain_name_parts(name)

        geolocation = Geolocation.create(field_values["geolocation_ref"],
                                         field_values["geolocation_code"])

        ps = find_processors_matching_name(name, self._glb_idx)
        more_than_one = len(ps) > 1
        simple = len(p_names) == 1
        exists = len(ps) == 1
        # SIMPLE? EXISTS? PARENT? ACTION:
        # Yes     Yes     Yes     NEW; HANG FROM PARENT
        # Yes     Yes     No      Warning: repeated
        # Yes     No      Yes     NEW; HANG FROM PARENT
        # Yes     No      No      NEW
        # No      Yes     Yes     Warning: cannot hang from parent
        # No      Yes     No      Warning: repeated AND not simple not allowed
        # No      No      Yes     Warning: cannot create more than one processor AND not simple not allowed
        # No      No      No      Warning: cannot create more than one processor AND not simple not allowed

        create_new = False
        if not simple:
            if not parent_processor:
                self._add_issue(
                    IType.WARNING,
                    f"When a processor does not have parent, the name must be simple. Skipped."
                    + subrow_issue_message(subrow))
                return
        else:
            if exists and not parent_processor:
                self._add_issue(
                    IType.WARNING,
                    f"Repeated declaration of {name}. Skipped." +
                    subrow_issue_message(subrow))
                return
            create_new = True

        if create_new:
            p = find_or_create_processor(state=self._glb_idx,
                                         name=name,
                                         proc_attributes=attributes,
                                         proc_location=geolocation)
        elif exists:
            p = ps[0]
        else:
            # Not simple, not existing, with parent (see table above): cannot create more than one processor
            self._add_issue(
                IType.WARNING,
                f"Processor '{name}' must match exactly one existing processor to be hung from a parent. Skipped."
                + subrow_issue_message(subrow))
            return

        # Add to ProcessorsGroup, if specified
        if pgroup:
            p_set = self._p_sets.get(pgroup, ProcessorsSet(pgroup))
            self._p_sets[pgroup] = p_set
            if p_set.append(
                    p, self._glb_idx
            ):  # Appends codes to the pset if the processor was not member of the pset
                p_set.append_attributes_codes(field_values["attributes"])

        # If geolocation specified, check if it exists
        # Inside it, check if the code exists
        if p.geolocation and p.geolocation.reference:
            # Geographical reference
            gr = self._glb_idx.get(
                GeographicReference.partial_key(name=p.geolocation.reference))
            if len(gr) == 0:
                self._add_issue(
                    IType.ERROR,
                    f"Geographical reference {p.geolocation.reference} not found "
                    + subrow_issue_message(subrow))
                return
            if p.geolocation.reference and not p.geolocation.code:
                self._add_issue(
                    IType.ERROR,
                    f"Geographical reference was specified but not the code in it "
                    + subrow_issue_message(subrow))
                return
            geo_id = p.geolocation.code
            try:
                url = gr[0].attributes["data_location"]
            except:
                self._add_issue(
                    IType.ERROR,
                    f"URL not found in geographical reference {p.geolocation.reference} "
                    + subrow_issue_message(subrow))
                return
            try:
                j, ids = read_geojson(
                    url
                )  # READ the file!! (or get it from cache). Could take some time...
            except:
                self._add_issue(
                    IType.ERROR,
                    f"URL {url} in reference {p.geolocation.reference} could not be read "
                    + subrow_issue_message(subrow))
                return
            if geo_id not in ids:
                self._add_issue(
                    IType.WARNING,
                    f"Could not find code {geo_id} in file {url}, geographical reference {p.geolocation.reference} "
                    + subrow_issue_message(subrow))

        # Add Relationship "part-of" if parent was specified
        # The processor may already have other parent processors; those relationships are kept
        if parent_processor:
            # Create "part-of" relationship
            if len(
                    self._glb_idx.get(
                        ProcessorsRelationPartOfObservation.partial_key(
                            parent_processor, p))) > 0:
                self._add_issue(
                    IType.WARNING,
                    f"{p.name} is already part-of {parent_processor.name}. Skipped."
                    + subrow_issue_message(subrow))
                return

            o1 = ProcessorsRelationPartOfObservation.create_and_append(
                parent_processor,
                p,
                None,
                behave_as=behave_as_processor,
                weight=field_values.get("parent_processor_weight"))  # Part-of
            self._glb_idx.put(o1.key(), o1)
            for hname in parent_processor.full_hierarchy_names(self._glb_idx):
                p_key = Processor.partial_key(f"{hname}.{p.name}", p.ident)
                if attributes:
                    p_key.update({
                        k: ("" if v is None else v)
                        for k, v in attributes.items()
                    })
                self._glb_idx.put(p_key, p)
def parse_mapping_command(sh: Worksheet, area: AreaTupleType, origin,
                          destination) -> IssuesLabelContentTripleType:
    """
    Map from a set of categories from an external dataset into a set of MuSIASEM categories
    If the categories do not exist, they are created flat. Later they can be turned into a hierarchy and the mapping
    will still hold

    The syntax of the mapping allows expressing MANY to ONE and also MANY to MANY correspondence.
    The mapping has to be complete (all elements from left side must be covered, if not "" is assumed on the right side)

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param origin:
    :param destination:
    :return: list of issues (issue_type, message), command label, command content
    """
    some_error = False
    issues = []
    # Analyze Origin
    cell = sh.cell(row=area[0], column=area[2])
    col_name = cell.value
    if origin:
        if not strcmp(origin, col_name):
            some_error = True
            issues.append((
                3,
                "The Origin name is different in the sheet name and in the worksheet ("
                + origin + ", " + col_name + ")"))
    else:
        origin = col_name

    #   Obtain the source, the dataset and the dimension of "origin"
    spl = origin.split(".")
    if len(spl) == 3:  # Source.Dataset.Dimension
        s, ds, dim = spl
        s = s + "."
        origin_ok = True
    elif len(spl) == 2:  # Dataset.Dimension
        ds, dim = spl
        s = ""
        origin_ok = True
    else:
        origin_ok = False
        some_error = True
        issues.append((
            3,
            "Origin must specify a dataset and a dimension name separated by '.'"
        ))

    if origin_ok:
        origin_dataset = s + ds
        origin_dim = dim

        if not check_dataset_exists(origin_dataset):
            some_error = True
            issues.append((3, "The Origin '" + origin_dataset +
                           "' does not match any registered dataset"))
        else:
            dims, attrs, meas = obtain_dataset_metadata(ds)
            if origin_dim not in dims:
                some_error = True
                issues.append(
                    (3, "The Origin dataset '" + origin_dataset +
                     "' does not have a dimension '" + origin_dim + "'"))

    # Analyze Destination
    cell = sh.cell(row=area[0], column=area[2] + 1)
    col_name = cell.value
    if destination:
        if not strcmp(destination, col_name):
            some_error = True
            issues.append((
                3,
                "The Destination name is different in the sheet name and in the worksheet ("
                + destination + ", " + col_name + ")"))
    else:
        destination = col_name

    #  Destination name must be a simple identifier
    try:
        parser_field_parsers.simple_ident.parseString(destination,
                                                      parseAll=True)
    except:
        some_error = True
        issues.append((3, "'" + destination +
                       "' category name has to be a simple identifier"))

    if some_error:  # Issues at this point are errors, return if there are any
        return issues, None, None

    # Read mapping Origin to Destination
    o_dict = create_dictionary()
    for r in range(area[0] + 1, area[1]):
        o_value = sh.cell(row=r,
                          column=area[2]).value  # First column -> Origin
        d_value = sh.cell(row=r, column=area[2] +
                          1).value  # Second column -> Destination
        try:
            exp_value = sh.cell(
                row=r, column=area[2] +
                2).value  # Third column -> Weight (for Many to Many mappings)
            if exp_value:
                try:
                    exp_value = float(exp_value)
                except:  # If it is not possible, it may be an expression; postpone conversion until usage
                    pass
            else:
                exp_value = 1.0  # If undefined -> Many to One
        except:
            exp_value = 1.0  # If undefined -> Many to One

        if not o_value and not d_value:
            # issues.append((2, "Row " + str(r) + ": Origin and Destination are not defined. Row skipped."))
            continue
        elif not o_value or not d_value:
            if not o_value and d_value:
                issues.append(
                    (2,
                     "Row " + str(r) + ": Origin not defined. Row skipped."))
            else:
                issues.append((2, "Row " + str(r) +
                               ": Destination not defined. Row skipped."))
            continue

        o_value = str(o_value).lower()
        d_value = str(d_value).lower()
        if o_value in o_dict:
            lst = o_dict[o_value]
        else:
            lst = []
            o_dict[o_value] = lst
        # Check "d_value" is not being repeated for "o_value"
        if (len(lst) == 0) or (len(lst) >= 1
                               and d_value not in [d["d"] for d in lst]):
            lst.append({"d": d_value, "w": exp_value})
        else:
            issues.append((3, "Destination category '" + destination +
                           "' has been repeated for origin category '" +
                           o_value + "' at row '" + str(r) + "'"))

    # List of dictionaries, where each dictionary contains the specification of an origin "o"
    # For multiple entries (many to many map), the origin maps to a list "to" of dictionaries with keys "d" (destination) and "w" (weight)
    content = {
        "origin_dataset":
        origin_dataset,  # Name of the origin dataset (may include the source name)
        "origin_dimension":
        origin_dim,  # Name of the origin dimension inside the dataset
        "destination": destination,  # Name of the destination hierarchy
        "map": [{
            "o": k,
            "to": v
        } for k, v in o_dict.items()]
    }
    label = ((content["origin_dataset"] + ".") if origin_dataset else ""
             ) + content["origin_dimension"] + " -> " + content["destination"]
    return issues, label, content
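
# Hedged illustration (an assumption, not output from the original source): the shape of
# the "content" dictionary returned by parse_mapping_command for a small many-to-one
# mapping. Dataset, dimension and category names below are hypothetical.
example_mapping_content = {
    "origin_dataset": "Eurostat.nama_10_gdp",  # May include the source name prefix
    "origin_dimension": "geo",
    "destination": "Region",
    "map": [
        {"o": "es", "to": [{"d": "southern_europe", "w": 1.0}]},
        {"o": "pt", "to": [{"d": "southern_europe", "w": 1.0}]},
        {"o": "de", "to": [{"d": "central_europe", "w": 1.0}]},
    ],
}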
    def _init_and_process_row(self, row: Dict[str, Any]) -> None:
        def obtain_dictionary_with_not_expandable_fields(d):
            output = {}
            for k, v in d.items():
                if v is None or "{" not in v:
                    output[k] = v
            return output

        self._current_row_number = row["_row"]
        self._fields = self._get_command_fields_values(row)
        tmp_fields = self._fields
        self._check_all_mandatory_fields_have_values()
        # If expandable, do it now
        expandable = row["_expandable"]
        if expandable:
            # Extract variables
            state = State()
            issues = []
            asts = {}
            referenced_variables = create_dictionary()
            for e in expandable:
                ast = parser_field_parsers.string_to_ast(
                    arith_boolean_expression, e)
                c_name = f"{{{e}}}"
                asts[c_name] = ast
                res, vars = ast_evaluator(ast,
                                          state,
                                          None,
                                          issues,
                                          atomic_h_names=True)
                for v in vars:
                    referenced_variables[v] = None

            res = classify_variables2(referenced_variables.keys(),
                                      self._datasets, self._hierarchies,
                                      self._parameters)
            ds_list = res["datasets"]
            ds_concepts = res["ds_concepts"]
            h_list = res["hierarchies"]
            if len(ds_list) >= 1 and len(h_list) >= 1:
                self._add_issue(
                    itype=IType.ERROR,
                    description="Dataset(s): " +
                    ", ".join([d.name
                               for d in ds_list]) + ", and hierarchy(ies): " +
                    ", ".join([h.name for h in h_list]) +
                    ", have been specified. Only a single dataset is supported."
                )
                return
            elif len(ds_list) > 1:
                self._add_issue(
                    itype=IType.ERROR,
                    description="More than one dataset has been specified: " +
                    ", ".join([d.name for d in ds_list]) +
                    ", just one dataset is supported.")
                return
            elif len(h_list) > 0:
                self._add_issue(
                    itype=IType.ERROR,
                    description="One or more hierarchies have been specified: "
                    + ", ".join([h.name for h in h_list]) +
                    ". Hierarchy expansion is not supported here.")
                return
            if len(ds_list) == 1:  # Expand dataset
                ds = ds_list[0]
                measure_requested = False
                all_dimensions = set(
                    [c.code for c in ds.dimensions if not c.is_measure])
                requested_dimensions = set()
                requested_measures = set()
                for con in ds_concepts:
                    found = False
                    for c in ds.dimensions:
                        if strcmp(c.code, con):
                            found = True
                            if c.is_measure:
                                measure_requested = True
                                requested_measures.add(c.code)
                            else:  # Dimension
                                all_dimensions.remove(c.code)
                                requested_dimensions.add(c.code)
                    if not found:
                        self._add_issue(
                            itype=IType.ERROR,
                            description=
                            f"The concept '{{{ds.code}.{con}}}' is not in the dataset '{ds.code}'"
                        )
                        return
                ds_concepts = list(requested_measures)
                ds_concepts.extend(list(requested_dimensions))
                all_dimensions_requested = len(all_dimensions) == 0

                if measure_requested and not all_dimensions_requested:
                    self._add_issue(
                        IType.ERROR,
                        f"It is not possible to use a measure ({', '.join(requested_measures)}), if not all dimensions are used "
                        f"(cannot assume implicit aggregation). Dimensions not used: {', '.join(all_dimensions)}"
                    )
                    return
                elif not measure_requested and not all_dimensions_requested:
                    # Reduce the Dataframe to unique tuples of the specified dimensions
                    # TODO Consider the current case-sensitivity setting
                    data = ds.data[list(
                        requested_dimensions)].drop_duplicates()
                else:  # Take the dataset as-is
                    data = ds.data

                # Remove Index, and do it NOT-INPLACE
                data = data.reset_index()

                # Drop rows with empty dimension value
                import numpy as np
                data = data.replace(r'^\s*$', np.NaN, regex=True)
                data.dropna(subset=requested_dimensions, inplace=True)

                const_dict = obtain_dictionary_with_not_expandable_fields(
                    self._fields)  # row?
                var_dict = set(
                    [f for f in self._fields.keys() if f not in const_dict])

                re_concepts = {}
                for c in ds_concepts:
                    c_name = f"{{{ds.code}.{c}}}"
                    if case_sensitive:
                        re_concepts[c_name] = re.compile(c_name)
                    else:
                        re_concepts[c_name] = re.compile(c_name, re.IGNORECASE)

                location = IssueLocation(sheet_name=self._command_name,
                                         row=self._current_row_number,
                                         column=None)
                already_parsed_fields = set(const_dict.keys())
                for ds_row, row2 in enumerate(
                        data.iterrows()):  # Each row in the dataset
                    # Initialize constant values (those with no "{..}" expressions)
                    row3 = const_dict.copy()
                    # Prepare state to evaluate functions
                    state = State()
                    for c in ds_concepts:
                        state.set(f"{ds.code}.{c}", str(row2[1][c]))
                    state.set(
                        "_glb_idx", self._glb_idx
                    )  # Pass PartialRetrievalDictionary to the evaluator. For functions needing it

                    # Evaluate all functions
                    expressions = {}
                    for e, ast in asts.items():
                        res, vars = ast_evaluator(ast,
                                                  state,
                                                  None,
                                                  issues,
                                                  atomic_h_names=True)
                        expressions[e] = res
                    # Expansion into var_dict
                    for f in var_dict:
                        v = self._fields[f]  # Initial value
                        for item in sorted(expressions.keys(),
                                           key=len,
                                           reverse=True):
                            v = v.replace(item, expressions[item])
                        row3[f] = v

                    # # Concepts change dictionary
                    # concepts = {}
                    # for c in ds_concepts:
                    #     concepts[f"{{{ds.code}.{c}}}"] = str(row2[1][c])
                    # # Expansion into var_dict
                    # for f in var_dict:
                    #     v = self._fields[f]  # Initial value
                    #     for item in sorted(concepts.keys(), key=len, reverse=True):
                    #         v = re_concepts[item].sub(concepts[item], v)
                    #     row3[f] = v

                    # Syntactic verification of the resulting expansion
                    processable, tmp_issues = parse_cmd_row_dict(
                        self._serialization_type, row3, already_parsed_fields,
                        location)
                    if len(tmp_issues) > 0:
                        self._issues.extend(tmp_issues)
                    # Process row
                    if processable:
                        self._fields = row3
                        self._process_row(row3, ds_row)
                        self._fields = tmp_fields
            elif len(h_list) == 1:  # Expand hierarchy
                pass
        else:
            self._process_row(self._fields)  # Process row
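
# Hedged illustration (an assumption): how the "{...}" expansion above rewrites field
# values for one dataset row. Dataset and concept names are hypothetical; constant
# fields (no "{" in their text) are copied unchanged, expandable ones are substituted.
fields = {"processor": "Farm_{faostat_crops.item}", "level": "n+1"}
expressions = {"{faostat_crops.item}": "Wheat"}  # Evaluated for the current dataset row
expanded = dict(fields)
for f, v in fields.items():
    if v is not None and "{" in v:
        for item in sorted(expressions.keys(), key=len, reverse=True):
            v = v.replace(item, expressions[item])
        expanded[f] = v
assert expanded == {"processor": "Farm_Wheat", "level": "n+1"}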
    def execute(self, state: "State"):
        """
        For each parent processor clone all the child processors.
        The cloning process may carry over factor observations, which may result in
        new scale relations between the parent's and the cloned child's interfaces.
        """
        some_error = False
        issues = []

        parent_processor_type = self._content["parent_processor_type"]
        child_processor_type = self._content["child_processor_type"]
        scaled_factor = self._content["scaled_factor"]
        source = self._content["source"]
        # column_headers = self._content["column_headers"]
        # row_headers = self._content["row_headers"]
        scales = self._content["scales"]

        # Find processor sets, for parent and child
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        if parent_processor_type not in p_sets:
            some_error = True
            issues.append((
                3, "The processor type '" + parent_processor_type +
                "' (appointed for parent) has not been found in the commands execute so far"
            ))

        if child_processor_type not in p_sets:
            some_error = True
            issues.append((
                3, "The processor type '" + child_processor_type +
                "' (should be child processor) has not been found in the commands execute so far"
            ))

        if some_error:
            return issues, None

        # CREATE the Observer of the Upscaling
        oer = glb_idx.get(Observer.partial_key(source))
        if not oer:
            oer = Observer(source)
            glb_idx.put(oer.key(), oer)
        else:
            oer = oer[0]

        # Processor Sets have associated attributes, and each of them has a code list
        parent = p_sets[parent_processor_type]  # type: ProcessorsSet
        child = p_sets[child_processor_type]  # type: ProcessorsSet

        # Form code lists from the command specification
        code_lists = None
        for sc_dict in scales:
            codes = sc_dict["codes"]
            if not code_lists:
                code_lists = [set() for _ in codes]

            for i, c in enumerate(codes):
                code_lists[i].add(c)

        # Match existing code lists (from Processor attributes) with the ones gathered in the specification of
        # the two (parent and child) processors sets.
        # Form lists of attributes of processors used in the code lists
        parent_attrs = []
        child_attrs = []
        matched = []
        for i, cl in enumerate(code_lists):
            found = False
            for attr, attr_values in parent.attributes.items():
                if set(attr_values).issuperset(cl):
                    parent_attrs.append(
                        (attr, i))  # (Attribute, code list index)
                    found = True
                    break
            for attr, attr_values in child.attributes.items():
                if set(attr_values).issuperset(cl):
                    child_attrs.append(
                        (attr, i))  # (Attribute, code list index)
                    found = True
                    break
            matched.append(found)
        for i, found in enumerate(matched):
            if not found:
                cl = code_lists[i]
                # TODO Try cl as a list of names of parent or child processors
                if not found:
                    issues.append((
                        2, "The code list: " + ", ".join(cl) +
                        " is not contained in the attributes of the parent processors set '"
                        + parent_processor_type +
                        "' nor in the attributes of the child processors set '"
                        + child_processor_type + "'"))

        # Execute the upscale for each
        cached_processors = {}
        for sc_dict in scales:
            try:
                non_zero_weight = math.fabs(float(sc_dict["weight"])) > 1e-6
            except:
                non_zero_weight = True
            if not non_zero_weight:
                continue

            codes = sc_dict["codes"]
            # Find parent processor
            parent_dict = {attr: codes[i] for attr, i in parent_attrs}
            d2s = str(parent_dict)
            if d2s in cached_processors:
                parent = cached_processors[d2s]
                if not parent:
                    issues.append((
                        3, "Either the tuple (" + d2s +
                        ") did not match any Processor or matched more than one."
                    ))
            else:
                parent_dict.update(Processor.partial_key())

                # Obtain Processor matching the attributes <<<<<<<<<<
                # Query the PartialRetrievalDictionary by attributes
                parents = glb_idx.get(parent_dict)

                if len(parents) > 1:
                    issues.append(
                        (3, "The tuple (" + str(parent_dict) + ") matches " +
                         str(len(parents)) + " Processors: " +
                         (", ".join([p.name for p in parents]))))
                    parent = None
                elif len(parents) == 0:
                    issues.append((3, "The tuple (" + str(parent_dict) +
                                   ") did not match any Processor"))
                    parent = None
                else:
                    parent = parents[0]

                cached_processors[d2s] = parent

            # Find child processor
            child_dict = {attr: codes[i] for attr, i in child_attrs}
            d2s = str(child_dict)
            if d2s in cached_processors:
                child = cached_processors[d2s]
                if not child:
                    issues.append((
                        3, "Either the tuple (" + d2s +
                        ") did not match any Processor or matched more than one."
                    ))
            else:
                child_dict.update(Processor.partial_key())

                # Obtain Processors matching the attributes
                # Query the PartialRetrievalDictionary by attributes
                children = glb_idx.get(child_dict)

                if len(children) > 1:
                    issues.append(
                        (3, "The tuple (" + str(child_dict) + ") matches " +
                         str(len(parents)) + " Processors: " +
                         (", ".join([p.name for p in children]))))
                    child = None
                elif len(children) == 0:
                    issues.append((3, "The tuple (" + str(child_dict) +
                                   ") did not match any Processor"))
                    child = None
                else:
                    child = children[0]  # type: Processor

                cached_processors[d2s] = child

            # Clone child processor (and its descendants) and add an upscale relation between "parent" and the clone
            if parent and child:
                if non_zero_weight:
                    # Clone the child processor
                    # TODO
                    cloned_child, cloned_children = child.clone(state=glb_idx)
                    Processor.register([cloned_child] + list(cloned_children),
                                       glb_idx)

                    # Create the new Relation Observations
                    # - Part-of Relation
                    o1 = ProcessorsRelationPartOfObservation.create_and_append(
                        parent, cloned_child, oer)  # Part-of
                    glb_idx.put(o1.key(), o1)
                    # - Upscale Relation
                    quantity = str(sc_dict["weight"])
                    if True:
                        # Find Interface named "scaled_factor"
                        for f in parent.factors:
                            if strcmp(f.name, scaled_factor):
                                origin = f
                                break
                        else:
                            origin = None
                        for f in cloned_child.factors:
                            if strcmp(f.name, scaled_factor):
                                destination = f
                                break
                        else:
                            destination = None

                        if origin and destination:
                            o3 = FactorsRelationScaleObservation.create_and_append(
                                origin,
                                destination,
                                observer=None,
                                quantity=quantity)
                            glb_idx.put(o3.key(), o3)
                        else:
                            raise Exception(
                                "Could not find Interfaces to define a Scale relation. Processors: "
                                + parent.name + ", " + cloned_child.name +
                                "; Interface name: " + scaled_factor)
                    else:
                        o3 = ProcessorsRelationUpscaleObservation.create_and_append(
                            parent,
                            cloned_child,
                            observer=None,
                            factor_name=scaled_factor,
                            quantity=quantity)
                        glb_idx.put(o3.key(), o3)
            else:
                # TODO
                parent_dict = str({attr: codes[i] for attr, i in parent_attrs})
                child_dict = str({attr: codes[i] for attr, i in child_attrs})
                if not parent and child:
                    issues.append((
                        2,
                        "Could not find parent Processor matching attributes: "
                        + parent_dict))
                elif not child and parent:
                    issues.append(
                        (2,
                         "Could not find child Processor matching attributes: "
                         + child_dict))
                else:
                    issues.append((
                        2,
                        "Could not find parent Processor matching attributes: "
                        + parent_dict +
                        ", nor child Processor matching attributes: " +
                        child_dict))

        return issues, None
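
# Hedged illustration (an assumption): the shape of the "scales" entries consumed by the
# loop above. Each entry carries the attribute codes identifying one (parent, child)
# combination plus an upscaling weight; entries with a numeric zero weight are skipped.
example_scales = [
    {"codes": ["ES", "Wheat"], "weight": "0.35"},  # Hypothetical attribute codes
    {"codes": ["ES", "Maize"], "weight": "0"},     # Zero weight -> no clone is created
]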
    def execute(self, state: "State"):
        """
        First bring the data considering the filter
        Second, group, third aggregate
        Finally, store the result in State
        """
        issues = []
        # Obtain global variables in state
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)

        # DS Source + DS Name
        source = self._content["dataset_source"]
        dataset_name = self._content["dataset_name"]

        # Result name
        result_name = self._content["result_name"]
        if result_name in datasets or state.get(result_name):
            issues.append((2, "A dataset called '" + result_name +
                           "' is already stored in the registry of datasets"))

        # Dataset metadata
        dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)
        # Obtain filter parameters
        params = create_dictionary(
        )  # Native dimension name to list of values the filter will allow to pass
        joined_dimensions = []
        for dim in self._content["where"]:
            lst = self._content["where"][dim]
            native_dim = None
            if dim.lower() in ["startperiod", "endperiod"]:
                native_dim = dim
                lst = [lst]
            elif dim not in dims:
                # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR
                for m in mappings:
                    if strcmp(mappings[m].destination, dim) and \
                            strcmp(mappings[m].source, source) and \
                            strcmp(mappings[m].dataset, dataset_name) and \
                            mappings[m].origin in dims:
                        joined_dimensions.append(
                            mappings[m].destination
                        )  # Store dimension in the original case
                        native_dim = mappings[m].origin
                        lst = obtain_reverse_codes(mappings[m].map, lst)
                        break
            else:
                # Get the dimension name with the original case
                native_dim = dims[dim].name
            if native_dim:
                if native_dim not in params:
                    f = set()
                    params[native_dim] = f
                else:
                    f = params[native_dim]
                f.update(lst)

        # Convert param contents from set to list
        for p in params:
            params[p] = [i for i in params[p]]

        # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        ds = nexinfosys.data_source_manager.get_dataset_filtered(
            source, dataset_name, params)
        df = ds.data

        # Join with mapped dimensions (augment it)
        # TODO Prepare an "m" containing ALL the mappings affecting "df"
        # TODO df2 = augment_dataframe_with_mapped_columns(df, m, ["value"])
        # TODO Does it allow adding the new column for the dimension, in case it is requested? Probably yes, but test it
        for m in mappings:
            if strcmp(mappings[m].source, source) and \
                    strcmp(mappings[m].dataset, dataset_name) and \
                    mappings[m].origin in dims:
                # TODO Change by many-to-many mapping
                # TODO augment_dataframe_with_mapped_columns(df, maps, measure_columns)
                # Elaborate a many to one mapping
                tmp = []
                for el in mappings[m].map:
                    for to in el["to"]:
                        if to["d"]:
                            tmp.append([el["o"], to["d"]])
                df_dst = pd.DataFrame(
                    tmp, columns=['sou_rce', mappings[m].destination])
                for di in df.columns:
                    if strcmp(mappings[m].origin, di):
                        d = di
                        if not nexinfosys.case_sensitive:
                            df[d + "_l"] = df[d].str.lower()
                            d = d + "_l"
                        break
                df = pd.merge(df,
                              df_dst,
                              how='left',
                              left_on=d,
                              right_on='sou_rce')
                del df['sou_rce']
                if not nexinfosys.case_sensitive:
                    del df[d]

        # Aggregate (If any dimension has been specified)
        if len(self._content["group_by"]) > 0:
            # Column names where data is
            # HACK: for the case where the measure has been named "obs_value", use "value"
            values = [
                m.lower() if m.lower() != "obs_value" else "value"
                for m in self._content["measures"]
            ]
            out_names = self._content["measures_as"]
            rows = translate_case(self._content["group_by"],
                                  params)  # Group by dimension names
            lcase_rows = [d.lower() for d in rows]
            # Now joined_dimensions
            for d in joined_dimensions:
                if d.lower() in lcase_rows:
                    # Find and replace
                    for i, d2 in enumerate(rows):
                        if strcmp(d, d2):
                            rows[i] = d
                            break

            aggs = []  # Aggregation functions
            agg_names = {}
            for f in self._content["agg_funcs"]:
                if f.lower() in ["avg", "average"]:
                    aggs.append(np.average)
                    agg_names[np.average] = "avg"
                elif f.lower() in ["sum"]:
                    aggs.append(np.sum)
                    agg_names[np.sum] = "sum"
                elif f.lower() in ["count"]:
                    aggs.append(np.size)
                    agg_names[np.size] = "count"
                elif f.lower() in ["sumna"]:
                    aggs.append(np.nansum)
                    agg_names[np.nansum] = "sumna"
                elif f.lower() in ["countav"]:  # countav=="Count Available"
                    aggs.append("count")  # Count number of non-NaN elements
                    agg_names["count"] = "countav"
                elif f.lower() in ["avgav",
                                   "avgna"]:  # avgna=="Average without
                    aggs.append(np.nanmean)
                    agg_names[np.nanmean] = "avgna"
                elif f.lower() in ["pctna"]:  # % of NaN vs total elements
                    aggs.append(pctna)
                    agg_names[pctna] = "pctna"

            # Calculate Pivot Table. The columns are a combination of values x aggregation functions
            # For instance, if two values ["v1", "v2"] and two agg. functions ["avg", "sum"] are provided,
            # the columns will be: [["average", "v1"], ["sum", "v1"], ["average", "v2"], ["sum", "v2"]]
            try:
                # Check that all "rows" on which pivot table aggregates are present in the input "df"
                # If not either synthesize them (only if there is a single filter value) or remove (if not present
                df_columns_dict = create_dictionary(
                    data={c: c
                          for c in df.columns})
                for r in rows.copy():
                    if r not in df_columns_dict:
                        found = False
                        for k in params:
                            if strcmp(k, r):
                                found = True
                                if len(params[k]) == 1:
                                    df[r] = params[k][0]
                                else:
                                    rows.remove(r)
                                    issues.append((
                                        2, "Dimension '" + r +
                                        "' removed from the list of dimensions because it is not present in the raw input dataset."
                                    ))
                                break
                        if not found:
                            rows.remove(r)
                            issues.append((
                                2, "Dimension '" + r +
                                "' removed from the list of dimensions because it is not present in the raw input dataset."
                            ))
                # Put proper DIMENSION names
                for ir, r in enumerate(rows):
                    if r in df_columns_dict:
                        rows[ir] = df_columns_dict[r]

                # Create and register Hierarchy objects from origin Dataset dimensions: state, ds
                ds_columns_dict = create_dictionary(
                    data={c.code: c.code
                          for c in ds.dimensions})
                for r in rows:
                    if r in ds_columns_dict:
                        # Create hierarchy local to the dataset
                        for d in ds.dimensions:
                            if strcmp(r, d.code):
                                if d.code_list:
                                    h = convert_code_list_to_hierarchy(
                                        d.code_list)
                                    h.name = result_name + "_" + r
                                    glb_idx.put(h.key(), h)
                                    break

                # Pivot table using Group by
                # if True:
                groups = df.groupby(by=rows, as_index=False)  # Split
                d = OrderedDict([])
                lst_names = []
                if len(values) == len(aggs):
                    for i, t in enumerate(zip(values, aggs)):
                        v, agg = t
                        if len(out_names) == len(values):
                            if out_names[i]:
                                lst_names.append(out_names[i])
                            else:
                                lst_names.append(agg_names[agg] + "_" + v)
                        else:
                            lst_names.append(agg_names[agg] + "_" + v)
                        lst = d.get(v, [])
                        lst.append(agg)
                        d[v] = lst
                else:
                    for v in values:
                        lst = d.get(v, [])
                        for agg in aggs:
                            lst.append(agg)
                            lst_names.append(agg_names[agg] + "_" + v)
                        d[v] = lst
                # Print NaN values for each value column
                for v in set(values):
                    cnt = df[v].isnull().sum()
                    print("NA count for col '" + v + "': " + str(cnt) +
                          " of " + str(df.shape[0]))
                # AGGREGATE !!
                df2 = groups.agg(d)

                # Rename the aggregated columns
                df2.columns = rows + lst_names
                # else:
                #     # Pivot table
                #     df2 = pd.pivot_table(df,
                #                          values=values,
                #                          index=rows,
                #                          aggfunc=[aggs[0]], fill_value=np.NaN, margins=False,
                #                          dropna=True)
                #     # Remove the multiindex in columns
                #     df2.columns = [col[-1] for col in df2.columns.values]
                #     # Remove the index
                #     df2.reset_index(inplace=True)
                # The result, all columns (no index), is stored for later use
                ds.data = df2
            except Exception as e:
                issues.append(
                    (3, "There was a problem with the grouping: " + repr(e)))

        # Store the dataset in State
        datasets[result_name] = ds

        return issues, None
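
# Hedged stand-alone sketch (an assumption, with made-up column names): the pandas
# pattern used by the grouping step above. Rows are split by the "group by" dimensions,
# each measure gets a list of aggregation functions, and the resulting columns are
# renamed to flat "<agg>_<measure>" names.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "geo": ["es", "es", "de"],
    "year": ["2020", "2020", "2020"],
    "value": [1.0, 2.0, 3.0],
})
rows = ["geo", "year"]        # Group-by dimensions
d = {"value": [np.sum]}       # Measure -> aggregation functions
lst_names = ["sum_value"]     # Flat output column names

df2 = df.groupby(by=rows, as_index=False).agg(d)
df2.columns = rows + lst_names  # One column per (measure, aggregation) pair
# df2 now holds one row per (geo, year) with the summed "value"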