Example #1
def parse_metadata_command(sh: Worksheet, area: AreaTupleType, name: str = None) -> IssuesLabelContentTripleType:
    """
    Most "parse" methods are mostly syntactic (as opposed to semantic). They do not check existence of names.
    But in this case, the valid field names are fixed beforehand, so they are checked at this time.
    Some of the fields will be controlled also, according to some

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :return: list of issues (issue_type, message), command label, command content
    """
    some_error = False
    issues = []
    controlled = create_dictionary()
    mandatory = create_dictionary()
    keys = create_dictionary()
    for t in metadata_fields:
        controlled[t[4]] = t[3]
        mandatory[t[4]] = t[2]
        keys[t[0]] = t[4]

    # Scan the sheet, the first column must be one of the keys of "k_list", following
    # columns can contain repeating values

    # Map key to a list of values
    content = {}  # Dictionary of lists, one per metadata key
    for r in range(area[0], area[1]):
        label = sh.cell(row=r, column=area[2]).value
        if label in keys:
            key = keys[label]
            for c in range(area[2]+1, area[3]):
                value = sh.cell(row=r, column=c).value
                if value:
                    value = str(value).strip()
                    if controlled[key]:
                        # Control "value" if the field is controllable
                        cl = {"dimensions": ["water", "energy", "food", "land", "climate"],
                              "subject_topic_keywords": None,
                              "geographical_level": ["local", "regional", "region", "country", "europe", "global", "sectoral", "sector"],
                              "geographical_situation": None,  # TODO Read the list of all geographical regions (A long list!!)
                              "restriction_level": ["internal", "confidential", "public"],
                              "language": None,  # TODO Read the list of ALL languages (or just "English"??)
                              }
                        if cl[key] and value.lower() not in cl[key]:
                            issues.append((3, "The key '"+key+"' should be one of: "+",".join(cl[key])))

                    if key not in content:
                        content[key] = []
                    content[key].append(value)
        else:
            issues.append((2, "Row "+str(r)+": unknown metadata label '"+label+"'"))

    for key in keys.values():
        if mandatory[key] and key not in content:
            some_error = True
            issues.append((3, "The value '"+key+"' is mandatory in the definition of the metadata"))

    return issues, None, content
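
A note on the structure assumed above: from the tuple indices used here (t[0], t[2], t[3], t[4]) and t[1] in generate_dublin_core_xml further below, each metadata_fields entry appears to hold at least the worksheet label, the Dublin Core element name, a mandatory flag, a controlled flag and the internal key. A hypothetical entry, for illustration only:

# Hypothetical metadata_fields entry (values are made up; only the layout is inferred from the code above)
example_field = (
    "Case study name",   # t[0]: label expected in the worksheet
    "title",             # t[1]: Dublin Core element name (used by generate_dublin_core_xml)
    True,                # t[2]: mandatory?
    False,               # t[3]: controlled vocabulary?
    "case_study_name",   # t[4]: internal key
)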
Example #2
        def process_line(item):
            # Read variables
            mh_src_dataset = item.get("source_dataset", None)
            mh_src_hierarchy = item.get("source_hierarchy", None)
            mh_src_code = item.get("source_code", None)
            mh_dst_hierarchy = item.get("destination_hierarchy", None)
            mh_dst_code = item.get("destination_code", None)
            mh_weight = item.get("weight", 1.0)

            # Mapping name
            name = ((mh_src_dataset + ".") if mh_src_dataset else
                    "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy

            if name in mappings:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The mapping '" + name +
                          "' has been declared previously. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if name in local_mappings:
                d = local_mappings[name]
            else:
                d = DottedDict()
                local_mappings[name] = d
                d.name = name
                d.origin_dataset = mh_src_dataset
                d.origin_hierarchy = mh_src_hierarchy
                d.destination_hierarchy = mh_dst_hierarchy
                d.mapping = create_dictionary()

            # Specific code
            if mh_src_code in d.mapping:
                to_dict = d.mapping[mh_src_code]
            else:
                to_dict = create_dictionary()
            if mh_dst_code in to_dict:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The mapping of '" + mh_src_code +
                          "' into '" + mh_dst_code +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return
            else:
                to_dict[mh_dst_code] = (
                    mh_weight, r
                )  # NOTE: This could be an object instead of just a FLOAT or expression
                d.mapping[mh_src_code] = to_dict
Example #3
def lcia_method(indicator: str, method: str=None, horizon: str=None,
                state: State=None, lcia_methods: PartialRetrievalDictionary=None):
    """

    :param indicator: Indicator name
    :param method: LCIA method weighting
    :param horizon: Time horizon
    :param state: Current values of processor plus parameters
    :param lcia_methods: Where LCIA data is collected
    :return: A dictionary of computed indicator values, one per (indicator, method, horizon) combination
    """
    if indicator is None or indicator.strip() == "":
        return None

    k = dict(d=indicator)
    if method:
        k["m"] = method
    if horizon:
        k["h"] = horizon
    ms = lcia_methods.get(key=k, key_and_value=True)
    indices = create_dictionary()
    for k, v in ms:
        idx_name = f'{k["d"]}_{k["m"]}_{k["h"]}'
        if idx_name in indices:
            lst = indices[idx_name]
        else:
            lst = []
            indices[idx_name] = lst
        lst.append((k["i"], v[0], float(v[1])))

    ifaces = create_dictionary()
    for t in state.list_namespace_variables():
        if not t[0].startswith("_"):
            p = t[1]  # * ureg(iface_unit)
            ifaces[t[0]] = p

    res = dict()
    for name, lst in indices.items():
        interfaces = []
        weights = []  # Weights taken from the LCIA method data
        for t in lst:
            if t[0] in ifaces:
                v = ifaces[t[0]]  # TODO .to(t[1])
                interfaces.append(v)
                weights.append(t[2])
        # Calculate the value
        ind = np.sum(np.multiply(interfaces, weights))  # * ureg(indicator_unit)
        res[name] = ind

    return res
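
The indicator value above reduces to a weighted sum of interface values; a minimal standalone sketch of that step (hypothetical numbers):

import numpy as np

interface_values = [10.0, 2.5]   # hypothetical interface quantities taken from the state
weights = [0.5, 2.0]             # characterization weights collected from the LCIA method data
indicator = float(np.sum(np.multiply(interface_values, weights)))
assert abs(indicator - 10.0) < 1e-9   # 10.0*0.5 + 2.5*2.0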
Example #4
    def obtain_dataset_source(dset_name, local_datasets=None):
        from nexinfosys.ie_imports.data_sources.ad_hoc_dataset import AdHocDatasets
        # Register AdHocDatasets (keep a handle so it can be unregistered afterwards)
        adhoc = None
        if local_datasets:
            if "AdHoc" not in nexinfosys.data_source_manager.registry:
                adhoc = AdHocDatasets(local_datasets)
                nexinfosys.data_source_manager.register_datasource_manager(
                    adhoc)

        # Obtain the list of ALL datasets, and find the desired one, then find the source of the dataset
        lst = nexinfosys.data_source_manager.get_datasets(
            None, None, local_datasets)  # ALL Datasets, (source, dataset)
        ds = create_dictionary(data={
            d[0]: t[0]
            for t in lst for d in t[1]
        })  # Dataset to Source (to obtain the source given the dataset name)

        if dset_name in ds:
            source = ds[dset_name]
        else:
            source = None

        # Unregister AdHocDatasets (only if it was registered here)
        if local_datasets and adhoc:
            nexinfosys.data_source_manager.unregister_datasource_manager(adhoc)

        return source
Example #5
 def test_002_many_to_many_1(self):
     # Prepare a many to many map from category set to category set
     # Prepare a simple DataFrame containing
     m = create_dictionary()
     m["cat_o_1"] = ("cat_d_1", {
         "c11": [{
             "d": "c21",
             "w": 0.6
         }, {
             "d": "c22",
             "w": 0.4
         }],
         "c12": [{
             "d": "c23",
             "w": 1.0
         }],
         "c13": [{
             "d": "c23",
             "w": 1.0
         }]
     })
     # Prepare a simple DataFrame
     df = pd.DataFrame(data=[["c11", 4], ["c12", 3], ["c13", 1.5]],
                       columns=["cat_o_1", "value"])
     # Call
     df2 = augment_dataframe_with_mapped_columns(df, m, ["value"])
     # Check result
     self.assertEqual(list(df2.columns), ["cat_o_1", "cat_d_1", "value"])
     self.assertEqual(df2.shape, (4, 3))
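
For reference, the expansion implied by the asserted (4, 3) shape: "c11" maps to two destination categories while "c12" and "c13" map to one each, so the three input rows become four:

# cat_o_1  cat_d_1   (the "value" column is carried along, presumably weighted by "w")
# c11      c21
# c11      c22
# c12      c23
# c13      c23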
Example #6
    def execute(self, state: "State"):
        any_error = False
        issues = []
        sheet_name = self._content["command_name"]
        # Obtain global variables in state
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

        scenarios = create_dictionary()

        for r, param in enumerate(self._content["items"]):
            parameter = param["parameter"]
            scenario = param.get("scenario_name")
            p = glb_idx.get(Parameter.partial_key(parameter))
            if len(p) == 0:
                issues.append(Issue(itype=IType.ERROR,
                                    description="The parameter '" + parameter + "' has not been declared previously.",
                                    location=IssueLocation(sheet_name=sheet_name, row=r, column=None)))
                any_error = True
                continue

            p = p[0]
            name = parameter

            value = param.get("parameter_value")

            check_parameter_value(glb_idx, p, value, issues, sheet_name, r)

            description = param.get("description")  # For readability of the workbook. Not used for solving
            if scenario:
                if scenario in scenarios:
                    sp = scenarios[scenario]
                else:
                    sp = create_dictionary()
                    scenarios[scenario] = sp
                sp[name] = value
            else:
                p.current_value = value
                p.default_value = value

        if not any_error:
            solver_parameters = {}  # {p.name: p.current_value for p in glb_idx.get(Parameter.partial_key()) if p.group and strcmp(p.group, "NISSolverParameters")}
            if len(scenarios) == 0:
                scenarios["default"] = create_dictionary()
            ps = ProblemStatement(solver_parameters, scenarios)
            glb_idx.put(ps.key(), ps)

        return issues, None
Example #7
def serialize_state(state: State):
    """
    Serialization prepared for a given organization of the state

    :return:
    """
    def serialize_dataframe(df):
        return df.to_json(orient="split", index=False), \
               json.dumps({i[0]: str(i[1]) for i in df.dtypes.to_dict().items()})
        # list(df.index.names), df.to_dict()

    print("  serialize_state IN")

    import copy
    # "_datasets"
    ns_ds = {}
    # Save and nullify before deep copy
    for ns in state.list_namespaces():
        _, _, _, datasets, _ = get_case_study_registry_objects(state, ns)
        ns_ds[ns] = datasets
        state.set("_datasets", create_dictionary(), ns)  # Nullify datasets

    # !!! WARNING: It destroys "state", so a DEEP COPY is performed !!!
    tmp = sys.getrecursionlimit()
    sys.setrecursionlimit(10000)
    state2 = copy.deepcopy(state)
    sys.setrecursionlimit(tmp)

    # Iterate all namespaces
    for ns in state2.list_namespaces():
        glb_idx, p_sets, hh, _, mappings = get_case_study_registry_objects(
            state2, ns)
        if glb_idx:
            tmp = glb_idx.to_pickable()
            state2.set("_glb_idx", tmp, ns)
        datasets = ns_ds[ns]
        # TODO Serialize other DataFrames.
        # Process Datasets
        for ds_name in datasets:
            ds = datasets[ds_name]
            if isinstance(ds.data, pd.DataFrame):
                tmp = serialize_dataframe(ds.data)
            else:
                tmp = None
                # ds.data = None
            # DB serialize the datasets
            lst2 = serialize(ds.get_objects_list())
            lst2.append(tmp)  # Append the serialized DataFrame
            datasets[ds_name] = lst2
        state2.set("_datasets", datasets, ns)
    tmp = serialize_from_object(
        state2)  # <<<<<<<< SLOWEST !!!! (when debugging)
    print("  serialize_state length: " + str(len(tmp)) + " OUT")
    tmp = blosc.compress(bytearray(tmp, "utf-8"), cname="zlib", typesize=8)
    print("  serialize_state compressed length: " + str(len(tmp)) + " OUT")

    return tmp
Example #8
def obtain_dictionary_with_literal_fields(item, asts):
    d = create_dictionary()
    for f in item:
        if not f.startswith("_"):
            ast = asts[f]
            if "complex" not in ast or ("complex" in ast
                                        and not ast["complex"]):
                d[f] = item[f]
    return d
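
A minimal illustration of the filtering rule above, using plain dictionaries in place of the parsed item and its ASTs (hypothetical field names):

item = {"Processor": "p1", "Value": "a * 2", "_row": 5}
asts = {"Processor": {"complex": False}, "Value": {"complex": True}}
# Fields starting with "_" are skipped, and so are fields whose AST is marked "complex",
# so only {"Processor": "p1"} would remain in the returned dictionary.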
Example #9
    def initialize_datasets_registry(self, datasets_list: Dict[str, Dataset]):
        """
        Receive a dictionary of datasets and register each of them in a fresh registry

        :param datasets_list: Dictionary mapping dataset name to Dataset
        :return: None
        """
        self._registry = create_dictionary()
        for ds_name, ds in datasets_list.items():
            self.register_dataset(ds.code, ds)
Example #10
    def construct(name: str, description: str, levels: List[str], codes: List[CodeImmutable]):
        """

        :param name: Name of the Code List
        :param description: Description of the Code List
        :param levels: Names of the levels
        :param codes: List of codes, including in each the following tuple: CodeImmutable = namedtuple("CodeTuple", "code description level children")
        :return:
        """

        cl = CodeList()
        cl.code = name
        cl.description = description
        # Levels
        levels_dict = create_dictionary()
        for l in levels:
            cll = CodeListLevel()
            cll.code_list = cl  # Point to the containing CodeList
            cll.code = l
            cll.description = None
            levels_dict[l] = cll
        # Codes
        codes_dict = create_dictionary()
        for ct in codes:
            c = Code()
            c.code = ct.code
            c.description = ct.description
            if ct.level in levels_dict:
                c.level = levels_dict[ct.level]  # Point to the containing CodeListLevel
            else:
                c.level = None
            codes_dict[ct.code] = c
            c.children = []
            c.parents = []
        # Set children & parents
        for ct in codes:
            for ch in ct.children:
                if ch in codes_dict:
                    c.children.append(codes_dict[ch])
                    codes_dict[ch].parents.append(c)

        return cl
Example #11
    def list_all_names(self):
        """
            Returns a list of the names of registered entities considering the scopes
            Start from top level, end in bottom level (the current one, which takes precedence)
            :return:
        """
        t = create_dictionary()
        for scope in self.__scope:
            t.update(scope._registry)

        return t.keys()
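
A standalone sketch of the precedence rule described in the docstring, with plain dicts standing in for the per-scope registries (later scopes override earlier ones via update):

scopes = [{"a": 1, "b": 1}, {"b": 2}]   # outermost scope first, current scope last
merged = {}
for registry in scopes:
    merged.update(registry)             # the current (last) scope takes precedence
assert merged == {"a": 1, "b": 2}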
Example #12
 def obtain_problem_statement(
         dynamic_scenario_parameters: Dict = None) -> ProblemStatement:
     """
     Obtain a ProblemStatement instance
     Obtain the solver parameters plus a list of scenarios
     :param dynamic_scenario_parameters:
     :return:
     """
     if dynamic_scenario_parameters is not None:
         scenarios = create_dictionary()
         scenarios["dynamic"] = create_dictionary(
             dynamic_scenario_parameters)
         return ProblemStatement(scenarios=scenarios)
     else:
         ps_list: List[ProblemStatement] = glb_idx.get(
             ProblemStatement.partial_key())
         if len(ps_list) == 0:
             # No scenarios (dummy), and use the default solver
             scenarios = create_dictionary()
             scenarios["default"] = create_dictionary()
             return ProblemStatement(scenarios=scenarios)
         else:
             return ps_list[0]
Example #13
def get_case_study_registry_objects(state, namespace=None):
    """
    Obtain the main entries of the state

    :param state: Input state (modified also)
    :param namespace: State supports several namespaces. This one serves to specify which one. Default=None
    :return: Tuple: (global index, processor sets, hierarchies, datasets, mappings)
    """
    # Index of ALL objects
    glb_idx = state.get("_glb_idx", namespace)
    if not glb_idx:
        glb_idx = PartialRetrievalDictionary()
        state.set("_glb_idx", glb_idx, namespace)

    # ProcessorSet dict (dict of sets)
    p_sets = state.get("_processor_sets", namespace)
    if not p_sets:
        p_sets = create_dictionary()
        state.set("_processor_sets", p_sets, namespace)

    # Hierarchies Dict
    hh = state.get("_hierarchies", namespace)
    if not hh:
        hh = create_dictionary()
        state.set("_hierarchies", hh, namespace)
    # Datasets Dict
    datasets = state.get("_datasets", namespace)
    if not datasets:
        datasets = create_dictionary()
        state.set("_datasets", datasets, namespace)
    # Mappings Dict
    mappings = state.get("_mappings", namespace)
    if not mappings:
        mappings = create_dictionary()
        state.set("_mappings", mappings, namespace)

    return glb_idx, p_sets, hh, datasets, mappings
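
Each block above follows the same get-or-create pattern against the state; a minimal standalone sketch of that pattern (a plain dict stands in for State and create_dictionary):

def get_or_create(state: dict, key: str, factory):
    # Return the entry stored under "key", creating and registering it on first access
    value = state.get(key)
    if not value:
        value = factory()
        state[key] = value
    return value

state = {}
datasets = get_or_create(state, "_datasets", dict)           # created on first access
datasets["some_ds"] = object()                               # once non-empty ...
assert get_or_create(state, "_datasets", dict) is datasets   # ... later calls reuse it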
Example #14
 def test_003_many_to_many_2(self):
     # Prepare a many to many map from category set to category set
     # Prepare a simple DataFrame containing
     m = create_dictionary()
     m["cat_o_1"] = ("cat_d_1", {
         "c11": [{
             "d": "c21",
             "w": 0.6
         }, {
             "d": "c22",
             "w": 0.4
         }],
         "c12": [{
             "d": "c23",
             "w": 1.0
         }],
         "c13": [{
             "d": "c23",
             "w": 1.0
         }]
     })
     m["cat_o_2"] = ("cat_d_2", {
         "c31": [{
             "d": "c41",
             "w": 0.3
         }, {
             "d": "c42",
             "w": 0.7
         }],
         "c32": [{
             "d": "c43",
             "w": 1.0
         }],
         "c33": [{
             "d": "c43",
             "w": 1.0
         }]
     })
     # Prepare a simple DataFrame
     df = pd.DataFrame(data=[["c11", "c31", 4], ["c12", "c32", 3],
                             ["c13", "c31", 1.5]],
                       columns=["cat_o_1", "cat_o_2", "value"])
     # >>>>> Call Cython ACCELERATED Function <<<<<
     df2 = augment_dataframe_with_mapped_columns2(df, m, ["value"])
     # Check result
     self.assertEqual(list(df2.columns),
                      ["cat_o_1", "cat_o_2", "cat_d_1", "cat_d_2", "value"])
     self.assertEqual(df2.shape, (7, 5))
Example #15
    def execute(self, state: "State"):
        """
        Create a set of linear scale conversions, from factor type to factor type
        """
        some_error = False
        issues = []

        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)

        origin_factor_types = self._content["origin_factor_types"]
        destination_factor_types = self._content["destination_factor_types"]
        scales = self._content["scales"]

        # Check that we have valid factor type names
        fts = create_dictionary()
        for ft_name in origin_factor_types + destination_factor_types:
            # Obtain (maybe Create) the mentioned Factor Types
            p, ft, f = find_or_create_observable(
                glb_idx,
                ft_name,
                Observer.no_observer_specified,
                None,
                proc_external=None,
                proc_attributes=None,
                proc_location=None,
                fact_roegen_type=None,
                fact_attributes=None,
                fact_incoming=None,
                fact_external=None,
                fact_location=None)
            if not ft:
                some_error = True
                issues.append((3, "Could not obtain/create the Factor Type '" +
                               ft_name + "'"))
            fts[ft_name] = ft

        if some_error:
            return issues, None

        for sc in scales:
            origin = fts[sc["origin"]]
            destination = fts[sc["destination"]]
            scale = sc["scale"]
            FactorTypesRelationUnidirectionalLinearTransformObservation.create_and_append(
                origin, destination, scale, Observer.no_observer_specified)

        return None, None
Example #16
def get_processor_names_to_processors_dictionary(
        state: PartialRetrievalDictionary):
    """
    Obtain a dictionary with all processor names (a processor may have multiple names) and
    the corresponding Processor object

    :param state:
    :return:
    """
    ps = state.get(Processor.partial_key())
    ps = set(ps)  # Avoid repeating Processor objects
    d = create_dictionary()
    for p in ps:
        for n in p.full_hierarchy_names(state):
            d[n] = p
    return d
Example #17
def read_geojson(url):
    """
    Read a GeoJSON file and index it by ID

    :param url:
    :return: A tuple with the deserialized GeoJSON file and an index of ID to position in the features list
    """
    if url not in in_files:
        f = urllib.request.urlopen(url)
        j = geojson.loads(f.read())
        id_dict = create_dictionary()
        for i, f in enumerate(j["features"]):
            fid = f["id"]
            id_dict[fid] = i
        in_files[url] = (j, id_dict)
    else:
        j, id_dict = in_files[url]

    return j, id_dict
Example #18
def convert_code_list_to_hierarchy(cl, as_list=False):
    """
    Receives a list of codes. Codes are sorted lexicographically (so that numbers are handled consistently).

    Two types of coding scheme are supported, assuming that trailing zeros can be ignored when matching
    parent -> child relations: uniformly sized codes (padded with trailing zeros) and codes of growing length.

    A shorter code sharing a common prefix with a longer one is considered its parent.

    :param cl:
    :param as_list: if True, return a flat tree (all nodes are siblings, descending from a single root)
    :return:
    """

    def can_be_child(parent_candidate, child_candidate):
        # Strip zeros to the right, from parent_candidate, and
        # check if the child starts with the resulting substring
        return child_candidate.startswith(parent_candidate.rstrip("0"))

    root = Node("")
    path = [root]
    code_to_node = create_dictionary()
    for c in sorted(cl):
        if as_list:
            n = Node(c, path[-1])
        else:
            found = False
            while len(path) > 0 and not found:
                if can_be_child(path[-1].name, c):
                    found = True
                else:
                    path.pop()
            if c.rstrip("0") == path[-1].name:
                # Just modify (it may enter here only in the root node)
                path[-1].name = c
                n = path[-1]
            else:
                # Create node and append it to the active path
                n = Node(c, path[-1])
                path.append(n)
        code_to_node[c] = n  # Map the code to the node

    return root, code_to_node
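
The parent test above boils down to a prefix check after stripping the parent's trailing zeros; a standalone illustration of that rule on both supported coding schemes:

def can_be_child(parent_candidate: str, child_candidate: str) -> bool:
    # Same rule as above: ignore the parent's trailing (padding) zeros, then prefix-match
    return child_candidate.startswith(parent_candidate.rstrip("0"))

assert can_be_child("1100", "1110")      # uniformly sized codes with trailing zeros
assert can_be_child("11", "111")         # growing-length codes
assert not can_be_child("12", "111")     # different prefix -> not a parent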
Example #19
def generate_dublin_core_xml(content):
    """
    Generate an XML string with a Simple Dublin Core Record from a Case Study Metadata Command Content
    :param content:
    :return:
    """
    controlled = create_dictionary()
    for t in metadata_fields:
        controlled[t[4]] = t

    s = """<?xml version="1.0"?>
<caseStudyMetadata xmlns="http://magic-nexus.org/dmp/" xmlns:dc="http://purl.org/dc/elements/1.1/">
"""
    for key in content:
        k = controlled[key][1]
        if k:
            for l in content[key]:
                s += "    <dc:" + k + ">" + escape(str(l)) + "</dc:" + k + ">\n"

    s += "</caseStudyMetadata>\n"

    return s
Example #20
def dictionary_from_key_value_list(kvl, state: State = None):
    """
    From a string containing a list of keys and values, return a dictionary
    Keys must be literals, values can be expressions, to be evaluated at a later moment

    (syntactic validity of expressions is not checked here)

    :param kvl: String containing the list of keys and values
    :raise Exception: If syntactic problems occur
    :return: A dictionary
    """
    pairs = kvl.split(",")
    d = create_dictionary()
    for p in pairs:
        parts = p.split("=", maxsplit=1)
        if len(parts) != 2 or not parts[0].strip():
            raise Exception(
                "Each key-value pair must be separated by '=' and the key has to be defined, the value can be empty: " + kvl)
        else:
            k, v = parts
            try:
                k = k.strip()
                v = v.strip()
                string_to_ast(simple_ident, k)
                try:
                    # Simplest: string
                    string_to_ast(quotedString, v)
                    v = v[1:-1]
                except:
                    issues = []
                    ast = string_to_ast(expression_with_parameters, v)
                    res, unres = ast_evaluator(ast, state, None, issues)
                    if len(unres) == 0:
                        v = res

                d[k] = v
            except:
                raise Exception("Key must be a string: " + k + " in key-value list: " + kvl)
    return d
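
A sketch of the expected input and output (illustrative only; evaluating expression values requires a State in which the referenced parameters are defined):

# Hypothetical key-value list: keys are identifiers, values are quoted strings or expressions
kvl = 'level="n-1", weight=2*k'
# With a state where the parameter k evaluates to 3, the result would be roughly:
# {"level": "n-1", "weight": 6}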
Example #21
    def _process_row(self, fields: Dict[str, Any], subrow=None) -> None:
        """
        Create and register Benchmark object

        :param fields:
        """
        name = fields["benchmark"]
        benchmark_group = fields["benchmark_group"]
        stakeholders = fields["stakeholders"]
        b = self._glb_idx.get(Benchmark.partial_key(name=name))
        if len(b) == 1:
            b = b[0]
        elif len(b) == 0:
            b = Benchmark(name, benchmark_group,
                          stakeholders.split(",") if stakeholders else [])
            self._glb_idx.put(b.key(), b)
        else:
            self._add_issue(
                IType.ERROR,
                f"There are {len(b)} instances of the Benchmark '{name}'" +
                subrow_issue_message(subrow))
            return

        # Add range, if not repeated
        category = fields["category"]
        if category not in b.ranges:
            b.ranges[category] = create_dictionary(
                data=dict(range=fields["range"],
                          unit=fields["unit"],
                          category=category,
                          label=fields["label"],
                          description=fields["description"]))
        else:
            self._add_issue(
                IType.WARNING, f"Range with category '{category}' repeated" +
                subrow_issue_message(subrow))
Example #22
    def execute(self, state: "State"):
        def process_line(item):
            # Read variables
            mh_src_dataset = item.get("source_dataset", None)
            mh_src_hierarchy = item.get("source_hierarchy", None)
            mh_src_code = item.get("source_code", None)
            mh_dst_hierarchy = item.get("destination_hierarchy", None)
            mh_dst_code = item.get("destination_code", None)
            mh_weight = item.get("weight", 1.0)

            # Mapping name
            name = ((mh_src_dataset + ".") if mh_src_dataset else
                    "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy

            if name in mappings:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The mapping '" + name +
                          "' has been declared previously. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if name in local_mappings:
                d = local_mappings[name]
            else:
                d = DottedDict()
                local_mappings[name] = d
                d.name = name
                d.origin_dataset = mh_src_dataset
                d.origin_hierarchy = mh_src_hierarchy
                d.destination_hierarchy = mh_dst_hierarchy
                d.mapping = create_dictionary()

            # Specific code
            if mh_src_code in d.mapping:
                to_dict = d.mapping[mh_src_code]
            else:
                to_dict = create_dictionary()
            if mh_dst_code in to_dict:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The mapping of '" + mh_src_code +
                          "' into '" + mh_dst_code +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return
            else:
                to_dict[mh_dst_code] = (
                    mh_weight, r
                )  # NOTE: This could be an object instead of just a FLOAT or expression
                d.mapping[mh_src_code] = to_dict

        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        local_mappings = create_dictionary()

        # Process parsed information
        for line in self._content["items"]:
            r = line["_row"]
            # If the line contains a reference to a dataset or hierarchy, expand it
            # If not, process it directly
            is_expansion = False
            if is_expansion:
                # TODO Iterate through dataset and/or hierarchy elements, producing a list of new items
                pass
            else:
                process_line(line)

        # Mappings post-processing
        for d in local_mappings:
            # Convert the mapping into:
            # [{"o": "", "to": [{"d": "", "w": ""}]}]
            # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ]
            mapping = []
            ds_rows = []  # Rows in which a dataset is mentioned
            for orig in local_mappings[d].mapping:
                lst = []
                for dst in local_mappings[d].mapping[orig]:
                    t = local_mappings[d].mapping[orig][dst]
                    lst.append(dict(d=dst, w=t[0]))
                    if local_mappings[d].origin_dataset:
                        ds_rows.append(t[1])
                mapping.append(dict(o=orig, to=lst))
            from nexinfosys.ie_imports.data_source_manager import DataSourceManager
            if local_mappings[d].origin_dataset:
                if not DataSourceManager.obtain_dataset_source(
                        local_mappings[d].origin_dataset, datasets):
                    for r in ds_rows:
                        issues.append(
                            Issue(
                                itype=IType.ERROR,
                                description=
                                f"The dataset '{local_mappings[d].origin_dataset}' was not found",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                    continue
                dims, attrs, meas = obtain_dataset_metadata(
                    local_mappings[d].origin_dataset, None, datasets)
                if local_mappings[d].origin_hierarchy not in dims:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The origin dimension '" +
                              local_mappings[d].origin_hierarchy +
                              "' does not exist in dataset '" +
                              local_mappings[d].origin_dataset + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                else:
                    dim = dims[local_mappings[d].origin_hierarchy]
                    mapping = fill_map_with_all_origin_categories(dim, mapping)
            #
            origin_dataset = local_mappings[d].origin_dataset
            origin_hierarchy = local_mappings[d].origin_hierarchy
            destination_hierarchy = local_mappings[d].destination_hierarchy
            # Create Mapping and add it to Case Study mappings variable
            mappings[d] = Mapping(
                d,
                DataSourceManager.obtain_dataset_source(
                    origin_dataset, datasets), origin_dataset,
                origin_hierarchy, destination_hierarchy, mapping)

        # TODO
        # Use the function to perform many to many mappings, "augment_dataframe_with_mapped_columns"
        # Put it to work !!!

        # Could one or more mappings in sequence be specified? The key is "source hierarchy + destination hierarchy"
        # Read mapping parameters

        return issues, None
Example #23
import geojson
import urllib

from nexinfosys.common.helper import create_dictionary
from nexinfosys.model_services import get_case_study_registry_objects
from nexinfosys.models.musiasem_concepts import Processor, GeographicReference

in_files = create_dictionary()  # URL -> (json, idx)


def read_geojson(url):
    """
    Read a GeoJSON file and index it by ID

    :param url:
    :return: A tuple with the deserialized GeoJSON file and an index of ID to position in the features list
    """
    if url not in in_files:
        f = urllib.request.urlopen(url)
        j = geojson.loads(f.read())
        id_dict = create_dictionary()
        for i, f in enumerate(j["features"]):
            fid = f["id"]
            id_dict[fid] = i
        in_files[url] = (j, id_dict)
    else:
        j, id_dict = in_files[url]

    return j, id_dict

Example #24
 def __init__(self, name=None):
     self._name = name  # A name for the scope itself
     self._registry = create_dictionary()
Example #25
    def __init__(self, d: Dict[str, Any] = None):
        self._default_namespace = ""
        self._namespaces = create_dictionary()

        if d is not None and len(d) > 0:
            self.update(d)
Example #26
def map_codelists(src, dst, corresp, dst_tree=False) -> (list, set):
    """
    Obtain map of two code lists
    If the source is a tree, children of a mapped node are assigned to the same mapped node
    The same source may be mapped more than once, to different nodes
    The codes from the source not mapped, are stored in "unmapped"

    :param src: source full code list
    :param dst: destination full code list
    :param corresp: list of tuples with the correspondence
    :param dst_tree: Is the dst code list a tree?
    :return: List of tuples (source code, target code), set of unmapped codes
    """

    def assign(n: str, v: str):
        """
        Assign a destination code name to a source code name
        If the source has children, assign the same destination to children, recursively

        :param n: Source code name
        :param v: Destination code name
        :return:
        """
        mapped.add(n, v)
        if n in unmapped:
            unmapped.remove(n)
        for c in cn_src[n].children:
            assign(c.name, v)

    unmapped = set(src)
    r_src, cn_src = convert_code_list_to_hierarchy(src, as_list=True)
    if dst_tree:
        r_dst, cn_dst = convert_code_list_to_hierarchy(dst)
    else:
        cn_dst = create_dictionary()
        for i in dst:
            cn_dst[i] = None  # Simply create the entry
    mapped = create_dictionary(multi_dict=True)  # MANY TO MANY
    for t in corresp:
        if t[0] in cn_src and t[1] in cn_dst:
            # Check that t[1] is a leaf node. If not, ERROR
            if isinstance(cn_dst[t[1]], Node) and len(cn_dst[t[1]].children) > 0:
                # TODO ERROR: the target destination code is not a leaf node
                pass
            else:
                # Node and its children (recursively) correspond to t[1]
                assign(t[0], t[1])

    for k in sorted(unmapped):
        print("Unmapped: " + k)
    # for k in sorted(r):
    #     print(k+" -> "+r[k])

    # Convert mapped to a list of tuples
    # Upper case
    mapped_lst = []
    for k in mapped:
        for i in mapped.getall(k):
            mapped_lst.append((k, i))

    return mapped_lst, unmapped
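
The mapped structure is a many-to-many (multi-value) dictionary that is finally flattened into (source, target) tuples; a standalone sketch of that flattening step, with a plain dict of lists and hypothetical codes:

mapped = {"11": ["A"], "12": ["A", "B"]}   # one source code may map to several targets
mapped_lst = [(k, v) for k, values in mapped.items() for v in values]
# -> [("11", "A"), ("12", "A"), ("12", "B")]
assert ("12", "B") in mapped_lst and len(mapped_lst) == 3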
Example #27
def parse_data_input_command(sh: Worksheet,
                             area: AreaTupleType,
                             processors_type: str,
                             state=None) -> IssuesLabelContentTripleType:
    """
    Scans the "area" of input worksheet "sh" where it is assumed a "data input" command
    is present.

    It obtains a list of observations, a list of processors, a list of observables, a list of tags
    All those are represented in JSON format

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param processors_type: Name for the type of processors. Also label of the command
    :param state: Transient state useful for checking existence of variables
    :return: DataInputCommand, list of issues (issue_type, message)
    """
    some_error = False
    issues = []
    # Define a set of observations (qualified quantities) of observables
    # This set can be replicated. So, ?how to refer to each replica?
    # Regular expression, internal name, Mandatory (True|False)
    known_columns = [
        (r"Name|Processor[_ ]name", "processor", False),
        (r"Level", "level", False),
        (r"Parent", "parent", False),
        (r"FF[_ ]type", "ff_type", True),
        (r"Var|Variable", "factor", True),
        (r"Value|NUSAP\.N", "value",
         False),  # If value is not specified, then just declare the Factor
        (r"Unit|NUSAP\.U", "unit",
         True),  # If blank, a dimensionless amount is assumed
        (r"Relative[_ ]to", "relative_to", False),
        (r"Uncertainty|Spread|NUSAP\.S", "uncertainty", False),
        (r"Assessment|NUSAP\.A", "assessment", False),
        (r"Pedigree[_ ]matrix|NUSAP\.PM", "pedigree_matrix", False),
        (r"Pedigree|NUSAP\.P", "pedigree", False),
        (r"Time|Date", "time", False),
        (r"Geo|Geolocation", "geolocation", False),
        (r"Source", "source", False),
        (r"Comment|Comments", "comments", False)
    ]

    label = "Processors " + processors_type

    # First, examine columns, to know which fields are being specified
    # Special cases:
    #   Open columns: the field is specified in the cell together with the value. Like "attr1=whatever", instead of a header "attr1" and in a row below, a value "whatever"
    #   Complex values: the value has syntactic rules. Like expressions for both quantities AND qualities (like NUSAP)
    #   References: the field refers to additional information in another worksheet. Unique names or ref holder (worksheet name) plus ref inside the worksheet, would be allowed. Also ref type can disambiguate
    mandatory = {t[1]: t[2] for t in known_columns}
    cre = {
    }  # Column Regular Expression dictionary (K: regular expression; V: RegularExpression object)
    if not case_sensitive:
        flags = re.IGNORECASE
    else:
        flags = 0
    for kc in known_columns:
        cre[kc[0]] = re.compile(kc[0], flags=flags)
    col_names = {}
    standard_cols = {
    }  # Internal (standardized) column name to column index in the worksheet (freedom in the order of columns)
    attribute_cols = create_dictionary(
    )  # Not recognized columns are considered freely named categories, attributes or tags
    attributes = [
    ]  # List of attributes or tags (keys of the previous dictionary)
    col_allows_dataset = create_dictionary(
    )  # If the column allows the reference to a dataset dimension
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=area[0], column=c).value
        if not col_name:
            continue

        col_name = col_name.replace("\n", " ")
        col_names[c] = col_name

        # Match
        found = False
        for kc in known_columns:
            res = cre[kc[0]].search(col_name)
            if res:
                if kc[1] in standard_cols:
                    issues.append(
                        (2, "Cannot repeat column name '" + col_name + "' (" +
                         kc[0] + ") in data input command '" +
                         processors_type + "'"))
                else:
                    standard_cols[kc[1]] = c
                    col_names[c] = kc[
                        1]  # Override column name with pseudo column name for standard columns
                    if col_names[c].lower() in [
                            "factor", "value", "time", "geolocation"
                    ]:
                        col_allows_dataset[col_names[c]] = True
                    else:
                        col_allows_dataset[col_names[c]] = False
                    found = True
                break
        if not found:
            if col_name not in attribute_cols:
                # TODO Check valid col_names. It must be a valid Variable Name
                attribute_cols[col_name] = c
                attributes.append(col_name)
                col_allows_dataset[col_name] = True
            else:
                issues.append(
                    (2, "Cannot repeat column name '" + col_name +
                     "' in data input command '" + processors_type + "'"))

    del cre

    # Check if there are mandatory columns missing

    # TODO There could be combinations of columns which change the character of mandatory of some columns
    # TODO For instance, if we are only specifying structure, Value would not be needed
    print("BORRAME - " + str(known_columns))
    print("BORRAME 2 - " + str(standard_cols))
    for kc in known_columns:
        # "kc[2]" is the flag indicating if the column is mandatory or not
        # col_map contains standard column names present in the worksheet
        if kc[2] and kc[1] not in standard_cols:
            some_error = True
            issues.append((3, "Column name '" + kc[0] +
                           "' must be specified in data input command '" +
                           processors_type + "'"))

    # If there are errors, do not continue
    if some_error:
        return issues, label, None

    processor_attribute_exclusions = create_dictionary()
    processor_attribute_exclusions[
        "scale"] = None  # Exclude these attributes when characterizing the processor
    processor_attributes = [
        t for t in attributes if t not in processor_attribute_exclusions
    ]

    # SCAN rows
    lst_observations = [
    ]  # List of ALL observations. -- Main outcome of the parse operation --

    set_pedigree_matrices = create_dictionary()  # List of pedigree templates
    set_processors = create_dictionary()  # List of processor names
    set_factors = create_dictionary()  # List of factors
    set_taxa = create_dictionary(
    )  # Dictionary of taxa with their lists of values. Useful to return CODE LISTS
    set_referenced_datasets = create_dictionary(
    )  # Dictionary of datasets to be embedded into the result (it is a job of the execution part)
    processors_taxa = create_dictionary(
    )  # Correspondence "processor" -> taxa (to avoid changes in this correspondence)

    dataset_column_rule = parser_field_parsers.dataset_with_column
    values = [None] * area[3]
    # LOOP OVER EACH ROW
    for r in range(area[0] + 1, area[1]):  # Scan rows (observations)
        # Each row can specify: the processor, the factor, the quantity and qualities about the factor in the processor
        #                       It can also specify a "flow+containment hierarchy" relation

        row = {}  # Store parsed values of the row

        taxa = create_dictionary()  # Store attributes or taxa of the row

        referenced_dataset = None  # Once defined in a row, it cannot change!!
        # Scan the row first, looking for the dataset. The specification is allowed in certain columns:
        # attribute_cols and some standard_cols
        already_processed = create_dictionary()
        for c in range(area[2], area[3]):
            if c in col_names:
                value = sh.cell(row=r, column=c).value
                if isinstance(value, str) and value.startswith("#"):
                    col_name = col_names[c]
                    if col_allows_dataset[col_name]:
                        if not referenced_dataset:
                            try:
                                ast = parser_field_parsers.string_to_ast(
                                    dataset_column_rule, value[1:])
                                if len(ast["parts"]) == 2:
                                    referenced_dataset = ast["parts"][0]
                                    # Remove the dataset variable. It will be stored in "_referenced_dataset"
                                    value = "#" + ast["parts"][1]
                                else:
                                    some_error = True
                                    issues.append((
                                        3,
                                        "The first dataset reference of the row must contain the "
                                        "dataset variable name and the dimension name, row "
                                        + str(r)))

                                # Mark as processed
                                already_processed[col_name] = None
                            except:
                                some_error = True
                                issues.append(
                                    (3, "Column '" + col_name +
                                     "' has an invalid dataset reference '" +
                                     value + "', in row " + str(r)))
                        else:
                            try:
                                ast = parser_field_parsers.string_to_ast(
                                    simple_ident, value[1:])
                                # Mark as processed
                                already_processed[col_name] = None
                            except:
                                some_error = True
                                issues.append(
                                    (3, "Column '" + col_name +
                                     "' has an invalid dataset reference '" +
                                     value + "', in row " + str(r)))
                        if col_name in standard_cols:
                            row[col_name] = value
                        else:
                            taxa[col_name] = value

                values[c] = value

        # TODO If the flow type is decomposed, compose it first
        for c in standard_cols:
            if c in already_processed:
                continue

            value = values[standard_cols[c]]

            # != "" or not
            if value is None or (value is not None and value == ""):
                if c == "unit":
                    value = "-"
                if not value:
                    if mandatory[c]:
                        some_error = True
                        issues.append(
                            (3,
                             "Column '" + c + "' is mandatory, row " + str(r)))
                    continue  # Skip the rest of the iteration!

            # Parse the value
            if c in ["processor", "factor"]:
                # Check that it is a variable name, and allow hierarchical names
                parser_field_parsers.string_to_ast(
                    parser_field_parsers.simple_h_name, value)
            elif c == "pedigree_matrix":
                parser_field_parsers.string_to_ast(
                    parser_field_parsers.simple_ident, value)
            elif c == "relative_to":
                # Two elements, the first a hierarchical name, the second a unit name
                s = value.split(" ")
                if len(s) != 2:
                    some_error = True
                    issues.append((
                        3,
                        "The Relative To value has to have two parts, factor name and unit, separated by a whitespace (specified '"
                        + value + "'), in row " + str(r)))
                else:
                    try:
                        parser_field_parsers.string_to_ast(
                            parser_field_parsers.simple_h_name, s[0])
                    except:
                        some_error = True
                        issues.append((
                            3,
                            "The name specified for the relative to factor '" +
                            s[0] + "' is not valid, in row " + str(r)))

                    # It must be a recognized unit. Check with Pint
                    try:
                        ureg(s[1])
                        ureg.parse_unit_name(s[1], case_sensitive)
                    except UndefinedUnitError:
                        some_error = True
                        issues.append((
                            3, "The unit name '" + s[1] +
                            "' is not registered in the units processing package, in row "
                            + str(r)))
            elif c == "level":
                # A valid level name
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.level_name, value)
                except:
                    some_error = True
                    issues.append((3, "The level '" + value +
                                   "' syntax is not valid, in row " + str(r)))

            elif c == "parent":
                # Check that value is a valid parent name. It can be either a list of tags OR
                # a processor name, something defining a single processor
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.simple_h_name, value)
                except:
                    try:
                        parser_field_parsers.string_to_ast(
                            parser_field_parsers.named_parameters_list, value)
                    except:
                        some_error = True
                        issues.append((3, "Could not parse '" + value +
                                       "' as 'parent' in row " + str(r)))
            elif c == "ff_type":
                # The type of flow/fund must be one of a set of possible values. DEFINE THE LIST
                if value.lower() not in allowed_ff_types:
                    some_error = True
                    issues.append(
                        (3, "ff_type must be one of :" +
                         ', '.join(allowed_ff_types) + ", in row " + str(r)))
            elif c == "value":
                if not isinstance(value, str):
                    value = str(value)
                # Expression allowed. Check syntax only. It can refer to parameters.
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.expression, value)
                # TODO Check existence of used variables
                # TODO basic_elements_parser.ast_evaluator(ast, state, None, issues, "static")
            elif c == "unit":
                # It must be a recognized unit. Check with Pint
                try:
                    value = value.replace("€", "Euro").replace("$", "Dollar")
                    if value == "-":
                        value = ""  # Dimensionless
                    ureg(value)
                    ureg.parse_unit_name(value, case_sensitive)
                except:
                    some_error = True
                    issues.append((
                        3, "The unit name '" + value +
                        "' is not registered in the units processing package, in row "
                        + str(r)))
            elif c == "uncertainty":
                # TODO It must be a valid uncertainty specifier
                pass
            elif c == "assessment":
                # See page 135 of Funtowicz S., Ravetz J., "Uncertainty and Quality in Science for Policy"
                # "c" is "cognitive" assessment, "p" is pragmatic assessment.
                allowed = [
                    "nil", "low", "medium", "high", "total", "nil_c", "low_c",
                    "medium_c", "high_c", "total_c", "nil_p", "low_p",
                    "medium_p", "high_p", "total_p"
                ]
                if value and value.lower() not in allowed:
                    issues.append((3, "Assessment must be empty or one of: " +
                                   ", ".join(allowed)))
            elif c == "pedigree":
                # A valid pedigree specification is just an integer
                try:
                    int(value)
                except:
                    issues.append((3, "The pedigree specification '" + value +
                                   "' must be an integer"))
            elif c == "time":
                # A valid time specification. Possibilities: Year, Month-Year / Year-Month, Time span (two dates)
                if not isinstance(value, str):
                    value = str(value)
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.time_expression, value)
            elif c == "geolocation":
                # A reference to a geolocation
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.reference, value)
                except:
                    some_error = True
                    issues.append((3, "The geolocation must be a reference"))
            elif c == "source":
                # Who or what provided the information. It can be formal or informal. Formal can be references (but evaluated later)
                pass
            elif c == "comments":
                # Free text
                pass

            # Store the parsed value
            row[c] = value

        for c in attribute_cols:
            if c in already_processed:
                continue

            value = values[attribute_cols[c]]

            # != "" or not
            if not value:
                taxa[c] = None
                continue  # Skip the rest of the iteration!

            # TODO Check value. Valid identifier, no whitespace
            # Validate "value", it has to be a simple ID
            try:
                if not isinstance(value, str):
                    value = str(value)
                parser_field_parsers.simple_ident.parseString(value,
                                                              parseAll=True)
            except:
                value = None
                some_error = True
                issues.append((
                    3, "The value in column '" + c +
                    "' has to be a simple identifier: start with letter, then letters, numbers and '_', no whitespace, in row "
                    + str(r)))

            taxa[c] = value

            # Disable the registration of taxa. If a Dataset reference is used, there is no way to register
            # taxa at parse time (the dataset is still not obtained). Leave it for the execution
            if c not in set_taxa:
                set_taxa[c] = create_dictionary()
            if value is not None:
                set_taxa[c][value] = None

        # Now that individual columns have been parsed, do other things

        if referenced_dataset:
            row["_referenced_dataset"] = referenced_dataset

        # If "processor" not specified, concatenate taxa columns in order to generate an automatic name
        # (excluding the processor type)
        p_taxa = taxa.copy()
        for k in processor_attribute_exclusions:
            if k in p_taxa: del p_taxa[k]

        if "processor" not in row:
            row["processor"] = "_".join(
                [str(taxa[t]) for t in processor_attributes]
            )  # TODO Which order? (the current is "order of appearance"; maybe "alphabetical order" would be better option)
        # Add as "taxa" the processor type (which is an optional input parameter to this function)
        if processors_type:
            taxa["_processors_type"] = processors_type
        # Store taxa (attributes and taxa)
        row["taxa"] = taxa
        # Store taxa if the processor still does not have it
        if row["processor"] not in processors_taxa:
            processors_taxa[row[
                "processor"]] = p_taxa  # "::".join([taxa[t] for t in lst_taxa_cols])
        else:
            # Taxa should be the same for each "processor". Error if different
            t = processors_taxa[row["processor"]]
            if t != p_taxa:
                issues.append(
                    (3, "The processor '" + row["processor"] +
                     "' has different taxa assigned, in row " + str(r)))

        # Register new processor names, pedigree matrices, factors and referenced datasets
        if "processor" in row:
            set_processors[row["processor"]] = None
        if "pedigree_matrix" in row:
            set_pedigree_matrices[row["pedigree_matrix"]] = None
        if "factor" in row:
            set_factors[row["factor"]] = None
        if referenced_dataset:
            set_referenced_datasets[referenced_dataset] = None

        lst_observations.append(row)

    content = {
        "factor_observations": lst_observations,
        "processor_attributes": processor_attributes,
        "processors": [k for k in set_processors],
        "pedigree_matrices": [k for k in set_pedigree_matrices],
        "factors": [k for k in set_factors],
        "referenced_datasets": [ds for ds in set_referenced_datasets],
        "code_lists": {k: [k2 for k2 in set_taxa[k]]
                       for k in set_taxa}
    }
    return issues, label, content
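
# Illustrative sketch (not taken from the source): an approximation of the "content" structure
# returned above for a hypothetical single-row worksheet. Names such as "Farming", "Cereal" or
# "LandUse" are invented for the example; the real keys depend on the worksheet being parsed.
example_content = {
    "factor_observations": [
        {"processor": "Farming_Cereal", "factor": "LandUse", "value": "1200",
         "taxa": {"Sector": "Farming", "Subsector": "Cereal", "_processors_type": "Local"}}
    ],
    "processor_attributes": ["Sector", "Subsector"],
    "processors": ["Farming_Cereal"],
    "pedigree_matrices": [],
    "factors": ["LandUse"],
    "referenced_datasets": [],
    "code_lists": {"Sector": ["Farming"], "Subsector": ["Cereal"]}
}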
Example #28
0
def get_interfaces(glb_idx: PartialRetrievalDictionary) -> pd.DataFrame:
    # "params" is used when examining "value" expressions, to distinguish variables that are parameter names from interface names
    params = create_dictionary(
        data={p.name: None
              for p in glb_idx.get(Parameter.partial_key())})
    s = State()
    procs = glb_idx.get(Processor.partial_key())
    d = {}
    for p in procs:
        parent_relations = glb_idx.get(
            ProcessorsRelationPartOfObservation.partial_key(child=p))
        d[p.ident] = set(rel.parent_processor.ident for rel in parent_relations)

    lst = [[
        "Processor", "InterfaceType", "Interface", "Sphere", "RoegenType",
        "Orientation", "OppositeSubsystemType", "GeolocationRef",
        "GeolocationCode", "InterfaceAttributes", "Value", "Unit",
        "RelativeTo", "Uncertainty", "Assessment", "PedigreeMatrix",
        "Pedigree", "Time", "Source", "NumberAttributes", "Comments"
    ]]
    # Elaborate a DAG of processors (child -> set of parents), then iterate over it in topological order
    for ident in list(toposort.toposort_flatten(d)):
        p = glb_idx.get(Processor.partial_key(ident=ident))[0]
        ifaces = glb_idx.get((Factor.partial_key(processor=p)))
        iface_names = create_dictionary(
            data={iface.name: iface
                  for iface in ifaces})
        # Elaborate a DAG of Interfaces induced by Observations (kept separate from the processors DAG "d")
        iface_deps = {}
        for iface in ifaces:
            if iface.ident not in iface_deps:
                iface_deps[iface.ident] = set()
            for obs in iface.quantitative_observations:
                if obs.relative_factor:
                    iface_deps[iface.ident].add(obs.relative_factor.ident)
                # Consider "obs.value": expressions may reference other Interfaces, which become dependencies
                if isinstance(obs.value, str):
                    ast = string_to_ast(expression_with_parameters, obs.value)
                    evaluation_issues = []
                    value, unresolved_vars = ast_evaluator(
                        exp=ast,
                        state=s,
                        obj=None,
                        issue_lst=evaluation_issues)
                    for unresolved in unresolved_vars:
                        if unresolved not in params:
                            iface_deps[iface.ident].add(iface_names[unresolved].ident)

        for ident2 in list(toposort.toposort_flatten(iface_deps)):
            iface = glb_idx.get(Factor.partial_key(ident=ident2))[0]
            lst1 = [
                iface.processor.name, iface.taxon.name, iface.name,
                iface.sphere, iface.roegen_type.name, iface.orientation,
                iface.opposite_processor_type, "", "", ""
            ]
            observations = iface.quantitative_observations
            if len(observations) > 0:
                for obs in observations:
                    lst2 = [
                        obs.value,
                        obs.attributes.get("unit", ""),
                        obs.relative_factor.name if obs.relative_factor else "",
                        obs.attributes.get("spread", ""),
                        obs.attributes.get("assessment", ""),
                        obs.attributes.get("pedigree_template", ""),
                        obs.attributes.get("pedigree", ""),
                        obs.attributes.get("time", ""),
                        obs.observer.name if obs.observer else "",
                        "",
                        obs.attributes.get("comments", "")
                    ]
                    lst.append(lst1 + lst2)
            else:
                lst.append(lst1 + ["", "", "", "", "", "", "", "", "", ""])

    return list_to_dataframe(lst)
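
# Minimal, self-contained sketch of the dependency ordering used above: the "toposort" package
# flattens a dict mapping each node to the set of nodes it depends on, so referenced interfaces
# (or parent processors) are visited before the nodes that depend on them. The identifiers below
# are invented for illustration only.
import toposort

deps = {
    "water_out": {"water_in"},   # "water_out" is expressed relative to "water_in"
    "water_in": set(),           # no dependencies
    "energy_in": set()
}
print(list(toposort.toposort_flatten(deps)))  # e.g. ['energy_in', 'water_in', 'water_out']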
Example #29
0
    def get_dataset_structure(self, database, dataset) -> Dataset:
        """ Obtain the structure of a dataset: concepts, dimensions, attributes and measures """
        refs = dict(references='all')
        dsd_response = estat.datastructure("DSD_" + dataset, params=refs)
        dsd = dsd_response.datastructure["DSD_" + dataset]
        metadata = dsd_response.write()
        # SDMXConcept = collections.namedtuple('Concept', 'type name istime description code_list')
        # DataSource <- Database <- DATASET <- Dimension(s) (including Measures) <- CodeList
        #                                      |
        #                                      v
        #                                      Concept <- CodeList  (NOT CONSIDERED NOW)
        ds = Dataset()
        ds.code = dataset
        ds.description = None  # How to get description?
        ds.attributes = {}  # Dataset level attributes? (encode them using a dictionary)
        ds.metadata = None  # Metadata for the dataset SDMX (flow, date of production, etc.)
        ds.database = database  # Reference to containing database

        dims = {}

        for d in dsd.dimensions:
            istime = str(
                dsd.dimensions.get(d)).split("|")[0].strip() == "TimeDimension"
            dd = Dimension()
            dd.code = d
            dd.description = None
            dd.attributes = None
            dd.is_time = istime
            dd.is_measure = False
            dd.dataset = ds
            dims[d] = dd
        for m in dsd.measures:
            dd = Dimension()
            dd.code = m
            dd.description = None
            dd.attributes = None
            dd.is_time = False
            dd.is_measure = True
            dd.dataset = ds
            dims[m] = dd
        for a in dsd.attributes:
            ds.attributes[a] = None  # TODO Get the value
        for l in metadata.codelist.index.levels[0]:
            # Read the code list for "l", skipping its first entry (it describes the code list itself, not a code)
            first = True
            cl = create_dictionary()
            for m, v in list(
                    zip(metadata.codelist.loc[l].index,
                        metadata.codelist.loc[l]["name"])):
                if not first:
                    cl[m] = v.replace("\n", ";")
                else:
                    first = False
            # Attach it to the Dimension or Measure
            if metadata.codelist.loc[l]["dim_or_attr"][0] == "D":
                # Build Code List from dictionary
                dims[l].code_list = CodeList.construct(
                    l, None, [""],
                    [CodeImmutable(k, cl[k], "", []) for k in cl])

        return ds
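
# Hypothetical usage sketch (the object and dataset names are assumed, not taken from the source):
#
#   ds = source_manager.get_dataset_structure("Eurostat", "nama_10_gdp")
#   print(ds.code)                      # "nama_10_gdp"
#   print(list(ds.attributes.keys()))   # attribute identifiers declared by the DSD
#
# The returned Dataset carries the dataset-level attributes; each Dimension built above keeps a
# back-reference to it (dd.dataset = ds) and, when a code list is available, its CodeList.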
def parse_etl_external_dataset_command(sh: Worksheet, area: AreaTupleType,
                                       dataset_name: str,
                                       state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param dataset_name: Name of the dataset to be analysed
    :param state: Evaluation state, containing the case study registry objects
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)
    # Dataset source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)

    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)

    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = [k.lower()
                     for k in dims[d].code_list.keys()]  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True
    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            cl[mappings[m].destination] = set(tmp)

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    measures = []
    out_dims = []
    agg_funcs = []
    measures_as = []
    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue

        if col_name.lower().strip() in [
                "dimensions_kept", "dims", "dimensions"
        ]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for d in lst:
                if not d:
                    continue
                if d not in cl:
                    issues.append((
                        3, "The dimension specified for output, '" + d +
                        "' is neither a dataset dimension nor a mapped dimension. ["
                        + ', '.join([d2 for d2 in cl]) + "]"))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in [
                "aggregation_function", "aggfunc", "agg_func"
        ]:  # "SELECT AGGREGATORS"
            lst = obtain_column(c, area[0] + 1, area[1])
            for f in lst:
                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append((
                        3, "The specified aggregation function, '" + f +
                        "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'"
                    ))
                else:
                    agg_funcs.append(f)
        elif col_name.lower().strip() in ["measures"]:  # "SELECT"
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for m in lst:
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        (3, "The specified measure, '" + m +
                         "' is not a measure available in the dataset. [" +
                         ', '.join([m2 for m2 in meas]) + "]"))
                else:
                    measures.append(m)
        elif col_name.lower().strip() in ["measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for m in lst:
                measures_as.append(m)
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for cd in lst:
                if not cd:
                    continue
                if str(cd).lower() not in cl[col_name]:
                    issues.append((
                        3, "The code '" + cd +
                        "' is not present in the codes declared for dimension '"
                        + col_name + "'. Please, check them."))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "endperiod"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[0]  # Not a list in this case, but a single number or string
        elif col_name.lower() in ["result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except:
                    issues.append((3, "Column '" + col_name +
                                   "' has an invalid dataset name '" +
                                   result_name + "'"))

    if len(measures) == 0:
        issues.append((3, "At least one measure should be specified"))

    if len(agg_funcs) == 0:
        issues.append(
            (2, "No aggregation function specified. Assuming 'average'"))
        agg_funcs.append("average")

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            (2, "No result name specified. Assuming '" + result_name + "'"))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": None,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
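
# Illustrative sketch (all values invented): the "content" returned above for a hypothetical
# worksheet that keeps the "geo" and "time" dimensions, sums the "OBS_VALUE" measure and filters
# two country codes. It mirrors the SQL analogy used in the column scan: "group_by" corresponds
# to GROUP BY, "measures"/"agg_funcs" to the SELECT aggregators and "where" to the WHERE clause.
example_etl_content = {
    "dataset_source": "Eurostat",
    "dataset_name": "nama_10_gdp",
    "dataset_datetime": None,
    "where": {"geo": ["ES", "PT"], "StartPeriod": "2010", "EndPeriod": "2015"},
    "dimensions": ["geo", "time", "unit", "na_item"],
    "group_by": ["geo", "time"],
    "measures": ["OBS_VALUE"],
    "agg_funcs": ["sum"],
    "measures_as": ["GDP"],
    "result_name": "Eurostat_nama_10_gdp"
}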