Example #1
    def execute(self, state: "State"):
        """
        Process each of the references, simply storing them as Reference objects
        """
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]
        issues = []

        # Receive a list of validated references
        # Store them as objects, which can be referred to later
        for ref in self._content["items"]:
            r = ref["_row"]

            if "ref_id" not in ref:
                issues.append(
                    Issue(itype=3,
                          description="'ref_id' field not found: " + str(ref),
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                continue
            else:
                ref_id = ref["ref_id"]
                existing = glb_idx.get(self.ref_type.partial_key(ref_id))
                if len(existing) == 1:
                    issues.append(
                        Issue(itype=3,
                              description="Reference '" + ref_id +
                              "' of type '" + str(self.ref_type) +
                              "' is already defined. Not allowed",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                elif len(existing) > 1:  # This condition should not occur...
                    issues.append(
                        Issue(itype=3,
                              description="The reference '" + ref_id +
                              "' of type '" + str(self.ref_type) +
                              "' is defined more than one time (" +
                              str(len(existing)) + ")",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue

                # Create and store the Reference
                reference = self.ref_type(ref_id, ref)
                glb_idx.put(reference.key(), reference)

                # BibliographicReference and ProvenanceReference are also Observers
                if isinstance(reference, Observer):
                    glb_idx.put(Observer.key(reference), reference)

        return issues, None
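A minimal, self-contained sketch of the duplicate-check pattern used above, with a plain dict standing in for glb_idx and its partial keys:

def register_reference(registry: dict, ref_id: str, ref: dict, issues: list):
    # Exactly one existing entry means the id is already taken
    if registry.get(ref_id) is not None:
        issues.append("Reference '" + ref_id + "' is already defined. Not allowed")
        return
    registry[ref_id] = ref

registry, issues = {}, []
register_reference(registry, "Smith2019", {"title": "..."}, issues)
register_reference(registry, "Smith2019", {"title": "..."}, issues)
print(issues)  # ["Reference 'Smith2019' is already defined. Not allowed"]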
Example #2
        def process_line(item):
            # Read variables
            mh_src_dataset = item.get("source_dataset", None)
            mh_src_hierarchy = item.get("source_hierarchy", None)
            mh_src_code = item.get("source_code", None)
            mh_dst_hierarchy = item.get("destination_hierarchy", None)
            mh_dst_code = item.get("destination_code", None)
            mh_weight = item.get("weight", None)

            # Mapping name: "[dataset.]source_hierarchy -> destination_hierarchy"
            map_name = ((mh_src_dataset + ".") if mh_src_dataset else
                        "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy

            if map_name in mappings:
                issues.append(
                    Issue(itype=3,
                          description="The mapping '" + map_name +
                          "' has been declared previously. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if map_name in local_mappings:
                d = local_mappings[map_name]
            else:
                d = DottedDict()
                local_mappings[map_name] = d
                d.name = map_name
                d.origin_dataset = mh_src_dataset
                d.origin_hierarchy = mh_src_hierarchy
                d.destination_hierarchy = mh_dst_hierarchy
                d.mapping = create_dictionary()

            # Specific code
            if mh_src_code in d.mapping:
                to_dict = d.mapping[mh_src_code]
            else:
                to_dict = create_dictionary()
            if mh_dst_code in to_dict:
                issues.append(
                    Issue(itype=3,
                          description="The mapping of '" + mh_src_code +
                          "' into '" + mh_dst_code + "' has been done already",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return
            else:
                to_dict[mh_dst_code] = mh_weight  # NOTE: This could be an object instead of just a FLOAT or expression
                d.mapping[mh_src_code] = to_dict
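For reference, a hedged sketch (plain dicts standing in for DottedDict and create_dictionary, codes invented) of the structure process_line accumulates in local_mappings:

local_mappings = {
    "Orig -> Dest": {
        "name": "Orig -> Dest",
        "origin_dataset": None,          # set when the origin is a dataset dimension
        "origin_hierarchy": "Orig",
        "destination_hierarchy": "Dest",
        "mapping": {
            "A": {"X": 0.7, "Y": 0.3},   # origin code "A" splits into two destination codes
            "B": {"X": 1.0},
        },
    }
}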
Example #3
 def add_issue(itype: int, description: str):
     issues.append(
         Issue(itype=itype,
               description=description,
               location=IssueLocation(sheet_name=name,
                                      row=i,
                                      column=None)))
     return
Example #4
 def _add_issue(self, itype: int, description: str):
     self._issues.append(
         Issue(itype=itype,
               description=description,
               location=IssueLocation(sheet_name=self._command_name,
                                      row=self._current_row_number,
                                      column=None)))
     return
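Both helpers above exist to collapse the repeated Issue(...) construction into one call. A self-contained sketch of the same pattern, with the issue reduced to a dict and the shared location bound once in a closure:

from typing import List

def make_add_issue(issues: List[dict], sheet_name: str):
    def add_issue(itype: int, description: str, row: int = None):
        issues.append(dict(itype=itype, description=description,
                           sheet_name=sheet_name, row=row))
    return add_issue

issues: List[dict] = []
add_issue = make_add_issue(issues, "InterfaceTypes")
add_issue(3, "Empty interface type name. Skipped.", row=7)
print(issues[0]["description"])  # Empty interface type name. Skipped.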
Example #5
def transform_issues(issues: List[Union[dict, backend.Issue, tuple, Issue]],
                     cmd, sheet_number: int) -> Tuple[List[Issue], bool]:

    errors_exist = False
    new_issues: List[Issue] = []

    for i in issues:
        if isinstance(i, dict):
            issue = Issue(itype=i["type"],
                          description=i["message"],
                          ctype=i["c_type"],
                          location=IssueLocation(
                              sheet_name=i["sheet_name"],
                              sheet_number=i["sheet_number"]))
        elif isinstance(i, backend.Issue):  # namedtuple
            issue = Issue(itype=i.type,
                          description=i.message,
                          ctype=i.c_type,
                          location=IssueLocation(sheet_name=i.sheet_name,
                                                 sheet_number=i.sheet_number))
        elif isinstance(i, tuple):
            issue = Issue(itype=i[0],
                          description=i[1],
                          location=IssueLocation(sheet_name=""))
        else:  # isinstance(i, Issue):
            issue = i

        if issue.itype == IType.error():
            errors_exist = True

        if not issue.ctype and cmd:  # "cmd" may be "None", in case the Issue is produced by the commands container loop
            issue.ctype = cmd._serialization_type

        if not issue.location.sheet_name or issue.location.sheet_name == "":
            issue.location.sheet_name = cmd._source_block_name if hasattr(
                cmd, "_source_block_name") else ""

        if not issue.location.sheet_number:
            issue.location.sheet_number = sheet_number

        new_issues.append(issue)

    return new_issues, errors_exist
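A hedged usage sketch of the contract (Issue and the command object are not defined here, so the call is shown commented): whatever mix of representations goes in, a homogeneous List[Issue] plus an errors flag comes out.

from collections import namedtuple

BackendIssue = namedtuple("BackendIssue", "type message c_type sheet_name sheet_number")

mixed = [
    {"type": 3, "message": "bad cell", "c_type": None,
     "sheet_name": "Datasets", "sheet_number": 2},             # dict form
    BackendIssue(2, "assuming default", None, "Datasets", 2),  # namedtuple form
    (3, "parse error"),                                        # bare tuple form
]
# new_issues, errors_exist = transform_issues(mixed, cmd=None, sheet_number=2)
# -> all three entries become Issue objects; errors_exist is True (itype 3 present)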
Example #6
    def execute(self, state: "State"):
        issues = []

        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
        name = self._content["command_name"]

        # List of available dataset names. The datasets receiving data must already be in this list
        ds_names = [ds.code for ds in datasets.values()]

        # Process parsed information
        for r, line in enumerate(self._content["items"]):
            # A dataset
            dataset_name = line["name"]
            # Find it in the already available datasets. MUST EXIST
            for n in ds_names:
                if strcmp(dataset_name, n):
                    df = pd.read_json(StringIO(line["values"]), orient="split")
                    # Check columns
                    ds = datasets[n]
                    iss = prepare_dataframe_after_external_read(ds, df)
                    for issue in iss:
                        issues.append(
                            Issue(itype=3,
                                  description=issue,
                                  location=IssueLocation(sheet_name=name, row=-1, column=-1)))
                    # Everything ok? Store the dataframe!
                    if len(iss) == 0:
                        ds.data = df
                    break
            else:
                issues.append(
                    Issue(itype=3,
                          description="Metadata for the dataset '"+dataset_name+"' must be defined previously",
                          location=IssueLocation(sheet_name=name, row=-1, column=-1)))

        return issues, None
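The line["values"] field above is assumed to carry the dataset embedded as JSON in pandas' "split" orientation; a minimal runnable round-trip:

import pandas as pd
from io import StringIO

values = '{"columns": ["Country", "Value"], "index": [0, 1], "data": [["ES", 10], ["PT", 7]]}'
df = pd.read_json(StringIO(values), orient="split")
print(df.to_dict("list"))  # {'Country': ['ES', 'PT'], 'Value': [10, 7]}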
Example #7
    def execute(self, state: "State"):
        def process_line(item):
            # Read variables
            mh_src_dataset = item.get("source_dataset", None)
            mh_src_hierarchy = item.get("source_hierarchy", None)
            mh_src_code = item.get("source_code", None)
            mh_dst_hierarchy = item.get("destination_hierarchy", None)
            mh_dst_code = item.get("destination_code", None)
            mh_weight = item.get("weight", None)

            # Mapping name: "[dataset.]source_hierarchy -> destination_hierarchy"
            map_name = ((mh_src_dataset + ".") if mh_src_dataset else
                        "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy

            if map_name in mappings:
                issues.append(
                    Issue(itype=3,
                          description="The mapping '" + map_name +
                          "' has been declared previously. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if map_name in local_mappings:
                d = local_mappings[map_name]
            else:
                d = DottedDict()
                local_mappings[map_name] = d
                d.name = map_name
                d.origin_dataset = mh_src_dataset
                d.origin_hierarchy = mh_src_hierarchy
                d.destination_hierarchy = mh_dst_hierarchy
                d.mapping = create_dictionary()

            # Specific code
            if mh_src_code in d.mapping:
                to_dict = d.mapping[mh_src_code]
            else:
                to_dict = create_dictionary()
            if mh_dst_code in to_dict:
                issues.append(
                    Issue(itype=3,
                          description="The mapping of '" + mh_src_code +
                          "' into '" + mh_dst_code + "' has been done already",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return
            else:
                to_dict[mh_dst_code] = mh_weight  # NOTE: This could be an object instead of just a FLOAT or expression
                d.mapping[mh_src_code] = to_dict

        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        local_mappings = create_dictionary()

        # Process parsed information
        for line in self._content["items"]:
            r = line["_row"]
            # If the line contains a reference to a dataset or hierarchy, expand it
            # If not, process it directly
            is_expansion = False
            if is_expansion:
                # TODO Iterate through dataset and/or hierarchy elements, producing a list of new items
                pass
            else:
                process_line(line)

        # Mappings post-processing
        for d in local_mappings:
            # Convert the mapping into:
            # [{"o": "", "to": [{"d": "", "w": ""}]}]
            # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ]
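            # e.g. (codes invented) {"A": {"X": 0.7, "Y": 0.3}} becomes
            #      [{"o": "A", "to": [{"d": "X", "w": 0.7}, {"d": "Y", "w": 0.3}]}]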
            mapping = []
            for orig in local_mappings[d].mapping:
                lst = []
                for dst in local_mappings[d].mapping[orig]:
                    lst.append(
                        dict(d=dst, w=local_mappings[d].mapping[orig][dst]))
                mapping.append(dict(o=orig, to=lst))
            if local_mappings[d].origin_dataset:
                dims, attrs, meas = obtain_dataset_metadata(
                    local_mappings[d].origin_dataset)
                if local_mappings[d].origin_hierarchy not in dims:
                    issues.append(
                        Issue(itype=3,
                              description="The origin dimension '" +
                              local_mappings[d].origin_hierarchy +
                              "' does not exist in dataset '" +
                              local_mappings[d].origin_dataset + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                else:
                    dim = dims[local_mappings[d].origin_hierarchy]
                    mapping = fill_map_with_all_origin_categories(dim, mapping)
            #
            origin_dataset = local_mappings[d].origin_dataset
            origin_hierarchy = local_mappings[d].origin_hierarchy
            destination_hierarchy = local_mappings[d].destination_hierarchy
            # Create Mapping and add it to Case Study mappings variable
            mappings[d] = Mapping(d, obtain_dataset_source(origin_dataset),
                                  origin_dataset, origin_hierarchy,
                                  destination_hierarchy, mapping)

        # TODO
        # Use the function to perform many to many mappings, "augment_dataframe_with_mapped_columns"
        # Put it to work !!!

        # Could one or more mappings in sequence be specified? The key is "source hierarchy + destination hierarchy"
        # Read mapping parameters

        return issues, None
Example #8
        def parse_and_unfold_line(item):
            # Consider multiplicity because of:
            # - A dataset (only one). First a list of dataset concepts used in the line is obtained.
            #   Then the unique tuples formed by them are obtained.
            # - Processor name.
            #   - A set of processors (wildcard or filter by attributes)
            #   - A set of interfaces (according to another filter?)
            # - Multiple types of relation
            # - Both (first each dataset record applied -expanded-, then the name evaluation is applied)
            # - UNRESOLVED: expressions are resolved only partially; parts depending on
            #   parameters are kept as expressions, and only the parts depending on varying
            #   elements (dataset rows, hierarchy codes) are expanded
            # - The processor name could be a concatenation of multiple literals
            #
            # Look for multiple items in r_source_processor_name, r_source_interface_name,
            #                            r_target_processor_name, r_target_interface_name
            if item["_complex"]:
                asts = parse_line(item, fields)
                if item["_expandable"]:
                    # It is an expandable line
                    # Look for fields which are specified to be variable in order to originate the expansion
                    res = classify_variables(asts, datasets, hh, parameters)
                    ds_list = res["datasets"]
                    ds_concepts = res["ds_concepts"]
                    h_list = res["hierarchies"]
                    if len(ds_list) >= 1 and len(h_list) >= 1:
                        issues.append(
                            Issue(
                                itype=3,
                                description="Dataset(s): " +
                                ", ".join([d.name for d in ds_list]) +
                                ", and hierarchy(ies): " +
                                ", ".join([h.name for h in h_list]) +
                                ", have been specified. Only a single dataset is supported.",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                        return
                    elif len(ds_list) > 1:
                        issues.append(
                            Issue(
                                itype=3,
                                description=
                                "More than one dataset has been specified: " +
                                ", ".join([d.name for d in ds_list]) +
                                ", just one dataset is supported.",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                        return
                    elif len(h_list) > 0:
                        issues.append(
                            Issue(
                                itype=3,
                                description=
                                "One or more hierarchies have been specified: "
                                + ", ".join([h.name for h in h_list]),
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                        return
                    const_dict = obtain_dictionary_with_literal_fields(
                        item, asts)
                    if len(ds_list) == 1:
                        # If a measure is requested and not all dimensions are used, aggregate or
                        # issue an error (because it is not possible to reduce without aggregation).
                        # If only dimensions are used, then obtain all the unique tuples
                        ds = ds_list[0]
                        measure_requested = False
                        all_dimensions = set([
                            c.code for c in ds.dimensions if not c.is_measure
                        ])
                        for con in ds_concepts:
                            for c in ds.dimensions:
                                if strcmp(c.code, con):
                                    if c.is_measure:
                                        measure_requested = True
                                    else:  # Dimension
                                        all_dimensions.remove(c.code)
                        only_dimensions_requested = len(all_dimensions) == 0

                        if measure_requested and not only_dimensions_requested:
                            issues.append(
                                Issue(
                                    itype=3,
                                    description=
                                    "It is not possible to use a measure if not all dataset dimensions are used (cannot assume implicit aggregation)",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        elif not measure_requested and not only_dimensions_requested:
                            # TODO Reduce the dataset to the unique tuples (consider the current case -sensitive or not-sensitive-)
                            data = None
                        else:  # Take the dataset as-is!!!
                            data = ds.data

                        # Each row
                        for _, row in data.iterrows():  # iterrows() yields (index, Series) pairs
                            item2 = const_dict.copy()

                            d = {}
                            for c in ds_concepts:
                                d["{" + ds.code + "." + c + "}"] = str(row[c])
                            # Expand in all fields
                            for f in fields:
                                if f not in const_dict:
                                    # Replace all placeholders, longest first
                                    string = item[f]
                                    # TODO Could iterate through the variables in the field (not IN ALL FIELDS of the row)
                                    for var in sorted(d.keys(),
                                                      key=len,
                                                      reverse=True):
                                        string = string.replace(var, d[var])
                                    item2[f] = string

                            print("Multiple by dataset: " + str(item2))
                            yield item2
                    else:  # No dataset, no hierarchy of categories, but it could be still complex, because of wildcards
                        # For now return just the line
                        yield item
                        # wildcard_in_source = ".." in item.get("source_processor", "")
                        # wildcard_in_target = ".." in item.get("target_processor", "")
                        # if wildcard_in_source or wildcard_in_target:
                        #     r_source_processor_name = string_to_ast(processor_names, item.get("source_processor", None))
                        #     r_target_processor_name = string_to_ast(processor_names, item.get("target_processor", None))
                        #     if wildcard_in_source:
                        #         source_processor_names = obtain_matching_processors(r_source_processor_name, all_processors)
                        #     else:
                        #         source_processor_names = [item["source_processor"]]
                        #     if wildcard_in_target:
                        #         target_processor_names = obtain_matching_processors(r_target_processor_name, all_processors)
                        #     else:
                        #         target_processor_names = [item["target_processor"]]
                        #     for s in source_processor_names:
                        #         for t in target_processor_names:
                        #             item3 = const_dict.copy()
                        #             item3["source_processor"] = s
                        #             item3["target_processor"] = t
                        #             print("Multiple by wildcard: "+str(item3))
                        #             yield item3
                        # else:
                        #     # yield item
                        #     raise Exception("If 'complex' is signaled, it should not pass by this line")
            else:
                # print("Single: "+str(item))
                yield item
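A self-contained sketch of the dataset-driven expansion performed above (field names and the "{dataset.concept}" placeholder syntax follow the code; everything else is invented). Plain string replacement is deliberate, since the placeholders contain regex metacharacters:

def expand(item: dict, rows: list) -> list:
    # One output item per dataset row: every "{dataset.concept}" placeholder
    # in every field is substituted with that row's value
    out = []
    for row in rows:
        new_item = {}
        for field, value in item.items():
            for placeholder, replacement in sorted(row.items(), key=lambda kv: -len(kv[0])):
                value = value.replace(placeholder, str(replacement))
            new_item[field] = value
        out.append(new_item)
    return out

rows = [{"{ds.country}": "ES"}, {"{ds.country}": "PT"}]
print(expand({"source_processor": "Farm_{ds.country}"}, rows))
# [{'source_processor': 'Farm_ES'}, {'source_processor': 'Farm_PT'}]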
Example #9
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name,
                              state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param name: Name of the worksheet, used when reporting issues
    :param state: Parse-time state, used to obtain the registry objects
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition

    # Obtain the source
    source = obtain_dataset_source(dataset_name)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(data={
                k: None
                for k in dims[d].code_list.keys()
            })  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            tmp = create_dictionary(
                data={
                    to["d"]: None
                    for o in mappings[m].map for to in o["to"] if to["d"]
                })
            cl[mappings[m].destination] = tmp  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.

    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian

    out_dims = []

    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions",
                                        "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The dimension specified for output, '"
                            + d +
                            "' is neither a dataset dimension nor a mapped dimension. ["
                            + ', '.join([d2 for d2 in cl]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures",
                                          "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The specified measure, '" + m +
                            "' is not a measure available in the dataset. [" +
                            ', '.join([m2 for m2 in meas]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in [
                "resultmeasuresaggregation", "resultmeasuresaggregator",
                "aggregation"
        ]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue

                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The specified aggregation function, '"
                            + f +
                            "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in [
                "resultmeasurename", "resultmeasuresnames", "resultmeasuresas",
                "measuresas"
        ]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The code '" + cd +
                            "' is not present in the codes declared for dimension '"
                            + col_name + "'. Please, check them.",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "endperiod"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[0]  # In this case it is not a list, but a single number or string
        elif col_name.lower() in [
                "outputdatasetname", "outputdataset", "result_name",
                "result name", "resultname"
        ]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except Exception:
                    issues.append(
                        Issue(itype=3,
                              description="Column '" + col_name +
                              "' has an invalid dataset name '" + result_name +
                              "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=2,
                                                     column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(
            Issue(itype=2,
                  description=
                  "No aggregation function specified. Assuming 'average'",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(
                Issue(
                    itype=3,
                    description=
                    "The aggregation function must be defined in the first row",
                    location=IssueLocation(sheet_name=name,
                                           row=1,
                                           column=aggregations_column)))

    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Uniform rows, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if (measure and not agg_func) or (not measure and agg_func):
            issues.append(
                Issue(
                    itype=3,
                    description=
                    "Each measure must be associated with an aggregation function",
                    location=IssueLocation(sheet_name=name,
                                           row=r,
                                           column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [
        v["measure_as"] for v in out_measures.values() if v["measure_as"]
    ]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(
            Issue(itype=3,
                  description="At least one measure should be specified",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=measure_names_column)))

    # measures != agg_funcs && len(agg_funcs) == 1 --> OK
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(
            Issue(
                itype=3,
                description=
                "There must be one aggregation function (used for all measures) or one aggregation per measure",
                location=IssueLocation(sheet_name=name,
                                       row=1,
                                       column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            Issue(itype=2,
                  description="No result name specified. Assuming '" +
                  result_name + "'",
                  location=IssueLocation(sheet_name=name, row=2,
                                         column=c + 1)))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": available_at_datetime,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
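A hedged illustration, with all values invented, of the content object this parser returns for a sheet that selects one measure aggregated with "sum", grouped by one dimension and filtered to a time interval:

content = {
    "dataset_source": "Eurostat",
    "dataset_name": "aact_ali01",
    "dataset_datetime": None,
    "where": {"StartPeriod": "2015", "EndPeriod": "2016"},
    "dimensions": ["FREQ", "Country", "TIME_PERIOD"],
    "group_by": ["Country"],
    "measures": ["Value"],
    "agg_funcs": ["sum"],
    "measures_as": ["Value_sum"],
    "result_name": "Eurostat_aact_ali01",
}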
Example #10
def check_columns(sh,
                  name: str,
                  area: Tuple,
                  cols: List[CommandField],
                  command_name: str,
                  ignore_not_found=False):
    """
    When parsing of a command starts, check columns
    Try to match each column with declared column fields. If a column is not declared, raise an error (or ignore it)
    If mandatory columns are not found, raise an error

    :param sh: The worksheet being analyzed
    :param name: The name of the worksheet
    :param area: Area inside the worksheet that will be scanned
    :param cols: List of CommandField
    :param command_name: A string with the name of the command
    :param ignore_not_found: True if a column not matching declared ones has to be ignored, False if an error has to be raised in this case
    :return: The map column name to column index (or indices for multiply declared columns); The issues found
    """

    issues: List[Issue] = []

    # Set of mandatory columns
    mandatory_not_found = set([c.name for c in cols if c.mandatory])

    # Check columns
    col_map = {}  # From CommandField to a list of column index
    for c in range(area[2], area[3]):  # For each column of row 0 (Header Row)
        val = sh.cell(row=area[0], column=c).value
        if not val:
            continue
        col_name = val.strip()
        for col in cols:  # Find matching CommandField from the attribute "regex_allowed_names"
            if col.regex_allowed_names.match(col_name):
                # Found matching CommandField "col". Process
                if "@" in col_name:  # In case of use of "@", remove prefix
                    col_name = col_name[col_name.index("@") + 1:]
                # Column Name to Column Index
                if not col.many_appearances:  # Column appears once
                    if col in col_map:
                        issues.append(
                            Issue(itype=3,
                                  description="The column '" + col.name +
                                  "' should not appear more than one time",
                                  location=IssueLocation(sheet_name=name,
                                                         row=1,
                                                         column=c)))
                    col_map[col] = [(col_name, c)]
                else:  # Column appears one or more times
                    if col not in col_map:
                        col_map[col] = []
                    col_map[col].append((col_name, c))
                # Mandatory found (good)
                if col.name in mandatory_not_found:
                    mandatory_not_found.discard(col.name)
                break
        else:  # No match for the column "col_name"
            if not ignore_not_found:
                issues.append(
                    Issue(
                        itype=3,
                        description="The column name '" + col_name +
                        "' does not match any of the allowed column names for the command '"
                        + command_name + "'",
                        location=IssueLocation(sheet_name=name,
                                               row=1,
                                               column=c)))

    if len(mandatory_not_found) > 0:
        issues.append(
            Issue(itype=3,
                  description="Mandatory columns: " +
                  ", ".join(mandatory_not_found) + " have not been specified",
                  location=IssueLocation(sheet_name=name, row=1, column=None)))

    return col_map, issues
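A self-contained sketch of the header-matching rule described in the docstring, with CommandField reduced to the attributes used here: the first declared field whose regex matches the header claims the column, and the for-else reports unknown columns:

import re
from typing import NamedTuple, Optional

class Field(NamedTuple):
    name: str
    regex_allowed_names: re.Pattern
    many_appearances: bool = False

fields = [Field("Processor", re.compile(r"(?i)^processor$")),
          Field("Attributes", re.compile(r"(?i)^@.+"), many_appearances=True)]

def match_header(header: str) -> Optional[str]:
    for f in fields:
        if f.regex_allowed_names.match(header):
            return f.name
    return None  # corresponds to the for-else branch above (unknown column)

print(match_header("Processor"), match_header("@Comment"), match_header("Unknown"))
# Processor Attributes None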
Example #11
        def process_line(item):
            # Read variables
            dsd_dataset_name = item.get("dataset_name", None)
            dsd_dataset_data_location = item.get("dataset_data_location", None)
            dsd_concept_type = item.get("concept_type", None)
            dsd_concept_name = item.get("concept_name", None)
            dsd_concept_data_type = item.get("concept_data_type", None)
            dsd_concept_domain = item.get("concept_domain", None)
            dsd_concept_description = item.get("concept_description", None)
            dsd_attributes = item.get("concept_attributes", None)
            if dsd_attributes:
                try:
                    attributes = dictionary_from_key_value_list(
                        dsd_attributes, glb_idx)
                except Exception as e:
                    issues.append(
                        Issue(itype=3,
                              description=str(e),
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
            else:
                attributes = {}

            if dsd_dataset_name in ds_names:
                issues.append(
                    Issue(itype=3,
                          description="The dataset '" + dsd_dataset_name +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            # Internal dataset definitions cache
            ds = current_ds.get(dsd_dataset_name, None)
            if True:  # Statistical dataset format
                if not ds:
                    ds = Dataset()
                    ds.code = dsd_dataset_name  # Name
                    if not dsd_concept_type:
                        attributes[
                            "_location"] = dsd_dataset_data_location  # Location
                        ds.description = dsd_concept_description
                        ds.attributes = attributes  # Set attributes
                    ds.database = None
                    current_ds[dsd_dataset_name] = ds
                # If concept_type is defined => add a concept
                if dsd_concept_type:
                    d = Dimension()
                    d.dataset = ds
                    d.description = dsd_concept_description
                    d.code = dsd_concept_name
                    d.is_measure = dsd_concept_type.lower() != "dimension"
                    d.is_time = (not d.is_measure) and dsd_concept_data_type.lower() == "time"
                    attributes["_attribute"] = dsd_concept_type.lower() == "attribute"
                    if dsd_concept_data_type.lower() == "category":
                        # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or "hierarchy_categories_command.py", no insertion is made)
                        # h = hierarchies.get(dsd_concept_domain, None)
                        h = glb_idx.get(
                            Hierarchy.partial_key(name=dsd_concept_domain))
                        if len(h) == 0:
                            issues.append(
                                Issue(
                                    itype=3,
                                    description=
                                    "Could not find hierarchy of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        elif len(h) > 1:
                            issues.append(
                                Issue(
                                    itype=3,
                                    description=
                                    "Found more than one instance of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        else:  # len(h) == 1
                            h = h[0]
                        d.hierarchy = h
                        # Reencode the Hierarchy as a CodeList
                        cl = convert_hierarchy_to_code_list(h)
                        d.code_list = cl
                    attributes["_datatype"] = dsd_concept_data_type
                    attributes["_domain"] = dsd_concept_domain
                    d.attributes = attributes
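A hedged sketch, with row contents invented, of how successive parsed rows drive process_line above: the first row (no concept_type) creates the Dataset and sets its "_location", and each later row adds one concept:

rows = [
    {"dataset_name": "ds1", "dataset_data_location": "https://example.org/ds1.csv",
     "concept_description": "Toy dataset"},                    # creates the Dataset
    {"dataset_name": "ds1", "concept_type": "dimension",
     "concept_name": "Country", "concept_data_type": "category",
     "concept_domain": "Countries"},                           # adds a category Dimension
    {"dataset_name": "ds1", "concept_type": "measure",
     "concept_name": "Value", "concept_data_type": "number"},  # adds a measure
]
# Feeding each row to process_line() leaves current_ds["ds1"] holding two concepts.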
Example #12
    def execute(self, state: "State"):
        def process_line(item):
            # Read variables
            dsd_dataset_name = item.get("dataset_name", None)
            dsd_dataset_data_location = item.get("dataset_data_location", None)
            dsd_concept_type = item.get("concept_type", None)
            dsd_concept_name = item.get("concept_name", None)
            dsd_concept_data_type = item.get("concept_data_type", None)
            dsd_concept_domain = item.get("concept_domain", None)
            dsd_concept_description = item.get("concept_description", None)
            dsd_attributes = item.get("concept_attributes", None)
            if dsd_attributes:
                try:
                    attributes = dictionary_from_key_value_list(
                        dsd_attributes, glb_idx)
                except Exception as e:
                    issues.append(
                        Issue(itype=3,
                              description=str(e),
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
            else:
                attributes = {}

            if dsd_dataset_name in ds_names:
                issues.append(
                    Issue(itype=3,
                          description="The dataset '" + dsd_dataset_name +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            # Internal dataset definitions cache
            ds = current_ds.get(dsd_dataset_name, None)
            if True:  # Statistical dataset format
                if not ds:
                    ds = Dataset()
                    ds.code = dsd_dataset_name  # Name
                    if not dsd_concept_type:
                        attributes[
                            "_location"] = dsd_dataset_data_location  # Location
                        ds.description = dsd_concept_description
                        ds.attributes = attributes  # Set attributes
                    ds.database = None
                    current_ds[dsd_dataset_name] = ds
                # If concept_type is defined => add a concept
                if dsd_concept_type:
                    d = Dimension()
                    d.dataset = ds
                    d.description = dsd_concept_description
                    d.code = dsd_concept_name
                    d.is_measure = dsd_concept_type.lower() != "dimension"
                    d.is_time = (not d.is_measure) and dsd_concept_data_type.lower() == "time"
                    attributes["_attribute"] = dsd_concept_type.lower() == "attribute"
                    if dsd_concept_data_type.lower() == "category":
                        # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or "hierarchy_categories_command.py", no insertion is made)
                        # h = hierarchies.get(dsd_concept_domain, None)
                        h = glb_idx.get(
                            Hierarchy.partial_key(name=dsd_concept_domain))
                        if len(h) == 0:
                            issues.append(
                                Issue(
                                    itype=3,
                                    description=
                                    "Could not find hierarchy of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        elif len(h) > 1:
                            issues.append(
                                Issue(
                                    itype=3,
                                    description=
                                    "Found more than one instance of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        else:  # len(h) == 1
                            h = h[0]
                        d.hierarchy = h
                        # Reencode the Hierarchy as a CodeList
                        cl = convert_hierarchy_to_code_list(h)
                        d.code_list = cl
                    attributes["_datatype"] = dsd_concept_data_type
                    attributes["_domain"] = dsd_concept_domain
                    d.attributes = attributes

        # -------------------------------------------------------------------------------------------------------------
        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        # List of available dataset names. The newly defined datasets must not be in this list
        ds_names = [ds.code for ds in datasets.values()]

        # List of available Category hierarchies
        hierarchies = create_dictionary()
        for h in hh:
            hierarchies[h.name] = h

        # Datasets being defined in this Worksheet
        current_ds = create_dictionary()

        # Process parsed information
        for line in self._content["items"]:
            r = line["_row"]
            # If the line contains a reference to a dataset or hierarchy, expand it
            # If not, process it directly
            is_expansion = False
            if is_expansion:
                pass
            else:
                process_line(line)

        # Any error?
        for issue in issues:
            if issue.itype == 3:
                error = True
                break
        else:
            error = False

        # Load the data for those datasets that are not local (data defined later in the same spreadsheet)
        for ds in current_ds.values():
            if "_location" not in ds.attributes:
                error = True
                issues.append(
                    Issue(itype=3,
                          description=
                          "Location of data not specified  for dataset '" +
                          ds.code + "'",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
            else:
                loc = ds.attributes["_location"]
                ast = parser_field_parsers.string_to_ast(url_parser, loc)
                if ast["scheme"] != "data":
                    df = load_dataset(loc)
                    if df is None:
                        error = True
                        issues.append(
                            Issue(itype=3,
                                  description=
                                  "Could not obtain data for dataset '" +
                                  ds.code + "'",
                                  location=IssueLocation(sheet_name=name,
                                                         row=r,
                                                         column=None)))
                    else:
                        iss = prepare_dataframe_after_external_read(ds, df)
                        for issue in iss:
                            issues.append(
                                Issue(itype=3,
                                      description=issue,
                                      location=IssueLocation(sheet_name=name,
                                                             row=-1,
                                                             column=-1)))
                        # Everything ok? Store the dataframe!
                        if len(iss) == 0:
                            ds.data = df

        if not error:
            # If no error happened, add the new Datasets to the Datasets in the "global" state
            for ds in current_ds:
                datasets[ds] = current_ds[ds]

        return issues, None
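A small runnable sketch of the location rule above, with urlparse standing in for the project's url_parser: the "data" scheme marks datasets whose rows arrive later in the same workbook, anything else is loaded immediately:

from urllib.parse import urlparse

for loc in ("data://this_workbook/datasets", "https://example.org/ds1.csv"):
    action = "deferred (data comes later)" if urlparse(loc).scheme == "data" else "load_dataset() now"
    print(loc, "->", action)
# data://this_workbook/datasets -> deferred (data comes later)
# https://example.org/ds1.csv -> load_dataset() now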
Example #13
        def process_line(item):
            # Read variables
            ft_h_name = item.get(
                "interface_type_hierarchy",
                "_default")  # "_default" is the name of the default InterfaceType Hierarchy
            ft_name = item.get("interface_type", None)
            ft_sphere = item.get("sphere", None)
            ft_roegen_type = item.get("roegen_type", None)
            ft_parent = item.get("parent_interface_type", None)
            ft_formula = item.get("formula", None)
            ft_description = item.get("description", None)
            ft_unit = item.get("unit", None)
            # ft_orientation = item.get("orientation", None)
            ft_unit = item.get("unit", None)
            ft_attributes = item.get("attributes", {})
            if ft_attributes:
                try:
                    attributes = dictionary_from_key_value_list(
                        ft_attributes, glb_idx)
                except Exception as e:
                    issues.append(
                        Issue(itype=3,
                              description=str(e),
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
            else:
                attributes = {}

            # Process
            # Mandatory fields
            if not ft_h_name:
                issues.append(
                    Issue(itype=3,
                          description=
                          "Empty interface type hierarchy name. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if not ft_name:
                issues.append(
                    Issue(itype=3,
                          description="Empty interface type name. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            # Check if a hierarchy of interface types by the name <ft_h_name> exists, if not, create it and register it
            hie = glb_idx.get(Hierarchy.partial_key(name=ft_h_name))
            if not hie:
                hie = Hierarchy(name=ft_h_name, type_name="interfacetype")
                glb_idx.put(hie.key(), hie)
            else:
                hie = hie[0]

            # If parent defined, check if it exists
            # (it must be registered both in the global registry AND in the hierarchy)
            if ft_parent:
                parent = glb_idx.get(FactorType.partial_key(ft_parent))
                if len(parent) > 0:
                    for p in parent:
                        if p.hierarchy == hie:
                            parent = p
                            break
                    if not isinstance(parent, FactorType):
                        issues.append(
                            Issue(itype=3,
                                  description="Parent interface type name '" +
                                  ft_parent + "' not found in hierarchy '" +
                                  ft_h_name + "'",
                                  location=IssueLocation(sheet_name=name,
                                                         row=r,
                                                         column=None)))
                        return
                else:
                    issues.append(
                        Issue(itype=3,
                              description="Parent interface type name '" +
                              ft_parent + "' not found",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
                # Double check, it must be defined in "hie"
                if ft_parent not in hie.codes:
                    issues.append(
                        Issue(itype=3,
                              description="Parent interface type name '" +
                              ft_parent +
                              "' not registered in the hierarchy '" +
                              ft_h_name + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
            else:
                parent = None

            # Check if FactorType exists
            ft = glb_idx.get(FactorType.partial_key(ft_name))
            if len(ft) == 0:
                # TODO Compile and CONSIDER attributes (on the FactorType side)
                roegen_type = None
                if ft_roegen_type:
                    roegen_type = FlowFundRoegenType.flow if strcmp(
                        ft_roegen_type, "flow") else FlowFundRoegenType.fund

                ft = FactorType(
                    ft_name,
                    parent=parent,
                    hierarchy=hie,
                    roegen_type=roegen_type,
                    tags=None,  # No tags
                    attributes=dict(unit=ft_unit,
                                    description=ft_description,
                                    **attributes),  # the parsed attributes dict, not the raw field
                    expression=ft_formula,
                    # orientation=ft_orientation,
                    sphere=ft_sphere)
                # Simple name
                glb_idx.put(FactorType.partial_key(ft_name, ft.ident), ft)
                if not strcmp(ft_name, ft.full_hierarchy_name()):
                    glb_idx.put(
                        FactorType.partial_key(ft.full_hierarchy_name(),
                                               ft.ident), ft)
            else:
                issues.append(
                    Issue(itype=3,
                          description="Interface type name '" + ft_name +
                          "' already registered",
                          location=IssueLocation(sheet_name=name,
                                                 row=r + 1,
                                                 column=None)))
                return
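
The hierarchy lookup in this example is a get-or-create against the global registry: query by partial key, create and register on a miss, unwrap the single hit otherwise. The same idiom condensed into a sketch (hypothetical helper; partial_key()/put() used exactly as above):

def get_or_create_hierarchy(glb_idx, h_name: str):
    hie = glb_idx.get(Hierarchy.partial_key(name=h_name))
    if not hie:
        hie = Hierarchy(name=h_name, type_name="interfacetype")
        glb_idx.put(hie.key(), hie)
        return hie
    return hie[0]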
Example #14
def parse_dataset_data_command(sh: Worksheet, area: AreaTupleType, name: str,
                               state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:    Input worksheet
    :param area:  Area of the input worksheet to be analysed
    :param name:  Name of the worksheet
    :param state: State object (not used during syntax checking)
    :return:      The command in a dict-list object (JSON ready)
    """

    issues: List[Issue] = []

    # Analyze column names
    col_map = create_dictionary()
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=area[0], column=c).value
        if not col_name:
            continue  # guard against empty header cells (None has no .strip())
        col_name = col_name.strip()
        # Avoid repetitions
        if col_name in col_map:
            issues.append(
                Issue(itype=3,
                      description="The column name '" + col_name +
                      "' is repeated",
                      location=IssueLocation(sheet_name=name, row=1,
                                             column=c)))

        if strcmp(col_name, "DatasetName") or strcmp(col_name, "Dataset"):
            col_map["dataset"] = c
        elif col_name:
            # Concept name
            col_map[col_name] = c

    if "dataset" not in col_map:
        issues.append(
            Issue(
                itype=3,
                description=
                "The column name 'DatasetName' is not defined for command 'DatasetData'",
                location=IssueLocation(sheet_name=name, row=1, column=None)))

    if any([i.itype == 3 for i in issues]):
        return issues, None, None

    # Read all the content into a list of lists
    lines = []
    for r in range(area[0] + 1, area[1]):
        line = []
        for col_name, c in col_map.items():
            v = sh.cell(row=r, column=c).value
            if isinstance(v, str):
                v = v.strip()
            line.append(v)
        lines.append(line)

    # pd.DataFrame
    df = pd.DataFrame(columns=[col_name for col_name in col_map], data=lines)

    # Find the different datasets
    datasets = df["dataset"].unique()
    datasets = set([d.lower() for d in datasets])

    content = []  # The output JSON
    for dataset in datasets:
        # Obtain the rows of this dataset, dropping the selector column
        # (drop() returns a copy, avoiding pandas' SettingWithCopy warning)
        df2 = df.loc[df['dataset'].str.lower() == dataset].drop(columns=["dataset"])
        s = StringIO()
        df2.to_json(s, orient="split")
        content.append(dict(name=dataset, values=s.getvalue()))

    return issues, None, dict(items=content, command_name=name)
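
Each per-dataset frame is serialized with to_json(orient="split"), so a consumer can rebuild it with plain pandas. A round-trip sketch (no project-specific API; content is the list built above):

from io import StringIO
import pandas as pd

item = content[0]  # {"name": ..., "values": ...}
df_back = pd.read_json(StringIO(item["values"]), orient="split")

Example #15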
        def process_line(item):
            sc_src_hierarchy = item.get("source_hierarchy")
            sc_src_interface_type = item.get("source_interface_type")
            sc_tgt_hierarchy = item.get("target_hierarchy")
            sc_tgt_interface_type = item.get("target_interface_type")
            sc_scale = item.get("scale")
            sc_src_context = item.get("source_context")
            sc_tgt_context = item.get("target_context")
            sc_src_unit = item.get("source_unit")
            sc_tgt_unit = item.get("target_unit")


            # Check if FactorTypes exist
            fts = []
            for i, (hierarchy, interface_type) in enumerate([
                (sc_src_hierarchy, sc_src_interface_type),
                (sc_tgt_hierarchy, sc_tgt_interface_type)
            ]):
                m = "origin" if i == 0 else "destination"
                if not interface_type:
                    issues.append(
                        Issue(itype=3,
                              description="The " + m +
                              "interface type name has not been specified",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return

                # Check if FactorType exists
                ft = glb_idx.get(FactorType.partial_key(interface_type))
                if len(ft) > 0:
                    if len(ft) == 1:
                        fts.append(ft[0])
                    else:
                        if not hierarchy:
                            issues.append(
                                Issue(
                                    itype=3,
                                    description="The hierarchy of the " + m +
                                    "interface type name has not been specified and the interface type name is not unique",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return

                        for ft2 in ft:
                            if strcmp(ft2.hierarchy.name, hierarchy):
                                fts.append(ft2)

            if len(fts) != 2:
                issues.append(
                    Issue(
                        itype=3,
                        description="Found " + str(len(fts)) +
                        " interface types in the specification of a scale change",
                        location=IssueLocation(sheet_name=name,
                                               row=r,
                                               column=None)))
                return

            # Check that the interface types are from different hierarchies (warn if not; not error)
            if fts[0].hierarchy == fts[1].hierarchy:
                issues.append(
                    Issue(itype=2,
                          description="The interface types '" + fts[0].name +
                          "' and '" + fts[1].name +
                          "' are in the same hierarchy",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))

            # Create the directed Scale (Linear "Transformation") Relationship
            origin = fts[0]
            destination = fts[1]
            FactorTypesRelationUnidirectionalLinearTransformObservation.\
                create_and_append(origin, destination, sc_scale,
                                  sc_src_context, sc_tgt_context,
                                  Observer.no_observer_specified)
Example #16
def parse_command(sh: Worksheet, area: AreaTupleType, name: Optional[str],
                  cmd_name: str) -> IssuesLabelContentTripleType:
    """
    Parse command in general
    Generate a JSON
    Generate a list of issues

    :param sh: Worksheet to read
    :param area: Area of the worksheet
    :param name: Name of the worksheet
    :param cmd_name: Name of the command. Key to access "command_fields" variable. Also, shown in issue descriptions
    :return: issues List, None, content (JSON)
    """

    issues: List[Issue] = []

    from backend.command_field_definitions import command_fields

    cols = command_fields[
        cmd_name]  # List of CommandField that will guide the parsing
    col_map, local_issues = check_columns(sh, name, area, cols, cmd_name)

    if any([i.itype == 3 for i in local_issues]):
        return local_issues, None, None

    issues.extend(local_issues)

    # "mandatory" can be defined as expression depending on other base fields (like in RefBibliographic command fields)
    # Elaborate a list of fields having this "complex" mandatory property
    complex_mandatory_cols = [c for c in cols if isinstance(c.mandatory, str)]

    content = []  # The output JSON
    # Parse each Row
    for r in range(area[0] + 1, area[1]):
        line = {}
        expandable = False  # The line contains at least one field implying expansion into multiple lines
        complex = False  # The line contains at least one field with a complex rule (which cannot be evaluated with a simple cast)

        # Constant mandatory values
        mandatory_not_found = set([
            c.name for c in cols
            if c.mandatory and isinstance(c.mandatory, bool)
        ])

        # Each "field"
        for col in col_map.keys():
            cname = col.name
            # Appearances of the field (normally just once, though some attributes allow more than one appearance)
            for col_name, col_idx in col_map[col]:
                # Read and prepare "value"
                ##value = sh_dict.get((r, col_idx), None)
                value = sh.cell(row=r, column=col_idx).value
                if value:
                    if not isinstance(value, str):
                        value = str(value)
                    value = value.strip()
                else:
                    continue

                if col.allowed_values:  # If the CommandField checks for a list of allowed values
                    if value.lower() not in [
                            v.lower() for v in col.allowed_values
                    ]:  # TODO Case insensitive CI
                        issues.append(
                            Issue(
                                itype=3,
                                description=
                                f"Field '{col_name}' of command '{cmd_name}' has invalid value '{value}'."
                                f" Allowed values are: {', '.join(col.allowed_values)}.",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=col_idx)))
                    else:
                        line[cname] = value
                else:  # Instead of a list of values, check if a syntactic rule is met by the value
                    if col.parser:  # Parse, just check syntax (do not store the AST)
                        try:
                            ast = parser_field_parsers.string_to_ast(
                                col.parser, value)
                            # Rules are in charge of informing if the result is expandable and if it is complex
                            if "expandable" in ast and ast["expandable"]:
                                expandable = True
                            if "complex" in ast and ast["complex"]:
                                complex = True

                            # With many appearances, just a "Key-Value list" syntax is permitted
                            if col.many_appearances:
                                if cname in line:
                                    line[
                                        cname] += ", " + col_name + "='" + value + "'"
                                else:
                                    line[cname] = col_name + "='" + value + "'"
                            else:
                                if cname in line:
                                    line[cname] += ", " + value
                                else:
                                    line[cname] = value  # Store the value
                        except Exception:
                            col_header = str(sh.cell(row=1, column=col_idx).value)
                            issues.append(
                                Issue(
                                    itype=3,
                                    description="The value in field '" +
                                    col_header + "' of command '" + cmd_name +
                                    "' is not syntactically correct. Entered: "
                                    + value,
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=col_idx)))
                    else:
                        line[cname] = value  # No parser, just store the value blindly

            if col.name in mandatory_not_found:
                mandatory_not_found.discard(col.name)

        if len(line) == 0:
            continue  # Empty line (allowed)

        # Flags to accelerate the second evaluation, during execution
        line["_row"] = r
        line["_expandable"] = expandable
        line["_complex"] = complex

        # Append if all mandatory fields have been filled
        may_append = True
        if len(mandatory_not_found) > 0:
            issues.append(
                Issue(itype=3,
                      description="Mandatory columns: " +
                      ", ".join(mandatory_not_found) +
                      " have not been specified",
                      location=IssueLocation(sheet_name=name,
                                             row=r,
                                             column=None)))
            may_append = False

        # Check varying mandatory fields (fields depending on the value of other fields)
        for c in complex_mandatory_cols:
            col = c.name
            if isinstance(c.mandatory, str):
                # Evaluate the rule with the parsed line as the local namespace
                mandatory = eval(c.mandatory, None, line)
                if mandatory and col not in line:
                    may_append = False  # keep any previous False; do not reset it
                    issues.append(
                        Issue(itype=3,
                              description="Mandatory column: " + col +
                              " has not been specified",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))

        if may_append:
            content.append(line)

    return issues, None, {"items": content, "command_name": name}
Example #17
    def execute(self, state: "State"):
        """
        Create a Hierarchy of Taxon. The exact form of this hierarchy is different depending on the concept:
        * FactorTypes and Categories use Hierarchies, which are intrinsic.
            The hierarchy name is passed to the containing Hierarchy object
        * Processors use Part-Of Relations. In this case, the hierarchy name is lost
        Names of Processors and FactorTypes are built both in hierarchical and simple form.
        The hierarchical name is all the ancestors from the root down to the current node, separated by "."
        The simple name is just the current node. If another concept already has that name, the simple name
        is not stored (TODO: store both concepts under the same name and design a tie-breaking mechanism?)
        """
        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        # Process parsed information
        for item in self._content["items"]:
            r = item["_row"]
            # HierarchySource (Optional)
            hsource = item.get("source",
                               None)  # Code of entity defining the Hierarchy
            if hsource:
                tmp = hsource
                hsource = glb_idx.get(
                    HierarchySource.partial_key(name=hsource))
                if len(hsource) == 0:
                    hsource = HierarchySource(name=tmp)
                    glb_idx.put(hsource.key(), hsource)
                else:
                    hsource = hsource[0]

            hname = item.get("hierarchy_name", None)
            if not hname:
                issues.append(
                    Issue(
                        itype=3,
                        description=
                        "The name of the Hierarchy has not been defined. Skipped.",
                        location=IssueLocation(sheet_name=name,
                                               row=r,
                                               column=None)))
                continue

            # HierarchyGroup (equivalent to Hierarchy of Code Lists, HCL)
            hg = item.get("hierarchy_group", None)
            if hg:
                is_code_list = False  # Hierarchy group
            else:
                is_code_list = True  # Hierarchy group for the Code List, with the same name
                hg = hname

            # Check if the HierarchyGroup is previously defined. YES, use it; NO, create new HierarchyGroup
            tmp = hg
            hg = glb_idx.get(HierarchyGroup.partial_key(name=hg))
            if len(hg) == 0:
                hg = HierarchyGroup(name=tmp, source=hsource)
                glb_idx.put(hg.key(), hg)
            else:
                hg = hg[0]

            # Check if the Hierarchy is defined. YES, get it; NO, create it
            tmp = hname
            h = glb_idx.get(Hierarchy.partial_key(name=hname))
            if len(h) == 0:
                h = Hierarchy(name=tmp)
                glb_idx.put(h.key(), h)
                glb_idx.put(h.key(hg.name + "." + h.name),
                            h)  # Register with alternative (full) name
            else:
                h = h[0]

            # Add the Hierarchy to the HierarchyGroup (if not)
            if h not in hg.hierarchies:
                hg.hierarchies.append(h)

            # Level
            level = item.get("level", None)
            if level:
                # Check if the level is defined. YES, get it; NO, create it
                for l in h.levels:
                    if strcmp(l.name, level):
                        level = l
                        break
                else:
                    level = HierarchyLevel(name=level, hierarchy=h)
                    h.levels.append(level)

            code = item.get("code", None)
            label = item.get("label", None)
            description = item.get("description", None)
            attributes = item.get("attributes", None)
            expression = item.get("expression", None)

            # Parent property (what really defines Hierarchies)
            parent_code = item.get("parent_code", None)
            if parent_code:
                ph = h  # Parent Hierarchy is the same as current hierarchy
                pcode = ph.codes.get(parent_code, None)
                if not pcode:
                    issues.append(
                        Issue(itype=3,
                              description="Could not find code '" +
                              parent_code + "' in hierarchy '" + ph.name +
                              "'. Skipped.",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
            else:
                pcode = None

            # ReferredHierarchy. If we are not defining a Code List, the base hierarchy has to be mentioned
            if not is_code_list:
                ref_hierarchy = item.get("referred_hierarchy", None)
                if not ref_hierarchy:
                    issues.append(
                        Issue(
                            itype=3,
                            description=
                            "For HCLs, defining ReferredHierarchy is mandatory",
                            location=IssueLocation(sheet_name=name,
                                                   row=r,
                                                   column=None)))
                    continue

                tmp = ref_hierarchy
                ref_hierarchy = glb_idx.get(
                    Hierarchy.partial_key(name=ref_hierarchy))
                if len(ref_hierarchy) == 0:
                    issues.append(
                        Issue(itype=3,
                              description="ReferredHierarchy '" + tmp +
                              "' not defined previously",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                else:
                    ref_hierarchy = ref_hierarchy[0]

                ref_code = ref_hierarchy.codes.get(code, None)
                if not ref_code:
                    issues.append(
                        Issue(itype=3,
                              description="Code '" + code +
                              "' not found in referred hierarchy '" +
                              ref_hierarchy.name + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue

                # Ignore: LABEL, DESCRIPTION. Copy them from referred code
                label = ref_code.label
                description = ref_code.description
            else:
                ref_code = None

            c = h.codes.get(code, None)
            if c:
                issues.append(
                    Issue(itype=3,
                          description="Code '" + code + "' in hierarchy '" +
                          h.name + "' redefined.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                continue

            # Finally, create the HierarchyCode with all the gathered attributes, then weave it to other
            # (name, label=None, description=None, referred_node=None, parent=None, parent_weight=1.0, hierarchy=None)
            c = Taxon(name=code,
                      hierarchy=h,
                      level=level,
                      referred_taxon=ref_code,
                      parent=pcode,
                      label=label,
                      description=description,
                      attributes=attributes,
                      expression=expression)
            # Add code to hierarchy
            h.codes[code] = c
            if not c.parent:
                h.roots_append(c)
            # Add code to level
            if level:
                level.codes.add(c)
            # Add child to parent code
            # (DONE BY THE CONSTRUCTOR!!)
            # if pcode:
            #     pcode.children_codes.append(c)

        return issues, None  # Issues, Output
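
Given the constructor signature quoted in the comment above, a small code list can be assembled directly. A sketch with hypothetical code names (registry wiring omitted):

h = Hierarchy(name="LandUse")
root = Taxon(name="Agricultural", hierarchy=h)
child = Taxon(name="Cropland", parent=root, hierarchy=h, label="Cropland")
h.codes["Agricultural"] = root
h.codes["Cropland"] = child
h.roots_append(root)  # only parentless codes become roots, as in the code above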
Example #18
def create_issue(itype: int, description: str) -> Issue:
    return Issue(itype=itype,
                 description=description,
                 location=IssueLocation(sheet_name=command_name,
                                        row=row,
                                        column=None))
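
Since the helper captures command_name and row from its enclosing scope, call sites reduce to a severity and a message. A sketch of its use, following the itype convention (2 = warning, 3 = error) seen throughout:

issues.append(create_issue(3, "Mandatory field not specified"))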