def execute(self, state: "State"):
    """
    Process each of the references, simply storing them as Reference objects
    """
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    issues = []

    # Receive a list of validated references
    # Store them as objects, which can be referred to later
    for ref in self._content["items"]:
        r = ref["_row"]

        if "ref_id" not in ref:
            issues.append(Issue(itype=3,
                                description="'ref_id' field not found: " + str(ref),
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            continue
        else:
            ref_id = ref["ref_id"]
            existing = glb_idx.get(self.ref_type.partial_key(ref_id))
            if len(existing) == 1:
                issues.append(Issue(itype=3,
                                    description="Reference '" + ref_id + "' of type '" + str(self.ref_type) + "' is already defined. Not allowed",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
            elif len(existing) > 1:  # This condition should not occur...
                issues.append(Issue(itype=3,
                                    description="The reference '" + ref_id + "' of type '" + str(self.ref_type) + "' is defined more than one time (" + str(len(existing)) + ")",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue

            # Create and store the Reference
            reference = self.ref_type(ref_id, ref)
            glb_idx.put(reference.key(), reference)

            # BibliographicReference and ProvenanceReference are also Observer
            if isinstance(reference, Observer):
                glb_idx.put(Observer.key(reference), reference)

    return issues, None
def add_issue(itype: int, description: str):
    issues.append(Issue(itype=itype,
                        description=description,
                        location=IssueLocation(sheet_name=name, row=i, column=None)))
    return
def _add_issue(self, itype: int, description: str):
    self._issues.append(Issue(itype=itype,
                              description=description,
                              location=IssueLocation(sheet_name=self._command_name, row=self._current_row_number, column=None)))
    return
def transform_issues(issues: List[Union[dict, backend.Issue, tuple, Issue]], cmd, sheet_number: int) -> (List[Issue], bool):
    errors_exist = False
    new_issues: List[Issue] = []

    for i in issues:
        if isinstance(i, dict):
            issue = Issue(itype=i["type"], description=i["message"], ctype=i["c_type"],
                          location=IssueLocation(sheet_name=i["sheet_name"], sheet_number=i["sheet_number"]))
        elif isinstance(i, backend.Issue):  # namedtuple
            issue = Issue(itype=i.type, description=i.message, ctype=i.c_type,
                          location=IssueLocation(sheet_name=i.sheet_name, sheet_number=i.sheet_number))
        elif isinstance(i, tuple):
            issue = Issue(itype=i[0], description=i[1],
                          location=IssueLocation(sheet_name=""))
        else:  # isinstance(i, Issue)
            issue = i

        if issue.itype == IType.error():
            errors_exist = True

        if not issue.ctype and cmd:  # "cmd" may be "None", in case the Issue is produced by the commands container loop
            issue.ctype = cmd._serialization_type

        if not issue.location.sheet_name or issue.location.sheet_name == "":
            issue.location.sheet_name = cmd._source_block_name if hasattr(cmd, "_source_block_name") else ""

        if not issue.location.sheet_number:
            issue.location.sheet_number = sheet_number

        new_issues.append(issue)

    return new_issues, errors_exist
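# Note: "transform_issues" normalizes heterogeneous issue representations into "Issue" objects.
# A sketch of the three non-Issue shapes it understands (field values and the namedtuple argument
# order below are assumptions for illustration, only the attribute/key names come from the code):
#   {"type": 3, "message": "...", "c_type": "...", "sheet_name": "Processors", "sheet_number": 2}   # dict
#   backend.Issue(sheet_number=2, sheet_name="Processors", c_type="...", type=3, message="...")     # namedtuple
#   (3, "...")                                                                                       # (itype, description) tuple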
def execute(self, state: "State"):
    issues = []

    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    # List of available dataset names. The newly defined datasets must not be in this list
    ds_names = [ds.code for ds in datasets.values()]

    # Process parsed information
    for r, line in enumerate(self._content["items"]):
        # A dataset
        dataset_name = line["name"]
        # Find it in the already available datasets. MUST EXIST
        for n in ds_names:
            if strcmp(dataset_name, n):
                df = pd.read_json(StringIO(line["values"]), orient="split")
                # Check columns
                ds = datasets[n]
                iss = prepare_dataframe_after_external_read(ds, df)
                for issue in iss:
                    issues.append(Issue(itype=3,
                                        description=issue,
                                        location=IssueLocation(sheet_name=name, row=-1, column=-1)))
                # Everything ok? Store the dataframe!
                if len(iss) == 0:
                    ds.data = df
                break
        else:
            issues.append(Issue(itype=3,
                                description="Metadata for the dataset '" + dataset_name + "' must be defined previously",
                                location=IssueLocation(sheet_name=name, row=-1, column=-1)))

    return issues, None
def execute(self, state: "State"):
    def process_line(item):
        # Read variables
        mh_src_dataset = item.get("source_dataset", None)
        mh_src_hierarchy = item.get("source_hierarchy", None)
        mh_src_code = item.get("source_code", None)
        mh_dst_hierarchy = item.get("destination_hierarchy", None)
        mh_dst_code = item.get("destination_code", None)
        mh_weight = item.get("weight", None)

        # Mapping name
        name = ((mh_src_dataset + ".") if mh_src_dataset else "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy

        if name in mappings:
            issues.append(Issue(itype=3,
                                description="The mapping '" + name + "' has been declared previously. Skipped.",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return

        if name in local_mappings:
            d = local_mappings[name]
        else:
            d = DottedDict()
            local_mappings[name] = d
            d.name = name
            d.origin_dataset = mh_src_dataset
            d.origin_hierarchy = mh_src_hierarchy
            d.destination_hierarchy = mh_dst_hierarchy
            d.mapping = create_dictionary()

        # Specific code
        if mh_src_code in d.mapping:
            to_dict = d.mapping[mh_src_code]
        else:
            to_dict = create_dictionary()
        if mh_dst_code in to_dict:
            issues.append(Issue(itype=3,
                                description="The mapping of '" + mh_src_code + "' into '" + mh_dst_code + "' has been done already",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return
        else:
            to_dict[mh_dst_code] = mh_weight  # NOTE: This could be an object instead of just a FLOAT or expression
            d.mapping[mh_src_code] = to_dict

    issues = []
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    local_mappings = create_dictionary()

    # Process parsed information
    for line in self._content["items"]:
        r = line["_row"]
        # If the line contains a reference to a dataset or hierarchy, expand it
        # If not, process it directly
        is_expansion = False
        if is_expansion:
            # TODO Iterate through dataset and/or hierarchy elements, producing a list of new items
            pass
        else:
            process_line(line)

    # Mappings post-processing
    for d in local_mappings:
        # Convert the mapping into:
        # [{"o": "", "to": [{"d": "", "w": ""}]}]
        # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ]
        mapping = []
        for orig in local_mappings[d].mapping:
            lst = []
            for dst in local_mappings[d].mapping[orig]:
                lst.append(dict(d=dst, w=local_mappings[d].mapping[orig][dst]))
            mapping.append(dict(o=orig, to=lst))
        if local_mappings[d].origin_dataset:
            dims, attrs, meas = obtain_dataset_metadata(local_mappings[d].origin_dataset)
            if local_mappings[d].origin_hierarchy not in dims:
                issues.append(Issue(itype=3,
                                    description="The origin dimension '" + local_mappings[d].origin_hierarchy + "' does not exist in dataset '" + local_mappings[d].origin_dataset + "'",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
            else:
                dim = dims[local_mappings[d].origin_hierarchy]
                mapping = fill_map_with_all_origin_categories(dim, mapping)
        #
        origin_dataset = local_mappings[d].origin_dataset
        origin_hierarchy = local_mappings[d].origin_hierarchy
        destination_hierarchy = local_mappings[d].destination_hierarchy
        # Create Mapping and add it to Case Study mappings variable
        mappings[d] = Mapping(d, obtain_dataset_source(origin_dataset), origin_dataset, origin_hierarchy, destination_hierarchy, mapping)

    # TODO
    # Use the function to perform many to many mappings, "augment_dataframe_with_mapped_columns"
    # Put it to work !!!
    # One or more mappings in sequence could be specified? The key is "source hierarchy+dest hierarchy"
    # Read mapping parameters

    return issues, None
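# For reference, a minimal example of the structure handed to "Mapping" after post-processing
# (category codes and weights below are made up for illustration, the shape comes from the comment above):
#
#   mapping = [
#       {"o": "ES", "to": [{"d": "Spain", "w": 1.0}]},
#       {"o": "PT", "to": [{"d": "Portugal", "w": 1.0}]}
#   ]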
def parse_and_unfold_line(item):
    # Consider multiplicity because of:
    # - A dataset (only one). First a list of dataset concepts used in the line is obtained.
    #   Then the unique tuples formed by them are obtained.
    # - Processor name.
    #   - A set of processors (wildcard or filter by attributes)
    #   - A set of interfaces (according to another filter?)
    # - Multiple types of relation
    # - Both (first each dataset record applied -expanded-, then the name evaluation is applied)
    # - UNRESOLVED: expressions are resolved partially; parts of an expression depending on parameters
    #   are left as they are, only the parts depending on varying things are expanded
    # - The processor name could be a concatenation of multiple literals
    #
    # Look for multiple items in r_source_processor_name, r_source_interface_name,
    # r_target_processor_name, r_target_interface_name
    if item["_complex"]:
        asts = parse_line(item, fields)
        if item["_expandable"]:
            # It is an expandable line
            # Look for fields which are specified to be variable in order to originate the expansion
            res = classify_variables(asts, datasets, hh, parameters)
            ds_list = res["datasets"]
            ds_concepts = res["ds_concepts"]
            h_list = res["hierarchies"]
            if len(ds_list) >= 1 and len(h_list) >= 1:
                issues.append(Issue(itype=3,
                                    description="Dataset(s): " + ", ".join([d.name for d in ds_list]) + ", and hierarchy(ies): " + ", ".join([h.name for h in h_list]) + ", have been specified. Only a single dataset is supported.",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
            elif len(ds_list) > 1:
                issues.append(Issue(itype=3,
                                    description="More than one dataset has been specified: " + ", ".join([d.name for d in ds_list]) + ", just one dataset is supported.",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
            elif len(h_list) > 0:
                issues.append(Issue(itype=3,
                                    description="One or more hierarchies have been specified: " + ", ".join([h.name for h in h_list]),
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return

            const_dict = obtain_dictionary_with_literal_fields(item, asts)
            if len(ds_list) == 1:
                # If a measure is requested and not all dimensions are used, aggregate or
                # issue an error (because it is not possible to reduce without aggregation).
                # If only dimensions are used, then obtain all the unique tuples
                ds = ds_list[0]
                measure_requested = False
                all_dimensions = set([c.code for c in ds.dimensions if not c.is_measure])
                for con in ds_concepts:
                    for c in ds.dimensions:
                        if strcmp(c.code, con):
                            if c.is_measure:
                                measure_requested = True
                            else:  # Dimension
                                all_dimensions.remove(c.code)
                only_dimensions_requested = len(all_dimensions) == 0

                if measure_requested and not only_dimensions_requested:
                    issues.append(Issue(itype=3,
                                        description="It is not possible to use a measure if not all dataset dimensions are used (cannot assume implicit aggregation)",
                                        location=IssueLocation(sheet_name=name, row=r, column=None)))
                    return
                elif not measure_requested and not only_dimensions_requested:
                    # TODO Reduce the dataset to the unique tuples (consider the current case -sensitive or not-sensitive-)
                    data = None
                else:  # Take the dataset as-is!!!
                    data = ds.data

                # Each row
                for _, row in data.iterrows():
                    item2 = const_dict.copy()

                    d = {}
                    for c in ds_concepts:
                        d["{" + ds.code + "." + c + "}"] = row[c]
                    # Expand in all fields
                    for f in fields:
                        if f not in const_dict:
                            # Replace all
                            string = item[f]
                            # TODO Could iterate through the variables in the field (not IN ALL FIELDS of the row)
                            for var in sorted(d.keys(), key=len, reverse=True):
                                # Escape the placeholder (it contains ".", "{" and "}") and force the value to str
                                string = re.sub(re.escape(var), str(d[var]), string)
                            item2[f] = string
                    print("Multiple by dataset: " + str(item2))
                    yield item2
        else:
            # No dataset, no hierarchy of categories, but it could be still complex, because of wildcards
            # For now return just the line
            yield item
            # wildcard_in_source = ".." in item.get("source_processor", "")
            # wildcard_in_target = ".." in item.get("target_processor", "")
            # if wildcard_in_source or wildcard_in_target:
            #     r_source_processor_name = string_to_ast(processor_names, item.get("source_processor", None))
            #     r_target_processor_name = string_to_ast(processor_names, item.get("target_processor", None))
            #     if wildcard_in_source:
            #         source_processor_names = obtain_matching_processors(r_source_processor_name, all_processors)
            #     else:
            #         source_processor_names = [item["source_processor"]]
            #     if wildcard_in_target:
            #         target_processor_names = obtain_matching_processors(r_target_processor_name, all_processors)
            #     else:
            #         target_processor_names = [item["target_processor"]]
            #     for s in source_processor_names:
            #         for t in target_processor_names:
            #             item3 = const_dict.copy()
            #             item3["source_processor"] = s
            #             item3["target_processor"] = t
            #             print("Multiple by wildcard: " + str(item3))
            #             yield item3
            # else:
            #     # yield item
            #     raise Exception("If 'complex' is signaled, it should not pass by this line")
    else:
        # print("Single: " + str(item))
        yield item
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition

    # Obtain the source
    source = obtain_dataset_source(dataset_name)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(data={k: None for k in dims[d].code_list.keys()})  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            tmp = create_dictionary(data={to["d"]: None for o in mappings[m].map for to in o["to"] if to["d"]})
            cl[mappings[m].destination] = tmp  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.

    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian

    out_dims = []

    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions", "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(Issue(itype=3,
                                        description="The dimension specified for output, '" + d + "' is neither a dataset dimension nor a mapped dimension. [" + ', '.join([d2 for d2 in cl]) + "]",
                                        location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures", "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    issues.append(Issue(itype=3,
                                        description="The specified measure, '" + m + "' is not a measure available in the dataset. [" + ', '.join([m2 for m2 in meas]) + "]",
                                        location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in ["resultmeasuresaggregation", "resultmeasuresaggregator", "aggregation"]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue
                if f.lower() not in ["sum", "avg", "count", "sumna", "countav", "avgna", "pctna"]:
                    issues.append(Issue(itype=3,
                                        description="The specified aggregation function, '" + f + "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                                        location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in ["resultmeasurename", "resultmeasuresnames", "resultmeasuresas", "measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    issues.append(Issue(itype=3,
                                        description="The code '" + str(cd) + "' is not present in the codes declared for dimension '" + col_name + "'. Please, check them.",
                                        location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in ["startperiod", "endperiod"]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["outputdatasetname", "outputdataset", "result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident, result_name)
                except:
                    issues.append(Issue(itype=3,
                                        description="Column '" + col_name + "' has an invalid dataset name '" + result_name + "'",
                                        location=IssueLocation(sheet_name=name, row=2, column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(Issue(itype=2,
                            description="No aggregation function specified. Assuming 'average'",
                            location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(Issue(itype=3,
                                description="The aggregation function must be defined in the first row",
                                location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))

    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Uniform rows, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if measure and not agg_func or not measure and agg_func:
            issues.append(Issue(itype=3,
                                description="Each measure must be associated with an aggregation function",
                                location=IssueLocation(sheet_name=name, row=r, column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [v["measure_as"] for v in out_measures.values() if v["measure_as"]]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(Issue(itype=3,
                            description="At least one measure should be specified",
                            location=IssueLocation(sheet_name=name, row=1, column=measure_names_column)))

    # measures != agg_funcs && len(agg_funcs) == 1 --> OK
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(Issue(itype=3,
                            description="There must be one aggregation function (used for all measures) or one aggregation per measure",
                            location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(Issue(itype=2,
                            description="No result name specified. Assuming '" + result_name + "'",
                            location=IssueLocation(sheet_name=name, row=2, column=c + 1)))

    content = {"dataset_source": source,
               "dataset_name": dataset_name,
               "dataset_datetime": available_at_datetime,
               "where": filter_,
               "dimensions": [d for d in dims],
               "group_by": out_dims,
               "measures": measures,
               "agg_funcs": agg_funcs,
               "measures_as": measures_as,
               "result_name": result_name}
    return issues, None, content
def check_columns(sh, name: str, area: Tuple, cols: List[CommandField], command_name: str, ignore_not_found=False):
    """
    When parsing of a command starts, check columns
    Try to match each column with declared column fields. If a column is not declared, raise an error (or ignore it)
    If mandatory columns are not found, raise an error

    :param sh: The worksheet being analyzed
    :param name: The name of the worksheet
    :param area: Area inside the worksheet that will be scanned
    :param cols: List of CommandField
    :param command_name: A string with the name of the command
    :param ignore_not_found: True if a column not matching declared ones has to be ignored, False if an error has to be raised in this case
    :return: The map column name to column index (or indices for multiply declared columns); The issues found
    """
    issues: List[Issue] = []
    # Set of mandatory columns
    mandatory_not_found = set([c.name for c in cols if c.mandatory])
    # Check columns
    col_map = {}  # From CommandField to a list of column index
    for c in range(area[2], area[3]):  # For each column of row 0 (Header Row)
        ##val = sh.get((area[0], c), None)
        val = sh.cell(row=area[0], column=c).value
        if not val:
            continue
        col_name = val.strip()
        for col in cols:  # Find matching CommandField from the attribute "regex_allowed_names"
            if col.regex_allowed_names.match(col_name):
                # Found matching CommandField "col". Process
                if "@" in col_name:  # In case of use of "@", remove prefix
                    col_name = col_name[col_name.index("@") + 1:]
                # Column Name to Column Index
                if not col.many_appearances:  # Column appears once
                    if col in col_map:
                        issues.append(Issue(itype=3,
                                            description="The column '" + col.name + "' should not appear more than one time",
                                            location=IssueLocation(sheet_name=name, row=1, column=c)))
                    col_map[col] = [(col_name, c)]
                else:  # Column appears one or more times
                    if col not in col_map:
                        col_map[col] = []
                    col_map[col].append((col_name, c))
                # Mandatory found (good)
                if col.name in mandatory_not_found:
                    mandatory_not_found.discard(col.name)
                break
        else:  # No match for the column "col_name"
            if not ignore_not_found:
                issues.append(Issue(itype=3,
                                    description="The column name '" + col_name + "' does not match any of the allowed column names for the command '" + command_name + "'",
                                    location=IssueLocation(sheet_name=name, row=1, column=c)))

    if len(mandatory_not_found) > 0:
        issues.append(Issue(itype=3,
                            description="Mandatory columns: " + ", ".join(mandatory_not_found) + " have not been specified",
                            location=IssueLocation(sheet_name=name, row=1, column=None)))

    return col_map, issues
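# The returned "col_map" maps each matched CommandField to the list of (column header, column index)
# pairs where it appears. A hypothetical illustration (field and header names are made up):
#   {<CommandField "processor">: [("Processor", 2)],
#    <CommandField "attributes">: [("Level", 5), ("System", 6)]}
# Fields declared with "many_appearances" may accumulate several pairs, and the "@" prefix of a
# header such as "@Level" is stripped before storing, as done above.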
def execute(self, state: "State"):
    def process_line(item):
        # Read variables
        dsd_dataset_name = item.get("dataset_name", None)
        dsd_dataset_data_location = item.get("dataset_data_location", None)
        dsd_concept_type = item.get("concept_type", None)
        dsd_concept_name = item.get("concept_name", None)
        dsd_concept_data_type = item.get("concept_data_type", None)
        dsd_concept_domain = item.get("concept_domain", None)
        dsd_concept_description = item.get("concept_description", None)
        dsd_attributes = item.get("concept_attributes", None)
        if dsd_attributes:
            try:
                attributes = dictionary_from_key_value_list(dsd_attributes, glb_idx)
            except Exception as e:
                issues.append(Issue(itype=3,
                                    description=str(e),
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
        else:
            attributes = {}

        if dsd_dataset_name in ds_names:
            issues.append(Issue(itype=3,
                                description="The dataset '" + dsd_dataset_name + "' has been already defined",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return

        # Internal dataset definitions cache
        ds = current_ds.get(dsd_dataset_name, None)
        if True:  # Statistical dataset format
            if not ds:
                ds = Dataset()
                ds.code = dsd_dataset_name  # Name
                if not dsd_concept_type:
                    attributes["_location"] = dsd_dataset_data_location  # Location
                    ds.description = dsd_concept_description
                ds.attributes = attributes  # Set attributes
                ds.database = None
                current_ds[dsd_dataset_name] = ds
            # If concept_type is defined => add a concept
            if dsd_concept_type:
                d = Dimension()
                d.dataset = ds
                d.description = dsd_concept_description
                d.code = dsd_concept_name
                d.is_measure = False if dsd_concept_type.lower() == "dimension" else True
                if not d.is_measure and dsd_concept_data_type.lower() == "time":
                    d.is_time = True
                else:
                    d.is_time = False
                if dsd_concept_type.lower() == "attribute":
                    attributes["_attribute"] = True
                else:
                    attributes["_attribute"] = False
                if dsd_concept_data_type.lower() == "category":
                    # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or "hierarchy_categories_command.py", no insertion is made)
                    # h = hierarchies.get(dsd_concept_domain, None)
                    h = glb_idx.get(Hierarchy.partial_key(name=dsd_concept_domain))
                    if len(h) == 0:
                        issues.append(Issue(itype=3,
                                            description="Could not find hierarchy of Categories '" + dsd_concept_domain + "'",
                                            location=IssueLocation(sheet_name=name, row=r, column=None)))
                        return
                    elif len(h) > 1:
                        issues.append(Issue(itype=3,
                                            description="Found more than one instance of Categories '" + dsd_concept_domain + "'",
                                            location=IssueLocation(sheet_name=name, row=r, column=None)))
                        return
                    else:  # len(h) == 1
                        h = h[0]
                    d.hierarchy = h
                    # Reencode the Hierarchy as a CodeList
                    cl = convert_hierarchy_to_code_list(h)
                    d.code_list = cl
                attributes["_datatype"] = dsd_concept_data_type
                attributes["_domain"] = dsd_concept_domain
                d.attributes = attributes

    # -------------------------------------------------------------------------------------------------------------
    issues = []

    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    # List of available dataset names. The newly defined datasets must not be in this list
    ds_names = [ds.code for ds in datasets.values()]

    # List of available Category hierarchies
    hierarchies = create_dictionary()
    for h in hh:
        hierarchies[h.name] = h

    # Datasets being defined in this Worksheet
    current_ds = create_dictionary()

    # Process parsed information
    for line in self._content["items"]:
        r = line["_row"]
        # If the line contains a reference to a dataset or hierarchy, expand it
        # If not, process it directly
        is_expansion = False
        if is_expansion:
            pass
        else:
            process_line(line)

    # Any error?
    for issue in issues:
        if issue.itype == 3:
            error = True
            break
    else:
        error = False

    # Load the data for those datasets that are not local (data defined later in the same spreadsheet)
    for ds in current_ds.values():
        if "_location" not in ds.attributes:
            error = True
            issues.append(Issue(itype=3,
                                description="Location of data not specified for dataset '" + ds.code + "'",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
        else:
            loc = ds.attributes["_location"]
            ast = parser_field_parsers.string_to_ast(url_parser, loc)
            if ast["scheme"] != "data":
                df = load_dataset(loc)
                if df is None:
                    error = True
                    issues.append(Issue(itype=3,
                                        description="Could not obtain data for dataset '" + ds.code + "'",
                                        location=IssueLocation(sheet_name=name, row=r, column=None)))
                else:
                    iss = prepare_dataframe_after_external_read(ds, df)
                    for issue in iss:
                        issues.append(Issue(itype=3,
                                            description=issue,
                                            location=IssueLocation(sheet_name=name, row=-1, column=-1)))
                    # Everything ok? Store the dataframe!
                    if len(iss) == 0:
                        ds.data = df

    if not error:
        # If no error happened, add the new Datasets to the Datasets in the "global" state
        for ds in current_ds:
            datasets[ds] = current_ds[ds]

    return issues, None
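# Note on the "_location" handling above: a URL whose scheme is "data" is taken to mean the dataset
# content is embedded in the same spreadsheet (provided later by a DatasetData sheet), so nothing is
# loaded here; any other scheme (an assumed example would be "https://.../file.csv") is fetched
# immediately through "load_dataset" and validated with "prepare_dataframe_after_external_read".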
def process_line(item):
    # Read variables
    ft_h_name = item.get("interface_type_hierarchy", "_default")  # "_default" InterfaceType Hierarchy NAME <<<<<<
    ft_name = item.get("interface_type", None)
    ft_sphere = item.get("sphere", None)
    ft_roegen_type = item.get("roegen_type", None)
    ft_parent = item.get("parent_interface_type", None)
    ft_formula = item.get("formula", None)
    ft_description = item.get("description", None)
    ft_unit = item.get("unit", None)
    # ft_orientation = item.get("orientation", None)
    ft_attributes = item.get("attributes", {})
    if ft_attributes:
        try:
            attributes = dictionary_from_key_value_list(ft_attributes, glb_idx)
        except Exception as e:
            issues.append(Issue(itype=3,
                                description=str(e),
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return
    else:
        attributes = {}

    # Process
    # Mandatory fields
    if not ft_h_name:
        issues.append(Issue(itype=3,
                            description="Empty interface type hierarchy name. Skipped.",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))
        return

    if not ft_name:
        issues.append(Issue(itype=3,
                            description="Empty interface type name. Skipped.",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))
        return

    # Check if a hierarchy of interface types by the name <ft_h_name> exists, if not, create it and register it
    hie = glb_idx.get(Hierarchy.partial_key(name=ft_h_name))
    if not hie:
        hie = Hierarchy(name=ft_h_name, type_name="interfacetype")
        glb_idx.put(hie.key(), hie)
    else:
        hie = hie[0]

    # If parent defined, check if it exists
    # (it must be registered both in the global registry AND in the hierarchy)
    if ft_parent:
        parent = glb_idx.get(FactorType.partial_key(ft_parent))
        if len(parent) > 0:
            for p in parent:
                if p.hierarchy == hie:
                    parent = p
                    break
            if not isinstance(parent, FactorType):
                issues.append(Issue(itype=3,
                                    description="Parent interface type name '" + ft_parent + "' not found in hierarchy '" + ft_h_name + "'",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
        else:
            issues.append(Issue(itype=3,
                                description="Parent interface type name '" + ft_parent + "' not found",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return
        # Double check, it must be defined in "hie"
        if ft_parent not in hie.codes:
            issues.append(Issue(itype=3,
                                description="Parent interface type name '" + ft_parent + "' not registered in the hierarchy '" + ft_h_name + "'",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return
    else:
        parent = None

    # Check if FactorType exists
    ft = glb_idx.get(FactorType.partial_key(ft_name))
    if len(ft) == 0:
        # TODO Compile and CONSIDER attributes (on the FactorType side)
        roegen_type = None
        if ft_roegen_type:
            roegen_type = FlowFundRoegenType.flow if strcmp(ft_roegen_type, "flow") else FlowFundRoegenType.fund
        ft = FactorType(ft_name,
                        parent=parent, hierarchy=hie,
                        roegen_type=roegen_type,
                        tags=None,  # No tags
                        attributes=dict(unit=ft_unit, description=ft_description, **attributes),  # Merge the parsed attributes
                        expression=ft_formula,
                        # orientation=ft_orientation,
                        sphere=ft_sphere)
        # Simple name
        glb_idx.put(FactorType.partial_key(ft_name, ft.ident), ft)
        if not strcmp(ft_name, ft.full_hierarchy_name()):
            glb_idx.put(FactorType.partial_key(ft.full_hierarchy_name(), ft.ident), ft)
    else:
        issues.append(Issue(itype=3,
                            description="Interface type name '" + ft_name + "' already registered",
                            location=IssueLocation(sheet_name=name, row=r + 1, column=None)))
        return
def parse_dataset_data_command(sh: Worksheet, area: AreaTupleType, name: str, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """
    issues: List[Issue] = []

    # Analyze column names
    col_map = create_dictionary()
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=area[0], column=c).value.strip()
        # Avoid repetitions
        if col_name in col_map:
            issues.append(Issue(itype=3,
                                description="The column name '" + col_name + "' is repeated",
                                location=IssueLocation(sheet_name=name, row=1, column=c)))

        if strcmp(col_name, "DatasetName") or strcmp(col_name, "Dataset"):
            col_map["dataset"] = c
        elif col_name:
            # Concept name
            col_map[col_name] = c

    if "dataset" not in col_map:
        issues.append(Issue(itype=3,
                            description="The column name 'DatasetName' is not defined for command 'DatasetData'",
                            location=IssueLocation(sheet_name=name, row=1, column=c)))

    if any([i.itype == 3 for i in issues]):
        return issues, None, None

    # Read all the content into a list of lists
    lines = []
    for r in range(area[0] + 1, area[1]):
        line = []
        for col_name, c in col_map.items():
            v = sh.cell(row=r, column=c).value
            if isinstance(v, str):
                v = v.strip()
            line.append(v)
        lines.append(line)

    # pd.DataFrame
    df = pd.DataFrame(columns=[col_name for col_name in col_map], data=lines)

    # Find the different datasets
    datasets = df["dataset"].unique()
    datasets = set([d.lower() for d in datasets])

    content = []  # The output JSON
    for dataset in datasets:
        # Obtain filtered
        df2 = df.loc[df['dataset'].str.lower() == dataset]
        # Convert to JSON and store in content
        del df2["dataset"]
        s = StringIO()
        df2.to_json(s, orient="split")
        content.append(dict(name=dataset, values=s.getvalue()))

    return issues, None, dict(items=content, command_name=name)
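# Each element of "items" holds one dataset worth of rows, serialized with the Pandas "split" JSON
# orientation. A hypothetical example (dataset and column names are made up), roughly:
#   {"name": "ds1",
#    "values": '{"columns":["Country","Value"],"index":[0,1],"data":[["ES",10],["PT",5]]}'}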
def process_line(item):
    sc_src_hierarchy = item.get("source_hierarchy")
    sc_src_interface_type = item.get("source_interface_type")
    sc_tgt_hierarchy = item.get("target_hierarchy")
    sc_tgt_interface_type = item.get("target_interface_type")
    sc_scale = item.get("scale")
    sc_src_context = item.get("source_context")
    sc_tgt_context = item.get("target_context")
    sc_src_unit = item.get("source_unit")
    sc_tgt_unit = item.get("target_unit")

    # Check the existence of the interface types
    force_create = True
    if force_create:
        pass

    # Check if FactorTypes exist
    fts = []
    for i, (hierarchy, interface_type) in enumerate([(sc_src_hierarchy, sc_src_interface_type),
                                                     (sc_tgt_hierarchy, sc_tgt_interface_type)]):
        m = "origin" if i == 0 else "destination"
        if not interface_type:
            issues.append(Issue(itype=3,
                                description="The " + m + " interface type name has not been specified",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return

        # Check if FactorType exists
        ft = glb_idx.get(FactorType.partial_key(interface_type))
        if len(ft) > 0:
            if len(ft) == 1:
                fts.append(ft[0])
            else:
                if not hierarchy:
                    issues.append(Issue(itype=3,
                                        description="The hierarchy of the " + m + " interface type name has not been specified and the interface type name is not unique",
                                        location=IssueLocation(sheet_name=name, row=r, column=None)))
                    return

                for ft2 in ft:
                    if strcmp(ft2.hierarchy.name, hierarchy):
                        fts.append(ft2)

    if len(fts) != 2:
        issues.append(Issue(itype=3,
                            description="Found " + str(len(fts)) + " interface types in the specification of a scale change",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))
        return

    # Check that the interface types are from different hierarchies (warn if not; not error)
    if fts[0].hierarchy == fts[1].hierarchy:
        issues.append(Issue(itype=2,
                            description="The interface types '" + fts[0].name + "' and '" + fts[1].name + "' are in the same hierarchy",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))

    # Create the directed Scale (Linear "Transformation") Relationship
    origin = fts[0]
    destination = fts[1]
    FactorTypesRelationUnidirectionalLinearTransformObservation.\
        create_and_append(origin, destination, sc_scale, sc_src_context, sc_tgt_context, Observer.no_observer_specified)
def parse_command(sh: Worksheet, area: AreaTupleType, name: Optional[str], cmd_name: str) -> IssuesLabelContentTripleType:
    """
    Parse command in general
    Generate a JSON
    Generate a list of issues

    :param sh: Worksheet to read
    :param area: Area of the worksheet
    :param name: Name of the worksheet
    :param cmd_name: Name of the command. Key to access "command_fields" variable. Also, shown in issue descriptions
    :return: issues List, None, content (JSON)
    """
    issues: List[Issue] = []

    from backend.command_field_definitions import command_fields

    cols = command_fields[cmd_name]  # List of CommandField that will guide the parsing
    ##sh_dict = read_worksheet(sh)
    ##col_map, local_issues = check_columns(sh_dict, name, area, cols, cmd_name)
    col_map, local_issues = check_columns(sh, name, area, cols, cmd_name)
    if any([i.itype == 3 for i in local_issues]):
        return local_issues, None, None

    issues.extend(local_issues)

    # "mandatory" can be defined as expression depending on other base fields (like in RefBibliographic command fields)
    # Elaborate a list of fields having this "complex" mandatory property
    complex_mandatory_cols = [c for c in cols if isinstance(c.mandatory, str)]

    content = []  # The output JSON
    # Parse each Row
    for r in range(area[0] + 1, area[1]):
        line = {}
        expandable = False  # The line contains at least one field implying expansion into multiple lines
        complex = False  # The line contains at least one field with a complex rule (which cannot be evaluated with a simple cast)

        # Constant mandatory values
        mandatory_not_found = set([c.name for c in cols if c.mandatory and isinstance(c.mandatory, bool)])

        # Each "field"
        for col in col_map.keys():
            cname = col.name
            # Appearances of field (normally just once; there are attributes allowing more than one appearance)
            for col_name, col_idx in col_map[col]:
                # Read and prepare "value"
                ##value = sh_dict.get((r, col_idx), None)
                value = sh.cell(row=r, column=col_idx).value
                if value:
                    if not isinstance(value, str):
                        value = str(value)
                    value = value.strip()
                else:
                    continue

                if col.allowed_values:  # If the CommandField checks for a list of allowed values
                    if value.lower() not in [v.lower() for v in col.allowed_values]:  # TODO Case insensitive CI
                        issues.append(Issue(itype=3,
                                            description=f"Field '{col_name}' of command '{cmd_name}' has invalid value '{value}'."
                                                        f" Allowed values are: {', '.join(col.allowed_values)}.",
                                            location=IssueLocation(sheet_name=name, row=r, column=col_idx)))
                    else:
                        line[cname] = value
                else:  # Instead of a list of values, check if a syntactic rule is met by the value
                    if col.parser:  # Parse, just check syntax (do not store the AST)
                        try:
                            ast = parser_field_parsers.string_to_ast(col.parser, value)
                            # Rules are in charge of informing if the result is expandable and if it is complex
                            if "expandable" in ast and ast["expandable"]:
                                expandable = True
                            if "complex" in ast and ast["complex"]:
                                complex = True

                            # With many appearances, just a "Key-Value list" syntax is permitted
                            if col.many_appearances:
                                if cname in line:
                                    line[cname] += ", " + col_name + "='" + value + "'"
                                else:
                                    line[cname] = col_name + "='" + value + "'"
                            else:
                                if cname in line:
                                    line[cname] += ", " + value
                                else:
                                    line[cname] = value  # Store the value
                        except:
                            ##col_header = sh_dict.get((1, col_idx), None)
                            col_header = sh.cell(row=1, column=col_idx).value
                            issues.append(Issue(itype=3,
                                                description="The value in field '" + col_header + "' of command '" + cmd_name + "' is not syntactically correct. Entered: " + value,
                                                location=IssueLocation(sheet_name=name, row=r, column=col_idx)))
                    else:
                        line[cname] = value  # No parser, just store blindly the value

                if col.name in mandatory_not_found:
                    mandatory_not_found.discard(col.name)

        if len(line) == 0:
            continue  # Empty line (allowed)

        # Flags to accelerate the second evaluation, during execution
        line["_row"] = r
        line["_expandable"] = expandable
        line["_complex"] = complex

        # Append if all mandatory fields have been filled
        may_append = True
        if len(mandatory_not_found) > 0:
            issues.append(Issue(itype=3,
                                description="Mandatory columns: " + ", ".join(mandatory_not_found) + " have not been specified",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            may_append = False

        # Check varying mandatory fields (fields depending on the value of other fields)
        for c in complex_mandatory_cols:
            col = c.name  # next(c2 for c2 in col_map if strcmp(c.name, c2.name))
            if isinstance(c.mandatory, str):
                # Evaluate
                mandatory = eval(c.mandatory, None, line)
                may_append = (mandatory and col in line) or (not mandatory)
                if mandatory and col not in line:
                    issues.append(Issue(itype=3,
                                        description="Mandatory column: " + col + " has not been specified",
                                        location=IssueLocation(sheet_name=name, row=r, column=None)))

        if may_append:
            content.append(line)

    return issues, None, {"items": content, "command_name": name}
def execute(self, state: "State"):
    """
    Create a Hierarchy of Taxon. The exact form of this hierarchy is different depending on the concept:
    * FactorTypes and Categories use Hierarchies, which are intrinsic.
      The hierarchy name is passed to the containing Hierarchy object
    * Processors use Part-Of Relations. In this case, the hierarchy name is lost
    Names of Processor and FactorTypes are built both in hierarchical and simple form
    The hierarchical is all the ancestors from root down to the current node, separated by "."
    The simple name is just the current node. If there is already another concept with that name, the
    simple name is not stored (STORE BOTH CONCEPTS by the same name, and design some tie breaking mechanism??)
    """
    issues = []
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    # Process parsed information
    for item in self._content["items"]:
        r = item["_row"]
        # HierarchySource (Optional)
        hsource = item.get("source", None)  # Code of entity defining the Hierarchy
        if hsource:
            tmp = hsource
            hsource = glb_idx.get(HierarchySource.partial_key(name=hsource))
            if len(hsource) == 0:
                hsource = HierarchySource(name=tmp)
                glb_idx.put(hsource.key(), hsource)
            else:
                hsource = hsource[0]

        hname = item.get("hierarchy_name", None)
        if not hname:
            issues.append(Issue(itype=3,
                                description="The name of the Hierarchy has not been defined. Skipped.",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            continue

        # HierarchyGroup (equivalent to Hierarchy of Code Lists, HCL)
        hg = item.get("hierarchy_group", None)
        if hg:
            is_code_list = False  # Hierarchy group
        else:
            is_code_list = True  # Hierarchy group for the Code List, with the same name
            hg = hname

        # Check if the HierarchyGroup is previously defined. YES, use it; NO, create new HierarchyGroup
        tmp = hg
        hg = glb_idx.get(HierarchyGroup.partial_key(name=hg))
        if len(hg) == 0:
            hg = HierarchyGroup(name=tmp, source=hsource)
            glb_idx.put(hg.key(), hg)
        else:
            hg = hg[0]

        # Check if the Hierarchy is defined. YES, get it; NO, create it
        tmp = hname
        h = glb_idx.get(Hierarchy.partial_key(name=hname))
        if len(h) == 0:
            h = Hierarchy(name=tmp)
            glb_idx.put(h.key(), h)
            glb_idx.put(h.key(hg.name + "." + h.name), h)  # Register with alternative (full) name
        else:
            h = h[0]

        # Add the Hierarchy to the HierarchyGroup (if not)
        if h not in hg.hierarchies:
            hg.hierarchies.append(h)

        # Level
        level = item.get("level", None)
        if level:
            # Check if the level is defined. YES, get it; NO, create it
            for l in h.levels:
                if strcmp(l.name, level):
                    level = l
                    break
            else:
                level = HierarchyLevel(name=level, hierarchy=h)
                h.levels.append(level)

        code = item.get("code", None)
        label = item.get("label", None)
        description = item.get("description", None)
        attributes = item.get("attributes", None)
        expression = item.get("expression", None)

        # Parent property (what really defines Hierarchies)
        parent_code = item.get("parent_code", None)
        if parent_code:
            ph = h  # Parent Hierarchy is the same as current hierarchy
            pcode = ph.codes.get(parent_code, None)
            if not pcode:
                issues.append(Issue(itype=3,
                                    description="Could not find code '" + parent_code + "' in hierarchy '" + ph.name + "'. Skipped.",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
        else:
            pcode = None

        # ReferredHierarchy. If we are not defining a Code List, the base hierarchy has to be mentioned
        if not is_code_list:
            ref_hierarchy = item.get("referred_hierarchy", None)
            if not ref_hierarchy:
                issues.append(Issue(itype=3,
                                    description="For HCLs, defining ReferredHierarchy is mandatory",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue

            tmp = ref_hierarchy
            ref_hierarchy = glb_idx.get(Hierarchy.partial_key(name=ref_hierarchy))
            if len(ref_hierarchy) == 0:
                issues.append(Issue(itype=3,
                                    description="ReferredHierarchy '" + tmp + "' not defined previously",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
            else:
                ref_hierarchy = ref_hierarchy[0]

            ref_code = ref_hierarchy.codes.get(code, None)
            if not ref_code:
                issues.append(Issue(itype=3,
                                    description="Code '" + code + "' not found in referred hierarchy '" + ref_hierarchy.name + "'",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue

            # Ignore: LABEL, DESCRIPTION. Copy them from referred code
            label = ref_code.label
            description = ref_code.description
        else:
            ref_code = None

        c = h.codes.get(code, None)
        if c:
            issues.append(Issue(itype=3,
                                description="Code '" + code + "' in hierarchy '" + h.name + "' redefined.",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            continue

        # Finally, create the HierarchyCode with all the gathered attributes, then weave it to others
        # (name, label=None, description=None, referred_node=None, parent=None, parent_weight=1.0, hierarchy=None)
        c = Taxon(name=code,
                  hierarchy=h,
                  level=level,
                  referred_taxon=ref_code,
                  parent=pcode,
                  label=label,
                  description=description,
                  attributes=attributes,
                  expression=expression)

        # Add code to hierarchy
        h.codes[code] = c
        if not c.parent:
            h.roots_append(c)
        # Add code to level
        if level:
            level.codes.add(c)
        # Add child to parent code
        # (DONE BY THE CONSTRUCTOR!!)
        # if pcode:
        #     pcode.children_codes.append(c)

    return issues, None  # Issues, Output
def create_issue(itype: int, description: str) -> Issue:
    return Issue(itype=itype,
                 description=description,
                 location=IssueLocation(sheet_name=command_name, row=row, column=None))
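# "create_issue" is a closure-style helper: "command_name" and "row" are taken from the enclosing
# scope at call time, so callers only pass the issue type and description, e.g.
#   issues.append(create_issue(3, "Mandatory field 'name' not specified"))
# (the surrounding "issues" list and the example description are assumptions for illustration).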