Exemplo n.º 1
0
    def _write_coverage_file(self, track: Track, group_obs_counts: Dict[Optional[str], int],
                             group_var_counts: Dict[Optional[str], Dict[Tuple[str, ...], int]], infix: str) -> None:
        """Write per-group coverage fractions for every observed variable to a CSV file.

        The file is named ``<file_prefix>_<infix>.csv``. Each row starts with the columns
        produced by ``_init_row`` and is followed by one column per group holding the fraction
        of that group's observations in which the variable was seen, formatted to two decimals.

        :param track: Passed to ``_get_sorted_vars`` to determine the row order.
        :param group_obs_counts: Number of observations per group; its keys define the group columns.
        :param group_var_counts: For each group, how many times each variable path was observed.
        :param infix: Inserted between the file prefix and the ``.csv`` extension.
        """
        fn: str = self.file_prefix + "_" + infix + ".csv"
        logging.info("Writing coverage file to %s.", fn)

        groups: List[str] = sorted(str(x) for x in group_obs_counts)
        columns: List[str] = ["variable", "in_schema", "var_id", "data_type"] + groups
        sorted_vars = _get_sorted_vars(group_var_counts, track)

        # The csv module performs its own line-ending handling, so the file must be opened with
        # newline=""; otherwise platforms that translate "\n" emit an extra "\r" per row.
        with open(fn, "w", newline="") as fh:
            writer: csv.DictWriter = csv.DictWriter(fh, columns)
            writer.writeheader()
            for var_path in sorted_vars:
                var_path_str: str = nesteddicts.path_to_str(var_path)
                logging.debug("Writing coverage for %s.", var_path_str)
                row: Dict = self._init_row(var_path)
                for group, n_in_group in group_obs_counts.items():
                    times_var_observed: int = group_var_counts[group].get(var_path, 0)
                    # A group without observations has no coverage; report 0 rather than
                    # aborting with ZeroDivisionError and leaving a half-written file.
                    frac: float = times_var_observed / n_in_group if n_in_group else 0.0
                    if frac > 1.0:
                        logging.warning("Observed coverage of {:.5f} (>1) for variable {:}."
                                        .format(frac, var_path_str))
                    row[str(group)] = "%0.2f" % frac
                writer.writerow(row)
Exemplo n.º 2
0
    def __call__(self,
                 fixture: Any,
                 actual: Optional[Any],
                 path: Optional[ListType[str]] = None) -> bool:
        """Compare an expected (fixture) value against an actual value.

        A dict fixture with no path is treated as the root folder. Otherwise the
        variable at ``path`` is looked up in the schema and the comparison is
        delegated according to its data type.

        :param fixture: Expected value; must not be ``None``.
        :param actual: Observed value, possibly ``None``.
        :param path: Path of the variable being compared, or ``None`` at the root.
        :return: The result of the comparison routine that was delegated to.
        :raises ValueError: If ``path`` does not resolve to a variable in the schema.
        """
        assert fixture is not None

        # Root of the document: a dictionary without any path yet.
        at_root: bool = path is None and isinstance(fixture, dict)
        if at_root:
            return self.compare_folders(fixture, actual, [])

        # Everything below the root must resolve to a schema variable.
        assert path is not None
        variable: Optional[Variable] = self.schema.lookup(tuple(path))
        if variable is None:
            raise ValueError("Unrecognized variable %s" % path_to_str(path))
        kind: str = variable.data_type

        # Container types are compared by methods that also need the path.
        container_comparers = {
            "Folder": "compare_folders",
            "List": "compare_lists",
            "KeyedList": "compare_keyed_lists",
        }
        method_name = container_comparers.get(kind)
        if method_name is not None:
            return getattr(self, method_name)(fixture, actual, path)

        if kind == "MultipleText":
            return compare_multiple_text(fixture, actual)
        return compare_primitives(fixture, actual)
Exemplo n.º 3
0
 def _handle_list(self, composite_id: str, child_path: Tuple[str, ...], value: Any, observed: Set) -> None:
     """Crawl every non-empty item of a list value, logging the empty ones at debug level."""
     for item in value:
         if item is not None:
             self._crawl(composite_id, item, observed, child_path)
         else:
             # Empty items carry nothing to crawl; just note that one was seen.
             logging.debug("Encountered empty list item in composite %s (path %s).", composite_id,
                           nesteddicts.path_to_str(child_path))
Exemplo n.º 4
0
    def _record_missing(self, path: ListType, data_type: str,
                        value: Optional[Any]) -> None:
        """Append a ``MissingValue`` for ``path`` to the outcome's list of missing values.

        Values that ``_is_simple_value`` rejects are stored as JSON text with sorted keys.
        """
        serialized = value if _is_simple_value(value) else json.dumps(value, sort_keys=True)
        location = nesteddicts.path_to_str(path)
        entry: MissingValue = MissingValue(self.entity_id, self.label,
                                           location, data_type, serialized)
        self.outcome.missings.append(entry)
Exemplo n.º 5
0
def _source_path(var: Variable, source_id: VariableId) -> str:
    """Return the absolute path, as a string, of one of ``var``'s source variables.

    :param var: Variable whose track is expected to have a source track.
    :param source_id: ID used to look the source variable up in that source track.
    :return: ``path_to_str`` of the source variable's absolute path.
    :raises AssertionError: If ``var.track.source`` is ``None``.
    """
    # No try/except here: the previous handler only printed a debugging marker to stdout
    # before re-raising, so letting lookup errors propagate directly loses nothing.
    source_track: Optional[Track] = var.track.source
    assert source_track is not None
    source_var: Variable = source_track[source_id]
    return path_to_str(source_var.absolute_path)
Exemplo n.º 6
0
def _verify_source_parent(variable: "Variable",
                          source_var_id: VariableId) -> None:
    """Check that a source of a list-nested variable descends from a source of that list.

    If ``variable`` has no list ancestor, nothing is checked. Otherwise the source variable
    and its ancestors in the source track are walked upwards until one of them is among the
    list ancestor's own sources.

    :param variable: Variable whose source is being validated.
    :param source_var_id: ID of the source variable, looked up in ``variable.track.source``.
    :raises ValueError: If neither the source nor any of its ancestors is a source of the
        list ancestor.
    :raises AssertionError: If the variable's track has no source track.
    """
    list_ancestor: Optional["Variable"] = variable.get_first_list_ancestor()
    if list_ancestor is None:
        return
    parent_sources: Set[VariableId] = set(list_ancestor.sources)
    source_track = variable.track.source
    assert source_track is not None

    # Keep the declared source separate from the walking cursor so the error message
    # names the variable that was actually declared, not the last ancestor visited.
    declared_source: "Variable" = source_track[source_var_id]
    ancestor: "Variable" = declared_source
    while ancestor.parent is not None and ancestor.var_id not in parent_sources:
        ancestor = source_track[ancestor.parent]
    if ancestor.var_id in parent_sources:
        return

    template: str = 'Variable %s (%s), which descends from %s %s (%s), includes %s (%s) as a source, but that ' \
                    'does not descend from one of the root list\'s sources.'
    msg = template % (path_to_str(variable.absolute_path), variable.var_id,
                      list_ancestor.data_type,
                      path_to_str(list_ancestor.absolute_path),
                      list_ancestor.var_id,
                      path_to_str(declared_source.absolute_path),
                      declared_source.var_id)
    raise ValueError(msg)
Exemplo n.º 7
0
 def __call__(self) -> None:
     """Write one CSV row per schema variable: its ID, its path, then the path of each source."""
     emit_row = csv.writer(self.fh).writerow
     for var in self.schema:  # type: Variable
         cells: ListType = [var.var_id, path_to_str(var.absolute_path)]
         if var.sources:
             # Only variables with sources get the extra trailing columns.
             cells += [_source_path(var, source_id) for source_id in var.sources]
         emit_row(cells)
Exemplo n.º 8
0
    def _write_coverage_file(self) -> None:
        """Write the source-to-target coverage counts in ``self.coverage_result`` to a CSV file.

        The file is created in ``self.context.output_dir`` under ``self.output_filename``,
        falling back to ``source_coverage.csv`` when no name is set. Rows are written in the
        sorted order of the keys of ``self.coverage_result``; each row holds the IDs and paths
        of the source and target variables, the source variable's data type and the count.

        :raises AssertionError: If the schema has no source schema, or if a source or target
            variable ID cannot be resolved.
        """
        output_filename: str = self.output_filename or "source_coverage.csv"
        fn: str = os.path.join(self.context.output_dir, output_filename)
        logging.info("Writing coverage file to %s.", fn)

        columns: List[str] = [
            "source_var_id", "source_var_path", "target_var_id",
            "target_var_path", "data_type", "n"
        ]

        source_schema = self.schema.source
        assert source_schema is not None

        # The csv module performs its own line-ending handling, so the file must be opened
        # with newline=""; otherwise platforms that translate "\n" emit an extra "\r" per row.
        with open(fn, "w", newline="") as fh:
            writer: csv.DictWriter = csv.DictWriter(fh, columns)
            writer.writeheader()
            for var_info in sorted(self.coverage_result):
                source_var_id = var_info.source_var_id
                target_var_id = var_info.target_var_id

                logging.debug("Writing coverage for %s -> %s.", source_var_id,
                              target_var_id)
                source_var: Optional[Variable] = source_schema.get(
                    source_var_id)
                target_var: Optional[Variable] = self.schema.get(target_var_id)
                assert source_var is not None and target_var is not None

                row = {
                    "source_var_id": source_var_id,
                    "source_var_path": nesteddicts.path_to_str(source_var.absolute_path),
                    "target_var_id": target_var_id,
                    "target_var_path": nesteddicts.path_to_str(target_var.absolute_path),
                    "data_type": source_var.data_type,
                    "n": self.coverage_result[var_info],
                }
                writer.writerow(row)
Exemplo n.º 9
0
    def _record_mismatch(self, path: ListType, data_type: str,
                         expected: Optional[Any],
                         actual: Optional[Any]) -> None:
        """Append a ``ValueMismatch`` for ``path`` to the outcome's list of mismatches.

        Expected and actual values that ``_is_simple_value`` rejects are stored as JSON
        text with sorted keys.
        """
        wanted = expected if _is_simple_value(expected) else json.dumps(expected, sort_keys=True)
        found = actual if _is_simple_value(actual) else json.dumps(actual, sort_keys=True)

        location = nesteddicts.path_to_str(path)
        entry: ValueMismatch = ValueMismatch(self.entity_id, self.label,
                                             location, data_type, wanted, found)
        self.outcome.mismatches.append(entry)
Exemplo n.º 10
0
    def _crawl_folder(self, node: Dict, path: List,
                      period: Optional[str]) -> None:
        """Validate and cast the children of a folder node in place.

        Keys starting with an underscore are skipped. Children whose path is not in the
        composite's schema are recorded as ``unknown_vars``. Primitive children are replaced
        by their cast value; when casting raises ``ValueError`` the failure is recorded under
        ``cast_errors`` and the key is removed from ``node``. All other children are handed
        to ``_crawl``.

        :param node: Folder content; mutated in place.
        :param path: Path of the folder itself (empty for the root).
        :param period: Period being crawled, or ``None``, which is logged as "immutable".
        """
        # Snapshot the keys because entries may be deleted while iterating.
        keys: List = list(node.keys())
        for key in keys:
            child_path = path + [key]
            if key.startswith("_"):
                logging.debug("Ignoring system variable %s",
                              nesteddicts.path_to_str(child_path))
                continue
            value = node[key]

            var: Optional[Variable] = self.composite.schema.lookup(child_path)
            if var is None:
                # Report the child's own path: that is the variable that is unknown,
                # and it is the path recorded in the exception below.
                logging.warning(
                    "Unknown variable path %s in period %s of composite %s",
                    nesteddicts.path_to_str(child_path), period or "immutable",
                    self.composite.composite_id)
                self._record_exception("unknown_vars", child_path, value,
                                       period)
                continue

            # Only primitives have the "cast" method
            if not isinstance(var, Primitive):
                self._crawl(value, child_path, period)
                continue

            try:
                node[key] = var.cast(value)
            except ValueError:
                logging.warning(
                    'Could not cast value "%s" into data type "%s"',
                    value, var.data_type)
                # Cast errors are recorded against the parent folder, keyed by child name.
                self._record_exception("cast_errors", path, {key: value},
                                       period)
                del node[key]
Exemplo n.º 11
0
def _process_track(track: Track, temporality: str,
                   writer: csv.DictWriter) -> None:
    """Write one row per variable in ``track``, ordered by absolute path.

    Each row holds the variable's ID, absolute path, data type and the supplied
    ``temporality`` label. Rows are keyed by path, so a later variable with the same
    path replaces an earlier one.
    """
    rows_by_path: Dict[str, Dict[str, str]] = {}
    for var_id, variable in track.items():
        location: str = path_to_str(variable.absolute_path)
        rows_by_path[location] = {
            "variable_id": var_id,
            "absolute_path": location,
            "data_type": variable.data_type,
            "temporality": temporality,
        }

    # Paths are unique dictionary keys, so sorting on them alone fixes the order.
    for _, record in sorted(rows_by_path.items(), key=lambda item: item[0]):
        writer.writerow(record)
Exemplo n.º 12
0
    def _write_groups_file(self, group_obs_counts: Dict[Optional[str], int], grouping_var_id: Optional[str], infix: str) -> None:
        """Write the number of observations per group to ``<file_prefix>_<infix>_groups.csv``.

        The first column is headed by the grouping variable's absolute path when that
        variable can be resolved, and by ``"Group"`` otherwise. Rows are sorted by the
        string form of the group key.

        :param group_obs_counts: Number of observations per group.
        :param grouping_var_id: Identifier of the grouping variable, or ``None``.
        :param infix: Inserted between the file prefix and the ``_groups.csv`` suffix.
        """
        groups_fn: str = self.file_prefix + "_" + infix + "_groups.csv"

        # Resolve the header before opening the file, so a failing lookup does not
        # leave behind a truncated output file.
        group_var_path: str = "Group"
        if grouping_var_id is not None:
            # NOTE(review): elsewhere variable IDs are resolved with schema.get() and paths
            # with schema.lookup(); confirm lookup() is the intended call for an ID here.
            group_var: Optional[Variable] = self.schema.lookup(grouping_var_id)
            if group_var is not None:
                group_var_path = nesteddicts.path_to_str(group_var.absolute_path)

        rows = sorted((str(group), count) for group, count in group_obs_counts.items())

        # The csv module performs its own line-ending handling, so the file must be opened
        # with newline=""; otherwise platforms that translate "\n" emit an extra "\r" per row.
        with open(groups_fn, "w", newline="") as fh:
            writer: csv.DictWriter = csv.DictWriter(fh, [group_var_path, "observations"])
            writer.writeheader()
            for group_name, count in rows:
                writer.writerow({
                    group_var_path: group_name,
                    "observations": str(count)
                })
Exemplo n.º 13
0
 def _record_all_as_missing(self, f_subtree: Optional[Any], path: ListType[str]) -> None:
     """Walk the subtree and record every non-folder node in it as a missing value.

     The empty path is treated as a folder; any other path must resolve in the schema.
     """
     kind: str
     if path:
         variable: Optional[Variable] = self.schema.lookup(path)
         assert variable is not None
         kind = variable.data_type
     else:
         kind = "Folder"

     if kind != "Folder":
         # Leaf: record it together with the fixture content found at this path.
         location: str = nesteddicts.path_to_str(path)
         entry: MissingValue = MissingValue(self.entity_id, self.label, location, kind, f_subtree)
         self.outcome.missings.append(entry)
         return

     assert f_subtree is not None
     for name, child in f_subtree.items():
         self._record_all_as_missing(child, path + [name])
Exemplo n.º 14
0
 def _init_row(self, var_path: Tuple) -> Dict:
     """Build the leading columns of a coverage row for ``var_path``.

     ``in_schema`` is ``"TRUE"`` with the variable's ID and data type when the path
     resolves in the schema; otherwise it is ``"FALSE"`` with both fields left empty.
     """
     label: str = nesteddicts.path_to_str(var_path)
     variable: Optional[Variable] = self.schema.lookup(var_path)
     known: bool = variable is not None
     return {
         "variable": label,
         "in_schema": "TRUE" if known else "FALSE",
         "var_id": variable.var_id if known else "",
         "data_type": variable.data_type if known else "",
     }
Exemplo n.º 15
0
    def _inspect(self, key: str, f_tree: Optional[Any], a_tree: Dict, path: ListType[str]) -> None:
        """Inspect the child ``key`` of ``path``, dispatching on the data type of its variable.

        :param key: Name of the child being inspected.
        :param f_tree: Fixture content for the child.
        :param a_tree: Actual content passed through to the handler.
        :param path: Path of the parent.
        :raises ValueError: If the child path is not in the schema.
        """
        child_path: ListType[str] = path + [key]
        variable: Optional[Variable] = self.schema.lookup(child_path)
        if variable is None:
            raise ValueError("No variable called %s" % nesteddicts.path_to_str(child_path))
        kind: str = variable.data_type

        # An explicit NA marker in the fixture takes precedence over the data type.
        if f_tree == POLYTROPOS_NA:
            self._handle_explicit_na(kind, a_tree, child_path)
            return

        if kind == "Folder":
            assert isinstance(f_tree, dict)
            self._inspect_folder(f_tree, a_tree, child_path)
            return

        # NOTE(review): sibling code names the keyed-list data type "KeyedList";
        # confirm that "NamedList" is still the right name to match here.
        if kind in ("List", "NamedList"):
            self._inspect_complex(kind, f_tree, a_tree, child_path)
            return

        self._inspect_primitive(kind, f_tree, a_tree, child_path)
Exemplo n.º 16
0
    def _crawl(self, node: Any, path: List, period: Optional[str]) -> None:
        """Crawl ``node`` with the handler matching the data type of the variable at ``path``.

        The empty path is crawled as a folder. A path that is not in the composite's
        schema is logged and recorded under ``unknown_vars``.

        :param node: Content found at ``path``.
        :param path: Path of the node (empty for the root).
        :param period: Period being crawled, or ``None``.
        :raises ValueError: If the variable is not a List, KeyedList or Folder.
        """
        if not path:
            self._crawl_folder(node, path, period)
            return

        var: Optional[Variable] = self.composite.schema.lookup(path)
        if var is None:
            logging.warning("Unknown variable path %s in composite %s",
                            nesteddicts.path_to_str(path), self.composite.composite_id)
            self._record_exception("unknown_vars", path, node, period)
            return

        if var.data_type == "List":
            self._crawl_list(node, path, period)

        elif var.data_type == "KeyedList":
            self._crawl_keyed_list(node, path, period)

        elif var.data_type == "Folder":
            self._crawl_folder(node, path, period)

        else:
            # Say what was encountered and where, instead of raising an empty ValueError.
            raise ValueError('Cannot crawl variable %s of data type "%s" in composite %s' %
                             (nesteddicts.path_to_str(path), var.data_type,
                              self.composite.composite_id))
Exemplo n.º 17
0
    def _nested_case(self, descriptor: Dict) -> Iterator[str]:
        """Yield the column names contributed by a nested (List or KeyedList) descriptor.

        The descriptor's first entry maps a variable ID to its specification. For a
        KeyedList, the key column name is yielded first: the configured
        ``key_column_name`` if present, otherwise the variable's absolute path. Column
        names for any ``children`` follow. Because this is a generator, validation
        errors surface when iteration starts.

        :raises ValueError: If the type is missing or unsupported, the variable ID is
            unknown, or the declared type differs from the variable's data type.
        """
        var_id, content = list(descriptor.items())[0]

        if "type" not in content:
            raise ValueError("Expected type specification for nested columns")
        declared: str = content["type"]
        if declared not in {"List", "KeyedList"}:
            raise ValueError('Unexpected type specification "%s"' % declared)

        root = self.schema.get(var_id)
        if root is None:
            raise ValueError('Unrecognized variable ID "%s"' % var_id)
        if root.data_type != declared:
            raise ValueError('%s root "%s" is actually a %s' %
                             (declared, var_id, root.data_type))

        if declared == "KeyedList":
            yield (content["key_column_name"] if "key_column_name" in content
                   else nesteddicts.path_to_str(root.absolute_path))

        if "children" in content:
            yield from self(content["children"])
Exemplo n.º 18
0
 def __str__(self) -> str:
     """Return the error message naming the unrecognized variable path."""
     return "Unrecognized variable %s" % path_to_str(self.path)
Exemplo n.º 19
0
 def _record_invalid(self, path: ListType[str],
                     content: Optional[Any]) -> None:
     """Append an ``InvalidPath`` for ``path`` and its content to the outcome's invalids."""
     location = nesteddicts.path_to_str(path)
     entry: InvalidPath = InvalidPath(self.entity_id, location, content)
     self.outcome.invalids.append(entry)
Exemplo n.º 20
0
 def _record_match(self, path: ListType, data_type: str, value: Optional[Any]) -> None:
     """Append a ``ValueMatch`` for ``path`` to the outcome's list of matches."""
     location = nesteddicts.path_to_str(path)
     entry: ValueMatch = ValueMatch(self.entity_id, self.label, location, data_type, value)
     self.outcome.matches.append(entry)
Exemplo n.º 21
0
 def _record_missing(self, path: ListType, data_type: str, value: Optional[Any]) -> None:
     """Append a ``MissingValue`` for ``path`` to the outcome's list of missing values."""
     location = nesteddicts.path_to_str(path)
     entry: MissingValue = MissingValue(self.entity_id, self.label, location, data_type, value)
     self.outcome.missings.append(entry)
Exemplo n.º 22
0
 def _var_path(self, var_id: str) -> str:
     """Return the absolute path, as a string, of the schema variable with ID ``var_id``.

     :raises ValueError: If the schema has no variable with that ID.
     """
     variable = self.schema.get(var_id)
     if variable is not None:
         return nesteddicts.path_to_str(variable.absolute_path)
     raise ValueError('Unrecognized variable id "%s"' % var_id)
Exemplo n.º 23
0
 def _record_mismatch(self, path: ListType, data_type: str, expected: Optional[Any], actual: Optional[Any]) -> None:
     """Append a ``ValueMismatch`` for ``path`` to the outcome's list of mismatches."""
     location = nesteddicts.path_to_str(path)
     entry: ValueMismatch = ValueMismatch(self.entity_id, self.label, location, data_type, expected, actual)
     self.outcome.mismatches.append(entry)