예제 #1
0
def test_inc_column(root_table):
    """Check hit counters after ``inc_column`` on root, child and grandchild tables.

    Each increment is expected to be visible both on the table it was called
    on and in the ``combined_columns`` of the tables above it.
    """
    items = add_child_table(root_table, "/tender/items", "tender", "items")
    items.add_column(
        "/tender/items/id",
        ["string"],
        "Tender Id",
        additional=True,
        abs_path="/tender/items/0/test",
    )
    items.arrays["/tender/items/additionalClassifications"] = 0

    # Columns addressed directly on the root table: same path is used for
    # both arguments of inc_column.
    for root_path in ("ocid", "/tender/awardCriteriaDetails"):
        root_table.inc_column(root_path, root_path)
        assert root_table.combined_columns[root_path].hits == 1

    # A column of the first-level child table.
    items.inc_column("/tender/items/0/id", "/tender/items/id")
    assert root_table.combined_columns["/tender/items/0/id"].hits == 1
    assert items["/tender/items/id"].hits == 1
    assert items.combined_columns["/tender/items/id"].hits == 1

    # A column of a table nested inside the child table.
    classifications = add_child_table(
        items,
        "/tender/items/additionalClassifications",
        "items",
        "additionalClassifications",
    )
    classifications.add_column(
        "/tender/items/additionalClassifications/id",
        ["string"],
        "Classification Id",
        additional=True,
    )
    classifications.inc_column(
        "/tender/items/0/additionalClassifications/0/id",
        "/tender/items/additionalClassifications/id",
    )
    nested_in_items = items.combined_columns["/tender/items/additionalClassifications/0/id"]
    assert nested_in_items.hits == 1
    nested_in_root = root_table.combined_columns["/tender/items/0/additionalClassifications/0/id"]
    assert nested_in_root.hits == 1
예제 #2
0
def test_get_pointer(root_table):
    """Check ``get_pointer`` results for root, child and grandchild tables.

    The cases are evaluated in a fixed order; the last two additionally pass
    ``index="0"``.
    """
    items = add_child_table(root_table, "/tender/items", "tender", "items")
    classifications = add_child_table(
        items,
        "/tender/items/additionalClassifications",
        "items",
        "additionalClassifications",
    )

    # (table, second argument, third argument, expected pointer)
    plain_cases = (
        (
            classifications,
            "/tender/items/0/additionalClassifications/0/id",
            "/tender/items/additionalClassifications/id",
            "/tender/items/additionalClassifications/id",
        ),
        (
            items,
            "/tender/items/0/additionalClassifications/0/id",
            "/tender/items/additionalClassifications/id",
            "/tender/items/additionalClassifications/0/id",
        ),
        (
            items,
            "/tender/items/0/additionalClassifications/0",
            "/tender/items/additionalClassifications",
            "/tender/items/additionalClassifications/0",
        ),
        (
            items,
            "/tender/items/0/additionalClassifications",
            "/tender/items/additionalClassifications",
            "/tender/items/additionalClassifications",
        ),
        (
            root_table,
            "/tender/items/0/additionalClassifications/0/id",
            "/tender/items/additionalClassifications/id",
            "/tender/items/0/additionalClassifications/0/id",
        ),
        (root_table, "/tender/items/0/id", "/tender/items/id", "/tender/items/0/id"),
        (items, "/tender/items/0/id", "/tender/items/id", "/tender/items/id"),
        (root_table, "/tender/id", "/tender/id", "/tender/id"),
    )
    for table, abs_path, path, expected in plain_cases:
        result = get_pointer(table, abs_path, path, True)
        assert result == expected

    # Calls on the root table that also supply an explicit index.
    indexed_cases = (
        ("/tender/items", "/tender/items/0"),
        ("/tender", "/tender"),
    )
    for path, expected in indexed_cases:
        result = get_pointer(root_table, path, path, True, index="0")
        assert result == expected
예제 #3
0
def test_add_child_table(root_table):
    """Check that ``add_child_table`` creates a registered, empty child table.

    Also checks the dumped representation: the root table dumps a falsy
    ``parent`` entry, while the child's dumped ``parent`` is expected to equal
    the root table's name.
    """
    data = root_table.dump()
    assert not data["parent"]
    child = add_child_table(root_table, "/tender/tenderers", "", "tenderers")
    assert child.name == "tenders_tenderers"
    assert child.name in root_table.child_tables
    assert child.total_rows == 0
    data = child.dump()
    # This comparison used to be a bare expression statement, so its result
    # was discarded and the parent link was never actually verified.
    assert data["parent"] == root_table.name
예제 #4
0
def test_add_column(root_table):
    """Check where columns become visible after ``add_column``.

    Covers columns added on the root table, on a child table (seen from both
    the child and the root), and a child column added with
    ``propagate=False`` that must stay out of the root table.
    """

    def registered(path, table):
        # True when the path is found both in the table itself and in its
        # combined columns (checked in that order).
        return path in table and path in table.combined_columns

    root_table.add_column("/tender/id", ["string", "integer"], "Tender Id")
    assert registered("/tender/id", root_table)

    root_table.add_column(
        "/tender/itemsCount",
        ["string", "integer"],
        "Items Count",
    )
    assert registered("/tender/itemsCount", root_table)

    root_table.add_column(
        "/tender/items/additionalClassificationsCount",
        ["string", "integer"],
        "Classifications Count",
    )
    assert registered("/tender/items/0/additionalClassificationsCount", root_table)

    items = add_child_table(root_table, "/tender/items", "tender", "items")
    items.add_column(
        "/tender/items/test",
        ["string", "integer"],
        "/tender/items/test",
        additional=True,
        abs_path="/tender/items/0/test",
    )
    assert registered("/tender/items/test", items)
    assert registered("/tender/items/0/test", root_table)
    items.add_column("/tender/items/id", ["string", "integer"], "Items Id")

    items.arrays["/tender/items/additionalClassifications"] = 0
    items.add_column(
        "/tender/items/additionalClassifications/id",
        ["string", "integer"],
        "Classification ID",
    )
    assert "/tender/items/additionalClassifications/0/id" in items
    assert "/tender/items/id" in items
    assert "/tender/items/0/id" in root_table
    assert "/tender/items/0/additionalClassifications/0/id" in root_table

    # With propagate=False the column is asserted on the child only.
    count_path = "/tender/items/additionalClassificationsCount"
    items.add_column(
        count_path,
        ["string", "integer"],
        "Classification Count",
        propagate=False,
    )
    assert registered(count_path, items)

    assert count_path not in root_table
    assert count_path not in root_table.combined_columns
예제 #5
0
 def add_additional_table(self, pointer, abs_pointer, parent_key, key,
                          item):
     """Register a child table found in the data and pre-create its columns.

     :param pointer: Pointer of the array; recorded as an ``["array"]`` type
     :param abs_pointer: Prefix used to build each column's ``abs_path``
     :param parent_key: Passed through to ``add_child_table``
     :param key: Passed through to ``add_child_table``
     :param item: Iterable of entries; each one is flattened into columns
     """
     LOGGER.debug(_("Detected additional table: %s") % pointer)
     self.current_table.types[pointer] = ["array"]
     new_table = add_child_table(self.current_table, pointer, parent_key, key)
     self._add_table(new_table, pointer)
     # Columns are created up front because headers may have to be
     # recalculated and reordered when the array is enlarged later.
     # There is probably a better way, but this works for now.
     for entry in item:
         flat_entry = flatten(entry, reducer="path")
         for rel_path, value in flat_entry.items():
             column_pointer = self.join_path(pointer, rel_path)
             if column_pointer in self.current_table:
                 continue
             self.current_table.add_column(
                 column_pointer,
                 self.guess_type(value),
                 column_pointer,
                 abs_path=self.join_path(abs_pointer, rel_path),
                 header=column_pointer,
             )
예제 #6
0
    def parse_schema(self):
        """Walk the schema and register tables, column types and columns.

        The schema is loaded from a file when given as a path, its references
        are replaced, and root/combined tables are initialised. The properties
        are then traversed with an explicit stack: objects are queued for a
        later visit, arrays of objects/arrays may create child tables, arrays
        of other items become joinable columns, and everything else becomes a
        plain column. Deprecated nodes are skipped.
        """
        if isinstance(self.schema, (str, Path)):
            self.schema = resolve_file_uri(self.schema)
        self.schema = jsonref.JsonRef.replace_refs(self.schema)
        self.init_tables(self.root_tables)
        if self.combined_tables:
            self.init_tables(self.combined_tables, is_combined=True)
        separator = self.header_separator
        pending = deque([("", "", {}, self.schema)])

        # TODO: check if recursion is better for field ordering
        while pending:
            path, parent_key, _parent, node = pending.pop()
            if node.get("deprecated"):
                continue
            # TODO: handle oneOf anyOf allOf
            properties = node.get("properties", {})
            if not properties:
                # TODO: not sure what to do here
                continue

            for key, item in properties.items():
                if item.get("deprecated") or (
                    hasattr(item, "__reference__")
                    and item.__reference__.get("deprecated")
                ):
                    continue

                typeset = extract_type(item)
                pointer = separator.join([path, key])
                self.current_table = self.get_table(pointer)
                if not self.current_table:
                    continue

                self.current_table.types[pointer] = typeset

                if "object" in typeset:
                    pending.append((pointer, key, properties, item))
                    continue

                if "array" not in typeset:
                    # Plain value: combined tables get a pointer built from
                    # the parent key and the key only.
                    if self.current_table.is_combined:
                        pointer = separator + separator.join((parent_key, key))
                    self.current_table.add_column(
                        pointer,
                        typeset,
                        _(pointer, self.language),
                    )
                    continue

                items = item["items"]
                items_type = extract_type(items)
                if set(items_type) & {"array", "object"}:
                    if pointer not in self.current_table.path:
                        # found child array, need to create child table
                        key = self.name_check(parent_key, key)
                        child_table = add_child_table(
                            self.current_table, pointer, parent_key, key
                        )
                        self._add_table(child_table, pointer)
                    pending.append((pointer, key, properties, items))
                else:
                    # Array of non-container items: stored as a single
                    # joinable column.
                    typeset = ARRAY.format(items_type)
                    self.current_table.types[pointer] = JOINABLE
                    self.current_table.add_column(
                        pointer,
                        typeset,
                        _(pointer, self.language),
                    )
예제 #7
0
    def process_items(self, releases, with_preview=True):
        """Analyze releases

        Iterate over every item in provided list to
        calculate metrics and optionally generate preview for combined and split version of the table

        This is a generator: it yields the zero-based index of each release
        once that release has been processed. After the input is exhausted,
        ``self.total_items`` is set to the last yielded index (0 when
        ``releases`` is empty).

        :param releases: Iterator of items to analyze
        :param with_preview: If set to True generates previews for each table
        """
        separator = self.header_separator
        # Pre-bind so that an empty ``releases`` does not raise
        # UnboundLocalError at the final assignment below.
        count = 0
        for count, release in enumerate(releases):
            # Stack entries: (abs_path, path, parent_key, parent, record)
            to_analyze = deque([("", "", "", {}, release)])
            ocid = release["ocid"]
            top_level_id = release["id"]

            while to_analyze:
                abs_path, path, parent_key, parent, record = to_analyze.pop()
                for key, item in record.items():
                    pointer = separator.join([path, key])
                    self.current_table = self.get_table(pointer)
                    if not self.current_table:
                        continue
                    item_type = self.current_table.types.get(pointer)
                    if pointer in self.current_table.path:
                        # strict match like /parties, /tender
                        row_id = generate_row_id(ocid, record.get("id", ""),
                                                 parent_key, top_level_id)
                        c = item if isinstance(item, list) else [item]
                        for _nop in c:
                            self.current_table.inc()
                            if with_preview and count < PREVIEW_ROWS:
                                parent_table = not self.current_table.is_root and parent_key
                                self.add_preview_row(ocid, record.get("id"),
                                                     row_id, parent.get("id"),
                                                     parent_table)

                    # TODO: this validation should probably be smarter with arrays
                    if item_type and item_type != JOINABLE and not validate_type(
                            item_type, item):
                        LOGGER.error("Mismatched type on %s expected %s" %
                                     (pointer, item_type))
                        continue

                    if isinstance(item, dict):
                        to_analyze.append((
                            separator.join([abs_path, key]),
                            pointer,
                            key,
                            record,
                            item,
                        ))
                    elif item and isinstance(item, list):
                        abs_pointer = separator.join([abs_path, key])
                        if not isinstance(item[0], dict) and not item_type:
                            # Resolve the root table here: the ``root`` local
                            # is only bound in the scalar branch further down,
                            # so relying on it raised UnboundLocalError or
                            # logged a stale table name.
                            LOGGER.debug(
                                _("Detected additional column: %s in %s table")
                                % (abs_pointer,
                                   get_root(self.current_table).name))
                            item_type = JOINABLE
                            self.current_table.add_column(
                                pointer,
                                JOINABLE,
                                _(pointer, self.language),
                                additional=True,
                                abs_path=abs_pointer,
                            )
                        if item_type == JOINABLE:
                            self.current_table.inc_column(abs_pointer, pointer)
                            if with_preview and count < PREVIEW_ROWS:
                                value = JOINABLE_SEPARATOR.join(item)
                                self.current_table.set_preview_path(
                                    abs_pointer, pointer, value,
                                    self.table_threshold)
                        elif self.current_table.is_root or self.current_table.is_combined:
                            for value in item:
                                to_analyze.append((
                                    abs_pointer,
                                    pointer,
                                    key,
                                    record,
                                    value,
                                ))
                        else:
                            parent_table = self.current_table.parent
                            if pointer not in parent_table.arrays:
                                LOGGER.debug(
                                    _("Detected additional table: %s") %
                                    pointer)
                                self.current_table.types[pointer] = ["array"]
                                parent_table = self.current_table
                                # TODO: do we need to mark this table as additional
                                self._add_table(
                                    add_child_table(self.current_table,
                                                    pointer, parent_key, key),
                                    pointer)
                                # NOTE(review): ``row_id`` is only assigned in
                                # the strict-match branch above, so here it may
                                # be unbound or left over from an earlier key;
                                # this call also ignores ``with_preview`` --
                                # confirm the intended behaviour.
                                self.add_preview_row(ocid, record.get("id"),
                                                     row_id, parent.get("id"),
                                                     parent_table)

                            if parent_table.set_array(pointer, item):
                                should_split = len(
                                    item) >= self.table_threshold
                                if should_split:
                                    parent_table.should_split = True
                                    self.current_table.roll_up = True
                                recalculate_headers(parent_table, pointer,
                                                    abs_path, key, item,
                                                    should_split, separator)

                            for i, value in enumerate(item):
                                if isinstance(value, dict):
                                    abs_pointer = separator.join(
                                        [abs_path, key, str(i)])
                                    to_analyze.append((
                                        abs_pointer,
                                        pointer,
                                        parent_key,
                                        record,
                                        value,
                                    ))
                    else:
                        root = get_root(self.current_table)
                        abs_pointer = separator.join((abs_path, key))
                        if self.current_table.is_combined:
                            LOGGER.debug(
                                _("Path %s is targeted to combined table %s") %
                                (pointer, self.current_table.name))
                            pointer = separator + separator.join(
                                (parent_key, key))
                            abs_pointer = pointer
                        if abs_pointer not in root.combined_columns:
                            self.current_table.add_column(
                                pointer,
                                PYTHON_TO_JSON_TYPE.get(
                                    type(item).__name__, "N/A"),
                                _(pointer, self.language),
                                additional=True,
                                abs_path=abs_pointer,
                            )
                        self.current_table.inc_column(abs_pointer, pointer)
                        if item and with_preview and count < PREVIEW_ROWS:
                            self.current_table.set_preview_path(
                                abs_pointer, pointer, item,
                                self.table_threshold)
            yield count
        # NOTE(review): this stores the last zero-based index, not the number
        # of processed releases -- confirm which one consumers expect.
        self.total_items = count
예제 #8
0
def test_add_child_table(root_table):
    """Check that ``add_child_table`` creates a registered, empty child table
    that is linked back to the table it was created from."""
    child = add_child_table(root_table, "/tender/tenderers", "", "tenderers")
    assert child.name == "tenders_tenderers"
    assert child.name in root_table.child_tables
    assert child.total_rows == 0
    # This used to be a bare ``child.parent == root_table.name`` expression:
    # its result was discarded, and it compared the parent itself against a
    # name string. Compare names and actually assert the result.
    # NOTE(review): assumes ``parent`` holds the parent table object -- confirm.
    assert child.parent.name == root_table.name
예제 #9
0
    def parse_schema(self):
        """Walk the loaded schema and register tables, column types and columns.

        After ``load_schema`` the schema is passed through
        ``add_paths_to_schema`` and its properties are traversed with an
        explicit stack. The ``$title`` and ``$path`` keys and deprecated nodes
        are skipped; objects are queued for a later visit, arrays of
        objects/arrays may create child tables, arrays of other items become
        joinable columns, and everything else becomes a plain column whose
        header is taken from the property's ``$title``.
        """
        self.load_schema()
        # self.prepare_tables()
        proxy = add_paths_to_schema(self.schema)
        pending = deque([("", "", {}, proxy)])

        # TODO: check if recursion is better for field ordering
        while pending:
            path, parent_key, _parent, node = pending.pop()
            if node.get("deprecated"):
                continue
            # TODO: handle oneOf anyOf allOf
            properties = node.get("properties", {})
            if not properties:
                # TODO: not sure what to do here
                continue

            for key, item in properties.items():
                if key in ("$title", "$path"):
                    continue
                if item.get("deprecated") or (
                    hasattr(item, "__reference__")
                    and item.__reference__.get("deprecated")
                ):
                    continue

                typeset = extract_type(item)
                pointer = self.join_path(path, key)
                self.current_table = self.get_table(pointer)
                if not self.current_table:
                    continue

                self.current_table.types[pointer] = typeset

                if "object" in typeset:
                    pending.append((pointer, key, properties, item))
                    continue

                if "array" not in typeset:
                    # Plain value: combined tables get a pointer built from
                    # the parent key and the key only.
                    if self.current_table.is_combined:
                        pointer = SEPARATOR + self.join_path(parent_key, key)
                    self.current_table.add_column(
                        pointer,
                        typeset,
                        pointer,
                        header=item["$title"],
                    )
                    continue

                items = item["items"]
                items_type = extract_type(items)
                if set(items_type) & {"array", "object"}:
                    if pointer not in self.current_table.path:
                        # found child array, need to create child table
                        key = self.name_check(parent_key, key)
                        child_table = add_child_table(
                            self.current_table, pointer, parent_key, key
                        )
                        self._add_table(child_table, pointer)
                    pending.append((pointer, key, properties, items))
                else:
                    # Array of non-container items: stored as a single
                    # joinable column.
                    typeset = ARRAY.format(items_type)
                    self.current_table.types[pointer] = JOINABLE
                    self.current_table.add_column(
                        pointer,
                        typeset,
                        pointer,
                        header=item["$title"],
                    )