def test_inc_column(root_table):
    """Check that ``inc_column`` bumps ``hits`` on a table and on its ancestors."""
    items_path = "/tender/items"
    classifications_path = "/tender/items/additionalClassifications"

    items_table = add_child_table(root_table, items_path, "tender", "items")
    items_table.add_column(
        "/tender/items/id",
        ["string"],
        "Tender Id",
        additional=True,
        abs_path="/tender/items/0/test",
    )
    items_table.arrays[classifications_path] = 0

    # Columns incremented directly on the root table.
    root_table.inc_column("ocid", "ocid")
    assert root_table.combined_columns["ocid"].hits == 1

    root_table.inc_column("/tender/awardCriteriaDetails", "/tender/awardCriteriaDetails")
    assert root_table.combined_columns["/tender/awardCriteriaDetails"].hits == 1

    # Incrementing on the child table is expected to be visible on the root as well.
    items_table.inc_column("/tender/items/0/id", "/tender/items/id")
    assert root_table.combined_columns["/tender/items/0/id"].hits == 1
    assert items_table["/tender/items/id"].hits == 1
    assert items_table.combined_columns["/tender/items/id"].hits == 1

    # Second nesting level: a child table of the child table.
    classifications_table = add_child_table(
        items_table,
        classifications_path,
        "items",
        "additionalClassifications",
    )
    classifications_table.add_column(
        "/tender/items/additionalClassifications/id",
        ["string"],
        "Classification Id",
        additional=True,
    )
    classifications_table.inc_column(
        "/tender/items/0/additionalClassifications/0/id",
        "/tender/items/additionalClassifications/id",
    )
    assert items_table.combined_columns["/tender/items/additionalClassifications/0/id"].hits == 1
    assert root_table.combined_columns["/tender/items/0/additionalClassifications/0/id"].hits == 1
def test_get_pointer(root_table):
    """Check the pointer ``get_pointer`` returns for the root, child and grandchild tables."""
    items_table = add_child_table(root_table, "/tender/items", "tender", "items")
    classifications_table = add_child_table(
        items_table,
        "/tender/items/additionalClassifications",
        "items",
        "additionalClassifications",
    )

    # Each case: (table, absolute path, path, extra keyword arguments, expected pointer).
    # The order matches the original sequence of calls.
    cases = [
        (
            classifications_table,
            "/tender/items/0/additionalClassifications/0/id",
            "/tender/items/additionalClassifications/id",
            {},
            "/tender/items/additionalClassifications/id",
        ),
        (
            items_table,
            "/tender/items/0/additionalClassifications/0/id",
            "/tender/items/additionalClassifications/id",
            {},
            "/tender/items/additionalClassifications/0/id",
        ),
        (
            items_table,
            "/tender/items/0/additionalClassifications/0",
            "/tender/items/additionalClassifications",
            {},
            "/tender/items/additionalClassifications/0",
        ),
        (
            items_table,
            "/tender/items/0/additionalClassifications",
            "/tender/items/additionalClassifications",
            {},
            "/tender/items/additionalClassifications",
        ),
        (
            root_table,
            "/tender/items/0/additionalClassifications/0/id",
            "/tender/items/additionalClassifications/id",
            {},
            "/tender/items/0/additionalClassifications/0/id",
        ),
        (root_table, "/tender/items/0/id", "/tender/items/id", {}, "/tender/items/0/id"),
        (items_table, "/tender/items/0/id", "/tender/items/id", {}, "/tender/items/id"),
        (root_table, "/tender/id", "/tender/id", {}, "/tender/id"),
        (root_table, "/tender/items", "/tender/items", {"index": "0"}, "/tender/items/0"),
        (root_table, "/tender", "/tender", {"index": "0"}, "/tender"),
    ]

    for table, abs_path, path, extra, expected in cases:
        assert get_pointer(table, abs_path, path, True, **extra) == expected
def test_add_child_table(root_table):
    """Check that ``add_child_table`` registers a child and links it to its parent.

    The dumped root table must report no parent; the dumped child table must
    report the root table's name as its parent.
    """
    data = root_table.dump()
    assert not data["parent"]

    child = add_child_table(root_table, "/tender/tenderers", "", "tenderers")
    assert child.name == "tenders_tenderers"
    assert child.name in root_table.child_tables
    assert child.total_rows == 0

    data = child.dump()
    # This used to be a bare comparison whose result was discarded, so the
    # parent link was never actually verified.
    assert data["parent"] == root_table.name
def test_add_column(root_table):
    """Check where ``add_column`` registers columns on a table and on its parent."""
    column_types = ("string", "integer")

    def assert_registered(table, path):
        # A column must be reachable both through the table itself and
        # through its combined columns.
        assert path in table
        assert path in table.combined_columns

    # Plain columns on the root table.
    root_table.add_column("/tender/id", list(column_types), "Tender Id")
    assert_registered(root_table, "/tender/id")

    root_table.add_column("/tender/itemsCount", list(column_types), "Items Count")
    assert_registered(root_table, "/tender/itemsCount")

    # A column below an array path is expected under an indexed key.
    root_table.add_column(
        "/tender/items/additionalClassificationsCount",
        list(column_types),
        "Classifications Count",
    )
    assert_registered(root_table, "/tender/items/0/additionalClassificationsCount")

    # Additional column on a child table, with an explicit absolute path.
    items_table = add_child_table(root_table, "/tender/items", "tender", "items")
    items_table.add_column(
        "/tender/items/test",
        list(column_types),
        "/tender/items/test",
        additional=True,
        abs_path="/tender/items/0/test",
    )
    assert_registered(items_table, "/tender/items/test")
    assert_registered(root_table, "/tender/items/0/test")

    # Columns of the child table, including one nested in a registered array.
    items_table.add_column("/tender/items/id", list(column_types), "Items Id")
    items_table.arrays["/tender/items/additionalClassifications"] = 0
    items_table.add_column(
        "/tender/items/additionalClassifications/id",
        list(column_types),
        "Classification ID",
    )
    assert "/tender/items/additionalClassifications/0/id" in items_table
    assert "/tender/items/id" in items_table
    assert "/tender/items/0/id" in root_table
    assert "/tender/items/0/additionalClassifications/0/id" in root_table

    # With propagate=False the column must stay on the child table only.
    count_path = "/tender/items/additionalClassificationsCount"
    items_table.add_column(
        count_path,
        list(column_types),
        "Classification Count",
        propagate=False,
    )
    assert_registered(items_table, count_path)
    assert count_path not in root_table
    assert count_path not in root_table.combined_columns
def add_additional_table(self, pointer, abs_pointer, parent_key, key, item):
    """Register a table detected in the data and pre-create its columns.

    :param pointer: Path of the detected array
    :param abs_pointer: Absolute path of the detected array, used to build each column's ``abs_path``
    :param parent_key: Parent key passed on to ``add_child_table``
    :param key: Key passed on to ``add_child_table``
    :param item: Iterable of array elements; each one is flattened to discover columns
    """
    LOGGER.debug(_("Detected additional table: %s") % pointer)
    self.current_table.types[pointer] = ["array"]
    new_table = add_child_table(self.current_table, pointer, parent_key, key)
    self._add_table(new_table, pointer)

    # Columns are created up front because headers may have to be
    # recalculated and reordered when the array is enlarged later on.
    # There must be a better way, but it should work for now.
    for element in item:
        flattened = flatten(element, reducer="path")
        for relative_path, value in flattened.items():
            column_path = self.join_path(pointer, relative_path)
            if column_path in self.current_table:
                continue
            column_type = self.guess_type(value)
            column_abs_path = self.join_path(abs_pointer, relative_path)
            self.current_table.add_column(
                column_path,
                column_type,
                column_path,
                abs_path=column_abs_path,
                header=column_path,
            )
def parse_schema(self):
    """Extract all available information from schema

    Resolves the schema (loading it first when given as a string or path),
    initialises the root and combined tables, then walks every ``properties``
    mapping to record column types, create child tables for arrays of nested
    items and add columns for the remaining properties. Deprecated properties
    are skipped.
    """
    if isinstance(self.schema, (str, Path)):
        self.schema = resolve_file_uri(self.schema)
    self.schema = jsonref.JsonRef.replace_refs(self.schema)

    self.init_tables(self.root_tables)
    if self.combined_tables:
        self.init_tables(self.combined_tables, is_combined=True)

    sep = self.header_separator
    nested_types = {"array", "object"}
    # Entries are (path, parent key, parent properties, schema node).
    pending = deque([("", "", {}, self.schema)])
    # TODO: check if recursion is better for field ordering
    while pending:
        path, parent_key, parent, node = pending.pop()
        if node.get("deprecated"):
            continue
        # TODO: handle oneOf anyOf allOf
        properties = node.get("properties", {})
        if not properties:
            # TODO: not sure what to do here
            continue

        for key, item in properties.items():
            if item.get("deprecated"):
                continue
            if hasattr(item, "__reference__") and item.__reference__.get("deprecated"):
                continue

            typeset = extract_type(item)
            pointer = sep.join([path, key])
            self.current_table = self.get_table(pointer)
            if not self.current_table:
                continue
            self.current_table.types[pointer] = typeset

            if "object" in typeset:
                pending.append((pointer, key, properties, item))
                continue

            if "array" not in typeset:
                # Plain value: combined tables address it by parent key and key only.
                if self.current_table.is_combined:
                    pointer = sep + sep.join((parent_key, key))
                self.current_table.add_column(pointer, typeset, _(pointer, self.language))
                continue

            items = item["items"]
            items_type = extract_type(items)
            if nested_types.intersection(items_type):
                table_key = key
                if pointer not in self.current_table.path:
                    # found child array, need to create child table
                    table_key = self.name_check(parent_key, key)
                    child = add_child_table(self.current_table, pointer, parent_key, table_key)
                    self._add_table(child, pointer)
                pending.append((pointer, table_key, properties, items))
            else:
                # Array of non-nested items, so it becomes a single joinable column.
                array_type = ARRAY.format(items_type)
                self.current_table.types[pointer] = JOINABLE
                self.current_table.add_column(pointer, array_type, _(pointer, self.language))
def process_items(self, releases, with_preview=True):
    """Analyze releases

    Iterate over every item in provided list to
    calculate metrics and optionally generate preview for combined and split version of the table

    This is a generator: it yields the zero-based index of each release once
    that release has been fully analyzed, and stores the last index in
    ``self.total_items`` when the input is exhausted.

    :param releases: Iterator of items to analyze
    :param with_preview: If set to True generates previews for each table
    """
    separator = self.header_separator
    # Initialised up front so that an empty ``releases`` does not leave
    # ``count`` unbound when it is stored in ``total_items`` below.
    count = 0
    for count, release in enumerate(releases):
        # Entries are (absolute path, path, parent key, parent record, record).
        to_analyze = deque([("", "", "", {}, release)])
        ocid = release["ocid"]
        top_level_id = release["id"]

        while to_analyze:
            abs_path, path, parent_key, parent, record = to_analyze.pop()
            for key, item in record.items():
                pointer = separator.join([path, key])
                self.current_table = self.get_table(pointer)
                if not self.current_table:
                    continue
                item_type = self.current_table.types.get(pointer)

                if pointer in self.current_table.path:
                    # strict match like /parties, /tender
                    row_id = generate_row_id(ocid, record.get("id", ""), parent_key, top_level_id)
                    c = item if isinstance(item, list) else [item]
                    for _nop in c:
                        self.current_table.inc()
                        if with_preview and count < PREVIEW_ROWS:
                            parent_table = not self.current_table.is_root and parent_key
                            self.add_preview_row(ocid, record.get("id"), row_id, parent.get("id"), parent_table)

                # TODO: this validation should probably be smarter with arrays
                if item_type and item_type != JOINABLE and not validate_type(item_type, item):
                    LOGGER.error("Mismatched type on %s expected %s" % (pointer, item_type))
                    continue

                if isinstance(item, dict):
                    to_analyze.append(
                        (
                            separator.join([abs_path, key]),
                            pointer,
                            key,
                            record,
                            item,
                        )
                    )
                elif item and isinstance(item, list):
                    abs_pointer = separator.join([abs_path, key])
                    if not isinstance(item[0], dict) and not item_type:
                        # Look the root table up here: the name ``root`` is only
                        # bound in the scalar branch below, so reading it at this
                        # point could fail or refer to a table from an earlier
                        # iteration.
                        root_name = get_root(self.current_table).name
                        LOGGER.debug(_("Detected additional column: %s in %s table") % (abs_pointer, root_name))
                        item_type = JOINABLE
                        self.current_table.add_column(
                            pointer,
                            JOINABLE,
                            _(pointer, self.language),
                            additional=True,
                            abs_path=abs_pointer,
                        )
                    if item_type == JOINABLE:
                        self.current_table.inc_column(abs_pointer, pointer)
                        if with_preview and count < PREVIEW_ROWS:
                            value = JOINABLE_SEPARATOR.join(item)
                            self.current_table.set_preview_path(abs_pointer, pointer, value, self.table_threshold)
                    elif self.current_table.is_root or self.current_table.is_combined:
                        for value in item:
                            to_analyze.append(
                                (
                                    abs_pointer,
                                    pointer,
                                    key,
                                    record,
                                    value,
                                )
                            )
                    else:
                        parent_table = self.current_table.parent
                        if pointer not in parent_table.arrays:
                            LOGGER.debug(_("Detected additional table: %s") % pointer)
                            self.current_table.types[pointer] = ["array"]
                            parent_table = self.current_table
                            # TODO: do we need to mark this table as additional
                            self._add_table(
                                add_child_table(self.current_table, pointer, parent_key, key),
                                pointer,
                            )
                            # NOTE(review): ``row_id`` is only assigned in the
                            # strict-match branch above, so the value used here
                            # may come from an earlier key - confirm intended.
                            self.add_preview_row(ocid, record.get("id"), row_id, parent.get("id"), parent_table)
                        if parent_table.set_array(pointer, item):
                            should_split = len(item) >= self.table_threshold
                            if should_split:
                                parent_table.should_split = True
                                self.current_table.roll_up = True
                            recalculate_headers(parent_table, pointer, abs_path, key, item, should_split, separator)
                        for i, value in enumerate(item):
                            if isinstance(value, dict):
                                abs_pointer = separator.join([abs_path, key, str(i)])
                                to_analyze.append(
                                    (
                                        abs_pointer,
                                        pointer,
                                        parent_key,
                                        record,
                                        value,
                                    )
                                )
                else:
                    root = get_root(self.current_table)
                    abs_pointer = separator.join((abs_path, key))
                    if self.current_table.is_combined:
                        LOGGER.debug(
                            _("Path %s is targeted to combined table %s") % (pointer, self.current_table.name)
                        )
                        pointer = separator + separator.join((parent_key, key))
                        abs_pointer = pointer
                    if abs_pointer not in root.combined_columns:
                        # Column is not known yet, register it as additional.
                        self.current_table.add_column(
                            pointer,
                            PYTHON_TO_JSON_TYPE.get(type(item).__name__, "N/A"),
                            _(pointer, self.language),
                            additional=True,
                            abs_path=abs_pointer,
                        )
                    self.current_table.inc_column(abs_pointer, pointer)
                    if item and with_preview and count < PREVIEW_ROWS:
                        self.current_table.set_preview_path(abs_pointer, pointer, item, self.table_threshold)
        yield count
    # NOTE(review): this stores the last zero-based index rather than the
    # number of processed items - confirm that is what callers expect.
    self.total_items = count
def test_add_child_table(root_table):
    """Check that ``add_child_table`` registers a child and links it to its parent."""
    child = add_child_table(root_table, "/tender/tenderers", "", "tenderers")
    assert child.name == "tenders_tenderers"
    assert child.name in root_table.child_tables
    assert child.total_rows == 0

    # This used to be a bare comparison whose result was discarded, so the
    # parent link was never verified.
    # NOTE(review): it is not visible here whether ``parent`` holds the parent
    # table itself or only its name, so both are accepted - tighten once confirmed.
    parent = child.parent
    assert getattr(parent, "name", parent) == root_table.name
def parse_schema(self):
    """
    Extract information from the schema.

    Loads the schema, then walks every ``properties`` mapping to record column
    types, create child tables for arrays of nested items and add columns
    (with the property's ``$title`` as header) for the remaining properties.
    Deprecated properties and the ``$title``/``$path`` keys are skipped.
    """
    self.load_schema()
    # self.prepare_tables()
    proxy = add_paths_to_schema(self.schema)
    nested_types = {"array", "object"}
    # Entries are (path, parent key, parent properties, schema node).
    pending = deque([("", "", {}, proxy)])
    # TODO: check if recursion is better for field ordering
    while pending:
        path, parent_key, parent, node = pending.pop()
        if node.get("deprecated"):
            continue
        # TODO: handle oneOf anyOf allOf
        properties = node.get("properties", {})
        if not properties:
            # TODO: not sure what to do here
            continue

        for key, item in properties.items():
            if key in ("$title", "$path"):
                continue
            if item.get("deprecated"):
                continue
            if hasattr(item, "__reference__") and item.__reference__.get("deprecated"):
                continue

            typeset = extract_type(item)
            pointer = self.join_path(path, key)
            self.current_table = self.get_table(pointer)
            if not self.current_table:
                continue
            self.current_table.types[pointer] = typeset

            if "object" in typeset:
                pending.append((pointer, key, properties, item))
                continue

            if "array" not in typeset:
                # Plain value: combined tables address it by parent key and key only.
                if self.current_table.is_combined:
                    pointer = SEPARATOR + self.join_path(parent_key, key)
                self.current_table.add_column(pointer, typeset, pointer, header=item["$title"])
                continue

            items = item["items"]
            items_type = extract_type(items)
            if nested_types.intersection(items_type):
                table_key = key
                if pointer not in self.current_table.path:
                    # found child array, need to create child table
                    table_key = self.name_check(parent_key, key)
                    child = add_child_table(self.current_table, pointer, parent_key, table_key)
                    self._add_table(child, pointer)
                pending.append((pointer, table_key, properties, items))
            else:
                # Array of non-nested items, so it becomes a single joinable column.
                array_type = ARRAY.format(items_type)
                self.current_table.types[pointer] = JOINABLE
                self.current_table.add_column(pointer, array_type, pointer, header=item["$title"])