def import_directory(self, datadir):
    """Import a JSON directory into the database.

    Every file in ``datadir`` named ``<self._type>_*.json`` is loaded.
    Payloads whose ``omnihash`` was already seen are recorded in
    ``self.duplicates`` instead of being imported.  The remaining payloads
    are ordered with ``Network.sort()`` so that a parent precedes any
    payload naming it as ``parent_id``, then handed to ``self.import_json``.

    Returns a one-entry dict mapping ``self._type`` to ``self.results``.

    Raises ``Exception`` when the ids produced by the sort differ from the
    set of de-duplicated ids.
    """
    # json_id -> payload (first occurrence of each distinct payload only)
    unique_by_id = {}
    # omnihash(payload) -> json_id of the first payload with that hash
    first_id_for_hash = {}

    pattern = os.path.join(datadir, self._type + '_*.json')
    for path in glob.glob(pattern):
        with open(path) as handle:
            payload = json.load(handle)

        # '_id' is removed before hashing, so it never takes part in the
        # duplicate comparison.
        json_id = payload.pop('_id')
        digest = omnihash(payload)
        if digest in first_id_for_hash:
            self.duplicates[json_id] = first_id_for_hash[digest]
            continue
        first_id_for_hash[digest] = json_id
        unique_by_id[json_id] = payload

    # Build the dependency graph: an edge parent -> child means the parent
    # has to be imported before the child.
    network = Network()
    for json_id, payload in unique_by_id.items():
        parent_id = payload.get('parent_id', None)
        if not parent_id:
            # No dependency; the node can be imported at any point.
            network.add_node(json_id)
        else:
            network.add_edge(parent_id, json_id)

    # Resolve the sorted import order.
    import_order = [(jid, unique_by_id[jid]) for jid in network.sort()]
    in_network = set(jid for jid, _ in import_order)

    # Every de-duplicated payload must have come out of the sort.
    if in_network != set(unique_by_id.keys()):
        raise Exception("import is missing nodes in network set")

    # Time to actually do the import.
    for json_id, payload in import_order:
        parent_id = payload.get('parent_id', None)
        if parent_id:
            # Swap the scrape-time JSON id of the parent for its database
            # id.  The parent was imported earlier because of the ordering
            # computed above.
            payload['parent_id'] = self.resolve_json_id(parent_id)
        obj, what = self.import_json(payload)
        self.json_to_db_id[json_id] = obj.id
        self.results[what] += 1

    return {self._type: self.results}
def test_cycles_simple():
    """``cycles()`` on a two-node loop must hash the same as [A, B, A]."""
    graph = Network()
    for name in ("A", "B"):
        graph.add_node(name)
    for src, dst in (("A", "B"), ("B", "A")):
        graph.add_edge(src, dst)

    found = chash(graph.cycles())
    assert found == chash([("A", "B", "A")])
def test_cyclic_graph_error_simple():
    """Consuming ``sort()`` on a two-node loop raises CyclicGraphError."""
    graph = Network()
    for name in ("A", "B"):
        graph.add_node(name)
    for src, dst in (("A", "B"), ("B", "A")):
        graph.add_edge(src, dst)

    with pytest.raises(CyclicGraphError):
        # list() forces the result of sort() to be fully consumed.
        list(graph.sort())
def test_dot_debug():
    """``dot()`` renders a single A -> B edge as the expected digraph text."""
    graph = Network()
    for name in ("A", "B"):
        graph.add_node(name)
    graph.add_edge("A", "B")

    rendered = graph.dot()
    assert rendered == "digraph graphname {A -> B;}"
def test_sort_order_basic():
    """A simple chain A -> B -> C sorts to that same order."""
    graph = Network()
    for name in ("A", "B", "C"):
        graph.add_node(name)
    for src, dst in (("A", "B"), ("B", "C")):
        graph.add_edge(src, dst)

    ordering = list(graph.sort())
    assert ordering == ["A", "B", "C"]
def test_cyclic_graph_error_massive():
    """A twelve-node ring (A -> B -> ... -> L -> A) cannot be sorted."""
    graph = Network()
    # The trailing "A" closes the ring back onto the first node.
    ring = list("ABCDEFGHIJKL") + ["A"]
    for current, following in zip(ring, ring[1:]):
        graph.add_node(current)
        graph.add_edge(current, following)

    with pytest.raises(CyclicGraphError):
        list(graph.sort())
def test_link_before_nodes():
    """Adding edges before the nodes themselves still sorts the chain."""
    graph = Network()
    # Edges are registered first on purpose ...
    for src, dst in (("A", "B"), ("B", "C"), ("C", "D")):
        graph.add_edge(src, dst)
    # ... and only afterwards are the nodes added explicitly.
    for name in ("A", "B", "C", "D"):
        graph.add_node(name)

    ordering = list(graph.sort())
    assert ordering == ["A", "B", "C", "D"]
def test_sort_order_double():
    """B depends on both A and C, so it must come out last."""
    graph = Network()
    for name in ("A", "B", "C"):
        graph.add_node(name)
    # Edges: A -> B, A -> C and C -> B.  B is reachable from A both
    # directly and through C.
    for src, dst in (("A", "B"), ("A", "C"), ("C", "B")):
        graph.add_edge(src, dst)

    ordering = list(graph.sort())
    assert ordering == ["A", "C", "B"]
def _order_imports(self, dicts):
    """De-duplicate ``dicts`` and return them in dependency order.

    Each dict must carry an ``'_id'`` key, which is popped (the dict is
    modified in place).  A dict whose ``omnihash`` matches an earlier one
    is recorded in ``self.duplicates`` (its id mapped to the earlier id)
    and left out of the result.

    Returns a list of ``(json_id, data)`` tuples in the order produced by
    ``Network.sort()``, with an edge from each ``parent_id`` to the dict
    that names it.
    """
    # json_id -> data (first occurrence of each distinct dict only)
    unique_by_id = {}
    # omnihash(data) -> json_id of the first dict with that hash
    first_id_for_hash = {}

    for entry in dicts:
        json_id = entry.pop('_id')
        digest = omnihash(entry)
        if digest in first_id_for_hash:
            self.duplicates[json_id] = first_id_for_hash[digest]
            continue
        first_id_for_hash[digest] = json_id
        unique_by_id[json_id] = entry

    # Toposort the nodes so parents are imported first.
    network = Network()
    for json_id, entry in unique_by_id.items():
        network.add_node(json_id)
        parent_id = entry.get('parent_id', None)
        if parent_id:
            # Edge parent -> child: the parent has to be imported before
            # the current node.
            network.add_edge(parent_id, json_id)

    # Resolve the sorted import order.
    import_order = [(jid, unique_by_id[jid]) for jid in network.sort()]
    in_network = set(jid for jid, _ in import_order)

    # Paranoid check that all data made it into the network; it should
    # never fail.
    if in_network != set(unique_by_id.keys()):  # pragma: no cover
        raise Exception("import is missing nodes in network set")

    return import_order
def test_internal_node_removal():
    """Pruning an internal node needs ``remove_backrefs=True``."""
    graph = Network()
    for name in ("A", "B", "C", "D"):
        graph.add_node(name)
    # The final A -> C edge is there so the resulting order is
    # deterministic.
    for src, dst in (("A", "B"), ("B", "C"), ("C", "D"), ("A", "C")):
        graph.add_edge(src, dst)

    # By default, removing an internal node must fail with ValueError.
    with pytest.raises(ValueError):
        graph.prune_node("B")

    # Asking for the back-references to be removed lets the prune succeed.
    graph.prune_node("B", remove_backrefs=True)

    # "B" must be gone from the sorted output.
    remaining = list(graph.sort())
    assert remaining == ["A", "C", "D"]
def test_cycles_complex():
    """``cycles()`` on a densely linked four-node graph hashes as expected."""
    graph = Network()
    for name in ("A", "B", "C", "D"):
        graph.add_node(name)

    links = (
        ("A", "B"),
        ("B", "C"),
        ("C", "D"),
        ("D", "A"),
        ("D", "C"),
        ("C", "B"),
        ("B", "D"),
    )
    for src, dst in links:
        graph.add_edge(src, dst)

    # Debugging hint: write graph.dot() to a file to visualise the graph.

    expected = [
        ('B', 'C', 'B'),
        ('C', 'D', 'C'),
        ('A', 'B', 'D', 'A'),
    ]
    assert chash(graph.cycles()) == chash(expected)
def import_from_json(self, datadir):
    """Load ``<self._type>_*.json`` files from ``datadir`` and import them.

    Each file is prepared with ``self.prepare_object_from_json`` and
    converted with ``self._model_class.from_dict``.  Objects sharing a
    ``_hash`` with an earlier object are recorded in ``self.duplicates``
    (which is reset on every call) and skipped.  The rest are imported
    with ``self.import_object`` in the order given by ``Network.sort()``,
    so that a parent is handled before objects that reference it.

    Returns a one-entry dict mapping ``self._type`` to ``self.results``.

    Raises ``ValueError`` when the sort does not yield exactly the set of
    objects that should be imported.
    """
    # Load all JSON, mapped by json_id.
    objects_by_id = {}
    pattern = os.path.join(datadir, self._type + '_*.json')
    for path in glob.glob(pattern):
        with open(path) as handle:
            payload = json.load(handle)

        # Everything except a 'person' gets the importer's jurisdiction.
        if data_type_is_not_person(payload):
            payload['jurisdiction_id'] = self.jurisdiction_id
        payload = self.prepare_object_from_json(payload)

        # Convert dict => class and store it under its own id.
        instance = self._model_class.from_dict(payload)
        objects_by_id[instance._id] = instance

    # Group ids by object hash; the first id in each group is kept and
    # every later one is mapped onto it.
    ids_by_hash = defaultdict(list)
    for json_id, instance in objects_by_id.items():
        ids_by_hash[_hash(instance)].append(json_id)

    self.duplicates = {}
    for group in ids_by_hash.values():
        kept_id = group[0]
        for duplicate_id in group[1:]:
            self.duplicates[duplicate_id] = kept_id

    # De-dupe the pool before planning the import.
    import_pool = {}
    for json_id, instance in objects_by_id.items():
        if json_id not in self.duplicates:
            import_pool[json_id] = instance

    # The Network holds the import dependencies: an edge parent -> child
    # means the parent is imported before the child.
    network = Network()
    for json_id, instance in import_pool.items():
        parent_id = getattr(instance, 'parent_id', None)
        if not parent_id:
            # No parent, so it can be imported at any point.
            network.add_node(json_id)
        else:
            network.add_edge(parent_id, json_id)

    to_import = [(link, import_pool[link]) for link in network.sort()]
    seen = set(link for link, _ in to_import)

    # Make sure the plan really covers every entry.  If it does not, blow
    # up rather than silently skip files.
    if seen != set(import_pool.keys()):
        raise ValueError("""Something went wrong internally with the dependency resolution.""")

    for json_id, instance in to_import:
        parent_id = getattr(instance, 'parent_id', None)
        if parent_id:
            # Swap the scrape-time JSON id of the parent for its database
            # id.  The parent was imported earlier because of the ordering
            # computed above.
            instance.parent_id = self.resolve_json_id(parent_id)
        self.json_to_db_id[json_id] = self.import_object(instance)

    return {self._type: self.results}


def data_type_is_not_person(payload):
    """Return True when the payload's ``'_type'`` is anything but 'person'."""
    return payload['_type'] != 'person'
def test_sort_order_staged():
    """Three linked triangles (A*, B*, C*) sort with C1 first, then B1."""
    network = Network()

    # Group A.
    for name in ("A1", "A2", "A3"):
        network.add_node(name)
    for src, dst in (("A1", "A2"), ("A1", "A3"), ("A2", "A3")):
        network.add_edge(src, dst)

    # Group B, which also feeds into A1.
    for name in ("B1", "B2", "B3"):
        network.add_node(name)
    for src, dst in (("B1", "B2"), ("B1", "B3"), ("B2", "B3"), ("B1", "A1")):
        network.add_edge(src, dst)

    # Group C, plus the cross-group links.  Two of these edges repeat ones
    # already added; the repeats are kept exactly as they were.
    for name in ("C1", "C2", "C3"):
        network.add_node(name)
    cross_links = (
        ("C1", "C2"),
        ("C1", "C3"),
        ("C2", "C3"),
        ("C1", "A1"),
        ("C1", "B1"),
        ("C1", "B1"),
        ("B1", "A1"),
        ("A1", "C2"),
        ("A1", "C3"),
    )
    for src, dst in cross_links:
        network.add_edge(src, dst)

    # Debugging hint: write network.dot() to a file to visualise the graph.

    sorted_order = list(network.sort())

    assert sorted_order.pop(0) == "C1"
    assert sorted_order.pop(0) == "B1"
    # The next two slots may hold A1 and B2 in either order; rendering the
    # graph with dot() makes it clearer why.
    assert sorted_order.pop(0) in ("A1", "B2")
    assert sorted_order.pop(0) in ("A1", "B2")