def test_merge_all_graphs():
    """
    Test for merging three graphs into one, both while preserving
    and while overwriting conflicting node and edge properties.
    """
    graphs = get_graphs()

    # merge while preserving conflicting nodes and edges
    merged_graph = merge_all_graphs(graphs, preserve=True)
    assert merged_graph.number_of_nodes() == 6
    assert merged_graph.number_of_edges() == 6
    assert merged_graph.name == "Graph 2"

    data = merged_graph.nodes()["A"]
    assert data["name"] == "Node A"
    assert data["description"] == "Node A in Graph 2"

    edges = merged_graph.get_edge("B", "A")
    assert len(edges) == 2
    data = list(edges.values())[0]
    assert len(data["provided_by"]) == 2
    assert data["provided_by"] == ["Graph 2", "Graph 1"]

    graphs = get_graphs()

    # merge while not preserving conflicting nodes and edges
    merged_graph = merge_all_graphs(graphs, preserve=False)
    assert merged_graph.number_of_nodes() == 6
    assert merged_graph.number_of_edges() == 6
    assert merged_graph.name == "Graph 2"

    data = merged_graph.nodes()["A"]
    assert data["name"] == "Node A"
    assert data["description"] == "Node A in Graph 2"

    edges = merged_graph.get_edge("B", "A")
    assert len(edges) == 2
    data = list(edges.values())[0]
    assert isinstance(data["provided_by"], list)
    assert "Graph 1" in data["provided_by"]
    assert "Graph 2" in data["provided_by"]
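# The test above relies on a get_graphs() fixture defined elsewhere in the
# test suite, which returns three small graphs with overlapping content.
# Below is a minimal sketch of the kind of fixture the assertions imply
# (graph names 'Graph 1'..'Graph 3', a shared node 'A', parallel 'B'->'A'
# edges); the exact node and edge properties here are illustrative
# assumptions, not the real fixture contents.
def example_get_graphs():
    from kgx.graph.nx_graph import NxGraph

    g1 = NxGraph()
    g1.name = "Graph 1"
    g1.add_node("A", id="A", name="Node A")
    g1.add_node("B", id="B", name="Node B")
    g1.add_edge(
        "B", "A", subject="B", object="A",
        predicate="biolink:related_to", provided_by="Graph 1",
    )

    g2 = NxGraph()
    g2.name = "Graph 2"
    g2.add_node("A", id="A", name="Node A", description="Node A in Graph 2")
    g2.add_edge(
        "B", "A", subject="B", object="A",
        predicate="biolink:related_to", provided_by="Graph 2",
    )

    g3 = NxGraph()
    g3.name = "Graph 3"
    g3.add_node("E", id="E", name="Node E")
    return [g1, g2, g3]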
def test_merge():
    """
    Test for merging two graphs loaded from TSV files,
    while preserving conflicting properties.
    """
    input_args1 = {
        "filename": [
            os.path.join(RESOURCE_DIR, "merge", "test1_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "merge", "test1_edges.tsv"),
        ],
        "format": "tsv",
    }
    t1 = Transformer()
    t1.transform(input_args1)

    input_args2 = {
        "filename": [
            os.path.join(RESOURCE_DIR, "merge", "test2_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "merge", "test2_edges.tsv"),
        ],
        "format": "tsv",
    }
    t2 = Transformer()
    t2.transform(input_args2)

    merged_graph = merge_all_graphs([t1.store.graph, t2.store.graph], preserve=True)
    assert len(merged_graph.nodes()) == 6
    assert len(merged_graph.edges()) == 8

    x1 = merged_graph.nodes()["x1"]
    assert x1["name"] == "node x1"
    assert isinstance(x1["category"], list)
    assert "a" in x1["p1"]
    assert "1" in x1["p1"]

    x10 = merged_graph.nodes()["x10"]
    assert x10["id"] == "x10"
    assert x10["name"] == "node x10"
def test_merge_no_preserve():
    """
    Test for merging graphs, overwriting conflicting properties.
    """
    input_args1 = {
        "filename": [
            os.path.join(RESOURCE_DIR, "merge", "test1_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "merge", "test1_edges.tsv"),
        ],
        "format": "tsv",
    }
    t1 = Transformer()
    t1.transform(input_args1)

    input_args2 = {
        "filename": [
            os.path.join(RESOURCE_DIR, "merge", "test2_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "merge", "test2_edges.tsv"),
        ],
        "format": "tsv",
    }
    t2 = Transformer()
    t2.transform(input_args2)

    merged_graph = merge_all_graphs([t1.store.graph, t2.store.graph], preserve=False)
    assert len(merged_graph.nodes()) == 6
    assert len(merged_graph.edges()) == 8

    x1 = merged_graph.nodes()["x1"]
    assert x1["name"] == "node x1"
    assert isinstance(x1["category"], list)
    assert list(t1.store.graph.nodes()["x1"]["category"])[0] in x1["category"]
    assert list(t2.store.graph.nodes()["x1"]["category"])[0] in x1["category"]
    assert x1["p1"] == "a"
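# The TSV inputs above follow the standard KGX layout: a nodes file keyed on
# 'id' and an edges file keyed on subject/predicate/object. A miniature of
# what test1_nodes.tsv plausibly contains, inferred from the assertions above
# (column values are illustrative assumptions, not the actual test data):
#
#     id      name        category                p1
#     x1      node x1     biolink:NamedThing      a
#     x10     node x10    biolink:NamedThing
#
# and of test1_edges.tsv:
#
#     subject     predicate               object
#     x1          biolink:related_to      x10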
def merge(
    merge_config: str,
    source: Optional[List] = None,
    destination: Optional[List] = None,
    processes: int = 1,
) -> BaseGraph:
    """
    Load nodes and edges from files and KGs, as defined in a config YAML,
    and merge them into a single graph. The merged graph can then be written
    to a local/remote Neo4j instance OR be serialized into a file.

    Parameters
    ----------
    merge_config: str
        Merge config YAML
    source: Optional[List]
        A list of sources to load from the YAML
    destination: Optional[List]
        A list of destinations to write to, as defined in the YAML
    processes: int
        Number of processes to use

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The merged graph

    """
    # Use the directory within which the 'merge_config' file exists
    # as the 'current working directory' for resolving relative
    # filename paths in the configuration.
    cwd = dirname(merge_config)

    with open(merge_config, "r") as YML:
        cfg = yaml.load(YML, Loader=yaml.FullLoader)

    output_directory = "output"
    top_level_args = {}
    if "configuration" in cfg:
        top_level_args = prepare_top_level_args(cfg["configuration"])
        if (
            "output_directory" in cfg["configuration"]
            and cfg["configuration"]["output_directory"]
        ):
            output_directory = cfg["configuration"]["output_directory"]
            if not output_directory.startswith(os.path.sep):
                # relative path
                output_directory = f"{os.path.abspath(os.path.dirname(merge_config))}{os.path.sep}{output_directory}"
            if not os.path.exists(output_directory):
                os.mkdir(output_directory)

    if not source:
        source = cfg["merged_graph"]["source"].keys()
    if not destination:
        destination = cfg["merged_graph"]["destination"].keys()

    for s in source:
        source_properties = cfg["merged_graph"]["source"][s]
        if source_properties["input"]["format"] in get_input_file_types():
            source_properties["input"]["filename"] = _validate_files(
                cwd=cwd,
                file_paths=source_properties["input"]["filename"],
                context=s,
            )

    sources_to_parse = {}
    for key in cfg["merged_graph"]["source"]:
        if key in source:
            sources_to_parse[key] = cfg["merged_graph"]["source"][key]

    results = []
    pool = Pool(processes=processes)
    for k, v in sources_to_parse.items():
        log.info(f"Spawning process for '{k}'")
        result = pool.apply_async(
            parse_source,
            (
                k,
                v,
                output_directory,
                top_level_args["prefix_map"],
                top_level_args["node_property_predicates"],
                top_level_args["predicate_mappings"],
                top_level_args["checkpoint"],
            ),
        )
        results.append(result)
    pool.close()
    pool.join()
    stores = [r.get() for r in results]
    merged_graph = merge_all_graphs([x.graph for x in stores])
    log.info(
        f"Merged graph has {merged_graph.number_of_nodes()} nodes "
        f"and {merged_graph.number_of_edges()} edges"
    )
    if "name" in cfg["merged_graph"]:
        merged_graph.name = cfg["merged_graph"]["name"]
    if "operations" in cfg["merged_graph"]:
        apply_graph_operations(merged_graph, cfg["merged_graph"]["operations"])

    destination_to_write: Dict[str, Dict] = {}
    for d in destination:
        if d in cfg["merged_graph"]["destination"]:
            destination_to_write[d] = cfg["merged_graph"]["destination"][d]
        else:
            raise KeyError(f"Cannot find destination '{d}' in YAML")

    # write the merged graph
    node_properties = set()
    edge_properties = set()
    for s in stores:
        node_properties.update(s.node_properties)
        edge_properties.update(s.edge_properties)

    input_args = {"graph": merged_graph, "format": "graph"}
    if destination_to_write:
        for key, destination_info in destination_to_write.items():
            log.info(f"Writing merged graph to {key}")
            output_args = {
                "format": destination_info["format"],
                "reverse_prefix_map": top_level_args["reverse_prefix_map"],
                "reverse_predicate_mappings": top_level_args[
                    "reverse_predicate_mappings"
                ],
            }
            if "reverse_prefix_map" in destination_info:
                output_args["reverse_prefix_map"].update(
                    destination_info["reverse_prefix_map"]
                )
            if "reverse_predicate_mappings" in destination_info:
                output_args["reverse_predicate_mappings"].update(
                    destination_info["reverse_predicate_mappings"]
                )
            if destination_info["format"] == "neo4j":
                output_args["uri"] = destination_info["uri"]
                output_args["username"] = destination_info["username"]
                output_args["password"] = destination_info["password"]
            elif destination_info["format"] in get_input_file_types():
                filename = destination_info["filename"]
                if isinstance(filename, list):
                    filename = filename[0]
                destination_filename = f"{output_directory}/{filename}"
                output_args["filename"] = destination_filename
                output_args["compression"] = (
                    destination_info["compression"]
                    if "compression" in destination_info
                    else None
                )
                if destination_info["format"] == "nt":
                    output_args["property_types"] = top_level_args["property_types"]
                    if (
                        "property_types" in top_level_args
                        and "property_types" in destination_info
                    ):
                        output_args["property_types"].update(
                            destination_info["property_types"]
                        )
                if destination_info["format"] in {"csv", "tsv"}:
                    output_args["node_properties"] = node_properties
                    output_args["edge_properties"] = edge_properties
            else:
                raise TypeError(
                    f"type {destination_info['format']} not yet supported "
                    f"for KGX merge operation."
                )
            transformer = Transformer()
            transformer.transform(input_args, output_args)
    else:
        log.warning(
            f"No destination provided in {merge_config}. "
            f"The merged graph will not be persisted."
        )
    return merged_graph