Example #1
def test_merge_all_graphs():
    """
    Test merging three graphs into one, first preserving and then
    overwriting conflicting node and edge properties.
    """
    graphs = get_graphs()
    # merge while preserving conflicting nodes and edges
    merged_graph = merge_all_graphs(graphs, preserve=True)
    assert merged_graph.number_of_nodes() == 6
    assert merged_graph.number_of_edges() == 6
    assert merged_graph.name == 'Graph 2'

    data = merged_graph.nodes()['A']
    assert data['name'] == 'Node A'
    assert data['description'] == 'Node A in Graph 2'

    edges = merged_graph.get_edge('B', 'A')
    assert len(edges) == 2

    data = list(edges.values())[0]
    assert len(data['provided_by']) == 2
    assert data['provided_by'] == ['Graph 2', 'Graph 1']

    graphs = get_graphs()
    # merge while not preserving conflicting nodes and edges
    merged_graph = merge_all_graphs(graphs, preserve=False)
    assert merged_graph.number_of_nodes() == 6
    assert merged_graph.number_of_edges() == 6
    assert merged_graph.name == 'Graph 2'

    data = merged_graph.nodes()['A']
    assert data['name'] == 'Node A'
    assert data['description'] == 'Node A in Graph 2'

    edges = merged_graph.get_edge('B', 'A')
    assert len(edges) == 2

    data = list(edges.values())[0]
    assert isinstance(data['provided_by'], list)
    assert 'Graph 1' in data['provided_by']
    assert 'Graph 2' in data['provided_by']
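
The get_graphs() fixture is not shown above. Below is a minimal sketch of its
likely shape, assuming KGX's NxGraph wrapper; the node and edge properties are
reconstructed from the assertions, the edge keys and predicates are invented,
and the real fixture carries enough extra nodes and edges to produce the six
of each asserted above.

from kgx.graph.nx_graph import NxGraph  # assumed import path


def get_graphs():
    """Hypothetical fixture: three small graphs with overlapping content."""
    g1 = NxGraph()
    g1.name = 'Graph 1'
    g1.add_node('A', id='A', name='Node A', description='Node A in Graph 1')
    g1.add_node('B', id='B', name='Node B')
    g1.add_edge('B', 'A', 'B-A-1', subject='B', object='A',
                predicate='biolink:related_to', provided_by='Graph 1')

    g2 = NxGraph()
    g2.name = 'Graph 2'
    # same node id and edge key as g1, so the merge must reconcile them
    g2.add_node('A', id='A', name='Node A', description='Node A in Graph 2')
    g2.add_edge('B', 'A', 'B-A-1', subject='B', object='A',
                predicate='biolink:related_to', provided_by='Graph 2')
    # a second, distinct edge key yields the parallel B->A edge the test expects
    g2.add_edge('B', 'A', 'B-A-2', subject='B', object='A',
                predicate='biolink:interacts_with', provided_by='Graph 2')

    g3 = NxGraph()
    g3.name = 'Graph 3'
    g3.add_node('C', id='C', name='Node C')
    g3.add_edge('C', 'A', 'C-A-1', subject='C', object='A',
                predicate='biolink:related_to', provided_by='Graph 3')
    return [g1, g2, g3]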
Example #2
def test_merge():
    """
    Test merging two graphs loaded from TSV files, preserving conflicting properties.
    """
    input_args1 = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'merge', 'test1_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'merge', 'test1_edges.tsv'),
        ],
        'format': 'tsv',
    }
    t1 = Transformer()
    t1.transform(input_args1)

    input_args2 = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'merge', 'test2_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'merge', 'test2_edges.tsv'),
        ],
        'format': 'tsv',
    }
    t2 = Transformer()
    t2.transform(input_args2)

    merged_graph = merge_all_graphs([t1.store.graph, t2.store.graph],
                                    preserve=True)
    assert len(merged_graph.nodes()) == 6
    assert len(merged_graph.edges()) == 8

    x1 = merged_graph.nodes()['x1']
    assert x1['name'] == 'node x1'

    assert isinstance(x1['category'], list)
    assert 'a' in x1['p1']
    assert '1' in x1['p1']

    x10 = merged_graph.nodes()['x10']
    assert x10['id'] == 'x10'
    assert x10['name'] == 'node x10'
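
The TSV-based tests assume a few imports and fixture files that are not shown.
A plausible preamble, assuming the standard KGX package layout (the
RESOURCE_DIR value and the fixture contents are guesses):

import os

from kgx.graph_operations.graph_merge import merge_all_graphs
from kgx.transformer import Transformer

RESOURCE_DIR = os.path.join(os.path.dirname(__file__), 'resources')

# From the assertions, merge/test1_nodes.tsv plausibly starts like this
# (tab-separated; the category value is a guess):
#
#   id    name       category              p1
#   x1    node x1    biolink:NamedThing    a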
Example #3
def test_merge_no_preserve():
    """
    Test merging two graphs loaded from TSV files, overwriting conflicting properties.
    """
    input_args1 = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'merge', 'test1_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'merge', 'test1_edges.tsv'),
        ],
        'format': 'tsv',
    }
    t1 = Transformer()
    t1.transform(input_args1)

    input_args2 = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'merge', 'test2_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'merge', 'test2_edges.tsv'),
        ],
        'format': 'tsv',
    }
    t2 = Transformer()
    t2.transform(input_args2)
    merged_graph = merge_all_graphs([t1.store.graph, t2.store.graph],
                                    preserve=False)
    assert len(merged_graph.nodes()) == 6
    assert len(merged_graph.edges()) == 8

    x1 = merged_graph.nodes()['x1']
    assert x1['name'] == 'node x1'

    assert isinstance(x1['category'], list)
    assert list(t1.store.graph.nodes()['x1']['category'])[0] in x1['category']
    assert list(t2.store.graph.nodes()['x1']['category'])[0] in x1['category']
    assert x1['p1'] == 'a'
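
Read together, the two TSV tests pin down the merge semantics for a
conflicting scalar property such as p1 on node x1: with preserve=True the
conflicting values are accumulated, so x1['p1'] contains both 'a' and '1',
while with preserve=False a single value wins and x1['p1'] == 'a'. The
category property is treated as multi-valued either way, which is why it is
asserted to be a list in both tests.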
Example #4
def merge(
    merge_config: str,
    source: Optional[List] = None,
    destination: Optional[List] = None,
    processes: int = 1,
) -> BaseGraph:
    """
    Load nodes and edges from files and KGs, as defined in a config YAML, and merge them into a single graph.
    The merged graph can then be written to a local or remote Neo4j instance, or serialized to a file.

    Parameters
    ----------
    merge_config: str
        Merge config YAML
    source: Optional[List]
        A list of sources to load from the YAML
    destination: Optional[List]
        A list of destinations to write to, as defined in the YAML
    processes: int
        Number of processes to use

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The merged graph

    """
    # Use the directory within which the 'merge_config' file
    # exists as a 'current working directory' for
    # resolving relative filename paths in the configuration.
    cwd = dirname(merge_config)

    with open(merge_config, "r") as YML:
        cfg = yaml.load(YML, Loader=yaml.FullLoader)

    output_directory = "output"

    top_level_args = {}
    if "configuration" in cfg:
        top_level_args = prepare_top_level_args(cfg["configuration"])
        if ("output_directory" in cfg["configuration"]
                and cfg["configuration"]["output_directory"]):
            output_directory = cfg["configuration"]["output_directory"]
            if not output_directory.startswith(os.path.sep):
                # relative path
                output_directory = f"{os.path.abspath(os.path.dirname(merge_config))}{os.path.sep}{output_directory}"

    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

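    # Default to every source/destination defined in the YAML when no
    # explicit subset was requested.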
    if not source:
        source = cfg["merged_graph"]["source"].keys()

    if not destination:
        destination = cfg["merged_graph"]["destination"].keys()

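    # Ensure file-based sources reference existing files before spawning workers.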
    for s in source:
        source_properties = cfg["merged_graph"]["source"][s]
        if source_properties["input"]["format"] in get_input_file_types():
            source_properties["input"]["filename"] = _validate_files(
                cwd=cwd,
                file_paths=source_properties["input"]["filename"],
                context=s)

    sources_to_parse = {}
    for key in cfg["merged_graph"]["source"]:
        if key in source:
            sources_to_parse[key] = cfg["merged_graph"]["source"][key]

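    # Parse each selected source in its own worker process; each worker
    # returns a populated source store.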
    results = []
    pool = Pool(processes=processes)
    for k, v in sources_to_parse.items():
        log.info(f"Spawning process for '{k}'")
        result = pool.apply_async(
            parse_source,
            (
                k,
                v,
                output_directory,
                top_level_args["prefix_map"],
                top_level_args["node_property_predicates"],
                top_level_args["predicate_mappings"],
                top_level_args["checkpoint"],
            ),
        )
        results.append(result)
    pool.close()
    pool.join()
    stores = [r.get() for r in results]
    merged_graph = merge_all_graphs([x.graph for x in stores])
    log.info(
        f"Merged graph has {merged_graph.number_of_nodes()} nodes and {merged_graph.number_of_edges()} edges"
    )
    if "name" in cfg["merged_graph"]:
        merged_graph.name = cfg["merged_graph"]["name"]
    if "operations" in cfg["merged_graph"]:
        apply_graph_operations(merged_graph, cfg["merged_graph"]["operations"])

    destination_to_write: Dict[str, Dict] = {}
    for d in destination:
        if d in cfg["merged_graph"]["destination"]:
            destination_to_write[d] = cfg["merged_graph"]["destination"][d]
        else:
            raise KeyError(f"Cannot find destination '{d}' in YAML")

    # write the merged graph
    node_properties = set()
    edge_properties = set()
    for s in stores:
        node_properties.update(s.node_properties)
        edge_properties.update(s.edge_properties)

    input_args = {"graph": merged_graph, "format": "graph"}
    if destination_to_write:
        for key, destination_info in destination_to_write.items():
            log.info(f"Writing merged graph to {key}")
            output_args = {
                "format": destination_info["format"],
                # copy so the per-destination overrides below do not leak
                # into the shared top-level mappings
                "reverse_prefix_map": dict(top_level_args["reverse_prefix_map"]),
                "reverse_predicate_mappings": dict(
                    top_level_args["reverse_predicate_mappings"]
                ),
            }
            if "reverse_prefix_map" in destination_info:
                output_args["reverse_prefix_map"].update(
                    destination_info["reverse_prefix_map"])
            if "reverse_predicate_mappings" in destination_info:
                output_args["reverse_predicate_mappings"].update(
                    destination_info["reverse_predicate_mappings"])
            if destination_info["format"] == "neo4j":
                output_args["uri"] = destination_info["uri"]
                output_args["username"] = destination_info["username"]
                output_args["password"] = destination_info["password"]
            elif destination_info["format"] in get_input_file_types():
                filename = destination_info["filename"]
                if isinstance(filename, list):
                    filename = filename[0]
                destination_filename = f"{output_directory}/{filename}"
                output_args["filename"] = destination_filename
                output_args["compression"] = (destination_info["compression"]
                                              if "compression"
                                              in destination_info else None)
                if destination_info['format'] == 'nt':
                    output_args['property_types'] = top_level_args[
                        'property_types']
                    if 'property_types' in top_level_args and 'property_types' in destination_info.keys(
                    ):
                        output_args['property_types'].update(
                            destination_info['property_types'])
                if destination_info['format'] in {'csv', 'tsv'}:
                    output_args['node_properties'] = node_properties
                    output_args['edge_properties'] = edge_properties
            else:
                raise TypeError(
                    f"type {destination_info['format']} not yet supported for KGX merge operation."
                )
            transformer = Transformer()
            transformer.transform(input_args, output_args)
    else:
        log.warning(
            f"No destination provided in {merge_config}. The merged graph will not be persisted."
        )
    return merged_graph
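
For orientation, a hedged sketch of driving merge(); the YAML keys mirror the
lookups in the code above, but every file name, source key, and value below is
hypothetical:

# merge.yaml
#
# configuration:
#   output_directory: output
# merged_graph:
#   name: example merged graph
#   source:
#     source-a:
#       input:
#         format: tsv
#         filename:
#           - data/source_a_nodes.tsv
#           - data/source_a_edges.tsv
#   destination:
#     merged-kg:
#       format: tsv
#       filename:
#         - merged-kg

merged_graph = merge("merge.yaml", processes=2)
print(merged_graph.number_of_nodes(), merged_graph.number_of_edges())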