Exemplo n.º 1
0
def parse_source(
    key: str,
    source: dict,
    output_directory: str,
    prefix_map: Optional[Dict[str, str]] = None,
    node_property_predicates: Optional[Set[str]] = None,
    predicate_mappings: Optional[Dict[str, str]] = None,
    checkpoint: bool = False,
) -> Sink:
    """
    Parse a source from a merge config YAML.

    Parameters
    ----------
    key: str
        Source key
    source: Dict
        Source configuration
    output_directory: str
        Location to write output to
    prefix_map: Optional[Dict[str, str]]
        Non-canonical CURIE mappings
    node_property_predicates: Optional[Set[str]]
        A set of predicates that ought to be treated as node properties (This is applicable for RDF)
    predicate_mappings: Optional[Dict[str, str]]
        A mapping of predicate IRIs to property names (This is applicable for RDF)
    checkpoint: bool
        Whether to serialize each individual source to a TSV

    Returns
    -------
    kgx.sink.sink.Sink
        Returns an instance of Sink

    """
    log.info(f"Processing source '{key}'")
    if not key:
        # Fall back to the first input filename when no explicit key was given
        key = os.path.basename(source["input"]["filename"][0])
    input_args = prepare_input_args(
        key,
        source,
        output_directory,
        prefix_map,
        node_property_predicates,
        predicate_mappings,
    )
    transformer = Transformer(stream=True)
    transformer.transform(input_args)
    transformer.store.graph.name = key
    if checkpoint:
        log.info(f"Writing checkpoint for source '{key}'")
        checkpoint_output = f"{output_directory}/{key}" if output_directory else key
        transformer.save({"filename": checkpoint_output, "format": "tsv"})

    # Current "Callable" metadata not needed at this point
    # but causes peculiar problems downstream, so we clear it.
    transformer.store.clear_graph_metadata()

    return transformer.store
Exemplo n.º 2
0
def neo4j_download(
    uri: str,
    username: str,
    password: str,
    output: str,
    output_format: Optional[str],
    output_compression: Optional[str],
    stream: bool,
    node_filters: Optional[Tuple] = None,
    edge_filters: Optional[Tuple] = None,
) -> Transformer:
    """
    Download nodes and edges from Neo4j database.

    Parameters
    ----------
    uri: str
        Neo4j URI. For example, https://localhost:7474
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)
    output_format: Optional[str]
        The output type (``tsv``, by default)
    output_compression: Optional[str]
        The output compression type
    stream: bool
        Whether to parse input as a stream
    node_filters: Optional[Tuple]
        Node filters
    edge_filters: Optional[Tuple]
        Edge filters

    Returns
    -------
    kgx.Transformer
        The NeoTransformer

    """
    transformer = Transformer(stream=stream)
    transformer.transform({
        "uri": uri,
        "username": username,
        "password": password,
        "format": "neo4j",
        "node_filters": node_filters,
        "edge_filters": edge_filters,
    })

    # Default to TSV when no explicit output format was requested
    if not output_format:
        output_format = "tsv"
    transformer.save({
        "filename": output,
        "format": output_format,
        "compression": output_compression
    })
    return transformer
Exemplo n.º 3
0
def neo4j_upload(
    inputs: List[str],
    input_format: str,
    input_compression: Optional[str],
    uri: str,
    username: str,
    password: str,
    stream: bool,
    node_filters: Optional[Tuple] = None,
    edge_filters: Optional[Tuple] = None,
) -> Transformer:
    """
    Upload a set of nodes/edges to a Neo4j database.

    Parameters
    ----------
    inputs: List[str]
        A list of files that contains nodes/edges
    input_format: str
        The input format
    input_compression: Optional[str]
        The input compression type
    uri: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    stream: bool
        Whether to parse input as a stream
    node_filters: Optional[Tuple]
        Node filters
    edge_filters: Optional[Tuple]
        Edge filters

    Returns
    -------
    kgx.Transformer
        The NeoTransformer

    """
    # Read the input files, then write everything into the Neo4j store.
    read_args = {
        "filename": inputs,
        "format": input_format,
        "compression": input_compression,
        "node_filters": node_filters,
        "edge_filters": edge_filters,
    }
    write_args = {
        "uri": uri,
        "username": username,
        "password": password,
        "format": "neo4j",
    }
    transformer = Transformer(stream=stream)
    transformer.transform(read_args)
    transformer.save(write_args)
    return transformer
Exemplo n.º 4
0
def test_rdf_transform3():
    """
    Test parsing an RDF N-triple and round-trip.
    """
    source_path = os.path.join(RESOURCE_DIR, 'rdf', 'test1.nt')
    export_path = os.path.join(TARGET_DIR, 'test1-export.nt')

    t1 = Transformer()
    t1.transform({'filename': [source_path], 'format': 'nt'})
    assert t1.store.graph.number_of_nodes() == 2
    assert t1.store.graph.number_of_edges() == 1

    t1.save({'filename': export_path, 'format': 'nt'})

    t2 = Transformer()
    t2.transform({'filename': [export_path], 'format': 'nt'})
    assert t2.store.graph.number_of_nodes() == 2
    assert t2.store.graph.number_of_edges() == 1

    gene_id = 'ENSEMBL:ENSG0000000000001'
    n1t1 = t1.store.graph.nodes()[gene_id]
    n1t2 = t2.store.graph.nodes()[gene_id]
    # NOTE(review): n1t3 is read from t2 as well, so it duplicates n1t2 —
    # presumably a leftover from a third round-trip; confirm upstream.
    n1t3 = t2.store.graph.nodes()[gene_id]

    # Each copy of the node must survive the round-trip unchanged.
    for node in (n1t1, n1t2, n1t3):
        assert node['type'] == 'SO:0000704'
        assert len(node['category']) == 4
        assert 'biolink:Gene' in node['category']
        assert 'biolink:GenomicEntity' in node['category']
        assert 'biolink:NamedThing' in node['category']
        assert node['name'] == 'Test Gene 123'
        assert node['description'] == 'This is a Test Gene 123'
        assert 'Test Dataset' in node['provided_by']
Exemplo n.º 5
0
def test_rdf_transform3():
    """
    Test parsing an RDF N-triple and round-trip.
    """
    nt_source = os.path.join(RESOURCE_DIR, "rdf", "test1.nt")
    nt_export = os.path.join(TARGET_DIR, "test1-export.nt")

    t1 = Transformer()
    t1.transform({"filename": [nt_source], "format": "nt"})
    assert t1.store.graph.number_of_nodes() == 2
    assert t1.store.graph.number_of_edges() == 1

    t1.save({"filename": nt_export, "format": "nt"})

    t2 = Transformer()
    t2.transform({"filename": [nt_export], "format": "nt"})
    assert t2.store.graph.number_of_nodes() == 2
    assert t2.store.graph.number_of_edges() == 1

    gene = "ENSEMBL:ENSG0000000000001"
    n1t1 = t1.store.graph.nodes()[gene]
    n1t2 = t2.store.graph.nodes()[gene]
    # NOTE(review): also read from t2 (identical to n1t2) — looks like a
    # leftover from a third round-trip; verify against upstream history.
    n1t3 = t2.store.graph.nodes()[gene]

    def check_node(node):
        # Properties the round-trip must preserve on every copy.
        assert node["type"] == "SO:0000704"
        assert len(node["category"]) == 4
        assert "biolink:Gene" in node["category"]
        assert "biolink:GenomicEntity" in node["category"]
        assert "biolink:NamedThing" in node["category"]
        assert node["name"] == "Test Gene 123"
        assert node["description"] == "This is a Test Gene 123"
        assert "Test Dataset" in node["provided_by"]

    check_node(n1t1)
    check_node(n1t2)
    check_node(n1t3)
Exemplo n.º 6
0
def parse_source(
    key: str,
    source: dict,
    output_directory: str,
    prefix_map: Optional[Dict[str, str]] = None,
    node_property_predicates: Optional[Set[str]] = None,
    predicate_mappings: Optional[Dict[str, str]] = None,
    checkpoint: bool = False,
) -> Sink:
    """
    Parse a source from a merge config YAML.

    Parameters
    ----------
    key: str
        Source key
    source: Dict
        Source configuration
    output_directory: str
        Location to write output to
    prefix_map: Optional[Dict[str, str]]
        Non-canonical CURIE mappings
    node_property_predicates: Optional[Set[str]]
        A set of predicates that ought to be treated as node properties (This is applicable for RDF)
    predicate_mappings: Optional[Dict[str, str]]
        A mapping of predicate IRIs to property names (This is applicable for RDF)
    checkpoint: bool
        Whether to serialize each individual source to a TSV

    Returns
    -------
    kgx.sink.sink.Sink
        Returns an instance of Sink

    """
    log.info(f"Processing source '{key}'")
    if not key:
        # Fall back to the first input filename when no explicit key was given
        key = os.path.basename(source['input']['filename'][0])
    input_args = prepare_input_args(key, source, output_directory, prefix_map,
                                    node_property_predicates,
                                    predicate_mappings)
    transformer = Transformer()
    transformer.transform(input_args)
    transformer.store.graph.name = key
    if checkpoint:
        log.info(f"Writing checkpoint for source '{key}'")
        checkpoint_output = f"{output_directory}/{key}" if output_directory else key
        transformer.save({'filename': checkpoint_output, 'format': 'tsv'})
    return transformer.store
Exemplo n.º 7
0
def test_csv_to_neo4j_load_to_graph_transform(clean_database):
    """
    Test to load a csv KGX file into Neo4j.
    """
    logger.debug("test_csv_to_neo4j_load...")
    t1 = Transformer()
    t1.transform({
        "filename": [
            os.path.join(RESOURCE_DIR, "cm_nodes.csv"),
            os.path.join(RESOURCE_DIR, "cm_edges.csv"),
        ],
        "format": "csv",
    })

    t1.save({
        "uri": DEFAULT_NEO4J_URL,
        "username": DEFAULT_NEO4J_USERNAME,
        "password": DEFAULT_NEO4J_PASSWORD,
        "format": "neo4j",
    })

    # Continue sequentially to test read from Neo4j to write out back to CSV.
    logger.debug("test_neo4j_to_graph_transform")
    neo4j_args = {
        "uri": DEFAULT_NEO4J_URL,
        "username": DEFAULT_NEO4J_USERNAME,
        "password": DEFAULT_NEO4J_PASSWORD,
        "format": "neo4j",
    }
    output_filename = os.path.join(TARGET_DIR, "neo_graph")
    t = Transformer()
    t.transform(neo4j_args, {"filename": output_filename, "format": "csv"})
    assert t.store.graph.number_of_nodes() == 10
    assert t.store.graph.number_of_edges() == 11
    assert os.path.exists(f"{output_filename}_nodes.csv")
    assert os.path.exists(f"{output_filename}_edges.csv")
Exemplo n.º 8
0
def test_csv_to_neo_load():
    """
    Test to load a CSV to Neo4j.
    """
    nodes_path = os.path.join(RESOURCE_DIR, 'cm_nodes.csv')
    edges_path = os.path.join(RESOURCE_DIR, 'cm_edges.csv')

    # Parse the node and edge CSVs, then persist them into Neo4j.
    loader = Transformer()
    loader.transform({'filename': [nodes_path, edges_path], 'format': 'csv'})
    loader.save({
        'uri': DEFAULT_NEO4J_URL,
        'username': DEFAULT_NEO4J_USERNAME,
        'password': DEFAULT_NEO4J_PASSWORD,
        'format': 'neo4j',
    })
Exemplo n.º 9
0
def _transform(query):
    """
    Transform an input to an output via Transformer.
    """
    expected_nodes = query[2]
    expected_edges = query[3]

    t1 = Transformer()
    t1.transform(query[0])
    t1.save(query[1].copy())

    assert t1.store.graph.number_of_nodes() == expected_nodes
    assert t1.store.graph.number_of_edges() == expected_edges

    # Re-read whatever was just written, to verify the round-trip.
    output = query[1]
    fmt = output['format']
    if fmt in {'tsv', 'csv', 'jsonl'}:
        # Tabular formats are written as separate node and edge files.
        input_args = {
            'filename': [
                f"{output['filename']}_nodes.{fmt}",
                f"{output['filename']}_edges.{fmt}",
            ],
            'format': fmt,
        }
    elif fmt == 'neo4j':
        input_args = {
            'uri': DEFAULT_NEO4J_URL,
            'username': DEFAULT_NEO4J_USERNAME,
            'password': DEFAULT_NEO4J_PASSWORD,
            'format': 'neo4j',
        }
    else:
        input_args = {
            'filename': [f"{output['filename']}"],
            'format': fmt,
        }

    t2 = Transformer()
    t2.transform(input_args)

    assert t2.store.graph.number_of_nodes() == expected_nodes
    assert t2.store.graph.number_of_edges() == expected_edges
Exemplo n.º 10
0
def _transform(query):
    """
    Transform an input to an output via Transformer.
    """
    source_args = query[0]
    sink_args = query[1]
    expected_nodes = query[2]
    expected_edges = query[3]

    first = Transformer()
    first.transform(source_args)
    first.save(sink_args.copy())

    assert first.store.graph.number_of_nodes() == expected_nodes
    assert first.store.graph.number_of_edges() == expected_edges

    # Build arguments to re-read the freshly written output.
    out_format = sink_args["format"]
    if out_format in {"tsv", "csv", "jsonl"}:
        # Tabular formats split into separate node and edge files.
        base = sink_args["filename"]
        reread_args = {
            "filename": [
                f"{base}_nodes.{out_format}",
                f"{base}_edges.{out_format}",
            ],
            "format": out_format,
        }
    elif out_format == "neo4j":
        reread_args = {
            "uri": DEFAULT_NEO4J_URL,
            "username": DEFAULT_NEO4J_USERNAME,
            "password": DEFAULT_NEO4J_PASSWORD,
            "format": "neo4j",
        }
    else:
        reread_args = {
            "filename": [f"{sink_args['filename']}"],
            "format": out_format,
        }

    second = Transformer()
    second.transform(reread_args)

    assert second.store.graph.number_of_nodes() == expected_nodes
    assert second.store.graph.number_of_edges() == expected_edges
Exemplo n.º 11
0
def test_rdf_transform5():
    """
    Parse an RDF N-Triple and round-trip, with user defined node property predicates
    and export property types.
    """
    prop_names = ['fusion', 'homology', 'combined_score', 'cooccurence']
    node_property_predicates = {
        f"https://www.example.org/UNKNOWN/{name}" for name in prop_names
    }
    # Export all of these predicates typed as floats.
    property_types = {iri: 'xsd:float' for iri in node_property_predicates}

    export_path = os.path.join(TARGET_DIR, 'test3-export.nt')

    t1 = Transformer()
    t1.transform({
        'filename': [os.path.join(RESOURCE_DIR, 'rdf', 'test3.nt')],
        'format': 'nt',
        'node_property_predicates': node_property_predicates,
    })
    assert t1.store.graph.number_of_nodes() == 7
    assert t1.store.graph.number_of_edges() == 6

    t1.save({
        'filename': export_path,
        'format': 'nt',
        'property_types': property_types,
    })

    t2 = Transformer()
    t2.transform({'filename': [export_path], 'format': 'nt'})
    assert t2.store.graph.number_of_nodes() == 7
    assert t2.store.graph.number_of_edges() == 6

    gene = 'ENSEMBL:ENSG0000000000001'
    n1t1 = t1.store.graph.nodes()[gene]
    n1t2 = t2.store.graph.nodes()[gene]

    # The gene node must survive the round-trip unchanged.
    for node in (n1t1, n1t2):
        assert node['type'] == 'SO:0000704'
        assert len(node['category']) == 4
        assert 'biolink:Gene' in node['category']
        assert 'biolink:GenomicEntity' in node['category']
        assert 'biolink:NamedThing' in node['category']
        assert node['name'] == 'Test Gene 123'
        assert node['description'] == 'This is a Test Gene 123'
        assert 'Test Dataset' in node['provided_by']

    subj = 'ENSEMBL:ENSP0000000000001'
    obj = 'ENSEMBL:ENSP0000000000002'
    e1t1 = list(t1.store.graph.get_edge(subj, obj).values())[0]
    e1t2 = list(t2.store.graph.get_edge(subj, obj).values())[0]

    for edge in (e1t1, e1t2):
        assert edge['subject'] == subj
        assert edge['object'] == obj
        assert edge['predicate'] == 'biolink:interacts_with'
        assert edge['relation'] == 'biolink:interacts_with'
        assert edge['type'] == 'biolink:Association'
        assert edge['id'] == 'urn:uuid:fcf76807-f909-4ccb-b40a-3b79b49aa518'
    # Exported property types coerce these values to floats on re-parse.
    assert e1t2['fusion'] == 0.0
    assert e1t2['homology'] == 0.0
    assert e1t2['combined_score'] == 490.0
    assert e1t2['cooccurence'] == 332.0
Exemplo n.º 12
0
def test_rdf_transform2():
    """
    Test parsing an RDF N-triple, with user defined prefix map,
    node property predicates, and predicate mappings.

    The source is parsed, exported as N-Triples, then re-parsed twice to
    verify the round-trip preserves nodes, edges, and mapped properties.
    """
    prefix_map = {
        'HGNC':
        'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/',
        'OMIM': 'http://omim.org/entry/',
    }
    # Predicates whose values should be folded in as element properties
    node_property_predicates = {
        'http://purl.obolibrary.org/obo/RO_0002558',
        'http://purl.org/dc/elements/1.1/source',
        'https://monarchinitiative.org/frequencyOfPhenotype',
    }
    # Map predicate IRIs to friendly property names
    predicate_mappings = {
        'http://purl.org/dc/elements/1.1/source':
        'source',
        'https://monarchinitiative.org/frequencyOfPhenotype':
        'frequency_of_phenotype',
    }
    input_args1 = {
        'filename': [os.path.join(RESOURCE_DIR, 'rdf', 'oban-test.nt')],
        'format': 'nt',
        'prefix_map': prefix_map,
        'node_property_predicates': node_property_predicates,
        'predicate_mappings': predicate_mappings,
    }
    t1 = Transformer()
    t1.transform(input_args1)

    assert t1.store.graph.number_of_nodes() == 14
    assert t1.store.graph.number_of_edges() == 7

    n1t1 = t1.store.graph.nodes()['HP:0000505']
    assert len(n1t1['category']) == 1
    assert 'biolink:NamedThing' in n1t1['category']

    e1t1 = list(t1.store.graph.get_edge('OMIM:166400',
                                        'HP:0000006').values())[0]
    assert e1t1['subject'] == 'OMIM:166400'
    assert e1t1['object'] == 'HP:0000006'
    assert e1t1['relation'] == 'RO:0000091'
    assert e1t1['type'] == 'OBAN:association'
    assert e1t1['has_evidence'] == 'ECO:0000501'
    assert e1t1['source'] == 'OMIM:166400'

    e2t1 = list(t1.store.graph.get_edge('ORPHA:93262',
                                        'HP:0000505').values())[0]
    assert e2t1['subject'] == 'ORPHA:93262'
    assert e2t1['object'] == 'HP:0000505'
    assert e2t1['relation'] == 'RO:0002200'
    assert e2t1['type'] == 'OBAN:association'
    assert e2t1['frequency_of_phenotype'] == 'HP:0040283'
    assert e2t1['source'] == 'ORPHA:93262'

    property_types = {
        'frequency_of_phenotype': 'uriorcurie',
        'source': 'uriorcurie'
    }
    output_args1 = {
        'filename': os.path.join(TARGET_DIR, 'oban-export.nt'),
        'format': 'nt',
        'property_types': property_types,
    }
    t1.save(output_args1)

    input_args2 = {
        'filename': [os.path.join(TARGET_DIR, 'oban-export.nt')],
        'format': 'nt'
    }
    t2 = Transformer()
    t2.transform(input_args2)
    assert t2.store.graph.number_of_nodes() == 14
    assert t2.store.graph.number_of_edges() == 7

    n1t2 = t2.store.graph.nodes()['HP:0000505']
    assert len(n1t2['category']) == 1
    assert 'biolink:NamedThing' in n1t2['category']

    e1t2 = list(t2.store.graph.get_edge('OMIM:166400',
                                        'HP:0000006').values())[0]
    assert e1t2['subject'] == 'OMIM:166400'
    assert e1t2['object'] == 'HP:0000006'
    assert e1t2['relation'] == 'RO:0000091'
    assert e1t2['type'] == 'biolink:Association'
    assert e1t2['has_evidence'] == 'ECO:0000501'
    assert e1t2['source'] == 'OMIM:166400'

    e2t2 = list(t2.store.graph.get_edge('ORPHA:93262',
                                        'HP:0000505').values())[0]
    assert e2t2['subject'] == 'ORPHA:93262'
    assert e2t2['object'] == 'HP:0000505'
    assert e2t2['relation'] == 'RO:0002200'
    assert e2t2['type'] == 'biolink:Association'
    assert e2t2['frequency_of_phenotype'] == 'HP:0040283'
    assert e2t2['source'] == 'ORPHA:93262'

    input_args3 = {
        'filename': [os.path.join(TARGET_DIR, 'oban-export.nt')],
        'format': 'nt'
    }
    t3 = Transformer()
    t3.transform(input_args3)
    assert t3.store.graph.number_of_nodes() == 14
    assert t3.store.graph.number_of_edges() == 7

    # Fixed: previously read from t1, leaving t3's node unchecked.
    n1t3 = t3.store.graph.nodes()['HP:0000505']
    assert len(n1t3['category']) == 1
    assert 'biolink:NamedThing' in n1t3['category']

    e1t3 = list(t3.store.graph.get_edge('OMIM:166400',
                                        'HP:0000006').values())[0]
    assert e1t3['subject'] == 'OMIM:166400'
    assert e1t3['object'] == 'HP:0000006'
    assert e1t3['relation'] == 'RO:0000091'
    assert e1t3['type'] == 'biolink:Association'
    assert e1t3['has_evidence'] == 'ECO:0000501'
    assert e1t3['source'] == 'OMIM:166400'

    e2t3 = list(t3.store.graph.get_edge('ORPHA:93262',
                                        'HP:0000505').values())[0]
    assert e2t3['subject'] == 'ORPHA:93262'
    assert e2t3['object'] == 'HP:0000505'
    assert e2t3['relation'] == 'RO:0002200'
    assert e2t3['type'] == 'biolink:Association'
    assert e2t3['frequency_of_phenotype'] == 'HP:0040283'
    assert e2t3['source'] == 'ORPHA:93262'
Exemplo n.º 13
0
def test_rdf_transform5():
    """
    Parse an RDF N-Triple and round-trip, with user defined node property predicates
    and export property types.
    """
    edge_props = ["fusion", "homology", "combined_score", "cooccurence"]
    node_property_predicates = {
        f"https://www.example.org/UNKNOWN/{prop}" for prop in edge_props
    }
    # Export all of these predicates typed as floats.
    property_types = {iri: "xsd:float" for iri in node_property_predicates}

    export_path = os.path.join(TARGET_DIR, "test3-export.nt")

    t1 = Transformer()
    t1.transform({
        "filename": [os.path.join(RESOURCE_DIR, "rdf", "test3.nt")],
        "format": "nt",
        "node_property_predicates": node_property_predicates,
    })
    assert t1.store.graph.number_of_nodes() == 7
    assert t1.store.graph.number_of_edges() == 6

    t1.save({
        "filename": export_path,
        "format": "nt",
        "property_types": property_types,
    })

    t2 = Transformer()
    t2.transform({"filename": [export_path], "format": "nt"})
    assert t2.store.graph.number_of_nodes() == 7
    assert t2.store.graph.number_of_edges() == 6

    gene = "ENSEMBL:ENSG0000000000001"
    n1t1 = t1.store.graph.nodes()[gene]
    n1t2 = t2.store.graph.nodes()[gene]

    # The gene node must survive the round-trip unchanged.
    for node in (n1t1, n1t2):
        assert node["type"] == "SO:0000704"
        assert len(node["category"]) == 4
        assert "biolink:Gene" in node["category"]
        assert "biolink:GenomicEntity" in node["category"]
        assert "biolink:NamedThing" in node["category"]
        assert node["name"] == "Test Gene 123"
        assert node["description"] == "This is a Test Gene 123"
        assert "Test Dataset" in node["provided_by"]

    subj = "ENSEMBL:ENSP0000000000001"
    obj = "ENSEMBL:ENSP0000000000002"
    e1t1 = list(t1.store.graph.get_edge(subj, obj).values())[0]
    e1t2 = list(t2.store.graph.get_edge(subj, obj).values())[0]

    for edge in (e1t1, e1t2):
        assert edge["subject"] == subj
        assert edge["object"] == obj
        assert edge["predicate"] == "biolink:interacts_with"
        assert edge["relation"] == "biolink:interacts_with"
        assert edge["type"] == "biolink:Association"
        assert edge["id"] == "urn:uuid:fcf76807-f909-4ccb-b40a-3b79b49aa518"
        assert "test3.nt" in edge["knowledge_source"]
    # Exported property types coerce these values to floats on re-parse.
    assert e1t2["fusion"] == 0.0
    assert e1t2["homology"] == 0.0
    assert e1t2["combined_score"] == 490.0
    assert e1t2["cooccurence"] == 332.0
Exemplo n.º 14
0
def test_rdf_transform2():
    """
    Test parsing an RDF N-triple, with user defined prefix map,
    node property predicates, and predicate mappings.

    The source is parsed, exported as N-Triples, then re-parsed twice to
    verify the round-trip preserves nodes, edges, and mapped properties.
    """
    prefix_map = {
        "HGNC":
        "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/",
        "OMIM": "http://omim.org/entry/",
    }
    # Predicates whose values should be folded in as element properties
    node_property_predicates = {
        "http://purl.obolibrary.org/obo/RO_0002558",
        "http://purl.org/dc/elements/1.1/source",
        "https://monarchinitiative.org/frequencyOfPhenotype",
    }
    # Map predicate IRIs to friendly property names
    predicate_mappings = {
        "http://purl.org/dc/elements/1.1/source":
        "source",
        "https://monarchinitiative.org/frequencyOfPhenotype":
        "frequency_of_phenotype",
    }
    input_args1 = {
        "filename": [os.path.join(RESOURCE_DIR, "rdf", "oban-test.nt")],
        "format": "nt",
        "prefix_map": prefix_map,
        "node_property_predicates": node_property_predicates,
        "predicate_mappings": predicate_mappings,
    }
    t1 = Transformer()
    t1.transform(input_args1)

    assert t1.store.graph.number_of_nodes() == 14
    assert t1.store.graph.number_of_edges() == 7

    n1t1 = t1.store.graph.nodes()["HP:0000505"]
    assert len(n1t1["category"]) == 1
    assert "biolink:NamedThing" in n1t1["category"]

    e1t1 = list(t1.store.graph.get_edge("OMIM:166400",
                                        "HP:0000006").values())[0]
    assert e1t1["subject"] == "OMIM:166400"
    assert e1t1["object"] == "HP:0000006"
    assert e1t1["relation"] == "RO:0000091"
    assert e1t1["type"] == "OBAN:association"
    assert e1t1["has_evidence"] == "ECO:0000501"
    assert e1t1["source"] == "OMIM:166400"

    e2t1 = list(t1.store.graph.get_edge("ORPHA:93262",
                                        "HP:0000505").values())[0]
    assert e2t1["subject"] == "ORPHA:93262"
    assert e2t1["object"] == "HP:0000505"
    assert e2t1["relation"] == "RO:0002200"
    assert e2t1["type"] == "OBAN:association"
    assert e2t1["frequency_of_phenotype"] == "HP:0040283"
    assert e2t1["source"] == "ORPHA:93262"

    property_types = {
        "frequency_of_phenotype": "uriorcurie",
        "source": "uriorcurie"
    }
    output_args1 = {
        "filename": os.path.join(TARGET_DIR, "oban-export.nt"),
        "format": "nt",
        "property_types": property_types,
    }
    t1.save(output_args1)

    input_args2 = {
        "filename": [os.path.join(TARGET_DIR, "oban-export.nt")],
        "format": "nt",
    }
    t2 = Transformer()
    t2.transform(input_args2)
    assert t2.store.graph.number_of_nodes() == 14
    assert t2.store.graph.number_of_edges() == 7

    n1t2 = t2.store.graph.nodes()["HP:0000505"]
    assert len(n1t2["category"]) == 1
    assert "biolink:NamedThing" in n1t2["category"]

    e1t2 = list(t2.store.graph.get_edge("OMIM:166400",
                                        "HP:0000006").values())[0]
    assert e1t2["subject"] == "OMIM:166400"
    assert e1t2["object"] == "HP:0000006"
    assert e1t2["relation"] == "RO:0000091"
    assert e1t2["type"] == "biolink:Association"
    assert e1t2["has_evidence"] == "ECO:0000501"
    assert e1t2["source"] == "OMIM:166400"

    e2t2 = list(t2.store.graph.get_edge("ORPHA:93262",
                                        "HP:0000505").values())[0]
    assert e2t2["subject"] == "ORPHA:93262"
    assert e2t2["object"] == "HP:0000505"
    assert e2t2["relation"] == "RO:0002200"
    assert e2t2["type"] == "biolink:Association"
    assert e2t2["frequency_of_phenotype"] == "HP:0040283"
    assert e2t2["source"] == "ORPHA:93262"

    input_args3 = {
        "filename": [os.path.join(TARGET_DIR, "oban-export.nt")],
        "format": "nt",
    }
    t3 = Transformer()
    t3.transform(input_args3)
    assert t3.store.graph.number_of_nodes() == 14
    assert t3.store.graph.number_of_edges() == 7

    # Fixed: previously read from t1, leaving t3's node unchecked.
    n1t3 = t3.store.graph.nodes()["HP:0000505"]
    assert len(n1t3["category"]) == 1
    assert "biolink:NamedThing" in n1t3["category"]

    e1t3 = list(t3.store.graph.get_edge("OMIM:166400",
                                        "HP:0000006").values())[0]
    assert e1t3["subject"] == "OMIM:166400"
    assert e1t3["object"] == "HP:0000006"
    assert e1t3["relation"] == "RO:0000091"
    assert e1t3["type"] == "biolink:Association"
    assert e1t3["has_evidence"] == "ECO:0000501"
    assert e1t3["source"] == "OMIM:166400"

    e2t3 = list(t3.store.graph.get_edge("ORPHA:93262",
                                        "HP:0000505").values())[0]
    assert e2t3["subject"] == "ORPHA:93262"
    assert e2t3["object"] == "HP:0000505"
    assert e2t3["relation"] == "RO:0002200"
    assert e2t3["type"] == "biolink:Association"
    assert e2t3["frequency_of_phenotype"] == "HP:0040283"
    assert e2t3["source"] == "ORPHA:93262"