Пример #1
0
def test_load_ppi() -> None:
    """Check that the PPI graph can be loaded from its edge and node lists.

    The test passes when ``EnsmallenGraph.from_unsorted_csv`` completes
    without raising; the loaded graph is not inspected and nothing is
    returned (pytest test functions are expected to return ``None``).
    """
    EnsmallenGraph.from_unsorted_csv(
        edge_path=os.path.join(ROOT_DIR, "data/ppi/edges.tsv"),
        sources_column="subject",
        destinations_column="object",
        directed=False,
        weights_column="weight",
        # NOTE(review): unlike edge_path, this path is relative to the current
        # working directory instead of ROOT_DIR -- confirm this is intended.
        node_path="./pytests/data/ppi/nodes.tsv",
        nodes_column="id",
        node_types_column="molecular_function",
        default_node_type="Missing",
    )
Пример #2
0
def test_no_existent_column():
    """Check that loading fails with ``ValueError`` on bad column names.

    The same edge and node lists are loaded four times. Each attempt swaps a
    different column argument for ``"NOT A REAL COLUMN"`` and is expected to
    raise ``ValueError``.
    """
    # Arguments that are identical in every attempt.
    shared_settings = {
        "edge_path": "./pytests/data/edges.tsv",
        "sources_column": "subject",
        "directed": False,
        "node_path": "./pytests/data/nodes.tsv",
        "default_edge_type": "biolink:interacts_with",
        "default_node_type": "biolink:NamedThing",
    }

    # NOTE(review): the second and third cases also pass an empty
    # destinations column, so the ValueError may come from that argument
    # rather than from the column under test -- confirm this is intended.
    column_cases = (
        {
            "destinations_column": "NOT A REAL COLUMN",
            "edge_types_column": "edge_label",
            "nodes_column": "id",
            "node_types_column": "category",
        },
        {
            "destinations_column": "",
            "edge_types_column": "NOT A REAL COLUMN",
            "nodes_column": "id",
            "node_types_column": "category",
        },
        {
            "destinations_column": "",
            "edge_types_column": "edge_label",
            "nodes_column": "id",
            "node_types_column": "NOT A REAL COLUMN",
        },
        {
            "destinations_column": "object",
            "edge_types_column": "edge_label",
            "nodes_column": "NOT A REAL COLUMN",
            "node_types_column": "category",
        },
    )

    for columns in column_cases:
        with pytest.raises(ValueError):
            EnsmallenGraph.from_unsorted_csv(**shared_settings, **columns)
Пример #3
0
def load_pathway() -> EnsmallenGraph:
    """Load the Pathway graph and return it after calling ``enable()``.

    The edge list is read from ``./pytests/data/pathway.tsv`` as an
    undirected graph named "Pathway", using the ``Gene_A`` and ``Gene_B``
    columns as sources and destinations.
    """
    pathway_settings = {
        "edge_path": "./pytests/data/pathway.tsv",
        "sources_column": "Gene_A",
        "destinations_column": "Gene_B",
        "directed": False,
        "name": "Pathway",
    }
    pathway = EnsmallenGraph.from_unsorted_csv(**pathway_settings)
    pathway.enable()
    return pathway
Пример #4
0
def load_hpo() -> EnsmallenGraph:
    """Load the HPO graph and return it after calling ``enable()``.

    Edges come from ``./pytests/data/edges.tsv`` and nodes from
    ``./pytests/data/nodes.tsv``; the graph is undirected and named "HPO".
    Default edge and node types are passed to the loader alongside the
    type columns.
    """
    hpo_settings = {
        "edge_path": "./pytests/data/edges.tsv",
        "sources_column": "subject",
        "destinations_column": "object",
        "directed": False,
        "edge_types_column": "edge_label",
        "node_path": "./pytests/data/nodes.tsv",
        "nodes_column": "id",
        "node_types_column": "category",
        "default_edge_type": "biolink:interacts_with",
        "default_node_type": "biolink:NamedThing",
        "name": "HPO",
    }
    hpo = EnsmallenGraph.from_unsorted_csv(**hpo_settings)
    hpo.enable()
    return hpo
Пример #5
0
    c for c in go_yaml['classifier']['classifiers']
    if c['type'] == 'neural network'
][0]['model']['outfile']
# Load the Keras model stored as output_data/<mlp_file>.
mlp = tf.keras.models.load_model(os.path.join("output_data", mlp_file))

# Read the GO node table and keep only its 'id' and 'name' columns.
node_data = pd.read_csv('input_data/go_nodes.tsv', sep='\t')
node_data = node_data.filter(['id', 'name'])

#
# Positive validation edges: load them as a directed graph, build their edge
# embeddings and score them with the model.
#
# NOTE(review): pos_graph_args is the very dict stored in go_yaml, so the two
# assignments below also change go_yaml['graph_data']['graph'] -- confirm that
# later code expects this.
pos_graph_args = go_yaml['graph_data']['graph']
pos_graph_args['directed'] = True
pos_graph_args['edge_path'] = go_yaml['graph_data']['pos_validation'][
    'edge_path']
pos_validation_graph = EnsmallenGraph.from_unsorted_csv(**pos_graph_args)
# (source name, destination name) pairs of the validation graph.
pos_edges = list(
    zip(pos_validation_graph.get_source_names(),
        pos_validation_graph.get_destination_names()))

# Fit the transformer on the embedding array saved in output_data/, then
# transform the validation graph (presumably into one embedding per edge,
# combined with the configured edge method -- TODO confirm).
pos_edge_transform = GraphTransformer(go_yaml['classifier']['edge_method'])
pos_edge_transform.fit(
    np.load(
        os.path.join("output_data",
                     go_yaml['embeddings']['embedding_file_name'])))
pos_edges_to_eval_emb = pos_edge_transform.transform(pos_validation_graph)

# NOTE(review): batch_size=1048 is unusual (1024 intended?) -- confirm.
pos_valid_predict = mlp.predict(pos_edges_to_eval_emb, batch_size=1048)
pos_valid_predict_sorted = pd.DataFrame({
    "pred": pos_valid_predict.flatten(),
    "subject": [t[0] for t in pos_edges],
Пример #6
0
def make_holdouts(nodes: str,
                  edges: str,
                  output_dir: str,
                  train_fraction: float,
                  validation: bool,
                  seed=42) -> None:
    """Build positive and negative edge holdouts and write them to disk.

    See the holdouts command in run.py for the user-facing documentation.
    Bracketed values below are presumably the defaults of that command.

    Args:
        nodes: nodes of the input graph, in KGX TSV format
            [data/merged/nodes.tsv]
        edges: edges of the input graph, in KGX TSV format
            [data/merged/edges.tsv]
        output_dir: directory the edge and node files are written to
            [data/edges/]
        train_fraction: fraction of edges to emit as training
        validation: whether validation edges are produced as well [False]
        seed: random seed [42]

    Returns:
        None.
    """
    def in_output_dir(file_name):
        # Every output file is written directly inside output_dir.
        return os.path.join(output_dir, file_name)

    logging.basicConfig(level=logging.INFO)
    logging.info("Loading graph from nodes %s and edges %s files",
                 nodes, edges)
    loader_settings = {
        "edge_path": edges,
        "sources_column": 'subject',
        "destinations_column": 'object',
        "directed": False,
        "edge_types_column": 'edge_label',
        "default_edge_type": 'biolink:Association',
        "node_path": nodes,
        "nodes_column": 'id',
        "default_node_type": 'biolink:NamedThing',
        "node_types_column": 'category',
    }
    source_graph = EnsmallenGraph.from_unsorted_csv(**loader_settings)

    os.makedirs(output_dir, exist_ok=True)

    # Positive edges: split the input graph, then optionally split the test
    # part once more, half and half, into validation and test.
    logging.info("Making positive edges")
    pos_train, pos_test = source_graph.random_holdout(
        seed=seed, train_percentage=train_fraction)
    if validation:
        pos_valid, pos_test = pos_test.random_holdout(
            seed=seed, train_percentage=0.5)

    # Negative edges: as many as the input graph has edges, sampled from the
    # positive training graph without self-loops, then split the same way.
    logging.info("Making negative edges")
    negatives = pos_train.sample_negatives(
        seed=seed,
        negatives_number=source_graph.get_edges_number(),
        allow_selfloops=False)
    neg_train, neg_test = negatives.random_holdout(
        seed=seed, train_percentage=train_fraction)
    if validation:
        # NOTE(review): here the first returned graph stays "test" and the
        # second becomes "validation", the opposite of the positive split
        # above -- confirm this is intended.
        neg_test, neg_valid = neg_test.random_holdout(
            seed=seed, train_percentage=0.5)

    # Positive outputs; the training graph also gets its node list dumped.
    logging.info("Writing out positive edges")
    pos_train.dump_edges(path=in_output_dir("pos_train_edges.tsv"))
    pos_train.dump_nodes(path=in_output_dir("pos_train_nodes.tsv"))
    pos_test.dump_edges(path=in_output_dir("pos_test_edges.tsv"))
    if validation:
        pos_valid.dump_edges(path=in_output_dir("pos_valid_edges.tsv"))

    # Negative outputs.
    logging.info("Writing out negative edges")
    neg_train.dump_edges(path=in_output_dir("neg_train_edges.tsv"))
    neg_test.dump_edges(path=in_output_dir("neg_test_edges.tsv"))
    if validation:
        neg_valid.dump_edges(path=in_output_dir("neg_valid_edges.tsv"))
Пример #7
0
def sanitize_graph(graph_data: dict, root: str):
    """Convert a graph to a standard format.

    Produces, inside ``root/<folder_name>``, a directed and an undirected
    sanitized edge list plus a JSON and a textual report. When all four
    files already exist the function returns without loading the graph;
    otherwise the graph is loaded and every missing file is written.

    Parameters
    ----------
    graph_data: dict,
        Information about the graph to sanitize. The keys read here are
        "loading_settings" (keyword arguments forwarded to
        ``EnsmallenGraph.from_unsorted_csv``), "folder_name", "edge_file"
        and "graph" (used as the graph name). The mapping and its
        "loading_settings" are left unmodified.
    root: str,
        The working folder. All the files will be read and written from here.
    """
    graph_folder = os.path.join(root, graph_data["folder_name"])

    # Work on a copy: adding "edge_path" / "directed" below must not leak
    # back into the caller's graph_data["loading_settings"].
    kwargs = dict(graph_data["loading_settings"])
    kwargs["edge_path"] = os.path.join(graph_folder, graph_data["edge_file"])
    kwargs.setdefault("directed", False)

    directed_dst_path = os.path.join(graph_folder, "directed_sanitized.tsv")
    undirected_dst_path = os.path.join(graph_folder,
                                       "undirected_sanitized.tsv")
    report_path = os.path.join(graph_folder, "report.json")
    textual_report_path = os.path.join(graph_folder, "report.txt")

    # All outputs are already present: nothing to do, skip loading the graph.
    outputs = (directed_dst_path, undirected_dst_path,
               report_path, textual_report_path)
    if all(os.path.exists(p) for p in outputs):
        return

    logger.info("Loading the file %s", kwargs["edge_path"])
    graph: EnsmallenGraph = EnsmallenGraph.from_unsorted_csv(
        **kwargs, name=graph_data["graph"])
    logger.info("Enabling fast version")
    graph.enable_fast_walk()
    logger.info("Computing metadata")
    if not os.path.exists(report_path):
        logger.info("Computing JSON report")
        report = graph.report()
        compress_json.dump(report, report_path)
    if not os.path.exists(textual_report_path):
        logger.info("Computing textual report")
        textual_report = str(graph)
        with open(textual_report_path, "w") as f:
            f.write(textual_report)

    if not os.path.exists(undirected_dst_path):
        logger.info("Writing the file %s", undirected_dst_path)
        graph.dump_edges(
            path=undirected_dst_path,
            header=False,
            sources_column_number=0,
            destinations_column_number=1,
            weights_column_number=2,
            numeric_node_ids=True,
            # We dump with directed=True for the undirected file so that the
            # file contains both directions of every edge.
            directed=True)
    if not os.path.exists(directed_dst_path):
        logger.info("Writing the file %s", directed_dst_path)
        graph.dump_edges(
            path=directed_dst_path,
            header=False,
            sources_column_number=0,
            destinations_column_number=1,
            weights_column_number=2,
            numeric_node_ids=True,
            # We dump with directed=False for the directed file so that
            # bidirectional edges are not written twice.
            directed=False)
Пример #8
0
# Download the HPO ontology JSON when it is not available locally.
# NOTE(review): wget always writes to "hpo.json"; this assumes hpo_json_file
# points at that same file -- confirm.
if not os.path.exists(hpo_json_file):
    os.system("wget http://purl.obolibrary.org/obo/hp.json -O hpo.json")

# Run the KGX transform when either TSV file is missing.
# NOTE(review): presumably "--output hpo" produces hpo_edges_file and
# hpo_nodes_file -- confirm against their definitions.
if not os.path.exists(hpo_edges_file) or not os.path.exists(hpo_nodes_file):
    os.system("kgx transform --input-format obojson --output-format tsv --output hpo hpo.json")

# The edge type of interest; a directory with the same name is created.
# NOTE(review): the name contains ':', which is not a valid directory-name
# character on every platform -- confirm this is acceptable.
edge = 'biolink:subclass_of'
edges_string = edge
os.makedirs(edges_string, exist_ok=True)

# Load the HPO graph as undirected from the edge and node TSV files.
graph = EnsmallenGraph.from_unsorted_csv(
    edge_path=hpo_edges_file,
    sources_column="subject",
    destinations_column="object",
    edge_types_column='edge_label',
    directed=False,
    node_path=hpo_nodes_file,
    nodes_column='id',
    node_types_column='category',
    default_node_type='biolink:NamedThing'
)

# Remove singletons, then split into training and validation graphs using
# only the selected edge type (presumably restricting which edges may be
# held out -- TODO confirm against the library documentation).
reduced_graph = graph.remove(singletons=True)
pos_training, pos_validation = reduced_graph.connected_holdout(
    train_size=train_percentage,
    edge_types=[edge],
    random_state=seed)

# make negative graph
neg_training, neg_validation = reduced_graph.sample_negatives(
   random_state=seed,