Example #1
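Conversion without ID remapping: remap_ids=False keeps the original node and relation IDs, so the test supplies num_nodes and num_rels to the converter explicitly.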
    def test_no_remap(self):

        output_dir = Path(TMP_TEST_DIR) / Path("test_no_remap")
        output_dir.mkdir()

        converter = TorchEdgeListConverter(
            output_dir=output_dir,
            train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
            delim=" ",
            remap_ids=False,
            num_nodes=100,
            num_rels=10
        )

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 1000
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 1000

        validate_output_dir(output_dir=output_dir,
                            expected_stats=expected_stats,
                            dtype=np.int32,
                            remap_ids=False)
Example #2
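Custom delimiter: the space-delimited edge list is rewritten as a comma-separated file, which is then converted with delim=",".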
    def test_delim(self):

        output_dir = Path(TMP_TEST_DIR) / Path("test_delim")
        output_dir.mkdir()

        # rewrite the space-delimited edge list as a comma-separated file
        tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
        tmp.to_csv(Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"), header=False, sep=",", index=False)

        converter = TorchEdgeListConverter(
            output_dir=output_dir,
            train_edges=Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"),
            delim=","
        )

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 1000
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 1000

        validate_output_dir(output_dir=output_dir,
                            expected_stats=expected_stats,
                            dtype=np.int32,
                            remap_ids=True)
Example #3
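Dataset splits: splits=[.9, .05, .05] divides the 1000 input edges into 900 train, 50 validation, and 50 test edges, as reflected in the expected statistics.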
    def test_splits(self):
        output_dir = Path(TMP_TEST_DIR) / Path("test_splits")
        output_dir.mkdir()

        converter = TorchEdgeListConverter(
            output_dir=output_dir,
            train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
            delim=" ",
            splits=[.9, .05, .05]
        )

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 900
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 900
        expected_stats.num_valid = 50
        expected_stats.num_test = 50

        validate_output_dir(output_dir=output_dir,
                            expected_stats=expected_stats,
                            dtype=np.int32,
                            remap_ids=True)
Example #4
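In-memory PyTorch input: the edge list is loaded into a torch tensor and passed to the converter directly with format="pytorch", leaving the remaining options at their defaults.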
    def test_pytorch_defaults(self):
        output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults")
        output_dir.mkdir()

        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")

        train_edges = torch.tensor(train_edges_df.to_numpy())

        converter = TorchEdgeListConverter(
            output_dir=output_dir,
            train_edges=train_edges,
            format="pytorch"
        )

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 1000
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 1000

        validate_output_dir(output_dir=output_dir,
                            expected_stats=expected_stats,
                            dtype=np.int32,
                            remap_ids=True)
Example #5
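A helper that generates a random link-prediction dataset: it writes a random edge list to CSV, runs the converter, and, when feature_dim is set, also writes node features (remapped if IDs were remapped) and an updated dataset.yaml.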
def generate_random_dataset_lp(output_dir,
                               num_nodes,
                               num_edges,
                               num_rels=1,
                               splits=None,
                               num_partitions=1,
                               partitioned_eval=False,
                               sequential_train_nodes=False,
                               remap_ids=True,
                               feature_dim=-1):
    edges = get_random_graph(num_nodes, num_edges, num_rels)
    edges_df = pd.DataFrame(data=edges)

    # a third column means the edges carry relation types
    if edges.shape[1] == 3:
        columns = [0, 1, 2]
    else:
        columns = [0, 1]

    raw_edges_filename = output_dir / Path("raw_edges.csv")

    edges_df.to_csv(raw_edges_filename, sep=",", header=False, index=False)

    converter = TorchEdgeListConverter(
        output_dir,
        train_edges=raw_edges_filename,
        delim=",",
        splits=splits,
        num_partitions=num_partitions,
        remap_ids=remap_ids,
        columns=columns,
        partitioned_evaluation=partitioned_eval,
        sequential_train_nodes=sequential_train_nodes,
        format="CSV")

    dataset_stats = converter.convert()

    if feature_dim != -1:
        features = generate_features(num_nodes, feature_dim)

        if remap_ids:
            features = remap_lp(output_dir, features)

        node_features_file = output_dir / Path(
            PathConstants.node_features_path)
        with open(node_features_file, "wb") as f:
            f.write(bytes(features))

        dataset_stats.node_feature_dim = feature_dim
        with open(output_dir / Path("dataset.yaml"), "w") as f:
            f.write(OmegaConf.to_yaml(dataset_stats))
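A minimal usage sketch for this helper; the directory name and sizes below are illustrative, and TMP_TEST_DIR is assumed to exist as in the tests above:

output_dir = Path(TMP_TEST_DIR) / Path("random_lp")  # hypothetical location
output_dir.mkdir()
generate_random_dataset_lp(output_dir,
                           num_nodes=100,
                           num_edges=1000,
                           num_rels=10,
                           splits=[.9, .05, .05],
                           feature_dim=32)  # feature_dim != -1 also writes node features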
Example #6
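A helper that generates a random node-classification dataset: besides converting the random edge list, it writes labels, optional node features, and train/valid/test node splits, then records the final statistics in dataset.yaml.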
def generate_random_dataset_nc(output_dir,
                               num_nodes,
                               num_edges,
                               num_rels=1,
                               splits=None,
                               num_partitions=1,
                               partitioned_eval=False,
                               sequential_train_nodes=False,
                               remap_ids=True,
                               feature_dim=-1,
                               num_classes=10):
    edges = get_random_graph(num_nodes, num_edges, num_rels)
    edges_df = pd.DataFrame(data=edges)

    if edges.shape[1] == 3:
        columns = [0, 1, 2]
    else:
        columns = [0, 1]

    raw_edges_filename = output_dir / Path("raw_edges.csv")
    edges_df.to_csv(raw_edges_filename, sep=",", header=False, index=False)

    all_nodes = np.arange(0, num_nodes, dtype=np.int32)
    train_nodes = all_nodes

    valid_nodes = None
    test_nodes = None
    if splits is not None:
        # reuse the edge-splitting helper to split the node ID array
        train_nodes, valid_nodes, test_nodes = split_edges(all_nodes, splits)

    converter = TorchEdgeListConverter(
        output_dir,
        train_edges=raw_edges_filename,
        delim=",",
        remap_ids=remap_ids,
        num_partitions=num_partitions,
        columns=columns,
        partitioned_evaluation=partitioned_eval,
        sequential_train_nodes=sequential_train_nodes,
        known_node_ids=[train_nodes, valid_nodes, test_nodes],
        format="CSV")

    dataset_stats = converter.convert()

    features = None
    if feature_dim != -1:
        features = generate_features(num_nodes, feature_dim)

    labels = generate_labels(num_nodes, num_classes)

    train_nodes, labels, valid_nodes, test_nodes, features = remap_nc(
        output_dir, train_nodes, labels, num_nodes, valid_nodes, test_nodes,
        features)

    if features is not None:
        node_features_file = output_dir / Path(
            PathConstants.node_features_path)
        with open(node_features_file, "wb") as f:
            f.write(bytes(features))

    labels_file = output_dir / Path(PathConstants.labels_path)
    with open(labels_file, "wb") as f:
        f.write(bytes(labels))

    if train_nodes is not None:
        train_nodes_file = output_dir / Path(PathConstants.train_nodes_path)
        with open(train_nodes_file, "wb") as f:
            f.write(bytes(train_nodes))

    if valid_nodes is not None:
        valid_nodes_file = output_dir / Path(PathConstants.valid_nodes_path)
        with open(valid_nodes_file, "wb") as f:
            f.write(bytes(valid_nodes))

    if test_nodes is not None:
        test_nodes_file = output_dir / Path(PathConstants.test_nodes_path)
        with open(test_nodes_file, "wb") as f:
            f.write(bytes(test_nodes))

    # update dataset yaml
    dataset_stats.num_train = train_nodes.shape[0]

    if valid_nodes is not None:
        dataset_stats.num_valid = valid_nodes.shape[0]
    else:
        dataset_stats.num_valid = -1

    if test_nodes is not None:
        dataset_stats.num_test = test_nodes.shape[0]
    else:
        dataset_stats.num_test = -1

    if features is not None:
        dataset_stats.node_feature_dim = features.shape[1]
    else:
        dataset_stats.node_feature_dim = -1

    dataset_stats.num_classes = num_classes

    dataset_stats.num_nodes = num_nodes

    with open(output_dir / Path("dataset.yaml"), "w") as f:
        f.write(OmegaConf.to_yaml(dataset_stats))
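Likewise, a hypothetical invocation of the node-classification helper (directory name and sizes are illustrative):

output_dir = Path(TMP_TEST_DIR) / Path("random_nc")  # hypothetical location
output_dir.mkdir()
generate_random_dataset_nc(output_dir,
                           num_nodes=100,
                           num_edges=1000,
                           splits=[.8, .1, .1],
                           feature_dim=32,
                           num_classes=10)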