def test_no_remap(self):
    output_dir = Path(TMP_TEST_DIR) / Path("test_no_remap")
    output_dir.mkdir()

    # With remap_ids=False the converter does not scan the input to build an
    # ID mapping, so num_nodes and num_rels must be supplied explicitly.
    converter = TorchEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
        delim=" ",
        remap_ids=False,
        num_nodes=100,
        num_rels=10
    )
    converter.convert()

    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 1000
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 1000

    validate_output_dir(output_dir=output_dir,
                        expected_stats=expected_stats,
                        dtype=np.int32,
                        remap_ids=False)
def test_delim(self):
    output_dir = Path(TMP_TEST_DIR) / Path("test_delim")
    output_dir.mkdir()

    # Rewrite the space-delimited edge list as a comma-delimited file.
    tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"),
                      header=None, sep=" ")
    tmp.to_csv(Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"),
               sep=",", header=False, index=False)

    converter = TorchEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"),
        delim=","
    )
    converter.convert()

    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 1000
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 1000

    validate_output_dir(output_dir=output_dir,
                        expected_stats=expected_stats,
                        dtype=np.int32,
                        remap_ids=True)
def test_splits(self):
    output_dir = Path(TMP_TEST_DIR) / Path("test_splits")
    output_dir.mkdir()

    # A 90/5/5 split of the 1000 input edges yields 900 train, 50 valid,
    # and 50 test edges.
    converter = TorchEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
        delim=" ",
        splits=[.9, .05, .05]
    )
    converter.convert()

    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 900
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 900
    expected_stats.num_valid = 50
    expected_stats.num_test = 50

    validate_output_dir(output_dir=output_dir,
                        expected_stats=expected_stats,
                        dtype=np.int32,
                        remap_ids=True)
def test_pytorch_defaults(self):
    output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults")
    output_dir.mkdir()

    # Load the edge list into an in-memory tensor and convert it directly,
    # rather than pointing the converter at a file on disk.
    train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"),
                                 header=None, sep=" ")
    train_edges = torch.tensor(train_edges_df.to_numpy())

    converter = TorchEdgeListConverter(
        output_dir=output_dir,
        train_edges=train_edges,
        format="pytorch"
    )
    converter.convert()

    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 1000
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 1000

    validate_output_dir(output_dir=output_dir,
                        expected_stats=expected_stats,
                        dtype=np.int32,
                        remap_ids=True)
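
# Illustrative sketch, not part of the test suite: the minimal end-to-end
# conversion pattern the tests above exercise, with all defaults. The
# directory name "example_convert" is arbitrary; TMP_TEST_DIR and the input
# file mirror the fixtures used by the tests.
def example_convert_edge_list():
    output_dir = Path(TMP_TEST_DIR) / Path("example_convert")
    output_dir.mkdir()

    converter = TorchEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
        delim=" "
    )
    # convert() writes the preprocessed dataset into output_dir and returns
    # its statistics.
    return converter.convert()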
def generate_random_dataset_lp(output_dir,
                               num_nodes,
                               num_edges,
                               num_rels=1,
                               splits=None,
                               num_partitions=1,
                               partitioned_eval=False,
                               sequential_train_nodes=False,
                               remap_ids=True,
                               feature_dim=-1):
    edges = get_random_graph(num_nodes, num_edges, num_rels)
    edges_df = pd.DataFrame(data=edges)

    # Edge lists are either [src, rel, dst] triples or [src, dst] pairs.
    if edges.shape[1] == 3:
        columns = [0, 1, 2]
    else:
        columns = [0, 1]

    raw_edges_filename = output_dir / Path("raw_edges.csv")
    edges_df.to_csv(raw_edges_filename, sep=",", header=False, index=False)

    converter = TorchEdgeListConverter(
        output_dir,
        train_edges=raw_edges_filename,
        delim=",",
        splits=splits,
        num_partitions=num_partitions,
        remap_ids=remap_ids,
        columns=columns,
        partitioned_evaluation=partitioned_eval,
        sequential_train_nodes=sequential_train_nodes,
        format="CSV")

    dataset_stats = converter.convert()

    if feature_dim != -1:
        features = generate_features(num_nodes, feature_dim)

        # If node IDs were remapped during conversion, reorder the feature
        # rows to match the new IDs.
        if remap_ids:
            features = remap_lp(output_dir, features)

        node_features_file = output_dir / Path(PathConstants.node_features_path)
        with open(node_features_file, "wb") as f:
            f.write(bytes(features))

        dataset_stats.node_feature_dim = feature_dim

    with open(output_dir / Path("dataset.yaml"), "w") as f:
        f.write(OmegaConf.to_yaml(dataset_stats))
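
# Illustrative call, an assumption rather than part of the original file:
# generate a small link-prediction dataset with a 90/5/5 edge split and
# 32-dim node features. Sizes mirror the graphs used in the tests above;
# base_dir must already exist.
def example_generate_lp(base_dir):
    output_dir = Path(base_dir) / Path("lp_example")
    output_dir.mkdir()
    generate_random_dataset_lp(output_dir,
                               num_nodes=100,
                               num_edges=1000,
                               num_rels=10,
                               splits=[.9, .05, .05],
                               feature_dim=32)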
def generate_random_dataset_nc(output_dir,
                               num_nodes,
                               num_edges,
                               num_rels=1,
                               splits=None,
                               num_partitions=1,
                               partitioned_eval=False,
                               sequential_train_nodes=False,
                               remap_ids=True,
                               feature_dim=-1,
                               num_classes=10):
    edges = get_random_graph(num_nodes, num_edges, num_rels)
    edges_df = pd.DataFrame(data=edges)

    # Edge lists are either [src, rel, dst] triples or [src, dst] pairs.
    if edges.shape[1] == 3:
        columns = [0, 1, 2]
    else:
        columns = [0, 1]

    raw_edges_filename = output_dir / Path("raw_edges.csv")
    edges_df.to_csv(raw_edges_filename, sep=",", header=False, index=False)

    # For node classification, the nodes (not the edges) are split into
    # train/valid/test sets.
    all_nodes = np.arange(0, num_nodes, dtype=np.int32)
    train_nodes = all_nodes
    valid_nodes = None
    test_nodes = None
    if splits is not None:
        train_nodes, valid_nodes, test_nodes = split_edges(all_nodes, splits)

    converter = TorchEdgeListConverter(
        output_dir,
        train_edges=raw_edges_filename,
        delim=",",
        remap_ids=remap_ids,
        num_partitions=num_partitions,
        columns=columns,
        partitioned_evaluation=partitioned_eval,
        sequential_train_nodes=sequential_train_nodes,
        known_node_ids=[train_nodes, valid_nodes, test_nodes],
        format="CSV")

    dataset_stats = converter.convert()

    features = None
    if feature_dim != -1:
        features = generate_features(num_nodes, feature_dim)

    labels = generate_labels(num_nodes, num_classes)

    # Remap the node splits, labels, and features to match the converter's
    # ID assignment.
    train_nodes, labels, valid_nodes, test_nodes, features = remap_nc(
        output_dir, train_nodes, labels, num_nodes, valid_nodes, test_nodes,
        features)

    if features is not None:
        node_features_file = output_dir / Path(PathConstants.node_features_path)
        with open(node_features_file, "wb") as f:
            f.write(bytes(features))

    labels_file = output_dir / Path(PathConstants.labels_path)
    with open(labels_file, "wb") as f:
        f.write(bytes(labels))

    if train_nodes is not None:
        train_nodes_file = output_dir / Path(PathConstants.train_nodes_path)
        with open(train_nodes_file, "wb") as f:
            f.write(bytes(train_nodes))

    if valid_nodes is not None:
        valid_nodes_file = output_dir / Path(PathConstants.valid_nodes_path)
        with open(valid_nodes_file, "wb") as f:
            f.write(bytes(valid_nodes))

    if test_nodes is not None:
        test_nodes_file = output_dir / Path(PathConstants.test_nodes_path)
        with open(test_nodes_file, "wb") as f:
            f.write(bytes(test_nodes))

    # update dataset yaml
    dataset_stats.num_train = train_nodes.shape[0]

    if valid_nodes is not None:
        dataset_stats.num_valid = valid_nodes.shape[0]
    else:
        dataset_stats.num_valid = -1

    if test_nodes is not None:
        dataset_stats.num_test = test_nodes.shape[0]
    else:
        dataset_stats.num_test = -1

    if features is not None:
        dataset_stats.node_feature_dim = features.shape[1]
    else:
        dataset_stats.node_feature_dim = -1

    dataset_stats.num_classes = num_classes
    dataset_stats.num_nodes = num_nodes

    with open(output_dir / Path("dataset.yaml"), "w") as f:
        f.write(OmegaConf.to_yaml(dataset_stats))
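
# Illustrative call, an assumption rather than part of the original file:
# generate a node-classification dataset whose nodes are split 90/5/5 into
# train/valid/test, with 32-dim features and 10 label classes. Sizes mirror
# the graphs used in the tests above; base_dir must already exist.
def example_generate_nc(base_dir):
    output_dir = Path(base_dir) / Path("nc_example")
    output_dir.mkdir()
    generate_random_dataset_nc(output_dir,
                               num_nodes=100,
                               num_edges=1000,
                               splits=[.9, .05, .05],
                               feature_dim=32,
                               num_classes=10)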