def test_delimited_str_ids(self):
    """String node/relation IDs should be remapped to contiguous integer IDs."""
    output_dir = self.make_directory_tree("test_delimited_str_ids")

    # Turn the integer IDs in the base edge list into strings.
    tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
    tmp[0] = tmp[0].map(str) + "_test"
    tmp[1] = tmp[1].map(str) + "_test"
    tmp[2] = tmp[2].map(str) + "_test"
    tmp.to_csv(Path(TMP_TEST_DIR) / Path("str_train_edges.txt"), header=False, sep=" ", index=False)

    converter = SparkEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("str_train_edges.txt"),
        delim=" ",
    )
    converter.convert()

    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 1000
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 1000

    validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)
def test_header(self):
    output_dir = self.make_directory_tree("test_header")

    tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
    tmp.to_csv(Path(TMP_TEST_DIR) / Path("header_train_edges.txt"), header=["src", "rel", "dst"], sep=" ", index=False)

    converter = SparkEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("header_train_edges.txt"),
        delim=" ",
        header_length=1,
    )
    converter.convert()

    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 1000
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 1000

    validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)
def test_partitions(self):
    output_dir = self.make_directory_tree("test_partitions")

    converter = SparkEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
        delim=" ",
        num_partitions=10,
    )
    converter.convert()

    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 1000
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 1000

    validate_partitioned_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, num_partitions=10)

    converter = SparkEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
        delim=" ",
        num_partitions=100,
    )
    converter.convert()

    validate_partitioned_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, num_partitions=100)
def test_no_remap(self):
    """With remap_ids=False the node and relation counts must be supplied explicitly."""
    output_dir = Path(TMP_TEST_DIR) / Path("test_no_remap")
    output_dir.mkdir()

    converter = TorchEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
        delim=" ",
        remap_ids=False,
        num_nodes=100,
        num_rels=10,
    )
    converter.convert()

    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 1000
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 1000

    validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=False)
def test_delim(self):
    output_dir = Path(TMP_TEST_DIR) / Path("test_delim")
    output_dir.mkdir()

    tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
    tmp.to_csv(Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"), header=False, sep=",", index=False)

    converter = TorchEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"),
        delim=",",
    )
    converter.convert()

    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 1000
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 1000

    validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)
def test_splits(self):
    output_dir = Path(TMP_TEST_DIR) / Path("test_splits")
    output_dir.mkdir()

    converter = TorchEdgeListConverter(
        output_dir=output_dir,
        train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
        delim=" ",
        splits=[0.9, 0.05, 0.05],
    )
    converter.convert()

    # 1000 input edges split 90/5/5 into train/valid/test; num_edges tracks
    # the training split.
    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 900
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 900
    expected_stats.num_valid = 50
    expected_stats.num_test = 50

    validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)
def test_pytorch_defaults(self):
    output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults")
    output_dir.mkdir()

    train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
    train_edges = torch.tensor(train_edges_df.to_numpy())

    converter = TorchEdgeListConverter(
        output_dir=output_dir,
        train_edges=train_edges,
        format="pytorch",
    )
    converter.convert()

    expected_stats = DatasetConfig()
    expected_stats.dataset_dir = str(output_dir)
    expected_stats.num_edges = 1000
    expected_stats.num_nodes = 100
    expected_stats.num_relations = 10
    expected_stats.num_train = 1000

    validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)
def write_to_binary(
    self,
    train_edges_tens,
    valid_edges_tens,
    test_edges_tens,
    num_nodes,
    num_rels,
    num_partitions,
    train_edges_offsets=None,
    valid_edges_offsets=None,
    test_edges_offsets=None,
):
    dataset_stats = DatasetConfig()
    dataset_stats.dataset_dir = str(Path(self.output_dir).absolute()) + "/"
    dataset_stats.num_edges = train_edges_tens.size(0)
    dataset_stats.num_train = train_edges_tens.size(0)

    if valid_edges_tens is not None:
        dataset_stats.num_valid = valid_edges_tens.size(0)
    if test_edges_tens is not None:
        dataset_stats.num_test = test_edges_tens.size(0)

    dataset_stats.num_nodes = num_nodes
    dataset_stats.num_relations = num_rels

    with open(self.output_dir / Path("dataset.yaml"), "w") as f:
        print("Dataset statistics written to: {}".format(str(self.output_dir / Path("dataset.yaml"))))
        yaml_file = OmegaConf.to_yaml(dataset_stats)
        f.writelines(yaml_file)

    # Each split is dumped as the raw row-major bytes of its edge tensor.
    with open(self.output_dir / Path(PathConstants.train_edges_path), "wb") as f:
        f.write(bytes(train_edges_tens.numpy()))

    if valid_edges_tens is not None:
        with open(self.output_dir / Path(PathConstants.valid_edges_path), "wb") as f:
            f.write(bytes(valid_edges_tens.numpy()))

    if test_edges_tens is not None:
        with open(self.output_dir / Path(PathConstants.test_edges_path), "wb") as f:
            f.write(bytes(test_edges_tens.numpy()))

    # When the data is partitioned, also record the edge-bucket offsets,
    # one offset per line.
    if num_partitions > 1:
        with open(self.output_dir / Path(PathConstants.train_edge_buckets_path), "w") as f:
            f.writelines([str(o) + "\n" for o in train_edges_offsets])

        if valid_edges_offsets is not None:
            with open(self.output_dir / Path(PathConstants.valid_edge_buckets_path), "w") as f:
                f.writelines([str(o) + "\n" for o in valid_edges_offsets])

        if test_edges_offsets is not None:
            with open(self.output_dir / Path(PathConstants.test_edge_buckets_path), "w") as f:
                f.writelines([str(o) + "\n" for o in test_edges_offsets])

    return dataset_stats
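# Since write_to_binary stores each split as the raw row-major bytes of an
# integer edge tensor, the files can be read back with a single np.fromfile
# call. A minimal sketch, assuming int32 storage and three columns
# (src, rel, dst); `read_edges_from_binary` is an illustrative helper, not
# part of the converter API:
#
#     def read_edges_from_binary(path, dtype=np.int32, num_columns=3):
#         edges = np.fromfile(path, dtype=dtype)   # flat buffer of ids
#         return edges.reshape(-1, num_columns)    # one edge per row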
def write_to_csv(self, train_edges_df, valid_edges_df, test_edges_df, nodes_df, rels_df, num_partitions):
    dataset_stats = DatasetConfig()
    dataset_stats.dataset_dir = str(Path(self.output_dir).absolute())

    # The index columns are only needed for counting; drop them before writing.
    dataset_stats.num_edges = get_df_count(train_edges_df, EDGES_INDEX_COL)
    train_edges_df = train_edges_df.drop(EDGES_INDEX_COL)
    dataset_stats.num_train = dataset_stats.num_edges

    if valid_edges_df is not None:
        dataset_stats.num_valid = get_df_count(valid_edges_df, EDGES_INDEX_COL)
        valid_edges_df = valid_edges_df.drop(EDGES_INDEX_COL)

    if test_edges_df is not None:
        dataset_stats.num_test = get_df_count(test_edges_df, EDGES_INDEX_COL)
        test_edges_df = test_edges_df.drop(EDGES_INDEX_COL)

    dataset_stats.num_nodes = get_df_count(nodes_df, INDEX_COL)

    if rels_df is None:
        dataset_stats.num_relations = 1
    else:
        dataset_stats.num_relations = get_df_count(rels_df, REL_INDEX_COL)

    with open(self.output_dir / Path("dataset.yaml"), "w") as f:
        print("Dataset statistics written to: {}".format(str(self.output_dir / Path("dataset.yaml"))))
        yaml_file = OmegaConf.to_yaml(dataset_stats)
        f.writelines(yaml_file)

    write_df_to_csv(nodes_df, self.output_dir / Path(PathConstants.node_mapping_path))
    if rels_df is not None:
        write_df_to_csv(rels_df, self.output_dir / Path(PathConstants.relation_mapping_path))

    if num_partitions > 1:
        # Training edges are always partitioned; evaluation edges only when
        # partitioned evaluation is enabled.
        offsets = write_partitioned_df_to_csv(train_edges_df, num_partitions, self.output_dir / Path(PathConstants.train_edges_path))
        with open(self.output_dir / Path(PathConstants.train_edge_buckets_path), "w") as f:
            f.writelines([str(o) + "\n" for o in offsets])

        if self.partitioned_evaluation:
            if valid_edges_df is not None:
                offsets = write_partitioned_df_to_csv(valid_edges_df, num_partitions, self.output_dir / Path(PathConstants.valid_edges_path))
                with open(self.output_dir / Path(PathConstants.valid_edge_buckets_path), "w") as f:
                    f.writelines([str(o) + "\n" for o in offsets])

            if test_edges_df is not None:
                offsets = write_partitioned_df_to_csv(test_edges_df, num_partitions, self.output_dir / Path(PathConstants.test_edges_path))
                with open(self.output_dir / Path(PathConstants.test_edge_buckets_path), "w") as f:
                    f.writelines([str(o) + "\n" for o in offsets])
        else:
            if valid_edges_df is not None:
                write_df_to_csv(valid_edges_df, self.output_dir / Path(PathConstants.valid_edges_path))
            if test_edges_df is not None:
                write_df_to_csv(test_edges_df, self.output_dir / Path(PathConstants.test_edges_path))
    else:
        write_df_to_csv(train_edges_df, self.output_dir / Path(PathConstants.train_edges_path))
        if valid_edges_df is not None:
            write_df_to_csv(valid_edges_df, self.output_dir / Path(PathConstants.valid_edges_path))
        if test_edges_df is not None:
            write_df_to_csv(test_edges_df, self.output_dir / Path(PathConstants.test_edges_path))

    return dataset_stats
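# The node (and optional relation) mapping files written above are what make
# remapped IDs recoverable: each row pairs a raw ID with its assigned integer
# index. A minimal sketch of loading one back into a lookup dict, assuming a
# two-column CSV without a header row; the column names and the
# `load_node_mapping` helper are illustrative, not part of the converter API:
#
#     def load_node_mapping(output_dir):
#         mapping = pd.read_csv(
#             Path(output_dir) / Path(PathConstants.node_mapping_path),
#             header=None,
#             names=["raw_id", "remapped_id"],
#         )
#         return dict(zip(mapping["raw_id"], mapping["remapped_id"]))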