def eval(
    self,
    scores: Scores,
    batch_lhs: EntityList,
    batch_rhs: EntityList,
    batch_rel: Union[int, LongTensorType],
) -> Stats:
    """Mask known edges out of the negative scores, then run the parent eval.

    For every edge in the batch, the entries of ``lhs_neg_scores`` and
    ``rhs_neg_scores`` (third and fourth items of ``scores``) that are
    listed in ``self.rhs_map`` / ``self.lhs_map`` for that edge are
    overwritten in place with ``-1e9``. The (mutated) ``scores`` are then
    handed to ``super().eval``, whose result is returned.

    Raises:
        AssertionError: if ``batch_rel`` is not a ``torch.LongTensor``, or
            if an edge's own endpoints are missing from its filter entries.
    """
    # Assume dynamic relations.
    assert isinstance(batch_rel, torch.LongTensor)
    _, _, lhs_neg_scores, rhs_neg_scores = scores

    # Assume non-featurized. The tensor views do not depend on the loop
    # index, so they are computed once rather than on every iteration.
    lhs_ids = batch_lhs.to_tensor()
    rhs_ids = batch_rhs.to_tensor()

    b = batch_lhs.size(0)
    for idx in range(b):
        cur_lhs = lhs_ids[idx].item()
        cur_rel = batch_rel[idx].item()
        cur_rhs = rhs_ids[idx].item()

        # lhs_map is keyed by (lhs, rel) and yields right-hand side entries;
        # rhs_map is keyed by (rhs, rel) and yields left-hand side entries.
        rhs_edges_filtered = self.lhs_map[cur_lhs, cur_rel]
        lhs_edges_filtered = self.rhs_map[cur_rhs, cur_rel]
        assert cur_lhs in lhs_edges_filtered
        assert cur_rhs in rhs_edges_filtered

        # The rank is computed as the number of non-negative margins (as
        # that means a negative with at least as good a score as a positive)
        # so to avoid counting positives we give them a negative margin.
        lhs_neg_scores[idx][lhs_edges_filtered] = -1e9
        rhs_neg_scores[idx][rhs_edges_filtered] = -1e9

    return super().eval(scores, batch_lhs, batch_rhs, batch_rel)
def cat(cls, edge_lists: Sequence["EdgeList"]) -> "EdgeList":
    """Concatenate several edge lists into a single one.

    Left- and right-hand sides are joined with ``EntityList.cat``. Weights
    are joined only when every input carries them; the result has no weight
    when none does. If all inputs share one scalar relation type, the
    result keeps that scalar; otherwise relation types are expanded to one
    entry per edge and concatenated.

    Raises:
        RuntimeError: if some, but not all, inputs have a weight field.
    """
    lhs_joined = EntityList.cat([edges.lhs for edges in edge_lists])
    rhs_joined = EntityList.cat([edges.rhs for edges in edge_lists])

    weight_joined = None
    if any(edges.has_weight() for edges in edge_lists):
        if not all(edges.has_weight() for edges in edge_lists):
            raise RuntimeError(
                "Can't concatenate edgelists with and without weight field."
            )
        weight_joined = torch.cat(
            [edges.weight.expand((len(edges),)) for edges in edge_lists]
        )

    if all(edges.has_scalar_relation_type() for edges in edge_lists):
        distinct_rels = {
            edges.get_relation_type_as_scalar() for edges in edge_lists
        }
        if len(distinct_rels) == 1:
            # A single shared relation type is kept as a 0-dim tensor.
            (only_rel,) = distinct_rels
            scalar_rel = torch.tensor(only_rel, dtype=torch.long)
            return cls(lhs_joined, rhs_joined, scalar_rel, weight_joined)

    per_edge_rel = torch.cat(
        [edges.rel.expand((len(edges),)) for edges in edge_lists]
    )
    return cls(lhs_joined, rhs_joined, per_edge_rel, weight_joined)
def test_from_tensor(self):
    """from_tensor([3, 4]) equals an EntityList with an empty 2-slot TensorList."""
    built = EntityList.from_tensor(torch.tensor([3, 4], dtype=torch.long))
    wanted = EntityList(
        torch.tensor([3, 4], dtype=torch.long),
        TensorList.empty(num_tensors=2),
    )
    self.assertEqual(built, wanted)
def test_empty(self):
    """Grouping empty inputs by relation type yields an empty list."""
    no_rels = torch.empty((0,), dtype=torch.long)
    grouped = group_by_relation_type(
        no_rels, EntityList.empty(), EntityList.empty()
    )
    self.assertEqual(grouped, [])
def load_chunk_of_edges(
    self,
    lhs_p: Partition,
    rhs_p: Partition,
    chunk_idx: int = 0,
    num_chunks: int = 1,
    shared: bool = False,
) -> EdgeList:
    """Load one chunk of the edges stored for a pair of partitions.

    The edge file returned by ``self.get_edges_file(lhs_p, rhs_p)`` is
    opened read-only and the slice ``[begin:end)`` belonging to chunk
    ``chunk_idx`` (out of ``num_chunks``) is read from the ``lhs``, ``rhs``
    and ``rel`` datasets, plus ``weight`` when the file has one.

    Args:
        lhs_p: Left-hand side partition.
        rhs_p: Right-hand side partition.
        chunk_idx: Index of the chunk to load.
        num_chunks: Total number of chunks the file is split into.
        shared: When true, buffers come from ``allocate_shared_tensor``
            instead of ``torch.empty``.

    Returns:
        The edges of the requested chunk; empty when the chunk lies past
        the end of the file.

    Raises:
        RuntimeError: if the file's format version attribute does not match.
        CouldNotLoadData: if h5py reports that the file does not exist.
        OSError: for any other I/O failure.
    """
    file_path = self.get_edges_file(lhs_p, rhs_p)
    try:
        with h5py.File(file_path, "r") as hf:
            if hf.attrs.get(FORMAT_VERSION_ATTR, None) != FORMAT_VERSION:
                raise RuntimeError(f"Version mismatch in edge file {file_path}")
            lhs_ds = hf["lhs"]
            rhs_ds = hf["rhs"]
            rel_ds = hf["rel"]

            num_edges = rel_ds.len()
            chunk_size = div_roundup(num_edges, num_chunks)
            # Presumably div_roundup rounds up, in which case trailing chunks
            # could start past the end of the data (e.g. 5 edges, 4 chunks,
            # chunk 3). Clamp `begin` so such chunks come back empty instead
            # of producing a negative size.
            begin = min(chunk_idx * chunk_size, num_edges)
            end = min((chunk_idx + 1) * chunk_size, num_edges)
            chunk_size = end - begin

            allocator = allocate_shared_tensor if shared else torch.empty
            lhs = allocator((chunk_size,), dtype=torch.long)
            rhs = allocator((chunk_size,), dtype=torch.long)
            rel = allocator((chunk_size,), dtype=torch.long)
            # Needed because https://github.com/h5py/h5py/issues/870.
            if chunk_size > 0:
                lhs_ds.read_direct(lhs.numpy(), source_sel=np.s_[begin:end])
                rhs_ds.read_direct(rhs.numpy(), source_sel=np.s_[begin:end])
                rel_ds.read_direct(rel.numpy(), source_sel=np.s_[begin:end])
            lhsd = self.read_dynamic(hf, "lhsd", begin, end, shared=shared)
            rhsd = self.read_dynamic(hf, "rhsd", begin, end, shared=shared)

            if "weight" in hf:
                weight_ds = hf["weight"]
                # NOTE(review): weights are read into an integer buffer; if
                # the stored weights are floating point this would truncate
                # them -- confirm the intended dtype against the writer.
                weight = allocator((chunk_size,), dtype=torch.long)
                if chunk_size > 0:
                    weight_ds.read_direct(
                        weight.numpy(), source_sel=np.s_[begin:end]
                    )
            else:
                weight = None

            return EdgeList(
                EntityList(lhs, lhsd), EntityList(rhs, rhsd), rel, weight
            )
    except OSError as err:
        # h5py refuses to make it easy to figure out what went wrong. The errno
        # attribute is set to None. See https://github.com/h5py/h5py/issues/493.
        if f"errno = {errno.ENOENT}" in str(err):
            raise CouldNotLoadData() from err
        raise
def append_to_file(data, appender):
    """Turn ``(lhs, rhs, rel)`` triples into an EdgeList and append it.

    ``data`` is transposed into three columns, each converted to a long
    tensor, and the resulting edge list is passed to
    ``appender.append_edges``.
    """
    lhs_col, rhs_col, rel_col = zip(*data)
    lhs_entities = EntityList.from_tensor(torch.tensor(lhs_col, dtype=torch.long))
    rhs_entities = EntityList.from_tensor(torch.tensor(rhs_col, dtype=torch.long))
    rel_tensor = torch.tensor(rel_col, dtype=torch.long)
    appender.append_edges(EdgeList(lhs_entities, rhs_entities, rel_tensor))
def test_get_relation_type_as_scalar(self):
    """An EdgeList built with a 0-dim relation tensor reports it as scalar 3."""
    edges = EdgeList(
        EntityList.from_tensor(torch.tensor([3, 4], dtype=torch.long)),
        EntityList.from_tensor(torch.tensor([0, 2], dtype=torch.long)),
        torch.tensor(3, dtype=torch.long),
    )
    self.assertEqual(edges.get_relation_type_as_scalar(), 3)
def test_empty(self):
    """EdgeList.empty() equals an EdgeList built from empty components."""
    produced = EdgeList.empty()
    wanted = EdgeList(
        EntityList.empty(),
        EntityList.empty(),
        torch.empty((0,), dtype=torch.long),
    )
    self.assertEqual(produced, wanted)
def test_getitem_int(self):
    """Indexing an EntityList with -3 yields a one-element EntityList."""
    entities = EntityList(
        torch.tensor([3, 4, 1, 0], dtype=torch.long),
        tensor_list_from_lists([[2, 1], [0], [], [3, 4, 5]]),
    )
    picked = entities[-3]
    wanted = EntityList(
        torch.tensor([4], dtype=torch.long),
        tensor_list_from_lists([[0]]),
    )
    self.assertEqual(picked, wanted)
def generate_edge_path_files_fast(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    edgelist_reader: EdgelistReader,
) -> None:
    """Convert an edge file into a single edge list saved under bucket (0, 0).

    Every ``(lhs, rhs, rel)`` record produced by ``edgelist_reader`` is
    translated to entity offsets and a relation id. Records whose relation
    type or entities raise ``KeyError`` on lookup are skipped and counted.
    All kept edges are held in memory and written with one
    ``edge_storage.save_edges(0, 0, ...)`` call.

    Args:
        edge_file_in: File handed to ``edgelist_reader.read``.
        edge_path_out: Not used by this function; kept for interface
            compatibility.
        edge_storage: Destination of the resulting edge list.
        entities_by_type: Entity dictionaries keyed by entity type.
        relation_types: Dictionary mapping relation names to ids.
        relation_configs: Relation schemas indexed by relation id.
        edgelist_reader: Source of ``(lhs, rhs, rel)`` records.
    """
    processed = 0
    skipped = 0

    log("Taking the fast train!")
    # Three parallel columns instead of a list of tuples: this avoids the
    # `zip(*data)` transpose, which fails to unpack when no edge was kept.
    lhs_offsets = []
    rhs_offsets = []
    rel_ids = []
    for lhs_word, rhs_word, rel_word in edgelist_reader.read(edge_file_in):
        if rel_word is None:
            rel_id = 0
        else:
            try:
                rel_id = relation_types.get_id(rel_word)
            except KeyError:
                # Ignore edges whose relation type is not known.
                skipped += 1
                continue

        lhs_type = relation_configs[rel_id].lhs
        rhs_type = relation_configs[rel_id].rhs

        try:
            _, lhs_offset = entities_by_type[lhs_type].get_partition(lhs_word)
            _, rhs_offset = entities_by_type[rhs_type].get_partition(rhs_word)
        except KeyError:
            # Ignore edges whose entities are not known.
            skipped += 1
            continue

        lhs_offsets.append(lhs_offset)
        rhs_offsets.append(rhs_offset)
        rel_ids.append(rel_id)

        processed += 1
        if processed % 100000 == 0:
            log(f"- Processed {processed} edges so far...")

    # With no surviving edges this builds (and saves) an empty edge list.
    # NOTE(review): confirm edge_storage.save_edges accepts an empty list.
    edge_list = EdgeList(
        EntityList.from_tensor(torch.tensor(lhs_offsets, dtype=torch.long)),
        EntityList.from_tensor(torch.tensor(rhs_offsets, dtype=torch.long)),
        torch.tensor(rel_ids, dtype=torch.long),
    )
    edge_storage.save_edges(0, 0, edge_list)

    log(f"- Processed {processed} edges in total")
    if skipped > 0:
        log(
            f"- Skipped {skipped} edges because their relation type or "
            f"entities were unknown (either not given in the config or "
            f"filtered out as too rare)."
        )
def cat(cls, edge_lists: Sequence["EdgeList"]) -> "EdgeList":
    """Concatenate several edge lists into a single one.

    Left- and right-hand sides are joined with ``EntityList.cat``. If all
    inputs share one scalar relation type, the result keeps that scalar;
    otherwise relation types are expanded to one entry per edge and
    concatenated.

    The result is always built through ``cls`` so that both return paths
    produce the same type when called on a subclass.
    """
    cat_lhs = EntityList.cat([el.lhs for el in edge_lists])
    cat_rhs = EntityList.cat([el.rhs for el in edge_lists])

    if all(el.has_scalar_relation_type() for el in edge_lists):
        rel_types = {el.get_relation_type_as_scalar() for el in edge_lists}
        if len(rel_types) == 1:
            # A single shared relation type is kept as a 0-dim tensor.
            (rel_type,) = rel_types
            return cls(cat_lhs, cat_rhs, torch.tensor(rel_type, dtype=torch.long))

    cat_rel = torch.cat([el.rel.expand((len(el),)) for el in edge_lists])
    return cls(cat_lhs, cat_rhs, cat_rel)
def test_getitem_longtensor(self):
    """Indexing an EntityList with tensor([2, 0]) picks those rows in order."""
    entities = EntityList(
        torch.tensor([3, 4, 1, 0], dtype=torch.long),
        tensor_list_from_lists([[2, 1], [0], [], [3, 4, 5]]),
    )
    picked = entities[torch.tensor([2, 0])]
    wanted = EntityList(
        torch.tensor([1, 3], dtype=torch.long),
        tensor_list_from_lists([[], [2, 1]]),
    )
    self.assertEqual(picked, wanted)
def append_to_file(data, appender):
    """Turn ``(lhs, rhs, rel, weight)`` tuples into an EdgeList and append it.

    ``data`` is transposed into four columns. The weight column becomes a
    tensor only when its first entry is not ``None``; otherwise the edge
    list is built with ``None`` as its weight. The result is passed to
    ``appender.append_edges``.
    """
    lhs_col, rhs_col, rel_col, weight_col = zip(*data)
    if weight_col[0] is not None:
        weight_tensor = torch.tensor(weight_col)
    else:
        weight_tensor = None
    lhs_entities = EntityList.from_tensor(torch.tensor(lhs_col, dtype=torch.long))
    rhs_entities = EntityList.from_tensor(torch.tensor(rhs_col, dtype=torch.long))
    rel_tensor = torch.tensor(rel_col, dtype=torch.long)
    appender.append_edges(
        EdgeList(lhs_entities, rhs_entities, rel_tensor, weight_tensor)
    )
def test_get_relation_type_as_vector(self):
    """A per-edge relation tensor is returned unchanged as a vector."""
    edges = EdgeList(
        EntityList.from_tensor(torch.tensor([3, 4], dtype=torch.long)),
        EntityList.from_tensor(torch.tensor([0, 2], dtype=torch.long)),
        torch.tensor([2, 0], dtype=torch.long),
    )
    rel_vector = edges.get_relation_type_as_vector()
    wanted = torch.tensor([2, 0], dtype=torch.long)
    self.assertTrue(torch.equal(rel_vector, wanted))
def test_len(self):
    """len() of a two-edge EdgeList is 2."""
    edges = EdgeList(
        EntityList.from_tensor(torch.tensor([3, 4], dtype=torch.long)),
        EntityList.from_tensor(torch.tensor([0, 2], dtype=torch.long)),
        torch.tensor([2, 0], dtype=torch.long),
    )
    self.assertEqual(len(edges), 2)
def test_has_scalar_relation_type(self):
    """A 0-dim relation tensor counts as scalar; a 1-dim one does not."""
    scalar_rel_edges = EdgeList(
        EntityList.from_tensor(torch.tensor([3, 4], dtype=torch.long)),
        EntityList.from_tensor(torch.tensor([0, 2], dtype=torch.long)),
        torch.tensor(3, dtype=torch.long),
    )
    self.assertTrue(scalar_rel_edges.has_scalar_relation_type())

    vector_rel_edges = EdgeList(
        EntityList.from_tensor(torch.tensor([3, 4], dtype=torch.long)),
        EntityList.from_tensor(torch.tensor([0, 2], dtype=torch.long)),
        torch.tensor([2, 0], dtype=torch.long),
    )
    self.assertFalse(vector_rel_edges.has_scalar_relation_type())
def test_cat(self):
    """EntityList.cat joins both the tensors and the tensor lists in order."""
    first = EntityList(
        torch.tensor([2, 3], dtype=torch.long),
        tensor_list_from_lists([[3, 4], [0]]),
    )
    second = EntityList(
        torch.tensor([0, 1], dtype=torch.long),
        tensor_list_from_lists([[1, 2, 0], []]),
    )
    wanted = EntityList(
        torch.tensor([2, 3, 0, 1], dtype=torch.long),
        tensor_list_from_lists([[3, 4], [0], [1, 2, 0], []]),
    )
    self.assertEqual(EntityList.cat([first, second]), wanted)
def test_basic(self):
    """Edges are grouped per relation type, in increasing relation order."""

    def entities(values):
        # Shorthand for an EntityList built from a long tensor of ids.
        return EntityList.from_tensor(torch.tensor(values, dtype=torch.long))

    grouped = group_by_relation_type(
        torch.tensor([1, 0, 0, 1, 2, 2, 0, 0, 2, 2], dtype=torch.long),
        entities([93, 24, 13, 31, 70, 66, 77, 38, 5, 5]),
        entities([90, 75, 9, 25, 23, 31, 49, 64, 42, 50]),
    )
    wanted = [
        (entities([24, 13, 77, 38]), entities([75, 9, 49, 64]), 0),
        (entities([93, 31]), entities([90, 25]), 1),
        (entities([70, 66, 5, 5]), entities([23, 31, 42, 50]), 2),
    ]
    self.assertEqual(grouped, wanted)
def test_getitem_int(self):
    """Indexing an EdgeList with -3 yields one edge with a scalar relation."""
    edges = EdgeList(
        EntityList.from_tensor(torch.tensor([3, 4, 1, 0], dtype=torch.long)),
        EntityList.from_tensor(torch.tensor([0, 2, 1, 3], dtype=torch.long)),
        torch.tensor([1, 1, 3, 0], dtype=torch.long),
    )
    picked = edges[-3]
    wanted = EdgeList(
        EntityList.from_tensor(torch.tensor([4], dtype=torch.long)),
        EntityList.from_tensor(torch.tensor([2], dtype=torch.long)),
        torch.tensor(1, dtype=torch.long),
    )
    self.assertEqual(picked, wanted)
def test_getitem_longtensor(self):
    """Indexing an EdgeList with tensor([2, 0]) picks those edges in order."""
    edges = EdgeList(
        EntityList.from_tensor(torch.tensor([3, 4, 1, 0], dtype=torch.long)),
        EntityList.from_tensor(torch.tensor([0, 2, 1, 3], dtype=torch.long)),
        torch.tensor([1, 1, 3, 0], dtype=torch.long),
    )
    picked = edges[torch.tensor([2, 0])]
    wanted = EdgeList(
        EntityList.from_tensor(torch.tensor([1, 3], dtype=torch.long)),
        EntityList.from_tensor(torch.tensor([1, 0], dtype=torch.long)),
        torch.tensor([3, 1], dtype=torch.long),
    )
    self.assertEqual(picked, wanted)
def test_forward(self):
    """FeaturizedEmbedding output matches the expected rows and backpropagates."""
    embeddings = torch.tensor(
        [[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], [3.0, 3.0, 3.0]],
        requires_grad=True,
    )
    module = FeaturizedEmbedding(weight=embeddings)

    # Presumably (offsets, data): four feature bags, the last one empty.
    features = TensorList(
        torch.tensor([0, 1, 3, 6, 6]),
        torch.tensor([0, 2, 1, 0, 1, 0]),
    )
    result = module(EntityList.from_tensor_list(features))

    wanted = torch.tensor(
        [
            [1.0000, 1.0000, 1.0000],
            [2.5000, 2.5000, 2.5000],
            [1.3333, 1.3333, 1.3333],
            [0.0000, 0.0000, 0.0000],
        ]
    )
    self.assertTensorEqual(result, wanted)

    # Gradients must reach the embedding table.
    result.sum().backward()
    dense_grad = embeddings.grad.to_dense()
    self.assertTrue((dense_grad != 0).any())
def test_empty(self):
    """SimpleEmbedding maps an empty entity list to an empty (0, 3) tensor."""
    module = SimpleEmbedding(weight=torch.empty((0, 3)))
    no_entities = EntityList.from_tensor(torch.empty((0,), dtype=torch.long))
    output = module(no_entities)
    self.assertTensorEqual(output, torch.empty((0, 3)))
def load_chunk_of_edges(
    self,
    lhs_p: int,
    rhs_p: int,
    chunk_idx: int = 0,
    num_chunks: int = 1,
) -> EdgeList:
    """Load one chunk of the edges stored for a pair of partitions.

    The edge file returned by ``self.get_edges_file(lhs_p, rhs_p)`` is
    opened read-only and the slice ``[begin:end)`` belonging to chunk
    ``chunk_idx`` (out of ``num_chunks``) is read from the ``lhs``, ``rhs``
    and ``rel`` datasets.

    Args:
        lhs_p: Left-hand side partition index.
        rhs_p: Right-hand side partition index.
        chunk_idx: Index of the chunk to load.
        num_chunks: Total number of chunks the file is split into.

    Returns:
        The edges of the requested chunk.

    Raises:
        RuntimeError: if the file's format version attribute does not match.
        CouldNotLoadData: if h5py reports that the file does not exist.
        OSError: for any other I/O failure.
    """
    file_path = self.get_edges_file(lhs_p, rhs_p)
    try:
        with h5py.File(file_path, "r") as hf:
            if hf.attrs.get(FORMAT_VERSION_ATTR, None) != FORMAT_VERSION:
                raise RuntimeError(f"Version mismatch in edge file {file_path}")
            lhs_ds = hf["lhs"]
            rhs_ds = hf["rhs"]
            rel_ds = hf["rel"]

            num_edges = rel_ds.len()
            # Integer floor division keeps the bounds exact; going through
            # float division could round incorrectly for very large products.
            begin = chunk_idx * num_edges // num_chunks
            end = (chunk_idx + 1) * num_edges // num_chunks
            chunk_size = end - begin

            lhs = torch.empty((chunk_size,), dtype=torch.long)
            rhs = torch.empty((chunk_size,), dtype=torch.long)
            rel = torch.empty((chunk_size,), dtype=torch.long)
            # Needed because https://github.com/h5py/h5py/issues/870.
            if chunk_size > 0:
                lhs_ds.read_direct(lhs.numpy(), source_sel=np.s_[begin:end])
                rhs_ds.read_direct(rhs.numpy(), source_sel=np.s_[begin:end])
                rel_ds.read_direct(rel.numpy(), source_sel=np.s_[begin:end])
            lhsd = self.read_dynamic(hf, "lhsd", begin, end)
            rhsd = self.read_dynamic(hf, "rhsd", begin, end)

            return EdgeList(EntityList(lhs, lhsd), EntityList(rhs, rhsd), rel)
    except OSError as err:
        # h5py refuses to make it easy to figure out what went wrong. The errno
        # attribute is set to None. See https://github.com/h5py/h5py/issues/493.
        if f"errno = {errno.ENOENT}" in str(err):
            raise CouldNotLoadData() from err
        raise
def test_to_tensor_list(self):
    """to_tensor_list returns the tensor list the EntityList was built with."""
    entities = EntityList(
        torch.tensor([-1, -1], dtype=torch.long),
        tensor_list_from_lists([[3, 4], [0]]),
    )
    wanted = tensor_list_from_lists([[3, 4], [0]])
    self.assertEqual(entities.to_tensor_list(), wanted)
def test_len(self):
    """len() of a two-entry EntityList is 2."""
    entities = EntityList(
        torch.tensor([3, 4], dtype=torch.long),
        tensor_list_from_lists([[], [2, 1, 0]]),
    )
    self.assertEqual(len(entities), 2)
def test_to_tensor(self):
    """to_tensor returns the id tensor when all tensor-list entries are empty."""
    entities = EntityList(
        torch.tensor([2, 3], dtype=torch.long),
        tensor_list_from_lists([[], []]),
    )
    wanted = torch.tensor([2, 3], dtype=torch.long)
    self.assertTrue(torch.equal(entities.to_tensor(), wanted))
def test_empty(self):
    """FeaturizedEmbedding maps an empty tensor list to an empty (0, 3) tensor."""
    module = FeaturizedEmbedding(weight=torch.empty((0, 3)))
    no_features = TensorList(
        torch.zeros((1,), dtype=torch.long),
        torch.empty((0,), dtype=torch.long),
    )
    output = module(EntityList.from_tensor_list(no_features))
    self.assertTensorEqual(output, torch.empty((0, 3)))
def test_constant(self):
    """A single relation type yields one group holding every edge."""

    def entities(values):
        # Shorthand for an EntityList built from a long tensor of ids.
        return EntityList.from_tensor(torch.tensor(values, dtype=torch.long))

    grouped = group_by_relation_type(
        torch.tensor([3, 3, 3, 3], dtype=torch.long),
        entities([93, 24, 13, 31]),
        entities([90, 75, 9, 25]),
    )
    wanted = [
        (entities([93, 24, 13, 31]), entities([90, 75, 9, 25]), 3),
    ]
    self.assertEqual(grouped, wanted)
def test_forward(self):
    """SimpleEmbedding looks up rows by id and backpropagates to the table."""
    embeddings = torch.tensor(
        [[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], [3.0, 3.0, 3.0]],
        requires_grad=True,
    )
    module = SimpleEmbedding(weight=embeddings)

    lookup = EntityList.from_tensor(torch.tensor([2, 0, 0]))
    result = module(lookup)

    wanted = torch.tensor(
        [[3.0, 3.0, 3.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]
    )
    self.assertTensorEqual(result, wanted)

    # Gradients must reach the embedding table.
    result.sum().backward()
    dense_grad = embeddings.grad.to_dense()
    self.assertTrue((dense_grad != 0).any())
def test_equal(self):
    """EntityList equality depends on both the tensor and the tensor list."""
    el = EntityList(
        torch.tensor([3, 4], dtype=torch.long),
        tensor_list_from_lists([[], [2, 1, 0]]),
    )
    self.assertEqual(el, el)

    # Same tensor list, different tensor.
    other_tensor = EntityList(
        torch.tensor([4, 2], dtype=torch.long),
        tensor_list_from_lists([[], [2, 1, 0]]),
    )
    self.assertNotEqual(el, other_tensor)

    # Same tensor, different tensor list.
    other_tensor_list = EntityList(
        torch.tensor([3, 4], dtype=torch.long),
        tensor_list_from_lists([[3], [2, 0]]),
    )
    self.assertNotEqual(el, other_tensor_list)