def __init__(self, df=None, kg=None, ent2ix=None, rel2ix=None,
             dict_of_heads=None, dict_of_tails=None):
    """Build the knowledge graph from a dataframe or from index tensors.

    Exactly one of `df` and `kg` has to be provided.

    :param df: `pandas.DataFrame`; the columns 'from', 'to' and 'rel' are
        read and mapped through `ent2ix` / `rel2ix`.
    :param kg: dict that must contain the keys 'heads', 'tails' and
        'relations'; the values are stored as-is.
    :param ent2ix: mapping from entity to integer index. Mandatory when
        building from `kg`; when building from `df` and omitted it is
        obtained from `get_dictionaries(df, ent=True)`.
    :param rel2ix: mapping from relation to integer index. Mandatory when
        building from `kg`; when building from `df` and omitted it is
        obtained from `get_dictionaries(df, ent=False)`.
    :param dict_of_heads: precomputed dictionary, stored as-is. If it or
        `dict_of_tails` is missing, both are reset to empty
        `defaultdict(set)` and `self.evaluate_dicts()` is called.
    :param dict_of_tails: see `dict_of_heads`.
    :raises WrongArgumentsError: if neither or both of `df` and `kg` are
        given, if `kg` is not a dict holding the required keys, or if
        `ent2ix` / `rel2ix` are missing when building from `kg`.
    :raises SanityError: if `self.sanity_check()` raises an
        `AssertionError`.
    """
    # Validation uses explicit raises rather than `assert`, so that it is
    # still enforced when Python runs with -O.
    if df is None:
        if kg is None:
            raise WrongArgumentsError(
                "Please provide at least one argument of `df` and kg`")
        required_keys = ('heads', 'tails', 'relations')
        # `and`/`all` short-circuit: a non-dict `kg` is rejected before
        # any key lookup is attempted on it.
        if not (isinstance(kg, dict)
                and all(key in kg for key in required_keys)):
            raise WrongArgumentsError(
                "Keys in the `kg` dict should contain `heads`, `tails`, "
                "`relations`.")
        if rel2ix is None or ent2ix is None:
            raise WrongArgumentsError(
                "Please provide the two dictionaries ent2ix and rel2ix "
                "if building from `kg`.")
    elif kg is not None:
        raise WrongArgumentsError(
            "`df` and kg` arguments should not both be provided.")

    if ent2ix is None:
        self.ent2ix = get_dictionaries(df, ent=True)
    else:
        self.ent2ix = ent2ix
    if rel2ix is None:
        self.rel2ix = get_dictionaries(df, ent=False)
    else:
        self.rel2ix = rel2ix

    # Indices are taken to start at 0, hence "largest index + 1".
    self.n_ent = max(self.ent2ix.values()) + 1
    self.n_rel = max(self.rel2ix.values()) + 1

    if df is not None:
        # build kg from a pandas dataframe
        self.n_facts = len(df)
        self.head_idx = tensor(df['from'].map(self.ent2ix).values).long()
        self.tail_idx = tensor(df['to'].map(self.ent2ix).values).long()
        self.relations = tensor(df['rel'].map(self.rel2ix).values).long()
    else:
        # build kg from another kg
        self.n_facts = kg['heads'].shape[0]
        self.head_idx = kg['heads']
        self.tail_idx = kg['tails']
        self.relations = kg['relations']

    if dict_of_heads is None or dict_of_tails is None:
        self.dict_of_heads = defaultdict(set)
        self.dict_of_tails = defaultdict(set)
        self.evaluate_dicts()
    else:
        self.dict_of_heads = dict_of_heads
        self.dict_of_tails = dict_of_tails

    try:
        self.sanity_check()
    except AssertionError as err:
        raise SanityError("Please check the sanity of arguments.") from err
def __init__(self, df=None, kg=None, ent2ix=None, rel2ix=None,
             dict_of_heads=None, dict_of_tails=None, dict_of_rel=None,
             id2point=None, geo=None):
    """Build the knowledge graph from a dataframe or from index tensors.

    Exactly one of `df` and `kg` has to be provided.

    :param df: `pandas.DataFrame`; the columns 'from', 'to' and 'rel' are
        read and mapped through `ent2ix` / `rel2ix`.
    :param kg: dict that must contain the keys 'heads', 'tails' and
        'relations'; an optional 'point' entry is copied to `self.point`.
    :param ent2ix: mapping from entity to integer index. Mandatory when
        building from `kg`; when building from `df` and omitted it is
        obtained from `get_dictionaries(df, ent=True)`.
    :param rel2ix: mapping from relation to integer index. Mandatory when
        building from `kg`; when building from `df` and omitted it is
        obtained from `get_dictionaries(df, ent=False)`.
    :param dict_of_heads: precomputed dictionary, stored as-is. If any of
        `dict_of_heads`, `dict_of_tails`, `dict_of_rel` is missing, all
        three are reset to empty `defaultdict(set)` and
        `self.evaluate_dicts()` is called.
    :param dict_of_tails: see `dict_of_heads`.
    :param dict_of_rel: see `dict_of_heads`.
    :param id2point: stored as `self.id2point` when not None.
    :param geo: stored as `self.geo`; when given together with `df` it is
        passed to `self.load_point` to obtain the entity/point lookups.
    :raises WrongArgumentsError: if neither or both of `df` and `kg` are
        given, if `kg` is not a dict holding the required keys, or if
        `ent2ix` / `rel2ix` are missing when building from `kg`.
    :raises SanityError: if `self.sanity_check()` raises an
        `AssertionError`.
    """
    # Validation uses explicit raises rather than `assert`, so that it is
    # still enforced when Python runs with -O.
    if df is None:
        if kg is None:
            raise WrongArgumentsError("Please provide at least one "
                                      "argument of `df` and kg`")
        required_keys = ('heads', 'tails', 'relations')
        # `and`/`all` short-circuit: a non-dict `kg` is rejected before
        # any key lookup is attempted on it.
        if not (isinstance(kg, dict)
                and all(key in kg for key in required_keys)):
            raise WrongArgumentsError("Keys in the `kg` dict should "
                                      "contain `heads`, `tails`, "
                                      "`relations`.")
        if rel2ix is None or ent2ix is None:
            raise WrongArgumentsError("Please provide the two "
                                      "dictionaries ent2ix and rel2ix "
                                      "if building from `kg`.")
    elif kg is not None:
        raise WrongArgumentsError("`df` and kg` arguments should not "
                                  "both be provided.")

    if ent2ix is None:
        self.ent2ix = get_dictionaries(df, ent=True)
    else:
        self.ent2ix = ent2ix
    if rel2ix is None:
        self.rel2ix = get_dictionaries(df, ent=False)
    else:
        self.rel2ix = rel2ix

    # NOTE(review): `self.id2point` is left undefined when `id2point` is
    # None and no `geo`/`df` pair is given - confirm callers expect that.
    if id2point is not None:
        self.id2point = id2point

    # Indices are taken to start at 0, hence "largest index + 1".
    self.n_ent = max(self.ent2ix.values()) + 1
    self.n_rel = max(self.rel2ix.values()) + 1
    self.geo = geo

    if df is not None:
        # build kg from a pandas dataframe
        self.n_facts = len(df)
        self.head_idx = tensor(df['from'].map(self.ent2ix).values).long()
        self.tail_idx = tensor(df['to'].map(self.ent2ix).values).long()
        self.relations = tensor(df['rel'].map(self.rel2ix).values).long()
    else:
        # build kg from another kg
        self.n_facts = kg['heads'].shape[0]
        self.head_idx = kg['heads']
        self.tail_idx = kg['tails']
        self.relations = kg['relations']
        # 'point' is optional; only a missing key is tolerated here, any
        # other failure is allowed to surface.
        try:
            self.point = kg['point']
        except KeyError:
            pass

    if (geo is not None) and (df is not None):
        # Geo
        self.entity2point, self.id2point = self.load_point(geo)
        # Rows are read positionally: column 0 and column 2 are looked up
        # in entity2point - presumably head and tail, i.e. a
        # (from, rel, to) column order; TODO confirm against callers.
        self.point = np.array([[
            self.entity2point[triplet[0]],
            self.entity2point[triplet[2]]
        ] for triplet in df.values])

    if (dict_of_heads is None or dict_of_tails is None
            or dict_of_rel is None):
        self.dict_of_heads = defaultdict(set)
        self.dict_of_tails = defaultdict(set)
        self.dict_of_rel = defaultdict(set)
        self.evaluate_dicts()
    else:
        self.dict_of_heads = dict_of_heads
        self.dict_of_tails = dict_of_tails
        self.dict_of_rel = dict_of_rel

    try:
        self.sanity_check()
    except AssertionError as err:
        raise SanityError("Please check the sanity of arguments.") from err
def split_kg(self, share=0.8, sizes=None, validation=False):
    """Split the knowledge graph into train and test.

    If `sizes` is provided then it is used to split the samples as
    explained below. If only `share` is provided, the masks are obtained
    from `self.get_mask(share, validation=...)`.

    Every returned graph reuses this graph's `ent2ix`, `rel2ix`,
    `dict_of_heads` and `dict_of_tails`.

    Parameters
    ----------
    share: float
        Percentage to allocate to train set. Only used when `sizes` is
        None; must be strictly less than 1.
    sizes: tuple
        Tuple of ints of length 2 or 3. If len(sizes) == 2, then the
        first sizes[0] values of the knowledge graph will be used as
        training set and the rest as test set. If len(sizes) == 3, the
        first sizes[0] values of the knowledge graph will be used as
        training set, the following sizes[1] as validation set and the
        last sizes[2] as testing set.
    validation: bool
        Indicate if a validation set should be produced along with train
        and test sets. Ignored when `sizes` is provided.

    Returns
    -------
    train_kg: `torchkge.data.KnowledgeGraph.KnowledgeGraph`
    val_kg: `torchkge.data.KnowledgeGraph.KnowledgeGraph`, optional
    test_kg: `torchkge.data.KnowledgeGraph.KnowledgeGraph`

    Raises
    ------
    SizeMismatchError
        If `sizes` is not of length 2 or 3.
    WrongArgumentsError
        If `sizes` does not sum to the number of facts.
    AssertionError
        If `sizes` is None and `share` is not strictly less than 1.
    """
    # TODO: assert that all relations in test appear as well in
    #  validation (for triplet classification)

    # Explicit raises (not `assert`) so the checks survive `python -O`.
    if sizes is not None:
        if len(sizes) not in (2, 3):
            raise SizeMismatchError(
                'Tuple `sizes` should be of length 2 or 3.')
        if sum(sizes) != self.n_facts:
            raise WrongArgumentsError(
                'Sizes should sum to the number of facts.')
    elif not share < 1:
        # AssertionError is kept as the exception type for backward
        # compatibility with callers of the previous `assert share < 1`.
        raise AssertionError('`share` should be strictly less than 1.')

    if sizes is None:
        # (train, val, test) when validation is requested, else
        # (train, test).
        masks = tuple(self.get_mask(share, validation=bool(validation)))
    else:
        # One boolean mask per consecutive slice of the facts.
        masks = []
        offset = 0
        for size in sizes:
            remainder = self.n_facts - offset - size
            flags = ([False] * offset + [True] * size
                     + [False] * remainder)
            # `.bool()` fixes the dtype when `flags` is empty.
            masks.append(tensor(flags).bool())
            offset += size

    def subgraph(mask):
        # Sub-graph restricted to the facts selected by `mask`.
        return KnowledgeGraph(
            kg={
                'heads': self.head_idx[mask],
                'tails': self.tail_idx[mask],
                'relations': self.relations[mask]
            },
            ent2ix=self.ent2ix,
            rel2ix=self.rel2ix,
            dict_of_heads=self.dict_of_heads,
            dict_of_tails=self.dict_of_tails)

    return tuple(subgraph(mask) for mask in masks)
def __init__(
    self,
    df=None,
    kg=None,
    ent2ix=None,
    rel2ix=None,
    dict_of_heads=None,
    dict_of_tails=None,
):
    """Build the knowledge graph from a dataframe or from index tensors.

    Exactly one of `df` and `kg` has to be provided.

    :param df: `pandas.DataFrame`; the columns "from", "to" and "rel" are
        read and mapped through `ent2ix` / `rel2ix`, and the column
        "how-much" is stored as a float64 tensor in `self.magnitudes`.
    :param kg: dict that must contain the keys "heads", "tails",
        "relations" and "magnitudes"; the values are stored as-is.
    :param ent2ix: mapping from entity to integer index. Mandatory when
        building from `kg`; when building from `df` and omitted it is
        obtained from `get_dictionaries(df, ent=True)`.
    :param rel2ix: mapping from relation to integer index. Mandatory when
        building from `kg`; when building from `df` and omitted it is
        obtained from `get_dictionaries(df, ent=False)`.
    :param dict_of_heads: precomputed dictionary, stored as-is. If it or
        `dict_of_tails` is missing, both are reset to empty
        `defaultdict(set)` and `self.evaluate_dicts()` is called.
    :param dict_of_tails: see `dict_of_heads`.
    :raises WrongArgumentsError: if neither or both of `df` and `kg` are
        given, if `kg` is not a dict holding the required keys, or if
        `ent2ix` / `rel2ix` are missing when building from `kg`.
    :raises SanityError: if `self.sanity_check()` raises an
        `AssertionError`.
    """
    # Validation uses explicit raises rather than `assert`, so that it is
    # still enforced when Python runs with -O.
    if df is None:
        if kg is None:
            raise WrongArgumentsError(
                "Please provide at least one " "argument of `df` and kg`"
            )
        required_keys = ("heads", "tails", "relations")
        # `and`/`all` short-circuit: a non-dict `kg` is rejected before
        # any key lookup is attempted on it.
        if not (
            isinstance(kg, dict)
            and all(key in kg for key in required_keys)
        ):
            raise WrongArgumentsError(
                "Keys in the `kg` dict should "
                "contain `heads`, `tails`, "
                "`relations`."
            )
        # "magnitudes" is read unconditionally below, so report a missing
        # entry as an argument error instead of a bare KeyError.
        if "magnitudes" not in kg:
            raise WrongArgumentsError(
                "Keys in the `kg` dict should also contain `magnitudes`."
            )
        if rel2ix is None or ent2ix is None:
            raise WrongArgumentsError(
                "Please provide the two "
                "dictionaries ent2ix and rel2ix "
                "if building from `kg`."
            )
    elif kg is not None:
        raise WrongArgumentsError(
            "`df` and kg` arguments should not " "both be provided."
        )

    if ent2ix is None:
        self.ent2ix = get_dictionaries(df, ent=True)
    else:
        self.ent2ix = ent2ix
    if rel2ix is None:
        self.rel2ix = get_dictionaries(df, ent=False)
    else:
        self.rel2ix = rel2ix

    # Indices are taken to start at 0, hence "largest index + 1".
    self.n_ent = max(self.ent2ix.values()) + 1
    self.n_rel = max(self.rel2ix.values()) + 1

    if df is not None:
        # build kg from a pandas dataframe
        self.n_facts = len(df)
        self.head_idx = tensor(df["from"].map(self.ent2ix).values).long()
        self.tail_idx = tensor(df["to"].map(self.ent2ix).values).long()
        self.relations = tensor(df["rel"].map(self.rel2ix).values).long()
        # Hand torch a plain array, like the three columns above, rather
        # than the Series object itself.
        self.magnitudes = tensor(
            df["how-much"].to_numpy(dtype=float), dtype=float64
        )
    else:
        # build kg from another kg
        self.n_facts = kg["heads"].shape[0]
        self.head_idx = kg["heads"]
        self.tail_idx = kg["tails"]
        self.relations = kg["relations"]
        self.magnitudes = kg["magnitudes"]

    if dict_of_heads is None or dict_of_tails is None:
        self.dict_of_heads = defaultdict(set)
        self.dict_of_tails = defaultdict(set)
        self.evaluate_dicts()
    else:
        self.dict_of_heads = dict_of_heads
        self.dict_of_tails = dict_of_tails

    try:
        self.sanity_check()
    except AssertionError as err:
        raise SanityError("Please check the sanity of arguments.") from err