class EntitySimilarity:
    """Entity similarity and relatedness using DBpedia links and entity concepts.

    ``similarity`` compares the YAGO concepts attached to two entities, while
    ``relatedness`` compares the link statistics of two entities.
    """

    def __init__(self):
        self._features = EntityFeatures()
        self._stats = StatSPARQL()
        self._yago = YagoTypeSimilarity()

    def _entity_synsets(self, entity):
        """Return the synsets mapped from the YAGO concepts of ``entity``."""
        synsets = []
        for concept in self._features.type(entity):
            if 'class/yago' not in concept:
                continue
            # Resolve each concept once and reuse the result for both the
            # truthiness filter and the collected value.
            synset = self._yago.yago2synset(concept)
            if synset:
                synsets.append(synset)
        return synsets

    def _top_synsets_by_ic(self, synsets, limit=5):
        """Return at most ``limit`` distinct synsets with the highest IC."""
        ic_by_synset = Counter({s: self._yago.synset_ic(s) for s in synsets})
        return [synset for synset, _ in ic_by_synset.most_common(limit)]

    def similarity(self, entity1, entity2):
        """
        Compute the concept-based similarity of two entities.

        Each entity is reduced to the (at most five) synsets with the highest
        information content among its YAGO concepts. Every synset is matched
        with its best counterpart on the other side, the best scores are
        averaged per side, and the two averages are averaged again.

        :param entity1: identifier of the first entity
        :param entity2: identifier of the second entity
        :return: the similarity score, or 0.0 when either entity has no
            YAGO concept that maps to a synset
        """
        synsets_1 = self._entity_synsets(entity1)
        synsets_2 = self._entity_synsets(entity2)
        if not synsets_1 or not synsets_2:
            return 0.0
        top_1 = self._top_synsets_by_ic(synsets_1)
        top_2 = self._top_synsets_by_ic(synsets_2)
        # Both directions need the same pairwise scores, so compute each pair
        # only once. The argument order (entity1 synset first) is kept in case
        # the underlying measure is not symmetric.
        scores = {
            (syn1, syn2): self._yago.similarity(syn1, syn2)
            for syn1 in top_1
            for syn2 in top_2
        }
        score_1 = sum(
            max(scores[syn1, syn2] for syn2 in top_2) for syn1 in top_1
        ) / len(top_1)
        score_2 = sum(
            max(scores[syn1, syn2] for syn1 in top_1) for syn2 in top_2
        ) / len(top_2)
        return (score_1 + score_2) / 2.0

    def relatedness(self, entity1, entity2):
        """
        Compute a link-based relatedness value for two entities.

        With ``a`` and ``b`` the relation counts of the two entities, ``ab``
        their shared count and ``N`` the total entity count, the result is
        ``(log(max(a, b)) - log(ab)) / (log(N) - log(min(a, b)))``, which is
        the formula of the Normalized Google Distance.

        NOTE(review): a shared count of 0 yields 0.0, which is also the value
        produced when ``ab == max(a, b)``; confirm callers expect this.

        :param entity1: identifier of the first entity
        :param entity2: identifier of the second entity
        :return: the value of the formula above, or 0.0 when the entities
            share nothing
        :raises ZeroDivisionError: when ``min(a, b)`` equals ``N``
        """
        shared = self._stats.entity_share(entity1, entity2)
        if shared == 0:
            # Always return a float so both return paths agree on the type.
            return 0.0
        count_1 = self._stats.entity_relation(entity1)
        count_2 = self._stats.entity_relation(entity2)
        numerator = math.log(max(count_1, count_2)) - math.log(shared)
        denominator = (math.log(self._stats.entity_N())
                       - math.log(min(count_1, count_2)))
        return numerator / denominator
def __init__(self):
    """Create the helper services and the initial statistics containers.

    The total entity count is fetched once from ``StatSPARQL`` and stored on
    the instance. ``entity_stats`` and ``entity_share_stats`` start as empty
    dicts (presumably caches filled by other methods; not visible here).
    """
    self._features = EntityFeatures()
    stats = StatSPARQL()
    self._stats = stats
    # Read the entity count right after the statistics helper is created.
    self.entity_N = stats.entity_N()
    self._yago = YagoTypeSimilarity()
    self.entity_stats = dict()
    self.entity_share_stats = dict()
class GraphIC:
    """
    Graph-based information content (IC) of concepts in a knowledge graph.

    The IC of a concept is derived from the proportion of instances tagged
    with that concept: ``-log(freq / N)``. Known values are loaded from a
    file; missing values are computed and appended to the same file.
    """

    def __init__(self, ic_file):
        self._ic_file = ic_file
        self._graph_ic = self.graph_ic_reader(ic_file)
        self._graph_stats = StatSPARQL()
        self._N = self._graph_stats.entity_N()

    def concept_ic(self, concept):
        """
        Return the IC value of a concept, computing and saving it if unknown.

        :param concept: the id of the concept, here the uri of the concept
        :return: the IC value of the concept (0.0 when its frequency is 0)
        """
        known = self._graph_ic
        if concept in known:
            return known[concept]
        count = int(self._graph_stats.concept_freq(concept))
        # A concept with no instances gets IC 0.0 instead of log(0).
        value = -math.log(1.0 * count / self._N) if count != 0 else 0.0
        # Persist first, then remember the value in memory.
        record = {'concept': concept, 'ic': str(value)}
        self.graph_ic_writer(self._ic_file, [record])
        known[concept] = value
        return value

    def graph_ic_reader(self, filename):
        """
        Load previously saved IC values.

        :param filename: the file containing IC values of concepts
        :return: a dictionary mapping each concept to its IC as a float
        """
        table = {}
        for record in FileIO.read_json_file(filename):
            name = record['concept']
            table[name] = float(record['ic'])
        return table

    def graph_ic_writer(self, filename, data):
        """
        Append IC records to the file so later runs can reuse them.

        :param filename: the file the records are appended to
        :param data: a list of ``{'concept': ..., 'ic': ...}`` records
        :return: None
        """
        FileIO.append_json_file(filename, data)
class EntitySimilarity:
    """Entity similarity and relatedness using DBpedia links and entity concepts.

    ``similarity`` compares the YAGO concepts attached to two entities, while
    ``relatedness`` compares the link statistics of two entities. Both public
    methods are wrapped by the ``memoized`` decorator, which is defined
    outside this block.
    """

    def __init__(self):
        self._features = EntityFeatures()
        self._stats = StatSPARQL()
        self._yago = YagoTypeSimilarity()

    def _entity_synsets(self, entity):
        """Return the synsets mapped from the YAGO concepts of ``entity``."""
        synsets = []
        for concept in self._features.type(entity):
            if 'class/yago' not in concept:
                continue
            # Resolve each concept once and reuse the result for both the
            # truthiness filter and the collected value.
            synset = self._yago.yago2synset(concept)
            if synset:
                synsets.append(synset)
        return synsets

    def _top_synsets_by_ic(self, synsets, limit=5):
        """Return at most ``limit`` distinct synsets with the highest IC."""
        ic_by_synset = Counter({s: self._yago.synset_ic(s) for s in synsets})
        return [synset for synset, _ in ic_by_synset.most_common(limit)]

    @memoized
    def similarity(self, entity1, entity2):
        """
        Compute the concept-based similarity of two entities.

        Each entity is reduced to the (at most five) synsets with the highest
        information content among its YAGO concepts. Every synset is matched
        with its best counterpart on the other side, the best scores are
        averaged per side, and the two averages are averaged again.

        :param entity1: identifier of the first entity
        :param entity2: identifier of the second entity
        :return: the similarity score, or 0.0 when either entity has no
            YAGO concept that maps to a synset
        """
        synsets_1 = self._entity_synsets(entity1)
        synsets_2 = self._entity_synsets(entity2)
        if not synsets_1 or not synsets_2:
            return 0.0
        top_1 = self._top_synsets_by_ic(synsets_1)
        top_2 = self._top_synsets_by_ic(synsets_2)
        # Both directions need the same pairwise scores, so compute each pair
        # only once. The argument order (entity1 synset first) is kept in case
        # the underlying measure is not symmetric.
        scores = {
            (syn1, syn2): self._yago.similarity(syn1, syn2)
            for syn1 in top_1
            for syn2 in top_2
        }
        score_1 = sum(
            max(scores[syn1, syn2] for syn2 in top_2) for syn1 in top_1
        ) / len(top_1)
        score_2 = sum(
            max(scores[syn1, syn2] for syn1 in top_1) for syn2 in top_2
        ) / len(top_2)
        return (score_1 + score_2) / 2.0

    @memoized
    def relatedness(self, entity1, entity2):
        """
        Compute a link-based relatedness value for two entities.

        With ``a`` and ``b`` the relation counts of the two entities, ``ab``
        their shared count and ``N`` the total entity count, the result is
        ``(log(max(a, b)) - log(ab)) / (log(N) - log(min(a, b)))``, which is
        the formula of the Normalized Google Distance.

        NOTE(review): a shared count of 0 yields 0.0, which is also the value
        produced when ``ab == max(a, b)``; confirm callers expect this.

        :param entity1: identifier of the first entity
        :param entity2: identifier of the second entity
        :return: the value of the formula above, or 0.0 when the entities
            share nothing
        :raises ZeroDivisionError: when ``min(a, b)`` equals ``N``
        """
        shared = self._stats.entity_share(entity1, entity2)
        if shared == 0:
            # Always return a float so both return paths agree on the type.
            return 0.0
        count_1 = self._stats.entity_relation(entity1)
        count_2 = self._stats.entity_relation(entity2)
        numerator = math.log(max(count_1, count_2)) - math.log(shared)
        denominator = (math.log(self._stats.entity_N())
                       - math.log(min(count_1, count_2)))
        return numerator / denominator
class GraphIC:
    """
    Compute graph-based information content (IC) in a knowledge graph.

    The IC reflects the proportion of instances tagged with a specific
    concept. Values already stored in the IC file are reused; new values are
    computed from ``StatSPARQL`` counts and appended to that file.
    """

    def __init__(self, ic_file):
        self._ic_file = ic_file
        self._graph_ic = self.graph_ic_reader(ic_file)
        self._graph_stats = StatSPARQL()
        self._N = self._graph_stats.entity_N()

    def concept_ic(self, concept):
        """
        Look up the IC value of a concept, querying for it when not stored.

        :param concept: the id of the concept, here the uri of the concept
        :return: the IC value of the concept
        """
        if concept not in self._graph_ic:
            occurrences = int(self._graph_stats.concept_freq(concept))
            if occurrences == 0:
                # No tagged instances: use 0.0 rather than taking log(0).
                score = 0.0
            else:
                share = 1.0 * occurrences / self._N
                score = -math.log(share)
            # Save to the file before updating the in-memory table.
            entry = {'concept': concept, 'ic': str(score)}
            self.graph_ic_writer(self._ic_file, [entry])
            self._graph_ic[concept] = score
            return score
        return self._graph_ic[concept]

    def graph_ic_reader(self, filename):
        """
        Read the saved IC values.

        :param filename: the file containing IC values of concepts
        :return: a dictionary of the form concept: IC
        """
        records = FileIO.read_json_file(filename)
        return dict((item['concept'], float(item['ic'])) for item in records)

    def graph_ic_writer(self, filename, data):
        """
        Store IC values of concepts in the file for faster access later.

        :param filename: the file the values are appended to
        :param data: the records to append
        :return: None
        """
        FileIO.append_json_file(filename, data)
def __init__(self):
    """Create the feature, statistics and YAGO similarity helpers."""
    features = EntityFeatures()
    self._features = features
    stats = StatSPARQL()
    self._stats = stats
    yago = YagoTypeSimilarity()
    self._yago = yago
def __init__(self, ic_file):
    """Load stored IC values and fetch the total entity count.

    :param ic_file: the file containing saved IC values of concepts
    """
    self._ic_file = ic_file
    # Read the stored values before the statistics helper is created.
    self._graph_ic = self.graph_ic_reader(self._ic_file)
    stats = StatSPARQL()
    self._graph_stats = stats
    self._N = stats.entity_N()