示例#1
0
class EntitySimilarity:

    """Entity similarity and relatedness using DBpedia links and entity concepts.

    ``similarity`` compares two entities through the synsets mapped from
    their YAGO type concepts; ``relatedness`` compares them through the
    counts reported by the statistics helper.
    """

    def __init__(self):
        """Create the feature, statistics and YAGO type-similarity helpers."""
        self._features = EntityFeatures()
        self._stats = StatSPARQL()
        self._yago = YagoTypeSimilarity()

    def _synsets(self, entity):
        """Return the synsets mapped from the YAGO type concepts of ``entity``.

        Only type concepts containing ``'class/yago'`` are considered, and
        concepts for which ``yago2synset`` returns a falsy value are dropped.
        """
        synsets = []
        for concept in self._features.type(entity):
            if 'class/yago' not in concept:
                continue
            # Look the concept up once and reuse the result for both the
            # filter and the collected value.
            synset = self._yago.yago2synset(concept)
            if synset:
                synsets.append(synset)
        return synsets

    def _top_by_ic(self, synsets, limit=5):
        """Return at most ``limit`` distinct synsets, highest ``synset_ic`` first."""
        ranked = Counter({s: self._yago.synset_ic(s) for s in synsets})
        return [s for s, _ in ranked.most_common(limit)]

    def similarity(self, entity1, entity2):
        """Compute the concept-based similarity of two entities.

        For each entity the (at most) five synsets with the highest
        information content are kept. Every kept synset is matched with its
        best-scoring counterpart on the other side, the best scores are
        averaged per direction, and the two directional averages are
        averaged again.

        :param entity1: identifier of the first entity, passed to the
            feature helper's ``type`` lookup
        :param entity2: identifier of the second entity
        :return: the symmetric similarity score, or 0.0 when either entity
            has no YAGO concept that maps to a synset
        """
        synsets_1 = self._synsets(entity1)
        synsets_2 = self._synsets(entity2)
        if not synsets_1 or not synsets_2:
            return 0.0
        s1 = self._top_by_ic(synsets_1)
        s2 = self._top_by_ic(synsets_2)
        # Score every pair once; both directional averages read this table.
        pair_sim = {(a, b): self._yago.similarity(a, b) for a in s1 for b in s2}
        score1 = sum(max(pair_sim[a, b] for b in s2) for a in s1) / len(s1)
        score2 = sum(max(pair_sim[a, b] for a in s1) for b in s2) / len(s2)
        return (score1 + score2) / 2.0

    def relatedness(self, entity1, entity2):
        """Compute a link-based relatedness value for two entities.

        With ``ab = entity_share(entity1, entity2)``, ``a`` and ``b`` the
        ``entity_relation`` counts of each entity and ``N = entity_N()``,
        the result is::

            (log(max(a, b)) - log(ab)) / (log(N) - log(min(a, b)))

        NOTE(review): this ratio gets smaller as the shared count grows,
        yet 0 is also returned when nothing is shared -- confirm the
        intended interpretation with callers.

        :param entity1: identifier of the first entity
        :param entity2: identifier of the second entity
        :return: 0 when the shared count is 0, otherwise the ratio above
        :raises ZeroDivisionError: if ``min(a, b)`` equals ``N``
        """
        ab = self._stats.entity_share(entity1, entity2)
        if ab == 0:
            # log(0) is undefined, so the no-overlap case is answered directly.
            return 0
        a = self._stats.entity_relation(entity1)
        b = self._stats.entity_relation(entity2)
        x = math.log(max(a, b)) - math.log(ab)
        y = math.log(self._stats.entity_N()) - math.log(min(a, b))
        return x / y
示例#2
0
 def __init__(self):
     """
     Build the helper objects and start with empty statistic tables.
     """
     self._features = EntityFeatures()
     stats = StatSPARQL()
     self._stats = stats
     # Fetched once at construction time and kept on the instance.
     self.entity_N = stats.entity_N()
     self._yago = YagoTypeSimilarity()
     # NOTE(review): presumably caches for per-entity and per-pair
     # statistics -- confirm against the methods that fill them.
     self.entity_stats, self.entity_share_stats = {}, {}
示例#3
0
class GraphIC:
    """
    Graph-based information content (IC) of concepts in a knowledge graph.

    The IC of a concept is the negative log of the proportion of instances
    tagged with it. Computed values are kept in memory and appended to the
    IC file so they can be loaded instead of queried again.
    """
    def __init__(self, ic_file):
        """
        Load the saved IC values and fetch the instance total.
        :param ic_file: the file containing saved IC values of concepts
        """
        self._ic_file = ic_file
        self._graph_ic = self.graph_ic_reader(ic_file)
        self._graph_stats = StatSPARQL()
        self._N = self._graph_stats.entity_N()

    def concept_ic(self, concept):
        """
        Return the IC value of a concept, querying its frequency on a miss.
        :param concept: the id of the concept, here the uri of the concept
        :return: the ic value of the concept
        """
        # Guard clause: values loaded from file or computed earlier win.
        if concept in self._graph_ic:
            return self._graph_ic[concept]
        count = int(self._graph_stats.concept_freq(concept))
        # A concept without instances gets 0.0 rather than log(0).
        value = -math.log(1.0 * count / self._N) if count != 0 else 0.0
        record = {'concept': concept, 'ic': str(value)}
        # Persist first, then remember the value in memory.
        self.graph_ic_writer(self._ic_file, [record])
        self._graph_ic[concept] = value
        return value

    def graph_ic_reader(self, filename):
        """
        Read the saved IC values into a dictionary.
        :param filename: the file containing IC values of concepts
        :return: a dictionary concept:IC with the IC converted to float
        """
        table = {}
        for entry in FileIO.read_json_file(filename):
            table[entry['concept']] = float(entry['ic'])
        return table

    def graph_ic_writer(self, filename, data):
        """
        Append IC records to the file for faster access later.
        :param filename: the file the records are appended to
        :param data: a list of records; concept_ic passes dicts with the
            keys 'concept' and 'ic'
        :return: None
        """
        FileIO.append_json_file(filename, data)
示例#4
0
class EntitySimilarity:
    """Entity similarity and relatedness using DBpedia links and entity concepts.

    Both public methods are wrapped by the ``memoized`` decorator that the
    file already applies to them.
    """
    def __init__(self):
        """Create the feature, statistics and YAGO type-similarity helpers."""
        self._features = EntityFeatures()
        self._stats = StatSPARQL()
        self._yago = YagoTypeSimilarity()

    def _yago_synsets(self, entity):
        """Map the YAGO type concepts of ``entity`` to synsets.

        Type concepts that do not contain ``'class/yago'`` are skipped, as
        are concepts for which ``yago2synset`` returns a falsy value.
        """
        mapped = []
        for concept in self._features.type(entity):
            if 'class/yago' in concept:
                # One lookup per concept; the result serves as both the
                # filter and the collected value.
                synset = self._yago.yago2synset(concept)
                if synset:
                    mapped.append(synset)
        return mapped

    def _most_informative(self, synsets, limit=5):
        """Return at most ``limit`` distinct synsets ordered by ``synset_ic``."""
        by_ic = Counter({s: self._yago.synset_ic(s) for s in synsets})
        return [synset for synset, _ in by_ic.most_common(limit)]

    @memoized
    def similarity(self, entity1, entity2):
        """Compute the concept-based similarity of two entities.

        Each entity contributes the (at most) five synsets with the highest
        information content. Every synset is matched with its best-scoring
        counterpart on the other side; the best scores are averaged per
        direction and the two averages are averaged again.

        :param entity1: identifier of the first entity
        :param entity2: identifier of the second entity
        :return: the symmetric similarity score, or 0.0 when either entity
            has no YAGO concept that maps to a synset
        """
        synsets_1 = self._yago_synsets(entity1)
        synsets_2 = self._yago_synsets(entity2)
        if not synsets_1 or not synsets_2:
            return 0.0
        s1 = self._most_informative(synsets_1)
        s2 = self._most_informative(synsets_2)
        # Each pair is scored once and shared by both directional averages.
        scores = {
            (syn1, syn2): self._yago.similarity(syn1, syn2)
            for syn1 in s1 for syn2 in s2
        }
        score1 = sum(
            max(scores[syn1, syn2] for syn2 in s2) for syn1 in s1
        ) / len(s1)
        score2 = sum(
            max(scores[syn1, syn2] for syn1 in s1) for syn2 in s2
        ) / len(s2)
        return (score1 + score2) / 2.0

    @memoized
    def relatedness(self, entity1, entity2):
        """Compute a link-based relatedness value for two entities.

        With ``ab = entity_share(entity1, entity2)``, ``a`` and ``b`` the
        ``entity_relation`` counts of each entity and ``N = entity_N()``,
        the result is::

            (log(max(a, b)) - log(ab)) / (log(N) - log(min(a, b)))

        NOTE(review): this ratio gets smaller as the shared count grows,
        yet 0 is also returned when nothing is shared -- confirm the
        intended interpretation with callers.

        :param entity1: identifier of the first entity
        :param entity2: identifier of the second entity
        :return: 0 when the shared count is 0, otherwise the ratio above
        :raises ZeroDivisionError: if ``min(a, b)`` equals ``N``
        """
        ab = self._stats.entity_share(entity1, entity2)
        if ab == 0:
            # log(0) is undefined, so the no-overlap case is answered directly.
            return 0
        a = self._stats.entity_relation(entity1)
        b = self._stats.entity_relation(entity2)
        x = math.log(max(a, b)) - math.log(ab)
        y = math.log(self._stats.entity_N()) - math.log(min(a, b))
        return x / y
示例#5
0
文件: graph.py 项目: gsi-upm/sematch
class GraphIC:

    """
    Computes graph-based IC of concepts in a knowledge graph: the negative
    log of the proportion of instances tagged with a specific concept.
    Values are cached in memory and appended to a file for later runs.
    """

    def __init__(self, ic_file):
        """
        Load the saved IC values and fetch the instance total.
        :param ic_file: the file containing saved IC values of concepts
        """
        self._ic_file = ic_file
        self._graph_ic = self.graph_ic_reader(ic_file)
        self._graph_stats = StatSPARQL()
        self._N = self._graph_stats.entity_N()

    def concept_ic(self, concept):
        """
        Look up the ic value of a concept, computing and saving it on a miss.
        :param concept: the id of the concept, here the uri of the concept
        :return: the ic value of the concept
        """
        try:
            return self._graph_ic[concept]
        except KeyError:
            # Not known yet: fall through and derive it from the frequency.
            pass
        freq = int(self._graph_stats.concept_freq(concept))
        if freq:
            ic = -math.log(1.0 * freq / self._N)
        else:
            # No tagged instances: use 0.0 instead of taking log(0).
            ic = 0.0
        entry = {'concept': concept, 'ic': str(ic)}
        self.graph_ic_writer(self._ic_file, [entry])
        self._graph_ic[concept] = ic
        return ic

    def graph_ic_reader(self, filename):
        """
        Load the saved IC values.
        :param filename: the file containing IC values of concepts
        :return: a dictionary concept:IC with the IC converted to float
        """
        saved = FileIO.read_json_file(filename)
        ic_by_concept = {}
        for item in saved:
            ic_by_concept[item['concept']] = float(item['ic'])
        return ic_by_concept

    def graph_ic_writer(self, filename, data):
        """
        Save ic values for faster access by appending them to a file.
        :param filename: the file the records are appended to
        :param data: a list of records; concept_ic passes dicts with the
            keys 'concept' and 'ic'
        :return: None
        """
        FileIO.append_json_file(filename, data)
示例#6
0
 def __init__(self):
     """
     Create the three helper objects and keep them on private attributes.
     """
     # Each helper is constructed exactly once, in this order.
     # NOTE(review): judging by the class names these presumably provide
     # entity features, SPARQL-backed statistics and YAGO type similarity
     # -- confirm against their definitions.
     self._features = EntityFeatures()
     self._stats = StatSPARQL()
     self._yago = YagoTypeSimilarity()
示例#7
0
 def __init__(self, ic_file):
     """
     Remember the IC file, load what it holds and fetch the entity total.
     :param ic_file: file handed to graph_ic_reader to load saved values
     """
     self._ic_file = ic_file
     self._graph_ic = self.graph_ic_reader(self._ic_file)
     stats = StatSPARQL()
     self._graph_stats = stats
     # Fetched once at construction time and kept on the instance.
     self._N = stats.entity_N()
示例#8
0
 def __init__(self):
     """
     Set up the helper objects used by this instance.
     """
     # The constructors run in the order written; each result is stored on
     # a private attribute.
     # NOTE(review): what each helper loads or connects to is not visible
     # here -- check their definitions before reordering.
     self._features = EntityFeatures()
     self._stats = StatSPARQL()
     self._yago = YagoTypeSimilarity()
示例#9
0
文件: graph.py 项目: gsi-upm/sematch
 def __init__(self, ic_file):
     """
     Store the IC file, read its saved values and query the entity total.
     :param ic_file: file passed to graph_ic_reader to load saved values
     """
     self._ic_file = ic_file
     self._graph_ic = self.graph_ic_reader(ic_file)
     self._graph_stats = stats = StatSPARQL()
     # Asked once here so later computations can reuse the stored value.
     self._N = stats.entity_N()