def test_shared_ic(self):
    """Check SharedICCalculator with and without the disjoint correction.

    For every IC flavour in IC_NAMES and every (one, two, mica, z) row of
    SHARED_IC_RESULTS:

    * with ``use_disjoints=False`` the shared IC must equal the IC of
      ``mica`` (within EPSILON);
    * with ``use_disjoints=True`` the shared IC must equal 0 when ``z`` is
      None, and ``ic_mica - factor * (ic_mica - ic_z)`` otherwise, where
      ``factor`` comes from plugin.DisjointFactor.
    """
    disjoint_factor = plugin.DisjointFactor()

    for ic_name in IC_NAMES:
        plain_ic = plugin.ICCalculator(ic_name)
        shared_plain = plugin.SharedICCalculator(ic_name, use_disjoints=False)
        shared_disjoint = plugin.SharedICCalculator(ic_name,
                                                    use_disjoints=True)

        for first, second, mica, z in SHARED_IC_RESULTS:
            first_id = utils.get_id(first)
            second_id = utils.get_id(second)
            ic_mica = plain_ic.get(utils.get_id(mica))

            # Without disjoints, the shared IC is simply the IC of `mica`
            observed = shared_plain.get(first_id, second_id)
            assert abs(observed - ic_mica) < EPSILON

            # With disjoints, the value is pulled from IC(mica) towards
            # IC(z) proportionally to the disjoint factor of the pair
            if z is not None:
                ic_z = plain_ic.get(utils.get_id(z))
                factor = disjoint_factor.get(first_id, second_id)
                expected = ic_mica - factor * (ic_mica - ic_z)
            else:
                expected = 0

            observed = shared_disjoint.get(first_id, second_id)
            assert abs(observed - expected) < EPSILON
def chain_to_ids(chain):
    """Translate a chain of names into the corresponding list of ids.

    Every element but the last is resolved with
    ``utils.get_id(item, "ObjectProperty")``; the last element is resolved
    with a plain ``utils.get_id(item)``. The result is a new list with the
    ids in the same order as the input.
    """
    *leading, concept = chain[:-1], chain[-1]
    ids = [utils.get_id(prop, "ObjectProperty") for prop in leading[0]]
    return [*ids, utils.get_id(concept)]
def test_utils_get_id():
    """Round-trip entities through utils.get_id and utils.get_entity.

    Every (iri, entity_type) pair in KNOWN_ENTITIES and UNKNOWN_ENTITIES is
    turned into an id and then looked up again; the lookup must give back
    the same IRI and the same entity type.
    """
    for iri, entity_type in KNOWN_ENTITIES + UNKNOWN_ENTITIES:
        entity_id = utils.get_id(iri, entity_type=entity_type)
        fetched_iri, fetched_type = utils.get_entity(entity_id)
        # Separate assertions so a failure shows which half did not match
        assert iri == fetched_iri
        assert entity_type == fetched_type

    # NOTE(review): 1000 is presumably an id that no entity has been given
    # in the test database -- confirm against the fixtures.
    # Identity comparison is the correct check for the None singleton.
    assert utils.get_entity(1000) is None
def test_ic_values(self):
    """Compare computed IC values against the reference table IC_TESTS.

    IC_TESTS maps an IRI to a sequence of expected values, one per entry of
    IC_NAMES (same position). Each computed value must be within EPSILON of
    the expected one.
    """
    for position, name in enumerate(IC_NAMES):
        calculator = plugin.ICCalculator(name)
        for iri, reference_values in IC_TESTS.items():
            ic = calculator.get(utils.get_id(iri))
            expected = reference_values[position]
            # Printed so the offending triple is visible in the test output
            print(iri, ic, expected)
            assert abs(ic - expected) < EPSILON
def __init__(self, *, ic=None, distance_threshold=3, weight_threshold=0.3,
             property_weights=None, default_weight=0.7,
             hierarchy_weight=0.8, discover_subclasses=False):
    """Store the thresholds, property weights and SQL queries.

    All parameters are keyword-only.

    ic: when truthy, it is handed to ICCalculator and the result is kept
        in ``self.ic_calculator``; otherwise that attribute is None.
    distance_threshold, weight_threshold: stored unchanged.
    property_weights: optional mapping from a property to its weight; each
        key is converted with ``utils.get_id(key, "ObjectProperty")``.
    default_weight: either a plain weight, or a LogScale instance. In the
        latter case the initial per-property weights come from its
        ``get_weights()`` and the stored default weight is 0.
    hierarchy_weight: weight stored under the key None.
    discover_subclasses: stored, and selects which hierarchy query is
        prepared.
    """
    self.distance_threshold = distance_threshold
    self.weight_threshold = weight_threshold

    # With a LogScale, each property gets a weight based on the number of
    # times that property is used in the database
    log_scaled = isinstance(default_weight, LogScale)
    self.property_weights = default_weight.get_weights() if log_scaled else {}
    self.default_weight = 0 if log_scaled else default_weight

    # Explicit weights override whatever was set above
    if property_weights is not None:
        for name, value in property_weights.items():
            property_id = utils.get_id(name, "ObjectProperty")
            self.property_weights[property_id] = value

    # The class-subclass property is represented in this code
    # by the None object
    self.property_weights[None] = hierarchy_weight

    self.ic_calculator = ICCalculator(ic) if ic else None

    self.get_relations_query = ("SELECT chain, end, distance "
                                "FROM existential_relations "
                                "WHERE start = %s AND distance <= %s")

    self.discover_subclasses = discover_subclasses
    if not discover_subclasses:
        # Only walk upwards: superclasses within the distance limit
        self.get_hierarchy_query = (
            "SELECT superclass, distance "
            "FROM hierarchy "
            "WHERE subclass = %s AND distance <= %s")
    else:
        # Direct superclasses (distance = 1) plus subclasses within the
        # distance limit; note this variant has three placeholders
        self.get_hierarchy_query = (
            "SELECT superclass, distance "
            "FROM hierarchy "
            "WHERE subclass = %s AND distance = 1 "
            "UNION "
            "SELECT subclass, distance "
            "FROM hierarchy "
            "WHERE superclass = %s AND distance <= %s")
def __init__(self, *, ic=None, distance_threshold=3, weight_threshold=0.3,
             property_weights=None, default_weight=0.7,
             hierarchy_weight=0.8, discover_subclasses=False):
    """Set up thresholds, the property-weight table and the SQL queries.

    Every parameter must be passed by keyword.

    ic: a truthy value is wrapped in ICCalculator and kept in
        ``self.ic_calculator``; a falsy value leaves that attribute None.
    distance_threshold, weight_threshold: kept as given.
    property_weights: optional mapping of property to weight. Keys are
        resolved through ``utils.get_id(key, "ObjectProperty")`` before
        being stored.
    default_weight: a plain weight, or a LogScale instance whose
        ``get_weights()`` provides the starting weight table (the stored
        default weight is then 0).
    hierarchy_weight: weight recorded under the key None.
    discover_subclasses: kept as given; also decides which hierarchy query
        is built.
    """
    self.distance_threshold = distance_threshold
    self.weight_threshold = weight_threshold

    if not isinstance(default_weight, LogScale):
        self.property_weights = {}
        self.default_weight = default_weight
    else:
        # Each property is assigned a weight based on the number of times
        # that property is used in the database
        self.property_weights = default_weight.get_weights()
        self.default_weight = 0

    # Caller-supplied weights take precedence over the table built above
    if property_weights is not None:
        for raw_property, property_weight in property_weights.items():
            key = utils.get_id(raw_property, "ObjectProperty")
            self.property_weights[key] = property_weight

    # The class-subclass property is represented in this code
    # by the None object
    self.property_weights[None] = hierarchy_weight

    if ic:
        self.ic_calculator = ICCalculator(ic)
    else:
        self.ic_calculator = None

    self.get_relations_query = (
        "SELECT chain, end, distance "
        "FROM existential_relations "
        "WHERE start = %s AND distance <= %s")

    # Superclasses within the distance limit (two placeholders)
    superclasses_only_query = (
        "SELECT superclass, distance "
        "FROM hierarchy "
        "WHERE subclass = %s AND distance <= %s")
    # Direct superclasses plus subclasses within the distance limit
    # (three placeholders)
    with_subclasses_query = (
        "SELECT superclass, distance "
        "FROM hierarchy "
        "WHERE subclass = %s AND distance = 1 "
        "UNION "
        "SELECT subclass, distance "
        "FROM hierarchy "
        "WHERE superclass = %s AND distance <= %s")

    self.discover_subclasses = discover_subclasses
    if discover_subclasses:
        self.get_hierarchy_query = with_subclasses_query
    else:
        self.get_hierarchy_query = superclasses_only_query
def compare(self, one, two):
    """Return an IC-based distance between the concepts `one` and `two`.

    Equal arguments give 0 without any lookup. Otherwise both arguments are
    resolved with utils.get_id and the result is
    ``(IC(one) + IC(two) - 2 * shared_ic) / 2``.

    The distance is 1 when either concept has no IC (marked by -1), or
    when the two IC values add up to 0.
    """
    if one == two:
        return 0

    first_id = utils.get_id(one)
    second_id = utils.get_id(two)
    ic_first = self.ic_calculator.get(first_id)
    ic_second = self.ic_calculator.get(second_id)

    # Special cases, both mapped to a distance of 1:
    #  - one of the concepts does not have an IC (so similarity is 0);
    #  - both have IC = 0, in which case IC(MICA) = 0 as well.
    if ic_first == -1 or ic_second == -1 or ic_first + ic_second == 0:
        return 1

    shared = self.shared_ic_calculator.get(first_id, second_id)
    return (ic_first + ic_second - 2 * shared) / 2
def compare(self, one, two):
    """Return an IC-based similarity between the concepts `one` and `two`.

    Equal arguments give 1 without any lookup. Otherwise both arguments are
    resolved with utils.get_id and the result is
    ``2 * shared_ic / (IC(one) + IC(two))``.

    The similarity is 0 when either concept has no IC (marked by -1), or
    when the two IC values add up to 0 (which would otherwise divide by
    zero).
    """
    if one == two:
        return 1

    first_id = utils.get_id(one)
    second_id = utils.get_id(two)
    ic_first = self.ic_calculator.get(first_id)
    ic_second = self.ic_calculator.get(second_id)

    # Special cases, both mapped to a similarity of 0:
    #  - one of the concepts does not have an IC;
    #  - both have IC = 0, in which case IC(MICA) = 0 as well.
    if ic_first == -1 or ic_second == -1 or ic_first + ic_second == 0:
        return 0

    shared = self.shared_ic_calculator.get(first_id, second_id)
    return 2 * shared / (ic_first + ic_second)
def compare(self, one, two):
    """Compare one concept against a collection of concepts.

    `one` is resolved with utils.get_id and `two` with utils.seq_to_ids.
    The superclasses of `one` are fetched with ``self.get_super_query``;
    if any of them appears in `two` the result is 1. Otherwise the result
    is the largest value of ``self.inner.compare(one, x)`` over the
    members x of `two`, and never less than 0.
    """
    one = utils.get_id(one)
    two = utils.seq_to_ids(two)

    # The query takes the concept id once or twice, depending on its shape
    query_args = (one, one) if self.two_args else (one, )

    # The cursor is shared, so run the query and drain it under the lock
    with sql.lock:
        sql.cursor.execute(self.get_super_query, query_args)
        superclasses = {row[0] for row in sql.cursor}

    # If one of the concepts in the second argument is a superclass of the
    # first argument, return 1
    if any(ancestor in two for ancestor in superclasses):
        return 1

    # Otherwise, compare the concept with each concept of the second
    # argument and return the maximum similarity found (0 at the least)
    return max([0, *(self.inner.compare(one, other) for other in two)])
def test_disjoint_factor(self):
    """Check plugin.DisjointFactor against DISJOINT_FACTOR_RESULTS.

    Each row is (one, two, expected); the factor computed for the ids of
    `one` and `two` must be within EPSILON of `expected`.
    """
    disjoint_factor = plugin.DisjointFactor()
    for first, second, expected in DISJOINT_FACTOR_RESULTS:
        observed = disjoint_factor.get(utils.get_id(first),
                                       utils.get_id(second))
        assert abs(observed - expected) < EPSILON
def compare(self, one, two):
    """Return the shared IC of the concepts `one` and `two`.

    Both arguments are resolved with utils.get_id and the pair of ids is
    passed to ``self.shared_ic_calculator.get``; its result is returned
    unchanged.
    """
    first_id, second_id = utils.get_id(one), utils.get_id(two)
    return self.shared_ic_calculator.get(first_id, second_id)