def transform_similarities_to_graphviz(self, outfile=sys.stdout, min_visible_similarity=0.15, min_label_similarity=0.30):
    """Write the schema-similarity matrix as a Graphviz 'graph' to `outfile`.

    Nodes are the schema names, annotated with their weight (entity count).
    An undirected edge is emitted for every pair whose rescaled similarity is
    at least `min_visible_similarity`; the edge carries a numeric label when
    the similarity is at least `min_label_similarity`, otherwise it is drawn
    dashed and excluded from layout constraints.
    """
    self._prepare_instance()
    assert self._prepared
    assert 0 <= min_visible_similarity <= min_label_similarity
    # Robustness fix: not every writable stream has a `.name` (e.g. StringIO).
    logging.info("Represent similarities as a graph to file '%s'...",
                 getattr(outfile, 'name', '<stream>'))
    print('graph similarities {', file=outfile)
    # Preparing nodes
    print(' node [shape="rect", fontsize=14];', file=outfile)
    for i, (sn, sw) in enumerate(zip(self.names, self.weights)):
        print(' s{0} [label="{1}\\n[{2}]"];'.format(i, sn, sw), file=outfile)
    # Preparing edges
    print(' edge [len=5, fontsize=10];', file=outfile)
    for i1, i2 in util.lower_triangle(len(self.names)):
        simil = self.similarities[i1][i2]
        if simil < min_visible_similarity:
            continue
        # BUGFIX: '>=' instead of '>' so a similarity exactly at the
        # threshold is labeled, consistent with the 'min_' naming and the
        # assert above (which permits the two thresholds to be equal).
        edge_format = (
            'label="{0:.3f}",style=solid'.format(simil)
            if simil >= min_label_similarity
            else 'style=dashed,constraint=false'
        )
        print(' s{0} -- s{1} [{2}];'.format(i1, i2, edge_format), file=outfile)
    print('}', file=outfile)
    outfile.flush()
def _prepare_instance(self):
    """Normalize schema names, compute weights and the rescaled similarity matrix.

    Idempotent: returns immediately when `self._prepared` is already set.
    Populates `self.weights` (entity count per schema) and
    `self.similarities` (symmetric matrix, rescaled so the max is 1.0).
    """
    def compute_similarity(entity_similarities):
        # Sum of all pairwise entity similarities, normalized by the total
        # number of entities of the two schemas (rows + columns).
        # BUGFIX: float() the numerator so Python 2 cannot truncate the
        # division when the similarities happen to be ints.
        return float(sum(sum(row) for row in entity_similarities)) / \
            (len(entity_similarities) + len(entity_similarities[0]))
    assert self is not None
    assert self.names
    assert len(self.names) == len(self.entities)
    if self._prepared:
        return
    # Polish schema names: drop a trailing '.graphml' extension
    ext = '.graphml'
    self.names = [name[:-len(ext)] if name.endswith(ext) else name
                  for name in self.names]
    # Prepare weights: weights[schema_i] = no. of entities
    logging.info("Computing schema weights...")
    self.weights = [len(entity_set) for entity_set in self.entities]
    logging.info("Computing schema similarities...")
    n_schemas = len(self.names)
    # Prepare the matrix.
    # BUGFIX: range() instead of Python-2-only xrange() (works on both).
    self.similarities = [[0.0] * n_schemas for _ in range(n_schemas)]
    # Compute raw similarities (symmetric; diagonal stays 0.0)
    max_similarity = 0.0
    for i1, i2 in util.lower_triangle(n_schemas):
        this_similarity = compute_similarity(self.entity_similarities[(i1, i2)])
        self.similarities[i1][i2] = this_similarity
        self.similarities[i2][i1] = this_similarity
        max_similarity = max(max_similarity, this_similarity)
    logging.debug("Maximum similarity between two schemas: %.4f", max_similarity)
    assert max_similarity > 0.0
    # Rescale the matrix so the maximum similarity becomes 1.0
    self.similarities = [[el / max_similarity for el in row]
                         for row in self.similarities]
    self._prepared = True
def prepare_ILP(self, ilp_solver, elems=None, k=None): vartype = StrictILPFormulation.VarType assert ilp_solver ## Get all the elements if they are not specified if not elems: elems = list(range(self._schemas.names)) ln = len(elems) if not k or k == "auto": k = int(math.ceil(float(ln) / self._constr.ll)) kr = list(range(k)) self._solver = ilp_solver self._elems = elems self._k = k logging.info("Solving [%s] with at most %d clusters...", ", ".join((str(el) for el in self._elems)), self._k) self._solver.set_solver_obj(solver.ProblemType.MAXIMIZATION) # # # # # # # # # # # # # # # # Prepare the VARIABLES logging.info("Preparing ILP variables...") # - Variables ClusterAssignment (i, c) (with i element and c cluster) self._solver.add_variables( variables=[ (vartype.ClusterAssignment, (i, c)) for i in elems for c in kr ], variable_types=[solver.VariableType.BINARY] * (ln*k), obj_coeff=[ 0.0 ] * (ln*k) ) # - Variables Weight (i, j, c) (with i, j elements, and c cluster) self._solver.add_variables( variables=[ (vartype.Weight, (i1, i2, c)) for i1, i2 in util.lower_triangle(elems) for c in kr ], variable_types=[solver.VariableType.CONTINUOUS] * (ln*(ln-1)/2*k), obj_coeff=[ 0.0 ] * (ln*(ln-1)/2*k) ) # - Variables WeightCluster (c) (with c cluster) self._solver.add_variables( variables=[ (vartype.WeightCluster, c) for c in kr ], variable_types=[solver.VariableType.CONTINUOUS] * k, obj_coeff=[ 1.0 ] * k ) # - Variables Empty (c) (with c cluster) self._solver.add_variables( variables=[ (vartype.Empty, c) for c in kr ], variable_types=[solver.VariableType.BINARY] * k, obj_coeff=[ 0.0 ] * k ) # # # # # # # # # # # # # # # # Prepare the CONSTRAINTS logging.info("Preparing ILP constraints...") # Each element is in a single cluster # \sum_k Cl_{i,k} = 1 for i in elems: self._solver.add_constraint( lhs=[ (1, (vartype.ClusterAssignment, (i, c))) for c in kr ], sense=solver.ConstraintSense.EQ, rhs=1.0 ) # W_{i,j,k} \le similarity of i and j if i co-clustered with j for i1,i2 in 
util.lower_triangle(elems): for c in kr: self._solver.add_constraint( lhs=[ (self._schemas.similarities[i1][i2], (vartype.ClusterAssignment, (i1, c))), (-1, (vartype.Weight, (i1, i2, c))) ], sense=solver.ConstraintSense.GE, rhs=0.0 ) self._solver.add_constraint( lhs=[ (self._schemas.similarities[i1][i2], (vartype.ClusterAssignment, (i2, c))), (-1, (vartype.Weight, (i1, i2, c))) ], sense=solver.ConstraintSense.GE, rhs=0.0 ) # The weight of a cluster is the sum of the weights for c in kr: self._solver.add_constraint( lhs=( [ (1, (vartype.Weight, (i1, i2, c))) for i1, i2 in util.lower_triangle(elems) ] + [ (-1, (vartype.WeightCluster, c)) ] ), sense=solver.ConstraintSense.EQ, rhs=0.0 ) # Maximum cluster cardinality for c in kr: self._solver.add_constraint( lhs=[ (1, (vartype.ClusterAssignment, (i, c))) for i in elems ], sense=solver.ConstraintSense.LE, rhs=self._constr.lu ) # Maximum number of entities for c in kr: self._solver.add_constraint( lhs=[ (self._schemas.weights[i], (vartype.ClusterAssignment, (i, c))) for i in elems ], sense=solver.ConstraintSense.LE, rhs=self._constr.eu ) # Minimum cluster cardinality OR empty # - part 1: Empty_c \ge 1 iff cluster c is empty for i,c in itertools.product(elems, kr): self._solver.add_constraint( lhs=[ (1, (vartype.Empty, c)), (1, (vartype.ClusterAssignment, (i, c))) ], sense=solver.ConstraintSense.LE, rhs=1 ) # - part 2: Cardinality of cluster c is at least ll if not Empty_c for c in kr: self._solver.add_constraint( lhs=( [ (1, (vartype.ClusterAssignment, (i, c))) for i in elems ] + [ (self._constr.ll, (vartype.Empty, c)) ] ), sense=solver.ConstraintSense.GE, rhs=self._constr.ll ) '''
def prepare_ILP(self, ilp_solver, elems=None, k=None):
    """Build the large ILP model: cluster `elems` into at most k clusters.

    Unlike the strict formulation, the objective sums the pair Weight
    variables directly, and `nprime` artificial "false" elements (negative
    ids) pad otherwise-too-small clusters so the lower cardinality bound ll
    can always be met. `elems` defaults to all schemas; `k` defaults to
    ceil(n / ll) when omitted or "auto".
    """
    vartype = LargeILPFormulation.VarType
    assert ilp_solver
    ## Get all the elements if they are not specified
    if not elems:
        # BUGFIX: range() needs the number of schemas; the original passed
        # the list of names itself, which raises a TypeError.
        elems = list(range(len(self._schemas.names)))
    ln = len(elems)
    if not k or k == "auto":
        k = int(math.ceil(float(ln) / self._constr.ll))
    kr = list(range(k))
    self._solver = ilp_solver
    self._elems = elems
    self._k = k
    logging.info("Solving [%s] with at most %d clusters...",
                 ", ".join(str(el) for el in self._elems), self._k)
    self._solver.set_solver_obj(solver.ProblemType.MAXIMIZATION)
    # # # # # # # # # # # # # # #
    # Prepare the PARAMETERS
    # nprime padding elements, with negative ids so they never collide with
    # the (non-negative) real element ids.
    nprime = self._constr.ll * (self._k - int(math.ceil(float(ln) / self._constr.lu)))
    add_elems = list(range(-1, -nprime - 1, -1))
    self._add_elems = add_elems
    all_elems = add_elems + elems
    # (removed unused local `ntot = ln + nprime`)
    # BUGFIX: floor division so the variable count stays an int on Python 3.
    n_pair_vars = ln * (ln - 1) // 2 * k
    # # # # # # # # # # # # # # #
    # Prepare the VARIABLES
    logging.info("Preparing ILP variables...")
    # - Variables ClusterAssignment (i, c) (with i true element and c cluster)
    self._solver.add_variables(
        variables=[(vartype.ClusterAssignment, (i, c)) for i in elems for c in kr],
        variable_types=[solver.VariableType.BINARY] * (ln * k),
        obj_coeff=[0.0] * (ln * k))
    # - Variables ClusterAssignment (i, c) (with i "false" element and c cluster)
    self._solver.add_variables(
        variables=[(vartype.ClusterAssignment, (i, c)) for i in add_elems for c in kr],
        variable_types=[solver.VariableType.BINARY] * (nprime * k),
        obj_coeff=[0.0] * (nprime * k))
    # - Variables Weight (i, j, c): these carry the objective here
    self._solver.add_variables(
        variables=[(vartype.Weight, (i1, i2, c))
                   for i1, i2 in util.lower_triangle(elems) for c in kr],
        variable_types=[solver.VariableType.CONTINUOUS] * n_pair_vars,
        obj_coeff=[1.0] * n_pair_vars)
    # # # # # # # # # # # # # # #
    # Prepare the CONSTRAINTS
    logging.info("Preparing ILP constraints...")
    # Each true element is in a single cluster: \sum_k Cl_{i,k} = 1
    for i in elems:
        self._solver.add_constraint(
            lhs=[(1, (vartype.ClusterAssignment, (i, c))) for c in kr],
            sense=solver.ConstraintSense.EQ, rhs=1.0)
    # Each false element is in at most one cluster: \sum_k Cl_{i,k} <= 1
    for i in add_elems:
        self._solver.add_constraint(
            lhs=[(1, (vartype.ClusterAssignment, (i, c))) for c in kr],
            sense=solver.ConstraintSense.LE, rhs=1.0)
    # True and false elements are not co-clustered:
    # Cl_{i,k} + Cl_{j,k} <= 1 for every cluster k, i real, j false
    for i, j in itertools.product(elems, add_elems):
        for c in kr:
            self._solver.add_constraint(
                lhs=[(1, (vartype.ClusterAssignment, (i, c))),
                     (1, (vartype.ClusterAssignment, (j, c)))],
                sense=solver.ConstraintSense.LE, rhs=1.0)
    # W_{i,j,k} \le similarity of i and j if i co-clustered with j
    for i1, i2 in util.lower_triangle(elems):
        for c in kr:
            self._solver.add_constraint(
                lhs=[(self._schemas.similarities[i1][i2], (vartype.ClusterAssignment, (i1, c))),
                     (-1, (vartype.Weight, (i1, i2, c)))],
                sense=solver.ConstraintSense.GE, rhs=0.0)
            self._solver.add_constraint(
                lhs=[(self._schemas.similarities[i1][i2], (vartype.ClusterAssignment, (i2, c))),
                     (-1, (vartype.Weight, (i1, i2, c)))],
                sense=solver.ConstraintSense.GE, rhs=0.0)
    # Maximum cluster cardinality (padding elements count too)
    for c in kr:
        self._solver.add_constraint(
            lhs=[(1, (vartype.ClusterAssignment, (i, c))) for i in all_elems],
            sense=solver.ConstraintSense.LE, rhs=self._constr.lu)
    # Maximum number of entities (true elements only carry entities)
    for c in kr:
        self._solver.add_constraint(
            lhs=[(self._schemas.weights[i], (vartype.ClusterAssignment, (i, c))) for i in elems],
            sense=solver.ConstraintSense.LE, rhs=self._constr.eu)
    # Minimum cluster cardinality (padding elements may fill the gap)
    for c in kr:
        self._solver.add_constraint(
            lhs=[(1, (vartype.ClusterAssignment, (i, c))) for i in all_elems],
            sense=solver.ConstraintSense.GE, rhs=self._constr.ll)
def recompute_similarities(self):
    """Recompute per-cluster similarity totals and the overall total.

    For every cluster, sums the pairwise similarities of its members
    (lower triangle, so each unordered pair is counted once); the grand
    total is the sum over all clusters.
    """
    per_cluster = []
    for cluster in self.clusters:
        acc = 0
        for a, b in util.lower_triangle(cluster):
            acc += self.instance.similarities[a][b]
        per_cluster.append(acc)
    self.cluster_similarities = per_cluster
    self.total_similarity = sum(per_cluster)