Example #1
    def transform_similarities_to_graphviz(self,
                                           outfile=sys.stdout,
                                           min_visible_similarity=0.15,
                                           min_label_similarity=0.30):
        self._prepare_instance()
        assert self._prepared
        assert 0 <= min_visible_similarity <= min_label_similarity

        logging.info("Represent similarities as a graph to file '%s'...",
                     outfile.name)

        print('graph similarities {', file=outfile)
        # Preparing nodes
        print('  node [shape="rect", fontsize=14];', file=outfile)
        for i,(sn,sw) in enumerate(zip(self.names, self.weights)):
            print('  s{0} [label="{1}\\n[{2}]"];'.format(i, sn, sw),
                  file=outfile)
        # Preparing edges
        print('  edge [len=5, fontsize=10];', file=outfile)
        for i1,i2,simil in [ (i1, i2, self.similarities[i1][i2])
                             for i1,i2 in util.lower_triangle(len(self.names))
                             if self.similarities[i1][i2] >= min_visible_similarity ]:
            edge_format = ( 'label="{0:.3f}",style=solid'.format(simil) if simil > min_label_similarity
                            else 'style=dashed,constraint=false' )
            print('  s{0} -- s{1} [{2}];'.format(i1, i2, edge_format),
                  file=outfile)
        print('}', file=outfile)
        outfile.flush()
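
Every example on this page relies on a util.lower_triangle helper that is not shown. A minimal sketch of what it might look like, assuming it simply yields each unordered pair once (it is called with an integer in Examples #1 and #2 and with a list of elements in the later examples), is:

# Hypothetical sketch of the util.lower_triangle helper assumed by these
# snippets; the real implementation is not included on this page.
import itertools

def lower_triangle(items):
    """Yield every below-diagonal pair (a, b) exactly once.

    Accepts either an integer n (pairs of indices 0..n-1) or a sequence
    (pairs of its elements), matching how the examples call it.
    """
    seq = range(items) if isinstance(items, int) else items
    return itertools.combinations(seq, 2)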
Example #2
    def _prepare_instance(self):
        def compute_similarity(entity_similarities):
            # Average-like score: sum every pairwise entity similarity and
            # divide by the total number of entities in the two schemas
            # (rows + columns of the matrix).
            return float( sum( sum(row) for row in entity_similarities ) /
                          (len(entity_similarities) + len(entity_similarities[0])) )

        assert self is not None
        assert self.names
        assert len(self.names) == len(self.entities)

        if self._prepared:
            return

        # Polish schema names
        ext = '.graphml'
        self.names = [ name[:-len(ext)] if name.endswith(ext) else name
                       for name in self.names ]

        # Prepare weights: weights[schema_i] = no. of entities
        logging.info("Computing schema weights...")
        self.weights = [ len(entity_set) for entity_set in self.entities ]

        # Compute pairwise schema similarities
        logging.info("Computing schema similarities...")
        n_schemas = len(self.names)
        # - prepare matrix
        self.similarities = [ [ 0.0 ] * n_schemas for _ in range(n_schemas) ]

        # - compute raw similarities
        max_similarity = 0.0
        for i1,i2 in util.lower_triangle(n_schemas):
            this_similarity = compute_similarity(self.entity_similarities[(i1,i2)])
            self.similarities[i1][i2] = this_similarity
            self.similarities[i2][i1] = this_similarity
            max_similarity = max(max_similarity, this_similarity)

        logging.debug("Maximum similarity between two schemas: %.4f", max_similarity)
        assert max_similarity > 0.0
        #  - rescale matrix
        self.similarities = [ [ el/max_similarity for el in row ]
                              for row in self.similarities ]

        self._prepared = True
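
As a quick, made-up illustration of the arithmetic in _prepare_instance: for a 2x3 entity-similarity matrix between two schemas, compute_similarity sums all entries and divides by the number of rows plus columns; the raw values are then rescaled so that the largest pairwise schema similarity becomes 1.0.

# Toy numbers, purely illustrative: the entity-similarity matrix between a
# schema with 2 entities and a schema with 3 entities.
entity_similarities = [[0.9, 0.1, 0.0],
                       [0.2, 0.8, 0.3]]
raw = sum(sum(row) for row in entity_similarities) / \
      (len(entity_similarities) + len(entity_similarities[0]))
print(raw)  # 2.3 / 5 = 0.46; _prepare_instance then divides every raw value
            # by the maximum raw similarity found across all schema pairs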
Example #3
    def prepare_ILP(self, ilp_solver, elems=None, k=None):
        vartype = StrictILPFormulation.VarType
        assert ilp_solver
        ## Get all the elements if they are not specified
        if not elems:
            elems = list(range(len(self._schemas.names)))
        ln = len(elems)
        if not k or k == "auto":
            k = int(math.ceil(float(ln) / self._constr.ll))
        kr = list(range(k))

        self._solver = ilp_solver
        self._elems = elems
        self._k = k

        logging.info("Solving [%s] with at most %d clusters...",
                     ", ".join((str(el) for el in self._elems)), self._k)

        self._solver.set_solver_obj(solver.ProblemType.MAXIMIZATION)

        # # # # # # # # # # # # # # #
        # Prepare the VARIABLES
        logging.info("Preparing ILP variables...")
        # - Variables ClusterAssignment (i, c) (with i element and c cluster)
        self._solver.add_variables(
            variables=[ (vartype.ClusterAssignment, (i, c))
                        for i in elems
                        for c in kr ],
            variable_types=[solver.VariableType.BINARY] * (ln*k),
            obj_coeff=[ 0.0 ] * (ln*k) )

        # - Variables Weight (i, j, c) (with i, j elements, and c cluster)
        self._solver.add_variables(
            variables=[ (vartype.Weight, (i1, i2, c))
                        for i1, i2 in util.lower_triangle(elems)
                        for c in kr ],
            variable_types=[solver.VariableType.CONTINUOUS] * (ln*(ln-1)//2*k),
            obj_coeff=[ 0.0 ] * (ln*(ln-1)//2*k) )

        # - Variables WeightCluster (c) (with c cluster)
        self._solver.add_variables(
            variables=[ (vartype.WeightCluster, c) for c in kr ],
            variable_types=[solver.VariableType.CONTINUOUS] * k,
            obj_coeff=[ 1.0 ] * k )

        # - Variables Empty (c) (with c cluster)
        self._solver.add_variables(
            variables=[ (vartype.Empty, c) for c in kr ],
            variable_types=[solver.VariableType.BINARY] * k,
            obj_coeff=[ 0.0 ] * k )


        # # # # # # # # # # # # # # #
        # Prepare the CONSTRAINTS
        logging.info("Preparing ILP constraints...")
        # Each element is in a single cluster
        # \sum_k Cl_{i,k} = 1
        for i in elems:
            self._solver.add_constraint(
                lhs=[ (1, (vartype.ClusterAssignment, (i, c))) for c in kr ],
                sense=solver.ConstraintSense.EQ,
                rhs=1.0 )

        # W_{i,j,k} \le similarity of i and j if i co-clustered with j
        for i1,i2 in util.lower_triangle(elems):
            for c in kr:
                self._solver.add_constraint(
                    lhs=[ (self._schemas.similarities[i1][i2],
                           (vartype.ClusterAssignment, (i1, c))),
                          (-1, (vartype.Weight, (i1, i2, c))) ],
                    sense=solver.ConstraintSense.GE,
                    rhs=0.0 )
                self._solver.add_constraint(
                    lhs=[ (self._schemas.similarities[i1][i2],
                           (vartype.ClusterAssignment, (i2, c))),
                          (-1, (vartype.Weight, (i1, i2, c))) ],
                    sense=solver.ConstraintSense.GE,
                    rhs=0.0 )

        # The weight of a cluster is the sum of the weights
        for c in kr:
            self._solver.add_constraint(
                lhs=( [ (1, (vartype.Weight, (i1, i2, c)))
                        for i1, i2 in util.lower_triangle(elems) ] +
                      [ (-1, (vartype.WeightCluster, c)) ] ),
                sense=solver.ConstraintSense.EQ,
                rhs=0.0 )

        # Maximum cluster cardinality
        for c in kr:
            self._solver.add_constraint(
                lhs=[ (1, (vartype.ClusterAssignment, (i, c)))
                      for i in elems ],
                sense=solver.ConstraintSense.LE,
                rhs=self._constr.lu )

        # Maximum number of entities
        for c in kr:
            self._solver.add_constraint(
                lhs=[ (self._schemas.weights[i],
                       (vartype.ClusterAssignment, (i, c)))
                      for i in elems ],
                sense=solver.ConstraintSense.LE,
                rhs=self._constr.eu )

        # Minimum cluster cardinality OR empty
        #   - part 1: Empty_c can be 1 only if cluster c is empty
        for i,c in itertools.product(elems, kr):
            self._solver.add_constraint(
                lhs=[ (1, (vartype.Empty, c)),
                      (1, (vartype.ClusterAssignment, (i, c))) ],
                sense=solver.ConstraintSense.LE,
                rhs=1 )
        #   - part 2: Cardinality of cluster c is at least ll if not Empty_c
        for c in kr:
            self._solver.add_constraint(
                lhs=( [ (1, (vartype.ClusterAssignment, (i, c))) for i in elems ] +
                      [ (self._constr.ll, (vartype.Empty, c)) ] ),
                sense=solver.ConstraintSense.GE,
                rhs=self._constr.ll )

Example #4
    def prepare_ILP(self, ilp_solver, elems=None, k=None):
        vartype = LargeILPFormulation.VarType
        assert ilp_solver
        ## Get all the elements if they are not specified
        if not elems:
            elems = list(range(len(self._schemas.names)))
        ln = len(elems)
        if not k or k == "auto":
            k = int(math.ceil(float(ln) / self._constr.ll))
        kr = list(range(k))

        self._solver = ilp_solver
        self._elems = elems
        self._k = k

        logging.info("Solving [%s] with at most %d clusters...",
                     ", ".join((str(el) for el in self._elems)), self._k)

        self._solver.set_solver_obj(solver.ProblemType.MAXIMIZATION)

        # # # # # # # # # # # # # # #
        # Prepare the PARAMETERS
        nprime = self._constr.ll * ( self._k -
                                     int(math.ceil( float(ln) /
                                                    self._constr.lu ) ) )
        add_elems = list(range(-1, -nprime-1, -1))
        self._add_elems = add_elems
        all_elems = add_elems + elems
        ntot = ln + nprime


        # # # # # # # # # # # # # # #
        # Prepare the VARIABLES
        logging.info("Preparing ILP variables...")
        # - Variables ClusterAssignment (i, c) (with i true element and c cluster)
        self._solver.add_variables(
            variables=[ (vartype.ClusterAssignment, (i, c))
                        for i in elems
                        for c in kr ],
            variable_types=[solver.VariableType.BINARY] * (ln*k),
            obj_coeff=[ 0.0 ] * (ln*k) )

        # - Variables ClusterAssignment (i, c) (with i "false" element and c cluster)
        self._solver.add_variables(
            variables=[ (vartype.ClusterAssignment, (i, c))
                        for i in add_elems
                        for c in kr ],
            variable_types=[solver.VariableType.BINARY] * (nprime*k),
            obj_coeff=[ 0.0 ] * (nprime*k) )

        # - Variables Weight (i, j, c) (with i, j elements, and c cluster)
        self._solver.add_variables(
            variables=[ (vartype.Weight, (i1, i2, c))
                        for i1, i2 in util.lower_triangle(elems)
                        for c in kr ],
            variable_types=[solver.VariableType.CONTINUOUS] * (ln*(ln-1)//2*k),
            obj_coeff=[ 1.0 ] * (ln*(ln-1)//2*k) )


        # # # # # # # # # # # # # # #
        # Prepare the CONSTRAINTS
        logging.info("Preparing ILP constraints...")
        # Each true element is in a single cluster
        # \sum_k Cl_{i,k} = 1
        for i in elems:
            self._solver.add_constraint(
                lhs=[ (1, (vartype.ClusterAssignment, (i, c))) for c in kr ],
                sense=solver.ConstraintSense.EQ,
                rhs=1.0 )

        # Each false element is in at most one cluster
        # \sum_k Cl_{i,k} \le 1
        for i in add_elems:
            self._solver.add_constraint(
                lhs=[ (1, (vartype.ClusterAssignment, (i, c))) for c in kr ],
                sense=solver.ConstraintSense.LE,
                rhs=1.0 )

        # True and false elements are not co-clustered
        # Cl_{i,c} + Cl_{j,c} \le 1   for every cluster c, with i real and j false element
        for i, j in itertools.product(elems, add_elems):
            for c in kr:
                self._solver.add_constraint(
                    lhs=( [ (1, (vartype.ClusterAssignment, (i, c))),
                            (1, (vartype.ClusterAssignment, (j, c))) ] ),
                    sense=solver.ConstraintSense.LE,
                    rhs=1.0 )

        # W_{i,j,k} \le similarity of i and j if i co-clustered with j
        for i1,i2 in util.lower_triangle(elems):
            for c in kr:
                self._solver.add_constraint(
                    lhs=[ (self._schemas.similarities[i1][i2],
                           (vartype.ClusterAssignment, (i1, c))),
                          (-1, (vartype.Weight, (i1, i2, c))) ],
                    sense=solver.ConstraintSense.GE,
                    rhs=0.0 )
                self._solver.add_constraint(
                    lhs=[ (self._schemas.similarities[i1][i2],
                           (vartype.ClusterAssignment, (i2, c))),
                          (-1, (vartype.Weight, (i1, i2, c))) ],
                    sense=solver.ConstraintSense.GE,
                    rhs=0.0 )

        # Maximum cluster cardinality
        for c in kr:
            self._solver.add_constraint(
                lhs=[ (1, (vartype.ClusterAssignment, (i, c)))
                      for i in all_elems ],
                sense=solver.ConstraintSense.LE,
                rhs=self._constr.lu )

        # Maximum number of entities
        for c in kr:
            self._solver.add_constraint(
                lhs=[ (self._schemas.weights[i],
                       (vartype.ClusterAssignment, (i, c)))
                      for i in elems ],
                sense=solver.ConstraintSense.LE,
                rhs=self._constr.eu )

        # Minimum cluster cardinality
        for c in kr:
            self._solver.add_constraint(
                lhs=[ (1, (vartype.ClusterAssignment, (i, c)))
                      for i in all_elems ],
                sense=solver.ConstraintSense.GE,
                rhs=self._constr.ll )
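
A small worked example (made-up numbers) of the padding computed in the PARAMETERS step above: nprime "false" elements are added so that every one of the k clusters can reach the minimum cardinality ll even when the real elements alone could not fill them.

import math

# Illustrative values only: 7 real schemas, clusters of 2..3 elements.
ln, ll, lu = 7, 2, 3
k = int(math.ceil(float(ln) / ll))                   # ceil(7/2) = 4 clusters
nprime = ll * (k - int(math.ceil(float(ln) / lu)))   # 2 * (4 - 3) = 2 false elements
add_elems = list(range(-1, -nprime - 1, -1))         # negative ids: [-1, -2]
print(k, nprime, add_elems)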
Example #5
    def recompute_similarities(self):
        self.cluster_similarities = [ sum( self.instance.similarities[i1][i2]
                                           for i1,i2 in util.lower_triangle(cluster) )
                                      for cluster in self.clusters ]
        self.total_similarity = sum(self.cluster_similarities)
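
A self-contained sketch of what recompute_similarities produces, using made-up data and itertools.combinations as a stand-in for util.lower_triangle:

import itertools

# Toy symmetric schema-similarity matrix over 4 schemas, split into two clusters.
similarities = [[0.0, 0.5, 0.1, 0.2],
                [0.5, 0.0, 0.4, 0.3],
                [0.1, 0.4, 0.0, 0.6],
                [0.2, 0.3, 0.6, 0.0]]
clusters = [[0, 1], [2, 3]]

cluster_similarities = [ sum(similarities[i1][i2]
                             for i1, i2 in itertools.combinations(cluster, 2))
                         for cluster in clusters ]
print(cluster_similarities)        # [0.5, 0.6]
print(sum(cluster_similarities))   # 1.1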