예제 #1
0
    def transform(self, X):
        """Calculate the kernel matrix, between given and fitted dataset.

        Parameters
        ----------
        X : iterable
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that fitting the given graph
            format). If None the kernel matrix is calculated upon fit data.
            The test samples.

        Returns
        -------
        K : numpy array, shape = [n_targets, n_input_graphs]
            corresponding to the kernel matrix, a calculation between
            all pairs of graphs between target an features

        """
        self._method_calling = 3
        # Check is fit had been called
        check_is_fitted(self, ['X'])

        # Input validation and parsing
        if X is None:
            raise ValueError('`transform` input cannot be None')
        else:
            if not isinstance(X, collections.Iterable):
                raise TypeError('input must be an iterable\n')

            i = 0
            out = list()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and len(x) in [0, 1, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: '
                                      + str(idx))
                        continue
                    elif len(x) == 1:
                        warnings.warn(
                            'Ignoring empty element on index: '
                            + str(i) + '\nLabels must be provided.')
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                        vertices = list(x.get_vertices(purpose="any"))
                        Labels = x.get_labels(purpose="any")
                elif type(x) is Graph:
                    vertices = list(x.get_vertices(purpose="any"))
                    Labels = x.get_labels(purpose="any")
                else:
                    raise TypeError('each element of X must be either '
                                    'a graph object or a list with at '
                                    'least a graph like object and '
                                    'node labels dict \n')

                # Hash based on the labels of fit
                new_labels = {v: self._labels_hash_dict.get(l, None)
                              for v, l in iteritems(Labels)}

                # Radix sort the other
                g = ((vertices, new_labels) +
                     ({n: x.neighbors(n, purpose="any")
                       for n in vertices},))

                gr = {0: self.NH_(g)}
                for r in range(1, self.R):
                    gr[r] = self.NH_(gr[r-1])

                # save the output for all levels
                out.append(gr)
                i += 1

                if i == 0:
                    raise ValueError('parsed input is empty')

        # Transform - calculate kernel matrix
        # Output is always normalized
        km = self._calculate_kernel_matrix(out)
        self._is_transformed = True
        return km
예제 #2
0
    def fit(self, X, y=None):
        """Fit a dataset, for a transformer.

        Parameters
        ----------
        X : iterable
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that fitting the given graph
            format). The train samples.

        y : None
            There is no need of a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        self : object
        Returns self.

        """
        self._method_calling = 1
        self._is_transformed = False
        # Input validation and parsing
        self.initialize()
        if X is None:
            raise ValueError('`fit` input cannot be None')
        else:
            if not isinstance(X, collections.Iterable):
                raise TypeError('input must be an iterable\n')

            i = 0
            out = list()
            gs = list()
            self._labels_hash_dict, labels_hash_set = dict(), set()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and len(x) in [0, 1, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: '
                                      + str(idx))
                        continue
                    elif len(x) == 1:
                        warnings.warn(
                            'Ignoring empty element on index: '
                            + str(i) + '\nLabels must be provided.')
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                        vertices = list(x.get_vertices(purpose="any"))
                        Labels = x.get_labels(purpose="any")
                elif type(x) is Graph:
                    vertices = list(x.get_vertices(purpose="any"))
                    Labels = x.get_labels(purpose="any")
                else:
                    raise TypeError('each element of X must be either '
                                    'a graph object or a list with at '
                                    'least a graph like object and '
                                    'node labels dict \n')

                g = (vertices, Labels,
                     {n: x.neighbors(n, purpose="any") for n in vertices})

                # collect all the labels
                labels_hash_set |= set(itervalues(Labels))
                gs.append(g)
                i += 1

            if i == 0:
                raise ValueError('parsed input is empty')

            # Hash labels
            if len(labels_hash_set) > self._max_number:
                warnings.warn('Number of labels is smaller than'
                              'the biggest possible.. '
                              'Collisions will appear on the '
                              'new labels.')

                # If labels exceed the biggest possible size
                nl, nrl = list(), len(labels_hash_set)
                while nrl > self._max_number:
                    nl += self.random_state_.choice(self._max_number,
                                                    self._max_number,
                                                    replace=False).tolist()
                    nrl -= self._max_number
                if nrl > 0:
                    nl += self.random_state_.choice(self._max_number,
                                                    nrl,
                                                    replace=False).tolist()
                # unify the collisions per element.

            else:
                # else draw n random numbers.
                nl = self.random_state_.choice(self._max_number, len(labels_hash_set),
                                               replace=False).tolist()

            self._labels_hash_dict = dict(zip(labels_hash_set, nl))

            # for all graphs
            for vertices, labels, neighbors in gs:
                new_labels = {v: self._labels_hash_dict[l]
                              for v, l in iteritems(labels)}
                g = (vertices, new_labels, neighbors,)
                gr = {0: self.NH_(g)}
                for r in range(1, self.R):
                    gr[r] = self.NH_(gr[r-1])

                # save the output for all levels
                out.append(gr)

        self.X = out

        # Return the transformer
        return self
예제 #3
0
    def parse_input(self, X):
        """Parse and create features for the NSPD kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        M : dict
            A dictionary with keys all the distances from 0 to self.d
            and values the the np.arrays with rows corresponding to the
            non-null input graphs and columns to the enumerations of tuples
            consisting of pairs of hash values and radius, from all the given
            graphs of the input (plus the fitted one's on transform).

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            # Hold the number of graphs
            ng = 0

            # Holds all the data for combinations of r, d
            data = collections.defaultdict(dict)

            # Index all keys for combinations of r, d
            all_keys = collections.defaultdict(dict)
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, collections.Iterable):
                    is_iter, x = True, list(x)
                if is_iter and len(x) in [0, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element' +
                                      ' on index: ' + str(idx))
                        continue
                    else:
                        g = Graph(x[0], x[1], x[2])
                        g.change_format("adjacency")
                elif type(x) is Graph:
                    g = Graph(
                        x.get_adjacency_matrix(),
                        x.get_labels(purpose="adjacency", label_type="vertex"),
                        x.get_labels(purpose="adjacency", label_type="edge"))
                else:
                    raise TypeError('each element of X must have either ' +
                                    'a graph with labels for node and edge ' +
                                    'or 3 elements consisting of a graph ' +
                                    'type object, labels for vertices and ' +
                                    'labels for edges.')

                # Bring to the desired format
                g.change_format(self._graph_format)

                # Take the vertices
                vertices = set(g.get_vertices(purpose=self._graph_format))

                # Extract the dicitionary
                ed = g.get_edge_dictionary()

                # Convert edges to tuples
                edges = {(j, k) for j in ed.keys() for k in ed[j].keys()}

                # Extract labels for nodes
                Lv = g.get_labels(purpose=self._graph_format)
                # and for edges
                Le = g.get_labels(purpose=self._graph_format,
                                  label_type="edge")

                # Produce all the neighborhoods and the distance pairs
                # up to the desired radius and maximum distance
                N, D, D_pair = g.produce_neighborhoods(self.r,
                                                       purpose="dictionary",
                                                       with_distances=True,
                                                       d=self.d)

                # Hash all the neighborhoods
                H = self._hash_neighborhoods(vertices, edges, Lv, Le, N,
                                             D_pair)

                if self._method_calling == 1:
                    for d in filterfalse(lambda x: x not in D,
                                         range(self.d + 1)):
                        for (A, B) in D[d]:
                            for r in range(self.r + 1):
                                key = (H[r, A], H[r, B])
                                keys = all_keys[r, d]
                                idx = keys.get(key, None)
                                if idx is None:
                                    idx = len(keys)
                                    keys[key] = idx
                                data[r, d][ng, idx] = data[r, d].get(
                                    (ng, idx), 0) + 1

                elif self._method_calling == 3:
                    for d in filterfalse(lambda x: x not in D,
                                         range(self.d + 1)):
                        for (A, B) in D[d]:
                            # Based on the edges of the bidirected graph
                            for r in range(self.r + 1):
                                keys = all_keys[r, d]
                                fit_keys = self._fit_keys[r, d]
                                key = (H[r, A], H[r, B])
                                idx = fit_keys.get(key, None)
                                if idx is None:
                                    idx = keys.get(key, None)
                                    if idx is None:
                                        idx = len(keys) + len(fit_keys)
                                        keys[key] = idx
                                data[r, d][ng, idx] = data[r, d].get(
                                    (ng, idx), 0) + 1
                ng += 1
            if ng == 0:
                raise ValueError('parsed input is empty')

            if self._method_calling == 1:
                # A feature matrix for all levels
                M = dict()

                for (key, d) in filterfalse(lambda a: len(a[1]) == 0,
                                            iteritems(data)):
                    indexes, data = zip(*iteritems(d))
                    rows, cols = zip(*indexes)
                    M[key] = csr_matrix((data, (rows, cols)),
                                        shape=(ng, len(all_keys[key])),
                                        dtype=np.int64)
                self._fit_keys = all_keys
                self._ngx = ng

            elif self._method_calling == 3:
                # A feature matrix for all levels
                M = dict()

                for (key, d) in filterfalse(lambda a: len(a[1]) == 0,
                                            iteritems(data)):
                    indexes, data = zip(*iteritems(d))
                    rows, cols = zip(*indexes)
                    M[key] = csr_matrix(
                        (data, (rows, cols)),
                        shape=(ng,
                               len(all_keys[key]) + len(self._fit_keys[key])),
                        dtype=np.int64)

                self._ngy = ng

            return M