Exemplo n.º 1
0
 def __init__(self,
              compiled=False,
              learning_rate=0.002,
              n_features_hidden_factor=10,
              max_n_clusters=8,
              n_iter=20,
              inclusion_threshold=0.7,
              random_state=1):
     """Record the configuration values and instantiate the components.

     ``compiled``, ``learning_rate`` and ``n_features_hidden_factor`` are
     forwarded unchanged to ``Embedder2D``. ``max_n_clusters``, ``n_iter``,
     ``inclusion_threshold`` and ``random_state`` are stored as attributes
     of the same name.
     """
     # Plain configuration values kept on the instance.
     self.random_state = random_state
     self.inclusion_threshold = inclusion_threshold
     self.n_iter = n_iter
     self.max_n_clusters = max_n_clusters

     # Components, created in the same order as before.
     self.feature_selector = IteratedSemiSupervisedFeatureSelection()
     self.feature_constructor = Projector()
     embedder_options = dict(
         compiled=compiled,
         learning_rate=learning_rate,
         n_features_hidden_factor=n_features_hidden_factor)
     self.embedder = Embedder2D(**embedder_options)
     self.evaluator = LocalEmbeddingEvaluator()
     self.refiner = LocalEmbeddingRefiner()
     self.auto_cluster = AutoCluster()
Exemplo n.º 2
0
class AutoEmbedder(object):
    """
    Transform a set of high dimensional vectors to a set of two dimensional vectors.

    The data matrix is transformed using the feature_constructor prior to the application of
    the embedder in 2D.

    Several randomized configurations of the feature constructor and of the
    embedder are tried (see ``optimize``); the configuration with the highest
    embedding quality score is kept.
    """

    def __init__(self,
                 compiled=False,
                 learning_rate=0.002,
                 n_features_hidden_factor=10,
                 max_n_clusters=8,
                 n_iter=20,
                 inclusion_threshold=0.7,
                 random_state=1):
        """Store the configuration and instantiate the pipeline components.

        Parameters
        ----------
        compiled, learning_rate, n_features_hidden_factor :
            Forwarded unchanged to ``Embedder2D``.
        max_n_clusters :
            Passed to ``auto_cluster.optimize`` by ``predict``.
        n_iter :
            Number of randomized configurations tried by ``fit_transform``.
        inclusion_threshold :
            Passed as ``amount`` to ``randomize`` at every iteration.
        random_state :
            Stored on the instance; no method of this class reads it.
        """
        self.max_n_clusters = max_n_clusters
        self.n_iter = n_iter
        self.inclusion_threshold = inclusion_threshold
        self.random_state = random_state
        self.feature_selector = IteratedSemiSupervisedFeatureSelection()
        self.feature_constructor = Projector()
        self.embedder = Embedder2D(
            compiled=compiled,
            learning_rate=learning_rate,
            n_features_hidden_factor=n_features_hidden_factor)
        self.evaluator = LocalEmbeddingEvaluator()
        self.refiner = LocalEmbeddingRefiner()
        self.auto_cluster = AutoCluster()

    def __repr__(self):
        """Return a multi-line summary of the settings and sub-components."""
        serial = [
            'Embedder:',
            'inclusion_threshold: %.3f' % (self.inclusion_threshold),
            'n_iter: %d' % (self.n_iter),
            'max_n_clusters: %d' % (self.max_n_clusters),
            '-' * 80,
            str(self.feature_constructor),
            '-' * 80,
            str(self.embedder),
            '=' * 80,
        ]
        return '\n'.join(serial)

    def fit(self, data_matrix, target=None):
        """Not supported; use ``fit_transform`` or ``predict`` instead.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError("Should have implemented this")

    def fit_transform(self, data_matrix, target=None):
        """Search for the best embedding of ``data_matrix`` and return it.

        When ``target`` is given, the feature selector is fitted and applied
        first; otherwise the data matrix is used as it is. The result of
        ``optimize`` is stored in ``self.data_matrix`` and returned.
        """
        logger.info('Input data matrix: %d rows  %d cols',
                    data_matrix.shape[0], data_matrix.shape[1])
        # NOTE: no sparse-to-dense conversion is performed here; the input is
        # handed to the downstream components unchanged.
        if target is not None:
            data_matrix_feature_select = self.feature_selector.fit_transform(
                data_matrix, target)
            logger.info('Feature selection')
            logger.info('Data matrix: %d rows  %d cols',
                        data_matrix_feature_select.shape[0],
                        data_matrix_feature_select.shape[1])
        else:
            data_matrix_feature_select = data_matrix

        self.data_matrix = self.optimize(data_matrix_feature_select,
                                         target=target,
                                         n_iter=self.n_iter)
        # Lazy logging argument: the representation is only built when the
        # record is actually emitted.
        logger.debug('%s', self)
        return self.data_matrix

    def transform(self, data_matrix, target=None):
        """Embed ``data_matrix`` with the current configuration.

        The feature constructor and the embedder are re-fitted on every call
        (both are invoked through their ``fit_transform``). The low
        dimensional result is then refined against the original data matrix
        and scored; the quality is stored in ``self.score`` and
        ``self.scores``.

        Returns the refined low dimensional data matrix.
        """
        data_matrix_feature_constr = self.feature_constructor.fit_transform(
            data_matrix, target=target)
        data_matrix_lowdim = self.embedder.fit_transform(
            data_matrix_feature_constr)
        data_matrix_out = self.refiner.embedding_refinement(data_matrix,
                                                            data_matrix_lowdim,
                                                            n_neighbors=8,
                                                            emb_quality_th=1,
                                                            n_iter=20)
        self.score, self.scores = self.evaluator.averaged_embedding_quality_score(
            data_matrix,
            data_matrix_out,
            n_neighbor_list=[10, 30],
            return_scores=True)
        return data_matrix_out

    def predict(self, data_matrix, target=None):
        """Embed ``data_matrix`` and cluster the embedding.

        Runs ``fit_transform``, then ``auto_cluster.optimize`` on the stored
        embedding with at most ``self.max_n_clusters`` clusters. The
        predictions are stored in ``self.predictions`` and returned.
        """
        self.fit_transform(data_matrix, target=target)
        self.auto_cluster.optimize(self.data_matrix,
                                   max_n_clusters=self.max_n_clusters)
        self.predictions = self.auto_cluster.predictions
        logger.debug('embedding score: %.4f', self.score)
        return self.predictions

    def randomize(self, data_matrix, amount=1):
        """Randomize the feature constructor and the embedder."""
        self.feature_constructor.randomize(data_matrix, amount=amount)
        self.embedder.randomize(data_matrix, amount=amount)

    def optimize(self, data_matrix, target=None, n_iter=20):
        """Keep the best of ``n_iter`` randomized configurations.

        The object state recorded for the best iteration is restored on
        ``self`` and the corresponding embedding is returned.

        Raises
        ------
        ValueError
            If no iteration succeeded, so that there is nothing to select.
        """
        best = None
        for candidate in self._optimize(data_matrix, target=target, n_iter=n_iter):
            # Rank on (score, iter_id) only: on equal score the later
            # iteration wins, and the payload (matrix, state snapshot) is
            # never compared.
            if best is None or candidate[:2] > best[:2]:
                best = candidate
        if best is None:
            raise ValueError(
                'No optimization iteration succeeded (n_iter=%s): '
                'cannot select an embedding' % n_iter)
        data_matrix_out, obj_dict = best[2], best[3]
        self.__dict__.update(obj_dict)
        return data_matrix_out

    def _optimize(self, data_matrix, target=None, n_iter=None):
        """Yield ``(score, iter_id, embedding, state)`` per successful iteration.

        ``state`` is a deep copy of ``self.__dict__`` taken right after the
        iteration. Failing iterations are logged at debug level and skipped.
        ``n_iter=None`` falls back to ``self.n_iter``.
        """
        if n_iter is None:
            n_iter = self.n_iter
        for iter_id in range(1, n_iter + 1):
            try:
                self.randomize(data_matrix, amount=self.inclusion_threshold)
                data_matrix_out = self.transform(data_matrix, target=target)
                score = self.score
                yield (score, iter_id, data_matrix_out,
                       deepcopy(self.__dict__))
            except Exception as e:
                # Deliberate best effort: a failed configuration must not
                # abort the whole search.
                logger.debug('Failed iteration. Reason: %s', e)
                logger.debug('Exception', exc_info=True)
                logger.debug('Current object status:')
                logger.debug('%s', self)
                logger.debug('*' * 80)
Exemplo n.º 3
0
class AutoEmbedder(object):

    """
    Transform a set of high dimensional vectors to a set of two dimensional vectors.

    The data matrix is transformed using the feature_constructor prior to the application of
    the embedder in 2D.

    Several randomized configurations of the feature constructor and of the
    embedder are tried (see ``optimize``); the configuration with the highest
    embedding quality score is kept.
    """

    def __init__(self,
                 compiled=False,
                 learning_rate=0.002,
                 n_features_hidden_factor=10,
                 max_n_clusters=8,
                 n_iter=20,
                 inclusion_threshold=0.7,
                 random_state=1):
        """Store the configuration and instantiate the pipeline components.

        Parameters
        ----------
        compiled, learning_rate, n_features_hidden_factor :
            Forwarded unchanged to ``Embedder2D``.
        max_n_clusters :
            Passed to ``auto_cluster.optimize`` by ``predict``.
        n_iter :
            Number of randomized configurations tried by ``fit_transform``.
        inclusion_threshold :
            Passed as ``amount`` to ``randomize`` at every iteration.
        random_state :
            Stored on the instance; no method of this class reads it.
        """
        self.max_n_clusters = max_n_clusters
        self.n_iter = n_iter
        self.inclusion_threshold = inclusion_threshold
        self.random_state = random_state
        self.feature_selector = IteratedSemiSupervisedFeatureSelection()
        self.feature_constructor = Projector()
        self.embedder = Embedder2D(compiled=compiled,
                                   learning_rate=learning_rate,
                                   n_features_hidden_factor=n_features_hidden_factor)
        self.evaluator = LocalEmbeddingEvaluator()
        self.refiner = LocalEmbeddingRefiner()
        self.auto_cluster = AutoCluster()

    def __repr__(self):
        """Return a multi-line summary of the settings and sub-components."""
        serial = [
            'Embedder:',
            'inclusion_threshold: %.3f' % (self.inclusion_threshold),
            'n_iter: %d' % (self.n_iter),
            'max_n_clusters: %d' % (self.max_n_clusters),
            '-' * 80,
            str(self.feature_constructor),
            '-' * 80,
            str(self.embedder),
            '=' * 80,
        ]
        return '\n'.join(serial)

    def fit(self, data_matrix, target=None):
        """Not supported; use ``fit_transform`` or ``predict`` instead.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError("Should have implemented this")

    def fit_transform(self, data_matrix, target=None):
        """Search for the best embedding of ``data_matrix`` and return it.

        When ``target`` is given, the feature selector is fitted and applied
        first; otherwise the data matrix is used as it is. The result of
        ``optimize`` is stored in ``self.data_matrix`` and returned.
        """
        logger.info('Input data matrix: %d rows  %d cols',
                    data_matrix.shape[0], data_matrix.shape[1])
        # NOTE: no sparse-to-dense conversion is performed here; the input is
        # handed to the downstream components unchanged.
        if target is not None:
            data_matrix_feature_select = self.feature_selector.fit_transform(data_matrix, target)
            logger.info('Feature selection')
            logger.info('Data matrix: %d rows  %d cols',
                        data_matrix_feature_select.shape[0], data_matrix_feature_select.shape[1])
        else:
            data_matrix_feature_select = data_matrix

        self.data_matrix = self.optimize(data_matrix_feature_select, target=target, n_iter=self.n_iter)
        # Lazy logging argument: the representation is only built when the
        # record is actually emitted.
        logger.debug('%s', self)
        return self.data_matrix

    def transform(self, data_matrix, target=None):
        """Embed ``data_matrix`` with the current configuration.

        The feature constructor and the embedder are re-fitted on every call
        (both are invoked through their ``fit_transform``). The low
        dimensional result is then refined against the original data matrix
        and scored; the quality is stored in ``self.score`` and
        ``self.scores``.

        Returns the refined low dimensional data matrix.
        """
        data_matrix_feature_constr = self.feature_constructor.fit_transform(data_matrix, target=target)
        data_matrix_lowdim = self.embedder.fit_transform(data_matrix_feature_constr)
        data_matrix_out = self.refiner.embedding_refinement(data_matrix,
                                                            data_matrix_lowdim,
                                                            n_neighbors=8,
                                                            emb_quality_th=1,
                                                            n_iter=20)
        self.score, self.scores = self.evaluator.averaged_embedding_quality_score(data_matrix,
                                                                                  data_matrix_out,
                                                                                  n_neighbor_list=[10, 30],
                                                                                  return_scores=True)
        return data_matrix_out

    def predict(self, data_matrix, target=None):
        """Embed ``data_matrix`` and cluster the embedding.

        Runs ``fit_transform``, then ``auto_cluster.optimize`` on the stored
        embedding with at most ``self.max_n_clusters`` clusters. The
        predictions are stored in ``self.predictions`` and returned.
        """
        self.fit_transform(data_matrix, target=target)
        self.auto_cluster.optimize(self.data_matrix, max_n_clusters=self.max_n_clusters)
        self.predictions = self.auto_cluster.predictions
        logger.debug('embedding score: %.4f', self.score)
        return self.predictions

    def randomize(self, data_matrix, amount=1):
        """Randomize the feature constructor and the embedder."""
        self.feature_constructor.randomize(data_matrix, amount=amount)
        self.embedder.randomize(data_matrix, amount=amount)

    def optimize(self, data_matrix, target=None, n_iter=20):
        """Keep the best of ``n_iter`` randomized configurations.

        The object state recorded for the best iteration is restored on
        ``self`` and the corresponding embedding is returned.

        Raises
        ------
        ValueError
            If no iteration succeeded, so that there is nothing to select.
        """
        best = None
        for candidate in self._optimize(data_matrix, target=target, n_iter=n_iter):
            # Rank on (score, iter_id) only: on equal score the later
            # iteration wins, and the payload (matrix, state snapshot) is
            # never compared.
            if best is None or candidate[:2] > best[:2]:
                best = candidate
        if best is None:
            raise ValueError(
                'No optimization iteration succeeded (n_iter=%s): '
                'cannot select an embedding' % n_iter)
        data_matrix_out, obj_dict = best[2], best[3]
        self.__dict__.update(obj_dict)
        return data_matrix_out

    def _optimize(self, data_matrix, target=None, n_iter=None):
        """Yield ``(score, iter_id, embedding, state)`` per successful iteration.

        ``state`` is a deep copy of ``self.__dict__`` taken right after the
        iteration. Failing iterations are logged at debug level and skipped.
        ``n_iter=None`` falls back to ``self.n_iter``.
        """
        if n_iter is None:
            n_iter = self.n_iter
        for iter_id in range(1, n_iter + 1):
            try:
                self.randomize(data_matrix, amount=self.inclusion_threshold)
                data_matrix_out = self.transform(data_matrix, target=target)
                score = self.score
                yield (score, iter_id, data_matrix_out, deepcopy(self.__dict__))
            except Exception as e:
                # Deliberate best effort: a failed configuration must not
                # abort the whole search.
                logger.debug('Failed iteration. Reason: %s', e)
                logger.debug('Exception', exc_info=True)
                logger.debug('Current object status:')
                logger.debug('%s', self)
                logger.debug('*' * 80)