def __init__(self, compiled=False, learning_rate=0.002,
             n_features_hidden_factor=10, max_n_clusters=8, n_iter=20,
             inclusion_threshold=0.7, random_state=1):
    """Record the settings and instantiate the pipeline components.

    Args:
        compiled: forwarded unchanged to ``Embedder2D``.
        learning_rate: forwarded unchanged to ``Embedder2D``.
        n_features_hidden_factor: forwarded unchanged to ``Embedder2D``.
        max_n_clusters: stored on the instance as ``max_n_clusters``.
        n_iter: stored on the instance as ``n_iter``.
        inclusion_threshold: stored on the instance as
            ``inclusion_threshold``.
        random_state: stored on the instance as ``random_state``.
    """
    # Scalar settings are kept verbatim under the same names.
    self.max_n_clusters = max_n_clusters
    self.n_iter = n_iter
    self.inclusion_threshold = inclusion_threshold
    self.random_state = random_state

    # Only the embedder takes configuration; every other component is
    # created with its own defaults.
    embedder_options = {
        'compiled': compiled,
        'learning_rate': learning_rate,
        'n_features_hidden_factor': n_features_hidden_factor,
    }

    # Components are created in the same order as before.
    self.feature_selector = IteratedSemiSupervisedFeatureSelection()
    self.feature_constructor = Projector()
    self.embedder = Embedder2D(**embedder_options)
    self.evaluator = LocalEmbeddingEvaluator()
    self.refiner = LocalEmbeddingRefiner()
    self.auto_cluster = AutoCluster()
class AutoEmbedder(object):
    """Transform high dimensional vectors into two dimensional vectors.

    The data matrix is transformed using the feature_constructor prior to
    the application of the embedder in 2D. The embedding is then passed to
    the refiner and scored by the evaluator. ``fit_transform`` repeats this
    pipeline ``n_iter`` times, randomizing the feature constructor and the
    embedder before each attempt, and keeps the attempt with the highest
    score.

    Attributes set by ``__init__``:
        max_n_clusters: passed to ``auto_cluster.optimize`` in ``predict``.
        n_iter: number of attempts requested by ``fit_transform``.
        inclusion_threshold: passed as ``amount`` to ``randomize`` before
            each attempt.
        random_state: stored on the instance; not read by any method of
            this class.
    """

    def __init__(self, compiled=False, learning_rate=0.002,
                 n_features_hidden_factor=10, max_n_clusters=8, n_iter=20,
                 inclusion_threshold=0.7, random_state=1):
        """Store the settings and build the pipeline components.

        ``compiled``, ``learning_rate`` and ``n_features_hidden_factor``
        are forwarded unchanged to ``Embedder2D``; the remaining arguments
        are stored on the instance under the same names.
        """
        self.max_n_clusters = max_n_clusters
        self.n_iter = n_iter
        self.inclusion_threshold = inclusion_threshold
        self.random_state = random_state
        self.feature_selector = IteratedSemiSupervisedFeatureSelection()
        self.feature_constructor = Projector()
        self.embedder = Embedder2D(
            compiled=compiled,
            learning_rate=learning_rate,
            n_features_hidden_factor=n_features_hidden_factor)
        self.evaluator = LocalEmbeddingEvaluator()
        self.refiner = LocalEmbeddingRefiner()
        self.auto_cluster = AutoCluster()

    def __repr__(self):
        """Return a multi-line summary of the settings and sub-components."""
        serial = [
            'Embedder:',
            'inclusion_threshold: %.3f' % self.inclusion_threshold,
            'n_iter: %d' % self.n_iter,
            'max_n_clusters: %d' % self.max_n_clusters,
            '-' * 80,
            str(self.feature_constructor),
            '-' * 80,
            str(self.embedder),
            '=' * 80,
        ]
        return '\n'.join(serial)

    def fit(self, data_matrix, target=None):
        """Not supported: always raises ``NotImplementedError``.

        Use ``fit_transform`` or ``predict`` instead.
        """
        raise NotImplementedError("Should have implemented this")

    def fit_transform(self, data_matrix, target=None):
        """Return the best-scoring embedding of ``data_matrix``.

        When ``target`` is given, ``data_matrix`` is first passed through
        ``feature_selector.fit_transform``. The (possibly reduced) matrix is
        then handed to ``optimize`` with ``self.n_iter`` attempts. The
        result is stored in ``self.data_matrix`` and returned.

        Raises:
            ValueError: propagated from ``optimize`` when no attempt
                succeeds.
        """
        logger.info('Input data matrix: %d rows %d cols',
                    data_matrix.shape[0], data_matrix.shape[1])
        # if sparse.issparse(data_matrix):
        #     logger.info('Convert matrix format from sparse to dense using random projections')
        #     data_matrix = SparseRandomProjection().fit_transform(data_matrix).toarray()
        #     logger.info('Data matrix: %d rows %d cols' %
        #                 (data_matrix.shape[0], data_matrix.shape[1]))
        if target is not None:
            data_matrix_feature_select = self.feature_selector.fit_transform(
                data_matrix, target)
            logger.info('Feature selection')
            logger.info('Data matrix: %d rows %d cols',
                        data_matrix_feature_select.shape[0],
                        data_matrix_feature_select.shape[1])
        else:
            data_matrix_feature_select = data_matrix
        self.data_matrix = self.optimize(data_matrix_feature_select,
                                         target=target,
                                         n_iter=self.n_iter)
        # %r defers the __repr__ call until a handler emits the record.
        logger.debug('%r', self)
        return self.data_matrix

    def transform(self, data_matrix, target=None):
        """Run one construct / embed / refine / score pass.

        Both the feature constructor and the embedder are re-fitted on
        every call (``fit_transform`` is invoked on each). The quality of
        the refined embedding is stored in ``self.score`` and
        ``self.scores``.

        Returns:
            The refined embedding produced by ``refiner``.
        """
        data_matrix_feature_constr = self.feature_constructor.fit_transform(
            data_matrix, target=target)
        data_matrix_lowdim = self.embedder.fit_transform(
            data_matrix_feature_constr)
        data_matrix_out = self.refiner.embedding_refinement(
            data_matrix,
            data_matrix_lowdim,
            n_neighbors=8,
            emb_quality_th=1,
            n_iter=20)
        self.score, self.scores = \
            self.evaluator.averaged_embedding_quality_score(
                data_matrix,
                data_matrix_out,
                n_neighbor_list=[10, 30],
                return_scores=True)
        return data_matrix_out

    def predict(self, data_matrix, target=None):
        """Embed ``data_matrix`` and return ``auto_cluster``'s predictions.

        Runs ``fit_transform`` and then ``auto_cluster.optimize`` on the
        resulting embedding with ``self.max_n_clusters``. The predictions
        are stored in ``self.predictions`` and returned.
        """
        self.fit_transform(data_matrix, target=target)
        self.auto_cluster.optimize(self.data_matrix,
                                   max_n_clusters=self.max_n_clusters)
        self.predictions = self.auto_cluster.predictions
        logger.debug('embedding score: %.4f', self.score)
        return self.predictions

    def randomize(self, data_matrix, amount=1):
        """Randomize the feature constructor and the embedder.

        ``data_matrix`` and ``amount`` are forwarded unchanged to the
        ``randomize`` method of both components.
        """
        self.feature_constructor.randomize(data_matrix, amount=amount)
        self.embedder.randomize(data_matrix, amount=amount)

    def optimize(self, data_matrix, target=None, n_iter=20):
        """Keep the best of ``n_iter`` randomized embedding attempts.

        The instance state is restored from the snapshot taken right after
        the winning attempt, so ``score``, ``scores`` and the component
        objects correspond to the returned embedding.

        Returns:
            The embedding of the attempt with the highest score; equal
            scores are resolved in favour of the later attempt.

        Raises:
            ValueError: if no attempt succeeded (all of them failed, or
                ``n_iter`` is 0).
        """
        best = None
        for candidate in self._optimize(data_matrix, target=target,
                                        n_iter=n_iter):
            # Rank on (score, iter_id) only, so the embedding and the
            # state snapshot are never compared with each other.
            if best is None or candidate[:2] > best[:2]:
                best = candidate
        if best is None:
            raise ValueError(
                'No successful iteration was produced; '
                'see the debug log for failure reasons.')
        _, _, data_matrix_out, obj_dict = best
        self.__dict__.update(obj_dict)
        return data_matrix_out

    def _optimize(self, data_matrix, target=None, n_iter=None):
        """Yield one result tuple per successful embedding attempt.

        Each item is ``(score, iter_id, data_matrix_out, state)`` where
        ``state`` is a deep copy of ``self.__dict__`` taken after the
        attempt. ``n_iter=None`` falls back to ``self.n_iter``.
        """
        if n_iter is None:
            n_iter = self.n_iter
        for iter_id in range(1, n_iter + 1):
            try:
                self.randomize(data_matrix, amount=self.inclusion_threshold)
                data_matrix_out = self.transform(data_matrix, target=target)
                score = self.score
                snapshot = deepcopy(self.__dict__)
            except Exception as e:
                # Best effort on purpose: a failed attempt is logged and
                # skipped so the remaining attempts can still run.
                logger.debug('Failed iteration. Reason: %s', e)
                logger.debug('Exception', exc_info=True)
                logger.debug('Current object status:')
                logger.debug(self.__repr__())
                logger.debug('*' * 80)
            else:
                yield (score, iter_id, data_matrix_out, snapshot)
class AutoEmbedder(object):
    """Transform high dimensional vectors into two dimensional vectors.

    The data matrix is transformed using the feature_constructor prior to
    the application of the embedder in 2D. The embedding is then passed to
    the refiner and scored by the evaluator. ``fit_transform`` repeats this
    pipeline ``n_iter`` times, randomizing the feature constructor and the
    embedder before each attempt, and keeps the attempt with the highest
    score.

    Attributes set by ``__init__``:
        max_n_clusters: passed to ``auto_cluster.optimize`` in ``predict``.
        n_iter: number of attempts requested by ``fit_transform``.
        inclusion_threshold: passed as ``amount`` to ``randomize`` before
            each attempt.
        random_state: stored on the instance; not read by any method of
            this class.
    """

    def __init__(self, compiled=False, learning_rate=0.002,
                 n_features_hidden_factor=10, max_n_clusters=8, n_iter=20,
                 inclusion_threshold=0.7, random_state=1):
        """Store the settings and build the pipeline components.

        ``compiled``, ``learning_rate`` and ``n_features_hidden_factor``
        are forwarded unchanged to ``Embedder2D``; the remaining arguments
        are stored on the instance under the same names.
        """
        self.max_n_clusters = max_n_clusters
        self.n_iter = n_iter
        self.inclusion_threshold = inclusion_threshold
        self.random_state = random_state
        self.feature_selector = IteratedSemiSupervisedFeatureSelection()
        self.feature_constructor = Projector()
        self.embedder = Embedder2D(
            compiled=compiled,
            learning_rate=learning_rate,
            n_features_hidden_factor=n_features_hidden_factor)
        self.evaluator = LocalEmbeddingEvaluator()
        self.refiner = LocalEmbeddingRefiner()
        self.auto_cluster = AutoCluster()

    def __repr__(self):
        """Return a multi-line summary of the settings and sub-components."""
        serial = [
            'Embedder:',
            'inclusion_threshold: %.3f' % self.inclusion_threshold,
            'n_iter: %d' % self.n_iter,
            'max_n_clusters: %d' % self.max_n_clusters,
            '-' * 80,
            str(self.feature_constructor),
            '-' * 80,
            str(self.embedder),
            '=' * 80,
        ]
        return '\n'.join(serial)

    def fit(self, data_matrix, target=None):
        """Not supported: always raises ``NotImplementedError``.

        Use ``fit_transform`` or ``predict`` instead.
        """
        raise NotImplementedError("Should have implemented this")

    def fit_transform(self, data_matrix, target=None):
        """Return the best-scoring embedding of ``data_matrix``.

        When ``target`` is given, ``data_matrix`` is first passed through
        ``feature_selector.fit_transform``. The (possibly reduced) matrix is
        then handed to ``optimize`` with ``self.n_iter`` attempts. The
        result is stored in ``self.data_matrix`` and returned.

        Raises:
            ValueError: propagated from ``optimize`` when no attempt
                succeeds.
        """
        logger.info('Input data matrix: %d rows %d cols',
                    data_matrix.shape[0], data_matrix.shape[1])
        # if sparse.issparse(data_matrix):
        #     logger.info('Convert matrix format from sparse to dense using random projections')
        #     data_matrix = SparseRandomProjection().fit_transform(data_matrix).toarray()
        #     logger.info('Data matrix: %d rows %d cols' %
        #                 (data_matrix.shape[0], data_matrix.shape[1]))
        if target is not None:
            data_matrix_feature_select = self.feature_selector.fit_transform(
                data_matrix, target)
            logger.info('Feature selection')
            logger.info('Data matrix: %d rows %d cols',
                        data_matrix_feature_select.shape[0],
                        data_matrix_feature_select.shape[1])
        else:
            data_matrix_feature_select = data_matrix
        self.data_matrix = self.optimize(data_matrix_feature_select,
                                         target=target,
                                         n_iter=self.n_iter)
        # %r defers the __repr__ call until a handler emits the record.
        logger.debug('%r', self)
        return self.data_matrix

    def transform(self, data_matrix, target=None):
        """Run one construct / embed / refine / score pass.

        Both the feature constructor and the embedder are re-fitted on
        every call (``fit_transform`` is invoked on each). The quality of
        the refined embedding is stored in ``self.score`` and
        ``self.scores``.

        Returns:
            The refined embedding produced by ``refiner``.
        """
        data_matrix_feature_constr = self.feature_constructor.fit_transform(
            data_matrix, target=target)
        data_matrix_lowdim = self.embedder.fit_transform(
            data_matrix_feature_constr)
        data_matrix_out = self.refiner.embedding_refinement(
            data_matrix,
            data_matrix_lowdim,
            n_neighbors=8,
            emb_quality_th=1,
            n_iter=20)
        self.score, self.scores = \
            self.evaluator.averaged_embedding_quality_score(
                data_matrix,
                data_matrix_out,
                n_neighbor_list=[10, 30],
                return_scores=True)
        return data_matrix_out

    def predict(self, data_matrix, target=None):
        """Embed ``data_matrix`` and return ``auto_cluster``'s predictions.

        Runs ``fit_transform`` and then ``auto_cluster.optimize`` on the
        resulting embedding with ``self.max_n_clusters``. The predictions
        are stored in ``self.predictions`` and returned.
        """
        self.fit_transform(data_matrix, target=target)
        self.auto_cluster.optimize(self.data_matrix,
                                   max_n_clusters=self.max_n_clusters)
        self.predictions = self.auto_cluster.predictions
        logger.debug('embedding score: %.4f', self.score)
        return self.predictions

    def randomize(self, data_matrix, amount=1):
        """Randomize the feature constructor and the embedder.

        ``data_matrix`` and ``amount`` are forwarded unchanged to the
        ``randomize`` method of both components.
        """
        self.feature_constructor.randomize(data_matrix, amount=amount)
        self.embedder.randomize(data_matrix, amount=amount)

    def optimize(self, data_matrix, target=None, n_iter=20):
        """Keep the best of ``n_iter`` randomized embedding attempts.

        The instance state is restored from the snapshot taken right after
        the winning attempt, so ``score``, ``scores`` and the component
        objects correspond to the returned embedding.

        Returns:
            The embedding of the attempt with the highest score; equal
            scores are resolved in favour of the later attempt.

        Raises:
            ValueError: if no attempt succeeded (all of them failed, or
                ``n_iter`` is 0).
        """
        best = None
        for candidate in self._optimize(data_matrix, target=target,
                                        n_iter=n_iter):
            # Rank on (score, iter_id) only, so the embedding and the
            # state snapshot are never compared with each other.
            if best is None or candidate[:2] > best[:2]:
                best = candidate
        if best is None:
            raise ValueError(
                'No successful iteration was produced; '
                'see the debug log for failure reasons.')
        _, _, data_matrix_out, obj_dict = best
        self.__dict__.update(obj_dict)
        return data_matrix_out

    def _optimize(self, data_matrix, target=None, n_iter=None):
        """Yield one result tuple per successful embedding attempt.

        Each item is ``(score, iter_id, data_matrix_out, state)`` where
        ``state`` is a deep copy of ``self.__dict__`` taken after the
        attempt. ``n_iter=None`` falls back to ``self.n_iter``.
        """
        if n_iter is None:
            n_iter = self.n_iter
        for iter_id in range(1, n_iter + 1):
            try:
                self.randomize(data_matrix, amount=self.inclusion_threshold)
                data_matrix_out = self.transform(data_matrix, target=target)
                score = self.score
                snapshot = deepcopy(self.__dict__)
            except Exception as e:
                # Best effort on purpose: a failed attempt is logged and
                # skipped so the remaining attempts can still run.
                logger.debug('Failed iteration. Reason: %s', e)
                logger.debug('Exception', exc_info=True)
                logger.debug('Current object status:')
                logger.debug(self.__repr__())
                logger.debug('*' * 80)
            else:
                yield (score, iter_id, data_matrix_out, snapshot)