Example #1
class KNNWrapper(BaseEstimator, ClassifierMixin):
    """KNNWrapper."""

    def __init__(self, program=NearestNeighbors(n_neighbors=2)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            self.graphs = list(graphs)
            # transform the materialized list; transforming the original
            # iterable would see nothing once list() has consumed it
            data_matrix = self.vectorizer.transform(self.graphs)
            self.program = self.program.fit(data_matrix)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = self.vectorizer.transform(graphs_)
            distances, indices = self.program.kneighbors(data_matrix)
            for knn_dists, knn_ids, graph in zip(distances, indices, graphs):
                neighbor_graphs = []
                for knn_id in knn_ids:
                    neighbor_graphs.append(self.graphs[knn_id])
                graph.graph['neighbors'] = neighbor_graphs
                graph.graph['ids'] = knn_ids
                graph.graph['distances'] = knn_dists
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
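A minimal usage sketch for KNNWrapper (not from the original source): it assumes networkx graphs whose nodes and edges carry the 'label' attribute expected by the EDeN Vectorizer.

import networkx as nx
from sklearn.neighbors import NearestNeighbors

# hypothetical toy graphs with string labels
graphs = []
for i in range(4):
    g = nx.path_graph(3)
    for n in g.nodes():
        g.nodes[n]['label'] = str((n + i) % 2)
    for u, v in g.edges():
        g.edges[u, v]['label'] = '-'
    graphs.append(g)

knn = KNNWrapper(program=NearestNeighbors(n_neighbors=2)).fit(graphs)
for g in knn.predict(graphs):
    print(g.graph['ids'], g.graph['distances'])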
Example #2
def prep(graphlist, id=0):
    """Map the hash of each vectorized graph to (id, row index)."""
    if not graphlist:
        return {}
    v = Vectorizer()
    # drop 'weight' node attributes before vectorizing; a plain loop is
    # used because map() is lazy in Python 3 and would skip the side effect
    for graph in graphlist:
        node_operation(graph, lambda n, d: d.pop('weight', None))
    csr = v.transform(graphlist)
    hash_function = lambda vec: hash(tuple(vec.data + vec.indices))
    return {hash_function(row): (id, ith) for ith, row in enumerate(csr)}
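prep builds a lookup from the hash of each vectorized graph to a (graphlist id, row index) pair; a hedged sketch of how it might be used to spot graphs occurring in two lists (graphs_a and graphs_b are hypothetical):

lookup_a = prep(graphs_a, id=0)
lookup_b = prep(graphs_b, id=1)
shared_hashes = set(lookup_a) & set(lookup_b)
# each shared hash points back to its (id, index) entry in either lookup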
Example #3
class TransformerWrapper(BaseEstimator, ClassifierMixin):
    """TransformerWrapper."""

    def __init__(self, program=None):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            self.program.fit(graphs)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def transform(self, graphs):
        """predict."""
        try:
            for graph in graphs:
                transformed_graph = self._transform(graph)
                yield transformed_graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def _transform(self, graph):
        return graph
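_transform is an identity hook; a sketch (an assumption, not part of the original code) of how a concrete subclass might override it:

class ResetWeights(TransformerWrapper):
    """Hypothetical transformer that sets every node weight to 1.0."""

    def _transform(self, graph):
        for n in graph.nodes():
            graph.nodes[n]['weight'] = 1.0
        return graph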
Example #4
def compute_NSPDK_features():
    import eden
    from eden.graph import Vectorizer
    from eden.converter.molecule.obabel import mol_file_to_iterable, obabel_to_eden
    mol_path = olfaction_prediction_path + '/data/sdf/'
    iter_mols = mol_file_to_iterable(mol_path + '/all_mol.sdf', 'sdf')
    iter_graphs = obabel_to_eden(iter_mols)

    vectorizer = Vectorizer(r=3, d=4)
    X = vectorizer.transform(iter_graphs)
    return X
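Here r bounds the NSPDK neighborhood radius and d the distance between neighborhood pairs; a hedged sketch of the same vectorization on an in-memory toy graph instead of an SDF file:

import networkx as nx
from eden.graph import Vectorizer

g = nx.cycle_graph(4)
for n in g.nodes():
    g.nodes[n]['label'] = 'C'
for u, v in g.edges():
    g.edges[u, v]['label'] = '-'
X = Vectorizer(r=3, d=4).transform([g])
print(X.shape)  # one row per input graph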
Example #5
class AnnotateImportance(BaseEstimator, ClassifierMixin):
    """Annotate minimal cycles."""

    def __init__(self,
                 program=None,
                 vertex_features=True,
                 reweight=1.0):
        """Construct."""
        self.program = program
        self.vertex_features = vertex_features
        self.reweight = reweight
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this program.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_program = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_program[param] = params[param]
        self.program.set_params(**params_program)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def transform(self, graphs):
        """Transform."""
        try:
            annotated_graphs = self.vectorizer.annotate(
                graphs,
                estimator=self.program,
                reweight=self.reweight,
                vertex_features=self.vertex_features)
            for graph in annotated_graphs:
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
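A usage sketch for AnnotateImportance, assuming 'fitted_estimator' is an estimator already trained on vectorized graphs (as in the classifier examples above; both names are hypothetical):

annotator = AnnotateImportance(program=fitted_estimator)
annotated = list(annotator.transform(graphs))
# each vertex of the returned graphs now carries an importance score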
Example #6
class OrdererWrapper(BaseEstimator, ClassifierMixin):
    """Orderer."""

    def __init__(self, program=None):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_orderer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_orderer[param] = params[param]
        self.program.set_params(**params_orderer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def decision_function(self, graphs):
        """decision_function."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            scores = self.program.decision_function(data_matrix)
            return scores
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
Example #7
File: __init__.py Project: smautner/EDeN
class IsomorphicClusterer(BaseEstimator, ClusterMixin):
    """IsomorphismClusterer.
    """

    def __init__(self):
        """Construct."""
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        for param in params:
            self.__dict__[param] = params[param]
        return self

    def fit_predict(self, graphs):
        """fit_predict."""
        def vec_to_hash(vec):
            return hash(tuple(vec.data + vec.indices))
        try:
            for graph in graphs:
                prediction = vec_to_hash(self.vectorizer.transform([graph]))
                yield prediction
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
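Because the EDeN hashing is deterministic, isomorphic graphs with equal labels map to identical sparse vectors, so the yielded hash acts as an isomorphism-class id; a brief hypothetical sketch:

import networkx as nx

g1, g2 = nx.path_graph(3), nx.path_graph(3)
for g in (g1, g2):
    for n in g.nodes():
        g.nodes[n]['label'] = 'A'
    for u, v in g.edges():
        g.edges[u, v]['label'] = '-'
ids = list(IsomorphicClusterer().fit_predict([g1, g2]))
assert ids[0] == ids[1]  # identical graphs share a cluster id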
Example #8
 def set_params(self, r=3, d=8, nbits=16, discrete=True,
                balance=False, subsample_size=200, ratio=2,
                normalization=False, inner_normalization=False,
                penalty='elasticnet'):
     """setter."""
     self.r = r
     self.d = d
     self.nbits = nbits
     self.normalization = normalization
     self.inner_normalization = inner_normalization
     self.discrete = discrete
     self.balance = balance
     self.subsample_size = subsample_size
     self.ratio = ratio
     if penalty == 'perceptron':
         self.model = Perceptron(max_iter=5, tol=None)
     else:
         self.model = SGDClassifier(
             average=True, class_weight='balanced', shuffle=True,
             penalty=penalty, max_iter=5, tol=None)
     self.vectorizer = Vectorizer(
         r=self.r, d=self.d,
         normalization=self.normalization,
         inner_normalization=self.inner_normalization,
         discrete=self.discrete,
         nbits=self.nbits)
     return self
Example #9
File: RNA.py Project: gianlucacorrado/EDeN
    def __init__(self,
                 complexity=None,
                 nbits=20,
                 sequence_vectorizer_complexity=3,
                 graph_vectorizer_complexity=2,
                 n_neighbors=5,
                 sampling_prob=.5,
                 n_iter=5,
                 min_energy=-5,
                 random_state=1):
        random.seed(random_state)
        if complexity is not None:
            sequence_vectorizer_complexity = complexity
            graph_vectorizer_complexity = complexity

        self.sequence_vectorizer = SeqVectorizer(complexity=sequence_vectorizer_complexity,
                                                 nbits=nbits,
                                                 normalization=False,
                                                 inner_normalization=False)
        self.graph_vectorizer = GraphVectorizer(complexity=graph_vectorizer_complexity, nbits=nbits)
        self.n_neighbors = n_neighbors
        self.sampling_prob = sampling_prob
        self.n_iter = n_iter
        self.min_energy = min_energy
        self.nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)
Example #10
 def __init__(self,
              program=SGDClassifier(average=True,
                                    class_weight='balanced',
                                    shuffle=True)):
     """Construct."""
     self.program = program
     self.vectorizer = Vectorizer()
Example #11
    def __init__(self,
                 complexity=3,
                 r=None,
                 d=None,
                 min_r=0,
                 min_d=0,
                 nbits=20,
                 normalization=True,
                 inner_normalization=True,
                 n=1,
                 min_n=2):
        """
        Arguments:


        complexity : int
          The complexity of the features extracted.

        r : int
          The maximal radius size.

        d : int
          The maximal distance size.

        min_r : int
          The minimal radius size.

        min_d : int
          The minimal distance size.

        nbits : int
          The number of bits that defines the feature space size: |feature space|=2^nbits.

        normalization : bool
          If set, the resulting feature vector will have unit Euclidean norm.

        inner_normalization : bool
          If set, the feature vector for a specific combination of radius
          and distance size will have unit Euclidean norm.
          When used together with the 'normalization' flag, it is applied
          first and the resulting feature vector is then normalized.

        n : int
          The maximal number of clusters used to discretize label vectors.

        min_n : int
          The minimal number of clusters used to discretize label vectors.
        """
        self.vectorizer = Vectorizer(complexity=complexity,
                                     r=r,
                                     d=d,
                                     min_r=min_r,
                                     min_d=min_d,
                                     nbits=nbits,
                                     normalization=normalization,
                                     inner_normalization=inner_normalization,
                                     n=n,
                                     min_n=min_n)
        self.vectorizers = list()
Example #12
class EdenRegressor(BaseEstimator, RegressorMixin):
    """Build a regressor for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 normalization=True, inner_normalization=True,
                 penalty='elasticnet', loss='squared_loss'):
        """construct."""
        self.set_params(r, d, nbits, discrete,
                        normalization, inner_normalization,
                        penalty, loss)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   normalization=True, inner_normalization=True,
                   penalty='elasticnet', loss='squared_loss'):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.model = SGDRegressor(
            loss=loss, penalty=penalty,
            average=True, shuffle=True,
            max_iter=5, tol=None)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    def fit(self, graphs, targets, randomize=True):
        """fit."""
        x = self.transform(graphs)
        self.model = self.model.fit(x, targets)
        return self

    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    def decision_function(self, graphs):
        """decision_function."""
        return self.predict(graphs)
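A usage sketch for EdenRegressor (train_graphs, train_targets and test_graphs are hypothetical labeled networkx graphs and numeric targets):

reg = EdenRegressor(r=2, d=4)
reg.fit(train_graphs, train_targets)
preds = reg.predict(test_graphs)
# reg.kernel_matrix(train_graphs) returns the linear-kernel Gram matrix
# of the vectorized graphs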
Example #13
 def __init__(self,
              program=None,
              vertex_features=True,
              reweight=1.0):
     """Construct."""
     self.program = program
     self.vertex_features = vertex_features
     self.reweight = reweight
     self.vectorizer = Vectorizer()
     self.params_vectorize = dict()
Example #14
 def __init__(self,
              program=None,
              relabel=False,
              reweight=1.0):
     """Construct."""
     self.program = program
     self.relabel = relabel
     self.reweight = reweight
     self.vectorizer = Vectorizer()
     self.params_vectorize = dict()
Example #15
class OrdererWrapper(BaseEstimator, ClassifierMixin):
    """Orderer."""

    def __init__(self, program=None):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_orderer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_orderer[param] = params[param]
        self.program.set_params(**params_orderer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def decision_function(self, graphs):
        """decision_function."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = self.vectorizer.transform(graphs_)
            scores = self.program.decision_function(data_matrix)
            return scores
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
Example #16
 def vectorize(self, g):
     """Vectorize graph nodes.

     Return: a matrix whose rows are the vectors that represent the nodes.
     """
     vec = Vectorizer(nbits=self.nbits,
                      discrete=self.discrete,
                      d=self.d,
                      r=self.r)

     M = vec.vertex_transform([g])[0]
     # sum the L stacked blocks of rows so that each node ends up
     # with a single combined vector
     M_reduce = []
     for idx in range(self.n_nodes):
         row = M[idx, :]
         for l in range(1, self.L):
             row = row + M[idx + l * self.n_nodes, :]
         M_reduce.append(row)
     M = vstack(M_reduce)
     return M
Example #17
def generate_negatives_and_evaluate(iterable=None,
                                    estimator=None,
                                    negative_shuffle_ratio=None,
                                    shuffle_order=None,
                                    vectorizer_complexity=None):

    vectorizer = Vectorizer(complexity=vectorizer_complexity)
    iterable, iterable_neg = binary_classification_dataset_setup(
        iterable_seq=iterable, negative_shuffle_ratio=negative_shuffle_ratio, shuffle_order=shuffle_order)
    roc, apr = estimate(iterable, iterable_neg,
                        estimator, vectorizer, n_jobs=-1)
    return roc, apr
Example #18
    def __init__(self,
                 min_subarray_size=7,
                 max_subarray_size=10,
                 min_motif_count=1,
                 min_cluster_size=1,
                 training_size=None,
                 negative_ratio=1,
                 shuffle_order=2,
                 n_iter_search=1,
                 complexity=4,
                 radius=None,
                 distance=None,
                 nbits=20,
                 clustering_algorithm=None,
                 n_jobs=4,
                 n_blocks=8,
                 block_size=None,
                 pre_processor_n_jobs=4,
                 pre_processor_n_blocks=8,
                 pre_processor_block_size=None,
                 random_state=1):
        self.n_jobs = n_jobs
        self.n_blocks = n_blocks
        self.block_size = block_size
        self.pre_processor_n_jobs = pre_processor_n_jobs
        self.pre_processor_n_blocks = pre_processor_n_blocks
        self.pre_processor_block_size = pre_processor_block_size
        self.training_size = training_size
        self.n_iter_search = n_iter_search
        self.complexity = complexity
        self.nbits = nbits
        # init vectorizer
        self.vectorizer = Vectorizer(complexity=self.complexity,
                                     r=radius, d=distance,
                                     nbits=self.nbits)
        self.seq_vectorizer = SeqVectorizer(complexity=self.complexity,
                                            r=radius, d=distance,
                                            nbits=self.nbits)
        self.negative_ratio = negative_ratio
        self.shuffle_order = shuffle_order
        self.clustering_algorithm = clustering_algorithm
        self.min_subarray_size = min_subarray_size
        self.max_subarray_size = max_subarray_size
        self.min_motif_count = min_motif_count
        self.min_cluster_size = min_cluster_size
        self.random_state = random_state
        random.seed(random_state)

        self.motives_db = defaultdict(list)
        self.motives = []
        self.clusters = defaultdict(list)
        self.cluster_models = []
        self.importances = []
Example #19
File: estimator.py Project: xuan-hh/EDeN
 def set_params(self, r=3, d=8, nbits=16, discrete=True,
                normalization=True, inner_normalization=True,
                penalty='elasticnet', loss='squared_loss'):
     """setter."""
     self.r = r
     self.d = d
     self.nbits = nbits
     self.normalization = normalization
     self.inner_normalization = inner_normalization
     self.discrete = discrete
     self.model = SGDRegressor(
         loss=loss, penalty=penalty,
         average=True, shuffle=True,
         max_iter=5, tol=None)
     self.vectorizer = Vectorizer(
         r=self.r, d=self.d,
         normalization=self.normalization,
         inner_normalization=self.inner_normalization,
         discrete=self.discrete,
         nbits=self.nbits)
     return self
Example #20
    def __init__(self,
                 transformer=None,
                 vectorizer=Vectorizer(complexity=4, nbits=13),
                 clustering_algo=DBSCAN(),
                 distance_std_factor=2,
                 min_cluster_size=2,
                 random_state=1):
        """Cluster sequences according to regions of interest and structural folding.

        Parameters
        ----------
        transformer : initialized PreProcessor object
            Transforms sequences to graphs that encode secondary structure information
            and weights nucleotides according to user defined list of intervals.

        vectorizer : initialized Vectorizer object
            Transforms graphs to sparse vectors.

        clustering_algo : scikit-learn clustering algorithm
            Clusters sparse vectors in a finite number of classes.

        distance_std_factor : int (default 2)
            How many standard deviations less than the mean pairwise distance is the maximal
            distance required to join an instance in a cluster.

        min_cluster_size : int (default 2)
            Minimal size of any cluster.

        random_state: int (default 1)
            Random seed.

        Attributes
        ----------
        predictions : list(int)
            List of cluster ids, one per instance.

        clusters : defaultdict(list)
            Dictionary with cluster id as key and list of sequences as value.

        data_matrix : Scipy sparse matrix (Compressed Sparse Row matrix)
            List of sparse vectors resulting from the transformation of sequences into structures.
        """
        self.name = self.__class__.__name__
        self.transformer = transformer
        self.vectorizer = vectorizer
        self.clustering_algo = clustering_algo
        self.distance_std_factor = distance_std_factor
        self.min_cluster_size = min_cluster_size
        self.clusters = defaultdict(list)
        self.predictions = list()
        self.data_matrix = None
        self.random_state = random_state
        random.seed(self.random_state)
Example #21
class Annotator():

    def __init__(self, multiprocess=True, score_attribute='importance'):
        self.score_attribute = score_attribute
        self.vectorizer = Vectorizer()
        self.multi_process = multiprocess
        self.trained = False

    def fit(self, graphs_pos, graphs_neg=[]):
        if self.trained:
            return self
        self.trained = True
        # strip previous EDeN annotations; plain loops are used because
        # map() is lazy in Python 3 and would skip these side effects
        for graph in graphs_pos + graphs_neg:
            utils.remove_eden_annotation(graph)
            utils.node_operation(graph, lambda n, d: d.pop('importance', None))
            graph.graph.pop('mass_annotate_mp_was_here', None)

        if graphs_neg:
            # with negatives available, train a binary estimator
            self.estimator = SGDClassifier()
            classes = [1] * len(graphs_pos) + [-1] * len(graphs_neg)
            self.estimator.fit(
                self.vectorizer.transform(graphs_pos + graphs_neg), classes)
        else:
            self.estimator = ExperimentalOneClassEstimator()
            self.estimator.fit(self.vectorizer.transform(graphs_pos))
        return self

    def fit_transform(self, graphs_p, graphs_n=[]):
        self.fit(graphs_p, graphs_n)
        return self.transform(graphs_p), self.transform(graphs_n)

    def transform(self, graphs):
        return self.annotate(graphs)

    def annotate(self, graphs, neg=False):
        if not graphs:
            return []
        return mass_annotate_mp(graphs, self.vectorizer,
                                score_attribute=self.score_attribute,
                                estimator=self.estimator,
                                multi_process=self.multi_process,
                                invert_score=neg)
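A hedged round-trip sketch for Annotator (the graph lists are hypothetical):

annotator = Annotator(multiprocess=False)
pos_out, neg_out = annotator.fit_transform(pos_graphs, neg_graphs)
# nodes of the returned graphs now carry the 'importance' score attribute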
Example #22
class ClustererWrapper(BaseEstimator, ClusterMixin):
    """Clusterer."""

    def __init__(self, program=None):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit_predict(self, graphs):
        """fit_predict."""
        try:
            data_matrix = self.vectorizer.transform(graphs)
            predictions = self.program.fit_predict(data_matrix)
            return predictions
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
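ClustererWrapper lets any scikit-learn clusterer consume graphs; a usage sketch with DBSCAN (parameter values and the 'graphs' input are illustrative only):

from sklearn.cluster import DBSCAN

clusterer = ClustererWrapper(program=DBSCAN(eps=0.5, min_samples=2))
labels = clusterer.fit_predict(graphs)  # one cluster label per input graph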
Example #23
 def setup(self, known_graphs=None, candidate_graphs=None):
     """Setup."""
     # compute the nearest neighbors for the 'proposal_graphs' w.r.t. the
     # known graphs in the list 'known_graphs'
     parameters_priors = dict(n_neighbors=self.n_neighbors)
     parameters_priors.update(dict(vectorizer__complexity=self.complexity,
                                   vectorizer__discrete=True))
     fit_wrapped_knn_predictor_known = \
         model(known_graphs,
               program=KNNWrapper(program=NearestNeighbors()),
               parameters_priors=parameters_priors)
     # compute distances of candidate_graphs to known_graphs
     knn_candidate_graphs = predict(candidate_graphs,
                                    program=fit_wrapped_knn_predictor_known)
     knn_candidate_graphs = list(knn_candidate_graphs)
     self.distances_to_known_graphs = []
     for knn_candidate_graph in knn_candidate_graphs:
         distances = knn_candidate_graph.graph['distances']
         self.distances_to_known_graphs.append(distances)
     # compute candidate_graphs encodings
     vec = Vectorizer(complexity=self.complexity)
     self.candidate_graphs_data_matrix = vec.transform(candidate_graphs)
Example #24
def clusterGraphs(graphs, r, d, copt):
    opts = copt[1:-1]
    optl = opts.split(",")
    opt = int(optl[0])
    vectorizer = Vectorizer(r=r, d=d)
    samples = len(graphs)
    minlclu = 5
    Xsp = vectorizer.transform(graphs)  # sparse feature matrix
    X = Xsp.todense()  # dense feature matrix
    #SM=metrics.pairwise.pairwise_kernels(Xsp, metric='rbf', gamma = 1)#similarity matrix
    SM = metrics.pairwise.pairwise_kernels(Xsp, metric='linear')  # similarity matrix
    DM = []  # distance matrix: 1 - similarity, clamped at zero
    for i in range(len(SM)):
        DM.append([])
        for j in range(len(SM[i])):
            val = 1.0 - SM[i][j]
            if val < 0:
                DM[i].append(0.0)
            else:
                DM[i].append(val)
    if opt == 0:
        nc, labels = MShift(X)
    elif opt == 1:
        minlclu = int(optl[2])
        nc, labels = DB_SCAN(DM, float(optl[1]), int(optl[2]))
    elif opt == 2:
        nc, labels = AffProp(SM)
    elif opt == 3:
        print(SM)
        return 0, []
    elif opt == 4:
        nc, labels = K_Means(X)
    elif opt == 5:
        nc, labels = SpecClus(SM)
    elif opt == 6:
        nc, labels = dclust(DM, int(optl[1]), int(optl[2]), float(optl[3]))

    return nc, labels, minlclu
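The nested loops above clamp 1 - similarity at zero to derive the distance matrix; a compact numpy equivalent (a sketch, not the original code):

import numpy as np

DM = np.clip(1.0 - np.asarray(SM), 0.0, None)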
Example #25
 def __init__(self, min_count=2, max_n_neighbors=100, r=3, d=3):
     """init."""
     self.vec = Vectorizer(r=r,
                           d=d,
                           normalization=False,
                           inner_normalization=False)
     self.grammar = GrammarWrapper(radius_list=[1, 2, 3],
                                   thickness_list=[2],
                                   min_cip_count=min_count,
                                   min_interface_count=min_count,
                                   max_n_neighbors=max_n_neighbors,
                                   n_neigh_steps=1,
                                   max_neighborhood_size=max_n_neighbors)
Example #26
    def __init__(
            self,
            min_count=2,
            max_n_neighbors=100,
            r=3,
            d=3,
            n_neighbors=10,
            max_num_solutions=30):
        """construct."""
        self.min_count = min_count
        self.max_n_neighbors = max_n_neighbors
        self.max_num_solutions = max_num_solutions
        self.r = r
        self.d = d
        self.n_neighbors = n_neighbors

        self.clf = Perceptron(n_iter=500)
        self.vec = Vectorizer(r=r, d=d,
                              normalization=True,
                              inner_normalization=True,
                              nbits=16)
        self.gs = [.05, .1, .2, .4, .6, .8, 1, 2, 4, 6]
Example #27
 def setup(self, known_graphs=None, candidate_graphs=None):
     """Setup."""
     # compute the nearest neighbors for the 'proposal_graphs' w.r.t. the
     # known graphs in the list 'known_graphs'
     parameters_priors = dict(n_neighbors=self.n_neighbors)
     parameters_priors.update(
         dict(vectorizer__complexity=self.complexity,
              vectorizer__discrete=True))
     fit_wrapped_knn_predictor_known = \
         model(known_graphs,
               program=KNNWrapper(program=NearestNeighbors()),
               parameters_priors=parameters_priors)
     # compute distances of candidate_graphs to known_graphs
     knn_candidate_graphs = predict(candidate_graphs,
                                    program=fit_wrapped_knn_predictor_known)
     knn_candidate_graphs = list(knn_candidate_graphs)
     self.distances_to_known_graphs = []
     for knn_candidate_graph in knn_candidate_graphs:
         distances = knn_candidate_graph.graph['distances']
         self.distances_to_known_graphs.append(distances)
     # compute candidate_graphs encodings
     vec = Vectorizer(complexity=self.complexity)
     self.candidate_graphs_data_matrix = vec.transform(candidate_graphs)
Example #28
def compare(finalL, L, peaks, opt, th, alpha):
    n = len(L)
    lpeaks = {}
    for key in L:
        lpeaks[key] = peaks[key]
    for key in finalL:
        lpeaks[key] = peaks[key]
    # 'key_of' maps graph index to peak key (renamed from 'dict',
    # which shadowed the builtin)
    graphs, key_of = peaksToGraphs(lpeaks, opt, alpha)

    vectorizer = Vectorizer(r=2, d=3)
    samples = len(graphs)
    Xsp = vectorizer.transform(graphs)  # sparse feature matrix
    X = Xsp.todense()  # dense feature matrix
    SM = metrics.pairwise.pairwise_kernels(Xsp, metric='rbf',
                                           gamma=1)  # similarity matrix
    DM = []  # distance matrix: 1 - similarity, clamped at zero
    for i in range(len(SM)):
        DM.append([])
        for j in range(len(SM[i])):
            val = 1.0 - SM[i][j]
            if val < 0:
                DM[i].append(0.0)
            else:
                DM[i].append(val)
    avgDM = 0.0
    counts = 0.0
    for i in range(len(graphs)):
        if key_of[i] in L:
            for j in range(len(graphs)):
                if i != j and key_of[j] in finalL:
                    avgDM += DM[i][j]
                    counts += 1
    avgDM = avgDM / counts
    if 0.0 <= avgDM <= th:
        return 0
    else:
        return 1
Example #29
    def _vectorize_graphs(self, graphs):
        """Vectorize the RNAplfold graphs using EDeN."""
        if self.verbose:
            print("Vectorizing (complexity: %i, hashing: %i bits)..." %
                  (self.complexity, self.nbits),
                  end=' ')
            sys.stdout.flush()

        vec = Vectorizer(complexity=self.complexity, nbits=self.nbits)
        x_sparse = eden_vectorize(graphs, vectorizer=vec, n_jobs=self.njobs)

        if self.verbose:
            print("Done.\n")
            sys.stdout.flush()
        return x_sparse.todense()
Example #30
 def __init__(self,
              radius_list=None,
              thickness_list=None,
              min_cip_count=3,
              vectorizer=Vectorizer(complexity=3),
              min_interface_count=2,
              nbit=20,
              node_entity_check=lambda x, y: True):
     self.productions = {}
     self.min_interface_count = min_interface_count
     self.radius_list = radius_list
     self.thickness_list = thickness_list
     self.min_cip_count = min_cip_count
     self.vectorizer = vectorizer
     self.hash_bitmask = 2**nbit - 1
     self.nbit = nbit
     # checked when extracting grammar. see graphtools
     self.node_entity_check = node_entity_check
     self.prep_is_outdated = True
Example #31
    def __init__(self,
                 radius_list=[0, 1],
                 thickness_list=[1, 2],
                 grammar=None,
                 core_interface_pair_remove_threshold=2,
                 interface_remove_threshold=2,
                 complexity=3,
                 vectorizer=Vectorizer(complexity=3),
                 estimator=estimator_wrapper.estimator_wrapper()):

        self.complexity = complexity
        self.feasibility_checker = FeasibilityChecker()
        self.postprocessor = processing.PostProcessor()
        self.vectorizer = vectorizer
        # lists of int
        self.radius_list = [int(2 * r) for r in radius_list]
        self.thickness_list = [int(2 * t) for t in thickness_list]
        # scikit  classifier
        self.estimatorobject = estimator
        # grammar object
        self.local_substitutable_graph_grammar = grammar
        # cips hashes will be masked with this, this is unrelated to the vectorizer
        self.hash_bitmask = pow(2, 20) - 1
        # save the current graph at every interval-th step of sampling
        # and attach it to graphinfos[graphs]
        self.sampling_interval = None
        # how many sampling steps are done
        self.n_steps = None
        # current step in the sampling process of a single graph
        self.step = None
        # how often do we try to get a cip from the current graph  in sampling
        self.select_cip_max_tries = None
        # sample path
        self.sample_path = None

        self.local_substitutable_graph_grammar = LocalSubstitutableGraphGrammar(
            self.radius_list,
            self.thickness_list,
            complexity=self.complexity,
            cip_remove_threshold=core_interface_pair_remove_threshold,
            interface_remove_threshold=interface_remove_threshold,
            nbit=20)
Example #32
class SequenceMotif(object):
    def __init__(self,
                 min_subarray_size=7,
                 max_subarray_size=10,
                 min_motif_count=1,
                 min_cluster_size=1,
                 training_size=None,
                 negative_ratio=2,
                 shuffle_order=2,
                 n_iter_search=1,
                 complexity=4,
                 nbits=20,
                 clustering_algorithm=None,
                 n_jobs=4,
                 n_blocks=8,
                 block_size=None,
                 pre_processor_n_jobs=4,
                 pre_processor_n_blocks=8,
                 pre_processor_block_size=None,
                 random_state=1):
        self.n_jobs = n_jobs
        self.n_blocks = n_blocks
        self.block_size = block_size
        self.pre_processor_n_jobs = pre_processor_n_jobs
        self.pre_processor_n_blocks = pre_processor_n_blocks
        self.pre_processor_block_size = pre_processor_block_size
        self.training_size = training_size
        self.n_iter_search = n_iter_search
        self.complexity = complexity
        self.nbits = nbits
        # init vectorizer
        self.vectorizer = Vectorizer(complexity=self.complexity,
                                     nbits=self.nbits)
        self.seq_vectorizer = PathVectorizer(complexity=self.complexity,
                                             nbits=self.nbits)
        self.negative_ratio = negative_ratio
        self.shuffle_order = shuffle_order
        self.clustering_algorithm = clustering_algorithm
        self.min_subarray_size = min_subarray_size
        self.max_subarray_size = max_subarray_size
        self.min_motif_count = min_motif_count
        self.min_cluster_size = min_cluster_size
        self.random_state = random_state
        random.seed(random_state)

        self.motives_db = defaultdict(list)
        self.motives = []
        self.clusters = defaultdict(list)
        self.cluster_models = []

    def save(self, model_name):
        self.clustering_algorithm = None  # NOTE: some algorithms cannot be pickled
        joblib.dump(self, model_name, compress=1)

    def load(self, obj):
        self.__dict__.update(joblib.load(obj).__dict__)
        self._build_cluster_models()

    def fit(self, seqs, neg_seqs=None):
        """
        Builds a discriminative estimator.
        Identifies the maximal subarrays in the data.
        Clusters them with the clustering algorithm provided in the initialization phase.
        For each cluster builds a fast sequence search model (Aho Corasick data structure).
        """
        start = time()
        if self.training_size is None:
            training_seqs = seqs
        else:
            training_seqs = random.sample(seqs, self.training_size)
        self._fit_predictive_model(training_seqs, neg_seqs=neg_seqs)
        end = time()
        logger.info('model induction: %d positive instances %d s' %
                    (len(training_seqs), (end - start)))

        start = time()
        self.motives = self._motif_finder(seqs)
        end = time()
        logger.info('motives extraction: %d motives in %ds' %
                    (len(self.motives), end - start))

        start = time()
        self._cluster(self.motives,
                      clustering_algorithm=self.clustering_algorithm)
        end = time()
        logger.info('motives clustering: %d clusters in %ds' %
                    (len(self.clusters), end - start))

        start = time()
        self._filter()
        end = time()
        n_motives = sum(len(self.motives_db[cid]) for cid in self.motives_db)
        n_clusters = len(self.motives_db)
        logger.info('after filtering: %d motives %d clusters in %ds' %
                    (n_motives, n_clusters, (end - start)))

        start = time()
        # create models
        self._build_cluster_models()
        end = time()
        logger.info('motif model construction in %ds' % (end - start))

        start = time()
        # update motives counts
        self._update_counts(seqs)
        end = time()
        logger.info('updated motif counts in %ds' % (end - start))

    def info(self):
        text = []
        for cluster_id in self.motives_db:
            num_hits = len(self.cluster_hits[cluster_id])
            frac_num_hits = num_hits / float(self.dataset_size)
            text.append('Cluster: %s #%d (%.3f)' %
                        (cluster_id, num_hits, frac_num_hits))
            for count, motif in sorted(self.motives_db[cluster_id],
                                       reverse=True):
                text.append('%s #%d' % (motif, count))
            text.append('')
        return text

    def _update_counts(self, seqs):
        self.dataset_size = len(seqs)
        cluster_hits = defaultdict(set)
        motives_db = defaultdict(list)
        for cluster_id in self.motives_db:
            motives = [motif for count, motif in self.motives_db[cluster_id]]
            motif_dict = {}
            for motif in motives:
                counter = 0
                for header, seq in seqs:
                    if motif in seq:
                        counter += 1
                        cluster_hits[cluster_id].add(header)
                motif_dict[motif] = counter
            # remove implied motives
            motif_dict_copy = motif_dict.copy()
            for motif_i in motif_dict:
                for motif_j in motif_dict:
                    if motif_dict[motif_i] == motif_dict[motif_j] and \
                            len(motif_j) < len(motif_i) and motif_j in motif_i:
                        if motif_j in motif_dict_copy:
                            motif_dict_copy.pop(motif_j)
            for motif in motif_dict_copy:
                motives_db[cluster_id].append((motif_dict[motif], motif))
        self.motives_db = motives_db
        self.cluster_hits = cluster_hits

    def fit_predict(self, seqs, return_list=False):
        self.fit(seqs)
        for prediction in self.predict(seqs, return_list=return_list):
            yield prediction

    def fit_transform(self, seqs, return_match=False):
        self.fit(seqs)
        for prediction in self.transform(seqs, return_match=return_match):
            yield prediction

    def predict(self, seqs, return_list=False):
        """Returns for each instance a list with the cluster ids that have a hit
        if  return_list=False then just return 1 if there is at least one hit from one cluster."""
        for header, seq in seqs:
            cluster_hits = []
            for cluster_id in self.motives_db:
                hits = list(self._cluster_hit(seq, cluster_id))
                if len(hits):
                    begin, end = min(hits)
                    cluster_hits.append((begin, cluster_id))
            if return_list is False:
                if len(cluster_hits):
                    yield len(cluster_hits)
                else:
                    yield 0
            else:
                yield [cluster_id for pos, cluster_id in sorted(cluster_hits)]

    def transform(self, seqs, return_match=False):
        """Transform an instance to a dense vector with features as cluster ID and entries 0/1 if a motif is found,
        if 'return_match' argument is True, then write a pair with (start position,end position)  in the entry
        instead of 0/1"""
        num = len(self.motives_db)
        for header, seq in seqs:
            cluster_hits = [0] * num
            for cluster_id in self.motives_db:
                hits = self._cluster_hit(seq, cluster_id)
                hits = list(hits)
                if return_match is False:
                    if len(hits):
                        cluster_hits[cluster_id] = 1
                else:
                    cluster_hits[cluster_id] = hits
            yield cluster_hits

    def _serial_graph_motif(self, seqs, placeholder=None):
        # make graphs
        iterable = sequence_to_eden(seqs)
        # use node importance and 'position' attribute to identify max_subarrays of a specific size
        graphs = self.vectorizer.annotate(iterable, estimator=self.estimator)

        # use compute_max_subarrays to return an iterator over motives
        motives = []
        for graph in graphs:
            subarrays = compute_max_subarrays(
                graph=graph,
                min_subarray_size=self.min_subarray_size,
                max_subarray_size=self.max_subarray_size)
            if subarrays:
                for subarray in subarrays:
                    motives.append(subarray['subarray_string'])
        return motives

    def _multiprocess_graph_motif(self, seqs):
        size = len(seqs)
        intervals = compute_intervals(size=size,
                                      n_blocks=self.n_blocks,
                                      block_size=self.block_size)
        if self.n_jobs == -1:
            pool = mp.Pool()
        else:
            pool = mp.Pool(processes=self.n_jobs)
        results = [
            apply_async(pool,
                        self._serial_graph_motif,
                        args=(seqs[start:end], True))
            for start, end in intervals
        ]
        output = [p.get() for p in results]
        return list(chain(*output))

    def _motif_finder(self, seqs):
        if self.n_jobs > 1 or self.n_jobs == -1:
            return self._multiprocess_graph_motif(seqs)
        else:
            return self._serial_graph_motif(seqs)

    def _fit_predictive_model(self, seqs, neg_seqs=None):
        # duplicate iterator
        pos_seqs, pos_seqs_ = tee(seqs)
        pos_graphs = mp_pre_process(pos_seqs,
                                    pre_processor=sequence_to_eden,
                                    n_blocks=self.pre_processor_n_blocks,
                                    block_size=self.pre_processor_block_size,
                                    n_jobs=self.pre_processor_n_jobs)
        if neg_seqs is None:
            # shuffle seqs to obtain negatives
            neg_seqs = seq_to_seq(pos_seqs_,
                                  modifier=shuffle_modifier,
                                  times=self.negative_ratio,
                                  order=self.shuffle_order)
        neg_graphs = mp_pre_process(neg_seqs,
                                    pre_processor=sequence_to_eden,
                                    n_blocks=self.pre_processor_n_blocks,
                                    block_size=self.pre_processor_block_size,
                                    n_jobs=self.pre_processor_n_jobs)
        # fit discriminative estimator
        self.estimator = fit(pos_graphs,
                             neg_graphs,
                             vectorizer=self.vectorizer,
                             n_iter_search=self.n_iter_search,
                             n_jobs=self.n_jobs,
                             n_blocks=self.n_blocks,
                             block_size=self.block_size,
                             random_state=self.random_state)

    def _cluster(self, seqs, clustering_algorithm=None):
        data_matrix = vectorize(seqs,
                                vectorizer=self.seq_vectorizer,
                                n_blocks=self.n_blocks,
                                block_size=self.block_size,
                                n_jobs=self.n_jobs)
        predictions = clustering_algorithm.fit_predict(data_matrix)
        # collect instance ids per cluster id
        for i in range(len(predictions)):
            self.clusters[predictions[i]] += [i]

    def _filter(self):
        # transform self.clusters that contains only the ids of the motives to
        # clustered_motives that contains the actual sequences
        new_sequential_cluster_id = -1
        clustered_motives = defaultdict(list)
        for cluster_id in self.clusters:
            if cluster_id != -1:
                if len(self.clusters[cluster_id]) >= self.min_cluster_size:
                    new_sequential_cluster_id += 1
                    for motif_id in self.clusters[cluster_id]:
                        clustered_motives[new_sequential_cluster_id].append(
                            self.motives[motif_id])
        motives_db = defaultdict(list)
        # extract motif count within a cluster
        for cluster_id in clustered_motives:
            # consider only non identical motives
            motif_set = set(clustered_motives[cluster_id])
            for motif_i in motif_set:
                # count occurrences of each motif in cluster
                count = 0
                for motif_j in clustered_motives[cluster_id]:
                    if motif_i == motif_j:
                        count += 1
                # create dict with motives and their counts
                # if counts are above a threshold
                if count >= self.min_motif_count:
                    motives_db[cluster_id].append((count, motif_i))
        # transform cluster ids to incremental ids
        incremental_id = 0
        for cluster_id in motives_db:
            if len(motives_db[cluster_id]) >= self.min_cluster_size:
                self.motives_db[incremental_id] = motives_db[cluster_id]
                incremental_id += 1

    def _build_cluster_models(self):
        self.cluster_models = []
        for cluster_id in self.motives_db:
            motives = [motif for count, motif in self.motives_db[cluster_id]]
            cluster_model = esm.Index()
            for motif in motives:
                cluster_model.enter(motif)
            cluster_model.fix()
            self.cluster_models.append(cluster_model)

    def _cluster_hit(self, seq, cluster_id):
        for ((start, end),
             motif) in self.cluster_models[cluster_id].query(seq):
            yield (start, end)
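A usage sketch for SequenceMotif, assuming 'seqs' is a list of (header, sequence) pairs and using a scikit-learn clusterer (all values are illustrative):

from sklearn.cluster import KMeans

motif = SequenceMotif(min_subarray_size=5,
                      max_subarray_size=8,
                      clustering_algorithm=KMeans(n_clusters=4))
motif.fit(seqs)
print('\n'.join(motif.info()))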
Example #33
matplotlib.use('Agg')

from eden.converter.graph.gspan import gspan_to_eden
from graphlearn.graphlearn import GraphLearnSampler
from eden.graph import Vectorizer
import matplotlib.pyplot as plt
import itertools
from graphlearn.utils import myeden
from eden.util import fit_estimator as eden_fit_estimator
from eden.util import selection_iterator as picker
from sklearn.linear_model import SGDClassifier
import random


# a vectorizer
vectorizer = Vectorizer(complexity=3)

# yield the first element of each list produced by an iterator
def unpack(graphs):
    for graphlist in graphs:
        yield graphlist[0]


def make_estimator(pos, neg):
    pos = vectorizer.transform(pos)
    neg = vectorizer.transform(neg)
    esti = eden_fit_estimator(SGDClassifier(),
                              positive_data_matrix=pos,
                              negative_data_matrix=neg)
    return esti

Example #34
class IdealGraphEstimator(object):
    """Build an estimator for graphs."""

    def __init__(
            self,
            min_count=2,
            max_n_neighbors=100,
            r=3,
            d=3,
            n_neighbors=10,
            max_num_solutions=30):
        """construct."""
        self.min_count = min_count
        self.max_n_neighbors = max_n_neighbors
        self.max_num_solutions = max_num_solutions
        self.r = r
        self.d = d
        self.n_neighbors = n_neighbors

        self.clf = Perceptron(n_iter=500)
        self.vec = Vectorizer(r=r, d=d,
                              normalization=True,
                              inner_normalization=True,
                              nbits=16)
        self.gs = [.05, .1, .2, .4, .6, .8, 1, 2, 4, 6]

    def fit(self, pos_graphs, neg_graphs):
        """fit."""
        ref_graphs = self.construct(pos_graphs, neg_graphs)
        logger.debug('Working on %d constructed graphs' % len(ref_graphs))
        y = [1] * len(pos_graphs) + [-1] * len(neg_graphs)
        x = self.vec.transform(pos_graphs + neg_graphs)
        z = self.vec.transform(ref_graphs)
        n_features = z.shape[0]
        k = np.hstack([pairwise_kernels(x, z, metric='rbf', gamma=g)
                       for g in self.gs])
        step = len(ref_graphs) // 2  # integer step for RFECV
        n_inst, n_feat = k.shape
        txt = 'RFECV on %d instances with %d features with step: %d' % \
            (n_inst, n_feat, step)
        logger.debug(txt)
        selector = RFECV(self.clf, step=step, cv=10)
        selector = selector.fit(k, y)

        ids = list(concat([range(n_features)] * len(self.gs)))
        gs_list = list(concat([[g] * n_features for g in self.gs]))

        feat = defaultdict(list)
        for g, i, s in zip(gs_list, ids, selector.support_):
            if s:
                feat[g].append(i)

        self.mats = dict()
        for g in sorted(feat):
            mat = vstack([z[i] for i in feat[g]])
            self.mats[g] = mat

        sel_ids = set([i for i, s in zip(ids, selector.support_) if s])
        self.ideal_graphs_ = [ref_graphs[i] for i in sel_ids]
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vec.transform(graphs)
        xtr = np.hstack([pairwise_kernels(x,
                                          self.mats[g], metric='rbf', gamma=g)
                         for g in sorted(self.mats)])
        return xtr

    def construct(self, pos_graphs, neg_graphs):
        """construct."""
        args = dict(
            min_count=self.min_count,
            max_n_neighbors=self.max_n_neighbors,
            r=self.r,
            d=self.d,
            n_landmarks=5,
            n_neighbors=self.n_neighbors,
            n_iter=20,
            k_best=5,
            max_num_solutions=self.max_num_solutions)
        self.active_constr = NearestNeighborsMeanOptimizer(
            improve=False, **args)
        self.active_constr.fit(pos_graphs, neg_graphs)
        graphs = pos_graphs + neg_graphs
        active_pareto_set_graphs = self.active_constr.optimize(graphs)

        self.pos_constr = NearestNeighborsMeanOptimizer(
            improve=True, **args)
        self.pos_constr.fit(pos_graphs, neg_graphs)
        pareto_set_graphs = self.pos_constr.optimize(graphs)

        sel_constructed_graphs = pareto_set_graphs + active_pareto_set_graphs
        return sel_constructed_graphs
Example #35
def vectorize(thing):
    v = Vectorizer()
    if not thing:
        raise Exception("need something to vectorize.. received %s" % str(thing))
    thing = list(thing)  # current EDeN seems not to accept generators, so materialize first
    return v.transform(thing)
Example #36
 def __init__(self,
              program=SGDRegressor(average=True, shuffle=True)):
     """Construct."""
     self.program = program
     self.vectorizer = Vectorizer()
     self.params_vectorize = dict()
Example #37
 def __init__(self):
     # this is mainly for the forest. the sampler uses a different vectorizer
     self.vectorizer = Vectorizer(nbits=14)
Example #38
class DiscSampler():
    '''
    Sampler that keeps a heap of candidate graphs scored by distance to the
    hyperplane and a nearest-neighbor forest for novelty checks.
    '''
    def __init__(self):
        # this is mainly for the forest. the sampler uses a different vectorizer
        self.vectorizer = Vectorizer(nbits=14)

    def get_heap_and_forest(self, griter, k):
        '''
        Create the heap and the forest:
        heap entries are (dist to hyperplane, count, graph),
        and the forest is just a nearest-neighbor index from sklearn.
        '''

        graphs = list(griter)
        graphs2 = copy.deepcopy(graphs)
        # transform does mess up the graph objects, so work on a deep copy
        X = self.vectorizer.transform(graphs)

        forest = LSHForest()
        forest.fit(X)
        print('got forest')

        heap = []
        for vector, graph in zip(X, graphs2):
            graph2 = nx.Graph(graph)
            heapq.heappush(
                heap,
                (
                    self.sampler.estimator.predict_proba(
                        self.sampler.vectorizer.transform_single(
                            graph2))[0][1],  # score ~ dist from hyperplane
                    k + 1,  # keep the counter high so the start graphs are not output at the end
                    graph))  # and finally the actual graph

        print('got heap')
        distances, unused = forest.kneighbors(X, n_neighbors=2)
        # the second element is the distance to the nearest other instance
        distances = [a[1] for a in distances]
        avg_dist = distances[len(distances) // 2]  # roughly the median distance
        print('got dist')

        return heap, forest, avg_dist

    '''
    def sample_simple(self,graphiter,iterneg):
        graphiter,grait,griter2 = itertools.tee(graphiter,3)
        
        self.fit_sampler(graphiter,iterneg)
        a,b,c=self.get_heap_and_forest( griter2, 30)


        grait= itertools.islice(grait,5)
        rez=self.sampler.sample(grait,n_samples=5,
                                       batch_size=1,
                                       n_jobs=0,
                                       n_steps=1,
                                       select_cip_max_tries=100,
                                       accept_annealing_factor=.5,
                                       generatormode=False,
                                       same_core_size=False )
        return rez
    '''

    def sample_graphs(self,
                      graphiter,
                      iter_neg,
                      radius,
                      how_many,
                      check_k,
                      heap_chunk_size=10):

        # some initialisation:
        # create the sampler,
        # set up heap and forest
        graphiter, iter2 = itertools.tee(graphiter)
        self.fit_sampler(iter2, iter_neg)

        heap, forest, avg_dist = self.get_heap_and_forest(graphiter, check_k)
        # heap should be like   (hpdist, count, graph)
        radius = radius * avg_dist
        # so let's start the looping
        result = []
        while heap and len(result) < how_many:

            # pop all the graphs we want
            todo = []
            for i in range(heap_chunk_size):
                if heap:
                    todo.append(heapq.heappop(heap))

            # let the sampler do the sampling
            graphz = [e[2] for e in todo]
            #draw.draw_graph_set_graphlearn(graphz)
            work = self.sampler.sample(graphz,
                                       batch_size=1,
                                       n_jobs=0,
                                       n_steps=30,
                                       select_cip_max_tries=100,
                                       improving_threshold=.5,
                                       generatormode=False,
                                       max_core_size_diff=False,
                                       n_samples=3)
            # we need to take care of:
            # - the initially popped graphs:
            #   increase and check the counter, reinsert into the heap
            # - the new graphs:
            #   put them into the heap and the forest
            for graph, task in zip(work, todo):
                graphlist = graph.graph['sampling_info']['graphs_history']
                print 'rez:', graphlist, task
                for graph2 in graphlist:
                    # check distance from created instances
                    x = self.vectorizer.transform_single(graph2)
                    dist, void = forest.kneighbors(x, 1)
                    dist = sum(dist)
                    # is the distance ok?
                    # if so, insert into forest and heap
                    if radius < dist < radius * 2:
                        forest.partial_fit(x)
                        heapq.heappush(heap,
                                       (graph2.graph['score'], 0, graph2))
                        print 'heap'
                    else:
                        print 'cant heap', radius, dist
                # taking care of task graph
                # put in result list if necessary
                if task[1] < check_k < task[1] + len(graphlist):
                    result.append(task[2])
                    print 'found sth'
                # go back to the heap!
                heapq.heappush(heap,
                               (task[0], task[1] + len(graphlist), task[2]))

        return result

    '''
    def simple_fit(self,iter_pos):
        self.sampler= GraphLearnSampler()
        self.sampler.fit(iter_pos)
        self.estimator=self.sampler.estimator
    '''

    def fit_sampler(self, iter_pos, iter_neg):
        # getting the sampler ready:
        self.sampler = MySampler(radius_list=[0, 1],
                                 thickness_list=[0.5, 1, 2])
        iter_pos, pos, pos_ = itertools.tee(iter_pos, 3)
        self.estimator = self.sampler.estimatorobject.fit_2(
            iter_pos, iter_neg, self.sampler.vectorizer)
        print 'got estimator'
        self.sampler.local_substitutable_graph_grammar.fit(
            pos, grammar_n_jobs=-1, grammar_batch_size=8)
        self.sampler.estimator = self.estimator
        print 'got grammar'
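A rough driver sketch for this class; the gspan file names and parameter values are placeholders, MySampler must be defined elsewhere in the project, and radius acts as a multiplier on the median nearest-neighbour distance computed above:

from eden.converter.graph.gspan import gspan_to_eden

pos = gspan_to_eden('positives.gspan')  # placeholder file names
neg = gspan_to_eden('negatives.gspan')

sampler = DiscSampler()
found = sampler.sample_graphs(pos, neg,
                              radius=1.5,
                              how_many=10,
                              check_k=30)
print 'found %d graphs' % len(found)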
Example #39
0
File: __init__.py  Project: smautner/EDeN
 def __init__(self):
     """Construct."""
     self.vectorizer = Vectorizer()
Example #40
0
class ClassifierWrapper(BaseEstimator, ClassifierMixin):
    """Classifier."""

    def __init__(self,
                 program=SGDClassifier(average=True,
                                       class_weight='balanced',
                                       shuffle=True)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            y = self._extract_targets(graphs)
            # manage case for single class learning
            if len(set(y)) == 1:
                # make negative data matrix
                negative_data_matrix = data_matrix.multiply(-1)
                # make targets
                y = list(y)
                y_neg = [-1] * len(y)
                # concatenate elements
                data_matrix = vstack(
                    [data_matrix, negative_data_matrix], format="csr")
                y = y + y_neg
                y = np.ravel(y)
            self.program = self.program.fit(data_matrix, y)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            predictions = self.program.predict(data_matrix)
            scores = self.program.decision_function(data_matrix)
            for score, prediction, graph in izip(scores, predictions, graphs):
                graph.graph['prediction'] = prediction
                graph.graph['score'] = score
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def _extract_targets(self, graphs):
        y = []
        for graph in graphs:
            if graph.graph.get('target', None) is not None:
                y.append(graph.graph['target'])
            else:
                raise Exception('Missing the attribute "target" '
                                'in graph dictionary!')
        y = np.ravel(y)
        return y
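A minimal usage sketch for this wrapper, in the Python 2 / networkx 1.x style of the surrounding code; the toy graphs and parameter values are invented, and the module-level vectorize helper used by fit is assumed to be importable:

import networkx as nx

def toy(label, target):
    # small labeled path graph; 'target' is read by _extract_targets
    g = nx.path_graph(3)
    for n in g.nodes():
        g.node[n]['label'] = label
    g.graph['target'] = target
    return g

graphs = [toy('A', 1) for _ in range(10)] + [toy('B', -1) for _ in range(10)]

clf = ClassifierWrapper()
clf.set_params(alpha=0.01, vectorizer__complexity=3)
annotated = list(clf.fit(graphs).predict(graphs))  # predict is a generator
print annotated[0].graph['prediction'], annotated[0].graph['score']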
Example #42
0
File: BFG1.py  Project: smautner/GraphLearn
import matplotlib
matplotlib.use('Agg')

from eden.converter.graph.gspan import gspan_to_eden
from graphlearn.graphlearn import GraphLearnSampler
from eden.graph import Vectorizer
import matplotlib.pyplot as plt
import itertools
from graphlearn.utils import myeden
from eden.util import fit_estimator as eden_fit_estimator
from eden.util import selection_iterator as picker
from sklearn.linear_model import SGDClassifier
import random


# a vectorizer
vectorizer = Vectorizer(complexity=3)

# select the first element of each graph list in an iterator
def unpack(graphs):
    for graphlist in graphs:
        yield graphlist[0]


def make_estimator(pos, neg):
    pos = vectorizer.transform(pos)
    neg = vectorizer.transform(neg)
    esti = eden_fit_estimator(SGDClassifier(), positive_data_matrix=pos,
                              negative_data_matrix=neg)
    return esti
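A sketch of how these pieces plug together; the file names are placeholders, and whether gspan_to_eden yields single graphs or graph lists depends on the EDeN version (hence the unpack helper above):

pos = gspan_to_eden('pos.gspan')  # placeholder file names
neg = gspan_to_eden('neg.gspan')
esti = make_estimator(pos, neg)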

Example #43
0
 def vectorizer_init(self, args):
     vectorizer = Vectorizer()
     vectorizer_parameters = {'complexity': [2, 3, 4, 5, 6]}
     return vectorizer, vectorizer_parameters
Example #44
0
        exit()

    print "*raw args"
    print "*"*80
    print args


    # verbosity
    from eden.util import configure_logging
    import logging
    configure_logging(logging.getLogger(), verbosity=args.pop('verbose'))


    # handle Vectorizer:
    from eden.graph import Vectorizer
    args['vectorizer'] = Vectorizer(args.pop('vectorizer_complexity'))


    # estimator: if the user provides a negative graph set, we use
    # the two-class estimator
    import graphlearn01.estimate as estimate
    if args['negative_input'] is None:
        args['estimator'] = estimate.OneClassEstimator(nu=.5, cv=2, n_jobs=-1)
    else:
        args['estimator'] = estimate.TwoClassEstimator(cv=2, n_jobs=-1)

    # args for fitting:
    from eden.io.gspan import gspan_to_eden
    from itertools import islice
    fitargs = {k: args.pop(k)
               for k in ['lsgg_include_negatives',
                         'grammar_n_jobs',
                         'grammar_batch_size']}
Example #45
0
 def __init__(self, multiprocess=True, score_attribute='importance'):
     self.score_attribute = score_attribute
     self.vectorizer = Vectorizer()
     self.multi_process = multiprocess
     self.trained = False
Example #46
0
 def __init__(self, program=None):
     """Construct."""
     self.program = program
     self.vectorizer = Vectorizer()
     self.params_vectorize = dict()
Example #47
0
File: estimator.py  Project: lukedan/EDeN
class EdenEstimator(BaseEstimator, ClassifierMixin):
    """Build an estimator for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 balance=False, subsample_size=200, ratio=2,
                 normalization=False, inner_normalization=False,
                 penalty='elasticnet', n_iter=500):
        """construct."""
        self.set_params(r, d, nbits, discrete, balance, subsample_size,
                        ratio, normalization, inner_normalization,
                        penalty, n_iter)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   balance=False, subsample_size=200, ratio=2,
                   normalization=False, inner_normalization=False,
                   penalty='elasticnet', n_iter=500):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.balance = balance
        self.subsample_size = subsample_size
        self.ratio = ratio
        if penalty == 'perceptron':
            self.model = Perceptron(n_iter=n_iter)
        else:
            self.model = SGDClassifier(
                average=True, class_weight='balanced', shuffle=True,
                penalty=penalty)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    @timeit
    def fit(self, graphs, targets, randomize=True):
        """fit."""
        if self.balance:
            if randomize:
                bal_graphs, bal_targets = balance(
                    graphs, targets, None, ratio=self.ratio)
            else:
                samp_graphs, samp_targets = subsample(
                    graphs, targets, subsample_size=self.subsample_size)
                x = self.transform(samp_graphs)
                self.model.fit(x, samp_targets)
                bal_graphs, bal_targets = balance(
                    graphs, targets, self, ratio=self.ratio)
            size = len(bal_targets)
            logger.debug('Dataset size=%d' % (size))
            x = self.transform(bal_graphs)
            self.model = self.model.fit(x, bal_targets)
        else:
            x = self.transform(graphs)
            self.model = self.model.fit(x, targets)
        return self

    @timeit
    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    @timeit
    def decision_function(self, graphs):
        """decision_function."""
        x = self.transform(graphs)
        preds = self.model.decision_function(x)
        return preds

    @timeit
    def cross_val_score(self, graphs, targets,
                        scoring='roc_auc', cv=5):
        """cross_val_score."""
        x = self.transform(graphs)
        scores = cross_val_score(
            self.model, x, targets, cv=cv, scoring=scoring)
        return scores

    @timeit
    def cross_val_predict(self, graphs, targets, cv=5):
        """cross_val_score."""
        x = self.transform(graphs)
        scores = cross_val_predict(
            self.model, x, targets, cv=cv, method='decision_function')
        return scores

    @timeit
    def cluster(self, graphs, n_clusters=16):
        """cluster."""
        x = self.transform(graphs)
        clust_est = MiniBatchKMeans(n_clusters=n_clusters)
        cluster_ids = clust_est.fit_predict(x)
        return cluster_ids

    @timeit
    def model_selection(self, graphs, targets,
                        n_iter=30, subsample_size=None):
        """model_selection_randomized."""
        param_distr = {"r": list(range(1, 5)), "d": list(range(0, 10))}
        if subsample_size:
            graphs, targets = subsample(
                graphs, targets, subsample_size=subsample_size)

        pool = mp.Pool()
        scores = pool.map(_eval, [(graphs, targets, param_distr)] * n_iter)
        pool.close()
        pool.join()

        best_params = max(scores)[1]
        logger.debug("Best parameters:\n%s" % (best_params))
        self = EdenEstimator(**best_params)
        return self

    @timeit
    def learning_curve(self, graphs, targets,
                       cv=5, n_steps=10, start_fraction=0.1):
        """learning_curve."""
        graphs, targets = paired_shuffle(graphs, targets)
        x = self.transform(graphs)
        train_sizes = np.linspace(start_fraction, 1.0, n_steps)
        scoring = 'roc_auc'
        train_sizes, train_scores, test_scores = learning_curve(
            self.model, x, targets,
            cv=cv, train_sizes=train_sizes,
            scoring=scoring)
        return train_sizes, train_scores, test_scores

    def bias_variance_decomposition(self, graphs, targets,
                                    cv=5, n_bootstraps=10):
        """bias_variance_decomposition."""
        x = self.transform(graphs)
        score_list = []
        for i in range(n_bootstraps):
            scores = cross_val_score(
                self.model, x, targets, cv=cv)
            score_list.append(scores)
        score_list = np.array(score_list)
        mean_scores = np.mean(score_list, axis=1)
        std_scores = np.std(score_list, axis=1)
        return mean_scores, std_scores
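A hedged end-to-end sketch for this estimator; the toy graphs and targets are invented, using networkx 1.x style node access with a 'label' attribute per node as elsewhere in EDeN:

import networkx as nx

def toy(labels):
    # path graph with one labeled node per character
    g = nx.path_graph(len(labels))
    for n, lab in zip(g.nodes(), labels):
        g.node[n]['label'] = lab
    return g

graphs = [toy('ABC') for _ in range(10)] + [toy('XYZ') for _ in range(10)]
targets = [1] * 10 + [-1] * 10

est = EdenEstimator(r=2, d=5).fit(graphs, targets)
print est.predict(graphs[:3])
print est.cross_val_score(graphs, targets, cv=3)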
Example #48
0
class Vectorizer(object):

    def __init__(self,
                 complexity=None,
                 nbits=20,
                 sequence_vectorizer_complexity=3,
                 graph_vectorizer_complexity=2,
                 n_neighbors=5,
                 sampling_prob=.5,
                 n_iter=5,
                 min_energy=-5,
                 random_state=1):
        random.seed(random_state)
        if complexity is not None:
            sequence_vectorizer_complexity = complexity
            graph_vectorizer_complexity = complexity

        self.sequence_vectorizer = SeqVectorizer(complexity=sequence_vectorizer_complexity,
                                                 nbits=nbits,
                                                 normalization=False,
                                                 inner_normalization=False)
        self.graph_vectorizer = GraphVectorizer(complexity=graph_vectorizer_complexity, nbits=nbits)
        self.n_neighbors = n_neighbors
        self.sampling_prob = sampling_prob
        self.n_iter = n_iter
        self.min_energy = min_energy
        self.nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, seqs):
        # store seqs
        self.seqs = list(normalize_seqs(seqs))
        data_matrix = self.sequence_vectorizer.transform(self.seqs)
        # fit nearest_neighbors model
        self.nearest_neighbors.fit(data_matrix)
        return self

    def fit_transform(self, seqs, sampling_prob=None, n_iter=None):
        seqs, seqs_ = tee(seqs)
        return self.fit(seqs_).transform(seqs, sampling_prob=sampling_prob, n_iter=n_iter)

    def transform(self, seqs, sampling_prob=None, n_iter=None):
        seqs = list(normalize_seqs(seqs))
        # forward the sampling parameters instead of silently dropping them
        graphs_ = self.graphs(seqs, sampling_prob=sampling_prob, n_iter=n_iter)
        data_matrix = self.graph_vectorizer.transform(graphs_)
        return data_matrix

    def graphs(self, seqs, sampling_prob=None, n_iter=None):
        seqs = list(normalize_seqs(seqs))
        if n_iter is not None:
            self.n_iter = n_iter
        if sampling_prob is not None:
            self.sampling_prob = sampling_prob
        for seq, neighs in self._compute_neighbors(seqs):
            if self.n_iter > 1:
                header, sequence, struct, energy = self._optimize_struct(seq, neighs)
            else:
                header, sequence, struct, energy = self._align_sequence_structure(seq, neighs)
            graph = self._seq_to_eden(header, sequence, struct, energy)
            yield graph

    def _optimize_struct(self, seq, neighs):
        structs = []
        results = []
        for i in range(self.n_iter):
            new_neighs = self._sample_neighbors(neighs)
            header, sequence, struct, energy = self._align_sequence_structure(seq, new_neighs)
            results.append((header, sequence, struct, energy))
            structs.append(struct)
        instance_id = self._most_representative(structs)
        selected = results[instance_id]
        return selected

    def _most_representative(self, structs):
        # compute kernel matrix with sequence_vectorizer
        data_matrix = self.sequence_vectorizer.transform(structs)
        kernel_matrix = pairwise_kernels(data_matrix, metric='rbf', gamma=1)
        # compute instance density as the average pairwise similarity
        density = np.sum(kernel_matrix, 0) / data_matrix.shape[0]
        # pick the most central (densest) instance
        max_id = np.argsort(-density)[0]
        return max_id

    def _sample_neighbors(self, neighs):
        out_neighs = []
        # insert one element at random
        out_neighs.append(random.choice(neighs))
        # add each remaining element independently with probability sampling_prob
        for neigh in neighs:
            if random.random() < self.sampling_prob:
                out_neighs.append(neigh)
        return out_neighs

    def _align_sequence_structure(self, seq, neighs, structure_deletions=False):
        header = seq[0]
        if len(neighs) < 1:
            clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1])
            energy = 0
            logger.debug('Warning: no alignment for: %s' % seq)
        else:
            str_out = convert_seq_to_fasta_str(seq)
            for neigh in neighs:
                str_out += convert_seq_to_fasta_str(neigh)
            cmd = 'echo "%s" | muscle -clwstrict -quiet' % (str_out)
            out = sp.check_output(cmd, shell=True)
            seed = extract_aligned_seed(header, out)
            cmd = 'echo "%s" | RNAalifold --noPS 2>/dev/null' % (out)
            out = sp.check_output(cmd, shell=True)
            struct, energy = extract_struct_energy(out)
            if energy > self.min_energy:
                # use min free energy structure
                clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1])
            else:
                clean_seq, clean_struct = make_seq_struct(seed, struct)
            if structure_deletions:
                clean_struct = self._clean_structure(clean_seq, clean_struct)

        return header, clean_seq, clean_struct, energy

    def _clean_structure(self, seq, stru):
        '''
        Parameters
        ----------
        seq : basestring
            rna sequence
        stru : basestring
            dotbracket string

        Returns
        -------
        a dot-bracket string consistent with the deletions in the sequence;
        the given structure may not respect them, so we transform it into
        one that does
        '''

        # find deletions in sequence
        ids = []
        for i, c in enumerate(seq):
            if c == '-':
                ids.append(i)
        # remove brackets that don't have a partner anymore
        stru = list(stru)
        pairdict = self._pairs(stru)
        for i in ids:
            stru[pairdict[i]] = '.'
        # delete deletions in structure
        ids.reverse()
        for i in ids:
            del stru[i]
        stru = ''.join(stru)

        # removing obvious mistakes
        stru = stru.replace("(())", "....")
        stru = stru.replace("(.)", "...")
        stru = stru.replace("(..)", "....")

        return stru

    def _pairs(self, struct):
        '''
        Parameters
        ----------
        struct : basestring

        Returns
        -------
        dictionary mapping each bracket position in struct to its bonded partner
        '''
        unpaired = []
        pairs = {}
        for i, c in enumerate(struct):
            if c == '(':
                unpaired.append(i)
            if c == ')':
                partner = unpaired.pop()
                pairs[i] = partner
                pairs[partner] = i
        return pairs

    def _compute_neighbors(self, seqs):
        seqs = list(seqs)
        data_matrix = self.sequence_vectorizer.transform(seqs)
        # find neighbors
        distances, neighbors = self.nearest_neighbors.kneighbors(data_matrix)
        # for each seq
        for seq, neighs in zip(seqs, neighbors):
            neighbor_seqs = [self.seqs[neigh] for neigh in neighs]
            yield seq, neighbor_seqs

    def _seq_to_eden(self, header, sequence, struct, energy):
        graph = sequence_dotbracket_to_graph(seq_info=sequence, seq_struct=struct)
        if graph.number_of_nodes() < 2:
            graph = seq_to_networkx(header, sequence)
        graph.graph['id'] = header
        graph.graph['info'] = 'muscle+RNAalifold energy=%.3f' % (energy)
        graph.graph['energy'] = energy
        graph.graph['sequence'] = sequence
        return graph
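A rough sketch of the intended call pattern; the toy sequences are invented, the (header, sequence) tuple format is an assumption based on how _align_sequence_structure indexes its input, and transform shells out to muscle and RNAalifold, so both must be on the PATH:

seqs = [('seq%d' % i, 'GGGCUAUUAGCUCAGUUGGUUAGAGC') for i in range(10)]

vec = Vectorizer(complexity=2, n_neighbors=3)
X = vec.fit_transform(seqs)  # graph-kernel features of the predicted structures
print X.shape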
Example #49
0
class RegressorWrapper(BaseEstimator, RegressorMixin):
    """Regressor."""
    def __init__(self, program=SGDRegressor(average=True, shuffle=True)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            y = self._extract_targets(graphs)
            self.program = self.program.fit(data_matrix, y)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            predictions = self.program.predict(data_matrix)
            for prediction, graph in izip(predictions, graphs):
                graph.graph['prediction'] = prediction
                graph.graph['score'] = prediction
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def _extract_targets(self, graphs):
        y = []
        for graph in graphs:
            if graph.graph.get('target', None) is not None:
                y.append(graph.graph['target'])
            else:
                raise Exception('Missing the attribute "target" '
                                'in graph dictionary!')
        y = np.ravel(y)
        return y
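Analogous to the classifier, a hedged usage sketch; the toy graphs are invented and 'target' holds a continuous value here:

import networkx as nx

def toy(value):
    g = nx.path_graph(3)
    for n in g.nodes():
        g.node[n]['label'] = 'A'
    g.graph['target'] = value  # continuous target for the regressor
    return g

graphs = [toy(float(i)) for i in range(20)]
reg = RegressorWrapper().fit(graphs)
out = list(reg.predict(graphs))  # generator of annotated graphs
print out[0].graph['prediction']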
Example #51
0
 def __init__(self):
     # this is mainly for the forest. the sampler uses a different vectorizer
     self.vectorizer = Vectorizer(nbits=14)
Example #52
0
class VolumeConstructor(object):
    """VolumeConstructor."""
    def __init__(self,
                 min_count=2,
                 max_n_neighbors=100,
                 r=3,
                 d=3,
                 class_discretizer=2,
                 class_std_discretizer=1,
                 similarity_discretizer=10,
                 size_discretizer=1,
                 volume_discretizer=10,
                 n_neighbors=10,
                 improve=True):
        """init."""
        self.improve = improve
        self.n_neighbors = n_neighbors
        self.non_norm_vec = Vectorizer(r=r,
                                       d=d,
                                       normalization=False,
                                       inner_normalization=False)
        self.vec = Vectorizer(r=r,
                              d=d,
                              normalization=True,
                              inner_normalization=True)
        self.grammar = GrammarWrapper(radius_list=[1, 2, 3],
                                      thickness_list=[2],
                                      min_cip_count=min_count,
                                      min_interface_count=min_count,
                                      max_n_neighbors=max_n_neighbors,
                                      n_neigh_steps=1,
                                      max_neighborhood_size=max_n_neighbors)
        self.sim_cost_estimator = SimVolPredStdSizeMultiObjectiveCostEstimator(
            self.vec,
            class_discretizer=class_discretizer,
            class_std_discretizer=class_std_discretizer,
            similarity_discretizer=similarity_discretizer,
            size_discretizer=size_discretizer,
            volume_discretizer=volume_discretizer,
            improve=improve)
        self.cost_estimator = MultiObjectiveCostEstimator(
            self.non_norm_vec, improve)
        self.nn_estimator = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, pos_graphs, neg_graphs):
        """fit."""
        self.all_graphs = pos_graphs + neg_graphs
        self.all_vecs = self.vec.transform(self.all_graphs)
        self.grammar.fit(self.all_graphs)
        logger.info('%s' % self.grammar)
        self.sim_cost_estimator.fit(pos_graphs, neg_graphs)
        self.cost_estimator.fit(pos_graphs, neg_graphs)
        self.nn_estimator.fit(self.all_vecs)

    def sample(self, sample_graphs):
        """sample."""
        # pareto filter using similarity of the dataset for initial seed
        costs = self.sim_cost_estimator.compute(sample_graphs)
        seed_graphs = get_pareto_set(sample_graphs, costs)

        # run optimization in parallel
        pareto_graphs_list = self._optimize_parallel(seed_graphs)
        self._log_result(pareto_graphs_list)

        # join all pareto sets
        pareto_set_graphs = pipe(pareto_graphs_list, concat, list)

        # pareto filter using similarity of the solutions
        pareto_set_costs = self.sim_cost_estimator.compute(pareto_set_graphs)
        sel_pareto_set_graphs = get_pareto_set(pareto_set_graphs,
                                               pareto_set_costs)
        logger.info('#constructed graphs:%5d' % (len(sel_pareto_set_graphs)))
        return sel_pareto_set_graphs

    def _log_result(self, pareto_graphs_list):
        tot_size = sum(len(graphs) for graphs in pareto_graphs_list)
        msg = 'pareto set sizes [%d]: ' % tot_size
        for graphs in pareto_graphs_list:
            msg += '[%d]' % len(graphs)
        logger.info(msg)

    def _optimize_parallel(self, reference_graphs):
        """optimize_parallel."""
        pool = multiprocessing.Pool()
        res = [
            apply_async(pool, self._optimize_single, args=(g, ))
            for g in reference_graphs
        ]
        pareto_set_graphs_list = [p.get() for p in res]
        pool.close()
        pool.join()
        return pareto_set_graphs_list

    def _get_constraints(self, reference_graph):
        reference_vec = self.non_norm_vec.transform([reference_graph])
        # find neighbors
        neighbors = self.nn_estimator.kneighbors(reference_vec,
                                                 return_distance=False)
        neighbors = neighbors[0]
        # compute center of mass
        reference_graphs = [self.all_graphs[i] for i in neighbors]
        reference_vecs = self.all_vecs[neighbors]
        avg_reference_vec = sp.sparse.csr_matrix.mean(reference_vecs, axis=0)

        reference_vecs = self.non_norm_vec.transform(reference_graphs)
        # compute desired distances
        desired_distances = euclidean_distances(avg_reference_vec,
                                                reference_vecs)
        desired_distances = desired_distances[0]
        return reference_graphs, desired_distances

    def _optimize_single(self, reference_graph):
        """optimize_single."""
        res = self._get_constraints(reference_graph)
        reference_graphs, desired_distances = res
        moo = MultiObjectiveOptimizer(self.vec,
                                      self.grammar,
                                      self.cost_estimator,
                                      max_neighborhood_order=1,
                                      max_n_iter=100)
        moo.fit(desired_distances, reference_graphs)
        pareto_set_graphs = moo.sample(reference_graphs)

        return pareto_set_graphs
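A rough driver sketch; pos_graphs and neg_graphs are placeholder lists of labeled networkx graphs, fit learns the grammar and the cost estimators, and sample runs the parallel Pareto optimization:

constructor = VolumeConstructor(min_count=2, n_neighbors=10)
constructor.fit(pos_graphs, neg_graphs)      # placeholder graph lists
new_graphs = constructor.sample(pos_graphs)  # Pareto-filtered constructions
print len(new_graphs)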
Example #53
0
 def __init__(self, program=SGDRegressor(average=True, shuffle=True)):
     """Construct."""
     self.program = program
     self.vectorizer = Vectorizer()
     self.params_vectorize = dict()
Example #54
0
 def __init__(self, program=None):
     """Construct."""
     self.program = program
     self.vectorizer = Vectorizer()
     self.params_vectorize = dict()
Example #56
0
class KNNWrapper(BaseEstimator, ClassifierMixin):
    """KNNWrapper."""
    def __init__(self, program=NearestNeighbors(n_neighbors=2)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            self.graphs = list(graphs)
            data_matrix = vectorize(self.graphs,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            self.program = self.program.fit(data_matrix)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            distances, indices = self.program.kneighbors(data_matrix)
            for knn_dists, knn_ids, graph in izip(distances, indices, graphs):
                neighbor_graphs = []
                for knn_id in knn_ids:
                    neighbor_graphs.append(self.graphs[knn_id])
                graph.graph['neighbors'] = neighbor_graphs
                graph.graph['ids'] = knn_ids
                graph.graph['distances'] = knn_dists
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
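A short usage sketch with invented toy graphs; kneighbors returns one row of ids and distances per query graph, which predict attaches to each yielded graph:

import networkx as nx

def toy(label):
    g = nx.path_graph(3)
    for n in g.nodes():
        g.node[n]['label'] = label
    return g

db = [toy(l) for l in 'ABCD']
knn = KNNWrapper().fit(db)          # default: 2 nearest neighbours
for g in knn.predict([toy('A')]):
    print g.graph['ids'], g.graph['distances']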
Example #57
0
 def __init__(self, program=NearestNeighbors(n_neighbors=2)):
     """Construct."""
     self.program = program
     self.vectorizer = Vectorizer()
     self.params_vectorize = dict()