Example #1
class KNNWrapper(BaseEstimator, ClassifierMixin):
    """KNNWrapper."""

    def __init__(self, program=NearestNeighbors(n_neighbors=2)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            self.graphs = list(graphs)
            data_matrix = self.vectorizer.transform(graphs)
            self.program = self.program.fit(data_matrix)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = self.vectorizer.transform(graphs_)
            distances, indices = self.program.kneighbors(data_matrix)
            for knn_dists, knn_ids, graph in izip(distances, indices, graphs):
                neighbor_graphs = []
                for knn_id in knn_ids:
                    neighbor_graphs.append(self.graphs[knn_id])
                graph.graph['neighbors'] = neighbor_graphs
                graph.graph['ids'] = knn_ids
                graph.graph['distances'] = knn_dists
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
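A minimal usage sketch for the wrapper above (not part of the original listing): it assumes the class is defined together with the imports the snippet omits (eden.graph.Vectorizer, itertools tee/izip, a module-level logger), and `train_graphs` / `query_graphs` are placeholder lists of EDeN-ready networkx graphs.

# Hypothetical usage sketch for KNNWrapper; train_graphs / query_graphs are placeholders.
from sklearn.neighbors import NearestNeighbors

knn = KNNWrapper(program=NearestNeighbors(n_neighbors=3))
# keys prefixed with "vectorizer__" are routed to the internal Vectorizer
knn.set_params(vectorizer__complexity=3, n_neighbors=3)
knn.fit(train_graphs)
for g in knn.predict(query_graphs):
    # each yielded graph carries its nearest training graphs and their distances
    print(g.graph['ids'], g.graph['distances'])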
Example #2
class IsomorphicClusterer(BaseEstimator, ClusterMixin):
    """IsomorphismClusterer.
    """

    def __init__(self):
        """Construct."""
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        for param in params:
            self.__dict__[param] = params[param]
        return self

    def fit_predict(self, graphs):
        """fit_predict."""
        def vec_to_hash(vec):
            return hash(tuple(vec.data + vec.indices))
        try:
            for graph in graphs:
                prediction = vec_to_hash(self.vectorizer.transform([graph]))
                yield prediction
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
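As a small usage sketch (not from the listing), the hash values yielded by fit_predict above can be used to bucket graphs whose NSPDK feature vectors are identical; `graphs` is a placeholder list of EDeN-ready networkx graphs and the snippet's omitted imports are assumed.

# Hypothetical grouping sketch built on IsomorphicClusterer.fit_predict.
from collections import defaultdict

clusterer = IsomorphicClusterer()
groups = defaultdict(list)
for graph, signature in zip(graphs, clusterer.fit_predict(graphs)):
    groups[signature].append(graph)
# graphs in the same bucket share an identical (hashed) feature vector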
Example #4
class EdenRegressor(BaseEstimator, RegressorMixin):
    """Build a regressor for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 normalization=True, inner_normalization=True,
                 penalty='elasticnet', loss='squared_loss'):
        """construct."""
        self.set_params(r, d, nbits, discrete,
                        normalization, inner_normalization,
                        penalty, loss)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   normalization=True, inner_normalization=True,
                   penalty='elasticnet', loss='squared_loss'):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.model = SGDRegressor(
            loss=loss, penalty=penalty,
            average=True, shuffle=True,
            max_iter=5, tol=None)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    def fit(self, graphs, targets, randomize=True):
        """fit."""
        x = self.transform(graphs)
        self.model = self.model.fit(x, targets)
        return self

    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    def decision_function(self, graphs):
        """decision_function."""
        return self.predict(graphs)
Example #5
def prep(graphlist, id=0):
    if not graphlist:
        return {}
    v = Vectorizer()
    map(lambda x: node_operation(x, lambda n, d: d.pop('weight', None)), graphlist)
    csr = v.transform(graphlist)
    hash_function = lambda vec: hash(tuple(vec.data + vec.indices))
    return {hash_function(row): (id, ith) for ith, row in enumerate(csr)}
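A usage sketch (not in the original listing) of how such a lookup can flag feature-identical graphs across two lists; `graphs_a` / `graphs_b` are placeholder lists and the snippet's helpers (Vectorizer, node_operation) are assumed to be imported.

# Hypothetical duplicate-detection sketch built on prep().
lookup_a = prep(graphs_a, id=0)
lookup_b = prep(graphs_b, id=1)
# hashes present in both lookups point at (list id, position) pairs
shared = {h: (lookup_a[h], lookup_b[h]) for h in set(lookup_a) & set(lookup_b)}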
Example #6
def make_fold_vectorize(complexity=3, nbits=15, fold=None, boundaries=None):
    """Curry parameters in vectorizer."""
    vec = Vectorizer(complexity=complexity, nbits=nbits)
    vectorize = curry(lambda vec, graphs: vec.transform(graphs))(vec)

    cwindow_reweight = curry(_window_reweight)(boundaries)
    fold_vectorize = compose(vectorize, map(cwindow_reweight), fold)
    return fold_vectorize
Example #7
def make_graphs(smiles):
    # Convert from SMILES to EdEN format
    eden_graph_generator = [smiles_to_eden(smi) for smi in smiles]
    # Compute graphs for each molecule
    graphs = [graph for graph in eden_graph_generator]
    vectorizer = Vectorizer(min_r=0, min_d=0, r=1, d=2)
    # Compute the NSPDK features and store in a sparse array
    sparse = vectorizer.transform(graphs)
    return sparse
Example #8
class InstanceMaker(object):
    """InstanceMaker."""
    def __init__(self, n_landmarks=5, n_neighbors=50):
        """init."""
        self.vec = Vectorizer(r=3,
                              d=3,
                              normalization=False,
                              inner_normalization=False)
        self.n_neighbors = n_neighbors
        self.n_landmarks = n_landmarks

    def fit(self, graphs, ntargets):
        """graphs/targets split, trains NN on graphs"""
        self.graphs = graphs[:-ntargets]
        self.targets = graphs[-ntargets:]

        vecs = self.vec.transform(self.graphs)
        if self.n_neighbors > len(self.graphs):
            self.n_neighbors = len(self.graphs)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors).fit(vecs)
        return self

    def get(self, idd=-1):
        if idd == -1:
            target_graph = self.targets.pop()
        else:
            target_graph = self.targets[idd]
        target_vec = self.vec.transform([target_graph])
        distances, neighbors = self.nn.kneighbors(target_vec,
                                                  return_distance=True)
        distances = distances[0]
        neighbors = neighbors[0]
        ranked_graphs = [self.graphs[i] for i in neighbors]
        landmark_graphs = ranked_graphs[:self.n_landmarks]
        desired_distances = distances[:self.n_landmarks]

        logger.debug(
            "target(%d,%d) and nn(%d,%d)" %
            (target_graph.number_of_nodes(), target_graph.number_of_edges(),
             ranked_graphs[0].number_of_nodes(),
             ranked_graphs[0].number_of_edges()))
        so.gprint([target_graph, ranked_graphs[0]], edgelabel='label')

        return landmark_graphs, desired_distances, ranked_graphs, target_graph
Example #10
def compute_NSPDK_features():
    import eden
    from eden.graph import Vectorizer
    from eden.converter.molecule.obabel import mol_file_to_iterable, obabel_to_eden
    mol_path = olfaction_prediction_path + '/data/sdf/'
    iter_mols = mol_file_to_iterable(mol_path + '/all_mol.sdf', 'sdf')
    iter_graphs = obabel_to_eden(iter_mols)

    vectorizer = Vectorizer(r=3, d=4)
    X = vectorizer.transform(iter_graphs)
    return X
Example #11
def _remove_similar_pairs(graphs):
    vec = Vectorizer(r=3, d=3,
                     normalization=False, inner_normalization=False)
    x = vec.transform(graphs)
    matrix = cosine_similarity(x)
    scores = np.array([1] * len(graphs))
    ids = min_similarity_selection(matrix,
                                   scores=scores,
                                   max_num=len(graphs) / 2)
    graphs = [graphs[i] for i in ids]
    logging.debug('similar pairs removal:%d' % len(graphs))
    return graphs
Example #12
class Annotator():

    def __init__(self, multiprocess=True, score_attribute='importance'):
        self.score_attribute = score_attribute
        self.vectorizer = Vectorizer()
        self.multi_process = multiprocess
        self.trained = False

    def fit(self, graphs_pos, graphs_neg=[]):

        if self.trained:
            return self
        self.trained = True
        map(utils.remove_eden_annotation, graphs_pos + graphs_neg)
        map(lambda x: utils.node_operation(x, lambda n, d: d.pop('importance', None)), graphs_pos + graphs_neg)
        map(lambda graph: graph.graph.pop('mass_annotate_mp_was_here', None), graphs_pos + graphs_neg)

        if graphs_neg:
            # print 'choosing to train binary esti'
            self.estimator = SGDClassifier()
            classes = [1] * len(graphs_pos) + [-1] * len(graphs_neg)
            self.estimator.fit(self.vectorizer.transform(graphs_pos + graphs_neg), classes)
        else:
            self.estimator = ExperimentalOneClassEstimator()
            self.estimator.fit(self.vectorizer.transform(graphs_pos))
        return self

    def fit_transform(self, graphs_p, graphs_n=[]):
        self.fit(graphs_p, graphs_n)
        return self.transform(graphs_p), self.transform(graphs_n)

    def transform(self, graphs):
        return self.annotate(graphs)

    def annotate(self, graphs, neg=False):
        if not graphs:
            return []
        return mass_annotate_mp(graphs, self.vectorizer, score_attribute=self.score_attribute, estimator=self.estimator,
                                multi_process=self.multi_process, invert_score=neg)
Example #13
def smiles2nspdk(input_path, complexity, nbits, save_path):
    """
    Smiles strings to nspdk descriptors
    :param input_path: path to file with SMILES
    :param complexity: descriptor complexity
    :param nbits: bits of descriptor
    :param save_path:
    :return:
    """
    vec = Vectorizer(complexity=complexity, nbits=nbits)
    smiles_list = load_dataset(input_path)
    res = vec.transform(list(smiles_strings_to_nx(smiles_list))).todense()
    output = open(save_path, "w")
    for row in res:
        np.savetxt(output, row)
Example #14
def _outliers(graphs, k=3):
    vec = Vectorizer(r=3, d=3,
                     normalization=False, inner_normalization=False)
    x = vec.transform(graphs)
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(x)
    neigbhbors = knn.kneighbors(x, return_distance=False)
    outlier_list = []
    non_outlier_list = []
    for i, ns in enumerate(neigbhbors):
        not_outlier = False
        for n in ns[1:]:
            if i in list(neigbhbors[n, :]):
                not_outlier = True
                break
        if not_outlier is False:
            outlier_list.append(i)
        else:
            non_outlier_list.append(i)
    return outlier_list, non_outlier_list
Example #15
class OrdererWrapper(BaseEstimator, ClassifierMixin):
    """Orderer."""

    def __init__(self, program=None):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_orderer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_orderer[param] = params[param]
        self.program.set_params(**params_orderer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def decision_function(self, graphs):
        """decision_function."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = self.vectorizer.transform(graphs_)
            scores = self.program.decision_function(data_matrix)
            return scores
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
Example #16
def clusterGraphs(graphs, r, d, copt):
    opts = copt[1:-1]
    optl = opts.split(",")
    opt = int(optl[0])
    vectorizer = Vectorizer(r=r, d=d)
    samples = len(graphs)
    minlclu = 5
    Xsp = vectorizer.transform(graphs)  #sparse feature matrix
    X = Xsp.todense()  #regular feature matrix
    #SM=metrics.pairwise.pairwise_kernels(Xsp, metric='rbf', gamma = 1)#similarity matrix
    SM = metrics.pairwise.pairwise_kernels(Xsp, metric='linear')
    DM = []  #distance matrix
    for i in range(len(SM)):
        DM.append([])
        for j in range(len(SM[i])):
            val = 1.0 - SM[i][j]
            if val < 0:
                DM[i].append(0.0)
            else:
                DM[i].append(val)
    if opt == 0:
        nc, labels = MShift(X)
    if opt == 1:
        #print(DM)
        minlclu = int(optl[2])
        nc, labels = DB_SCAN(DM, float(optl[1]), int(optl[2]))
    if opt == 2:
        nc, labels = AffProp(SM)
    if opt == 3:
        print(SM)  #Matrix(X)
        return 0, []
    if opt == 4:
        nc, labels = K_Means(X)
    if opt == 5:
        nc, labels = SpecClus(SM)
    if opt == 6:
        nc, labels = dclust(DM, int(optl[1]), int(optl[2]), float(optl[3]))

    return nc, labels, minlclu
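The nested loops above only turn the linear-kernel similarity matrix into a distance matrix clipped at zero; as a sketch, the same DM can be built in one vectorized step (assuming numpy is imported as np):

# Equivalent, vectorized construction of the clipped distance matrix.
DM = np.clip(1.0 - np.asarray(SM), 0.0, None).tolist()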
Example #17
def setup(self, known_graphs=None, candidate_graphs=None):
    """Setup."""
    # compute the nearest neighbors for the 'proposal_graphs' w.r.t. the
    # known graphs in the list 'known_graphs'
    parameters_priors = dict(n_neighbors=self.n_neighbors)
    parameters_priors.update(dict(vectorizer__complexity=self.complexity,
                                  vectorizer__discrete=True))
    fit_wrapped_knn_predictor_known = \
        model(known_graphs,
              program=KNNWrapper(program=NearestNeighbors()),
              parameters_priors=parameters_priors)
    # compute distances of candidate_graphs to known_graphs
    knn_candidate_graphs = predict(candidate_graphs,
                                   program=fit_wrapped_knn_predictor_known)
    knn_candidate_graphs = list(knn_candidate_graphs)
    self.distances_to_known_graphs = []
    for knn_candidate_graph in knn_candidate_graphs:
        distances = knn_candidate_graph.graph['distances']
        self.distances_to_known_graphs.append(distances)
    # compute candidate_graphs encodings
    vec = Vectorizer(complexity=self.complexity)
    self.candidate_graphs_data_matrix = vec.transform(candidate_graphs)
Example #19
def compare(finalL, L, peaks, opt, th, alpha):
    n = len(L)
    lpeaks = {}
    for key in L:
        lpeaks[key] = peaks[key]
    for key in finalL:
        lpeaks[key] = peaks[key]
    graphs, dict = peaksToGraphs(lpeaks, opt, alpha)

    vectorizer = Vectorizer(r=2, d=3)
    samples = len(graphs)
    Xsp = vectorizer.transform(graphs)  #sparse feature matrix
    X = Xsp.todense()  #regular feature matrix
    SM = metrics.pairwise.pairwise_kernels(Xsp, metric='rbf',
                                           gamma=1)  #similarity matrix
    DM = []  #distance matrix
    for i in range(len(SM)):
        DM.append([])
        for j in range(len(SM[i])):
            val = 1.0 - SM[i][j]
            if val < 0:
                DM[i].append(0.0)
            else:
                DM[i].append(val)
    avgDM = 0.0
    counts = 0.0
    for i in range(len(graphs)):
        if dict[i] in L:
            for j in range(len(graphs)):
                if i != j and dict[j] in finalL:
                    avgDM += DM[i][j]
                    counts += 1
    avgDM = avgDM / counts
    if avgDM >= 0.0 and avgDM <= th:
        return 0
    else:
        return 1
Example #20
class EdenEstimator(BaseEstimator, ClassifierMixin):
    """Build an estimator for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 balance=False, subsample_size=200, ratio=2,
                 normalization=False, inner_normalization=False,
                 penalty='elasticnet'):
        """construct."""
        self.set_params(r, d, nbits, discrete, balance, subsample_size,
                        ratio, normalization, inner_normalization,
                        penalty)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   balance=False, subsample_size=200, ratio=2,
                   normalization=False, inner_normalization=False,
                   penalty='elasticnet'):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.balance = balance
        self.subsample_size = subsample_size
        self.ratio = ratio
        if penalty == 'perceptron':
            self.model = Perceptron(max_iter=5, tol=None)
        else:
            self.model = SGDClassifier(
                average=True, class_weight='balanced', shuffle=True,
                penalty=penalty, max_iter=5, tol=None)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    def fit(self, graphs, targets, randomize=True):
        """fit."""
        if self.balance:
            if randomize:
                bal_graphs, bal_targets = balance(
                    graphs, targets, None, ratio=self.ratio)
            else:
                samp_graphs, samp_targets = subsample(
                    graphs, targets, subsample_size=self.subsample_size)
                x = self.transform(samp_graphs)
                self.model.fit(x, samp_targets)
                bal_graphs, bal_targets = balance(
                    graphs, targets, self, ratio=self.ratio)
            size = len(bal_targets)
            logger.debug('Dataset size=%d' % (size))
            x = self.transform(bal_graphs)
            self.model = self.model.fit(x, bal_targets)
        else:
            x = self.transform(graphs)
            self.model = self.model.fit(x, targets)
        return self

    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    def decision_function(self, graphs):
        """decision_function."""
        x = self.transform(graphs)
        preds = self.model.decision_function(x)
        return preds

    @timeit
    def cross_val_score(self, graphs, targets,
                        scoring='roc_auc', cv=5):
        """cross_val_score."""
        x = self.transform(graphs)
        scores = cross_val_score(
            self.model, x, targets, cv=cv, scoring=scoring)
        return scores

    @timeit
    def cross_val_predict(self, graphs, targets, cv=5):
        """cross_val_score."""
        x = self.transform(graphs)
        scores = cross_val_predict(
            self.model, x, targets, cv=cv, method='decision_function')
        return scores

    @timeit
    def cluster(self, graphs, n_clusters=16):
        """cluster."""
        x = self.transform(graphs)
        clust_est = MiniBatchKMeans(n_clusters=n_clusters)
        cluster_ids = clust_est.fit_predict(x)
        return cluster_ids

    @timeit
    def model_selection(self, graphs, targets,
                        n_iter=30, subsample_size=None):
        """model_selection_randomized."""
        param_distr = {"r": list(range(1, 5)), "d": list(range(0, 10))}
        if subsample_size:
            graphs, targets = subsample(
                graphs, targets, subsample_size=subsample_size)

        pool = mp.Pool()
        scores = pool.map(_eval, [(graphs, targets, param_distr)] * n_iter)
        pool.close()
        pool.join()

        best_params = max(scores)[1]
        logger.debug("Best parameters:\n%s" % (best_params))
        self = EdenEstimator(**best_params)
        return self

    @timeit
    def learning_curve(self, graphs, targets,
                       cv=5, n_steps=10, start_fraction=0.1):
        """learning_curve."""
        graphs, targets = paired_shuffle(graphs, targets)
        x = self.transform(graphs)
        train_sizes = np.linspace(start_fraction, 1.0, n_steps)
        scoring = 'roc_auc'
        train_sizes, train_scores, test_scores = learning_curve(
            self.model, x, targets,
            cv=cv, train_sizes=train_sizes,
            scoring=scoring)
        return train_sizes, train_scores, test_scores

    @timeit
    def bias_variance_decomposition(self, graphs, targets,
                                    cv=5, n_bootstraps=10):
        """bias_variance_decomposition."""
        x = self.transform(graphs)
        score_list = []
        for i in range(n_bootstraps):
            scores = cross_val_score(
                self.model, x, targets, cv=cv)
            score_list.append(scores)
        score_list = np.array(score_list)
        mean_scores = np.mean(score_list, axis=1)
        std_scores = np.std(score_list, axis=1)
        return mean_scores, std_scores
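A usage sketch for the estimator above (not part of the listing), assuming the omitted imports (Vectorizer, SGDClassifier, Perceptron, the balance/subsample helpers, the timeit decorator) are available; `graphs` and `targets` are placeholder lists of EDeN-ready networkx graphs and +1/-1 labels.

# Hypothetical usage sketch for EdenEstimator.
est = EdenEstimator(r=3, d=8, nbits=16)
est.fit(graphs, targets)
auc_scores = est.cross_val_score(graphs, targets, scoring='roc_auc', cv=5)
predictions = est.predict(graphs)
cluster_ids = est.cluster(graphs, n_clusters=8)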
Example #21
def vectorize(instances):
    vec = Vectorizer()
    return vec.transform(instances)
Example #22
class Vectorizer(object):

    def __init__(self,
                 complexity=None,
                 nbits=20,
                 sequence_vectorizer_complexity=3,
                 graph_vectorizer_complexity=2,
                 n_neighbors=5,
                 sampling_prob=.5,
                 n_iter=5,
                 min_energy=-5,
                 random_state=1):
        random.seed(random_state)
        if complexity is not None:
            sequence_vectorizer_complexity = complexity
            graph_vectorizer_complexity = complexity

        self.sequence_vectorizer = SeqVectorizer(complexity=sequence_vectorizer_complexity,
                                                 nbits=nbits,
                                                 normalization=False,
                                                 inner_normalization=False)
        self.graph_vectorizer = GraphVectorizer(complexity=graph_vectorizer_complexity, nbits=nbits)
        self.n_neighbors = n_neighbors
        self.sampling_prob = sampling_prob
        self.n_iter = n_iter
        self.min_energy = min_energy
        self.nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, seqs):
        # store seqs
        self.seqs = list(normalize_seqs(seqs))
        data_matrix = self.sequence_vectorizer.transform(self.seqs)
        # fit nearest_neighbors model
        self.nearest_neighbors.fit(data_matrix)
        return self

    def fit_transform(self, seqs, sampling_prob=None, n_iter=None):
        seqs, seqs_ = tee(seqs)
        return self.fit(seqs_).transform(seqs, sampling_prob=sampling_prob, n_iter=n_iter)

    def transform(self, seqs, sampling_prob=None, n_iter=None):
        seqs = list(normalize_seqs(seqs))
        graphs_ = self.graphs(seqs)
        data_matrix = self.graph_vectorizer.transform(graphs_)
        return data_matrix

    def graphs(self, seqs, sampling_prob=None, n_iter=None):
        seqs = list(normalize_seqs(seqs))
        if n_iter is not None:
            self.n_iter = n_iter
        if sampling_prob is not None:
            self.sampling_prob = sampling_prob
        for seq, neighs in self._compute_neighbors(seqs):
            if self.n_iter > 1:
                header, sequence, struct, energy = self._optimize_struct(seq, neighs)
            else:
                header, sequence, struct, energy = self._align_sequence_structure(seq, neighs)
            graph = self._seq_to_eden(header, sequence, struct, energy)
            yield graph

    def _optimize_struct(self, seq, neighs):
        structs = []
        results = []
        for i in range(self.n_iter):
            new_neighs = self._sample_neighbors(neighs)
            header, sequence, struct, energy = self._align_sequence_structure(seq, new_neighs)
            results.append((header, sequence, struct, energy))
            structs.append(struct)
        instance_id = self._most_representative(structs)
        selected = results[instance_id]
        return selected

    def _most_representative(self, structs):
        # compute kernel matrix with sequence_vectorizer
        data_matrix = self.sequence_vectorizer.transform(structs)
        kernel_matrix = pairwise_kernels(data_matrix, metric='rbf', gamma=1)
        # compute instance density as 1 over average pairwise distance
        density = np.sum(kernel_matrix, 0) / data_matrix.shape[0]
        # compute list of nearest neighbors
        max_id = np.argsort(-density)[0]
        return max_id

    def _sample_neighbors(self, neighs):
        out_neighs = []
        # insert one element at random
        out_neighs.append(random.choice(neighs))
        # add other elements sampling without replacement
        for neigh in neighs:
            if random.random() < self.sampling_prob:
                out_neighs.append(neigh)
        return out_neighs

    def _align_sequence_structure(self, seq, neighs, structure_deletions=False):
        header = seq[0]
        if len(neighs) < 1:
            clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1])
            energy = 0
            logger.debug('Warning: no alignment for: %s' % seq)
        else:
            str_out = convert_seq_to_fasta_str(seq)
            for neigh in neighs:
                str_out += convert_seq_to_fasta_str(neigh)
            cmd = 'echo "%s" | muscle -clwstrict -quiet' % (str_out)
            out = sp.check_output(cmd, shell=True)
            seed = extract_aligned_seed(header, out)
            cmd = 'echo "%s" | RNAalifold --noPS 2>/dev/null' % (out)
            out = sp.check_output(cmd, shell=True)
            struct, energy = extract_struct_energy(out)
            if energy > self.min_energy:
                # use min free energy structure
                clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1])
            else:
                clean_seq, clean_struct = make_seq_struct(seed, struct)
            if structure_deletions:
                clean_struct = self._clean_structure(clean_seq, clean_struct)

        return header, clean_seq, clean_struct, energy

    def _clean_structure(self, seq, stru):
        '''
        Parameters
        ----------
        seq : basestring
            rna sequence
        stru : basestring
            dotbracket string

        Returns
        -------
        the structure given may not respect deletions in the sequence.
        we transform the structure to one that does
        '''

        # find  deletions in sequence
        ids = []
        for i, c in enumerate(seq):
            if c == '-':
                ids.append(i)
        # remove brackets that dont have a partner anymore
        stru = list(stru)
        pairdict = self._pairs(stru)
        for i in ids:
            stru[pairdict[i]] = '.'
        # delete deletions in structure
        ids.reverse()
        for i in ids:
            del stru[i]
        stru = ''.join(stru)

        # removing obvious mistakes
        stru = stru.replace("(())", "....")
        stru = stru.replace("(.)", "...")
        stru = stru.replace("(..)", "....")

        return stru

    def _pairs(self, struct):
        '''
        Parameters
        ----------
        struct : basestring

        Returns
        -------
        dictionary of ids in the struct, that are bond pairs
        '''
        unpaired = []
        pairs = {}
        for i, c in enumerate(struct):
            if c == '(':
                unpaired.append(i)
            if c == ')':
                partner = unpaired.pop()
                pairs[i] = partner
                pairs[partner] = i
        return pairs

    def _compute_neighbors(self, seqs):
        seqs = list(seqs)
        data_matrix = self.sequence_vectorizer.transform(seqs)
        # find neighbors
        distances, neighbors = self.nearest_neighbors.kneighbors(data_matrix)
        # for each seq
        for seq, neighs in zip(seqs, neighbors):
            neighbor_seqs = [self.seqs[neigh] for neigh in neighs]
            yield seq, neighbor_seqs

    def _seq_to_eden(self, header, sequence, struct, energy):
        graph = sequence_dotbracket_to_graph(seq_info=sequence, seq_struct=struct)
        if graph.number_of_nodes() < 2:
            graph = seq_to_networkx(header, sequence)
        graph.graph['id'] = header
        graph.graph['info'] = 'muscle+RNAalifold energy=%.3f' % (energy)
        graph.graph['energy'] = energy
        graph.graph['sequence'] = sequence
        return graph
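A usage sketch for the sequence/structure Vectorizer above (not from the listing): it assumes the omitted imports plus the external muscle and RNAalifold binaries, and that inputs are (header, sequence) tuples as the helper functions expect; `reference_seqs` / `query_seqs` are placeholders.

# Hypothetical usage sketch; requires muscle and RNAalifold on the PATH.
vec = Vectorizer(complexity=3, n_neighbors=5, n_iter=5)
vec.fit(reference_seqs)        # neighbors for the alignment step come from this set
X = vec.transform(query_seqs)  # sparse NSPDK matrix of the predicted-structure graphs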
Example #23
class VolumeConstructor(object):
    """VolumeConstructor."""
    def __init__(self,
                 min_count=2,
                 max_n_neighbors=100,
                 r=3,
                 d=3,
                 class_discretizer=2,
                 class_std_discretizer=1,
                 similarity_discretizer=10,
                 size_discretizer=1,
                 volume_discretizer=10,
                 n_neighbors=10,
                 improve=True):
        """init."""
        self.improve = improve
        self.n_neighbors = n_neighbors
        self.non_norm_vec = Vectorizer(r=r,
                                       d=d,
                                       normalization=False,
                                       inner_normalization=False)
        self.vec = Vectorizer(r=r,
                              d=d,
                              normalization=True,
                              inner_normalization=True)
        self.grammar = GrammarWrapper(radius_list=[1, 2, 3],
                                      thickness_list=[2],
                                      min_cip_count=min_count,
                                      min_interface_count=min_count,
                                      max_n_neighbors=max_n_neighbors,
                                      n_neigh_steps=1,
                                      max_neighborhood_size=max_n_neighbors)
        self.sim_cost_estimator = SimVolPredStdSizeMultiObjectiveCostEstimator(
            self.vec,
            class_discretizer=class_discretizer,
            class_std_discretizer=class_std_discretizer,
            similarity_discretizer=similarity_discretizer,
            size_discretizer=size_discretizer,
            volume_discretizer=volume_discretizer,
            improve=improve)
        self.cost_estimator = MultiObjectiveCostEstimator(
            self.non_norm_vec, improve)
        self.nn_estimator = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, pos_graphs, neg_graphs):
        """fit."""
        self.all_graphs = pos_graphs + neg_graphs
        self.all_vecs = self.vec.transform(self.all_graphs)
        self.grammar.fit(self.all_graphs)
        logger.info('%s' % self.grammar)
        self.sim_cost_estimator.fit(pos_graphs, neg_graphs)
        self.cost_estimator.fit(pos_graphs, neg_graphs)
        self.nn_estimator.fit(self.all_vecs)

    def sample(self, sample_graphs):
        """sample."""
        # pareto filter using similarity of the dataset for initial seed
        costs = self.sim_cost_estimator.compute(sample_graphs)
        seed_graphs = get_pareto_set(sample_graphs, costs)

        # run optimization in parallel
        pareto_graphs_list = self._optimize_parallel(seed_graphs)
        self._log_result(pareto_graphs_list)

        # join all pareto sets
        pareto_set_graphs = pipe(pareto_graphs_list, concat, list)

        # pareto filter using similarity of the solutions
        pareto_set_costs = self.sim_cost_estimator.compute(pareto_set_graphs)
        sel_pareto_set_graphs = get_pareto_set(pareto_set_graphs,
                                               pareto_set_costs)
        logger.info('#constructed graphs:%5d' % (len(sel_pareto_set_graphs)))
        return sel_pareto_set_graphs

    def _log_result(self, pareto_graphs_list):
        tot_size = sum(len(graphs) for graphs in pareto_graphs_list)
        msg = 'pareto set sizes [%d]: ' % tot_size
        for graphs in pareto_graphs_list:
            msg += '[%d]' % len(graphs)
        logger.info(msg)

    def _optimize_parallel(self, reference_graphs):
        """optimize_parallel."""
        pool = multiprocessing.Pool()
        res = [
            apply_async(pool, self._optimize_single, args=(g, ))
            for g in reference_graphs
        ]
        pareto_set_graphs_list = [p.get() for p in res]
        pool.close()
        pool.join()
        return pareto_set_graphs_list

    def _get_constraints(self, reference_graph):
        reference_vec = self.non_norm_vec.transform([reference_graph])
        # find neighbors
        neighbors = self.nn_estimator.kneighbors(reference_vec,
                                                 return_distance=False)
        neighbors = neighbors[0]
        # compute center of mass
        reference_graphs = [self.all_graphs[i] for i in neighbors]
        reference_vecs = self.all_vecs[neighbors]
        avg_reference_vec = sp.sparse.csr_matrix.mean(reference_vecs, axis=0)

        reference_vecs = self.non_norm_vec.transform(reference_graphs)
        # compute desired distances
        desired_distances = euclidean_distances(avg_reference_vec,
                                                reference_vecs)
        desired_distances = desired_distances[0]
        return reference_graphs, desired_distances

    def _optimize_single(self, reference_graph):
        """optimize_single."""
        res = self._get_constraints(reference_graph)
        reference_graphs, desired_distances = res
        moo = MultiObjectiveOptimizer(self.vec,
                                      self.grammar,
                                      self.cost_estimator,
                                      max_neighborhood_order=1,
                                      max_n_iter=100)
        moo.fit(desired_distances, reference_graphs)
        pareto_set_graphs = moo.sample(reference_graphs)

        return pareto_set_graphs
Example #24
class ClassifierWrapper(BaseEstimator, ClassifierMixin):
    """Classifier."""

    def __init__(self,
                 program=SGDClassifier(average=True,
                                       class_weight='balanced',
                                       shuffle=True)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = self.vectorizer.transform(graphs_)
            y = self._extract_targets(graphs)
            # manage case for single class learning
            if len(set(y)) == 1:
                # make negative data matrix
                negative_data_matrix = data_matrix.multiply(-1)
                # make targets
                y = list(y)
                y_neg = [-1] * len(y)
                # concatenate elements
                data_matrix = vstack(
                    [data_matrix, negative_data_matrix], format="csr")
                y = y + y_neg
                y = np.ravel(y)
            self.program = self.program.fit(data_matrix, y)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = self.vectorizer.transform(graphs_)
            predictions = self.program.predict(data_matrix)
            scores = self.program.decision_function(data_matrix)
            for score, prediction, graph in izip(scores, predictions, graphs):
                graph.graph['prediction'] = prediction
                graph.graph['score'] = score
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def _extract_targets(self, graphs):
        y = []
        for graph in graphs:
            if graph.graph.get('target', None) is not None:
                y.append(graph.graph['target'])
            else:
                raise Exception('Missing the attribute "target" \
                    in graph dictionary!')
        y = np.ravel(y)
        return y
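A usage sketch for the classifier wrapper above (not part of the listing): fit() reads each label from graph.graph['target'] and predict() yields the input graphs annotated with 'prediction' and 'score'; `train_graphs`, `labels` and `test_graphs` are placeholders, with the snippet's omitted imports assumed.

# Hypothetical usage sketch for ClassifierWrapper.
for graph, label in zip(train_graphs, labels):
    graph.graph['target'] = label      # fit() expects the target stored on each graph
clf = ClassifierWrapper()
clf.fit(train_graphs)
for g in clf.predict(test_graphs):
    print(g.graph['prediction'], g.graph['score'])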
Example #25
def vectorize(thing):
    v = Vectorizer()
    if not thing:
        raise Exception("need something to vectorize.. received %s" % str(thing))
    thing = list(thing)  # current eden does not eat generators anymore? weird
    return v.transform(thing)
Example #26
    improved_graphs = sampler.transform(graphs_pos_,
                                        same_radius=False,
                                        size_constrained_core_choice=True,
                                        sampling_interval=9999,
                                        select_cip_max_tries=100,
                                        batch_size=int(count/4)+1,
                                        n_steps=100,
                                        n_jobs=-1,
                                        improving_threshold=0.9)



    #calculate the score of the improved versions
    #calculate score of the originals
    avg_imp=sum( [estimator.decision_function(e) for e in vectorizer.transform(unpack(improved_graphs)) ] )/count
    avg_ori=sum( [estimator.decision_function(e) for e in vectorizer.transform(graphs_pos___)] )/count
    improved.append(avg_imp)
    originals.append(avg_ori)


t = range(len(percentages))
# originals are blue
# improved ones are green

print originals
print improved
plt.plot(t,originals ,'bs')
plt.plot(t, improved ,'g^')
plt.savefig('zomg.png')
Example #27
class IdealGraphEstimator(object):
    """Build an estimator for graphs."""

    def __init__(
            self,
            min_count=2,
            max_n_neighbors=100,
            r=3,
            d=3,
            n_neighbors=10,
            max_num_solutions=30):
        """construct."""
        self.min_count = min_count
        self.max_n_neighbors = max_n_neighbors
        self.max_num_solutions = max_num_solutions
        self.r = r
        self.d = d
        self.n_neighbors = n_neighbors

        self.clf = Perceptron(n_iter=500)
        self.vec = Vectorizer(r=r, d=d,
                              normalization=True,
                              inner_normalization=True,
                              nbits=16)
        self.gs = [.05, .1, .2, .4, .6, .8, 1, 2, 4, 6]

    def fit(self, pos_graphs, neg_graphs):
        """fit."""
        ref_graphs = self.construct(pos_graphs, neg_graphs)
        logger.debug('Working on %d constructed graphs' % len(ref_graphs))
        y = [1] * len(pos_graphs) + [-1] * len(neg_graphs)
        x = self.vec.transform(pos_graphs + neg_graphs)
        z = self.vec.transform(ref_graphs)
        n_features = z.shape[0]
        k = np.hstack([pairwise_kernels(x, z, metric='rbf', gamma=g)
                       for g in self.gs])
        step = len(ref_graphs) / 2
        n_inst, n_feat = k.shape
        txt = 'RFECV on %d instances with %d features with step: %d' % \
            (n_inst, n_feat, step)
        logger.debug(txt)
        selector = RFECV(self.clf, step=step, cv=10)
        selector = selector.fit(k, y)

        ids = list(concat([range(n_features)] * len(self.gs)))
        gs_list = list(concat([[g] * n_features for g in self.gs]))

        feat = defaultdict(list)
        for g, i, s in zip(gs_list, ids, selector.support_):
            if s:
                feat[g].append(i)

        self.mats = dict()
        for g in sorted(feat):
            mat = vstack([z[i] for i in feat[g]])
            self.mats[g] = mat

        sel_ids = set([i for i, s in zip(ids, selector.support_) if s])
        self.ideal_graphs_ = [ref_graphs[i] for i in sel_ids]
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vec.transform(graphs)
        xtr = np.hstack([pairwise_kernels(x,
                                          self.mats[g], metric='rbf', gamma=g)
                         for g in sorted(self.mats)])
        return xtr

    def construct(self, pos_graphs, neg_graphs):
        """construct."""
        args = dict(
            min_count=self.min_count,
            max_n_neighbors=self.max_n_neighbors,
            r=self.r,
            d=self.d,
            n_landmarks=5,
            n_neighbors=self.n_neighbors,
            n_iter=20,
            k_best=5,
            max_num_solutions=self.max_num_solutions)
        self.active_constr = NearestNeighborsMeanOptimizer(
            improve=False, **args)
        self.active_constr.fit(pos_graphs, neg_graphs)
        graphs = pos_graphs + neg_graphs
        active_pareto_set_graphs = self.active_constr.optimize(graphs)

        self.pos_constr = NearestNeighborsMeanOptimizer(
            improve=True, **args)
        self.pos_constr.fit(pos_graphs, neg_graphs)
        pareto_set_graphs = self.pos_constr.optimize(graphs)

        sel_constructed_graphs = pareto_set_graphs + active_pareto_set_graphs
        return sel_constructed_graphs
Example #28
class NearestNeighborsMeanOptimizer(object):
    """NearestNeighborsMeanOptimizer."""
    def __init__(self,
                 min_count=2,
                 max_n_neighbors=None,
                 r=3,
                 d=3,
                 n_landmarks=5,
                 n_neighbors=100,
                 n_iter=20,
                 k_best=5,
                 max_num_solutions=30,
                 improve=True):
        """init."""
        self.improve = improve
        self.max_num = max_num_solutions
        self.n_landmarks = n_landmarks
        self.n_neighbors = n_neighbors
        self.nn_estimator = NearestNeighbors(n_neighbors=n_neighbors)
        self.non_norm_vec = Vectorizer(r=r,
                                       d=d,
                                       normalization=False,
                                       inner_normalization=False)
        self.vec = Vectorizer(r=r,
                              d=d,
                              normalization=True,
                              inner_normalization=True)
        self.dist_opt = LandmarksDistanceOptimizer(
            r=r,
            d=d,
            min_count=min_count,
            max_n_neighbors=max_n_neighbors,
            n_iter=n_iter,
            k_best=k_best,
            improve=improve)

    def fit(self, pos_graphs, neg_graphs):
        """fit."""
        self.all_graphs = pos_graphs + neg_graphs
        self.all_vecs = self.vec.transform(self.all_graphs)
        self.nn_estimator.fit(self.all_vecs)
        self.dist_opt.fit(pos_graphs, neg_graphs)
        self.sim_est = VarSimVolCostEstimator(improve=self.improve)
        self.sim_est.fit(pos_graphs, neg_graphs)

    def optimize(self, graphs):
        """optimize."""
        seed_graphs = self.select(graphs, max_num=self.max_num)

        # run optimization in parallel
        pareto_graphs_list = self._optimize_parallel(seed_graphs)
        self._log_result(pareto_graphs_list)

        # join all pareto sets
        pareto_set_graphs = pipe(pareto_graphs_list, concat, list)

        # pareto filter using similarity of the solutions
        sel_graphs = self.select(pareto_set_graphs, max_num=self.max_num)
        logger.debug('#constructed graphs:%5d' % (len(sel_graphs)))
        return sel_graphs

    def select(self, graphs, max_num=30):
        """select."""
        costs = self.sim_est.decision_function(graphs)
        pareto_graphs = get_pareto_set(graphs, costs)
        select_graphs = self.sim_est.select(pareto_graphs, k_best=max_num)
        i, p, s = len(graphs), len(pareto_graphs), len(select_graphs)
        logger.debug('initial:%d  pareto:%d  selected:%d' % (i, p, s))
        return select_graphs

    def _log_result(self, pareto_graphs_list):
        tot_size = sum(len(graphs) for graphs in pareto_graphs_list)
        msg = 'pareto set sizes [%d]: ' % tot_size
        for graphs in pareto_graphs_list:
            msg += '[%d]' % len(graphs)
        logger.debug(msg)

    def _optimize_parallel(self, reference_graphs):
        """optimize_parallel."""
        pool = multiprocessing.Pool()
        res = [
            apply_async(pool, self._optimize, args=(reference_graph, ))
            for reference_graph in reference_graphs
        ]
        pareto_set_graphs_list = [p.get() for p in res]
        pool.close()
        pool.join()
        return pareto_set_graphs_list

    def _optimize(self, reference_graph):
        """optimize_single."""
        constraints = self._get_constraints(reference_graph)
        graphs = self.dist_opt.optimize(*constraints)
        return graphs

    def _get_constraints(self, reference_graph):
        reference_vec = self.non_norm_vec.transform([reference_graph])
        # find neighbors
        neighbors = self.nn_estimator.kneighbors(reference_vec,
                                                 return_distance=False)
        neighbors = neighbors[0]
        # compute center of mass
        landmarks = neighbors[:self.n_landmarks]
        loc_graphs = [self.all_graphs[i] for i in neighbors]
        reference_graphs = [self.all_graphs[i] for i in landmarks]
        reference_vecs = self.all_vecs[landmarks]
        avg_reference_vec = sp.sparse.csr_matrix.mean(reference_vecs, axis=0)

        reference_vecs = self.non_norm_vec.transform(reference_graphs)
        # compute desired distances
        desired_distances = euclidean_distances(avg_reference_vec,
                                                reference_vecs)
        desired_distances = desired_distances[0]
        return reference_graphs, desired_distances, loc_graphs
Example #29
    improved_graphs = sampler.sample(graphs_pos_,
                                     same_radius=False,
                                     max_size_diff=True,
                                     sampling_interval=9999,
                                     select_cip_max_tries=100,
                                     batch_size=int(count/4)+1,
                                     n_steps=100,
                                     n_jobs=-1,
                                     improving_threshold=0.9)



    #calculate the score of the improved versions
    #calculate score of the originals
    avg_imp=sum( [estimator.decision_function(e) for e in vectorizer.transform(unpack(improved_graphs)) ] )/count
    avg_ori=sum( [estimator.decision_function(e) for e in vectorizer.transform(graphs_pos___)] )/count
    improved.append(avg_imp)
    originals.append(avg_ori)


t = range(len(percentages))
# originals are blue
# improved ones are green

print originals
print improved
plt.plot(t,originals ,'bs')
plt.plot(t, improved ,'g^')
plt.savefig('zomg.png')
Example #30
class DiscSampler():
    '''
    '''
    def __init__(self):
        # this is mainly for the forest. the sampler uses a different vectorizer
        self.vectorizer = Vectorizer(nbits=14)

    def get_heap_and_forest(self, griter, k):
        '''
        so we create the heap and the forest...
        heap is (dist to hyperplane, count, graph)
        and the forest ist just a nearest neighbor from sklearn
        '''

        graphs = list(griter)
        graphs2 = copy.deepcopy(graphs)
        # transform doess mess up the graph objects
        X = self.vectorizer.transform(graphs)

        forest = LSHForest()
        forest.fit(X)
        print 'got forest'

        heap = []
        for vector, graph in zip(X, graphs2):
            graph2 = nx.Graph(graph)
            heapq.heappush(
                heap,
                (
                    self.sampler.estimator.predict_proba(
                        self.sampler.vectorizer.transform_single(
                            graph2))[0][1],  # score ~ dist from hyperplane
                    k +
                    1,  # making sure that the counter is high so we dont output the startgraphz at the end
                    graph))  # at last the actual graph

        print 'got heap'
        distances, unused = forest.kneighbors(X, n_neighbors=2)
        distances = [a[1] for a in distances
                     ]  # the second element should be the dist we want
        avg_dist = distances[len(distances) /
                             2]  # sum(distances)/len(distances)
        print 'got dist'

        return heap, forest, avg_dist

    '''
    def sample_simple(self,graphiter,iterneg):
        graphiter,grait,griter2 = itertools.tee(graphiter,3)
        
        self.fit_sampler(graphiter,iterneg)
        a,b,c=self.get_heap_and_forest( griter2, 30)


        grait= itertools.islice(grait,5)
        rez=self.sampler.sample(grait,n_samples=5,
                                       batch_size=1,
                                       n_jobs=0,
                                       n_steps=1,
                                       select_cip_max_tries=100,
                                       accept_annealing_factor=.5,
                                       generatormode=False,
                                       same_core_size=False )
        return rez
    '''

    def sample_graphs(self,
                      graphiter,
                      iter_neg,
                      radius,
                      how_many,
                      check_k,
                      heap_chunk_size=10):

        # some initialisation,
        # creating samper
        # setup heap and forest
        graphiter, iter2 = itertools.tee(graphiter)
        self.fit_sampler(iter2, iter_neg)

        heap, forest, avg_dist = self.get_heap_and_forest(graphiter, check_k)
        # heap should be like   (hpdist, count, graph)
        radius = radius * avg_dist
        # so lets start the loop1ng
        result = []
        while heap and len(result) < how_many:

            # pop all the graphs we want
            todo = []
            for i in range(heap_chunk_size):
                if heap:
                    todo.append(heapq.heappop(heap))

            # let the sampler do the sampling
            graphz = [e[2] for e in todo]
            #draw.draw_graph_set_graphlearn(graphz)
            work = self.sampler.sample(graphz,
                                       batch_size=1,
                                       n_jobs=0,
                                       n_steps=30,
                                       select_cip_max_tries=100,
                                       improving_threshold=.5,
                                       generatormode=False,
                                       max_core_size_diff=False,
                                       n_samples=3)
            # lets see, we need to take care of
            # = the initialy poped stuff
            # - increase and check the counter, reinsert into heap
            # = the new graphs
            # put them in the heap and the forest
            for graph, task in zip(work, todo):
                graphlist = graph.graph['sampling_info']['graphs_history']
                print 'rez:', graphlist, task
                for graph2 in graphlist:
                    # check distance from created instances
                    x = self.vectorizer.transform_single(graph2)
                    dist, void = forest.kneighbors(x, 1)
                    dist = sum(dist)
                    # is the distance ok?
                    # if so, insert into forest and heap
                    if radius < dist < radius * 2:
                        forest.partial_fit(x)
                        heapq.heappush(heap,
                                       (graph2.graph['score'], 0, graph2))
                        print 'heap'
                    print 'cant heap', radius, dist
                # taking care of task graph
                # put in result list if necessary
                if task[1] < check_k < task[1] + len(graphlist):
                    result.append(task[2])
                    print 'found sth'
                # go back to the heap!
                heapq.heappush(heap,
                               (task[0], task[1] + len(graphlist), task[2]))

        return result

    '''
    def simple_fit(self,iter_pos):
        self.sampler= GraphLearnSampler()
        self.sampler.fit(iter_pos)
        self.estimator=self.sampler.estimator
    '''

    def fit_sampler(self, iter_pos, iter_neg):
        # getting the sampler ready:
        self.sampler = MySampler(radius_list=[0, 1],
                                 thickness_list=[0.5, 1, 2])
        iter_pos, pos, pos_ = itertools.tee(iter_pos, 3)
        self.estimator = self.sampler.estimatorobject.fit_2(
            iter_pos, iter_neg, self.sampler.vectorizer)
        print 'got estimator'
        self.sampler.local_substitutable_graph_grammar.fit(
            pos, grammar_n_jobs=-1, grammar_batch_size=8)
        self.sampler.estimator = self.estimator
        print 'got grammar'
Example #31
class EdenEstimator(BaseEstimator, ClassifierMixin):
    """Build an estimator for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 balance=False, subsample_size=200, ratio=2,
                 normalization=False, inner_normalization=False,
                 penalty='elasticnet', n_iter=500):
        """construct."""
        self.set_params(r, d, nbits, discrete, balance, subsample_size,
                        ratio, normalization, inner_normalization,
                        penalty, n_iter)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   balance=False, subsample_size=200, ratio=2,
                   normalization=False, inner_normalization=False,
                   penalty='elasticnet', n_iter=500):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.balance = balance
        self.subsample_size = subsample_size
        self.ratio = ratio
        if penalty == 'perceptron':
            self.model = Perceptron(n_iter=n_iter)
        else:
            self.model = SGDClassifier(
                average=True, class_weight='balanced', shuffle=True,
                penalty=penalty)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    @timeit
    def fit(self, graphs, targets, randomize=True):
        """fit."""
        if self.balance:
            if randomize:
                bal_graphs, bal_targets = balance(
                    graphs, targets, None, ratio=self.ratio)
            else:
                samp_graphs, samp_targets = subsample(
                    graphs, targets, subsample_size=self.subsample_size)
                x = self.transform(samp_graphs)
                self.model.fit(x, samp_targets)
                bal_graphs, bal_targets = balance(
                    graphs, targets, self, ratio=self.ratio)
            size = len(bal_targets)
            logger.debug('Dataset size=%d' % (size))
            x = self.transform(bal_graphs)
            self.model = self.model.fit(x, bal_targets)
        else:
            x = self.transform(graphs)
            self.model = self.model.fit(x, targets)
        return self

    @timeit
    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    @timeit
    def decision_function(self, graphs):
        """decision_function."""
        x = self.transform(graphs)
        preds = self.model.decision_function(x)
        return preds

    @timeit
    def cross_val_score(self, graphs, targets,
                        scoring='roc_auc', cv=5):
        """cross_val_score."""
        x = self.transform(graphs)
        scores = cross_val_score(
            self.model, x, targets, cv=cv, scoring=scoring)
        return scores

    @timeit
    def cross_val_predict(self, graphs, targets, cv=5):
        """cross_val_score."""
        x = self.transform(graphs)
        scores = cross_val_predict(
            self.model, x, targets, cv=cv, method='decision_function')
        return scores

    @timeit
    def cluster(self, graphs, n_clusters=16):
        """cluster."""
        x = self.transform(graphs)
        clust_est = MiniBatchKMeans(n_clusters=n_clusters)
        cluster_ids = clust_est.fit_predict(x)
        return cluster_ids

    @timeit
    def model_selection(self, graphs, targets,
                        n_iter=30, subsample_size=None):
        """model_selection_randomized."""
        param_distr = {"r": list(range(1, 5)), "d": list(range(0, 10))}
        if subsample_size:
            graphs, targets = subsample(
                graphs, targets, subsample_size=subsample_size)

        pool = mp.Pool()
        scores = pool.map(_eval, [(graphs, targets, param_distr)] * n_iter)
        pool.close()
        pool.join()

        best_params = max(scores)[1]
        logger.debug("Best parameters:\n%s" % (best_params))
        self = EdenEstimator(**best_params)
        return self

    @timeit
    def learning_curve(self, graphs, targets,
                       cv=5, n_steps=10, start_fraction=0.1):
        """learning_curve."""
        graphs, targets = paired_shuffle(graphs, targets)
        x = self.transform(graphs)
        train_sizes = np.linspace(start_fraction, 1.0, n_steps)
        scoring = 'roc_auc'
        train_sizes, train_scores, test_scores = learning_curve(
            self.model, x, targets,
            cv=cv, train_sizes=train_sizes,
            scoring=scoring)
        return train_sizes, train_scores, test_scores

    def bias_variance_decomposition(self, graphs, targets,
                                    cv=5, n_bootstraps=10):
        """bias_variance_decomposition."""
        x = self.transform(graphs)
        score_list = []
        for i in range(n_bootstraps):
            scores = cross_val_score(
                self.model, x, targets, cv=cv)
            score_list.append(scores)
        score_list = np.array(score_list)
        mean_scores = np.mean(score_list, axis=1)
        std_scores = np.std(score_list, axis=1)
        return mean_scores, std_scores
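
A minimal usage sketch for EdenEstimator above, assuming EDeN-style networkx graphs whose nodes and edges carry a 'label' attribute; the toy graph construction, parameter values, and the networkx 1.x-era API below are illustrative assumptions, not part of the original source:

import networkx as nx

def toy_graph(n):
    # hypothetical helper: a labeled path graph compatible with the Vectorizer
    g = nx.path_graph(n)
    for u in g.nodes():
        g.node[u]['label'] = str(u % 2)
    for u, v in g.edges():
        g[u][v]['label'] = '1'
    return g

graphs = [toy_graph(n) for n in range(3, 23)]
targets = [1 if n % 2 == 0 else -1 for n in range(3, 23)]

est = EdenEstimator(r=2, d=5, nbits=16)
est.fit(graphs, targets)
print(est.predict(graphs[:4]))
print(est.cross_val_score(graphs, targets, cv=3))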
Example #32
class RegressorWrapper(BaseEstimator, RegressorMixin):
    """Regressor."""

    def __init__(self,
                 program=SGDRegressor(average=True, shuffle=True)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method.

        Returns
        -------
        self
        """
        # finds parameters for the vectorizer as those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = self.vectorizer.transform(graphs_)
            y = self._extract_targets(graphs)
            self.program = self.program.fit(data_matrix, y)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = self.vectorizer.transform(graphs_)
            predictions = self.program.predict(data_matrix)
            for prediction, graph in izip(predictions, graphs):
                graph.graph['prediction'] = prediction
                graph.graph['score'] = prediction
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def _extract_targets(self, graphs):
        y = []
        for graph in graphs:
            if graph.graph.get('target', None) is not None:
                y.append(graph.graph['target'])
            else:
                raise Exception(
                    'Missing the attribute "target" in graph dictionary!')
        y = np.ravel(y)
        return y
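
A minimal usage sketch for RegressorWrapper; it relies on fit() reading targets from graph.graph['target'] and on predict() being a generator that annotates and yields the input graphs. The toy graphs below are illustrative assumptions (networkx 1.x-era API, EDeN-style 'label' attributes):

import networkx as nx

def toy_graph(n):
    # hypothetical helper: labeled path graph with a regression target
    g = nx.path_graph(n)
    for u in g.nodes():
        g.node[u]['label'] = 'A'
    for u, v in g.edges():
        g[u][v]['label'] = '1'
    g.graph['target'] = float(n)   # read by RegressorWrapper.fit()
    return g

graphs = [toy_graph(n) for n in range(3, 13)]

reg = RegressorWrapper()
reg.fit(graphs)
# predict() yields the same graphs annotated with 'prediction' and 'score'
for g in reg.predict(graphs[:3]):
    print(g.graph['prediction'])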
Example #33
class DiscSampler():
    '''
    Sampler that generates new graphs and keeps those lying at a
    controlled distance from the already known instances.
    '''

    def __init__(self):
        # this is mainly for the forest. the sampler uses a different vectorizer
        self.vectorizer = Vectorizer(nbits=14)

    def get_heap_and_forest(self, griter, k):
        '''
        Create the heap and the forest.
        Heap entries are (distance to hyperplane, count, graph);
        the forest is just a nearest-neighbour index from sklearn.
        '''

        graphs = list(griter)
        graphs2 = copy.deepcopy(graphs)
        # transform modifies the graph objects, hence the deep copy above
        X = self.vectorizer.transform(graphs)

        forest = LSHForest()
        forest.fit(X)
        print 'got forest'

        heap = []
        for vector, graph in zip(X, graphs2):
            graph2 = nx.Graph(graph)
            heapq.heappush(heap, (
                self.sampler.estimator.predict_proba(self.sampler.vectorizer.transform_single(graph2))[0][1],
                # score ~ dist from hyperplane
                k + 1,  # make sure the counter is high so we don't output the start graphs at the end
                graph))  # at last the actual graph

        print 'got heap'
        distances, unused = forest.kneighbors(X, n_neighbors=2)
        distances = [a[1] for a in distances]  # the second element should be the dist we want
        avg_dist = distances[len(distances) / 2]  # middle element as a rough average distance
        print 'got dist'

        return heap, forest, avg_dist

    '''
    def sample_simple(self,graphiter,iterneg):
        graphiter,grait,griter2 = itertools.tee(graphiter,3)
        
        self.fit_sampler(graphiter,iterneg)
        a,b,c=self.get_heap_and_forest( griter2, 30)


        grait= itertools.islice(grait,5)
        rez=self.sampler.sample(grait,n_samples=5,
                                       batch_size=1,
                                       n_jobs=0,
                                       n_steps=1,
                                       select_cip_max_tries=100,
                                       accept_annealing_factor=.5,
                                       generatormode=False,
                                       same_core_size=False )
        return rez
    '''

    def sample_graphs(self, graphiter, iter_neg, radius, how_many, check_k, heap_chunk_size=10):

        # some initialisation:
        # create the sampler,
        # set up the heap and the forest
        graphiter, iter2 = itertools.tee(graphiter)
        self.fit_sampler(iter2, iter_neg)

        heap, forest, avg_dist = self.get_heap_and_forest(graphiter, check_k)
        # heap should be like   (hpdist, count, graph)
        radius = radius * avg_dist
        # so let's start the looping
        result = []
        while heap and len(result) < how_many:

            # pop all the graphs we want
            todo = []
            for i in range(heap_chunk_size):
                if heap:
                    todo.append(heapq.heappop(heap))

            # let the sampler do the sampling
            graphz = [e[2] for e in todo]
            # draw.draw_graph_set_graphlearn(graphz)
            work = self.sampler.sample(graphz,
                                       batch_size=1,
                                       n_jobs=0,
                                       n_steps=30,
                                       select_cip_max_tries=100,
                                       improving_threshold=.5,
                                       generatormode=False,
                                       max_core_size_diff=False,
                                       n_samples=3
                                       )
            # we need to take care of:
            # - the initially popped graphs:
            #   increase and check the counter, reinsert into the heap
            # - the new graphs:
            #   put them in the heap and the forest
            for graph, task in zip(work, todo):
                graphlist = graph.graph['sampling_info']['graphs_history']
                print 'rez:', graphlist, task
                for graph2 in graphlist:
                    # check distance from created instances
                    x = self.vectorizer.transform_single(graph2)
                    dist, void = forest.kneighbors(x, 1)
                    dist = sum(dist)
                    # is the distance ok?
                    # if so, insert into forest and heap
                    if radius < dist < radius * 2:
                        forest.partial_fit(x)
                        heapq.heappush(heap, (graph2.graph['score'], 0, graph2))
                        print 'heap'
                    else:
                        print 'cant heap', radius, dist
                # taking care of task graph
                # put in result list if necessary
                if task[1] < check_k < task[1] + len(graphlist):
                    result.append(task[2])
                    print 'found sth'
                # go back to the heap!
                heapq.heappush(heap, (task[0], task[1] + len(graphlist), task[2]))

        return result

    '''
    def simple_fit(self,iter_pos):
        self.sampler= GraphLearnSampler()
        self.sampler.fit(iter_pos)
        self.estimator=self.sampler.estimator
    '''

    def fit_sampler(self, iter_pos, iter_neg):
        # getting the sampler ready:
        self.sampler = MySampler(radius_list=[0, 1], thickness_list=[0.5, 1, 2])
        iter_pos, pos, pos_ = itertools.tee(iter_pos, 3)
        self.estimator = self.sampler.estimatorobject.fit_2(iter_pos, iter_neg, self.sampler.vectorizer)
        print 'got estimator'
        self.sampler.local_substitutable_graph_grammar.fit(pos, grammar_n_jobs=-1, grammar_batch_size=8)
        self.sampler.estimator = self.estimator
        print 'got grammar'
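
The core of sample_graphs above is a heap of (score, counter, graph) tuples plus a distance-band filter: a newly sampled graph is kept only if its nearest-neighbour distance lies strictly between radius and 2 * radius. A stripped-down sketch of that acceptance rule, with illustrative names not taken from the original code:

import heapq

def accept_in_band(dist, radius):
    # neither a near-duplicate (<= radius) nor an outlier (>= 2 * radius)
    return radius < dist < 2 * radius

# heap entries mirror the (score, counter, graph) tuples used above;
# heappop returns the entry with the lowest score first
heap = []
heapq.heappush(heap, (0.3, 0, 'graph_a'))
heapq.heappush(heap, (0.1, 0, 'graph_b'))
print(heapq.heappop(heap))        # (0.1, 0, 'graph_b')
print(accept_in_band(1.5, 1.0))   # True: inside the (radius, 2*radius) band
print(accept_in_band(0.5, 1.0))   # False: too close to known instances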
Example #34
class ListVectorizer(Vectorizer):
    """Transform vector labeled, weighted, nested graphs in sparse vectors.

    A list of iterators over graphs and a list of weights are taken as input.
    The returned vector is the linear combination of sparse vectors obtained on each
    corresponding graph.
    """

    def __init__(self,
                 complexity=3,
                 r=None,
                 d=None,
                 min_r=0,
                 min_d=0,
                 nbits=20,
                 normalization=True,
                 inner_normalization=True,
                 n=1,
                 min_n=2):
        """
        Arguments:


        complexity : int
          The complexity of the features extracted.

        r : int
          The maximal radius size.

        d : int
          The maximal distance size.

        min_r : int
          The minimal radius size.

        min_d : int
          The minimal distance size.

        nbits : int
          The number of bits that defines the feature space size: |feature space|=2^nbits.

        normalization : bool
          If set the resulting feature vector will have unit euclidean norm.

        inner_normalization : bool
          If set the feature vector for a specific combination of the radius and
          distance size will have unit euclidean norm.
          When used together with the 'normalization' flag it will be applied first and
          then the resulting feature vector will be normalized.

        n : int
          The maximal number of clusters used to discretize label vectors.

        min_n : int
          The minimal number of clusters used to discretize label vectors.
        """
        self.vectorizer = Vectorizer(complexity=complexity,
                                     r=r,
                                     d=d,
                                     min_r=min_r,
                                     min_d=min_d,
                                     nbits=nbits,
                                     normalization=normalization,
                                     inner_normalization=inner_normalization,
                                     n=n,
                                     min_n=min_n)
        self.vectorizers = list()

    def fit(self, graphs_iterators_list):
        """
        Constructs an approximate explicit mapping of a kernel function on the data
        stored in the nodes of the graphs.

        Arguments:

        graphs_iterators_list : list of iterators over networkx graphs.
          The data.
        """
        for i, graphs in enumerate(graphs_iterators_list):
            self.vectorizers.append(copy.copy(self.vectorizer))
            self.vectorizers[i].fit(graphs)

    def fit_transform(self, graphs_iterators_list, weights=list()):
        """
        Arguments:

        graphs_iterators_list : list of iterators over networkx graphs.
          The data.

        weights : list of positive real values.
          Weights for the linear combination of sparse vectors obtained on each iterated tuple of graphs.
        """
        graphs_iterators_list_fit, graphs_iterators_list_transf = itertools.tee(graphs_iterators_list)
        self.fit(graphs_iterators_list_fit)
        return self.transform(graphs_iterators_list_transf)

    def transform(self, graphs_iterators_list, weights=list()):
        """
        Transforms a list of networkx graphs into a scipy sparse matrix
        in CSR format (Compressed Sparse Row).

        Arguments:

        graphs_iterators_list : list of iterators over networkx graphs.
          The data.

        weights : list of positive real values.
          Weights for the linear combination of sparse vectors obtained on each iterated tuple of graphs.
        """
        # if no weights are provided then assume unitary weight
        if len(weights) == 0:
            weights = [1] * len(graphs_iterators_list)
        assert(len(graphs_iterators_list) == len(weights)), 'ERROR: weights size is different than iterators size.'
        assert(len(filter(lambda x: x < 0, weights)) == 0), 'ERROR: weight list contains negative values.'
        for i, graphs in enumerate(graphs_iterators_list):
            if len(self.vectorizers) == 0:
                data_matrix_curr = self.vectorizer.transform(graphs)
            else:
                data_matrix_curr = self.vectorizers[i].transform(graphs)
            if i == 0:
                data_matrix = data_matrix_curr * weights[i]
            else:
                data_matrix = data_matrix + data_matrix_curr * weights[i]
        return data_matrix

    def similarity(self, graphs_iterators_list, ref_instance=None, weights=list()):
        """
        Yield, for each tuple of graphs drawn from the iterators, its
        similarity to the reference instance. This is a generator.
        """
        self._reference_vec = self._convert_dict_to_sparse_matrix(
            self._transform(0, ref_instance))

        # if no weights are provided then assume unitary weight
        if len(weights) == 0:
            weights = [1] * len(graphs_iterators_list)
        assert(len(graphs_iterators_list) == len(weights)
               ), 'ERROR: weights count is different than iterators count.'
        assert(len(filter(lambda x: x < 0, weights)) ==
               0), 'ERROR: weight list contains negative values.'
        try:
            while True:
                graphs = [G_iterator.next() for G_iterator in graphs_iterators_list]
                yield self._similarity(graphs, weights)
        except StopIteration:
            return

    def _similarity(self, graphs, weights=list()):
        # extract feature vector
        for i, graph in enumerate(graphs):
            x_curr = self.vectorizer._convert_dict_to_sparse_matrix(
                self.vectorizer._transform(0, graph))
            if i == 0:
                x = x_curr * weights[i]
            else:
                x = x + x_curr * weights[i]
        res = self._reference_vec.dot(x.T).todense()
        prediction = res[0, 0]
        return prediction

    def predict(self, graphs_iterators_list, estimator=SGDClassifier(), weights=list()):
        """
        Output the estimator's prediction for each vectorized tuple of graphs.
        This is a generator.

        Arguments:

        estimator : scikit-learn predictor trained on data sampled
          from the same distribution.
        """
        self.estimator = estimator
        # if no weights are provided then assume unitary weight
        if len(weights) == 0:
            weights = [1] * len(graphs_iterators_list)
        assert(len(graphs_iterators_list) == len(weights)), 'ERROR: weights count is different than iterators count.'
        assert(len(filter(lambda x: x < 0, weights)) == 0), 'ERROR: weight list contains negative values.'
        try:
            while True:
                graphs = [G_iterator.next() for G_iterator in graphs_iterators_list]
                yield self._predict(graphs, weights)
        except StopIteration:
            return

    def _predict(self, graphs, weights=list()):
        # extract feature vector
        for i, graph in enumerate(graphs):
            x_curr = self.vectorizer._convert_dict_to_sparse_matrix(self.vectorizer._transform(0, graph))
            if i == 0:
                x = x_curr * weights[i]
            else:
                x = x + x_curr * weights[i]
        margins = self.estimator.decision_function(x)
        prediction = margins[0]
        return prediction
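
A minimal sketch of the weighted combination performed by ListVectorizer.transform(): each output row is the weighted sum of the per-list feature vectors. The toy graph construction and the weights below are illustrative assumptions (networkx 1.x-era API, EDeN-style 'label' attributes):

import networkx as nx

def labeled_path(n):
    # hypothetical helper: a path graph with node and edge labels
    g = nx.path_graph(n)
    for u in g.nodes():
        g.node[u]['label'] = 'A'
    for u, v in g.edges():
        g[u][v]['label'] = '1'
    return g

list_a = [labeled_path(4), labeled_path(5)]
list_b = [labeled_path(6), labeled_path(7)]

lv = ListVectorizer(complexity=2, nbits=16)
# row i is  0.7 * vec(list_a[i]) + 0.3 * vec(list_b[i])
data_matrix = lv.transform([iter(list_a), iter(list_b)], weights=[0.7, 0.3])
print(data_matrix.shape)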