class KNNWrapper(BaseEstimator, ClassifierMixin):
    """KNNWrapper."""

    def __init__(self, program=NearestNeighbors(n_neighbors=2)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        # parameters for the vectorizer are those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            self.graphs = list(graphs)
            # transform the materialized list, not the consumed iterator
            data_matrix = self.vectorizer.transform(self.graphs)
            self.program = self.program.fit(data_matrix)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = self.vectorizer.transform(graphs_)
            distances, indices = self.program.kneighbors(data_matrix)
            for knn_dists, knn_ids, graph in zip(distances, indices, graphs):
                neighbor_graphs = []
                for knn_id in knn_ids:
                    neighbor_graphs.append(self.graphs[knn_id])
                graph.graph['neighbors'] = neighbor_graphs
                graph.graph['ids'] = knn_ids
                graph.graph['distances'] = knn_dists
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
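# ---------------------------------------------------------------------------
# Illustrative usage sketch (added for this collection; not from the original
# sources). Assumes EDeN is installed, that KNNWrapper above is importable,
# and that eden.graph.Vectorizer exposes set_params as the wrappers expect.
# EDeN graphs are networkx graphs with a discrete 'label' on nodes and edges.
import networkx as nx

def _toy_graph(n, label):
    """Build a small labeled path graph in the format EDeN expects."""
    g = nx.path_graph(n)
    for u in g.nodes():
        g.nodes[u]['label'] = label
    for u, v in g.edges():
        g.edges[u, v]['label'] = '-'
    return g

train_graphs = [_toy_graph(4, 'C'), _toy_graph(5, 'N'), _toy_graph(6, 'C')]
knn = KNNWrapper().set_params(n_neighbors=2, vectorizer__complexity=2)
knn.fit(train_graphs)
# predict() yields the query graphs annotated with neighbors/ids/distances
for g in knn.predict([_toy_graph(4, 'C')]):
    print(g.graph['ids'], g.graph['distances'])
# ---------------------------------------------------------------------------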
def prep(graphlist, graphlist_id=0):
    """Map each graph's feature-vector hash to its (list id, position)."""
    if not graphlist:
        return {}
    v = Vectorizer()
    # drop 'weight' annotations before vectorizing; use a for loop instead of
    # map(), which is lazy in Python 3 and would never run the pop
    for g in graphlist:
        node_operation(g, lambda n, d: d.pop('weight', None))
    csr = v.transform(graphlist)

    def hash_function(vec):
        return hash(tuple(vec.data + vec.indices))

    return {hash_function(row): (graphlist_id, ith)
            for ith, row in enumerate(csr)}
class TransformerWrapper(BaseEstimator, ClassifierMixin):
    """TransformerWrapper."""

    def __init__(self, program=None):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        # parameters for the vectorizer are those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            self.program.fit(graphs)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def transform(self, graphs):
        """transform."""
        try:
            for graph in graphs:
                transformed_graph = self._transform(graph)
                yield transformed_graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def _transform(self, graph):
        return graph
def compute_NSPDK_features():
    import eden
    from eden.graph import Vectorizer
    from eden.converter.molecule.obabel import mol_file_to_iterable, obabel_to_eden

    mol_path = olfaction_prediction_path + '/data/sdf/'
    iter_mols = mol_file_to_iterable(mol_path + '/all_mol.sdf', 'sdf')
    iter_graphs = obabel_to_eden(iter_mols)
    vectorizer = Vectorizer(r=3, d=4)
    X = vectorizer.transform(iter_graphs)
    return X
class AnnotateImportance(BaseEstimator, ClassifierMixin):
    """Annotate minimal cycles."""

    def __init__(self, program=None, vertex_features=True, reweight=1.0):
        """Construct."""
        self.program = program
        self.vertex_features = vertex_features
        self.reweight = reweight
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this program.

        Returns
        -------
        self
        """
        # parameters for the vectorizer are those that contain "__"
        params_vectorizer = dict()
        params_program = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_program[param] = params[param]
        self.program.set_params(**params_program)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def transform(self, graphs):
        """Transform."""
        try:
            annotated_graphs = self.vectorizer.annotate(
                graphs,
                estimator=self.program,
                reweight=self.reweight,
                vertex_features=self.vertex_features)
            for graph in annotated_graphs:
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
class OrdererWrapper(BaseEstimator, ClassifierMixin):
    """Orderer."""

    def __init__(self, program=None):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        # parameters for the vectorizer are those that contain "__"
        params_vectorizer = dict()
        params_orderer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_orderer[param] = params[param]
        self.program.set_params(**params_orderer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def decision_function(self, graphs):
        """decision_function."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            scores = self.program.decision_function(data_matrix)
            return scores
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
class IsomorphicClusterer(BaseEstimator, ClusterMixin):
    """IsomorphicClusterer."""

    def __init__(self):
        """Construct."""
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        for param in params:
            self.__dict__[param] = params[param]
        return self

    def fit_predict(self, graphs):
        """fit_predict."""
        def vec_to_hash(vec):
            return hash(tuple(vec.data + vec.indices))

        try:
            for graph in graphs:
                prediction = vec_to_hash(self.vectorizer.transform([graph]))
                yield prediction
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
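# ---------------------------------------------------------------------------
# Illustrative sketch (added): since the cluster id is a hash of the feature
# vector, graphs with identical NSPDK features (e.g. exact duplicates) get
# the same id. Reuses the _toy_graph helper from the KNNWrapper sketch above.
clusterer = IsomorphicClusterer()
gs = [_toy_graph(4, 'C'), _toy_graph(4, 'C'), _toy_graph(5, 'N')]
ids = list(clusterer.fit_predict(gs))
print(ids[0] == ids[1], ids[0] == ids[2])  # expected: True False
# ---------------------------------------------------------------------------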
def set_params(self, r=3, d=8, nbits=16, discrete=True,
               balance=False, subsample_size=200, ratio=2,
               normalization=False, inner_normalization=False,
               penalty='elasticnet'):
    """setter."""
    self.r = r
    self.d = d
    self.nbits = nbits
    self.normalization = normalization
    self.inner_normalization = inner_normalization
    self.discrete = discrete
    self.balance = balance
    self.subsample_size = subsample_size
    self.ratio = ratio
    if penalty == 'perceptron':
        self.model = Perceptron(max_iter=5, tol=None)
    else:
        self.model = SGDClassifier(
            average=True, class_weight='balanced', shuffle=True,
            penalty=penalty, max_iter=5, tol=None)
    self.vectorizer = Vectorizer(
        r=self.r, d=self.d,
        normalization=self.normalization,
        inner_normalization=self.inner_normalization,
        discrete=self.discrete, nbits=self.nbits)
    return self
def __init__(self, program=SGDClassifier(average=True,
                                         class_weight='balanced',
                                         shuffle=True)):
    """Construct."""
    self.program = program
    self.vectorizer = Vectorizer()
def __init__(self, complexity=3, r=None, d=None, min_r=0, min_d=0, nbits=20,
             normalization=True, inner_normalization=True, n=1, min_n=2):
    """
    Arguments:

    complexity : int
        The complexity of the features extracted.

    r : int
        The maximal radius size.

    d : int
        The maximal distance size.

    min_r : int
        The minimal radius size.

    min_d : int
        The minimal distance size.

    nbits : int
        The number of bits that defines the feature space size:
        |feature space|=2^nbits.

    normalization : bool
        If set the resulting feature vector will have unit euclidean norm.

    inner_normalization : bool
        If set the feature vector for a specific combination of the radius
        and distance size will have unit euclidean norm. When used together
        with the 'normalization' flag it will be applied first and then the
        resulting feature vector will be normalized.

    n : int
        The maximal number of clusters used to discretize label vectors.

    min_n : int
        The minimal number of clusters used to discretize label vectors.
    """
    self.vectorizer = Vectorizer(complexity=complexity, r=r, d=d,
                                 min_r=min_r, min_d=min_d, nbits=nbits,
                                 normalization=normalization,
                                 inner_normalization=inner_normalization,
                                 n=n, min_n=min_n)
    self.vectorizers = list()
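# ---------------------------------------------------------------------------
# Illustrative sketch (added): in EDeN, complexity is shorthand that sets
# both bounds at once (r=d=complexity when r and d are left as None), while
# passing r and d explicitly allows asymmetric radius/distance bounds. This
# mirrors the parameter documentation above; treat it as an assumption about
# the installed EDeN version.
from eden.graph import Vectorizer

vec_simple = Vectorizer(complexity=3, nbits=20)
vec_explicit = Vectorizer(r=3, d=5, min_r=0, min_d=0, nbits=20,
                          normalization=True, inner_normalization=True)
# ---------------------------------------------------------------------------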
class EdenRegressor(BaseEstimator, RegressorMixin):
    """Build a regressor for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 normalization=True, inner_normalization=True,
                 penalty='elasticnet', loss='squared_loss'):
        """construct."""
        self.set_params(r, d, nbits, discrete,
                        normalization, inner_normalization,
                        penalty, loss)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   normalization=True, inner_normalization=True,
                   penalty='elasticnet', loss='squared_loss'):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.model = SGDRegressor(
            loss=loss, penalty=penalty,
            average=True, shuffle=True,
            max_iter=5, tol=None)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    def fit(self, graphs, targets, randomize=True):
        """fit."""
        x = self.transform(graphs)
        self.model = self.model.fit(x, targets)
        return self

    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    def decision_function(self, graphs):
        """decision_function."""
        return self.predict(graphs)
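# ---------------------------------------------------------------------------
# Illustrative usage sketch (added): one float target per graph. Reuses the
# _toy_graph helper defined in the KNNWrapper sketch above.
graphs = [_toy_graph(n, 'C') for n in range(3, 9)]
targets = [float(n) for n in range(3, 9)]
reg = EdenRegressor(r=2, d=4).fit(graphs, targets)
print(reg.predict([_toy_graph(5, 'C')]))
# ---------------------------------------------------------------------------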
def __init__(self, program=None, relabel=False, reweight=1.0):
    """Construct."""
    self.program = program
    self.relabel = relabel
    self.reweight = reweight
    self.vectorizer = Vectorizer()
    self.params_vectorize = dict()
class OrdererWrapper(BaseEstimator, ClassifierMixin):
    """Orderer."""

    def __init__(self, program=None):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        # parameters for the vectorizer are those that contain "__"
        params_vectorizer = dict()
        params_orderer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_orderer[param] = params[param]
        self.program.set_params(**params_orderer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def decision_function(self, graphs):
        """decision_function."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = self.vectorizer.transform(graphs_)
            scores = self.program.decision_function(data_matrix)
            return scores
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
def vectorize(self, g):
    """Vectorize graph nodes.

    Return: a matrix in which rows are the vectors that represent the nodes
    """
    vec = Vectorizer(nbits=self.nbits,
                     discrete=self.discrete,
                     d=self.d,
                     r=self.r)
    M = vec.vertex_transform([g])[0]
    M_reduce = []
    for idx in range(self.n_nodes):
        # accumulate the rows of all L layers for this node
        # (renamed from 'vec' to avoid shadowing the Vectorizer above)
        vec_row = M[idx, :]
        for l in range(1, self.L):
            vec_row = vec_row + M[idx + l * self.n_nodes, :]
        M_reduce.append(vec_row)
    M = vstack(M_reduce)
    return M
def generate_negatives_and_evaluate(iterable=None,
                                    estimator=None,
                                    negative_shuffle_ratio=None,
                                    shuffle_order=None,
                                    vectorizer_complexity=None):
    vectorizer = Vectorizer(complexity=vectorizer_complexity)
    iterable, iterable_neg = binary_classification_dataset_setup(
        iterable_seq=iterable,
        negative_shuffle_ratio=negative_shuffle_ratio,
        shuffle_order=shuffle_order)
    roc, apr = estimate(iterable, iterable_neg, estimator, vectorizer,
                        n_jobs=-1)
    return roc, apr
def __init__(self,
             min_subarray_size=7,
             max_subarray_size=10,
             min_motif_count=1,
             min_cluster_size=1,
             training_size=None,
             negative_ratio=1,
             shuffle_order=2,
             n_iter_search=1,
             complexity=4,
             radius=None,
             distance=None,
             nbits=20,
             clustering_algorithm=None,
             n_jobs=4,
             n_blocks=8,
             block_size=None,
             pre_processor_n_jobs=4,
             pre_processor_n_blocks=8,
             pre_processor_block_size=None,
             random_state=1):
    self.n_jobs = n_jobs
    self.n_blocks = n_blocks
    self.block_size = block_size
    self.pre_processor_n_jobs = pre_processor_n_jobs
    self.pre_processor_n_blocks = pre_processor_n_blocks
    self.pre_processor_block_size = pre_processor_block_size
    self.training_size = training_size
    self.n_iter_search = n_iter_search
    self.complexity = complexity
    self.nbits = nbits
    # init vectorizer
    self.vectorizer = Vectorizer(complexity=self.complexity,
                                 r=radius, d=distance,
                                 nbits=self.nbits)
    self.seq_vectorizer = SeqVectorizer(complexity=self.complexity,
                                        r=radius, d=distance,
                                        nbits=self.nbits)
    self.negative_ratio = negative_ratio
    self.shuffle_order = shuffle_order
    self.clustering_algorithm = clustering_algorithm
    self.min_subarray_size = min_subarray_size
    self.max_subarray_size = max_subarray_size
    self.min_motif_count = min_motif_count
    self.min_cluster_size = min_cluster_size
    self.random_state = random_state
    random.seed(random_state)
    self.motives_db = defaultdict(list)
    self.motives = []
    self.clusters = defaultdict(list)
    self.cluster_models = []
    self.importances = []
def __init__(self,
             transformer=None,
             vectorizer=Vectorizer(complexity=4, nbits=13),
             clustering_algo=DBSCAN(),
             distance_std_factor=2,
             min_cluster_size=2,
             random_state=1):
    """Cluster sequences according to regions of interest and structural folding.

    Parameters
    ----------
    transformer : initialized PreProcessor object
        Transforms sequences to graphs that encode secondary structure
        information and weights nucleotides according to a user defined
        list of intervals.

    vectorizer : initialized Vectorizer object
        Transforms graphs to sparse vectors.

    clustering_algo : scikit-learn clustering algorithm
        Clusters sparse vectors in a finite number of classes.

    distance_std_factor : int (default 2)
        How many standard deviations less than the mean pairwise distance
        is the maximal distance required to join an instance in a cluster.

    min_cluster_size : int (default 2)
        Minimal size of any cluster.

    random_state : int (default 1)
        Random seed.

    Attributes
    ----------
    predictions : list(int)
        List of cluster ids, one per instance.

    clusters : defaultdict(list)
        Dictionary with cluster id as key and list of sequences as value.

    data_matrix : Scipy sparse matrix (Compressed Sparse Row matrix)
        List of sparse vectors resulting from the transformation of
        sequences into structures.
    """
    self.name = self.__class__.__name__
    self.transformer = transformer
    self.vectorizer = vectorizer
    self.clustering_algo = clustering_algo
    self.distance_std_factor = distance_std_factor
    self.min_cluster_size = min_cluster_size
    self.clusters = defaultdict(list)
    self.predictions = list()
    self.data_matrix = None
    self.random_state = random_state
    random.seed(self.random_state)
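# ---------------------------------------------------------------------------
# Illustrative sketch (added): wiring the constructor documented above. The
# enclosing class name is not shown in this snippet, so `StructCluster` is a
# placeholder, and `my_pre_processor` stands for an initialized PreProcessor.
from sklearn.cluster import DBSCAN

clust = StructCluster(transformer=my_pre_processor,
                      vectorizer=Vectorizer(complexity=4, nbits=13),
                      clustering_algo=DBSCAN(eps=0.3),
                      distance_std_factor=2,
                      min_cluster_size=3)
# ---------------------------------------------------------------------------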
class Annotator():

    def __init__(self, multiprocess=True, score_attribute='importance'):
        self.score_attribute = score_attribute
        self.vectorizer = Vectorizer()
        self.multi_process = multiprocess
        self.trained = False

    def fit(self, graphs_pos, graphs_neg=[]):
        if self.trained:
            return self
        self.trained = True
        # strip previous annotations; for loops instead of map(), which is
        # lazy in Python 3 and would never apply the side effects
        for graph in graphs_pos + graphs_neg:
            utils.remove_eden_annotation(graph)
            utils.node_operation(graph, lambda n, d: d.pop('importance', None))
            graph.graph.pop('mass_annotate_mp_was_here', None)
        if graphs_neg:
            # with negatives available, train a binary estimator
            self.estimator = SGDClassifier()
            classes = [1] * len(graphs_pos) + [-1] * len(graphs_neg)
            self.estimator.fit(
                self.vectorizer.transform(graphs_pos + graphs_neg), classes)
        else:
            self.estimator = ExperimentalOneClassEstimator()
            self.estimator.fit(self.vectorizer.transform(graphs_pos))
        return self

    def fit_transform(self, graphs_p, graphs_n=[]):
        self.fit(graphs_p, graphs_n)
        return self.transform(graphs_p), self.transform(graphs_n)

    def transform(self, graphs):
        return self.annotate(graphs)

    def annotate(self, graphs, neg=False):
        if not graphs:
            return []
        return mass_annotate_mp(graphs, self.vectorizer,
                                score_attribute=self.score_attribute,
                                estimator=self.estimator,
                                multi_process=self.multi_process,
                                invert_score=neg)
class ClustererWrapper(BaseEstimator, ClusterMixin):
    """Clusterer."""

    def __init__(self, program=None):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        # parameters for the vectorizer are those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit_predict(self, graphs):
        """fit_predict."""
        try:
            data_matrix = self.vectorizer.transform(graphs)
            predictions = self.program.fit_predict(data_matrix)
            return predictions
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
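# ---------------------------------------------------------------------------
# Illustrative usage sketch (added): any scikit-learn clusterer can be
# plugged in as `program`; parameters prefixed with vectorizer__ are routed
# to the Vectorizer, the rest to the clusterer. `graphs` is the toy list
# from the EdenRegressor sketch above.
from sklearn.cluster import KMeans

cw = ClustererWrapper(program=KMeans(n_clusters=2))
cw.set_params(n_clusters=3, vectorizer__complexity=2)
labels = cw.fit_predict(graphs)
# ---------------------------------------------------------------------------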
def setup(self, known_graphs=None, candidate_graphs=None):
    """Setup."""
    # compute the nearest neighbors for the 'proposal_graphs' w.r.t. the
    # known graphs in the list 'known_graphs'
    parameters_priors = dict(n_neighbors=self.n_neighbors)
    parameters_priors.update(dict(vectorizer__complexity=self.complexity,
                                  vectorizer__discrete=True))
    fit_wrapped_knn_predictor_known = \
        model(known_graphs,
              program=KNNWrapper(program=NearestNeighbors()),
              parameters_priors=parameters_priors)
    # compute distances of candidate_graphs to known_graphs
    knn_candidate_graphs = predict(candidate_graphs,
                                   program=fit_wrapped_knn_predictor_known)
    knn_candidate_graphs = list(knn_candidate_graphs)
    self.distances_to_known_graphs = []
    for knn_candidate_graph in knn_candidate_graphs:
        distances = knn_candidate_graph.graph['distances']
        self.distances_to_known_graphs.append(distances)
    # compute candidate_graphs encodings
    vec = Vectorizer(complexity=self.complexity)
    self.candidate_graphs_data_matrix = vec.transform(candidate_graphs)
def clusterGraphs(graphs, r, d, copt):
    opts = copt[1:-1]
    optl = opts.split(",")
    opt = int(optl[0])
    vectorizer = Vectorizer(r=r, d=d)
    samples = len(graphs)
    minlclu = 5
    Xsp = vectorizer.transform(graphs)  # sparse feature matrix
    X = Xsp.todense()  # regular feature matrix
    # SM = metrics.pairwise.pairwise_kernels(Xsp, metric='rbf', gamma=1)  # similarity matrix
    SM = metrics.pairwise.pairwise_kernels(Xsp, metric='linear')
    DM = []  # distance matrix
    for i in range(len(SM)):
        DM.append([])
        for j in range(len(SM[i])):
            val = 1.0 - SM[i][j]
            if val < 0:
                DM[i].append(0.0)
            else:
                DM[i].append(val)
    if opt == 0:
        nc, labels = MShift(X)
    if opt == 1:
        # print(DM)
        minlclu = int(optl[2])
        nc, labels = DB_SCAN(DM, float(optl[1]), int(optl[2]))
    if opt == 2:
        nc, labels = AffProp(SM)
    if opt == 3:
        print(SM)
        # Matrix(X)
        return 0, []
    if opt == 4:
        nc, labels = K_Means(X)
    if opt == 5:
        nc, labels = SpecClus(SM)
    if opt == 6:
        nc, labels = dclust(DM, int(optl[1]), int(optl[2]), float(optl[3]))
    return nc, labels, minlclu
def __init__(self, min_count=2, max_n_neighbors=100, r=3, d=3):
    """init."""
    self.vec = Vectorizer(r=r, d=d,
                          normalization=False,
                          inner_normalization=False)
    self.grammar = GrammarWrapper(radius_list=[1, 2, 3],
                                  thickness_list=[2],
                                  min_cip_count=min_count,
                                  min_interface_count=min_count,
                                  max_n_neighbors=max_n_neighbors,
                                  n_neigh_steps=1,
                                  max_neighborhood_size=max_n_neighbors)
def compare(finalL, L, peaks, opt, th, alpha):
    n = len(L)
    lpeaks = {}
    for key in L:
        lpeaks[key] = peaks[key]
    for key in finalL:
        lpeaks[key] = peaks[key]
    # name the id map explicitly instead of shadowing the dict builtin
    graphs, graph_ids = peaksToGraphs(lpeaks, opt, alpha)
    vectorizer = Vectorizer(r=2, d=3)
    samples = len(graphs)
    Xsp = vectorizer.transform(graphs)  # sparse feature matrix
    X = Xsp.todense()  # regular feature matrix
    SM = metrics.pairwise.pairwise_kernels(Xsp, metric='rbf', gamma=1)  # similarity matrix
    DM = []  # distance matrix
    for i in range(len(SM)):
        DM.append([])
        for j in range(len(SM[i])):
            val = 1.0 - SM[i][j]
            if val < 0:
                DM[i].append(0.0)
            else:
                DM[i].append(val)
    avgDM = 0.0
    counts = 0.0
    for i in range(len(graphs)):
        if graph_ids[i] in L:
            for j in range(len(graphs)):
                if i != j and graph_ids[j] in finalL:
                    avgDM += DM[i][j]
                    counts += 1
    avgDM = avgDM / counts
    if 0.0 <= avgDM <= th:
        return 0
    else:
        return 1
def _vectorize_graphs(self, graphs):
    """Vectorize the RNAplfold graphs using EDeN."""
    if self.verbose:
        print("Vectorizing (complexity: %i, hashing: %i bits)..." %
              (self.complexity, self.nbits), end=' ')
        sys.stdout.flush()
    vec = Vectorizer(complexity=self.complexity, nbits=self.nbits)
    x_sparse = eden_vectorize(graphs, vectorizer=vec, n_jobs=self.njobs)
    if self.verbose:
        print("Done.\n")
        sys.stdout.flush()
    return x_sparse.todense()
def __init__(self, radius_list=None, thickness_list=None, min_cip_count=3,
             vectorizer=Vectorizer(complexity=3), min_interface_count=2,
             nbit=20, node_entity_check=lambda x, y: True):
    self.productions = {}
    self.min_interface_count = min_interface_count
    self.radius_list = radius_list
    self.thickness_list = thickness_list
    self.min_cip_count = min_cip_count
    self.vectorizer = vectorizer
    self.hash_bitmask = 2 ** nbit - 1
    self.nbit = nbit
    # checked when extracting grammar. see graphtools
    self.node_entity_check = node_entity_check
    self.prep_is_outdated = True
def __init__(self, radius_list=[0, 1], thickness_list=[1, 2],
             grammar=None,
             core_interface_pair_remove_threshold=2,
             interface_remove_threshold=2,
             complexity=3,
             vectorizer=Vectorizer(complexity=3),
             estimator=estimator_wrapper.estimator_wrapper()):
    self.complexity = complexity
    self.feasibility_checker = FeasibilityChecker()
    self.postprocessor = processing.PostProcessor()
    self.vectorizer = vectorizer
    # lists of int
    self.radius_list = [int(2 * r) for r in radius_list]
    self.thickness_list = [int(2 * t) for t in thickness_list]
    # scikit classifier
    self.estimatorobject = estimator
    # grammar object
    self.local_substitutable_graph_grammar = grammar
    # cip hashes will be masked with this; this is unrelated to the vectorizer
    self.hash_bitmask = pow(2, 20) - 1
    # we will save the current graph at every interval-th step of sampling
    # and attach it to graphinfos[graphs]
    self.sampling_interval = None
    # how many sampling steps are done
    self.n_steps = None
    # current step in the sampling process of a single graph
    self.step = None
    # how often we try to get a cip from the current graph while sampling
    self.select_cip_max_tries = None
    # sample path
    self.sample_path = None
    self.local_substitutable_graph_grammar = LocalSubstitutableGraphGrammar(
        self.radius_list,
        self.thickness_list,
        complexity=self.complexity,
        cip_remove_threshold=core_interface_pair_remove_threshold,
        interface_remove_threshold=interface_remove_threshold,
        nbit=20)
class SequenceMotif(object):

    def __init__(self,
                 min_subarray_size=7,
                 max_subarray_size=10,
                 min_motif_count=1,
                 min_cluster_size=1,
                 training_size=None,
                 negative_ratio=2,
                 shuffle_order=2,
                 n_iter_search=1,
                 complexity=4,
                 nbits=20,
                 clustering_algorithm=None,
                 n_jobs=4,
                 n_blocks=8,
                 block_size=None,
                 pre_processor_n_jobs=4,
                 pre_processor_n_blocks=8,
                 pre_processor_block_size=None,
                 random_state=1):
        self.n_jobs = n_jobs
        self.n_blocks = n_blocks
        self.block_size = block_size
        self.pre_processor_n_jobs = pre_processor_n_jobs
        self.pre_processor_n_blocks = pre_processor_n_blocks
        self.pre_processor_block_size = pre_processor_block_size
        self.training_size = training_size
        self.n_iter_search = n_iter_search
        self.complexity = complexity
        self.nbits = nbits
        # init vectorizer
        self.vectorizer = Vectorizer(complexity=self.complexity,
                                     nbits=self.nbits)
        self.seq_vectorizer = PathVectorizer(complexity=self.complexity,
                                             nbits=self.nbits)
        self.negative_ratio = negative_ratio
        self.shuffle_order = shuffle_order
        self.clustering_algorithm = clustering_algorithm
        self.min_subarray_size = min_subarray_size
        self.max_subarray_size = max_subarray_size
        self.min_motif_count = min_motif_count
        self.min_cluster_size = min_cluster_size
        self.random_state = random_state
        random.seed(random_state)
        self.motives_db = defaultdict(list)
        self.motives = []
        self.clusters = defaultdict(list)
        self.cluster_models = []

    def save(self, model_name):
        self.clustering_algorithm = None  # NOTE: some algorithms cannot be pickled
        joblib.dump(self, model_name, compress=1)

    def load(self, obj):
        self.__dict__.update(joblib.load(obj).__dict__)
        self._build_cluster_models()

    def fit(self, seqs, neg_seqs=None):
        """Build a discriminative estimator.

        Identifies the maximal subarrays in the data. Clusters them with the
        clustering algorithm provided in the initialization phase. For each
        cluster builds a fast sequence search model (Aho-Corasick data
        structure).
        """
        start = time()
        if self.training_size is None:
            training_seqs = seqs
        else:
            training_seqs = random.sample(seqs, self.training_size)
        self._fit_predictive_model(training_seqs, neg_seqs=neg_seqs)
        end = time()
        logger.info('model induction: %d positive instances %d s' %
                    (len(training_seqs), (end - start)))
        start = time()
        self.motives = self._motif_finder(seqs)
        end = time()
        logger.info('motives extraction: %d motives in %ds' %
                    (len(self.motives), end - start))
        start = time()
        self._cluster(self.motives,
                      clustering_algorithm=self.clustering_algorithm)
        end = time()
        logger.info('motives clustering: %d clusters in %ds' %
                    (len(self.clusters), end - start))
        start = time()
        self._filter()
        end = time()
        n_motives = sum(len(self.motives_db[cid]) for cid in self.motives_db)
        n_clusters = len(self.motives_db)
        logger.info('after filtering: %d motives %d clusters in %ds' %
                    (n_motives, n_clusters, (end - start)))
        start = time()
        # create models
        self._build_cluster_models()
        end = time()
        logger.info('motif model construction in %ds' % (end - start))
        start = time()
        # update motives counts
        self._update_counts(seqs)
        end = time()
        logger.info('updated motif counts in %ds' % (end - start))

    def info(self):
        text = []
        for cluster_id in self.motives_db:
            num_hits = len(self.cluster_hits[cluster_id])
            frac_num_hits = num_hits / float(self.dataset_size)
            text.append('Cluster: %s #%d (%.3f)' %
                        (cluster_id, num_hits, frac_num_hits))
            for count, motif in sorted(self.motives_db[cluster_id],
                                       reverse=True):
                text.append('%s #%d' % (motif, count))
            text.append('')
        return text

    def _update_counts(self, seqs):
        self.dataset_size = len(seqs)
        cluster_hits = defaultdict(set)
        motives_db = defaultdict(list)
        for cluster_id in self.motives_db:
            motives = [motif for count, motif in self.motives_db[cluster_id]]
            motif_dict = {}
            for motif in motives:
                counter = 0
                for header, seq in seqs:
                    if motif in seq:
                        counter += 1
                        cluster_hits[cluster_id].add(header)
                motif_dict[motif] = counter
            # remove implied motives
            motif_dict_copy = motif_dict.copy()
            for motif_i in motif_dict:
                for motif_j in motif_dict:
                    if motif_dict[motif_i] == motif_dict[motif_j] and \
                            len(motif_j) < len(motif_i) and \
                            motif_j in motif_i:
                        if motif_j in motif_dict_copy:
                            motif_dict_copy.pop(motif_j)
            for motif in motif_dict_copy:
                motives_db[cluster_id].append((motif_dict[motif], motif))
        self.motives_db = motives_db
        self.cluster_hits = cluster_hits

    def fit_predict(self, seqs, return_list=False):
        self.fit(seqs)
        for prediction in self.predict(seqs, return_list=return_list):
            yield prediction

    def fit_transform(self, seqs, return_match=False):
        self.fit(seqs)
        for prediction in self.transform(seqs, return_match=return_match):
            yield prediction

    def predict(self, seqs, return_list=False):
        """Return for each instance the list of cluster ids that have a hit.

        If return_list=False then return the number of clusters with at
        least one hit (0 if none).
        """
        for header, seq in seqs:
            cluster_hits = []
            for cluster_id in self.motives_db:
                hits = list(self._cluster_hit(seq, cluster_id))
                if len(hits):
                    begin, end = min(hits)
                    cluster_hits.append((begin, cluster_id))
            if return_list is False:
                if len(cluster_hits):
                    yield len(cluster_hits)
                else:
                    yield 0
            else:
                yield [cluster_id for pos, cluster_id in sorted(cluster_hits)]

    def transform(self, seqs, return_match=False):
        """Transform each instance into a dense vector.

        Features are cluster ids with 0/1 entries when a motif is found;
        if 'return_match' is True, the entry is instead the list of
        (start position, end position) pairs.
        """
        num = len(self.motives_db)
        for header, seq in seqs:
            cluster_hits = [0] * num
            for cluster_id in self.motives_db:
                hits = self._cluster_hit(seq, cluster_id)
                hits = list(hits)
                if return_match is False:
                    if len(hits):
                        cluster_hits[cluster_id] = 1
                else:
                    cluster_hits[cluster_id] = hits
            yield cluster_hits

    def _serial_graph_motif(self, seqs, placeholder=None):
        # make graphs
        iterable = sequence_to_eden(seqs)
        # use node importance and 'position' attribute to identify
        # max_subarrays of a specific size
        graphs = self.vectorizer.annotate(iterable, estimator=self.estimator)
        # use compute_max_subarrays to return an iterator over motives
        motives = []
        for graph in graphs:
            subarrays = compute_max_subarrays(
                graph=graph,
                min_subarray_size=self.min_subarray_size,
                max_subarray_size=self.max_subarray_size)
            if subarrays:
                for subarray in subarrays:
                    motives.append(subarray['subarray_string'])
        return motives

    def _multiprocess_graph_motif(self, seqs):
        size = len(seqs)
        intervals = compute_intervals(size=size,
                                      n_blocks=self.n_blocks,
                                      block_size=self.block_size)
        if self.n_jobs == -1:
            pool = mp.Pool()
        else:
            pool = mp.Pool(processes=self.n_jobs)
        results = [apply_async(pool, self._serial_graph_motif,
                               args=(seqs[start:end], True))
                   for start, end in intervals]
        output = [p.get() for p in results]
        return list(chain(*output))

    def _motif_finder(self, seqs):
        if self.n_jobs > 1 or self.n_jobs == -1:
            return self._multiprocess_graph_motif(seqs)
        else:
            return self._serial_graph_motif(seqs)

    def _fit_predictive_model(self, seqs, neg_seqs=None):
        # duplicate iterator
        pos_seqs, pos_seqs_ = tee(seqs)
        pos_graphs = mp_pre_process(pos_seqs,
                                    pre_processor=sequence_to_eden,
                                    n_blocks=self.pre_processor_n_blocks,
                                    block_size=self.pre_processor_block_size,
                                    n_jobs=self.pre_processor_n_jobs)
        if neg_seqs is None:
            # shuffle seqs to obtain negatives
            neg_seqs = seq_to_seq(pos_seqs_,
                                  modifier=shuffle_modifier,
                                  times=self.negative_ratio,
                                  order=self.shuffle_order)
        neg_graphs = mp_pre_process(neg_seqs,
                                    pre_processor=sequence_to_eden,
                                    n_blocks=self.pre_processor_n_blocks,
                                    block_size=self.pre_processor_block_size,
                                    n_jobs=self.pre_processor_n_jobs)
        # fit discriminative estimator
        self.estimator = fit(pos_graphs, neg_graphs,
                             vectorizer=self.vectorizer,
                             n_iter_search=self.n_iter_search,
                             n_jobs=self.n_jobs,
                             n_blocks=self.n_blocks,
                             block_size=self.block_size,
                             random_state=self.random_state)

    def _cluster(self, seqs, clustering_algorithm=None):
        data_matrix = vectorize(seqs,
                                vectorizer=self.seq_vectorizer,
                                n_blocks=self.n_blocks,
                                block_size=self.block_size,
                                n_jobs=self.n_jobs)
        predictions = clustering_algorithm.fit_predict(data_matrix)
        # collect instance ids per cluster id
        for i in range(len(predictions)):
            self.clusters[predictions[i]] += [i]

    def _filter(self):
        # transform self.clusters, which contains only the ids of the
        # motives, into clustered_motives with the actual sequences
        new_sequential_cluster_id = -1
        clustered_motives = defaultdict(list)
        for cluster_id in self.clusters:
            if cluster_id != -1:
                if len(self.clusters[cluster_id]) >= self.min_cluster_size:
                    new_sequential_cluster_id += 1
                    for motif_id in self.clusters[cluster_id]:
                        clustered_motives[new_sequential_cluster_id].append(
                            self.motives[motif_id])
        motives_db = defaultdict(list)
        # extract motif count within a cluster
        for cluster_id in clustered_motives:
            # consider only non identical motives
            motif_set = set(clustered_motives[cluster_id])
            for motif_i in motif_set:
                # count occurrences of each motif in cluster
                count = 0
                for motif_j in clustered_motives[cluster_id]:
                    if motif_i == motif_j:
                        count += 1
                # keep motives whose counts are above the threshold
                if count >= self.min_motif_count:
                    motives_db[cluster_id].append((count, motif_i))
        # transform cluster ids to incremental ids
        incremental_id = 0
        for cluster_id in motives_db:
            if len(motives_db[cluster_id]) >= self.min_cluster_size:
                self.motives_db[incremental_id] = motives_db[cluster_id]
                incremental_id += 1

    def _build_cluster_models(self):
        self.cluster_models = []
        for cluster_id in self.motives_db:
            motives = [motif for count, motif in self.motives_db[cluster_id]]
            cluster_model = esm.Index()
            for motif in motives:
                cluster_model.enter(motif)
            cluster_model.fix()
            self.cluster_models.append(cluster_model)

    def _cluster_hit(self, seq, cluster_id):
        for ((start, end), motif) in self.cluster_models[cluster_id].query(seq):
            yield (start, end)
matplotlib.use('Agg')

from eden.converter.graph.gspan import gspan_to_eden
from graphlearn.graphlearn import GraphLearnSampler
from eden.graph import Vectorizer
import matplotlib.pyplot as plt
import itertools
from graphlearn.utils import myeden
from eden.util import fit_estimator as eden_fit_estimator
from eden.util import selection_iterator as picker
from sklearn.linear_model import SGDClassifier
import random

# a vectorizer
vectorizer = Vectorizer(complexity=3)


# select 1st element in an iterator
def unpack(graphs):
    for graphlist in graphs:
        yield graphlist[0]


def make_estimator(pos, neg):
    pos = vectorizer.transform(pos)
    neg = vectorizer.transform(neg)
    esti = eden_fit_estimator(SGDClassifier(),
                              positive_data_matrix=pos,
                              negative_data_matrix=neg)
    return esti
class IdealGraphEstimator(object):
    """Build an estimator for graphs."""

    def __init__(self, min_count=2, max_n_neighbors=100, r=3, d=3,
                 n_neighbors=10, max_num_solutions=30):
        """construct."""
        self.min_count = min_count
        self.max_n_neighbors = max_n_neighbors
        self.max_num_solutions = max_num_solutions
        self.r = r
        self.d = d
        self.n_neighbors = n_neighbors
        self.clf = Perceptron(n_iter=500)
        self.vec = Vectorizer(r=r, d=d,
                              normalization=True,
                              inner_normalization=True,
                              nbits=16)
        self.gs = [.05, .1, .2, .4, .6, .8, 1, 2, 4, 6]

    def fit(self, pos_graphs, neg_graphs):
        """fit."""
        ref_graphs = self.construct(pos_graphs, neg_graphs)
        logger.debug('Working on %d constructed graphs' % len(ref_graphs))
        y = [1] * len(pos_graphs) + [-1] * len(neg_graphs)
        x = self.vec.transform(pos_graphs + neg_graphs)
        z = self.vec.transform(ref_graphs)
        n_features = z.shape[0]
        k = np.hstack([pairwise_kernels(x, z, metric='rbf', gamma=g)
                       for g in self.gs])
        # integer division so RFECV receives an int step
        step = len(ref_graphs) // 2
        n_inst, n_feat = k.shape
        txt = 'RFECV on %d instances with %d features with step: %d' % \
            (n_inst, n_feat, step)
        logger.debug(txt)
        selector = RFECV(self.clf, step=step, cv=10)
        selector = selector.fit(k, y)
        ids = list(concat([range(n_features)] * len(self.gs)))
        gs_list = list(concat([[g] * n_features for g in self.gs]))
        feat = defaultdict(list)
        for g, i, s in zip(gs_list, ids, selector.support_):
            if s:
                feat[g].append(i)
        self.mats = dict()
        for g in sorted(feat):
            mat = vstack([z[i] for i in feat[g]])
            self.mats[g] = mat
        sel_ids = set([i for i, s in zip(ids, selector.support_) if s])
        self.ideal_graphs_ = [ref_graphs[i] for i in sel_ids]
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vec.transform(graphs)
        xtr = np.hstack([pairwise_kernels(x, self.mats[g],
                                          metric='rbf', gamma=g)
                         for g in sorted(self.mats)])
        return xtr

    def construct(self, pos_graphs, neg_graphs):
        """construct."""
        args = dict(min_count=self.min_count,
                    max_n_neighbors=self.max_n_neighbors,
                    r=self.r,
                    d=self.d,
                    n_landmarks=5,
                    n_neighbors=self.n_neighbors,
                    n_iter=20,
                    k_best=5,
                    max_num_solutions=self.max_num_solutions)
        self.active_constr = NearestNeighborsMeanOptimizer(
            improve=False, **args)
        self.active_constr.fit(pos_graphs, neg_graphs)
        graphs = pos_graphs + neg_graphs
        active_pareto_set_graphs = self.active_constr.optimize(graphs)
        self.pos_constr = NearestNeighborsMeanOptimizer(
            improve=True, **args)
        self.pos_constr.fit(pos_graphs, neg_graphs)
        pareto_set_graphs = self.pos_constr.optimize(graphs)
        sel_constructed_graphs = pareto_set_graphs + active_pareto_set_graphs
        return sel_constructed_graphs
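# ---------------------------------------------------------------------------
# Illustrative usage sketch (added): the estimator constructs reference
# graphs, keeps the ones RFECV selects from the multi-gamma RBF kernel
# columns, and transform() then yields those kernel features for a
# downstream linear model. pos_graphs/neg_graphs are assumed lists of
# labeled networkx graphs.
ideal = IdealGraphEstimator(min_count=2, n_neighbors=5, max_num_solutions=10)
ideal.fit(pos_graphs, neg_graphs)
features = ideal.transform(pos_graphs + neg_graphs)
# ---------------------------------------------------------------------------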
def vectorize(thing):
    v = Vectorizer()
    if not thing:
        raise Exception(
            "need something to vectorize.. received %s" % str(thing))
    # current eden does not eat generators anymore? weird
    thing = list(thing)
    return v.transform(thing)
class DiscSampler():
    '''
    '''

    def __init__(self):
        # this is mainly for the forest; the sampler uses a different
        # vectorizer
        self.vectorizer = Vectorizer(nbits=14)

    def get_heap_and_forest(self, griter, k):
        '''
        create the heap and the forest...
        heap is (dist to hyperplane, count, graph)
        and the forest is just a nearest neighbor from sklearn
        '''
        graphs = list(griter)
        graphs2 = copy.deepcopy(graphs)
        # transform does mess up the graph objects
        X = self.vectorizer.transform(graphs)
        forest = LSHForest()
        forest.fit(X)
        print('got forest')
        heap = []
        for vector, graph in zip(X, graphs2):
            graph2 = nx.Graph(graph)
            heapq.heappush(heap, (
                # score ~ dist from hyperplane
                self.sampler.estimator.predict_proba(
                    self.sampler.vectorizer.transform_single(graph2))[0][1],
                # high counter so we dont output the start graphs at the end
                k + 1,
                # at last the actual graph
                graph))
        print('got heap')
        distances, unused = forest.kneighbors(X, n_neighbors=2)
        # the second element should be the dist we want
        distances = [a[1] for a in distances]
        avg_dist = distances[len(distances) // 2]  # sum(distances)/len(distances)
        print('got dist')
        return heap, forest, avg_dist

    '''
    def sample_simple(self,graphiter,iterneg):
        graphiter,grait,griter2 = itertools.tee(graphiter,3)
        self.fit_sampler(graphiter,iterneg)
        a,b,c=self.get_heap_and_forest( griter2, 30)
        grait= itertools.islice(grait,5)
        rez=self.sampler.sample(grait,n_samples=5,
                                batch_size=1,
                                n_jobs=0,
                                n_steps=1,
                                select_cip_max_tries=100,
                                accept_annealing_factor=.5,
                                generatormode=False,
                                same_core_size=False )
        return rez
    '''

    def sample_graphs(self, graphiter, iter_neg, radius, how_many, check_k,
                      heap_chunk_size=10):
        # some initialisation:
        # create the sampler, set up heap and forest
        graphiter, iter2 = itertools.tee(graphiter)
        self.fit_sampler(iter2, iter_neg)
        heap, forest, avg_dist = self.get_heap_and_forest(graphiter, check_k)
        # heap should be like (hpdist, count, graph)
        radius = radius * avg_dist
        # so lets start the looping
        result = []
        while heap and len(result) < how_many:
            # pop all the graphs we want
            todo = []
            for i in range(heap_chunk_size):
                if heap:
                    todo.append(heapq.heappop(heap))
            # let the sampler do the sampling
            graphz = [e[2] for e in todo]
            # draw.draw_graph_set_graphlearn(graphz)
            work = self.sampler.sample(graphz,
                                       batch_size=1,
                                       n_jobs=0,
                                       n_steps=30,
                                       select_cip_max_tries=100,
                                       improving_threshold=.5,
                                       generatormode=False,
                                       max_core_size_diff=False,
                                       n_samples=3)
            # we need to take care of:
            # = the initially popped stuff:
            #   - increase and check the counter, reinsert into heap
            # = the new graphs:
            #   - put them in the heap and the forest
            for graph, task in zip(work, todo):
                graphlist = graph.graph['sampling_info']['graphs_history']
                print('rez:', graphlist, task)
                for graph2 in graphlist:
                    # check distance from created instances
                    x = self.vectorizer.transform_single(graph2)
                    dist, void = forest.kneighbors(x, 1)
                    dist = sum(dist)
                    # is the distance ok?
                    # if so, insert into forest and heap
                    if radius < dist < radius * 2:
                        forest.partial_fit(x)
                        heapq.heappush(heap,
                                       (graph2.graph['score'], 0, graph2))
                        print('heap')
                    else:
                        print('cant heap', radius, dist)
                # taking care of the task graph:
                # put it in the result list if necessary
                if task[1] < check_k < task[1] + len(graphlist):
                    result.append(task[2])
                    print('found sth')
                # go back to the heap!
                heapq.heappush(heap,
                               (task[0], task[1] + len(graphlist), task[2]))
        return result

    '''
    def simple_fit(self,iter_pos):
        self.sampler= GraphLearnSampler()
        self.sampler.fit(iter_pos)
        self.estimator=self.sampler.estimator
    '''

    def fit_sampler(self, iter_pos, iter_neg):
        # getting the sampler ready:
        self.sampler = MySampler(radius_list=[0, 1],
                                 thickness_list=[0.5, 1, 2])
        iter_pos, pos, pos_ = itertools.tee(iter_pos, 3)
        self.estimator = self.sampler.estimatorobject.fit_2(
            iter_pos, iter_neg, self.sampler.vectorizer)
        print('got estimator')
        self.sampler.local_substitutable_graph_grammar.fit(
            pos, grammar_n_jobs=-1, grammar_batch_size=8)
        self.sampler.estimator = self.estimator
        print('got grammar')
class ClassifierWrapper(BaseEstimator, ClassifierMixin):
    """Classifier."""

    def __init__(self, program=SGDClassifier(average=True,
                                             class_weight='balanced',
                                             shuffle=True)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        # parameters for the vectorizer are those that contain "__"
        params_vectorizer = dict()
        params_clusterer = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_clusterer[param] = params[param]
        self.program.set_params(**params_clusterer)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            y = self._extract_targets(graphs)
            # manage case for single class learning
            if len(set(y)) == 1:
                # make negative data matrix
                negative_data_matrix = data_matrix.multiply(-1)
                # make targets
                y = list(y)
                y_neg = [-1] * len(y)
                # concatenate elements
                data_matrix = vstack(
                    [data_matrix, negative_data_matrix], format="csr")
                y = y + y_neg
                y = np.ravel(y)
            self.program = self.program.fit(data_matrix, y)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            predictions = self.program.predict(data_matrix)
            scores = self.program.decision_function(data_matrix)
            for score, prediction, graph in zip(scores, predictions, graphs):
                graph.graph['prediction'] = prediction
                graph.graph['score'] = score
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def _extract_targets(self, graphs):
        y = []
        for graph in graphs:
            if graph.graph.get('target', None) is not None:
                y.append(graph.graph['target'])
            else:
                raise Exception('Missing the attribute "target" '
                                'in graph dictionary!')
        y = np.ravel(y)
        return y
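# ---------------------------------------------------------------------------
# Illustrative usage sketch (added): ClassifierWrapper reads the class label
# from graph.graph['target'], so it must be set before fit(). `graphs` is
# the toy list from the EdenRegressor sketch above.
for g, label in zip(graphs, [1, -1, 1, -1, 1, -1]):
    g.graph['target'] = label
clf = ClassifierWrapper().set_params(vectorizer__complexity=2)
clf = clf.fit(graphs)
for g in clf.predict(graphs):
    print(g.graph['prediction'], g.graph['score'])
# ---------------------------------------------------------------------------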
def vectorizer_init(self, args):
    vectorizer = Vectorizer()
    vectorizer_parameters = {'complexity': [2, 3, 4, 5, 6]}
    return vectorizer, vectorizer_parameters
exit()
print("*raw args")
print("*" * 80)
print(args)

# verbosity
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(), verbosity=args.pop('verbose'))

# handle Vectorizer:
from eden.graph import Vectorizer
args['vectorizer'] = Vectorizer(args.pop('vectorizer_complexity'))

# estimator: if the user is providing a negative graph set, we use
# the two-class estimator
import graphlearn01.estimate as estimate
if args['negative_input'] is None:
    args['estimator'] = estimate.OneClassEstimator(nu=.5, cv=2, n_jobs=-1)
else:
    args['estimator'] = estimate.TwoClassEstimator(cv=2, n_jobs=-1)

# args for fitting:
from eden.io.gspan import gspan_to_eden
from itertools import islice
fitargs = {k: args.pop(k)
           for k in ['lsgg_include_negatives',
                     'grammar_n_jobs',
                     'grammar_batch_size']}
class EdenEstimator(BaseEstimator, ClassifierMixin):
    """Build an estimator for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 balance=False, subsample_size=200, ratio=2,
                 normalization=False, inner_normalization=False,
                 penalty='elasticnet', n_iter=500):
        """construct."""
        self.set_params(r, d, nbits, discrete, balance, subsample_size,
                        ratio, normalization, inner_normalization,
                        penalty, n_iter)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   balance=False, subsample_size=200, ratio=2,
                   normalization=False, inner_normalization=False,
                   penalty='elasticnet', n_iter=500):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.balance = balance
        self.subsample_size = subsample_size
        self.ratio = ratio
        if penalty == 'perceptron':
            self.model = Perceptron(n_iter=n_iter)
        else:
            self.model = SGDClassifier(
                average=True, class_weight='balanced', shuffle=True,
                penalty=penalty)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    @timeit
    def fit(self, graphs, targets, randomize=True):
        """fit."""
        if self.balance:
            if randomize:
                bal_graphs, bal_targets = balance(
                    graphs, targets, None, ratio=self.ratio)
            else:
                samp_graphs, samp_targets = subsample(
                    graphs, targets, subsample_size=self.subsample_size)
                x = self.transform(samp_graphs)
                self.model.fit(x, samp_targets)
                bal_graphs, bal_targets = balance(
                    graphs, targets, self, ratio=self.ratio)
            size = len(bal_targets)
            logger.debug('Dataset size=%d' % (size))
            x = self.transform(bal_graphs)
            self.model = self.model.fit(x, bal_targets)
        else:
            x = self.transform(graphs)
            self.model = self.model.fit(x, targets)
        return self

    @timeit
    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    @timeit
    def decision_function(self, graphs):
        """decision_function."""
        x = self.transform(graphs)
        preds = self.model.decision_function(x)
        return preds

    @timeit
    def cross_val_score(self, graphs, targets, scoring='roc_auc', cv=5):
        """cross_val_score."""
        x = self.transform(graphs)
        scores = cross_val_score(
            self.model, x, targets, cv=cv, scoring=scoring)
        return scores

    @timeit
    def cross_val_predict(self, graphs, targets, cv=5):
        """cross_val_predict."""
        x = self.transform(graphs)
        scores = cross_val_predict(
            self.model, x, targets, cv=cv, method='decision_function')
        return scores

    @timeit
    def cluster(self, graphs, n_clusters=16):
        """cluster."""
        x = self.transform(graphs)
        clust_est = MiniBatchKMeans(n_clusters=n_clusters)
        cluster_ids = clust_est.fit_predict(x)
        return cluster_ids

    @timeit
    def model_selection(self, graphs, targets, n_iter=30,
                        subsample_size=None):
        """model_selection_randomized."""
        param_distr = {"r": list(range(1, 5)), "d": list(range(0, 10))}
        if subsample_size:
            graphs, targets = subsample(
                graphs, targets, subsample_size=subsample_size)
        pool = mp.Pool()
        scores = pool.map(_eval, [(graphs, targets, param_distr)] * n_iter)
        pool.close()
        pool.join()
        best_params = max(scores)[1]
        logger.debug("Best parameters:\n%s" % (best_params))
        self = EdenEstimator(**best_params)
        return self

    @timeit
    def learning_curve(self, graphs, targets, cv=5, n_steps=10,
                       start_fraction=0.1):
        """learning_curve."""
        graphs, targets = paired_shuffle(graphs, targets)
        x = self.transform(graphs)
        train_sizes = np.linspace(start_fraction, 1.0, n_steps)
        scoring = 'roc_auc'
        train_sizes, train_scores, test_scores = learning_curve(
            self.model, x, targets, cv=cv, train_sizes=train_sizes,
            scoring=scoring)
        return train_sizes, train_scores, test_scores

    def bias_variance_decomposition(self, graphs, targets, cv=5,
                                    n_bootstraps=10):
        """bias_variance_decomposition."""
        x = self.transform(graphs)
        score_list = []
        for i in range(n_bootstraps):
            scores = cross_val_score(self.model, x, targets, cv=cv)
            score_list.append(scores)
        score_list = np.array(score_list)
        mean_scores = np.mean(score_list, axis=1)
        std_scores = np.std(score_list, axis=1)
        return mean_scores, std_scores
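# ---------------------------------------------------------------------------
# Illustrative usage sketch (added): typical supervised flow with the
# estimator above; binary targets in {-1, 1}, graphs labeled as in the
# earlier sketches.
bin_targets = [1, -1, 1, -1, 1, -1]
est = EdenEstimator(r=2, d=5)
est = est.fit(graphs, bin_targets)
auc = est.cross_val_score(graphs, bin_targets, cv=3).mean()
# ---------------------------------------------------------------------------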
class Vectorizer(object):

    def __init__(self, complexity=None, nbits=20,
                 sequence_vectorizer_complexity=3,
                 graph_vectorizer_complexity=2,
                 n_neighbors=5, sampling_prob=.5, n_iter=5,
                 min_energy=-5, random_state=1):
        random.seed(random_state)
        # a single `complexity` value overrides both vectorizer complexities
        if complexity is not None:
            sequence_vectorizer_complexity = complexity
            graph_vectorizer_complexity = complexity
        self.sequence_vectorizer = SeqVectorizer(
            complexity=sequence_vectorizer_complexity,
            nbits=nbits,
            normalization=False,
            inner_normalization=False)
        self.graph_vectorizer = GraphVectorizer(
            complexity=graph_vectorizer_complexity, nbits=nbits)
        self.n_neighbors = n_neighbors
        self.sampling_prob = sampling_prob
        self.n_iter = n_iter
        self.min_energy = min_energy
        self.nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, seqs):
        # store seqs
        self.seqs = list(normalize_seqs(seqs))
        data_matrix = self.sequence_vectorizer.transform(self.seqs)
        # fit nearest_neighbors model
        self.nearest_neighbors.fit(data_matrix)
        return self

    def fit_transform(self, seqs, sampling_prob=None, n_iter=None):
        seqs, seqs_ = tee(seqs)
        return self.fit(seqs_).transform(seqs,
                                         sampling_prob=sampling_prob,
                                         n_iter=n_iter)

    def transform(self, seqs, sampling_prob=None, n_iter=None):
        seqs = list(normalize_seqs(seqs))
        # forward the sampling parameters to graphs()
        graphs_ = self.graphs(seqs,
                              sampling_prob=sampling_prob,
                              n_iter=n_iter)
        data_matrix = self.graph_vectorizer.transform(graphs_)
        return data_matrix

    def graphs(self, seqs, sampling_prob=None, n_iter=None):
        seqs = list(normalize_seqs(seqs))
        if n_iter is not None:
            self.n_iter = n_iter
        if sampling_prob is not None:
            self.sampling_prob = sampling_prob
        for seq, neighs in self._compute_neighbors(seqs):
            if self.n_iter > 1:
                header, sequence, struct, energy = self._optimize_struct(
                    seq, neighs)
            else:
                header, sequence, struct, energy = \
                    self._align_sequence_structure(seq, neighs)
            graph = self._seq_to_eden(header, sequence, struct, energy)
            yield graph

    def _optimize_struct(self, seq, neighs):
        structs = []
        results = []
        for i in range(self.n_iter):
            new_neighs = self._sample_neighbors(neighs)
            header, sequence, struct, energy = \
                self._align_sequence_structure(seq, new_neighs)
            results.append((header, sequence, struct, energy))
            structs.append(struct)
        instance_id = self._most_representative(structs)
        selected = results[instance_id]
        return selected

    def _most_representative(self, structs):
        # compute kernel matrix with sequence_vectorizer
        data_matrix = self.sequence_vectorizer.transform(structs)
        kernel_matrix = pairwise_kernels(data_matrix, metric='rbf', gamma=1)
        # compute instance density as average pairwise similarity
        density = np.sum(kernel_matrix, 0) / data_matrix.shape[0]
        # select the instance with maximal density
        max_id = np.argmax(density)
        return max_id

    def _sample_neighbors(self, neighs):
        out_neighs = []
        # insert one element at random
        out_neighs.append(random.choice(neighs))
        # include each neighbor independently with probability sampling_prob
        for neigh in neighs:
            if random.random() < self.sampling_prob:
                out_neighs.append(neigh)
        return out_neighs

    def _align_sequence_structure(self, seq, neighs,
                                  structure_deletions=False):
        header = seq[0]
        if len(neighs) < 1:
            # no neighbors: fall back to single-sequence folding
            clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1])
            energy = 0
            logger.debug('Warning: no alignment for: %s' % seq)
        else:
            str_out = convert_seq_to_fasta_str(seq)
            for neigh in neighs:
                str_out += convert_seq_to_fasta_str(neigh)
            # align the sequence with its neighbors using muscle
            cmd = 'echo "%s" | muscle -clwstrict -quiet' % (str_out)
            out = sp.check_output(cmd, shell=True)
            seed = extract_aligned_seed(header, out)
            # fold the alignment with RNAalifold
            cmd = 'echo "%s" | RNAalifold --noPS 2>/dev/null' % (out)
            out = sp.check_output(cmd, shell=True)
            struct, energy = extract_struct_energy(out)
            if energy > self.min_energy:
                # use min free energy structure
                clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1])
            else:
                clean_seq, clean_struct = make_seq_struct(seed, struct)
        if structure_deletions:
            clean_struct = self._clean_structure(clean_seq, clean_struct)
        return header, clean_seq, clean_struct, energy

    def _clean_structure(self, seq, stru):
        '''
        Parameters
        ----------
        seq : basestring
            rna sequence
        stru : basestring
            dotbracket string

        Returns
        -------
        The given structure may not respect deletions in the sequence;
        we transform it into one that does.
        '''
        # find deletions in the sequence
        ids = []
        for i, c in enumerate(seq):
            if c == '-':
                ids.append(i)
        # remove brackets that no longer have a partner
        stru = list(stru)
        pairdict = self._pairs(stru)
        for i in ids:
            # guard against unpaired deletion positions
            if i in pairdict:
                stru[pairdict[i]] = '.'
        # delete deletions in structure
        ids.reverse()
        for i in ids:
            del stru[i]
        stru = ''.join(stru)
        # remove obvious mistakes left behind by the deletions
        stru = stru.replace("(())", "....")
        stru = stru.replace("(.)", "...")
        stru = stru.replace("(..)", "....")
        return stru

    def _pairs(self, struct):
        '''
        Parameters
        ----------
        struct : basestring
            dotbracket string

        Returns
        -------
        dictionary mapping each bracket position in struct to its partner
        '''
        unpaired = []
        pairs = {}
        for i, c in enumerate(struct):
            if c == '(':
                unpaired.append(i)
            if c == ')':
                partner = unpaired.pop()
                pairs[i] = partner
                pairs[partner] = i
        return pairs

    def _compute_neighbors(self, seqs):
        seqs = list(seqs)
        data_matrix = self.sequence_vectorizer.transform(seqs)
        # find neighbors
        distances, neighbors = self.nearest_neighbors.kneighbors(data_matrix)
        # for each seq yield its neighbor sequences
        for seq, neighs in zip(seqs, neighbors):
            neighbor_seqs = [self.seqs[neigh] for neigh in neighs]
            yield seq, neighbor_seqs

    def _seq_to_eden(self, header, sequence, struct, energy):
        graph = sequence_dotbracket_to_graph(seq_info=sequence,
                                             seq_struct=struct)
        if graph.number_of_nodes() < 2:
            graph = seq_to_networkx(header, sequence)
        graph.graph['id'] = header
        graph.graph['info'] = 'muscle+RNAalifold energy=%.3f' % (energy)
        graph.graph['energy'] = energy
        graph.graph['sequence'] = sequence
        return graph
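# A minimal usage sketch for the RNA Vectorizer above. Sequences are
# (header, sequence) tuples, as expected by _align_sequence_structure; the
# toy data is hypothetical, and muscle/RNAalifold/RNAfold must be available
# on the PATH for the structure computation to succeed.
toy_seqs = [('s1', 'GGGAAACCC'),
            ('s2', 'GGGUUUCCC'),
            ('s3', 'GCGAAACGC')]
rna_vec = Vectorizer(complexity=2, n_neighbors=2, n_iter=3)
toy_matrix = rna_vec.fit_transform(toy_seqs)
print('encoded %d sequences into %d features' % toy_matrix.shape)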
class RegressorWrapper(BaseEstimator, RegressorMixin):
    """Regressor."""

    def __init__(self, program=SGDRegressor(average=True, shuffle=True)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        # route "vectorizer__*" keys to the vectorizer, "vectorize__*"
        # keys to the vectorize function, everything else to the program
        params_vectorizer = dict()
        params_program = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_program[param] = params[param]
        self.program.set_params(**params_program)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            y = self._extract_targets(graphs)
            self.program = self.program.fit(data_matrix, y)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            predictions = self.program.predict(data_matrix)
            for prediction, graph in izip(predictions, graphs):
                graph.graph['prediction'] = prediction
                graph.graph['score'] = prediction
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def _extract_targets(self, graphs):
        y = []
        for graph in graphs:
            if graph.graph.get('target', None) is not None:
                y.append(graph.graph['target'])
            else:
                raise Exception(
                    'Missing the attribute "target" in graph dictionary!')
        y = np.ravel(y)
        return y
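# A minimal usage sketch for RegressorWrapper. The toy-graph helper is
# hypothetical: EDeN expects networkx graphs with a 'label' attribute on
# nodes (and edges) and, for training, a numeric 'target' in the graph
# dictionary.
import networkx as nx

def make_toy_graph(label, target=None):
    g = nx.path_graph(4)
    for n in g.nodes():
        g.node[n]['label'] = label
    for u, v in g.edges():
        g.edge[u][v]['label'] = '-'
    if target is not None:
        g.graph['target'] = target
    return g

reg = RegressorWrapper()
reg = reg.fit([make_toy_graph('A', 0.1), make_toy_graph('B', 0.9)])
for g in reg.predict([make_toy_graph('A')]):
    print('prediction: %s' % g.graph['prediction'])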
class ClassifierWrapper(BaseEstimator, ClassifierMixin):
    """Classifier."""

    def __init__(self,
                 program=SGDClassifier(average=True,
                                       class_weight='balanced',
                                       shuffle=True)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        # route "vectorizer__*" keys to the vectorizer, "vectorize__*"
        # keys to the vectorize function, everything else to the program
        params_vectorizer = dict()
        params_program = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_program[param] = params[param]
        self.program.set_params(**params_program)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            y = self._extract_targets(graphs)
            # manage case for single class learning
            if len(set(y)) == 1:
                # make negative data matrix
                negative_data_matrix = data_matrix.multiply(-1)
                # make targets
                y = list(y)
                y_neg = [-1] * len(y)
                # concatenate elements
                data_matrix = vstack(
                    [data_matrix, negative_data_matrix], format="csr")
                y = y + y_neg
                y = np.ravel(y)
            self.program = self.program.fit(data_matrix, y)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            predictions = self.program.predict(data_matrix)
            scores = self.program.decision_function(data_matrix)
            for score, prediction, graph in izip(scores, predictions, graphs):
                graph.graph['prediction'] = prediction
                graph.graph['score'] = score
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def _extract_targets(self, graphs):
        y = []
        for graph in graphs:
            if graph.graph.get('target', None) is not None:
                y.append(graph.graph['target'])
            else:
                raise Exception(
                    'Missing the attribute "target" in graph dictionary!')
        y = np.ravel(y)
        return y
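# A minimal usage sketch for ClassifierWrapper, reusing the hypothetical
# make_toy_graph helper from the regressor example above. If the training
# targets contain a single class, fit falls back to the negative-mirror
# trick shown in the code.
clf = ClassifierWrapper()
clf = clf.fit([make_toy_graph('A', -1), make_toy_graph('B', 1)])
for g in clf.predict([make_toy_graph('B')]):
    print('prediction: %s score: %.3f' % (g.graph['prediction'],
                                          g.graph['score']))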
class VolumeConstructor(object):
    """VolumeConstructor."""

    def __init__(self, min_count=2, max_n_neighbors=100, r=3, d=3,
                 class_discretizer=2, class_std_discretizer=1,
                 similarity_discretizer=10, size_discretizer=1,
                 volume_discretizer=10, n_neighbors=10, improve=True):
        """init."""
        self.improve = improve
        self.n_neighbors = n_neighbors
        self.non_norm_vec = Vectorizer(r=r, d=d,
                                       normalization=False,
                                       inner_normalization=False)
        self.vec = Vectorizer(r=r, d=d,
                              normalization=True,
                              inner_normalization=True)
        self.grammar = GrammarWrapper(radius_list=[1, 2, 3],
                                      thickness_list=[2],
                                      min_cip_count=min_count,
                                      min_interface_count=min_count,
                                      max_n_neighbors=max_n_neighbors,
                                      n_neigh_steps=1,
                                      max_neighborhood_size=max_n_neighbors)
        self.sim_cost_estimator = SimVolPredStdSizeMultiObjectiveCostEstimator(
            self.vec,
            class_discretizer=class_discretizer,
            class_std_discretizer=class_std_discretizer,
            similarity_discretizer=similarity_discretizer,
            size_discretizer=size_discretizer,
            volume_discretizer=volume_discretizer,
            improve=improve)
        self.cost_estimator = MultiObjectiveCostEstimator(
            self.non_norm_vec, improve)
        self.nn_estimator = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, pos_graphs, neg_graphs):
        """fit."""
        self.all_graphs = pos_graphs + neg_graphs
        self.all_vecs = self.vec.transform(self.all_graphs)
        self.grammar.fit(self.all_graphs)
        logger.info('%s' % self.grammar)
        self.sim_cost_estimator.fit(pos_graphs, neg_graphs)
        self.cost_estimator.fit(pos_graphs, neg_graphs)
        self.nn_estimator.fit(self.all_vecs)

    def sample(self, sample_graphs):
        """sample."""
        # pareto filter using similarity of the dataset for initial seed
        costs = self.sim_cost_estimator.compute(sample_graphs)
        seed_graphs = get_pareto_set(sample_graphs, costs)
        # run optimization in parallel
        pareto_graphs_list = self._optimize_parallel(seed_graphs)
        self._log_result(pareto_graphs_list)
        # join all pareto sets
        pareto_set_graphs = pipe(pareto_graphs_list, concat, list)
        # pareto filter using similarity of the solutions
        pareto_set_costs = self.sim_cost_estimator.compute(pareto_set_graphs)
        sel_pareto_set_graphs = get_pareto_set(pareto_set_graphs,
                                               pareto_set_costs)
        logger.info('#constructed graphs:%5d' % (len(sel_pareto_set_graphs)))
        return sel_pareto_set_graphs

    def _log_result(self, pareto_graphs_list):
        tot_size = sum(len(graphs) for graphs in pareto_graphs_list)
        msg = 'pareto set sizes [%d]: ' % tot_size
        for graphs in pareto_graphs_list:
            msg += '[%d]' % len(graphs)
        logger.info(msg)

    def _optimize_parallel(self, reference_graphs):
        """optimize_parallel."""
        pool = multiprocessing.Pool()
        res = [apply_async(pool, self._optimize_single, args=(g,))
               for g in reference_graphs]
        pareto_set_graphs_list = [p.get() for p in res]
        pool.close()
        pool.join()
        return pareto_set_graphs_list

    def _get_constraints(self, reference_graph):
        reference_vec = self.non_norm_vec.transform([reference_graph])
        # find neighbors
        neighbors = self.nn_estimator.kneighbors(reference_vec,
                                                 return_distance=False)
        neighbors = neighbors[0]
        # compute center of mass
        reference_graphs = [self.all_graphs[i] for i in neighbors]
        reference_vecs = self.all_vecs[neighbors]
        avg_reference_vec = sp.sparse.csr_matrix.mean(reference_vecs, axis=0)
        reference_vecs = self.non_norm_vec.transform(reference_graphs)
        # compute desired distances
        desired_distances = euclidean_distances(avg_reference_vec,
                                                reference_vecs)
        desired_distances = desired_distances[0]
        return reference_graphs, desired_distances

    def _optimize_single(self, reference_graph):
        """optimize_single."""
        res = self._get_constraints(reference_graph)
        reference_graphs, desired_distances = res
        moo = MultiObjectiveOptimizer(self.vec, self.grammar,
                                      self.cost_estimator,
                                      max_neighborhood_order=1,
                                      max_n_iter=100)
        moo.fit(desired_distances, reference_graphs)
        pareto_set_graphs = moo.sample(reference_graphs)
        return pareto_set_graphs
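# A minimal usage sketch for VolumeConstructor; `pos_graphs` and
# `neg_graphs` stand for lists of networkx graphs prepared elsewhere in
# the pipeline (both names are illustrative placeholders).
vc = VolumeConstructor(min_count=2, n_neighbors=5)
vc.fit(pos_graphs, neg_graphs)
constructed_graphs = vc.sample(pos_graphs[:10])
logger.info('sampled %d constructed graphs' % len(constructed_graphs))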
class KNNWrapper(BaseEstimator, ClassifierMixin):
    """KNNWrapper."""

    def __init__(self, program=NearestNeighbors(n_neighbors=2)):
        """Construct."""
        self.program = program
        self.vectorizer = Vectorizer()
        self.params_vectorize = dict()

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        # route "vectorizer__*" keys to the vectorizer, "vectorize__*"
        # keys to the vectorize function, everything else to the program
        params_vectorizer = dict()
        params_program = dict()
        for param in params:
            if "vectorizer__" in param:
                key = param.split('__')[1]
                val = params[param]
                params_vectorizer[key] = val
            elif "vectorize__" in param:
                key = param.split('__')[1]
                val = params[param]
                self.params_vectorize[key] = val
            else:
                params_program[param] = params[param]
        self.program.set_params(**params_program)
        self.vectorizer.set_params(**params_vectorizer)
        return self

    def fit(self, graphs):
        """fit."""
        try:
            # keep the training graphs: predict returns them as neighbors
            self.graphs = list(graphs)
            data_matrix = vectorize(self.graphs,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            self.program = self.program.fit(data_matrix)
            return self
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def predict(self, graphs):
        """predict."""
        try:
            graphs, graphs_ = tee(graphs)
            data_matrix = vectorize(graphs_,
                                    vectorizer=self.vectorizer,
                                    **self.params_vectorize)
            distances, indices = self.program.kneighbors(data_matrix)
            for knn_dists, knn_ids, graph in izip(distances, indices, graphs):
                neighbor_graphs = [self.graphs[knn_id] for knn_id in knn_ids]
                graph.graph['neighbors'] = neighbor_graphs
                graph.graph['ids'] = knn_ids
                graph.graph['distances'] = knn_dists
                yield graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)
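# A minimal usage sketch for KNNWrapper, again reusing the hypothetical
# make_toy_graph helper. Each predicted graph carries its neighbor graphs,
# their ids and their distances in the graph dictionary.
knn = KNNWrapper(program=NearestNeighbors(n_neighbors=2))
knn = knn.fit([make_toy_graph('A'), make_toy_graph('B'), make_toy_graph('A')])
for g in knn.predict([make_toy_graph('A')]):
    print('ids: %s distances: %s' % (g.graph['ids'], g.graph['distances']))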