class KNNWrapper(BaseEstimator, ClassifierMixin): """KNNWrapper.""" def __init__(self, program=NearestNeighbors(n_neighbors=2)): """Construct.""" self.program = program self.vectorizer = Vectorizer() def set_params(self, **params): """Set the parameters of this estimator. The method. Returns ------- self """ # finds parameters for the vectorizer as those that contain "__" params_vectorizer = dict() params_clusterer = dict() for param in params: if "vectorizer__" in param: key = param.split('__')[1] val = params[param] params_vectorizer[key] = val else: params_clusterer[param] = params[param] self.program.set_params(**params_clusterer) self.vectorizer.set_params(**params_vectorizer) return self def fit(self, graphs): """fit.""" try: self.graphs = list(graphs) data_matrix = self.vectorizer.transform(graphs) self.program = self.program.fit(data_matrix) return self except Exception as e: logger.debug('Failed iteration. Reason: %s' % e) logger.debug('Exception', exc_info=True) def predict(self, graphs): """predict.""" try: graphs, graphs_ = tee(graphs) data_matrix = self.vectorizer.transform(graphs_) distances, indices = self.program.kneighbors(data_matrix) for knn_dists, knn_ids, graph in izip(distances, indices, graphs): neighbor_graphs = [] for knn_id in knn_ids: neighbor_graphs.append(self.graphs[knn_id]) graph.graph['neighbors'] = neighbor_graphs graph.graph['ids'] = knn_ids graph.graph['distances'] = knn_dists yield graph except Exception as e: logger.debug('Failed iteration. Reason: %s' % e) logger.debug('Exception', exc_info=True)
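# Hypothetical usage sketch for KNNWrapper above; it is not part of the
# original source. It assumes EDeN, scikit-learn and networkx are installed
# and that the module-level names used by the class (Vectorizer,
# NearestNeighbors, tee, izip, logger) are in scope as in the snippet.
# The toy graphs and labels are made up for illustration.
import networkx as nx


def _toy_graph(node_labels):
    # minimal graph with the 'label' annotations the EDeN Vectorizer expects
    g = nx.Graph()
    for i, lab in enumerate(node_labels):
        g.add_node(i, label=lab)
    for i in range(len(node_labels) - 1):
        g.add_edge(i, i + 1, label='-')
    return g


train_graphs = [_toy_graph(list(s)) for s in ('ABAB', 'ABBA', 'CCCC', 'CACA')]
query_graphs = [_toy_graph(list('ABAA'))]

knn = KNNWrapper(program=NearestNeighbors(n_neighbors=2)).fit(train_graphs)
for g in knn.predict(query_graphs):
    # predict() is a generator: each yielded graph carries its neighbor
    # graphs, their indices in the training list and their distances
    print(g.graph['ids'])
    print(g.graph['distances'])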
class IsomorphicClusterer(BaseEstimator, ClusterMixin): """IsomorphismClusterer.""" def __init__(self): """Construct.""" self.vectorizer = Vectorizer() def set_params(self, **params): """Set the parameters of this estimator. The method. Returns ------- self """ for param in params: self.__dict__[param] = params[param] return self def fit_predict(self, graphs): """fit_predict.""" def vec_to_hash(vec): return hash(tuple(vec.data + vec.indices)) try: for graph in graphs: prediction = vec_to_hash(self.vectorizer.transform([graph])) yield prediction except Exception as e: logger.debug('Failed iteration. Reason: %s' % e) logger.debug('Exception', exc_info=True)
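# Hypothetical usage sketch for IsomorphicClusterer above; not from the
# original source. Graphs whose EDeN feature vectors hash to the same value
# receive the same identifier, so identically labeled graphs end up in the
# same group. Toy graphs are made up for illustration.
import networkx as nx


def _toy_graph(node_labels):
    g = nx.Graph()
    for i, lab in enumerate(node_labels):
        g.add_node(i, label=lab)
    for i in range(len(node_labels) - 1):
        g.add_edge(i, i + 1, label='-')
    return g


graphs = [_toy_graph(list('ABAB')),
          _toy_graph(list('ABAB')),
          _toy_graph(list('CCCC'))]
clusterer = IsomorphicClusterer()
# fit_predict() is a generator of hash-based cluster identifiers, one per
# graph; the first two identifiers should coincide, the third should differ
print(list(clusterer.fit_predict(graphs)))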
class EdenRegressor(BaseEstimator, RegressorMixin): """Build a regressor for graphs.""" def __init__(self, r=3, d=8, nbits=16, discrete=True, normalization=True, inner_normalization=True, penalty='elasticnet', loss='squared_loss'): """construct.""" self.set_params(r, d, nbits, discrete, normalization, inner_normalization, penalty, loss) def set_params(self, r=3, d=8, nbits=16, discrete=True, normalization=True, inner_normalization=True, penalty='elasticnet', loss='squared_loss'): """setter.""" self.r = r self.d = d self.nbits = nbits self.normalization = normalization self.inner_normalization = inner_normalization self.discrete = discrete self.model = SGDRegressor( loss=loss, penalty=penalty, average=True, shuffle=True, max_iter=5, tol=None) self.vectorizer = Vectorizer( r=self.r, d=self.d, normalization=self.normalization, inner_normalization=self.inner_normalization, discrete=self.discrete, nbits=self.nbits) return self def transform(self, graphs): """transform.""" x = self.vectorizer.transform(graphs) return x @timeit def kernel_matrix(self, graphs): """kernel_matrix.""" x = self.transform(graphs) return metrics.pairwise.pairwise_kernels(x, metric='linear') def fit(self, graphs, targets, randomize=True): """fit.""" x = self.transform(graphs) self.model = self.model.fit(x, targets) return self def predict(self, graphs): """predict.""" x = self.transform(graphs) preds = self.model.predict(x) return preds def decision_function(self, graphs): """decision_function.""" return self.predict(graphs)
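# Hypothetical usage sketch for EdenRegressor above; not from the original
# source. It assumes the EDeN version these snippets target (the Vectorizer
# is constructed with a 'discrete' argument) and uses made-up toy graphs
# whose regression target is simply the number of 'A' labels.
import networkx as nx


def _toy_graph(node_labels):
    g = nx.Graph()
    for i, lab in enumerate(node_labels):
        g.add_node(i, label=lab)
    for i in range(len(node_labels) - 1):
        g.add_edge(i, i + 1, label='-')
    return g


seqs = ['AABB', 'ABBB', 'AAAB', 'BBBB', 'AAAA', 'ABAB']
graphs = [_toy_graph(list(s)) for s in seqs]
targets = [float(s.count('A')) for s in seqs]

reg = EdenRegressor(r=2, d=3)
reg.fit(graphs, targets)
print(reg.predict(graphs[:2]))
print(reg.kernel_matrix(graphs[:3]).shape)  # (3, 3) linear kernel matrix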
def prep(graphlist, id=0):
    if not graphlist:
        return {}
    v = Vectorizer()
    # strip 'weight' annotations in place; a plain loop is used so the side
    # effect also happens under Python 3, where map() is lazy
    for graph in graphlist:
        node_operation(graph, lambda n, d: d.pop('weight', None))
    csr = v.transform(graphlist)

    def hash_function(vec):
        return hash(tuple(vec.data + vec.indices))

    return {hash_function(row): (id, ith) for ith, row in enumerate(csr)}
def make_fold_vectorize(complexity=3, nbits=15, fold=None, boundaries=None):
    """Curry parameters in vectorizer."""
    vec = Vectorizer(complexity=complexity, nbits=nbits)
    vectorize = curry(lambda vec, graphs: vec.transform(graphs))(vec)
    cwindow_reweight = curry(_window_reweight)(boundaries)
    fold_vectorize = compose(vectorize, map(cwindow_reweight), fold)
    return fold_vectorize
def make_graphs(smiles):
    # convert from SMILES to EDeN format
    eden_graph_generator = [smiles_to_eden(smi) for smi in smiles]
    # compute graphs for each molecule
    graphs = [graph for graph in eden_graph_generator]
    vectorizer = Vectorizer(min_r=0, min_d=0, r=1, d=2)
    # compute the NSPDK features and store them in a sparse matrix
    sparse = vectorizer.transform(graphs)
    return sparse
class InstanceMaker(object): """InstanceMaker.""" def __init__(self, n_landmarks=5, n_neighbors=50): """init.""" self.vec = Vectorizer(r=3, d=3, normalization=False, inner_normalization=False) self.n_neighbors = n_neighbors self.n_landmarks = n_landmarks def fit(self, graphs, ntargets): """graphs/targets split, trains NN on graphs""" self.graphs = graphs[:-ntargets] self.targets = graphs[-ntargets:] vecs = self.vec.transform(self.graphs) if self.n_neighbors > len(self.graphs): self.n_neighbors = len(self.graphs) self.nn = NearestNeighbors(n_neighbors=self.n_neighbors).fit(vecs) return self def get(self, idd=-1): if idd == -1: target_graph = self.targets.pop() else: target_graph = self.targets[idd] target_vec = self.vec.transform([target_graph]) distances, neighbors = self.nn.kneighbors(target_vec, return_distance=True) distances = distances[0] neighbors = neighbors[0] ranked_graphs = [self.graphs[i] for i in neighbors] landmark_graphs = ranked_graphs[:self.n_landmarks] desired_distances = distances[:self.n_landmarks] logger.debug( "target(%d,%d) and nn(%d,%d)" % (target_graph.number_of_nodes(), target_graph.number_of_edges(), ranked_graphs[0].number_of_nodes(), ranked_graphs[0].number_of_edges())) so.gprint([target_graph, ranked_graphs[0]], edgelabel='label') return landmark_graphs, desired_distances, ranked_graphs, target_graph
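# Hypothetical usage sketch for InstanceMaker above; not from the original
# source. The last `ntargets` graphs of the input list are held out as
# targets; get() returns the landmark graphs, the distances to reproduce,
# the full neighbor ranking and the target graph. Note that get() also calls
# the `so.gprint` drawing helper and `logger`, which must be available.
# Toy graphs are made up for illustration.
import networkx as nx


def _toy_graph(node_labels):
    g = nx.Graph()
    for i, lab in enumerate(node_labels):
        g.add_node(i, label=lab)
    for i in range(len(node_labels) - 1):
        g.add_edge(i, i + 1, label='-')
    return g


graphs = [_toy_graph(list(s))
          for s in ('ABAB', 'ABBA', 'AABB', 'BABA', 'CCCC')]
im = InstanceMaker(n_landmarks=2, n_neighbors=3).fit(graphs, ntargets=1)
landmark_graphs, desired_distances, ranked_graphs, target_graph = im.get()
print(desired_distances)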
def compute_NSPDK_features(): import eden from eden.graph import Vectorizer from eden.converter.molecule.obabel import mol_file_to_iterable, obabel_to_eden mol_path = olfaction_prediction_path + '/data/sdf/' iter_mols = mol_file_to_iterable(mol_path + '/all_mol.sdf', 'sdf') iter_graphs = obabel_to_eden(iter_mols) vectorizer = Vectorizer(r=3, d=4) X = vectorizer.transform(iter_graphs) return X
def _remove_similar_pairs(graphs):
    vec = Vectorizer(r=3, d=3, normalization=False, inner_normalization=False)
    x = vec.transform(graphs)
    matrix = cosine_similarity(x)
    scores = np.array([1] * len(graphs))
    # integer division: keep at most half of the graphs
    ids = min_similarity_selection(matrix, scores=scores,
                                   max_num=len(graphs) // 2)
    graphs = [graphs[i] for i in ids]
    logging.debug('similar pairs removal:%d' % len(graphs))
    return graphs
class Annotator():

    def __init__(self, multiprocess=True, score_attribute='importance'):
        self.score_attribute = score_attribute
        self.vectorizer = Vectorizer()
        self.multi_process = multiprocess
        self.trained = False

    def fit(self, graphs_pos, graphs_neg=[]):
        if self.trained:
            return self
        self.trained = True

        # remove previous EDeN annotations in place; explicit loops are used
        # so the side effects also happen under Python 3, where map() is lazy
        for graph in graphs_pos + graphs_neg:
            utils.remove_eden_annotation(graph)
            utils.node_operation(graph, lambda n, d: d.pop('importance', None))
            graph.graph.pop('mass_annotate_mp_was_here', None)

        if graphs_neg:
            # negative examples available: train a binary estimator
            self.estimator = SGDClassifier()
            classes = [1] * len(graphs_pos) + [-1] * len(graphs_neg)
            self.estimator.fit(
                self.vectorizer.transform(graphs_pos + graphs_neg), classes)
        else:
            self.estimator = ExperimentalOneClassEstimator()
            self.estimator.fit(self.vectorizer.transform(graphs_pos))
        return self

    def fit_transform(self, graphs_p, graphs_n=[]):
        self.fit(graphs_p, graphs_n)
        return self.transform(graphs_p), self.transform(graphs_n)

    def transform(self, graphs):
        return self.annotate(graphs)

    def annotate(self, graphs, neg=False):
        if not graphs:
            return []
        return mass_annotate_mp(graphs, self.vectorizer,
                                score_attribute=self.score_attribute,
                                estimator=self.estimator,
                                multi_process=self.multi_process,
                                invert_score=neg)
def smiles2nspdk(input_path, complexity, nbits, save_path):
    """Compute NSPDK descriptors for SMILES strings.

    :param input_path: path to a file with SMILES strings
    :param complexity: descriptor complexity
    :param nbits: number of bits of the descriptor
    :param save_path: path of the output file with the dense feature rows
    """
    vec = Vectorizer(complexity=complexity, nbits=nbits)
    smiles_list = load_dataset(input_path)
    res = vec.transform(list(smiles_strings_to_nx(smiles_list))).todense()
    with open(save_path, "w") as output:
        for row in res:
            np.savetxt(output, row)
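# Hypothetical usage sketch for smiles2nspdk above; not from the original
# source. The file names are made up, and load_dataset / smiles_strings_to_nx
# are assumed to be the helpers referenced in the snippet.
smiles2nspdk(input_path='molecules.smi',
             complexity=3,
             nbits=15,
             save_path='nspdk_features.txt')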
def _outliers(graphs, k=3):
    vec = Vectorizer(r=3, d=3, normalization=False, inner_normalization=False)
    x = vec.transform(graphs)
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(x)
    neighbors = knn.kneighbors(x, return_distance=False)
    outlier_list = []
    non_outlier_list = []
    for i, ns in enumerate(neighbors):
        # a graph is an outlier if none of its k nearest neighbors lists it
        # back among their own k neighbors (no mutual neighborhood)
        not_outlier = False
        for n in ns[1:]:
            if i in list(neighbors[n, :]):
                not_outlier = True
                break
        if not_outlier is False:
            outlier_list.append(i)
        else:
            non_outlier_list.append(i)
    return outlier_list, non_outlier_list
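# Hypothetical usage sketch for _outliers above; not from the original
# source. Toy graphs are made up for illustration; the 'ZZZZ' graph has no
# mutual neighbors among the others and should be reported as an outlier.
import networkx as nx


def _toy_graph(node_labels):
    g = nx.Graph()
    for i, lab in enumerate(node_labels):
        g.add_node(i, label=lab)
    for i in range(len(node_labels) - 1):
        g.add_edge(i, i + 1, label='-')
    return g


graphs = [_toy_graph(list(s)) for s in ('ABAB', 'ABBA', 'AABB', 'ZZZZ')]
outlier_ids, regular_ids = _outliers(graphs, k=3)
print('outliers: %s regular: %s' % (outlier_ids, regular_ids))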
class OrdererWrapper(BaseEstimator, ClassifierMixin): """Orderer.""" def __init__(self, program=None): """Construct.""" self.program = program self.vectorizer = Vectorizer() def set_params(self, **params): """Set the parameters of this estimator. The method. Returns ------- self """ # finds parameters for the vectorizer as those that contain "__" params_vectorizer = dict() params_orderer = dict() for param in params: if "vectorizer__" in param: key = param.split('__')[1] val = params[param] params_vectorizer[key] = val else: params_orderer[param] = params[param] self.program.set_params(**params_orderer) self.vectorizer.set_params(**params_vectorizer) return self def decision_function(self, graphs): """decision_function.""" try: graphs, graphs_ = tee(graphs) data_matrix = self.vectorizer.transform(graphs_) scores = self.program.decision_function(data_matrix) return scores except Exception as e: logger.debug('Failed iteration. Reason: %s' % e) logger.debug('Exception', exc_info=True)
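# Hypothetical usage sketch for OrdererWrapper above; not from the original
# source. The wrapped `program` can be any fitted estimator exposing
# decision_function over EDeN feature matrices; here a scikit-learn
# SGDClassifier is trained externally on features from a default Vectorizer,
# matching the default Vectorizer the wrapper builds internally. Toy graphs
# and labels are made up for illustration.
import networkx as nx
from sklearn.linear_model import SGDClassifier


def _toy_graph(node_labels):
    g = nx.Graph()
    for i, lab in enumerate(node_labels):
        g.add_node(i, label=lab)
    for i in range(len(node_labels) - 1):
        g.add_edge(i, i + 1, label='-')
    return g


seqs = ['AABB', 'ABBB', 'BBBB', 'CBCB', 'AACB', 'CCCC']
graphs = [_toy_graph(list(s)) for s in seqs]
y = [1 if 'A' in s else -1 for s in seqs]

clf = SGDClassifier().fit(Vectorizer().transform(graphs), y)
orderer = OrdererWrapper(program=clf)
print(orderer.decision_function(graphs))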
def clusterGraphs(graphs, r, d, copt):
    opts = copt[1:-1]
    optl = opts.split(",")
    opt = int(optl[0])
    vectorizer = Vectorizer(r=r, d=d)
    samples = len(graphs)
    minlclu = 5
    Xsp = vectorizer.transform(graphs)  # sparse feature matrix
    X = Xsp.todense()  # regular feature matrix
    # SM = metrics.pairwise.pairwise_kernels(Xsp, metric='rbf', gamma=1)  # similarity matrix
    SM = metrics.pairwise.pairwise_kernels(Xsp, metric='linear')
    DM = []  # distance matrix
    for i in range(len(SM)):
        DM.append([])
        for j in range(len(SM[i])):
            val = 1.0 - SM[i][j]
            if val < 0:
                DM[i].append(0.0)
            else:
                DM[i].append(val)
    if opt == 0:
        nc, labels = MShift(X)
    if opt == 1:
        # print(DM)
        minlclu = int(optl[2])
        nc, labels = DB_SCAN(DM, float(optl[1]), int(optl[2]))
    if opt == 2:
        nc, labels = AffProp(SM)
    if opt == 3:
        print(SM)
        # Matrix(X)
        return 0, []
    if opt == 4:
        nc, labels = K_Means(X)
    if opt == 5:
        nc, labels = SpecClus(SM)
    if opt == 6:
        nc, labels = dclust(DM, int(optl[1]), int(optl[2]), float(optl[3]))
    return nc, labels, minlclu
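# Hypothetical usage sketch for clusterGraphs above; not from the original
# source. The `copt` string packs the clustering choice and its parameters:
# judging from the parsing code, "[1,0.3,2]" would select option 1 (DB_SCAN)
# with eps 0.3 and minimum cluster size 2, but the exact convention and the
# clustering helpers (MShift, DB_SCAN, AffProp, ...) are assumptions.
# Toy graphs are made up for illustration.
import networkx as nx


def _toy_graph(node_labels):
    g = nx.Graph()
    for i, lab in enumerate(node_labels):
        g.add_node(i, label=lab)
    for i in range(len(node_labels) - 1):
        g.add_edge(i, i + 1, label='-')
    return g


graphs = [_toy_graph(list(s)) for s in ('ABAB', 'ABBA', 'CCCC', 'CACC', 'ABBB')]
nc, labels, minlclu = clusterGraphs(graphs, r=2, d=3, copt="[1,0.3,2]")
print('%d clusters, labels %s' % (nc, labels))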
def setup(self, known_graphs=None, candidate_graphs=None): """Setup.""" # compute the nearest neighbors for the 'proposal_graphs' w.r.t. the # known graphs in the list 'known_graphs' parameters_priors = dict(n_neighbors=self.n_neighbors) parameters_priors.update( dict(vectorizer__complexity=self.complexity, vectorizer__discrete=True)) fit_wrapped_knn_predictor_known = \ model(known_graphs, program=KNNWrapper(program=NearestNeighbors()), parameters_priors=parameters_priors) # compute distances of candidate_graphs to known_graphs knn_candidate_graphs = predict(candidate_graphs, program=fit_wrapped_knn_predictor_known) knn_candidate_graphs = list(knn_candidate_graphs) self.distances_to_known_graphs = [] for knn_candidate_graph in knn_candidate_graphs: distances = knn_candidate_graph.graph['distances'] self.distances_to_known_graphs.append(distances) # compute candidate_graphs encodings vec = Vectorizer(complexity=self.complexity) self.candidate_graphs_data_matrix = vec.transform(candidate_graphs)
def compare(finalL, L, peaks, opt, th, alpha):
    n = len(L)
    lpeaks = {}
    for key in L:
        lpeaks[key] = peaks[key]
    for key in finalL:
        lpeaks[key] = peaks[key]
    graphs, dict = peaksToGraphs(lpeaks, opt, alpha)
    vectorizer = Vectorizer(r=2, d=3)
    samples = len(graphs)
    Xsp = vectorizer.transform(graphs)  # sparse feature matrix
    X = Xsp.todense()  # regular feature matrix
    SM = metrics.pairwise.pairwise_kernels(Xsp, metric='rbf', gamma=1)  # similarity matrix
    DM = []  # distance matrix
    for i in range(len(SM)):
        DM.append([])
        for j in range(len(SM[i])):
            val = 1.0 - SM[i][j]
            if val < 0:
                DM[i].append(0.0)
            else:
                DM[i].append(val)
    avgDM = 0.0
    counts = 0.0
    for i in range(len(graphs)):
        if dict[i] in L:
            for j in range(len(graphs)):
                if i != j and dict[j] in finalL:
                    avgDM += DM[i][j]
                    counts += 1
    avgDM = avgDM / counts
    if avgDM >= 0.0 and avgDM <= th:
        return 0
    else:
        return 1
class EdenEstimator(BaseEstimator, ClassifierMixin): """Build an estimator for graphs.""" def __init__(self, r=3, d=8, nbits=16, discrete=True, balance=False, subsample_size=200, ratio=2, normalization=False, inner_normalization=False, penalty='elasticnet'): """construct.""" self.set_params(r, d, nbits, discrete, balance, subsample_size, ratio, normalization, inner_normalization, penalty) def set_params(self, r=3, d=8, nbits=16, discrete=True, balance=False, subsample_size=200, ratio=2, normalization=False, inner_normalization=False, penalty='elasticnet'): """setter.""" self.r = r self.d = d self.nbits = nbits self.normalization = normalization self.inner_normalization = inner_normalization self.discrete = discrete self.balance = balance self.subsample_size = subsample_size self.ratio = ratio if penalty == 'perceptron': self.model = Perceptron(max_iter=5, tol=None) else: self.model = SGDClassifier( average=True, class_weight='balanced', shuffle=True, penalty=penalty, max_iter=5, tol=None) self.vectorizer = Vectorizer( r=self.r, d=self.d, normalization=self.normalization, inner_normalization=self.inner_normalization, discrete=self.discrete, nbits=self.nbits) return self def transform(self, graphs): """transform.""" x = self.vectorizer.transform(graphs) return x @timeit def kernel_matrix(self, graphs): """kernel_matrix.""" x = self.transform(graphs) return metrics.pairwise.pairwise_kernels(x, metric='linear') def fit(self, graphs, targets, randomize=True): """fit.""" if self.balance: if randomize: bal_graphs, bal_targets = balance( graphs, targets, None, ratio=self.ratio) else: samp_graphs, samp_targets = subsample( graphs, targets, subsample_size=self.subsample_size) x = self.transform(samp_graphs) self.model.fit(x, samp_targets) bal_graphs, bal_targets = balance( graphs, targets, self, ratio=self.ratio) size = len(bal_targets) logger.debug('Dataset size=%d' % (size)) x = self.transform(bal_graphs) self.model = self.model.fit(x, bal_targets) else: x = self.transform(graphs) self.model = self.model.fit(x, targets) return self def predict(self, graphs): """predict.""" x = self.transform(graphs) preds = self.model.predict(x) return preds def decision_function(self, graphs): """decision_function.""" x = self.transform(graphs) preds = self.model.decision_function(x) return preds @timeit def cross_val_score(self, graphs, targets, scoring='roc_auc', cv=5): """cross_val_score.""" x = self.transform(graphs) scores = cross_val_score( self.model, x, targets, cv=cv, scoring=scoring) return scores @timeit def cross_val_predict(self, graphs, targets, cv=5): """cross_val_score.""" x = self.transform(graphs) scores = cross_val_predict( self.model, x, targets, cv=cv, method='decision_function') return scores @timeit def cluster(self, graphs, n_clusters=16): """cluster.""" x = self.transform(graphs) clust_est = MiniBatchKMeans(n_clusters=n_clusters) cluster_ids = clust_est.fit_predict(x) return cluster_ids @timeit def model_selection(self, graphs, targets, n_iter=30, subsample_size=None): """model_selection_randomized.""" param_distr = {"r": list(range(1, 5)), "d": list(range(0, 10))} if subsample_size: graphs, targets = subsample( graphs, targets, subsample_size=subsample_size) pool = mp.Pool() scores = pool.map(_eval, [(graphs, targets, param_distr)] * n_iter) pool.close() pool.join() best_params = max(scores)[1] logger.debug("Best parameters:\n%s" % (best_params)) self = EdenEstimator(**best_params) return self @timeit def learning_curve(self, graphs, targets, cv=5, n_steps=10, start_fraction=0.1): 
"""learning_curve.""" graphs, targets = paired_shuffle(graphs, targets) x = self.transform(graphs) train_sizes = np.linspace(start_fraction, 1.0, n_steps) scoring = 'roc_auc' train_sizes, train_scores, test_scores = learning_curve( self.model, x, targets, cv=cv, train_sizes=train_sizes, scoring=scoring) return train_sizes, train_scores, test_scores @timeit def bias_variance_decomposition(self, graphs, targets, cv=5, n_bootstraps=10): """bias_variance_decomposition.""" x = self.transform(graphs) score_list = [] for i in range(n_bootstraps): scores = cross_val_score( self.model, x, targets, cv=cv) score_list.append(scores) score_list = np.array(score_list) mean_scores = np.mean(score_list, axis=1) std_scores = np.std(score_list, axis=1) return mean_scores, std_scores
def vectorize(instances):
    vec = Vectorizer()
    return vec.transform(instances)
class Vectorizer(object): def __init__(self, complexity=None, nbits=20, sequence_vectorizer_complexity=3, graph_vectorizer_complexity=2, n_neighbors=5, sampling_prob=.5, n_iter=5, min_energy=-5, random_state=1): random.seed(random_state) if complexity is not None: sequence_vectorizer_complexity = complexity graph_vectorizer_complexity = complexity self.sequence_vectorizer = SeqVectorizer(complexity=sequence_vectorizer_complexity, nbits=nbits, normalization=False, inner_normalization=False) self.graph_vectorizer = GraphVectorizer(complexity=graph_vectorizer_complexity, nbits=nbits) self.n_neighbors = n_neighbors self.sampling_prob = sampling_prob self.n_iter = n_iter self.min_energy = min_energy self.nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors) def fit(self, seqs): # store seqs self.seqs = list(normalize_seqs(seqs)) data_matrix = self.sequence_vectorizer.transform(self.seqs) # fit nearest_neighbors model self.nearest_neighbors.fit(data_matrix) return self def fit_transform(self, seqs, sampling_prob=None, n_iter=None): seqs, seqs_ = tee(seqs) return self.fit(seqs_).transform(seqs, sampling_prob=sampling_prob, n_iter=n_iter) def transform(self, seqs, sampling_prob=None, n_iter=None): seqs = list(normalize_seqs(seqs)) graphs_ = self.graphs(seqs) data_matrix = self.graph_vectorizer.transform(graphs_) return data_matrix def graphs(self, seqs, sampling_prob=None, n_iter=None): seqs = list(normalize_seqs(seqs)) if n_iter is not None: self.n_iter = n_iter if sampling_prob is not None: self.sampling_prob = sampling_prob for seq, neighs in self._compute_neighbors(seqs): if self.n_iter > 1: header, sequence, struct, energy = self._optimize_struct(seq, neighs) else: header, sequence, struct, energy = self._align_sequence_structure(seq, neighs) graph = self._seq_to_eden(header, sequence, struct, energy) yield graph def _optimize_struct(self, seq, neighs): structs = [] results = [] for i in range(self.n_iter): new_neighs = self._sample_neighbors(neighs) header, sequence, struct, energy = self._align_sequence_structure(seq, new_neighs) results.append((header, sequence, struct, energy)) structs.append(struct) instance_id = self._most_representative(structs) selected = results[instance_id] return selected def _most_representative(self, structs): # compute kernel matrix with sequence_vectorizer data_matrix = self.sequence_vectorizer.transform(structs) kernel_matrix = pairwise_kernels(data_matrix, metric='rbf', gamma=1) # compute instance density as 1 over average pairwise distance density = np.sum(kernel_matrix, 0) / data_matrix.shape[0] # compute list of nearest neighbors max_id = np.argsort(-density)[0] return max_id def _sample_neighbors(self, neighs): out_neighs = [] # insert one element at random out_neighs.append(random.choice(neighs)) # add other elements sampling without replacement for neigh in neighs: if random.random() < self.sampling_prob: out_neighs.append(neigh) return out_neighs def _align_sequence_structure(self, seq, neighs, structure_deletions=False): header = seq[0] if len(neighs) < 1: clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1]) energy = 0 logger.debug('Warning: no alignment for: %s' % seq) else: str_out = convert_seq_to_fasta_str(seq) for neigh in neighs: str_out += convert_seq_to_fasta_str(neigh) cmd = 'echo "%s" | muscle -clwstrict -quiet' % (str_out) out = sp.check_output(cmd, shell=True) seed = extract_aligned_seed(header, out) cmd = 'echo "%s" | RNAalifold --noPS 2>/dev/null' % (out) out = sp.check_output(cmd, shell=True) struct, energy = 
extract_struct_energy(out) if energy > self.min_energy: # use min free energy structure clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1]) else: clean_seq, clean_struct = make_seq_struct(seed, struct) if structure_deletions: clean_struct = self._clean_structure(clean_seq, clean_struct) return header, clean_seq, clean_struct, energy def _clean_structure(self, seq, stru): ''' Parameters ---------- seq : basestring rna sequence stru : basestring dotbracket string Returns ------- the structure given may not respect deletions in the sequence. we transform the structure to one that does ''' # find deletions in sequence ids = [] for i, c in enumerate(seq): if c == '-': ids.append(i) # remove brackets that dont have a partner anymore stru = list(stru) pairdict = self._pairs(stru) for i in ids: stru[pairdict[i]] = '.' # delete deletions in structure ids.reverse() for i in ids: del stru[i] stru = ''.join(stru) # removing obvious mistakes stru = stru.replace("(())", "....") stru = stru.replace("(.)", "...") stru = stru.replace("(..)", "....") return stru def _pairs(self, struct): ''' Parameters ---------- struct : basestring Returns ------- dictionary of ids in the struct, that are bond pairs ''' unpaired = [] pairs = {} for i, c in enumerate(struct): if c == '(': unpaired.append(i) if c == ')': partner = unpaired.pop() pairs[i] = partner pairs[partner] = i return pairs def _compute_neighbors(self, seqs): seqs = list(seqs) data_matrix = self.sequence_vectorizer.transform(seqs) # find neighbors distances, neighbors = self.nearest_neighbors.kneighbors(data_matrix) # for each seq for seq, neighs in zip(seqs, neighbors): neighbor_seqs = [self.seqs[neigh] for neigh in neighs] yield seq, neighbor_seqs def _seq_to_eden(self, header, sequence, struct, energy): graph = sequence_dotbracket_to_graph(seq_info=sequence, seq_struct=struct) if graph.number_of_nodes() < 2: graph = seq_to_networkx(header, sequence) graph.graph['id'] = header graph.graph['info'] = 'muscle+RNAalifold energy=%.3f' % (energy) graph.graph['energy'] = energy graph.graph['sequence'] = sequence return graph
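# Hypothetical usage sketch for the sequence/structure Vectorizer above; not
# from the original source. Input sequences are (header, sequence) pairs.
# Note that transform() shells out to the external `muscle` and `RNAalifold`
# programs, which must be installed, and that normalize_seqs and the EDeN
# sequence/graph vectorizers are assumed to be the ones imported by the
# original module. The toy RNA sequences are made up.
seqs = [('seq%d' % i, s) for i, s in enumerate(
    ['GGGAAACCC', 'GGGAAAACCC', 'GGCAAAGCC', 'GGGUUUCCC', 'GCGAAACGC'])]
vec = Vectorizer(complexity=2, n_neighbors=3, n_iter=1)
data_matrix = vec.fit_transform(seqs)
print(data_matrix.shape)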
class VolumeConstructor(object): """VolumeConstructor.""" def __init__(self, min_count=2, max_n_neighbors=100, r=3, d=3, class_discretizer=2, class_std_discretizer=1, similarity_discretizer=10, size_discretizer=1, volume_discretizer=10, n_neighbors=10, improve=True): """init.""" self.improve = improve self.n_neighbors = n_neighbors self.non_norm_vec = Vectorizer(r=r, d=d, normalization=False, inner_normalization=False) self.vec = Vectorizer(r=r, d=d, normalization=True, inner_normalization=True) self.grammar = GrammarWrapper(radius_list=[1, 2, 3], thickness_list=[2], min_cip_count=min_count, min_interface_count=min_count, max_n_neighbors=max_n_neighbors, n_neigh_steps=1, max_neighborhood_size=max_n_neighbors) self.sim_cost_estimator = SimVolPredStdSizeMultiObjectiveCostEstimator( self.vec, class_discretizer=class_discretizer, class_std_discretizer=class_std_discretizer, similarity_discretizer=similarity_discretizer, size_discretizer=size_discretizer, volume_discretizer=volume_discretizer, improve=improve) self.cost_estimator = MultiObjectiveCostEstimator( self.non_norm_vec, improve) self.nn_estimator = NearestNeighbors(n_neighbors=n_neighbors) def fit(self, pos_graphs, neg_graphs): """fit.""" self.all_graphs = pos_graphs + neg_graphs self.all_vecs = self.vec.transform(self.all_graphs) self.grammar.fit(self.all_graphs) logger.info('%s' % self.grammar) self.sim_cost_estimator.fit(pos_graphs, neg_graphs) self.cost_estimator.fit(pos_graphs, neg_graphs) self.nn_estimator.fit(self.all_vecs) def sample(self, sample_graphs): """sample.""" # pareto filter using similarity of the dataset for initial seed costs = self.sim_cost_estimator.compute(sample_graphs) seed_graphs = get_pareto_set(sample_graphs, costs) # run optimization in parallel pareto_graphs_list = self._optimize_parallel(seed_graphs) self._log_result(pareto_graphs_list) # join all pareto sets pareto_set_graphs = pipe(pareto_graphs_list, concat, list) # pareto filter using similarity of the solutions pareto_set_costs = self.sim_cost_estimator.compute(pareto_set_graphs) sel_pareto_set_graphs = get_pareto_set(pareto_set_graphs, pareto_set_costs) logger.info('#constructed graphs:%5d' % (len(sel_pareto_set_graphs))) return sel_pareto_set_graphs def _log_result(self, pareto_graphs_list): tot_size = sum(len(graphs) for graphs in pareto_graphs_list) msg = 'pareto set sizes [%d]: ' % tot_size for graphs in pareto_graphs_list: msg += '[%d]' % len(graphs) logger.info(msg) def _optimize_parallel(self, reference_graphs): """optimize_parallel.""" pool = multiprocessing.Pool() res = [ apply_async(pool, self._optimize_single, args=(g, )) for g in reference_graphs ] pareto_set_graphs_list = [p.get() for p in res] pool.close() pool.join() return pareto_set_graphs_list def _get_constraints(self, reference_graph): reference_vec = self.non_norm_vec.transform([reference_graph]) # find neighbors neighbors = self.nn_estimator.kneighbors(reference_vec, return_distance=False) neighbors = neighbors[0] # compute center of mass reference_graphs = [self.all_graphs[i] for i in neighbors] reference_vecs = self.all_vecs[neighbors] avg_reference_vec = sp.sparse.csr_matrix.mean(reference_vecs, axis=0) reference_vecs = self.non_norm_vec.transform(reference_graphs) # compute desired distances desired_distances = euclidean_distances(avg_reference_vec, reference_vecs) desired_distances = desired_distances[0] return reference_graphs, desired_distances def _optimize_single(self, reference_graph): """optimize_single.""" res = self._get_constraints(reference_graph) 
reference_graphs, desired_distances = res moo = MultiObjectiveOptimizer(self.vec, self.grammar, self.cost_estimator, max_neighborhood_order=1, max_n_iter=100) moo.fit(desired_distances, reference_graphs) pareto_set_graphs = moo.sample(reference_graphs) return pareto_set_graphs
class ClassifierWrapper(BaseEstimator, ClassifierMixin): """Classifier.""" def __init__(self, program=SGDClassifier(average=True, class_weight='balanced', shuffle=True)): """Construct.""" self.program = program self.vectorizer = Vectorizer() def set_params(self, **params): """Set the parameters of this estimator. The method. Returns ------- self """ # finds parameters for the vectorizer as those that contain "__" params_vectorizer = dict() params_clusterer = dict() for param in params: if "vectorizer__" in param: key = param.split('__')[1] val = params[param] params_vectorizer[key] = val else: params_clusterer[param] = params[param] self.program.set_params(**params_clusterer) self.vectorizer.set_params(**params_vectorizer) return self def fit(self, graphs): """fit.""" try: graphs, graphs_ = tee(graphs) data_matrix = self.vectorizer.transform(graphs_) y = self._extract_targets(graphs) # manage case for single class learning if len(set(y)) == 1: # make negative data matrix negative_data_matrix = data_matrix.multiply(-1) # make targets y = list(y) y_neg = [-1] * len(y) # concatenate elements data_matrix = vstack( [data_matrix, negative_data_matrix], format="csr") y = y + y_neg y = np.ravel(y) self.program = self.program.fit(data_matrix, y) return self except Exception as e: logger.debug('Failed iteration. Reason: %s' % e) logger.debug('Exception', exc_info=True) def predict(self, graphs): """predict.""" try: graphs, graphs_ = tee(graphs) data_matrix = self.vectorizer.transform(graphs_) predictions = self.program.predict(data_matrix) scores = self.program.decision_function(data_matrix) for score, prediction, graph in izip(scores, predictions, graphs): graph.graph['prediction'] = prediction graph.graph['score'] = score yield graph except Exception as e: logger.debug('Failed iteration. Reason: %s' % e) logger.debug('Exception', exc_info=True) def _extract_targets(self, graphs): y = [] for graph in graphs: if graph.graph.get('target', None) is not None: y.append(graph.graph['target']) else: raise Exception('Missing the attribute "target" \ in graph dictionary!') y = np.ravel(y) return y
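# Hypothetical usage sketch for ClassifierWrapper above; not from the
# original source. The wrapper reads the class label from each graph's
# graph-level 'target' attribute and annotates predicted graphs in place.
# Toy graphs and labels are made up for illustration.
import networkx as nx


def _toy_graph(node_labels, target):
    g = nx.Graph()
    for i, lab in enumerate(node_labels):
        g.add_node(i, label=lab)
    for i in range(len(node_labels) - 1):
        g.add_edge(i, i + 1, label='-')
    g.graph['target'] = target
    return g


train = [_toy_graph(list(s), 1 if 'A' in s else -1)
         for s in ('AABB', 'ABBB', 'BBBB', 'CBCB', 'AACB', 'CCCC')]
test = [_toy_graph(list('ABAB'), 1)]

clf = ClassifierWrapper().fit(train)
for g in clf.predict(test):
    print(g.graph['prediction'])
    print(g.graph['score'])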
def vectorize(thing):
    v = Vectorizer()
    if not thing:
        raise Exception(
            "need something to vectorize.. received %s" % str(thing))
    # current EDeN does not accept generators anymore, so materialize first
    thing = list(thing)
    return v.transform(thing)
improved_graphs = sampler.transform(graphs_pos_,
                                    same_radius=False,
                                    size_constrained_core_choice=True,
                                    sampling_interval=9999,
                                    select_cip_max_tries=100,
                                    batch_size=int(count / 4) + 1,
                                    n_steps=100,
                                    n_jobs=-1,
                                    improving_threshold=0.9)

# calculate the score of the improved versions
# calculate the score of the originals
avg_imp = sum([estimator.decision_function(e)
               for e in vectorizer.transform(unpack(improved_graphs))]) / count
avg_ori = sum([estimator.decision_function(e)
               for e in vectorizer.transform(graphs_pos___)]) / count
improved.append(avg_imp)
originals.append(avg_ori)

t = range(len(percentages))
# originals are blue, improved ones are green
print(originals)
print(improved)
plt.plot(t, originals, 'bs')
plt.plot(t, improved, 'g^')
plt.savefig('zomg.png')
class IdealGraphEstimator(object): """Build an estimator for graphs.""" def __init__( self, min_count=2, max_n_neighbors=100, r=3, d=3, n_neighbors=10, max_num_solutions=30): """construct.""" self.min_count = min_count self.max_n_neighbors = max_n_neighbors self.max_num_solutions = max_num_solutions self.r = r self.d = d self.n_neighbors = n_neighbors self.clf = Perceptron(n_iter=500) self.vec = Vectorizer(r=r, d=d, normalization=True, inner_normalization=True, nbits=16) self.gs = [.05, .1, .2, .4, .6, .8, 1, 2, 4, 6] def fit(self, pos_graphs, neg_graphs): """fit.""" ref_graphs = self.construct(pos_graphs, neg_graphs) logger.debug('Working on %d constructed graphs' % len(ref_graphs)) y = [1] * len(pos_graphs) + [-1] * len(neg_graphs) x = self.vec.transform(pos_graphs + neg_graphs) z = self.vec.transform(ref_graphs) n_features = z.shape[0] k = np.hstack([pairwise_kernels(x, z, metric='rbf', gamma=g) for g in self.gs]) step = len(ref_graphs) / 2 n_inst, n_feat = k.shape txt = 'RFECV on %d instances with %d features with step: %d' % \ (n_inst, n_feat, step) logger.debug(txt) selector = RFECV(self.clf, step=step, cv=10) selector = selector.fit(k, y) ids = list(concat([range(n_features)] * len(self.gs))) gs_list = list(concat([[g] * n_features for g in self.gs])) feat = defaultdict(list) for g, i, s in zip(gs_list, ids, selector.support_): if s: feat[g].append(i) self.mats = dict() for g in sorted(feat): mat = vstack([z[i] for i in feat[g]]) self.mats[g] = mat sel_ids = set([i for i, s in zip(ids, selector.support_) if s]) self.ideal_graphs_ = [ref_graphs[i] for i in sel_ids] return self def transform(self, graphs): """transform.""" x = self.vec.transform(graphs) xtr = np.hstack([pairwise_kernels(x, self.mats[g], metric='rbf', gamma=g) for g in sorted(self.mats)]) return xtr def construct(self, pos_graphs, neg_graphs): """construct.""" args = dict( min_count=self.min_count, max_n_neighbors=self.max_n_neighbors, r=self.r, d=self.d, n_landmarks=5, n_neighbors=self.n_neighbors, n_iter=20, k_best=5, max_num_solutions=self.max_num_solutions) self.active_constr = NearestNeighborsMeanOptimizer( improve=False, **args) self.active_constr.fit(pos_graphs, neg_graphs) graphs = pos_graphs + neg_graphs active_pareto_set_graphs = self.active_constr.optimize(graphs) self.pos_constr = NearestNeighborsMeanOptimizer( improve=True, **args) self.pos_constr.fit(pos_graphs, neg_graphs) pareto_set_graphs = self.pos_constr.optimize(graphs) sel_constructed_graphs = pareto_set_graphs + active_pareto_set_graphs return sel_constructed_graphs
class NearestNeighborsMeanOptimizer(object): """NearestNeighborsMeanOptimizer.""" def __init__(self, min_count=2, max_n_neighbors=None, r=3, d=3, n_landmarks=5, n_neighbors=100, n_iter=20, k_best=5, max_num_solutions=30, improve=True): """init.""" self.improve = improve self.max_num = max_num_solutions self.n_landmarks = n_landmarks self.n_neighbors = n_neighbors self.nn_estimator = NearestNeighbors(n_neighbors=n_neighbors) self.non_norm_vec = Vectorizer(r=r, d=d, normalization=False, inner_normalization=False) self.vec = Vectorizer(r=r, d=d, normalization=True, inner_normalization=True) self.dist_opt = LandmarksDistanceOptimizer( r=r, d=d, min_count=min_count, max_n_neighbors=max_n_neighbors, n_iter=n_iter, k_best=k_best, improve=improve) def fit(self, pos_graphs, neg_graphs): """fit.""" self.all_graphs = pos_graphs + neg_graphs self.all_vecs = self.vec.transform(self.all_graphs) self.nn_estimator.fit(self.all_vecs) self.dist_opt.fit(pos_graphs, neg_graphs) self.sim_est = VarSimVolCostEstimator(improve=self.improve) self.sim_est.fit(pos_graphs, neg_graphs) def optimize(self, graphs): """optimize.""" seed_graphs = self.select(graphs, max_num=self.max_num) # run optimization in parallel pareto_graphs_list = self._optimize_parallel(seed_graphs) self._log_result(pareto_graphs_list) # join all pareto sets pareto_set_graphs = pipe(pareto_graphs_list, concat, list) # pareto filter using similarity of the solutions sel_graphs = self.select(pareto_set_graphs, max_num=self.max_num) logger.debug('#constructed graphs:%5d' % (len(sel_graphs))) return sel_graphs def select(self, graphs, max_num=30): """select.""" costs = self.sim_est.decision_function(graphs) pareto_graphs = get_pareto_set(graphs, costs) select_graphs = self.sim_est.select(pareto_graphs, k_best=max_num) i, p, s = len(graphs), len(pareto_graphs), len(select_graphs) logger.debug('initial:%d pareto:%d selected:%d' % (i, p, s)) return select_graphs def _log_result(self, pareto_graphs_list): tot_size = sum(len(graphs) for graphs in pareto_graphs_list) msg = 'pareto set sizes [%d]: ' % tot_size for graphs in pareto_graphs_list: msg += '[%d]' % len(graphs) logger.debug(msg) def _optimize_parallel(self, reference_graphs): """optimize_parallel.""" pool = multiprocessing.Pool() res = [ apply_async(pool, self._optimize, args=(reference_graph, )) for reference_graph in reference_graphs ] pareto_set_graphs_list = [p.get() for p in res] pool.close() pool.join() return pareto_set_graphs_list def _optimize(self, reference_graph): """optimize_single.""" constraints = self._get_constraints(reference_graph) graphs = self.dist_opt.optimize(*constraints) return graphs def _get_constraints(self, reference_graph): reference_vec = self.non_norm_vec.transform([reference_graph]) # find neighbors neighbors = self.nn_estimator.kneighbors(reference_vec, return_distance=False) neighbors = neighbors[0] # compute center of mass landmarks = neighbors[:self.n_landmarks] loc_graphs = [self.all_graphs[i] for i in neighbors] reference_graphs = [self.all_graphs[i] for i in landmarks] reference_vecs = self.all_vecs[landmarks] avg_reference_vec = sp.sparse.csr_matrix.mean(reference_vecs, axis=0) reference_vecs = self.non_norm_vec.transform(reference_graphs) # compute desired distances desired_distances = euclidean_distances(avg_reference_vec, reference_vecs) desired_distances = desired_distances[0] return reference_graphs, desired_distances, loc_graphs
improved_graphs = sampler.sample(graphs_pos_,
                                 same_radius=False,
                                 max_size_diff=True,
                                 sampling_interval=9999,
                                 select_cip_max_tries=100,
                                 batch_size=int(count / 4) + 1,
                                 n_steps=100,
                                 n_jobs=-1,
                                 improving_threshold=0.9)

# calculate the score of the improved versions
# calculate the score of the originals
avg_imp = sum([estimator.decision_function(e)
               for e in vectorizer.transform(unpack(improved_graphs))]) / count
avg_ori = sum([estimator.decision_function(e)
               for e in vectorizer.transform(graphs_pos___)]) / count
improved.append(avg_imp)
originals.append(avg_ori)

t = range(len(percentages))
# originals are blue, improved ones are green
print(originals)
print(improved)
plt.plot(t, originals, 'bs')
plt.plot(t, improved, 'g^')
plt.savefig('zomg.png')
class DiscSampler(): ''' ''' def __init__(self): # this is mainly for the forest. the sampler uses a different vectorizer self.vectorizer = Vectorizer(nbits=14) def get_heap_and_forest(self, griter, k): ''' so we create the heap and the forest... heap is (dist to hyperplane, count, graph) and the forest ist just a nearest neighbor from sklearn ''' graphs = list(griter) graphs2 = copy.deepcopy(graphs) # transform doess mess up the graph objects X = self.vectorizer.transform(graphs) forest = LSHForest() forest.fit(X) print 'got forest' heap = [] for vector, graph in zip(X, graphs2): graph2 = nx.Graph(graph) heapq.heappush( heap, ( self.sampler.estimator.predict_proba( self.sampler.vectorizer.transform_single( graph2))[0][1], # score ~ dist from hyperplane k + 1, # making sure that the counter is high so we dont output the startgraphz at the end graph)) # at last the actual graph print 'got heap' distances, unused = forest.kneighbors(X, n_neighbors=2) distances = [a[1] for a in distances ] # the second element should be the dist we want avg_dist = distances[len(distances) / 2] # sum(distances)/len(distances) print 'got dist' return heap, forest, avg_dist ''' def sample_simple(self,graphiter,iterneg): graphiter,grait,griter2 = itertools.tee(graphiter,3) self.fit_sampler(graphiter,iterneg) a,b,c=self.get_heap_and_forest( griter2, 30) grait= itertools.islice(grait,5) rez=self.sampler.sample(grait,n_samples=5, batch_size=1, n_jobs=0, n_steps=1, select_cip_max_tries=100, accept_annealing_factor=.5, generatormode=False, same_core_size=False ) return rez ''' def sample_graphs(self, graphiter, iter_neg, radius, how_many, check_k, heap_chunk_size=10): # some initialisation, # creating samper # setup heap and forest graphiter, iter2 = itertools.tee(graphiter) self.fit_sampler(iter2, iter_neg) heap, forest, avg_dist = self.get_heap_and_forest(graphiter, check_k) # heap should be like (hpdist, count, graph) radius = radius * avg_dist # so lets start the loop1ng result = [] while heap and len(result) < how_many: # pop all the graphs we want todo = [] for i in range(heap_chunk_size): if heap: todo.append(heapq.heappop(heap)) # let the sampler do the sampling graphz = [e[2] for e in todo] #draw.draw_graph_set_graphlearn(graphz) work = self.sampler.sample(graphz, batch_size=1, n_jobs=0, n_steps=30, select_cip_max_tries=100, improving_threshold=.5, generatormode=False, max_core_size_diff=False, n_samples=3) # lets see, we need to take care of # = the initialy poped stuff # - increase and check the counter, reinsert into heap # = the new graphs # put them in the heap and the forest for graph, task in zip(work, todo): graphlist = graph.graph['sampling_info']['graphs_history'] print 'rez:', graphlist, task for graph2 in graphlist: # check distance from created instances x = self.vectorizer.transform_single(graph2) dist, void = forest.kneighbors(x, 1) dist = sum(dist) # is the distance ok? # if so, insert into forest and heap if radius < dist < radius * 2: forest.partial_fit(x) heapq.heappush(heap, (graph2.graph['score'], 0, graph2)) print 'heap' print 'cant heap', radius, dist # taking care of task graph # put in result list if necessary if task[1] < check_k < task[1] + len(graphlist): result.append(task[2]) print 'found sth' # go back to the heap! 
heapq.heappush(heap, (task[0], task[1] + len(graphlist), task[2])) return result ''' def simple_fit(self,iter_pos): self.sampler= GraphLearnSampler() self.sampler.fit(iter_pos) self.estimator=self.sampler.estimator ''' def fit_sampler(self, iter_pos, iter_neg): # getting the sampler ready: self.sampler = MySampler(radius_list=[0, 1], thickness_list=[0.5, 1, 2]) iter_pos, pos, pos_ = itertools.tee(iter_pos, 3) self.estimator = self.sampler.estimatorobject.fit_2( iter_pos, iter_neg, self.sampler.vectorizer) print 'got estimeetaaa' self.sampler.local_substitutable_graph_grammar.fit( pos, grammar_n_jobs=-1, grammar_batch_size=8) self.sampler.estimator = self.estimator print 'got grammar:grammar is there oO'
class EdenEstimator(BaseEstimator, ClassifierMixin): """Build an estimator for graphs.""" def __init__(self, r=3, d=8, nbits=16, discrete=True, balance=False, subsample_size=200, ratio=2, normalization=False, inner_normalization=False, penalty='elasticnet', n_iter=500): """construct.""" self.set_params(r, d, nbits, discrete, balance, subsample_size, ratio, normalization, inner_normalization, penalty, n_iter) def set_params(self, r=3, d=8, nbits=16, discrete=True, balance=False, subsample_size=200, ratio=2, normalization=False, inner_normalization=False, penalty='elasticnet', n_iter=500): """setter.""" self.r = r self.d = d self.nbits = nbits self.normalization = normalization self.inner_normalization = inner_normalization self.discrete = discrete self.balance = balance self.subsample_size = subsample_size self.ratio = ratio if penalty == 'perceptron': self.model = Perceptron(n_iter=n_iter) else: self.model = SGDClassifier( average=True, class_weight='balanced', shuffle=True, penalty=penalty) self.vectorizer = Vectorizer( r=self.r, d=self.d, normalization=self.normalization, inner_normalization=self.inner_normalization, discrete=self.discrete, nbits=self.nbits) return self def transform(self, graphs): """transform.""" x = self.vectorizer.transform(graphs) return x @timeit def kernel_matrix(self, graphs): """kernel_matrix.""" x = self.transform(graphs) return metrics.pairwise.pairwise_kernels(x, metric='linear') @timeit def fit(self, graphs, targets, randomize=True): """fit.""" if self.balance: if randomize: bal_graphs, bal_targets = balance( graphs, targets, None, ratio=self.ratio) else: samp_graphs, samp_targets = subsample( graphs, targets, subsample_size=self.subsample_size) x = self.transform(samp_graphs) self.model.fit(x, samp_targets) bal_graphs, bal_targets = balance( graphs, targets, self, ratio=self.ratio) size = len(bal_targets) logger.debug('Dataset size=%d' % (size)) x = self.transform(bal_graphs) self.model = self.model.fit(x, bal_targets) else: x = self.transform(graphs) self.model = self.model.fit(x, targets) return self @timeit def predict(self, graphs): """predict.""" x = self.transform(graphs) preds = self.model.predict(x) return preds @timeit def decision_function(self, graphs): """decision_function.""" x = self.transform(graphs) preds = self.model.decision_function(x) return preds @timeit def cross_val_score(self, graphs, targets, scoring='roc_auc', cv=5): """cross_val_score.""" x = self.transform(graphs) scores = cross_val_score( self.model, x, targets, cv=cv, scoring=scoring) return scores @timeit def cross_val_predict(self, graphs, targets, cv=5): """cross_val_score.""" x = self.transform(graphs) scores = cross_val_predict( self.model, x, targets, cv=cv, method='decision_function') return scores @timeit def cluster(self, graphs, n_clusters=16): """cluster.""" x = self.transform(graphs) clust_est = MiniBatchKMeans(n_clusters=n_clusters) cluster_ids = clust_est.fit_predict(x) return cluster_ids @timeit def model_selection(self, graphs, targets, n_iter=30, subsample_size=None): """model_selection_randomized.""" param_distr = {"r": list(range(1, 5)), "d": list(range(0, 10))} if subsample_size: graphs, targets = subsample( graphs, targets, subsample_size=subsample_size) pool = mp.Pool() scores = pool.map(_eval, [(graphs, targets, param_distr)] * n_iter) pool.close() pool.join() best_params = max(scores)[1] logger.debug("Best parameters:\n%s" % (best_params)) self = EdenEstimator(**best_params) return self @timeit def learning_curve(self, graphs, targets, cv=5, n_steps=10, 
start_fraction=0.1): """learning_curve.""" graphs, targets = paired_shuffle(graphs, targets) x = self.transform(graphs) train_sizes = np.linspace(start_fraction, 1.0, n_steps) scoring = 'roc_auc' train_sizes, train_scores, test_scores = learning_curve( self.model, x, targets, cv=cv, train_sizes=train_sizes, scoring=scoring) return train_sizes, train_scores, test_scores def bias_variance_decomposition(self, graphs, targets, cv=5, n_bootstraps=10): """bias_variance_decomposition.""" x = self.transform(graphs) score_list = [] for i in range(n_bootstraps): scores = cross_val_score( self.model, x, targets, cv=cv) score_list.append(scores) score_list = np.array(score_list) mean_scores = np.mean(score_list, axis=1) std_scores = np.std(score_list, axis=1) return mean_scores, std_scores
class RegressorWrapper(BaseEstimator, RegressorMixin): """Regressor.""" def __init__(self, program=SGDRegressor(average=True, shuffle=True)): """Construct.""" self.program = program self.vectorizer = Vectorizer() def set_params(self, **params): """Set the parameters of this estimator. The method. Returns ------- self """ # finds parameters for the vectorizer as those that contain "__" params_vectorizer = dict() params_clusterer = dict() for param in params: if "vectorizer__" in param: key = param.split('__')[1] val = params[param] params_vectorizer[key] = val else: params_clusterer[param] = params[param] self.program.set_params(**params_clusterer) self.vectorizer.set_params(**params_vectorizer) return self def fit(self, graphs): """fit.""" try: graphs, graphs_ = tee(graphs) data_matrix = self.vectorizer.transform(graphs_) y = self._extract_targets(graphs) self.program = self.program.fit(data_matrix, y) return self except Exception as e: logger.debug('Failed iteration. Reason: %s' % e) logger.debug('Exception', exc_info=True) def predict(self, graphs): """predict.""" try: graphs, graphs_ = tee(graphs) data_matrix = self.vectorizer.transform(graphs_) predictions = self.program.predict(data_matrix) for prediction, graph in izip(predictions, graphs): graph.graph['prediction'] = prediction graph.graph['score'] = prediction yield graph except Exception as e: logger.debug('Failed iteration. Reason: %s' % e) logger.debug('Exception', exc_info=True) def _extract_targets(self, graphs): y = [] for graph in graphs: if graph.graph.get('target', None) is not None: y.append(graph.graph['target']) else: raise Exception('Missing the attribute "target" \ in graph dictionary!') y = np.ravel(y) return y
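# Hypothetical usage sketch for RegressorWrapper above; not from the original
# source. As in ClassifierWrapper, the target is read from each graph's
# graph-level 'target' attribute; here it is the (made-up) count of 'A'
# labels. Toy graphs are for illustration only.
import networkx as nx


def _toy_graph(node_labels, target):
    g = nx.Graph()
    for i, lab in enumerate(node_labels):
        g.add_node(i, label=lab)
    for i in range(len(node_labels) - 1):
        g.add_edge(i, i + 1, label='-')
    g.graph['target'] = target
    return g


train = [_toy_graph(list(s), float(s.count('A')))
         for s in ('AABB', 'ABBB', 'AAAB', 'BBBB', 'AAAA', 'ABAB')]
test = [_toy_graph(list('AABA'), 3.0)]

reg = RegressorWrapper().fit(train)
for g in reg.predict(test):
    print(g.graph['prediction'])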
class ListVectorizer(Vectorizer): """Transform vector labeled, weighted, nested graphs in sparse vectors. A list of iterators over graphs and a list of weights are taken in input. The returned vector is the linear combination of sparse vectors obtained on each corresponding graph. """ def __init__(self, complexity=3, r=None, d=None, min_r=0, min_d=0, nbits=20, normalization=True, inner_normalization=True, n=1, min_n=2): """ Arguments: complexity : int The complexity of the features extracted. r : int The maximal radius size. d : int The maximal distance size. min_r : int The minimal radius size. min_d : int The minimal distance size. nbits : int The number of bits that defines the feature space size: |feature space|=2^nbits. normalization : bool If set the resulting feature vector will have unit euclidean norm. inner_normalization : bool If set the feature vector for a specific combination of the radius and distance size will have unit euclidean norm. When used together with the 'normalization' flag it will be applied first and then the resulting feature vector will be normalized. n : int The maximal number of clusters used to discretized label vectors. min:n : int The minimal number of clusters used to discretized label vectors. """ self.vectorizer = Vectorizer(complexity=complexity, r=r, d=d, min_r=min_r, min_d=min_d, nbits=nbits, normalization=normalization, inner_normalization=inner_normalization, n=n, min_n=min_n) self.vectorizers = list() def fit(self, graphs_iterators_list): """ Constructs an approximate explicit mapping of a kernel function on the data stored in the nodes of the graphs. Arguments: graphs_iterators_list : list of iterators over networkx graphs. The data. """ for i, graphs in enumerate(graphs_iterators_list): self.vectorizers.append(copy.copy(self.vectorizer)) self.vectorizers[i].fit(graphs) def fit_transform(self, graphs_iterators_list, weights=list()): """ Arguments: graphs_iterators_list : list of iterators over networkx graphs. The data. weights : list of positive real values. Weights for the linear combination of sparse vectors obtained on each iterated tuple of graphs. """ graphs_iterators_list_fit, graphs_iterators_list_transf = itertools.tee(graphs_iterators_list) self.fit(graphs_iterators_list_fit) return self.transform(graphs_iterators_list_transf) def transform(self, graphs_iterators_list, weights=list()): """ Transforms a list of networkx graphs into a Numpy csr sparse matrix ( Compressed Sparse Row matrix ). Arguments: graphs_iterators_list : list of iterators over networkx graphs. The data. weights : list of positive real values. Weights for the linear combination of sparse vectors obtained on each iterated tuple of graphs. """ # if no weights are provided then assume unitary weight if len(weights) == 0: weights = [1] * len(graphs_iterators_list) assert(len(graphs_iterators_list) == len(weights)), 'ERROR: weights size is different than iterators size.' assert(len(filter(lambda x: x < 0, weights)) == 0), 'ERROR: weight list contains negative values.' for i, graphs in enumerate(graphs_iterators_list): if len(self.vectorizers) == 0: data_matrix_curr = self.vectorizer.transform(graphs) else: data_matrix_curr = self.vectorizers[i].transform(graphs) if i == 0: data_matrix = data_matrix_curr * weights[i] else: data_matrix = data_matrix + data_matrix_curr * weights[i] return data_matrix def similarity(self, graphs_iterators_list, ref_instance=None, weights=list()): """ This is a generator. 
""" self._reference_vec = self._convert_dict_to_sparse_matrix( self._transform(0, ref_instance)) # if no weights are provided then assume unitary weight if len(weights) == 0: weights = [1] * len(graphs_iterators_list) assert(len(graphs_iterators_list) == len(weights) ), 'ERROR: weights count is different than iterators count.' assert(len(filter(lambda x: x < 0, weights)) == 0), 'ERROR: weight list contains negative values.' try: while True: graphs = [G_iterator.next() for G_iterator in graphs_iterators_list] yield self._similarity(graphs, weights) except StopIteration: return def _similarity(self, graphs, weights=list()): # extract feature vector for i, graph in enumerate(graphs): x_curr = self.vectorizer._convert_dict_to_sparse_matrix( self.vectorizer._transform(0, graph)) if i == 0: x = x_curr * weights[i] else: x = x + x_curr * weights[i] res = self._reference_vec.dot(x.T).todense() prediction = res[0, 0] return prediction def predict(self, graphs_iterators_list, estimator=SGDClassifier(), weights=list()): """ Purpose: ---------- It outputs the estimator prediction of the vectorized graph. Arguments: estimator : scikit-learn predictor trained on data sampled from the same distribution. If None the vertex weigths are by default 1. """ self.estimator = estimator # if no weights are provided then assume unitary weight if len(weights) == 0: weights = [1] * len(graphs_iterators_list) assert(len(graphs_iterators_list) == len(weights)), 'ERROR: weights count is different than iterators count.' assert(len(filter(lambda x: x < 0, weights)) == 0), 'ERROR: weight list contains negative values.' try: while True: graphs = [G_iterator.next() for G_iterator in graphs_iterators_list] yield self._predict(graphs, weights) except StopIteration: return def _predict(self, graphs, weights=list()): # extract feature vector for i, graph in enumerate(graphs): x_curr = self.vectorizer._convert_dict_to_sparse_matrix(self.vectorizer._transform(0, graph)) if i == 0: x = x_curr * weights[i] else: x = x + x_curr * weights[i] margins = self.estimator.decision_function(x) prediction = margins[0] return prediction