def __init__(self, min_count=2, max_n_neighbors=None, r=3, d=3,
             n_landmarks=5, n_neighbors=100, n_iter=20, k_best=5,
             max_num_solutions=30, improve=True):
    """init."""
    self.improve = improve
    self.max_num = max_num_solutions
    self.n_landmarks = n_landmarks
    self.n_neighbors = n_neighbors
    self.nn_estimator = NearestNeighbors(n_neighbors=n_neighbors)
    # Un-normalized vectorizer: raw feature counts.
    self.non_norm_vec = Vectorizer(r=r, d=d,
                                   normalization=False,
                                   inner_normalization=False)
    # Normalized vectorizer: unit-norm feature vectors.
    self.vec = Vectorizer(r=r, d=d,
                          normalization=True,
                          inner_normalization=True)
    self.dist_opt = LandmarksDistanceOptimizer(r=r, d=d,
                                               min_count=min_count,
                                               max_n_neighbors=max_n_neighbors,
                                               n_iter=n_iter,
                                               k_best=k_best,
                                               improve=improve)
def __init__(self, program=None, vertex_features=True, reweight=1.0):
    """Construct."""
    self.program = program
    self.vertex_features = vertex_features
    self.reweight = reweight
    self.vectorizer = Vectorizer()
    self.params_vectorize = dict()
def __init__(self, program=None, relabel=False, reweight=1.0):
    """Construct."""
    self.program = program
    self.relabel = relabel
    self.reweight = reweight
    self.vectorizer = Vectorizer()
    self.params_vectorize = dict()
def setup(self, known_graphs=None, candidate_graphs=None):
    """Setup."""
    # Fit a nearest-neighbor predictor on the graphs in 'known_graphs'.
    parameters_priors = dict(n_neighbors=self.n_neighbors)
    parameters_priors.update(dict(vectorizer__complexity=self.complexity,
                                  vectorize__n_jobs=-1,
                                  vectorize__fit_flag=False,
                                  vectorize__n_blocks=5,
                                  vectorize__block_size=100))
    fit_wrapped_knn_predictor_known = \
        model(known_graphs,
              program=KNNWrapper(program=NearestNeighbors()),
              parameters_priors=parameters_priors)
    # Compute the distances of candidate_graphs to known_graphs.
    knn_candidate_graphs = predict(candidate_graphs,
                                   program=fit_wrapped_knn_predictor_known)
    knn_candidate_graphs = list(knn_candidate_graphs)
    self.distances_to_known_graphs = []
    for knn_candidate_graph in knn_candidate_graphs:
        distances = knn_candidate_graph.graph['distances']
        self.distances_to_known_graphs.append(distances)
    # Encode the candidate graphs as sparse feature vectors.
    self.candidate_graphs_data_matrix = \
        vectorize(candidate_graphs,
                  vectorizer=Vectorizer(complexity=self.complexity),
                  block_size=400,
                  n_jobs=-1)
def __init__(self, model=None, n_jobs=1, vectorizer=Vectorizer()):
    # Explicit None check: an estimator instance may not define truthiness.
    if model is None:
        self.model = OneClassSVM(gamma='auto')
    else:
        self.model = model
    self.n_jobs = n_jobs
    self.vectorizer = vectorizer
def __init__(self,
             pre_processor=None,
             vectorizer=Vectorizer(complexity=1),
             estimator=SGDClassifier(class_weight='balanced', shuffle=True),
             fit_vectorizer=False,
             n_jobs=4,
             n_blocks=8,
             block_size=None,
             pre_processor_n_jobs=4,
             pre_processor_n_blocks=8,
             pre_processor_block_size=None,
             description=None,
             random_state=1):
    # Deep-copy the components so the instance owns private copies.
    self.pre_processor = copy.deepcopy(pre_processor)
    self.vectorizer = copy.deepcopy(vectorizer)
    self.estimator = copy.deepcopy(estimator)
    self.pre_processor_args = None
    self.vectorizer_args = None
    self.estimator_args = None
    self.description = description
    self.fit_vectorizer = fit_vectorizer
    self.n_jobs = n_jobs
    self.n_blocks = n_blocks
    self.block_size = block_size
    self.pre_processor_n_jobs = pre_processor_n_jobs
    self.pre_processor_n_blocks = pre_processor_n_blocks
    self.pre_processor_block_size = pre_processor_block_size
    random.seed(random_state)
def __init__(self,
             estimator=SGDClassifier(),
             vectorizer=Vectorizer(),
             pre_processor=PreProcessor(),
             designer=AntaRNAv117Designer(),
             constraint_extractor=ConstraintExtractor(),
             n_synthesized_seqs_per_seed_seq=3,
             instance_score_threshold_in=0,
             instance_score_threshold_out=1,
             shuffle_order=2,
             negative_shuffle_ratio=2,
             n_jobs=-1,
             cv=3,
             n_iter_search=1):
    self.estimator = estimator
    self.vectorizer = vectorizer
    self.designer = designer
    self.pre_processor = pre_processor
    self.constraint_extractor = constraint_extractor
    self._n_synthesized_seqs_per_seed_seq = n_synthesized_seqs_per_seed_seq
    self._instance_score_threshold_in = instance_score_threshold_in
    self._instance_score_threshold_out = instance_score_threshold_out
    self._shuffle_order = shuffle_order
    self._negative_shuffle_ratio = negative_shuffle_ratio
    self._n_jobs = n_jobs
    self._cv = cv
    self._n_iter_search = n_iter_search
    logger.debug('Instantiated an RNASynth object.')
    logger.debug(self.__dict__)
def set_params(self, r=3, d=8, nbits=16, discrete=True,
               balance=False, subsample_size=200, ratio=2,
               normalization=False, inner_normalization=False,
               penalty='elasticnet', n_iter=500):
    """Set the hyperparameters and rebuild the model and vectorizer."""
    self.r = r
    self.d = d
    self.nbits = nbits
    self.normalization = normalization
    self.inner_normalization = inner_normalization
    self.discrete = discrete
    self.balance = balance
    self.subsample_size = subsample_size
    self.ratio = ratio
    if penalty == 'perceptron':
        self.model = Perceptron(n_iter=n_iter)
    else:
        self.model = SGDClassifier(average=True,
                                   class_weight='balanced',
                                   shuffle=True,
                                   penalty=penalty)
    self.vectorizer = Vectorizer(r=self.r, d=self.d,
                                 normalization=self.normalization,
                                 inner_normalization=self.inner_normalization,
                                 discrete=self.discrete,
                                 nbits=self.nbits)
    return self
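# Usage sketch (assumption: 'MyEstimator' is a placeholder for whichever
# class defines set_params above; it is not a name from the source).
# est = MyEstimator().set_params(r=2, d=5, penalty='l2', n_iter=200)
# est.model is then an averaged SGDClassifier with an l2 penalty and
# est.vectorizer a discrete, un-normalized Vectorizer hashed to 16 bits.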
def __init__(self, complexity=3, r=None, d=None, min_r=0, min_d=0,
             nbits=20, normalization=True, inner_normalization=True,
             n=1, min_n=2):
    """
    Arguments:

    complexity : int
        The complexity of the features extracted.

    r : int
        The maximal radius size.

    d : int
        The maximal distance size.

    min_r : int
        The minimal radius size.

    min_d : int
        The minimal distance size.

    nbits : int
        The number of bits that defines the feature space size:
        |feature space| = 2 ** nbits.

    normalization : bool
        If set, the resulting feature vector will have unit euclidean norm.

    inner_normalization : bool
        If set, the feature vector for a specific combination of the radius
        and distance size will have unit euclidean norm. When used together
        with the 'normalization' flag, inner normalization is applied first
        and the resulting feature vector is then normalized.

    n : int
        The maximal number of clusters used to discretize label vectors.

    min_n : int
        The minimal number of clusters used to discretize label vectors.
    """
    self.vectorizer = Vectorizer(complexity=complexity, r=r, d=d,
                                 min_r=min_r, min_d=min_d,
                                 nbits=nbits,
                                 normalization=normalization,
                                 inner_normalization=inner_normalization,
                                 n=n, min_n=min_n)
    self.vectorizers = list()
def __init__(self, r=3, d=3, multiproc=1, squared_error=False):
    """Initialize."""
    self.vec = Vectorizer(r=r, d=d,
                          normalization=False,
                          inner_normalization=False)
    self.multiproc = multiproc
    self.squared_error = squared_error
def __init__(self, program=SGDClassifier(average=True,
                                         class_weight='balanced',
                                         shuffle=True)):
    """Construct."""
    self.program = program
    self.vectorizer = Vectorizer()
    self.params_vectorize = dict()
def generate_negatives_and_fit(iterable=None,
                               negative_shuffle_ratio=None,
                               shuffle_order=None,
                               vectorizer_complexity=None):
    vectorizer = Vectorizer(complexity=vectorizer_complexity)
    iterable, iterable_neg = binary_classification_dataset_setup(
        iterable_seq=iterable,
        negative_shuffle_ratio=negative_shuffle_ratio,
        shuffle_order=shuffle_order)
    model = fit(iterable, iterable_neg, vectorizer,
                n_jobs=-1, cv=3, n_iter_search=1)
    return model
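# A minimal usage sketch (assumed input format): the (header, sequence)
# tuples below follow EDeN's usual sequence conventions and are toy data,
# not taken from the source.
def _demo_generate_negatives_and_fit():
    seqs = [('id%d' % i, 'ACGUACGUACGUACGU') for i in range(10)]
    return generate_negatives_and_fit(iterable=seqs,
                                      negative_shuffle_ratio=2,
                                      shuffle_order=2,
                                      vectorizer_complexity=3)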
def make_fold_vectorize(complexity=3, nbits=15, fold=None, boundaries=None):
    """Curry parameters in vectorizer."""
    # Note: 'map' must be the curried variant (e.g. toolz.curried.map)
    # for the single-argument call below to work.
    vec = Vectorizer(complexity=complexity, nbits=nbits)
    vectorize = curry(lambda vec, graphs: vec.transform(graphs))(vec)
    cwindow_reweight = curry(_window_reweight)(boundaries)
    fold_vectorize = compose(vectorize, map(cwindow_reweight), fold)
    return fold_vectorize
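# Usage sketch (assumptions: 'rnafold_to_eden' is a placeholder for any
# sequence-to-graph fold function, and the (start, end) tuple format for
# 'boundaries' is a guess based on the reweighting step, not the source).
# fold_vectorize = make_fold_vectorize(complexity=3, nbits=15,
#                                      fold=rnafold_to_eden,
#                                      boundaries=(10, 50))
# data_matrix = fold_vectorize(sequences)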
def __init__(self, n_landmarks=5, n_neighbors=50):
    """init."""
    self.vec = Vectorizer(r=3, d=3,
                          normalization=False,
                          inner_normalization=False)
    self.n_neighbors = n_neighbors
    self.n_landmarks = n_landmarks
def make_graphs(smiles):
    # Convert each SMILES string to an EDeN graph.
    graphs = [smiles_to_eden(smi) for smi in smiles]
    # Compute the NSPDK features and store them in a sparse matrix.
    vectorizer = Vectorizer(min_r=0, min_d=0, r=1, d=2)
    sparse = vectorizer.transform(graphs)
    return sparse
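# Usage sketch (arbitrary example molecules): each row of the returned
# sparse matrix encodes one molecule.
# feature_matrix = make_graphs(['CCO', 'c1ccccc1'])  # ethanol, benzene
# print(feature_matrix.shape)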
def vectorizer_init(self, args):
    """Set up the conversion of graphs generated by the pre_processor
    function to feature vectors.

    Returns the function used to calculate feature vectors from graphs
    prepared by the pre_processor function, together with a set of
    matching parameter choices.
    """
    vectorizer = Vectorizer()
    vectorizer_parameters = {'complexity': [2, 3, 4]}
    return vectorizer, vectorizer_parameters
def __init__(self, vectorizer=Vectorizer(), multiproc=1, squared_error=False):
    """init."""
    self.desired_distances = None
    self.reference_vecs = None
    self.vectorizer = vectorizer
    self.multiproc = multiproc
    self.squared_error = squared_error
def __init__(self,
             min_subarray_size=7,
             max_subarray_size=10,
             min_motif_count=1,
             min_cluster_size=1,
             training_size=None,
             negative_ratio=1,
             shuffle_order=2,
             n_iter_search=1,
             complexity=4,
             radius=None,
             distance=None,
             nbits=20,
             clustering_algorithm=None,
             n_jobs=4,
             n_blocks=8,
             block_size=None,
             pre_processor_n_jobs=4,
             pre_processor_n_blocks=8,
             pre_processor_block_size=None,
             random_state=1):
    self.n_jobs = n_jobs
    self.n_blocks = n_blocks
    self.block_size = block_size
    self.pre_processor_n_jobs = pre_processor_n_jobs
    self.pre_processor_n_blocks = pre_processor_n_blocks
    self.pre_processor_block_size = pre_processor_block_size
    self.training_size = training_size
    self.n_iter_search = n_iter_search
    self.complexity = complexity
    self.nbits = nbits
    # Initialize the graph and sequence vectorizers.
    self.vectorizer = Vectorizer(complexity=self.complexity,
                                 r=radius, d=distance,
                                 nbits=self.nbits)
    self.seq_vectorizer = SeqVectorizer(complexity=self.complexity,
                                        r=radius, d=distance,
                                        nbits=self.nbits)
    self.negative_ratio = negative_ratio
    self.shuffle_order = shuffle_order
    self.clustering_algorithm = clustering_algorithm
    self.min_subarray_size = min_subarray_size
    self.max_subarray_size = max_subarray_size
    self.min_motif_count = min_motif_count
    self.min_cluster_size = min_cluster_size
    self.random_state = random_state
    random.seed(random_state)
    self.motives_db = defaultdict(list)
    self.motives = []
    self.clusters = defaultdict(list)
    self.cluster_models = []
    self.importances = []
def compute_NSPDK_features():
    from eden.graph import Vectorizer
    from eden.converter.molecule.obabel import mol_file_to_iterable, obabel_to_eden
    # Assumes olfaction_prediction_path is defined at module level.
    mol_path = olfaction_prediction_path + '/data/sdf/'
    iter_mols = mol_file_to_iterable(mol_path + '/all_mol.sdf', 'sdf')
    iter_graphs = obabel_to_eden(iter_mols)
    vectorizer = Vectorizer(r=3, d=4)
    X = vectorizer.transform(iter_graphs)
    return X
def __init__(self, min_count=2, max_n_neighbors=100, r=3, d=3,
             class_discretizer=2, class_std_discretizer=1,
             similarity_discretizer=10, size_discretizer=1,
             volume_discretizer=10, n_neighbors=10, improve=True):
    """init."""
    self.improve = improve
    self.n_neighbors = n_neighbors
    self.non_norm_vec = Vectorizer(r=r, d=d,
                                   normalization=False,
                                   inner_normalization=False)
    self.vec = Vectorizer(r=r, d=d,
                          normalization=True,
                          inner_normalization=True)
    self.grammar = GrammarWrapper(radius_list=[1, 2, 3],
                                  thickness_list=[2],
                                  min_cip_count=min_count,
                                  min_interface_count=min_count,
                                  max_n_neighbors=max_n_neighbors,
                                  n_neigh_steps=1,
                                  max_neighborhood_size=max_n_neighbors)
    self.sim_cost_estimator = SimVolPredStdSizeMultiObjectiveCostEstimator(
        self.vec,
        class_discretizer=class_discretizer,
        class_std_discretizer=class_std_discretizer,
        similarity_discretizer=similarity_discretizer,
        size_discretizer=size_discretizer,
        volume_discretizer=volume_discretizer,
        improve=improve)
    self.cost_estimator = MultiObjectiveCostEstimator(self.non_norm_vec,
                                                      improve)
    self.nn_estimator = NearestNeighbors(n_neighbors=n_neighbors)
def generate_negatives_and_evaluate(iterable=None,
                                    estimator=None,
                                    negative_shuffle_ratio=None,
                                    shuffle_order=None,
                                    vectorizer_complexity=None):
    vectorizer = Vectorizer(complexity=vectorizer_complexity)
    iterable, iterable_neg = binary_classification_dataset_setup(
        iterable_seq=iterable,
        negative_shuffle_ratio=negative_shuffle_ratio,
        shuffle_order=shuffle_order)
    roc, apr = estimate(iterable, iterable_neg, estimator, vectorizer,
                        n_jobs=-1)
    return roc, apr
def _remove_similar_pairs(graphs):
    vec = Vectorizer(r=3, d=3, normalization=False, inner_normalization=False)
    x = vec.transform(graphs)
    matrix = cosine_similarity(x)
    scores = np.array([1] * len(graphs))
    # Integer division: max_num must be an int in Python 3.
    ids = min_similarity_selection(matrix,
                                   scores=scores,
                                   max_num=len(graphs) // 2)
    graphs = [graphs[i] for i in ids]
    logging.debug('similar pairs removal:%d' % len(graphs))
    return graphs
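# Usage sketch (toy data, networkx 2.x attribute syntax): duplicate path
# graphs carrying the 'label' attributes the EDeN Vectorizer expects on
# nodes and edges; roughly half of them are kept.
def _demo_remove_similar_pairs():
    import networkx as nx
    g = nx.path_graph(4)
    for n in g.nodes():
        g.nodes[n]['label'] = 'C'
    for u, v in g.edges():
        g.edges[u, v]['label'] = '1'
    graphs = [g.copy() for _ in range(6)]
    return _remove_similar_pairs(graphs)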
def __init__(self,
             transformer=None,
             vectorizer=Vectorizer(complexity=4, nbits=13),
             clustering_algo=DBSCAN(),
             distance_std_factor=2,
             min_cluster_size=2,
             random_state=1):
    """Cluster sequences according to regions of interest and structural folding.

    Parameters
    ----------
    transformer : initialized PreProcessor object
        Transforms sequences to graphs that encode secondary structure
        information and weights nucleotides according to a user defined
        list of intervals.

    vectorizer : initialized Vectorizer object
        Transforms graphs to sparse vectors.

    clustering_algo : scikit-learn clustering algorithm
        Clusters sparse vectors into a finite number of classes.

    distance_std_factor : int (default 2)
        The maximal distance allowed to join an instance to a cluster,
        expressed as the number of standard deviations below the mean
        pairwise distance.

    min_cluster_size : int (default 2)
        Minimal size of any cluster.

    random_state : int (default 1)
        Random seed.

    Attributes
    ----------
    predictions : list(int)
        List of cluster ids, one per instance.

    clusters : defaultdict(list)
        Dictionary with a cluster id as key and a list of sequences as value.

    data_matrix : scipy sparse matrix (Compressed Sparse Row matrix)
        Sparse vectors resulting from the transformation of sequences
        into structures.
    """
    self.name = self.__class__.__name__
    self.transformer = transformer
    self.vectorizer = vectorizer
    self.clustering_algo = clustering_algo
    self.distance_std_factor = distance_std_factor
    self.min_cluster_size = min_cluster_size
    self.clusters = defaultdict(list)
    self.predictions = list()
    self.data_matrix = None
    self.random_state = random_state
    random.seed(self.random_state)
def __init__(self, min_count=2, max_n_neighbors=100, r=3, d=3):
    """init."""
    self.vec = Vectorizer(r=r, d=d,
                          normalization=False,
                          inner_normalization=False)
    self.grammar = GrammarWrapper(radius_list=[1, 2, 3],
                                  thickness_list=[2],
                                  min_cip_count=min_count,
                                  min_interface_count=min_count,
                                  max_n_neighbors=max_n_neighbors,
                                  n_neigh_steps=1,
                                  max_neighborhood_size=max_n_neighbors)
def _vectorize_graphs(self, graphs):
    """Vectorize the RNAplfold graphs using EDeN."""
    if self.verbose:
        print("Vectorizing (complexity: %i, hashing: %i bits)..." %
              (self.complexity, self.nbits), end=' ')
        sys.stdout.flush()
    vec = Vectorizer(complexity=self.complexity, nbits=self.nbits)
    x_sparse = eden_vectorize(graphs, vectorizer=vec, n_jobs=self.njobs)
    if self.verbose:
        print("Done.\n")
        sys.stdout.flush()
    return x_sparse.todense()
def smiles2nspdk(input_path, complexity, nbits, save_path):
    """Convert SMILES strings to NSPDK descriptors.

    :param input_path: path to file with SMILES
    :param complexity: descriptor complexity
    :param nbits: number of bits of the descriptor
    :param save_path: path of the output file for the dense feature matrix
    """
    vec = Vectorizer(complexity=complexity, nbits=nbits)
    smiles_list = load_dataset(input_path)
    res = vec.transform(list(smiles_strings_to_nx(smiles_list))).todense()
    # Use a context manager so the output file is always closed.
    with open(save_path, "w") as output:
        for row in res:
            np.savetxt(output, row)
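# Usage sketch (hypothetical file paths):
# smiles2nspdk('molecules.smi', complexity=3, nbits=15,
#              save_path='molecules_nspdk.txt')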
def __init__(self, radius_list=None, thickness_list=None, min_cip_count=3,
             vectorizer=Vectorizer(complexity=3), min_interface_count=2,
             nbit=20, node_entity_check=lambda x, y: True):
    self.productions = {}
    self.min_interface_count = min_interface_count
    self.radius_list = radius_list
    self.thickness_list = thickness_list
    self.min_cip_count = min_cip_count
    self.vectorizer = vectorizer
    self.hash_bitmask = 2 ** nbit - 1
    self.nbit = nbit
    # Checked when extracting the grammar; see graphtools.
    self.node_entity_check = node_entity_check
    self.prep_is_outdated = True
def vectorize(self, g):
    """Vectorize graph nodes.

    Return a matrix whose rows are the vectors that represent the nodes.
    """
    vec = Vectorizer(nbits=self.nbits, d=self.d, r=self.r,
                     discrete=self.discrete)
    M = vec.vertex_transform([g])[0]
    # Sum the rows that correspond to the L copies of each node.
    M_reduce = []
    for idx in range(self.n_nodes):
        row = M[idx, :]
        for l in range(1, self.L):
            row = row + M[idx + l * self.n_nodes, :]
        M_reduce.append(row)
    M = vstack(M_reduce)
    return M
def _outliers(graphs, k=3):
    # A graph is an outlier if it does not appear in the k-neighborhood of
    # any of its own k nearest neighbors (no mutual neighbor relation).
    vec = Vectorizer(r=3, d=3, normalization=False, inner_normalization=False)
    x = vec.transform(graphs)
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(x)
    neighbors = knn.kneighbors(x, return_distance=False)
    outlier_list = []
    non_outlier_list = []
    for i, ns in enumerate(neighbors):
        is_outlier = True
        for n in ns[1:]:
            if i in list(neighbors[n, :]):
                is_outlier = False
                break
        if is_outlier:
            outlier_list.append(i)
        else:
            non_outlier_list.append(i)
    return outlier_list, non_outlier_list
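# Usage sketch (toy data, networkx 2.x attribute syntax): among identical
# path graphs, a single star graph is expected to come out as an outlier.
def _demo_outliers():
    import networkx as nx

    def labeled(g):
        for n in g.nodes():
            g.nodes[n]['label'] = 'A'
        for u, v in g.edges():
            g.edges[u, v]['label'] = '-'
        return g

    graphs = [labeled(nx.path_graph(5)) for _ in range(5)]
    graphs.append(labeled(nx.star_graph(5)))
    return _outliers(graphs, k=3)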
def __init__(self, radius_list=[0, 1], thickness_list=[1, 2], grammar=None,
             core_interface_pair_remove_threshold=2,
             interface_remove_threshold=2,
             complexity=3,
             vectorizer=Vectorizer(complexity=3),
             estimator=estimator_wrapper.estimator_wrapper()):
    self.complexity = complexity
    self.feasibility_checker = FeasibilityChecker()
    self.postprocessor = processing.PostProcessor()
    self.vectorizer = vectorizer
    # Lists of int: radii and thicknesses are stored doubled internally.
    self.radius_list = [int(2 * r) for r in radius_list]
    self.thickness_list = [int(2 * t) for t in thickness_list]
    # Scikit-learn classifier.
    self.estimatorobject = estimator
    # Grammar object.
    self.local_substitutable_graph_grammar = grammar
    # CIP hashes will be masked with this; it is unrelated to the vectorizer.
    self.hash_bitmask = pow(2, 20) - 1
    # Save the current graph at every interval-th step of sampling and
    # attach it to the graph's info.
    self.sampling_interval = None
    # How many sampling steps are done.
    self.n_steps = None
    # Current step in the sampling process of a single graph.
    self.step = None
    # How often we try to get a CIP from the current graph during sampling.
    self.select_cip_max_tries = None
    # Sample path.
    self.sample_path = None
    self.local_substitutable_graph_grammar = LocalSubstitutableGraphGrammar(
        self.radius_list,
        self.thickness_list,
        complexity=self.complexity,
        cip_remove_threshold=core_interface_pair_remove_threshold,
        interface_remove_threshold=interface_remove_threshold,
        nbit=20)