Example #1
 def __init__(self,
              min_count=2,
              max_n_neighbors=None,
              r=3,
              d=3,
              n_landmarks=5,
              n_neighbors=100,
              n_iter=20,
              k_best=5,
              max_num_solutions=30,
              improve=True):
     """init."""
     self.improve = improve
     self.max_num = max_num_solutions
     self.n_landmarks = n_landmarks
     self.n_neighbors = n_neighbors
     self.nn_estimator = NearestNeighbors(n_neighbors=n_neighbors)
     self.non_norm_vec = Vectorizer(r=r,
                                    d=d,
                                    normalization=False,
                                    inner_normalization=False)
     self.vec = Vectorizer(r=r,
                           d=d,
                           normalization=True,
                           inner_normalization=True)
     self.dist_opt = LandmarksDistanceOptimizer(
         r=r,
         d=d,
         min_count=min_count,
         max_n_neighbors=max_n_neighbors,
         n_iter=n_iter,
         k_best=k_best,
         improve=improve)
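A minimal usage sketch for the vectorizers configured above; the list graphs (EDeN-compatible networkx graphs) is an assumption, not part of the original example:

    # Sketch only: the normalized vectorizer yields unit-norm rows suited to
    # cosine-style comparison; the non-normalized one keeps raw counts.
    from eden.graph import Vectorizer
    vec = Vectorizer(r=3, d=3, normalization=True, inner_normalization=True)
    X = vec.transform(graphs)  # scipy sparse matrix, one row per graph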
Example #2
 def __init__(self, program=None, vertex_features=True, reweight=1.0):
     """Construct."""
     self.program = program
     self.vertex_features = vertex_features
     self.reweight = reweight
     self.vectorizer = Vectorizer()
     self.params_vectorize = dict()
Example #3
 def __init__(self, program=None, relabel=False, reweight=1.0):
     """Construct."""
     self.program = program
     self.relabel = relabel
     self.reweight = reweight
     self.vectorizer = Vectorizer()
     self.params_vectorize = dict()
Example #4
 def setup(self, known_graphs=None, candidate_graphs=None):
     """Setup."""
      # compute the nearest neighbors for the 'candidate_graphs' w.r.t. the
      # known graphs in the list 'known_graphs'
     parameters_priors = dict(n_neighbors=self.n_neighbors)
     parameters_priors.update(dict(vectorizer__complexity=self.complexity,
                                   vectorize__n_jobs=-1,
                                   vectorize__fit_flag=False,
                                   vectorize__n_blocks=5,
                                   vectorize__block_size=100))
     fit_wrapped_knn_predictor_known = \
         model(known_graphs,
               program=KNNWrapper(program=NearestNeighbors()),
               parameters_priors=parameters_priors)
     # compute distances of candidate_graphs to known_graphs
     knn_candidate_graphs = predict(candidate_graphs,
                                    program=fit_wrapped_knn_predictor_known)
     knn_candidate_graphs = list(knn_candidate_graphs)
     self.distances_to_known_graphs = []
     for knn_candidate_graph in knn_candidate_graphs:
         distances = knn_candidate_graph.graph['distances']
         self.distances_to_known_graphs.append(distances)
     # compute candidate_graphs encodings
     self.candidate_graphs_data_matrix = \
         vectorize(candidate_graphs,
                   vectorizer=Vectorizer(complexity=self.complexity),
                   block_size=400, n_jobs=-1)
Example #5
 def __init__(self, model=None, n_jobs=1, vectorizer=Vectorizer()):
     if not model:
         self.model = OneClassSVM(gamma='auto')
     else:
         self.model = model
     self.n_jobs = n_jobs
     self.vectorizer = vectorizer
Example #6
 def __init__(self,
              pre_processor=None,
              vectorizer=Vectorizer(complexity=1),
              estimator=SGDClassifier(class_weight='auto', shuffle=True),
              fit_vectorizer=False,
              n_jobs=4,
              n_blocks=8,
              block_size=None,
              pre_processor_n_jobs=4,
              pre_processor_n_blocks=8,
              pre_processor_block_size=None,
              description=None,
              random_state=1):
     self.pre_processor = copy.deepcopy(pre_processor)
     self.vectorizer = copy.deepcopy(vectorizer)
     self.estimator = copy.deepcopy(estimator)
     self.pre_processor_args = None
     self.vectorizer_args = None
     self.estimator_args = None
     self.description = description
     self.fit_vectorizer = fit_vectorizer
     self.n_jobs = n_jobs
     self.n_blocks = n_blocks
     self.block_size = block_size
     self.pre_processor_n_jobs = pre_processor_n_jobs
     self.pre_processor_n_blocks = pre_processor_n_blocks
     self.pre_processor_block_size = pre_processor_block_size
     random.seed(random_state)
Example #7
    def __init__(self,
                 estimator=SGDClassifier(),
                 vectorizer=Vectorizer(),
                 pre_processor=PreProcessor(),
                 designer=AntaRNAv117Designer(),
                 constraint_extractor=ConstraintExtractor(),
                 n_synthesized_seqs_per_seed_seq=3,
                 instance_score_threshold_in=0,
                 instance_score_threshold_out=1,
                 shuffle_order=2,
                 negative_shuffle_ratio=2,
                 n_jobs=-1,
                 cv=3,
                 n_iter_search=1):

        self.estimator = estimator
        self.vectorizer = vectorizer
        self.designer = designer
        self.pre_processor = pre_processor
        self.constraint_extractor = constraint_extractor

        self._n_synthesized_seqs_per_seed_seq = n_synthesized_seqs_per_seed_seq
        self._instance_score_threshold_in = instance_score_threshold_in
        self._instance_score_threshold_out = instance_score_threshold_out
        self._shuffle_order = shuffle_order
        self._negative_shuffle_ratio = negative_shuffle_ratio
        self._n_jobs = n_jobs
        self._cv = cv
        self._n_iter_search = n_iter_search

        logger.debug('Instantiated an RNASynth object.')
        logger.debug(self.__dict__)
Example #8
 def set_params(self, r=3, d=8, nbits=16, discrete=True,
                balance=False, subsample_size=200, ratio=2,
                normalization=False, inner_normalization=False,
                penalty='elasticnet', n_iter=500):
     """setter."""
     self.r = r
     self.d = d
     self.nbits = nbits
     self.normalization = normalization
     self.inner_normalization = inner_normalization
     self.discrete = discrete
     self.balance = balance
     self.subsample_size = subsample_size
     self.ratio = ratio
     if penalty == 'perceptron':
         self.model = Perceptron(n_iter=n_iter)
     else:
         self.model = SGDClassifier(
             average=True, class_weight='balanced', shuffle=True,
             penalty=penalty)
     self.vectorizer = Vectorizer(
         r=self.r, d=self.d,
         normalization=self.normalization,
         inner_normalization=self.inner_normalization,
         discrete=self.discrete,
         nbits=self.nbits)
     return self
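Because set_params returns self, configuration can be chained in scikit-learn style; a hedged sketch (the class name EdenEstimator is hypothetical):

    # EdenEstimator is a hypothetical class exposing the set_params above.
    est = EdenEstimator().set_params(r=2, d=5, penalty='l2')
    # est.model and est.vectorizer are now configured consistently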
Example #9
    def __init__(self,
                 complexity=3,
                 r=None,
                 d=None,
                 min_r=0,
                 min_d=0,
                 nbits=20,
                 normalization=True,
                 inner_normalization=True,
                 n=1,
                 min_n=2):
        """
        Arguments:

        complexity : int
          The complexity of the features extracted.

        r : int
          The maximal radius size.

        d : int
          The maximal distance size.

        min_r : int
          The minimal radius size.

        min_d : int
          The minimal distance size.

        nbits : int
          The number of bits that defines the feature space size: |feature space|=2^nbits.

        normalization : bool
          If set, the resulting feature vector will have unit Euclidean norm.

        inner_normalization : bool
          If set, the feature vector for a specific combination of radius and
          distance size will have unit Euclidean norm. When used together with
          the 'normalization' flag, inner normalization is applied first and
          the resulting feature vector is then normalized.

        n : int
          The maximal number of clusters used to discretize label vectors.

        min_n : int
          The minimal number of clusters used to discretize label vectors.
        """
        self.vectorizer = Vectorizer(complexity=complexity,
                                     r=r,
                                     d=d,
                                     min_r=min_r,
                                     min_d=min_d,
                                     nbits=nbits,
                                     normalization=normalization,
                                     inner_normalization=inner_normalization,
                                     n=n,
                                     min_n=min_n)
        self.vectorizers = list()
Example #10
 def __init__(self, r=3, d=3, multiproc=1, squared_error=False):
     """Initialize."""
     self.vec = Vectorizer(r=r,
                           d=d,
                           normalization=False,
                           inner_normalization=False)
     self.multiproc = multiproc
     self.squared_error = squared_error
Example #11
 def __init__(self,
              program=SGDClassifier(average=True,
                                    class_weight='balanced',
                                    shuffle=True)):
     """Construct."""
     self.program = program
     self.vectorizer = Vectorizer()
     self.params_vectorize = dict()
Example #12
def generate_negatives_and_fit(iterable=None,
                               negative_shuffle_ratio=None,
                               shuffle_order=None,
                               vectorizer_complexity=None):
    vectorizer = Vectorizer(complexity=vectorizer_complexity)
    iterable, iterable_neg = binary_classification_dataset_setup(
        iterable_seq=iterable, negative_shuffle_ratio=negative_shuffle_ratio, shuffle_order=shuffle_order)
    model = fit(iterable, iterable_neg, vectorizer,
                n_jobs=-1, cv=3, n_iter_search=1)
    return model
Example #13
def make_fold_vectorize(complexity=3, nbits=15, fold=None, boundaries=None):
    """Curry parameters in vectorizer."""
    vec = Vectorizer(complexity=complexity, nbits=nbits)
    vectorize = curry(lambda vec, graphs: vec.transform(graphs))(vec)

    cwindow_reweight = curry(_window_reweight)(boundaries)
    fold_vectorize = compose(vectorize, map(cwindow_reweight), fold)
    return fold_vectorize
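A hedged usage sketch of the curried pipeline; my_fold (a callable mapping sequences to graphs) and seqs are hypothetical:

    # my_fold and seqs are assumptions for illustration only.
    fold_vectorize = make_fold_vectorize(complexity=3, nbits=15,
                                         fold=my_fold, boundaries=[(0, 10)])
    X = fold_vectorize(seqs)  # fold, reweight each window, then vectorize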
Example #14
 def __init__(self, n_landmarks=5, n_neighbors=50):
     """init."""
     self.vec = Vectorizer(r=3,
                           d=3,
                           normalization=False,
                           inner_normalization=False)
     self.n_neighbors = n_neighbors
     self.n_landmarks = n_landmarks
Example #15
def make_graphs(smiles):
    # Convert from SMILES to EdEN format
    eden_graph_generator = [smiles_to_eden(smi) for smi in smiles]
    # Compute graphs for each molecule
    graphs = [graph for graph in eden_graph_generator]
    vectorizer = Vectorizer(min_r=0, min_d=0, r=1, d=2)
    # Compute the NSPDK features and store them in a sparse matrix
    sparse = vectorizer.transform(graphs)
    return sparse
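A hedged usage sketch with two toy SMILES strings:

    # Sketch only: ethanol and benzene as inputs.
    X = make_graphs(['CCO', 'c1ccccc1'])  # one sparse NSPDK row per molecule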
Example #16
    def vectorizer_init(self, args):
        """Setup the conversion of graphs generated by the the pre_processor function to feature vectors.

        Returns the function used to calculate feature vectors from graphs prepared by the
        pre_processor function and a set of matching parameter choices.
        """
        vectorizer = Vectorizer()
        vectorizer_parameters = {'complexity': [2, 3, 4]}
        return vectorizer, vectorizer_parameters
Example #17
 def __init__(self,
              vectorizer=Vectorizer(),
              multiproc=1,
              squared_error=False):
     """init."""
     self.desired_distances = None
     self.reference_vecs = None
     self.vectorizer = vectorizer
     self.multiproc = multiproc
     self.squared_error = squared_error
Example #18
    def __init__(self,
                 min_subarray_size=7,
                 max_subarray_size=10,
                 min_motif_count=1,
                 min_cluster_size=1,
                 training_size=None,
                 negative_ratio=1,
                 shuffle_order=2,
                 n_iter_search=1,
                 complexity=4,
                 radius=None,
                 distance=None,
                 nbits=20,
                 clustering_algorithm=None,
                 n_jobs=4,
                 n_blocks=8,
                 block_size=None,
                 pre_processor_n_jobs=4,
                 pre_processor_n_blocks=8,
                 pre_processor_block_size=None,
                 random_state=1):
        self.n_jobs = n_jobs
        self.n_blocks = n_blocks
        self.block_size = block_size
        self.pre_processor_n_jobs = pre_processor_n_jobs
        self.pre_processor_n_blocks = pre_processor_n_blocks
        self.pre_processor_block_size = pre_processor_block_size
        self.training_size = training_size
        self.n_iter_search = n_iter_search
        self.complexity = complexity
        self.nbits = nbits
        # init vectorizer
        self.vectorizer = Vectorizer(complexity=self.complexity,
                                     r=radius,
                                     d=distance,
                                     nbits=self.nbits)
        self.seq_vectorizer = SeqVectorizer(complexity=self.complexity,
                                            r=radius,
                                            d=distance,
                                            nbits=self.nbits)
        self.negative_ratio = negative_ratio
        self.shuffle_order = shuffle_order
        self.clustering_algorithm = clustering_algorithm
        self.min_subarray_size = min_subarray_size
        self.max_subarray_size = max_subarray_size
        self.min_motif_count = min_motif_count
        self.min_cluster_size = min_cluster_size
        self.random_state = random_state
        random.seed(random_state)

        self.motives_db = defaultdict(list)
        self.motives = []
        self.clusters = defaultdict(list)
        self.cluster_models = []
        self.importances = []
Example #19
def compute_NSPDK_features():
    import eden
    from eden.graph import Vectorizer
    from eden.converter.molecule.obabel import mol_file_to_iterable, obabel_to_eden
    mol_path = olfaction_prediction_path + '/data/sdf/'
    iter_mols = mol_file_to_iterable(mol_path + '/all_mol.sdf', 'sdf')
    iter_graphs = obabel_to_eden(iter_mols)

    vectorizer = Vectorizer(r=3, d=4)
    X = vectorizer.transform(iter_graphs)
    return X
Example #20
 def __init__(self,
              min_count=2,
              max_n_neighbors=100,
              r=3,
              d=3,
              class_discretizer=2,
              class_std_discretizer=1,
              similarity_discretizer=10,
              size_discretizer=1,
              volume_discretizer=10,
              n_neighbors=10,
              improve=True):
     """init."""
     self.improve = improve
     self.n_neighbors = n_neighbors
     self.non_norm_vec = Vectorizer(r=r,
                                    d=d,
                                    normalization=False,
                                    inner_normalization=False)
     self.vec = Vectorizer(r=r,
                           d=d,
                           normalization=True,
                           inner_normalization=True)
     self.grammar = GrammarWrapper(radius_list=[1, 2, 3],
                                   thickness_list=[2],
                                   min_cip_count=min_count,
                                   min_interface_count=min_count,
                                   max_n_neighbors=max_n_neighbors,
                                   n_neigh_steps=1,
                                   max_neighborhood_size=max_n_neighbors)
     self.sim_cost_estimator = SimVolPredStdSizeMultiObjectiveCostEstimator(
         self.vec,
         class_discretizer=class_discretizer,
         class_std_discretizer=class_std_discretizer,
         similarity_discretizer=similarity_discretizer,
         size_discretizer=size_discretizer,
         volume_discretizer=volume_discretizer,
         improve=improve)
     self.cost_estimator = MultiObjectiveCostEstimator(
         self.non_norm_vec, improve)
     self.nn_estimator = NearestNeighbors(n_neighbors=n_neighbors)
Example #21
def generate_negatives_and_evaluate(iterable=None,
                                    estimator=None,
                                    negative_shuffle_ratio=None,
                                    shuffle_order=None,
                                    vectorizer_complexity=None):

    vectorizer = Vectorizer(complexity=vectorizer_complexity)
    iterable, iterable_neg = binary_classification_dataset_setup(
        iterable_seq=iterable, negative_shuffle_ratio=negative_shuffle_ratio, shuffle_order=shuffle_order)
    roc, apr = estimate(iterable, iterable_neg,
                        estimator, vectorizer, n_jobs=-1)
    return roc, apr
Example #22
def _remove_similar_pairs(graphs):
    vec = Vectorizer(r=3, d=3,
                     normalization=False, inner_normalization=False)
    x = vec.transform(graphs)
    matrix = cosine_similarity(x)
    scores = np.array([1] * len(graphs))
    ids = min_similarity_selection(matrix,
                                   scores=scores,
                                   max_num=len(graphs) // 2)
    graphs = [graphs[i] for i in ids]
    logging.debug('similar pairs removal:%d' % len(graphs))
    return graphs
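A hedged sketch of where such a filter fits: pruning near-duplicate graphs before vectorization so the data set is not dominated by clones (graphs is assumed):

    # Sketch only: filter first, then compute features on the survivors.
    graphs = _remove_similar_pairs(graphs)
    X = Vectorizer(r=3, d=3).transform(graphs)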
Example #23
    def __init__(self,
                 transformer=None,
                 vectorizer=Vectorizer(complexity=4, nbits=13),
                 clustering_algo=DBSCAN(),
                 distance_std_factor=2,
                 min_cluster_size=2,
                 random_state=1):
        """Cluster sequences according to regions of interest and structural folding.

        Parameters
        ----------
        transformer : initialized PreProcessor object
            Transforms sequences to graphs that encode secondary structure information
            and weights nucleotides according to user defined list of intervals.

        vectorizer : initialized Vectorizer object
            Transforms graphs to sparse vectors.

        clustering_algo : scikit-learn clustering algorithm
            Clusters sparse vectors in a finite number of classes.

        distance_std_factor : int (default 2)
            The maximal distance allowed for joining an instance to a cluster,
            expressed as the mean pairwise distance minus this many standard
            deviations.

        min_cluster_size : int (default 2)
            Minimal size of any cluster.

        random_state: int (default 1)
            Random seed.

        Attributes
        ----------
        predictions : list(int)
            List of cluster ids, one per instance.

        clusters : defaultdict(list)
            Dictionary with cluster id as key and list of sequences as value.

        data_matrix : Scipy sparse matrix (Compressed Sparse Row matrix)
            List of sparse vectors resulting from the transformation of sequences into structures.
        """
        self.name = self.__class__.__name__
        self.transformer = transformer
        self.vectorizer = vectorizer
        self.clustering_algo = clustering_algo
        self.distance_std_factor = distance_std_factor
        self.min_cluster_size = min_cluster_size
        self.clusters = defaultdict(list)
        self.predictions = list()
        self.data_matrix = None
        self.random_state = random_state
        random.seed(self.random_state)
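A hedged sketch of the flow these attributes support: vectorize, then cluster the sparse rows (graphs is assumed; scikit-learn's DBSCAN accepts CSR input):

    # Sketch only: sparse NSPDK features fed to a density-based clusterer.
    from sklearn.cluster import DBSCAN
    X = Vectorizer(complexity=4, nbits=13).transform(graphs)
    predictions = DBSCAN().fit_predict(X)  # one cluster id per row, -1 = noise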
Example #24
 def __init__(self, min_count=2, max_n_neighbors=100, r=3, d=3):
     """init."""
     self.vec = Vectorizer(r=r,
                           d=d,
                           normalization=False,
                           inner_normalization=False)
     self.grammar = GrammarWrapper(radius_list=[1, 2, 3],
                                   thickness_list=[2],
                                   min_cip_count=min_count,
                                   min_interface_count=min_count,
                                   max_n_neighbors=max_n_neighbors,
                                   n_neigh_steps=1,
                                   max_neighborhood_size=max_n_neighbors)
Example #25
    def _vectorize_graphs(self, graphs):
        """Vectorize the RNAplfold graphs using EDeN."""
        if self.verbose:
            print("Vectorizing (complexity: %i, hashing: %i bits)..." %
                  (self.complexity, self.nbits),
                  end=' ')
            sys.stdout.flush()

        vec = Vectorizer(complexity=self.complexity, nbits=self.nbits)
        x_sparse = eden_vectorize(graphs, vectorizer=vec, n_jobs=self.njobs)

        if self.verbose:
            print("Done.\n")
            sys.stdout.flush()
        return x_sparse.todense()
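For comparison, a hedged single-process equivalent of the parallel eden_vectorize call above (the parameter values and graphs are placeholders):

    # Sketch only: same features, no multiprocessing.
    vec = Vectorizer(complexity=3, nbits=15)
    x_sparse = vec.transform(graphs)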
Example #26
def smiles2nspdk(input_path, complexity, nbits, save_path):
    """
    Smiles strings to nspdk descriptors
    :param input_path: path to file with SMILES
    :param complexity: descriptor complexity
    :param nbits: bits of descriptor
    :param save_path:
    :return:
    """
    vec = Vectorizer(complexity=complexity, nbits=nbits)
    smiles_list = load_dataset(input_path)
    res = vec.transform(list(smiles_strings_to_nx(smiles_list))).todense()
    with open(save_path, "w") as output:
        for row in res:
            np.savetxt(output, row)
Example #27
 def __init__(self,
              radius_list=None,
              thickness_list=None,
              min_cip_count=3,
              vectorizer=Vectorizer(complexity=3),
              min_interface_count=2,
              nbit=20,
              node_entity_check=lambda x, y: True):
     self.productions = {}
     self.min_interface_count = min_interface_count
     self.radius_list = radius_list
     self.thickness_list = thickness_list
     self.min_cip_count = min_cip_count
     self.vectorizer = vectorizer
     self.hash_bitmask = 2**nbit - 1
     self.nbit = nbit
     # checked when extracting grammar. see graphtools
     self.node_entity_check = node_entity_check
     self.prep_is_outdated = True
Example #28
    def vectorize(self, g):
        """ Vectorize graph nodes
        
        Return: a matrix in which rows are the vectors that represents for nodes        
        """

        vec = Vectorizer(nbits=self.nbits,
                         d=self.d,
                         r=self.r,
                         discrete=self.discrete)

        M = vec.vertex_transform([g])[0]
        M_reduce = []
        for idx in range(self.n_nodes):
            vec = M[idx, :]
            for l in range(1, self.L):
                vec = vec + M[idx + l * self.n_nodes, :]
            M_reduce.append(vec)
        M = vstack(M_reduce)
        return M
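A hedged sketch of calling this method; model (an instance configured with nbits, d, r, discrete, n_nodes and L) and the graph g are hypothetical:

    # model and g are assumptions for illustration only.
    M = model.vectorize(g)
    print(M.shape)  # (n_nodes, n_features): one summed row per node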
Example #29
def _outliers(graphs, k=3):
    vec = Vectorizer(r=3, d=3,
                     normalization=False, inner_normalization=False)
    x = vec.transform(graphs)
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(x)
    neighbors = knn.kneighbors(x, return_distance=False)
    outlier_list = []
    non_outlier_list = []
    for i, ns in enumerate(neighbors):
        not_outlier = False
        for n in ns[1:]:
            if i in list(neighbors[n, :]):
                not_outlier = True
                break
        if not_outlier is False:
            outlier_list.append(i)
        else:
            non_outlier_list.append(i)
    return outlier_list, non_outlier_list
Example #30
    def __init__(self,
                 radius_list=[0, 1],
                 thickness_list=[1, 2],
                 grammar=None,
                 core_interface_pair_remove_threshold=2,
                 interface_remove_threshold=2,
                 complexity=3,
                 vectorizer=Vectorizer(complexity=3),
                 estimator=estimator_wrapper.estimator_wrapper()):

        self.complexity = complexity
        self.feasibility_checker = FeasibilityChecker()
        self.postprocessor = processing.PostProcessor()
        self.vectorizer = vectorizer
        # lists of int
        self.radius_list = [int(2 * r) for r in radius_list]
        self.thickness_list = [int(2 * t) for t in thickness_list]
        # scikit-learn classifier
        self.estimatorobject = estimator
        # grammar object
        self.local_substitutable_graph_grammar = grammar
        # CIP hashes will be masked with this; it is unrelated to the vectorizer
        self.hash_bitmask = pow(2, 20) - 1
        # the current graph is saved at every sampling_interval-th step and attached to graphinfos[graphs]
        self.sampling_interval = None
        # how many sampling steps are done
        self.n_steps = None
        # current step in the sampling process of a single graph
        self.step = None
        # how often we try to extract a CIP from the current graph during sampling
        self.select_cip_max_tries = None
        # sample path
        self.sample_path = None

        self.local_substitutable_graph_grammar = LocalSubstitutableGraphGrammar(
            self.radius_list,
            self.thickness_list,
            complexity=self.complexity,
            cip_remove_threshold=core_interface_pair_remove_threshold,
            interface_remove_threshold=interface_remove_threshold,
            nbit=20)