Example #1
0
def get_data(assay_id, pos_count=100,neg_count=100, selectall=False):
    '''
    :param assay_id:
    :param pos_count:
    :param neg_count:
    :return:
    pos/neg count restrict the number of graphs that are returned
    '''
    active_X = pipe(assay_id, download_active, babel_load, vectorize)
    inactive_X = pipe(assay_id, download_inactive, babel_load, vectorize)
    X = vstack((active_X, inactive_X))
    y = np.array([1] * active_X.shape[0] + [-1] * inactive_X.shape[0])
    esti = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=4, loss='log')
    esti.fit(X,y)

    if not selectall:
        select_p= lambda x: selection_iterator(x,np.random.choice(active_X.shape[0], pos_count, replace=False))
        select_n= lambda x: selection_iterator(x,np.random.choice(inactive_X.shape[0], neg_count, replace=False))
        print "selecting pos graphs: %d/%d neg graphs in set %d/%d"  % (pos_count, active_X.shape[0], neg_count, inactive_X.shape[0])
    else:
        select_p = lambda x:x
        select_n = lambda x:x

    graphs_p = list(pipe(assay_id, download_active,load_sdf, select_p,lambda x: map(rdkmol_to_nx,x)))
    graphs_n = list(pipe(assay_id, download_active,load_sdf, select_n,lambda x: map(rdkmol_to_nx,x)))

    print {'active':active_X.shape[0], 'inactive':inactive_X.shape[0]}
    return X, y, graphs_p, graphs_n,esti
Example #2
0
File: model.py  Project: teresa-m/EDeN
 def _select_data_matrices(self, iterable_pos, iterable_neg,
                           n_active_learning_iterations=2,
                           size_positive=-1,
                           size_negative=100,
                           lower_bound_threshold_positive=-1,
                           upper_bound_threshold_positive=1,
                           lower_bound_threshold_negative=-1,
                           upper_bound_threshold_negative=1):
     """Build (data_matrix, y) by iterative active-learning selection.

     Starting from the first `size_*` instances of each class, repeatedly
     vectorize the current selection, fit `self.estimator`, and use
     `self._bounded_selection` to pick the next round's ids. The estimator
     is NOT refit on the final round: the loop breaks right after assembling
     the matrix, so the returned data reflects the last selection.

     :param iterable_pos: iterable of positive instances (split via tee each round).
     :param iterable_neg: iterable of negative instances (split via tee each round).
     :param n_active_learning_iterations: number of selection rounds.
     :param size_positive: positives to select per round; -1 keeps all.
     :param size_negative: negatives to select per round; -1 keeps all.
     :param lower_bound_threshold_positive: forwarded to _bounded_selection (positives).
     :param upper_bound_threshold_positive: forwarded to _bounded_selection (positives).
     :param lower_bound_threshold_negative: forwarded to _bounded_selection (negatives).
     :param upper_bound_threshold_negative: forwarded to _bounded_selection (negatives).
     :return: tuple (data_matrix, y) from the final round.

     NOTE(review): if n_active_learning_iterations <= 0 the loop never runs
     and `data_matrix`/`y` are unbound at the return — presumably callers
     always pass a positive count; confirm.
     """
     # select the initial ids simply as the first occurrences
     if size_positive != -1:
         positive_ids = range(size_positive)
     if size_negative != -1:
         negative_ids = range(size_negative)
     # iterate: select instances according to current model and create novel
     # data matrix to fit the model in next round
     for i in range(n_active_learning_iterations):
         # make data matrix on selected instances
         # if this is the first iteration or we need to select positives
         if i == 0 or size_positive != -1:
             # tee into 3: one copy for the next round, one consumed now,
             # one kept for the _bounded_selection call below
             iterable_pos, iterable_pos_, iterable_pos__ = tee(iterable_pos, 3)
             if size_positive == -1:  # if we take all positives
                 data_matrix_pos = self._data_matrix(iterable_pos_, fit_vectorizer=self.fit_vectorizer)
             else:  # otherwise use selection
                 data_matrix_pos = self._data_matrix(selection_iterator(iterable_pos_, positive_ids),
                                                     fit_vectorizer=self.fit_vectorizer)
         # if this is the first iteration or we need to select negatives
         if i == 0 or size_negative != -1:
             iterable_neg, iterable_neg_, iterable_neg__ = tee(iterable_neg, 3)
             # negatives never refit the vectorizer (only positives may)
             if size_negative == -1:  # if we take all negatives
                 data_matrix_neg = self._data_matrix(iterable_neg_, fit_vectorizer=False)
             else:  # otherwise use selection
                 data_matrix_neg = self._data_matrix(selection_iterator(iterable_neg_, negative_ids),
                                                     fit_vectorizer=False)
         # assemble data matrix
         data_matrix, y = self._assemble_data_matrix(data_matrix_pos, data_matrix_neg)
         # stop the fitting procedure at the last-1 iteration and return data_matrix,y
         if i == n_active_learning_iterations - 1:
             break
         # fit the estimator on selected instances
         self.estimator.fit(data_matrix, y)
         # use the trained estimator to select the next instances
         if size_positive != -1:
             positive_ids = self._bounded_selection(iterable_pos__,
                                                    size=size_positive,
                                                    lower_bound_threshold=lower_bound_threshold_positive,
                                                    upper_bound_threshold=upper_bound_threshold_positive)
         if size_negative != -1:
             negative_ids = self._bounded_selection(iterable_neg__,
                                                    size=size_negative,
                                                    lower_bound_threshold=lower_bound_threshold_negative,
                                                    upper_bound_threshold=upper_bound_threshold_negative)
     return data_matrix, y
Example #3
0
 def _active_learning_data_matrices(self, iterable_pos, iterable_neg,
                                    n_active_learning_iterations=2,
                                    size_positive=-1,
                                    size_negative=100,
                                    lower_bound_threshold_positive=-1,
                                    upper_bound_threshold_positive=1,
                                    lower_bound_threshold_negative=-1,
                                    upper_bound_threshold_negative=1):
     """Build (X, y) by iterative active-learning selection of instances.

     Starting from the first `size_*` instances of each class, repeatedly
     vectorize the current selection, fit `self.estimator`, and use
     `self._bounded_selection` to pick the next round's ids. The estimator
     is NOT refit on the final round: the loop breaks right after the matrix
     is assembled, so the returned data reflects the last selection.

     :param iterable_pos: iterable of positive instances (split via tee each round).
     :param iterable_neg: iterable of negative instances (split via tee each round).
     :param n_active_learning_iterations: number of selection rounds.
     :param size_positive: positives to select per round; -1 keeps all.
     :param size_negative: negatives to select per round; -1 keeps all.
     :param lower_bound_threshold_positive: forwarded to _bounded_selection (positives).
     :param upper_bound_threshold_positive: forwarded to _bounded_selection (positives).
     :param lower_bound_threshold_negative: forwarded to _bounded_selection (negatives).
     :param upper_bound_threshold_negative: forwarded to _bounded_selection (negatives).
     :return: tuple (X, y) from the final round.

     NOTE(review): if n_active_learning_iterations <= 0 the loop never runs
     and `X`/`y` are unbound at the return — presumably callers always pass
     a positive count; confirm.
     """
     # select the initial ids simply as the first occurrences
     if size_positive != -1:
         positive_ids = range(size_positive)
     if size_negative != -1:
         negative_ids = range(size_negative)
     # iterate: select instances according to current model and create novel data matrix to fit the model in next round
     for i in range(n_active_learning_iterations):
         # make data matrix on selected instances
         # if this is the first iteration or we need to select positives
         if i == 0 or size_positive != -1:
             # tee into 3: one copy for the next round, one consumed now,
             # one kept for the _bounded_selection call below
             iterable_pos, iterable_pos_, iterable_pos__ = tee(iterable_pos, 3)
             if size_positive == -1:  # if we take all positives
                 Xpos = self._data_matrix(iterable_pos_, fit_vectorizer=self.fit_vectorizer)
             else:  # otherwise use selection
                 Xpos = self._data_matrix(selection_iterator(iterable_pos_, positive_ids), fit_vectorizer=self.fit_vectorizer)
         # if this is the first iteration or we need to select negatives
         if i == 0 or size_negative != -1:
             iterable_neg, iterable_neg_, iterable_neg__ = tee(iterable_neg, 3)
             # negatives never refit the vectorizer (only positives may)
             if size_negative == -1:  # if we take all negatives
                 Xneg = self._data_matrix(iterable_neg_, fit_vectorizer=False)
             else:  # otherwise use selection
                 Xneg = self._data_matrix(selection_iterator(iterable_neg_, negative_ids), fit_vectorizer=False)
         # assemble data matrix
         X, y = self._assemble_data_matrix(Xpos, Xneg)
         # stop the fitting procedure at the last-1 iteration and return X,y
         if i == n_active_learning_iterations - 1:
             break
         # fit the estimator on selected instances
         self.estimator.fit(X, y)
         # use the trained estimator to select the next instances
         if size_positive != -1:
             positive_ids = self._bounded_selection(
                 iterable_pos__, size=size_positive, lower_bound_threshold=lower_bound_threshold_positive, upper_bound_threshold=upper_bound_threshold_positive)
         if size_negative != -1:
             negative_ids = self._bounded_selection(
                 iterable_neg__, size=size_negative, lower_bound_threshold=lower_bound_threshold_negative, upper_bound_threshold=upper_bound_threshold_negative)
     return X, y
def get_sequences_with_names(size=9999, rand=True):
    '''
    Load up to `size` sequences from ../toolsdata/<RFAM>.fa.

    :param size: maximum number of sequences to return.
    :param rand: if True, return `size` sequences chosen uniformly at random
        without replacement; otherwise return the first `size` sequences.
    :return: iterator over sequences.
    '''
    it = list(fasta_to_sequence("../toolsdata/%s.fa" % RFAM))
    if rand:
        # materialize the index range: random.shuffle needs a mutable
        # sequence (a bare range object breaks on Python 3)
        ids = list(range(len(it)))
        random.shuffle(ids)
        return selection_iterator(it, ids[:size])
    return itertools.islice(it, size)
Example #5
0
def make_data(assay_id,
              repeats=3,
              train_sizes=[50]):

    X,y,graphs_p,graphs_n, esti = get_data(assay_id,selectall=True)

    print 'indicator of tak-ease:'
    print eden_tricks.task_difficulty(X,y)

    for size in train_sizes:
        for repeat in range(repeats):
            poslist = np.random.permutation(range(len(graphs_p)))[:size]
            neglist = np.random.permutation(range(len(graphs_n)))[:size]
            #r={}
            #r['pos']= list(selection_iterator(graphs_p, poslist))
            #r['neg']= list(selection_iterator(graphs_n, neglist))
            neg= list(selection_iterator(graphs_n, neglist))
            pos= list(selection_iterator(graphs_p, poslist))
            for samplerid, sampler in enumerate(make_samplers_chem()):
                yield task(samplerid,size,repeat,sampler,copy.deepcopy(neg),copy.deepcopy(pos))
    yield esti