Example #1
def batch_update(parallel, data, row, col):
    U_ = U[row, :]
    V_ = V[col, :]
    bu_ = bu[row]
    bm_ = bm[col]


    du = parallel(delayed(gred_u)(data[i], U_[i,:], V_[i,:], bu_[i], bm_[i], avg, C) for i in xrange(len(data)))
    dv = parallel(delayed(gred_v)(data[i], U_[i,:], V_[i,:], bu_[i], bm_[i], avg, C) for i in xrange(len(data)))
    dbu = parallel(delayed(gred_bu)(data[i], U_[i,:], V_[i,:], bu_[i], bm_[i], avg, C) for i in xrange(len(data)))
    dbm = parallel(delayed(gred_bm)(data[i], U_[i,:], V_[i,:], bu_[i], bm_[i], avg, C) for i in xrange(len(data)))
    if method=='sgd':
        for i in xrange(len(data)):
            U_[i,:] -= eta*du[i]
            V_[i,:] -= eta*dv[i]
            bu_[i] -= eta*dbu[i]
            bm_[i] -= eta*dbm[i]
        for c, i in enumerate(row):
            U[i,:]=U_[c,:]
            bu[i]=bu_[c]
        for c, j in enumerate(col):
            V[j,:]=V_[c,:]
            bm[j]=bm_[c]
    elif method=='adagrad':
        for c, i in enumerate(row):
            gdu[i] += np.dot(du[c], du[c])
            gdbu[i] += np.dot(dbu[c], dbu[c])
            U[i,:]-=eta*du[c]/sqrt(gdu[i]+epislon)
            bu[i]-=eta*dbu[c]/sqrt(gdbu[i]+epislon)
        for c, j in enumerate(col):
            gdv[j] += np.dot(dv[c], dv[c])
            gdbm[j] += np.dot(dbm[c], dbm[c])
            V[j,:]-=eta*dv[c]/sqrt(gdv[j]+epislon)
            bm[j]-=eta*dbm[c]/sqrt(gdbm[j]+epislon)
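A minimal driver sketch for the function above, assuming the module-level globals it relies on (U, V, bu, bm, eta, method, avg, C, the gred_* gradient functions and, for adagrad, the gd* accumulators) are already defined; `batches` is a hypothetical iterable of (data, row, col) mini-batches. joblib's Parallel can be reused as a context manager so the same worker pool persists across calls:

from joblib import Parallel

# Reuse one worker pool for every mini-batch instead of respawning
# workers on each call.
with Parallel(n_jobs=4) as parallel:
    for data, row, col in batches:  # hypothetical mini-batch iterator
        batch_update(parallel, data, row, col)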
Example #2
    def executeWithStart(self, desc, function, data, *args, **kwargs):
        #Splitting task
        tSplitter = TaskSplitter()
        nbJobs, splittedData, starts = tSplitter.partition(self._nbParal, data)

        #Logging
        self.setTask(1, ("Starting parallelization : "+desc))

        #Parallelization
        parallelizer = Parallel(n_jobs=nbJobs, temp_folder=self._tmpFolder,
                                verbose=self.verbosity,)

        if len(args) == 0:
            if len(kwargs) == 0:
                allData = parallelizer(delayed(function)(
                    splittedData[i], startIndex=starts[i])
                    for i in xrange(nbJobs))
            else:
                allData = parallelizer(delayed(function)(
                    splittedData[i], startIndex=starts[i], **kwargs)
                    for i in xrange(nbJobs))

        elif len(kwargs) == 0:
            allData = parallelizer(delayed(function)(
                splittedData[i], startIndex=starts[i], *args)
                for i in xrange(nbJobs))

        else:
            allData = parallelizer(delayed(function)(
                splittedData[i], startIndex=starts[i], *args, **kwargs)
                for i in xrange(nbJobs))

        self.endTask()

        return allData
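Since splatting an empty tuple or dict into a call is a no-op, the four-way branch above collapses to a single call; a condensed sketch of the same dispatch:

allData = parallelizer(
    delayed(function)(splittedData[i], startIndex=starts[i], *args, **kwargs)
    for i in xrange(nbJobs))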
Example #3
def orig_main():
    if len(sys.argv) == 4:
        path, adjective, n_jobs = sys.argv[1:]
        n_jobs = int(n_jobs)
        print "Training the adjective %s for the phase %s" % (
                adjective)

        loaded_features = load_adjective_phase(path)
        p = Parallel(n_jobs=n_jobs,verbose=10)
        p(delayed(orig_train_adjective_phase_classifier)(path, adjective, loaded_features))

    elif len(sys.argv) == 3:
        path, n_jobs = sys.argv[1:]
        n_jobs = int(n_jobs)
        print "Training the all adjectives"
        loaded_features = load_adjective_phase(path)
 
        p = Parallel(n_jobs=n_jobs,verbose=10)
        p(delayed(orig_train_adjective_phase_classifier)(path, adjective, loaded_features) 
            for adjective in adjectives)
                                                      
    else:
        print "Usage:"
        print "%s path adjective n_jobs" % sys.argv[0]
        print "%s path n_jobs" % sys.argv[0]
        print "Path to the base directory"
Example #4
def warmstart_all_parallel(x, y, x_test, y_test, fname_in='results_softmax_regression_mnist', fname_out='results_softmax_regression_warmstart_mnist', model_type='softmax_regression', w_diff_term_crit=0.0001, learning_rate=0.0001, regularizations = [100., 10., 1., 0.1, 0.01, 0.001, 0.]):
    pretrained_models = pickle.load(open(fname_in, 'rb'))
    if model_type == 'softmax_regression':
        #previous_loss_train=None, previous_regularization_penalty_train=None
        results = joblib.Parallel(n_jobs=47)(delayed(tf_softmax_regression.train_softmax)
                                             (
                                             x, y, x_test, y_test, learning_rate=learning_rate, max_iterations=1000000,
                                             w_diff_term_crit=w_diff_term_crit, verbose=True,
                                             regularization=regularizations[target_i],
                                             model=pretrained_models[init_i]['model'],
                                             regularization_initialization=pretrained_models[init_i]['regularization'],
                                             previous_loss_train=pretrained_models[init_i]['loss_train'],
                                             previous_regularization_penalty_train=pretrained_models[init_i]['regularization_penalty_train']
                                         ) for target_i in xrange(0, len(regularizations))
                                           for init_i in xrange(0, len(pretrained_models))
                                         )
    elif model_type == 'linear_regression':
        results = joblib.Parallel(n_jobs=47)(delayed(tf_linear_regression.train)
                                                 (
                                                 x, y, x_test, y_test, learning_rate=learning_rate, max_iterations=1000000,
                                                 w_diff_term_crit=w_diff_term_crit, verbose=True,
                                                 regularization=regularizations[target_i],
                                                 model=pretrained_models[init_i]['model'],
                                                 regularization_initialization=pretrained_models[init_i][
                                                     'regularization']
                                             ) for target_i in xrange(0, len(regularizations))
                                             for init_i in xrange(0, len(pretrained_models))
                                             )
    pickle.dump(results, open(fname_out, 'wb'))
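The nested generator walks the full (regularization, pretrained model) cross product, with target_i as the outer loop; itertools.product expresses the same pairs in the same order:

import itertools

# Same index pairs, same iteration order as the double generator above.
pairs = itertools.product(range(len(regularizations)),
                          range(len(pretrained_models)))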
Example #5
    def predict_(self, X, probability=False):
        """Predict class for X.

        The predicted class of an input sample is a vote by the individual searchlights.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted classes.
        """

        # votes = []
        # for v in range(self.n_best):
        #     votes += [self.estimators_[v].predict(np.array([x.get_data()[self.best_spheres[v]] for x in X]))]

        if not isinstance(X, dict):
            raise ValueError("X has to be a dict")

        if self.base_estimator._estimator_type == "searchlight_ensemble":
            self.votes = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")(
                delayed(_vote)(e, X[roi_id][0], probability) for roi_id, e in self.estimators_.items()
            )
        else:
            self.votes = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")(
                delayed(_vote)(e, X[roi_id], probability) for roi_id, e in self.estimators_.items()
            )

        self.votes_pooled = np.array(self.votes).swapaxes(0, 1).dot(self.vote_weighting) / sum(self.vote_weighting)
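The pooling line computes a weighted mean vote per sample: votes has shape (n_estimators, n_samples), so after swapaxes each row holds one sample's votes, and the dot with vote_weighting averages them. A small worked example with made-up numbers:

import numpy as np

votes = np.array([[1, 0], [1, 1], [0, 1]])      # 3 estimators, 2 samples
w = np.array([2., 1., 1.])                      # hypothetical vote_weighting
pooled = votes.swapaxes(0, 1).dot(w) / w.sum()  # -> array([0.75, 0.5])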
Example #6
def get_split_scores(factory,thresholds,formula,
                     metric = None,#p.e. usability entropy
                     use_joblib = False,
                     joblib_backend = 'threading',
                     n_jobs = -1,
                     min_events_fraction_leaf = 0.,verbose = False):

    if metric == None:
        metric = penalized_usability_entropy
    if min_events_fraction_leaf <=1:
        min_events_fraction_leaf = int(min_events_fraction_leaf*sum(factory.weights))
    if verbose:
        print min_events_fraction_leaf, sum(factory.weights)

    if not use_joblib:
        scores = np.repeat(float("inf"),len(thresholds))
        for i,(feature,cut,_) in enumerate(thresholds):
            predicate =  (factory.events[:,feature] > cut)

            #skip the edge cases... (inf penalty)
            if np.all(predicate) or (not np.any(predicate)):
                #if this split does not split, fuggedaboutit
                continue 
            if min_events_fraction_leaf>0:
                #get rid of too uneven a cuts
                sum_weight = np.sum(factory.weights)
                true_weight = np.sum(factory.weights[predicate])
                false_weight = sum_weight - true_weight
                if true_weight < min_events_fraction_leaf or false_weight < min_events_fraction_leaf:
                    if verbose: print "t:",true_weight,"f:",false_weight, "discarded"
                    continue
                if verbose: print "t:",true_weight,"f:",false_weight, "passed"
            #compute score
            subFactories = factory.split_by(predicate)
            scores[i] = metric(formula,*subFactories)
    else:
        if n_jobs < 0:
            # joblib convention: -1 means all CPUs, -2 all but one, and so on
            n_jobs = max(1, joblib.cpu_count() + 1 + n_jobs)
       
        indices = [0]+[len(thresholds)*(i+1)/n_jobs for i in range(n_jobs)]
        thresholdSections = [thresholds[indices[i]:indices[i+1]] for i in range(n_jobs)]
        
        if joblib_backend == 'threading':
            factory = [deepcopy(factory) for i in range(n_jobs)]
            formula = [deepcopy(formula) for i in range(n_jobs)]
            metric = [deepcopy(metric) for i in range(n_jobs)] #in case it has some internal data
            
            jobs = (joblib.delayed(get_split_scores)(factory[i],thresholdSection, formula[i],
                                                 metric=metric[i],use_joblib = False,
                                                 min_events_fraction_leaf = min_events_fraction_leaf,
                                                 verbose = verbose)
                                    for i,thresholdSection in enumerate(thresholdSections))
        else:
            jobs = (joblib.delayed(get_split_scores)(factory,thresholdSection, formula,
                                                 metric=metric,use_joblib = False,
                                                 min_events_fraction_leaf = min_events_fraction_leaf,
                                                 verbose = verbose)
                                    for thresholdSection in thresholdSections)
        scores = np.hstack(joblib.Parallel(n_jobs = n_jobs, backend = joblib_backend)(jobs))
    return scores
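The manual index arithmetic that slices thresholds into sections can be written more directly with np.linspace, which yields the same near-equal partition; a sketch under the same assumptions (thresholds is a list, n_jobs already resolved):

import numpy as np

# n_jobs + 1 evenly spaced cut points over [0, len(thresholds)]
indices = np.linspace(0, len(thresholds), n_jobs + 1).astype(int)
thresholdSections = [thresholds[indices[i]:indices[i + 1]]
                     for i in range(n_jobs)]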
Example #7
    def fit(self, imgs, y=None, confounds=None):
        """Compute the mask and the ICA maps across subjects

        Parameters
        ----------
        imgs: list of Niimg-like objects
            See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
            Data on which PCA must be calculated. If this is a list,
            the affine is considered the same for all.

        confounds: CSV file path or 2D matrix
            This parameter is passed to nilearn.signal.clean. Please see the
            related documentation for details
        """
        MultiPCA.fit(self, imgs, y=y, confounds=confounds)
        random_state = check_random_state(self.random_state)

        seeds = random_state.randint(np.iinfo(np.int32).max, size=self.n_init)
        if (LooseVersion(sklearn.__version__).version > [0, 12]):
            # random_state in fastica was added in 0.13
            results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(fastica)(self.components_.T,
                    whiten=True, fun='cube', random_state=seed)
                for seed in seeds)
        else:
            results = Parallel(n_jobs=1, verbose=self.verbose)(
                delayed(fastica)(self.components_.T, whiten=True, fun='cube')
                for seed in seeds)

        ica_maps_gen_ = (result[2].T for result in results)
        ica_maps_and_sparsities = ((ica_map,
                                    np.sum(np.abs(ica_map), axis=1).max())
                                   for ica_map in ica_maps_gen_)
        ica_maps, _ = min(ica_maps_and_sparsities, key=itemgetter(-1))

        # Thresholding
        ratio = None
        if isinstance(self.threshold, float):
            ratio = self.threshold
        elif self.threshold == 'auto':
            ratio = 1.
        elif self.threshold is not None:
            raise ValueError("Threshold must be None, "
                             "'auto' or float. You provided %s." %
                             str(self.threshold))
        if ratio is not None:
            abs_ica_maps = np.abs(ica_maps)
            threshold = scoreatpercentile(
                abs_ica_maps,
                100. - (100. / len(ica_maps)) * ratio)
            ica_maps[abs_ica_maps < threshold] = 0.
        self.components_ = ica_maps

        # flip signs in each component so that peak is +ve
        for component in self.components_:
            if component.max() < -component.min():
                component *= -1

        return self
Example #8
def plot_learning_curves_across_topics(n_runs, start_idx, stop_idx, estimators_dict, comment=None):
  """
  TODO Most probably buggy
  """
  for topic_id, data in texts_vote_lists_truths_by_topic_id.iteritems():
    print 'Loading topic %s' % topic_id
    texts, vote_lists, truths = data
    n_documents = len(texts)

    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(texts)
    text_similarity = cosine_similarity(tfidf)

    x = np.arange(start_idx, stop_idx)

    y_by_estimator = dict( (estimator, []) for estimator in estimators_dict.keys() )

    for estimator_name, estimator_and_args in estimators_dict.iteritems():
      print 'Calculating for %s' % estimator_name
      estimator, args, active_pars = estimator_and_args
      if active_pars is None:
        sequences = Parallel(n_jobs=4)( delayed(get_accuracy_sequence)(estimator, stop_idx, texts, 
          vote_lists, truths, text_similarity, idx, False, *args) for idx in xrange(n_runs) )
      else:
        sequences = Parallel(n_jobs=4)( delayed(get_accuracy_sequence_active)(estimator, stop_idx, texts, 
          vote_lists, truths, text_similarity, active_pars, idx, False, *args) for idx in xrange(n_runs) )      

      good_slices = [ s[start_idx:] for s in sequences if s is not None ]
      if good_slices:
        results = np.vstack(good_slices)

        begin_accuracies = results[:, 0]
        end_accuracies = results[:, -1]
        
        begin_accuracies.dump("pickles/%s-%s-begin-accuracies--.pkl" % (topic_id, estimator_name) )
        end_accuracies.dump("pickles/%s-%s-end-accuracies--.pkl" % (topic_id, estimator_name))

        # We will then need to vstack and avg though all the topic accuracies for each estimator
        y_by_estimator[estimator_name].append( np.mean(results, axis=0) )
      else:
        print 'Topic %s is not represented with estimator %s' % (topic_id, estimator_name)

    result_by_estimator = {}

    for estimator_name, mean_accuracy_sequences in y_by_estimator.iteritems():
      if mean_accuracy_sequences:
        to_avg = np.vstack(mean_accuracy_sequences)
        result_by_estimator[estimator_name] = np.mean(to_avg, axis=0)
      else:
        print "Nope"
  if comment:
    title = 'Across topics, %s runs, %s' % (n_runs, comment)
  else:
    title = 'Across topics, %s runs' % n_runs
  plot_learning_curve(title, x, result_by_estimator, 'Votes sampled', 'Accuracy')
Example #9
def train_all_parallel(x, y, x_test, y_test, fname='results_softmax_regression_mnist', model_type='softmax_regression', w_diff_term_crit=0.0001, learning_rate=0.0001, regularizations = [100., 10., 1., 0.1, 0.01, 0.001, 0.]):
    if model_type == 'softmax_regression':
        results = joblib.Parallel(n_jobs=47)(delayed( tf_softmax_regression.train_softmax)(
            x, y, x_test, y_test, learning_rate=learning_rate, max_iterations=1000000,
            regularization=regularizations[reg_i], w_diff_term_crit=w_diff_term_crit, verbose=True) for i_par in range(48) for reg_i in xrange(0, len(regularizations)))

    elif model_type == 'linear_regression':
        results = joblib.Parallel(n_jobs=47)(delayed(tf_linear_regression.train)(
            x, y, x_test, y_test, learning_rate=learning_rate, max_iterations=1000000,
            regularization=regularizations[reg_i], w_diff_term_crit=w_diff_term_crit, verbose=True) for i_par in range(48) for
                                             reg_i in xrange(0, len(regularizations)))

    pickle.dump(results, open(fname, 'wb'))
Example #10
def main():
    """
    if len(sys.argv) == 6:
        database, path, adjective, phase, sensor = sys.argv[1:]
        train_single_dataset(database, path, adjective, phase, sensor)
    """
    if len(sys.argv) == 6:
        database, path, adjective, phase, n_jobs = sys.argv[1:]
        n_jobs = int(n_jobs)
        print "Training the adjectives %s and for phase %s" %(
            adjective, phase)
        p = Parallel(n_jobs=n_jobs,verbose=10)
        p(delayed(create_single_dataset)(database, path, adjective, phase))

    elif len(sys.argv) == 5:
        database, path, adjective, n_jobs = sys.argv[1:]
        n_jobs = int(n_jobs)
        print "Training all the phases for adjective %s" %(
                    adjective)
        p = Parallel(n_jobs=n_jobs,verbose=10)
        p(delayed(create_single_dataset)(database, path, adjective, phase)
            for phase in phases)
            #    create_single_dataset(database, path, adjective, phase))

    elif len(sys.argv) == 3:
        database, path = sys.argv[1:]
        #n_jobs = int(n_jobs)
        print "Training all combinations of adjectives and phases"
        #p = Parallel(n_jobs=n_jobs,verbose=10)
        #p(delayed(create_single_dataset)(database, path, adjective, phase)
        #for adjective, phase in itertools.product(adjectives,
        #                                          phases))
        base_directory = path
        untrained_directory = os.path.join(base_directory, "untrained_adjectives")
        hmm_feature_directory = os.path.join(base_directory, "adjective_phase_set")
        check_dir(hmm_feature_directory)
        for adj_f in os.listdir(untrained_directory):
            full_adj_path = os.path.join(untrained_directory, adj_f)
            adj_obj = cPickle.load(open(full_adj_path))
            assert isinstance(adj_obj, AdjectiveClassifier)
            create_single_dataset(database, hmm_feature_directory, adj_obj)
        #    create_single_dataset(database, path, adjective, "some_phase")
    else:
        print "Usage:"
        print "%s database path adjective phase n_jobs" % sys.argv[0]
        print "%s database path adjective n_jobs" % sys.argv[0]
        print "%s database path" % sys.argv[0]
        print "Files will be saved in path/adjective_phase_set"
Example #11
    def fit_transform(self, Z, **fit_params):
        """TODO: rewrite docstring
        Fit all transformers using X, transform the data and concatenate
        results.
        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Input data to be transformed.
        Returns
        -------
        X_t : array-like or sparse matrix, shape (n_samples, sum_n_components)
            hstack of results of transformers. sum_n_components is the
            sum of n_components (output dimension) over transformers.
        """
        result = Parallel(n_jobs=self.n_jobs, backend="threading")(
            delayed(_fit_transform_one)(trans, name, Z,
                                        self.transformer_weights, **fit_params)
            for name, trans in self.transformer_list)

        Zs, transformers = zip(*result)
        self._update_transformer_list(transformers)

        X = reduce(lambda x, y: x.zip(y._rdd), Zs)
        for item in X.first():
            if sp.issparse(item):
                return X.map(lambda x: sp.hstack(x))
        X = X.map(lambda x: np.hstack(x))
        return X
Example #12
def t_test_accuracy(topic_id, n_runs, estimator_params_votes_per_doc_tuples):
  """ Test if accuracy for estimators with given parameters is
      significantly better than that of the first estimator in the tuple
  """
  texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id]
  vectorizer = TfidfVectorizer()
  text_similarity = cosine_similarity(vectorizer.fit_transform(texts))

  accuracy_arrays = []
  for estimator, args, votes_per_doc in estimator_params_votes_per_doc_tuples:
    stop_idx = votes_per_doc * len(texts)
    # Now get n_runs accuracies and put then into numpy arrays
    accuracies = Parallel(n_jobs=4)( delayed(get_accuracy_sequence)(estimator, stop_idx, texts, 
        vote_lists, truths, text_similarity, idx, True, *args) for idx in xrange(n_runs) )
    accuracy_arrays.append( np.array( filter(lambda x: x is not None, accuracies) ) )

  # Baseline
  result_row = []
  result_row.append( "%0.2f" % np.mean(accuracy_arrays[0]) )
  # T-tests
  for accuracy_array in accuracy_arrays[1:]:
    _, pval = ttest_ind(accuracy_array, accuracy_arrays[0], equal_var=False)
    significance_indicator = lambda p: "*" if p < 0.01 else " "
    is_better = "$" if np.mean(accuracy_array) > np.mean(accuracy_arrays[0]) else " "
    result_row.append( "%0.2f %s %s" % (np.mean(accuracy_array), significance_indicator(pval), is_better))

  return "|".join(result_row)
Example #13
def cross_val_predict(
        estimator, X, y, loss=None, cv=8, n_jobs=1,
        verbose=0, fit_params=None, proba=False,
        pre_dispatch='2*n_jobs'):
    """
    """
    if isinstance(cv, int):
        cv1 = cross_validation.StratifiedKFold(y, cv)
    else:
        cv1 = cv
    fit_params = fit_params if fit_params is not None else {}
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    results = parallel(
        delayed(_cross_val_predict)(clone(estimator), X, y, train, test,
                                    verbose, fit_params, proba)
        for train, test in cv1)
    y_pred = np.zeros(len(y))
    scores = []
    for (mask, y_p) in results:
        y_pred[mask] = y_p
        if loss:
            y_test = y[mask]
            scores.append(-loss(y_test, y_p))
    if loss:
        scores = np.asarray(scores)

    return np.asarray(y_pred), scores
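Hypothetical usage of the function above; note that it negates the loss internally, so the per-fold scores come back as higher-is-better:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import zero_one_loss

X, y = make_classification(n_samples=200, random_state=0)
y_pred, fold_scores = cross_val_predict(LogisticRegression(), X, y,
                                        loss=zero_one_loss, cv=8, n_jobs=1)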
Example #14
    def fit(self, X, y=None):
        transformer_idx_list = map(lambda trans, idx: (trans[0], trans[1], idx), self.transformer_list, self.idx_list)
        transformers = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_one_transformer)(trans, X[:, idx], y)
            for name, trans, idx in transformer_idx_list)
        self._update_transformer_list(transformers)
        return self
Example #15
    def predict(self, X):
        """Predict multi-output variable using a model
         trained for each target variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        Returns
        -------
        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets predicted across multiple predictors.
            Note: Separate models are generated for each predictor.
        """
        check_is_fitted(self, 'estimators_')
        if not hasattr(self.estimator, "predict"):
            raise ValueError("The base estimator should implement a predict method")

        X = check_array(X, accept_sparse=True)

        y = Parallel(n_jobs=self.n_jobs)(delayed(parallel_helper)(e, 'predict', X)
                                         for e in self.estimators_)

        return np.asarray(y).T
Example #16
def cluster(seqs, k, m):
    dispatcher = Parallel(n_jobs=N_JOBS, verbose=VERBOSE, pre_dispatch=PRE_DISPATCH)
    Q, R = embed(k, m, seqs, dispatcher)
    N, T = R.shape
    debug('computing pairwise distances')
    ds = dispatcher(delayed(cosine)(R[i, :], R[j, :]) for i in range(N) for j in range(i + 1, N))
    D = np.zeros((N, N), dtype=float)
    idx = 0
    for i in range(N):
        for j in range(i + 1, N):
            D[i, j] = ds[idx]
            D[j, i] = ds[idx]
            idx += 1
    # cluster
    debug('clustering using dbscan')
    db = DBSCAN(eps=0.01, min_samples=10, metric='precomputed').fit(D)
    # core = db.core_sample_indices_
    labels = db.labels_
    labelset = set(labels)
    n_clusters_ = len(labelset) - (1 if -1 in labelset else 0)
    debug('dbscan: found %d clusters' % n_clusters_)
    m = {}
    for l in labelset:
        m[l] = []
        for i, s in enumerate(seqs):
            if labels[i] == l:
                m[l].append(str(s.seq))
    return m
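The double loop that scatters the flat ds list into a symmetric matrix mirrors what scipy's condensed-distance helpers already do; an equivalent (serial) formulation over the same embedding matrix R:

from scipy.spatial.distance import pdist, squareform

# pdist returns the condensed upper triangle in the same (i, j) order as
# the generator above; squareform expands it to the full N x N matrix.
D = squareform(pdist(R, metric='cosine'))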
Example #17
    def _find_new_constraint(self, X, Y, joint_feature_gt, constraints, check=True):
        if self.n_jobs != 1:
            # do inference in parallel
            verbose = max(0, self.verbose - 3)
            Y_hat = Parallel(n_jobs=self.n_jobs, verbose=verbose)(
                delayed(loss_augmented_inference)(
                    self.model, x, y, self.w, relaxed=True)
                for x, y in zip(X, Y))
        else:
            Y_hat = self.model.batch_loss_augmented_inference(
                X, Y, self.w, relaxed=True)
        # compute the mean over joint_features and losses

        if getattr(self.model, 'rescale_C', False):
            djoint_feature = (joint_feature_gt
                              - self.model.batch_joint_feature(X, Y_hat, Y)) / len(X)
        else:
            djoint_feature = (joint_feature_gt
                              - self.model.batch_joint_feature(X, Y_hat)) / len(X)

        loss_mean = np.mean(self.model.batch_loss(Y, Y_hat))

        violation = loss_mean - np.dot(self.w, djoint_feature)
        if check and self._check_bad_constraint(
                violation, djoint_feature, loss_mean, constraints,
                break_on_bad=self.break_on_bad):
            raise NoConstraint
        return Y_hat, djoint_feature, loss_mean
Example #18
    def transform(self, traj_list):
        """Transform traj_list separately by each transformer, concatenate results.

        Parameters
        ----------
        trajectories : list (of mdtraj.Trajectory objects)
            Trajectories to featurize

        Returns
        -------
        Y : list (of np.ndarray)
            Y[i] is the featurized version of X[i]
            Y[i] will have shape (n_samples_i, n_features), where
            n_samples_i is the length of trajectory i and n_features
            is the total (concatenated) number of features in the
            concatenated list of featurizers.

        """
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(sklearn.pipeline._transform_one)(trans, name, traj_list, self.transformer_weights)
            for name, trans in self.transformer_list)

        X_i_stacked = [np.hstack([Xs[feature_ind][trj_ind] for feature_ind in range(len(Xs))]) for trj_ind in range(len(Xs[0]))]

        return X_i_stacked
Example #19
def _intra_cluster_distances_block(X, labels, metric, n_jobs=1, **kwds):
    """Calculate the mean intra-cluster distance for sample i.
 
    Parameters
    ----------
    X : array [n_samples_a, n_features]
        Feature array.
 
    labels : array, shape = [n_samples]
        label values for each sample
 
    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by metrics.pairwise.pairwise_distances. If X is the distance
        array itself, use "precomputed" as the metric.
 
    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.
 
    Returns
    -------
    a : array [n_samples_a]
        Mean intra-cluster distance
    """
    intra_dist = np.zeros(labels.size, dtype=float)
    values = Parallel(n_jobs=n_jobs)(
            delayed(_intra_cluster_distances_block_)
                (X[np.where(labels == label)[0]], metric, **kwds)
                for label in np.unique(labels))
    for label, values_ in zip(np.unique(labels), values):
        intra_dist[np.where(labels == label)[0]] = values_
    return intra_dist
Example #20
def decode_stash_parallel(stash, penalty, label_map, num_cpus=NUM_CPUS,
                          **viterbi_args):
    """Apply Viterbi decoding over a stash in parallel.

    Parameters
    ----------
    stash : biggie.Stash
        Stash of fretboard posteriors.
    penalty : scalar
        Self-transition penalty.
    label_map : callable object
        Map from frets to string labels.
    num_cpus : int
        Number of CPUs to use in parallel.
    **viterbi_args, other args to pass to util.viterbi

    Returns
    -------
    annotset : dict of pyjams.RangeAnnotations
        Range annotations under the same keys as the input stash.
    """
    assert not __interactive__
    keys = stash.keys()
    pool = Parallel(n_jobs=num_cpus)
    decode = delayed(decode_fretboard)
    results = pool(decode(stash.get(k), penalty, label_map) for k in keys)
    return {k: r for k, r in zip(keys, results)}
Example #21
    def fit(self, data, Y=None):
        if hasattr(data, 'copy'):
            # It's an array
            data = data.copy()
        else:
            # Probably a list
            data = copy.deepcopy(data)

        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)

        pcas = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_subject_pca)(subject_data,
                                 n_components=self.n_components, mem=memory)
            for subject_data in data)
        pcas = np.concatenate(pcas, axis=1)

        ica_maps = self._find_high_kurtosis(pcas, memory)

        del pcas
        self.maps_ = ica_maps
        if not self.maps_only:
            # Relearn the time series
            self.learn_from_maps(data)

        return self
Example #22
    def find_bmu(self, input_matrix, njb=1):
        """
        Finds the best matching unit (bmu) for each input data from the input matrix. It does all at once parallelizing
        the calculation instead of going through each input and running it against the codebook.

        :param input_matrix: numpy matrix representing inputs as rows and features/dimension as cols
        :param njb: number of jobs to parallelize the search
        :returns: the best matching unit for each input
        """
        dlen = input_matrix.shape[0]
        y2 = np.einsum("ij,ij->i", self.codebook.matrix, self.codebook.matrix)

        parallelizer = Parallel(n_jobs=njb, pre_dispatch="3*n_jobs")
        chunk_bmu_finder = delayed(_chunk_based_bmu_find)

        row_chunk = lambda part: part * dlen // njb
        col_chunk = lambda part: min((part + 1) * dlen // njb, dlen)

        b = parallelizer(
            chunk_bmu_finder(input_matrix[row_chunk(i) : col_chunk(i)], self.codebook.matrix, y2) for i in xrange(njb)
        )
        bmu = np.asarray(list(itertools.chain(*b))).T

        del b
        return bmu
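Despite their names, row_chunk and col_chunk are the start and end bounds of contiguous row slices over the dlen inputs; spelled out, the chunking is:

# Chunk i covers rows [i * dlen // njb, min((i + 1) * dlen // njb, dlen))
chunks = [input_matrix[i * dlen // njb : min((i + 1) * dlen // njb, dlen)]
          for i in range(njb)]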
Example #23
    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None. Furthermore, ``X`` will
            not be altered in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # Now get sqnms in parallel
        self.sq_nms_ = dict(zip(cols,
                                Parallel(n_jobs=self.n_jobs)(
                                    delayed(_sq_norm_single)
                                    (X[nm]) for nm in cols)))

        return self
Example #24
    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None. Furthermore, ``X`` will
            not be altered in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols, assert_all_finite=True)  # creates a copy -- we need all to be finite
        cols = _cols_if_none(X, self.cols)

        # ensure enough rows
        _validate_rows(X)

        # Now estimate the lambdas in parallel
        self.lambda_ = dict(zip(cols,
                                Parallel(n_jobs=self.n_jobs)(
                                    delayed(_yj_estimate_lambda_single_y)
                                    (X[nm]) for nm in cols)))

        return self
Example #25
def permutation_test_score(estimator, X, y, groups=None, cv=None,
                           n_permutations=100, n_jobs=1, random_state=0,
                           verbose=0, scoring=None):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modification of original sklearn's permutation test score function
    to evaluate p-value outside this function, so that the score can be
    reused from outside.


    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance.  The Journal of Machine Learning Research (2010)
                   vol. 11

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state),
            groups, cv, scorer)
        for _ in range(n_permutations))
    permutation_scores = np.array(permutation_scores)
    return permutation_scores
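The docstring notes that the p-value is evaluated outside this function. Given the observed score of the unpermuted data (a hypothetical `score`, computed separately with the same scorer and CV), the standard convention, as in sklearn, is:

# score: cross-validated score of the original, unpermuted labels
pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)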
Example #26
def cross_val_predict_proba(
        estimator, X, y, scoring='roc_auc', cv=8, n_jobs=1,
        verbose=0, fit_params=None,
        pre_dispatch='2*n_jobs'):
    """ Predict probabilities using cross-validation.
    """
    if isinstance(cv, int):
        cv1 = cross_validation.StratifiedKFold(y, cv)
    else:
        cv1 = cv

    fit_params = fit_params if fit_params is not None else {}
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    results = parallel(
        delayed(_cross_val_predict)(clone(estimator), X, y, train, test,
                                    verbose, fit_params, proba=True)
        for train, test in cv1)
    y_pred = np.zeros(len(y))
    scores = []
    for (mask, y_p) in results:
        y_pred[mask] = y_p
        if scoring == 'roc_auc':
            y_test = y[mask]
            if len(np.unique(y_test)) > 1:
                scores.append(compute_auc(y_test, y_p))
                # scores.append(roc_auc_score(y_test, y_p))
    return np.asarray(y_pred), np.asarray(scores)
Example #27
    def prepare_merge_jobs(self, results):
        result_groups = grouper(results, self.split_bins)
        merge_jobs = []
        for result_group in result_groups:
            result_group = list(result_group)
            merge_jobs.append(joblib.delayed(self.load_and_merge_results_job)(result_group))
        return merge_jobs
Example #28
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
                    n_jobs=1, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in splits)

    group_order = []
    if hasattr(cv, 'groups'):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order
Example #29
    def _parallel_learning(self, X, Y, w):
        n_samples = len(X)
        objective, positive_slacks = 0, 0
        verbose = max(0, self.verbose - 3)
        if self.batch_size is not None:
            raise ValueError("If n_jobs != 1, batch_size needs to" "be None")
        # generate batches of size n_jobs
        # to speed up inference
        if self.n_jobs == -1:
            n_jobs = cpu_count()
        else:
            n_jobs = self.n_jobs

        n_batches = int(np.ceil(float(len(X)) / n_jobs))
        slices = gen_even_slices(n_samples, n_batches)
        for batch in slices:
            X_b = X[batch]
            Y_b = Y[batch]
            candidate_constraints = Parallel(n_jobs=self.n_jobs, verbose=verbose)(
                delayed(find_constraint)(self.model, x, y, w) for x, y in zip(X_b, Y_b)
            )
            dpsi = np.zeros(self.model.size_psi)
            for x, y, constraint in zip(X_b, Y_b, candidate_constraints):
                y_hat, delta_psi, slack, loss = constraint
                if slack > 0:
                    objective += slack
                    dpsi += delta_psi
                    positive_slacks += 1
            w = self._solve_subgradient(dpsi, n_samples, w)
        return objective, positive_slacks, w
Example #30
def optimal_allocation_with_skopt(t, X, Y, n=10, n_parallel=4, const_income=True):
    # [0, 1]
    nn = 2
    opt_fun = _fun
    if const_income:
        nn = 1
        opt_fun = _fun_constant_income
    dimensions = [Real(0, 1)] * nn * (t - 1)
    optimizer = skopt.Optimizer(
            dimensions, base_estimator='gp', random_state=1
            # n_random_starts=None, n_initial_points=10, acq_func='gp_hedge', acq_optimizer='auto', acq_func_kwargs=None, acq_optimizer_kwargs=None
            )
    # fun = functools.partial(_fun, t, X, Y)
    fun = opt_fun(t, X, Y)
    if n_parallel <= 1:
        print('not parallel')
        for i in range(n):
                suggested = optimizer.ask()
                y = fun(suggested)
                optimizer.tell(suggested, y)
                print('iteration: {}, {}, {}'.format(i, suggested, y))
    else:
        # something not working here
        print('parallel')
        n_left = n
        for i in range(0, n, max(n_parallel, 1)):
            suggested = optimizer.ask(n_points=min(n_left, n_parallel))
            n_left -= n_parallel
            print(n_left)
            y = Parallel()(delayed(fun)(x) for x in suggested)
            optimizer.tell(suggested, y)
            print('iteration: {}, {}, {}, {}'.format(i, suggested, y, action_to_zeroone(np.array(suggested))))
    print('min is', min(optimizer.yi))
    return optimizer
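The batched ask/tell pattern above is the standard way to parallelise a sequential surrogate-based optimiser: ask for several candidates, evaluate them concurrently, then tell the optimizer once. A stripped-down sketch, assuming fun is picklable:

from joblib import Parallel, delayed

suggested = optimizer.ask(n_points=4)                        # one batch of candidates
ys = Parallel(n_jobs=4)(delayed(fun)(x) for x in suggested)  # evaluate concurrently
optimizer.tell(suggested, ys)                                # single surrogate update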
Example #31
def compute_thresholds(epochs,
                       method='bayesian_optimization',
                       random_state=None,
                       picks=None,
                       verbose='progressbar',
                       n_jobs=1):
    """Compute thresholds for each channel.

    Parameters
    ----------
    epochs : instance of mne.Epochs
        The epochs objects whose thresholds must be computed.
    method : str
        'bayesian_optimization' or 'random_search'
    random_state : int seed, RandomState instance, or None (default)
        The seed of the pseudo random number generator to use
    picks : ndarray, shape(n_channels,) | None
        The channels to be considered for autoreject. If None, defaults
        to data channels {'meg', 'eeg'}.
    verbose : 'tqdm', 'tqdm_notebook', 'progressbar' or False
        The verbosity of progress messages.
        If `'progressbar'`, use `mne.utils.ProgressBar`.
        If `'tqdm'`, use `tqdm.tqdm`.
        If `'tqdm_notebook'`, use `tqdm.tqdm_notebook`.
        If False, suppress all output messages.
    n_jobs : int
        The number of jobs.

    Examples
    --------
    For example, we can compute the channel-level thresholds for all the
    EEG sensors this way:
        >>> compute_thresholds(epochs)
    """
    if method not in ['bayesian_optimization', 'random_search']:
        raise ValueError('`method` param not recognized')
    picks = _handle_picks(epochs.info, picks)
    _check_data(epochs, picks, verbose=verbose, ch_constraint='data_channels')
    sub_picks = _check_sub_picks(picks=picks, info=epochs.info)
    if sub_picks is not False:
        threshes = dict()
        for ch_type, this_picks in sub_picks:
            threshes.update(
                compute_thresholds(epochs=epochs,
                                   method=method,
                                   random_state=random_state,
                                   picks=this_picks,
                                   verbose=verbose,
                                   n_jobs=n_jobs))
    else:
        n_epochs = len(epochs)
        epochs_interp = clean_by_interp(epochs, picks=picks, verbose=verbose)
        data = np.concatenate((epochs.get_data(), epochs_interp.get_data()),
                              axis=0)  # non-data channels will be duplicate
        y = np.r_[np.zeros((n_epochs, )), np.ones((n_epochs, ))]
        cv = StratifiedShuffleSplit(y,
                                    n_iter=10,
                                    test_size=0.2,
                                    random_state=random_state)

        ch_names = epochs_interp.ch_names

        my_thresh = delayed(_compute_thresh)
        verbose = 51 if verbose is not False else 0  # send output to stdout
        threshes = Parallel(n_jobs=n_jobs, verbose=verbose)(my_thresh(
            data[:, pick], cv=cv, method=method, random_state=random_state)
                                                            for pick in picks)
        threshes = {ch_names[p]: thresh for p, thresh in zip(picks, threshes)}
    return threshes
Example #32
def permuted_ols(tested_vars,
                 target_vars,
                 confounding_vars=None,
                 model_intercept=True,
                 n_perm=10000,
                 two_sided_test=True,
                 random_state=None,
                 n_jobs=1,
                 verbose=0):
    """Massively univariate group analysis with permuted OLS.

    Tested variates are independently fitted to target variates descriptors
    (e.g. brain imaging signal) according to a linear model solved with an
    Ordinary Least Squares criterion.
    Confounding variates may be included in the model.
    Permutation testing is used to assess the significance of the relationship
    between the tested variates and the target variates [1, 2]. A max-type
    procedure is used to obtain family-wise corrected p-values.

    The specific permutation scheme implemented here is the one of
    Freedman & Lane [3]. Its has been demonstrated in [1] that this scheme
    conveys more sensitivity than alternative schemes. This holds for
    neuroimaging applications, as discussed in details in [2].

    Permutations are performed on parallel computing units. Each of them
    performs a fraction of permutations on the whole dataset. Thus, the max
    t-score amongst data descriptors can be computed directly, which avoids
    storing all the computed t-scores.

    The variates should be given C-contiguous. target_vars are fortran-ordered
    automatically to speed-up computations.

    Parameters
    ----------
    tested_vars : array-like, shape=(n_samples, n_regressors)
      Explanatory variates, fitted and tested independently from each others.

    target_vars : array-like, shape=(n_samples, n_descriptors)
      fMRI data, trying to be explained by explanatory and confounding
      variates.

    confounding_vars : array-like, shape=(n_samples, n_covars)
      Confounding variates (covariates), fitted but not tested.
      If None, no confounding variate is added to the model
      (except maybe a constant column according to the value of
      `model_intercept`)

    model_intercept : bool,
      If True, a constant column is added to the confounding variates
      unless the tested variate is already the intercept.

    n_perm : int,
      Number of permutations to perform.
      Permutations are costly but the more are performed, the more precision
      one gets in the p-values estimation.

    two_sided_test : boolean,
      If True, performs an unsigned t-test. Both positive and negative
      effects are considered; the null hypothesis is that the effect is zero.
      If False, only positive effects are considered as relevant. The null
      hypothesis is that the effect is zero or negative.

    random_state : int or None,
      Seed for random number generator, to have the same permutations
      in each computing units.

    n_jobs : int,
      Number of parallel workers.
      If 0 is provided, all CPUs are used.
      A negative number indicates that all the CPUs except (|n_jobs| - 1) ones
      will be used.

    verbose: int, optional
        verbosity level (0 means no message).

    Returns
    -------
    pvals : array-like, shape=(n_regressors, n_descriptors)
      Negative log10 p-values associated with the significance test of the
      n_regressors explanatory variates against the n_descriptors target
      variates. Family-wise corrected p-values.

    score_orig_data : numpy.ndarray, shape=(n_regressors, n_descriptors)
      t-statistic associated with the significance test of the n_regressors
      explanatory variates against the n_descriptors target variates.
      The ranks of the scores into the h0 distribution correspond to the
      p-values.

    h0_fmax : array-like, shape=(n_perm, )
      Distribution of the (max) t-statistic under the null hypothesis
      (obtained from the permutations). Array is sorted.

    References
    ----------
    [1] Anderson, M. J. & Robinson, J. (2001).
        Permutation tests for linear models.
        Australian & New Zealand Journal of Statistics, 43(1), 75-88.
    [2] Winkler, A. M. et al. (2014).
        Permutation inference for the general linear model.
        Neuroimage.
    [3] Freedman, D. & Lane, D. (1983).
        A nonstochastic interpretation of reported significance levels.
        J. Bus. Econ. Stats., 1(4), 292-298

    """
    # initialize the seed of the random generator
    rng = check_random_state(random_state)

    # check n_jobs (number of CPUs)
    if n_jobs == 0:  # invalid according to joblib's conventions
        raise ValueError("'n_jobs == 0' is not a valid choice. "
                         "Please provide a positive number of CPUs, or -1 "
                         "for all CPUs, or a negative number (-i) for "
                         "'all but (i-1)' CPUs (joblib conventions).")
    elif n_jobs < 0:
        n_jobs = max(1, joblib.cpu_count() + int(n_jobs) + 1)
    else:
        n_jobs = min(n_jobs, joblib.cpu_count())
    # make target_vars F-ordered to speed-up computation
    if target_vars.ndim != 2:
        raise ValueError(
            "'target_vars' should be a 2D array. "
            "An array with %d dimension%s was passed" %
            (target_vars.ndim, "s" if target_vars.ndim > 1 else ""))
    target_vars = np.asfortranarray(target_vars)  # efficient for chunking
    n_descriptors = target_vars.shape[1]

    # check explanatory variates dimensions
    if tested_vars.ndim == 1:
        tested_vars = np.atleast_2d(tested_vars).T
    n_samples, n_regressors = tested_vars.shape

    # check if explanatory variates is intercept (constant) or not
    if (n_regressors == 1 and np.unique(tested_vars).size == 1):
        intercept_test = True
    else:
        intercept_test = False

    # optionally add intercept
    if model_intercept and not intercept_test:
        if confounding_vars is not None:
            confounding_vars = np.hstack(
                (confounding_vars, np.ones((n_samples, 1))))
        else:
            confounding_vars = np.ones((n_samples, 1))

    ### OLS regression on original data
    if confounding_vars is not None:
        # step 1: extract effect of covars from target vars
        covars_orthonormalized = orthonormalize_matrix(confounding_vars)
        if not covars_orthonormalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Confounding variates not C_CONTIGUOUS.')
            covars_orthonormalized = np.ascontiguousarray(
                covars_orthonormalized)
        targetvars_normalized = normalize_matrix_on_axis(
            target_vars).T  # faster with F-ordered target_vars_chunk
        if not targetvars_normalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Target variates not C_CONTIGUOUS.')
            targetvars_normalized = np.ascontiguousarray(targetvars_normalized)
        beta_targetvars_covars = np.dot(targetvars_normalized,
                                        covars_orthonormalized)
        targetvars_resid_covars = targetvars_normalized - np.dot(
            beta_targetvars_covars, covars_orthonormalized.T)
        targetvars_resid_covars = normalize_matrix_on_axis(
            targetvars_resid_covars, axis=1)
        # step 2: extract effect of covars from tested vars
        testedvars_normalized = normalize_matrix_on_axis(tested_vars.T, axis=1)
        beta_testedvars_covars = np.dot(testedvars_normalized,
                                        covars_orthonormalized)
        testedvars_resid_covars = testedvars_normalized - np.dot(
            beta_testedvars_covars, covars_orthonormalized.T)
        testedvars_resid_covars = normalize_matrix_on_axis(
            testedvars_resid_covars, axis=1).T.copy()
    else:
        targetvars_resid_covars = normalize_matrix_on_axis(target_vars).T
        testedvars_resid_covars = normalize_matrix_on_axis(tested_vars).copy()
        covars_orthonormalized = None
    # check arrays contiguousity (for the sake of code efficiency)
    if not targetvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Target variates not C_CONTIGUOUS.')
        targetvars_resid_covars = np.ascontiguousarray(targetvars_resid_covars)
    if not testedvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Tested variates not C_CONTIGUOUS.')
        testedvars_resid_covars = np.ascontiguousarray(testedvars_resid_covars)
    # step 3: original regression (= regression on residuals + adjust t-score)
    # compute t score for original data
    scores_original_data = _t_score_with_covars_and_normalized_design(
        testedvars_resid_covars, targetvars_resid_covars.T,
        covars_orthonormalized)
    if two_sided_test:
        sign_scores_original_data = np.sign(scores_original_data)
        scores_original_data = np.fabs(scores_original_data)

    ### Permutations
    # parallel computing units perform a reduced number of permutations each
    if n_perm > n_jobs:
        n_perm_chunks = np.asarray([n_perm / n_jobs] * n_jobs, dtype=int)
        n_perm_chunks[-1] += n_perm % n_jobs
    elif n_perm > 0:
        warnings.warn('The specified number of permutations is %d and '
                      'the number of jobs to be performed in parallel was '
                      'set to %s. This is incompatible, so only %d jobs will '
                      'be running. You may want to perform more permutations '
                      'in order to make the most of the available computing '
                      'resources.' % (n_perm, n_jobs, n_perm))
        n_perm_chunks = np.ones(n_perm, dtype=int)
    else:  # 0 or negative number of permutations => original data scores only
        if two_sided_test:
            scores_original_data = (scores_original_data *
                                    sign_scores_original_data)
        return np.asarray([]), scores_original_data, np.asarray([])
    # actual permutations, seeded from a random integer between 0 and maximum
    # value represented by np.int32 (to have a large entropy).
    ret = joblib.Parallel(n_jobs=n_jobs, verbose=verbose)(
        joblib.delayed(_permuted_ols_on_chunk)(
            scores_original_data,
            testedvars_resid_covars,
            targetvars_resid_covars.T,
            covars_orthonormalized,
            n_perm_chunk=n_perm_chunk,
            intercept_test=intercept_test,
            two_sided_test=two_sided_test,
            random_state=rng.random_integers(np.iinfo(np.int32).max))
        for n_perm_chunk in n_perm_chunks)
    # reduce results
    scores_as_ranks_parts, h0_fmax_parts = zip(*ret)
    h0_fmax = np.hstack((h0_fmax_parts))
    scores_as_ranks = np.zeros((n_regressors, n_descriptors))
    for scores_as_ranks_part in scores_as_ranks_parts:
        scores_as_ranks += scores_as_ranks_part
    # convert ranks into p-values
    pvals = (n_perm + 1 - scores_as_ranks) / float(1 + n_perm)

    # put back sign on scores if it was removed in the case of a two-sided test
    # (useful to distinguish between positive and negative effects)
    if two_sided_test:
        scores_original_data = scores_original_data * sign_scores_original_data

    return -np.log10(pvals), scores_original_data.T, h0_fmax[0]
Example #33
directory = 'CES_RESULTS'
subdirectory = 'ORDER'
assert exists(project_path)
dirnames = sorted(filter(isdir, glob('%s/weka.classifiers.*' % project_path)))

# load and parse project properties
p = load_properties(project_path)
seeds = int(p['seeds'])
metric = p['metric']
RULE = p['RULE']
use_cluster = p['useCluster'] in ['Y', 'y', 'yes', 'true', 'True']
start_state = '1'  #initialize ensemble with top model
max_num_clsf = len(dirnames) * seeds
sizes = range(1, max_num_clsf + 1)

if not exists('%s/%s/' % (project_path, directory)):
    makedirs('%s/%s/' % (project_path, directory))

for o in range(seeds):
    if not exists("%s/%s/%s%i" % (project_path, directory, subdirectory, o)):
        makedirs("%s/%s/%s%i" % (project_path, directory, subdirectory, o))

all_parameters = list(
    product([code_dir], [project_path], sizes, range(seeds), [RULE],
            [start_state], [metric]))
Parallel(n_jobs=get_num_cores(), verbose=50)(delayed(CES_ens)(parameters)
                                             for parameters in all_parameters)

print "\nDone!"
Example #34
def run_glm(Y, X, noise_model='ar1', bins=100, n_jobs=1, verbose=0):
    """ GLM fit for an fMRI data matrix

    Parameters
    ----------
    Y : array of shape (n_time_points, n_voxels)
        The fMRI data.

    X : array of shape (n_time_points, n_regressors)
        The design matrix.

    noise_model : {'ar1', 'ols'}, optional
        The temporal variance model. Defaults to 'ar1'.

    bins : int, optional
        Maximum number of discrete bins for the AR(1) coef histogram.

    n_jobs : int, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : int, optional
        The verbosity level. Default is 0.

    Returns
    -------
    labels : array of shape (n_voxels,),
        A map of values on voxels used to identify the corresponding model.

    results : dict,
        Keys correspond to the different labels values
        values are RegressionResults instances corresponding to the voxels.

    """
    acceptable_noise_models = ['ar1', 'ols']
    if noise_model not in acceptable_noise_models:
        raise ValueError(
            "Acceptable noise models are {0}. You provided 'noise_model={1}'".\
                format(acceptable_noise_models, noise_model))

    if Y.shape[0] != X.shape[0]:
        raise ValueError(
            'The number of rows of Y should match the number of rows of X.'
            ' You provided X with shape {0} and Y with shape {1}'.\
                format(X.shape, Y.shape))

    # Create the model
    ols_result = OLSModel(X).fit(Y)

    if noise_model == 'ar1':
        # compute and discretize the AR1 coefs
        ar1 = ((ols_result.resid[1:] * ols_result.resid[:-1]).sum(axis=0) /
               (ols_result.resid ** 2).sum(axis=0))
        del ols_result
        ar1 = (ar1 * bins).astype(np.int) * 1. / bins
        # Fit the AR model according to current AR(1) estimates
        results = {}
        labels = ar1
        # Parallelize by creating a job per ARModel
        vals = np.unique(ar1)
        ar_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_ar_model_fit)(X, val, Y[:, labels == val]) for val in vals)
        for val, result in zip(vals, ar_result):
            results[val] = result
        del vals
        del ar_result

    else:
        labels = np.zeros(Y.shape[1])
        results = {0.0: ols_result}

    return labels, results
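A hedged usage sketch for run_glm as defined above, on random data (it assumes OLSModel and _ar_model_fit from the surrounding module are importable):

import numpy as np

n_scans, n_voxels, n_regressors = 100, 50, 3
rng = np.random.RandomState(0)
X = rng.randn(n_scans, n_regressors)
Y = rng.randn(n_scans, n_voxels)

labels, results = run_glm(Y, X, noise_model='ar1', bins=10, n_jobs=1)
# labels[i] is the discretized AR(1) coefficient of voxel i; results maps
# each unique coefficient value to the RegressionResults of its voxels.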
Exemplo n.º 35
0
    def _fit(self, X, y, sample_weight, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y, sample_weight = check_arrays(X,
                                           y,
                                           sample_weight,
                                           allow_lists=True,
                                           sparse_format='csr')

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
            y = np.asarray(y)

        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print(
                    "Fitting {0} folds for each of {1} candidates, totalling"
                    " {2} fits".format(len(cv), n_candidates,
                                       n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        # first fit at each grid point using the maximum n_estimators
        param_grid = self.param_grid.copy()
        param_grid['n_estimators'] = [self.max_n_estimators]
        grid = ParameterGrid(param_grid)

        pre_dispatch = self.pre_dispatch

        clfs = Parallel(n_jobs=self.n_jobs,
                        verbose=self.verbose,
                        pre_dispatch=pre_dispatch)(delayed(fit_grid_point)(
                            base_estimator, clf_params, X, y, sample_weight,
                            train, test, self.verbose, **self.fit_params)
                                                   for clf_params in grid
                                                   for train, test in cv)

        # now use the already fitted ensembles but truncate to N estimators
        # for N from 1 to n_estimators_max - 1 (inclusive)
        out = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(score_each_boost)
                (clf, clf_params, self.min_n_estimators, X, y, sample_weight,
                 self.score_func, train, test, self.verbose)
                for clf, clf_params, train, test in clfs)

        out = reduce(operator.add, [zip(*stage) for stage in out])
        # out is now a list of triplets: (score, estimator_params, n_test_samples)

        n_estimators_points = self.max_n_estimators - self.min_n_estimators + 1
        n_fits = len(out)
        n_folds = len(cv)

        grid_scores = list()
        for block in range(0, n_fits, n_folds * n_estimators_points):
            for grid_start in range(block, block + n_estimators_points):
                n_test_samples = 0
                score = 0
                all_scores = list()
                for this_score, parameters, this_n_test_samples in \
                        out[grid_start:
                            grid_start + n_folds * n_estimators_points:
                            n_estimators_points]:
                    all_scores.append(this_score)
                    if self.iid:
                        this_score *= this_n_test_samples
                    score += this_score
                    n_test_samples += this_n_test_samples
                if self.iid:
                    score /= float(n_test_samples)
                else:
                    score /= float(n_folds)
                grid_scores.append(
                    _CVScoreTuple(parameters, score, np.array(all_scores)))

        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            fit_params = self.fit_params
            if sample_weight is not None:
                fit_params = fit_params.copy()
                fit_params['sample_weight'] = sample_weight
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.best_estimator_ = best_estimator
        return self
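The fold-aggregation rule used in the loop above can be illustrated with toy numbers: with iid=True each fold's score is weighted by its number of test samples, otherwise a plain mean over folds is taken.

fold_scores = [0.80, 0.90, 0.70]
fold_sizes = [50, 30, 20]  # test samples per fold

# iid=True: weight each fold by its test-set size
iid_score = sum(s * n for s, n in zip(fold_scores, fold_sizes)) \
    / float(sum(fold_sizes))
# iid=False: plain mean over folds
plain_score = sum(fold_scores) / float(len(fold_scores))
# iid_score == 0.81, plain_score == 0.80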
Exemplo n.º 36
0
    def fit(self, X, y):
        """Fit estimators from the training set (X, y).

        Returns
        -------
        self : object
            Returns self.
        """

        if not isinstance(X, dict):
            raise ValueError("X has to be a dict")

        self.classes_ = np.unique(y)

        estimators = dict()
        for modality, Xm in X.items():
            for roi_id, x in Xm.items():
                estimator = clone(self.base_estimators[modality])
                estimator.id = (modality, roi_id)
                estimators[estimator.id] = estimator

        y_pred = {k: np.full(len(y), np.nan) for k in estimators.keys()}
        t0 = time.time()
        print('Start [1]')
        # LeaveOneOut() alone is not iterable; split over the sample indices
        for f, (train_index, test_index) in enumerate(
                LeaveOneOut().split(np.zeros(len(y)))):
            y_train = [y[i] for i in train_index]

            estimators_fit = Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose,
                backend="threading")(delayed(_parallel_build_estimator)(
                    e, np.array([X[id[0]][id[1]][i]
                                 for i in train_index]), y_train)
                                     for id, e in estimators.items())
            estimators_fit = {e.id: e for e in estimators_fit}
            # for roi_id, e in estimators_fit.items():
            #     e.predict([X[roi_id][i] for i in test_index])
            y_pred_ = Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")(
                    delayed(_vote)(e, [X[id[0]][id[1]][i]
                                       for i in test_index], False)
                    for id, e in estimators_fit.items())
            for i, id in enumerate(estimators.keys()):
                y_pred[id][test_index] = y_pred_[i]
        print('[1] Elapsed time: %.2f secs' % (time.time() - t0))

        for i, id in enumerate(estimators.keys()):
            self.priors[(self.classes_[0], self.classes_[0])] = np.mean(
                y_pred[id][y == self.classes_[0]] == self.classes_[0])
            self.priors[(self.classes_[1],
                         self.classes_[0])] = 1 - self.priors[
                             (self.classes_[0], self.classes_[0])]
            self.priors[(self.classes_[1], self.classes_[1])] = np.mean(
                y_pred[id][y == self.classes_[1]] == self.classes_[1])
            self.priors[(self.classes_[0],
                         self.classes_[1])] = 1 - self.priors[
                             (self.classes_[1], self.classes_[1])]

        t0 = time.time()
        estimators = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")(
                delayed(_parallel_build_estimator)(e, X[id[0]][id[1]], y)
                for id, e in estimators.items())
        print('[2] Elapsed time: %.2f secs' % (time.time() - t0))

        self.estimators_ = {e.id: e for e in estimators}

        return self
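A minimal sketch (hypothetical helper) of the prior table built above from the cross-validated predictions: priors[(predicted, true)] estimates P(prediction = predicted | label = true) for a binary problem.

import numpy as np


def estimate_priors(y_true, y_pred, classes):
    c0, c1 = classes
    priors = {}
    priors[(c0, c0)] = np.mean(y_pred[y_true == c0] == c0)
    priors[(c1, c0)] = 1 - priors[(c0, c0)]
    priors[(c1, c1)] = np.mean(y_pred[y_true == c1] == c1)
    priors[(c0, c1)] = 1 - priors[(c1, c1)]
    return priors


y_true = np.array([0, 0, 1, 1])
y_pred = np.array([0, 1, 1, 1])
print(estimate_priors(y_true, y_pred, (0, 1)))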
Exemplo n.º 37
0
    def forward(self, features):
        regs = 0
        self.weights = self._get_weights(self.log_alpha)

        self.revised_arch_index()
        if self.args.early_fix_arch:
            if len(self.fix_arch_index.keys()) > 0:
                for key, value_lst in self.fix_arch_index.items():
                    self.weights[key, :].zero_()
                    self.weights[key, value_lst[0]] = 1

        cate_prob = F.softmax(self.log_alpha, dim=-1)
        self.cate_prob = cate_prob.clone().detach()
        loss_alpha = torch.log(
            (self.weights * F.softmax(self.log_alpha, dim=-1)).sum(-1)).sum()
        self.weights.requires_grad_()

        inferences = 0
        max_index = self.weights.argmax().item()
        cur_weights = self.weights
        cur_index = 0

        from joblib import Parallel, delayed  # sklearn.externals.joblib is removed in recent sklearn
        names_all = []
        for name1 in self.columns:
            for name2 in self.columns:
                if self.args.multi_operation:
                    cur_weights = self.weights[cur_index]
                    max_index = cur_weights.argmax().item()
                    cur_index += 1
                if self.args.ofm:
                    name1_embedding = self.embedding_all[name1][max_index](
                        features[name1])
                    name2_embedding = self.embedding_all[name2][max_index](
                        features[name2])
                else:
                    name1_embedding = self.embedding_all[name1](
                        features[name1])
                    name2_embedding = self.embedding_all[name2](
                        features[name2])
                names_all.append([
                    name1_embedding, name2_embedding,
                    cur_weights.view(-1, ), self.FC[name1 + ":" + name2]
                ])
        res = Parallel(n_jobs=8, backend="threading")(
            delayed(MixedBinary)(para1, para2, para3, para4)
            for para1, para2, para3, para4 in names_all)
        inferences = sum(res)
        # for name1 in self.columns:
        #     for name2 in self.columns:
        #         if self.args.multi_operation:
        #             cur_weights = self.weights[cur_index]
        #             max_index = cur_weights.argmax().item()
        #             cur_index += 1
        #         if self.args.ofm:
        #             name1_embedding = self.embedding_all[name1][max_index](features[name1])
        #             name2_embedding = self.embedding_all[name2][max_index](features[name2])
        #         else:
        #             name1_embedding = self.embedding_all[name1](features[name1])
        #             name2_embedding = self.embedding_all[name2](features[name2])
        #         regs += self.reg * (torch.norm(name1_embedding) + torch.norm(name2_embedding))
        #         name1_embedding_trans = self.mlp_p(name1_embedding.view(-1, 1)).view(name1_embedding.size())
        #         name2_embedding_trans = self.mlp_p(name2_embedding.view(-1, 1)).view(name2_embedding.size())
        #         inferences += MixedBinary(name1_embedding_trans, name2_embedding_trans, cur_weights.view(-1,), self.FC[name1 + ":" + name2])
        loss = (inferences - features["label"])**2
        weighted_loss = torch.mean(
            torch.sum(torch.mul(features["pos_weights"], loss), dim=1))
        self.weights.grad = torch.zeros_like(self.weights)
        (weighted_loss + loss_alpha).backward()
        self.block_reward = self.weights.grad.data.sum(-1)
        self.log_alpha.grad.data.mul_(self.block_reward.view(-1, 1))

        return inferences, weighted_loss, loss_alpha
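A hedged aside on the backend="threading" choice above: threads share the process memory, so torch tensors are passed by reference instead of being pickled into worker processes. A minimal sketch:

import torch
from joblib import Parallel, delayed


def weighted_dot(a, b, w):
    # tensors are shared across threads, not copied
    return w * (a * b).sum()


pairs = [(torch.randn(4), torch.randn(4), float(i)) for i in range(8)]
res = Parallel(n_jobs=4, backend="threading")(
    delayed(weighted_dot)(a, b, w) for a, b, w in pairs)
total = sum(res)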
Exemplo n.º 38
0
    mapper = DataFrameMapper([([nv], preprocessing.StandardScaler())
                              for nv in metric_cols])

    if (dn == 0) or (n_draws is not None):
        # Norm columns for variance estimation
        variance_mapper = DataFrameMapper([
            ([nv], preprocessing.StandardScaler())
            for nv in (list(metric_cols) + ['interview_age'])
        ])
        var_df = raw_df.copy(deep=True)
        var_df.loc[:, list(metric_cols) +
                   ['interview_age']] = variance_mapper.fit_transform(raw_df)

        print("Estimate variance contributions for each metric", flush=True)
        var_res = joblib.Parallel(n_jobs=n_jobs)(joblib.delayed(
            run_variance_metric_perm)(pn, perms[pn], metric, var_df)
                                                 for metric in metric_cols)
    else:
        var_res = None

    #var_res = run_variance_perm(pn, perms[pn], raw_df, metric_cols)

    # set up perm
    raw_df.loc[:, 'deviceserialnumber'] = \
        raw_df.loc[perms[pn], 'deviceserialnumber'].values

    # draw samples
    strata = ['deviceserialnumber']
    balance = ['gender', 'ehi_y_ss_scoreb']
    order = ['interview_age']
Exemplo n.º 39
0
    #     paradigm=paradigm, frametimes=frametimes,
    #     drift_model=drift_model, hrf_model=hrf_model)
    # ProgressReport().finish_dir(subject_output_dir)

    return dict(subject_id=subject_id,
                mask=mask_path,
                effects_maps=effects_maps,
                z_maps=z_maps,
                contrasts=contrasts)


# first level GLM
mem = Memory(os.path.join(output_dir, "cache_dir"))
n_jobs = min(n_jobs, len(subject_ids))
first_levels = Parallel(n_jobs=n_jobs)(
    delayed(mem.cache(do_subject_glm))(subject_id)
    for subject_id in subject_ids)

# run second-level GLM
group_zmaps = group_one_sample_t_test(
    [subject_data["mask"] for subject_data in first_levels],
    [subject_data["effects_maps"] for subject_data in first_levels],
    first_levels[0]["contrasts"],
    output_dir,
    threshold=2.)
plot_prob_atlas([zmap for zmap in group_zmaps.values() if "_minus_" in zmap],
                threshold=1.2,
                view_type="filled_contours")
plt.savefig("group_zmaps.png")
show()
Exemplo n.º 40
0
    def fit(self, subjects, y=None):
        """Compute cross-validated group-sparse precisions.

        Parameters
        ----------
        subjects : list of numpy.ndarray with shapes (n_samples, n_features)
            input subjects. Each subject is a 2D array, whose columns contain
            signals. Sample number can vary from subject to subject, but all
            subjects must have the same number of features (i.e. of columns.)

        Attributes
        ----------
        covariances_ : numpy.ndarray, shape (n_features, n_features, n_subjects)
            covariance matrices, one per subject.

        precisions_ : numpy.ndarray, shape (n_features, n_features, n_subjects)
            precision matrices, one per subject. All matrices have the same
            sparsity pattern (if a coefficient is zero for a given matrix, it
            is also zero for every other.)

        alpha_ : float
            selected value for penalization parameter.

        cv_alphas_ : list of float
            all penalization parameter values explored.

        cv_scores_ : numpy.ndarray with shape (n_alphas, n_folds)
            scores obtained on test set for each value of the penalization
            parameter explored.

        Returns
        -------
        self: GroupSparseCovarianceCV
            the object instance itself.
        """
        # Empirical covariances
        emp_covs, n_samples = \
                  empirical_covariances(subjects, assume_centered=False)
        n_subjects = emp_covs.shape[2]

        # One cv generator per subject must be created, because each subject
        # can have a different number of samples from the others.
        cv = []
        for k in range(n_subjects):
            cv.append(
                sklearn.cross_validation.check_cv(self.cv,
                                                  subjects[k],
                                                  None,
                                                  classifier=False))

        path = list()  # List of (alpha, scores, covs)
        n_alphas = self.alphas

        if isinstance(n_alphas, collections.Sequence):
            alphas = list(self.alphas)
            n_alphas = len(alphas)
            n_refinements = 1
        else:
            n_refinements = self.n_refinements
            alpha_1, _ = compute_alpha_max(emp_covs, n_samples)
            alpha_0 = 1e-2 * alpha_1
            alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1),
                                 n_alphas)[::-1]

        covs_init = itertools.repeat(None)
        for i in range(n_refinements):
            # Compute the cross-validated loss on the current grid
            train_test_subjs = []
            for train_test in zip(*cv):
                assert (len(train_test) == n_subjects)
                train_test_subjs.append(
                    zip(*[(subject[train, :], subject[test, :])
                          for subject, (train,
                                        test) in zip(subjects, train_test)]))
            if self.early_stopping:
                probes = [
                    EarlyStopProbe(test_subjs, verbose=self.verbose)
                    for _, test_subjs in train_test_subjs
                ]
            else:
                probes = itertools.repeat(None)

            this_path = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(group_sparse_covariance_path)(
                    train_subjs,
                    alphas,
                    test_subjs=test_subjs,
                    max_iter=self.max_iter_cv,
                    tol=self.tol_cv,
                    verbose=self.verbose,
                    debug=self.debug,
                    # Warm restart is useless with early stopping.
                    precisions_init=None if self.early_stopping else prec_init,
                    probe_function=probe)
                for (train_subjs, test_subjs), prec_init, probe in zip(
                    train_test_subjs, covs_init, probes))

            # this_path[i] is a tuple (precisions_list, scores)
            # - scores: scores obtained with the i-th folding, for each value
            #   of alpha.
            # - precisions_list: corresponding precisions matrices, for each
            #   value of alpha.
            precisions_list, scores = zip(*this_path)
            # now scores[i][j] is the score for the i-th folding, j-th value of
            # alpha (analogous for precisions_list)
            precisions_list = zip(*precisions_list)
            scores = [np.mean(sc) for sc in zip(*scores)]
            # scores[i] is the mean score obtained for the i-th value of alpha.

            path.extend(zip(alphas, scores, precisions_list))
            path = sorted(path, key=operator.itemgetter(0), reverse=True)

            # Find the maximum score (avoid using the built-in 'max' function
            # to have a fully-reproducible selection of the smallest alpha in
            # case of equality)
            best_score = -np.inf
            last_finite_idx = 0
            for index, (alpha, this_score, _) in enumerate(path):
                if this_score >= .1 / np.finfo(np.float).eps:
                    this_score = np.nan
                if np.isfinite(this_score):
                    last_finite_idx = index
                if this_score >= best_score:
                    best_score = this_score
                    best_index = index

            # Refine the grid
            if best_index == 0:
                # We do not need to go back: we have chosen
                # the highest value of alpha for which there are
                # non-zero coefficients
                alpha_1 = path[0][0]
                alpha_0 = path[1][0]
                covs_init = path[0][2]
            elif (best_index == last_finite_idx
                  and not best_index == len(path) - 1):
                # We have non-converged models on the upper bound of the
                # grid, we need to refine the grid there
                alpha_1 = path[best_index][0]
                alpha_0 = path[best_index + 1][0]
                covs_init = path[best_index][2]
            elif best_index == len(path) - 1:
                alpha_1 = path[best_index][0]
                alpha_0 = 0.01 * path[best_index][0]
                covs_init = path[best_index][2]
            else:
                alpha_1 = path[best_index - 1][0]
                alpha_0 = path[best_index + 1][0]
                covs_init = path[best_index - 1][2]
            alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0),
                                 len(alphas) + 2)
            alphas = alphas[1:-1]
            if n_refinements > 1:
                logger.log("[GroupSparseCovarianceCV] Done refinement "
                           "% 2i out of %i" % (i + 1, n_refinements),
                           verbose=self.verbose)

        path = list(zip(*path))
        cv_scores_ = list(path[1])
        alphas = list(path[0])

        self.cv_scores_ = np.array(cv_scores_)
        self.alpha_ = alphas[best_index]
        self.cv_alphas_ = alphas

        # Finally, fit the model with the selected alpha
        logger.log("Final optimization", verbose=self.verbose)
        self.covariances_ = emp_covs
        self.precisions_ = _group_sparse_covariance(emp_covs,
                                                    n_samples,
                                                    self.alpha_,
                                                    tol=self.tol,
                                                    max_iter=self.max_iter,
                                                    verbose=self.verbose,
                                                    debug=self.debug)
        return self
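A hypothetical usage sketch for the estimator whose fit() is shown above; the signature matches nilearn's GroupSparseCovarianceCV, and the import path is an assumption:

import numpy as np
from nilearn.connectome import GroupSparseCovarianceCV  # assumed import path

rng = np.random.RandomState(0)
# three subjects with varying sample counts, same number of features
subjects = [rng.randn(n, 10) for n in (40, 50, 60)]

gsc = GroupSparseCovarianceCV(alphas=4, n_refinements=2, n_jobs=1, verbose=0)
gsc.fit(subjects)
print(gsc.alpha_)              # selected penalization parameter
print(gsc.precisions_.shape)   # (10, 10, 3), shared sparsity pattern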
Exemplo n.º 41
0
    if not os.path.exists('/'.join(
        [bird_db_loc, species, subject_id, 'TextGrids'])):
        os.makedirs('/'.join([bird_db_loc, species, subject_id, 'TextGrids']))

    # save wav
    if not os.path.exists(wav_location):
        try:
            urllib.request.urlretrieve(wav, wav_location)
        except HTTPError:
            print('Could not retrieve ' + wav)
    # save textgrid
    if not os.path.exists(grid_location):
        try:
            urllib.request.urlretrieve(
                'http://taylor0.biology.ucla.edu/birdDBQuery/Files/' +
                text_grid, grid_location)
        except HTTPError:
            print('Could not retrieve ' +
                  'http://taylor0.biology.ucla.edu/birdDBQuery/Files/' +
                  text_grid)


if parallel:
    with Parallel(n_jobs=n_jobs, verbose=verbosity) as parallel:
        parallel(
            delayed(downloadBirdDB)(row)
            for idx, row in tqdm(song_db.iterrows(), total=len(song_db)))
else:
    for idx, row in tqdm(song_db.iterrows(), total=len(song_db)):
        downloadBirdDB(row)
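A hedged side note on the `with Parallel(...) as parallel` form used above: it keeps one worker pool alive so the same pool can be reused across several dispatches, e.g.:

from math import sqrt

from joblib import Parallel, delayed

with Parallel(n_jobs=2) as parallel:
    # both dispatches reuse the same worker pool
    squares = parallel(delayed(pow)(i, 2) for i in range(4))
    roots = parallel(delayed(sqrt)(i) for i in range(4))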
Exemplo n.º 42
0
    def fit(self, X, y, **fit_params):
        ''' determine the factor levels, build the fits '''
        #--- determine groups
        Xg = self.cat_trans.fit_transform(X)
        assert (Xg.shape[1] == 1)
        self.varname = self.cat_trans.get_feature_names()[0]

        levels, counts = np.unique(Xg.iloc[:, 0], return_counts=True)
        idx = np.array(list(reversed(np.argsort(counts))))
        levels, counts, coverage = [
            np.take(x, idx) for x in [levels, counts, counts / np.sum(counts)]
        ]

        #--- decide which levels to take
        # regular levels with enough coverage: one group/subpipe per level
        self.levels_ = []
        # levels with too little coverage, all projected onto the last
        # group/subpipe
        self.default_levels_ = []
        # coverage of each group/subpipe; the last entry is for the
        # default levels if they exist
        self.coverage_ = []
        for l, c in zip(levels, coverage):
            if len(self.levels_) < self.max_levels and c >= self.min_coverage:
                self.levels_.append(l)
                self.coverage_.append(c)
            else:
                self.default_levels_.append(l)

        #--- insert the default key if necessary
        if len(self.levels_) < len(levels):
            self.default_key_ = self.default_name
        else:
            self.default_key_ = None

        if self.default_key_ is not None:
            self.levels_.append(self.default_name)
            self.coverage_.append(1. - sum(self.coverage_))

        logger.trace('grouping')
        #--- translate labels to group_indexes
        self.lg_dict = {l: g for g, l in enumerate(self.levels_)}

        def xghelper(v):
            res = self.lg_dict.get(v)
            if res is not None:
                return res
            if v in self.default_levels_:
                return self.lg_dict.get(self.default_key_)
            raise Exception(
                "Unknown level '%s' encountered for variable '%s', and no default enabled"
                % (v, self.varname))

        xgroups = Xg.iloc[:, 0].apply(xghelper).values

        logger.trace("pre")
        #--- compute the pre_pipe result and split up into groups
        Xt = self.pre_trans.fit_transform(X, y)
        if not self.take_pre_only:
            if isinstance(Xt, pd.SparseDataFrame):
                Xt = Xt.to_dense()
            Xt = pd.concat([X, Xt], axis=1)
        if self.propagate_disc_labels:
            self.level_encoder_ = OneHotTransformer(
                sparse_output=False).fit(Xg)
            Xgt = self.level_encoder_.transform(Xg)
            #from sklearn.preprocessing import LabelEncoder
            #self.level_encoder_ = LabelEncoder().
            #self.level_encoder_.classes_ = np.array(levels)
            #Xgt = Xg.apply(self.level_encoder_.transform, axis=1)
            Xt = pd.concat([Xt, Xgt], axis=1)

        Xtgroups = {gk: df for gk, df in Xt.groupby(xgroups)}
        ygroups = {gk: df for gk, df in y.groupby(xgroups)}

        logger.trace("segment fit")
        #--- create pipes and fit them for every group

        self.sub_pipes_ = [copy.deepcopy(self.sub_pipe) for l in self.levels_]
        pls = Parallel(n_jobs=self.n_jobs)(delayed(_fit_one_fittable)(
            self.sub_pipes_[gk], Xtgroups[gk], ygroups[gk])
                                           for gk in Xtgroups.keys())
        self.sub_pipes_ = pls

        self.coverage_ = np.array(
            [df.shape[0] / X.shape[0] for gk, df in Xtgroups.items()])
        return self
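A minimal sketch (hypothetical names) of the per-group fit above: one deep-copied estimator per level, each fitted in parallel on that level's rows only.

import copy

import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import LinearRegression


def _fit_one(est, X, y):
    est.fit(X, y)
    return est


rng = np.random.RandomState(0)
X, y = rng.randn(30, 2), rng.randn(30)
groups = rng.randint(0, 3, size=30)

base = LinearRegression()
fitted = Parallel(n_jobs=2)(
    delayed(_fit_one)(copy.deepcopy(base), X[groups == g], y[groups == g])
    for g in np.unique(groups))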
Exemplo n.º 43
0
def k_means(X,
            n_clusters,
            init='k-means++',
            precompute_distances='auto',
            n_init=10,
            max_iter=300,
            verbose=False,
            tol=1e-4,
            random_state=None,
            copy_x=True,
            n_jobs=1,
            return_n_iter=False):
    """K-means clustering algorithm.
    Read more in the :ref:`User Guide <k_means>`.
    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    n_init : int, optional, default: 10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.
    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).
        'auto' : do not precompute distances if n_samples * n_clusters > 12
        million. This corresponds to about 100MB overhead per job using
        double precision.
        True : always precompute distances
        False : never precompute distances
    tol : float, optional
        The relative increment in the results before declaring convergence.
    verbose : boolean, optional
        Verbosity mode.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    copy_x : boolean, optional
        When pre-computing distances it is more numerically accurate to center
        the data first.  If copy_x is True, then the original data is not
        modified.  If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be introduced
        by subtracting and then adding the data mean.
    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.
    return_n_iter : bool, optional
        Whether or not to return the number of iterations.
    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).
    best_n_iter: int
        Number of iterations corresponding to the best results.
        Returned only if `return_n_iter` is set to True.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    best_inertia = np.infty
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    # If the distances are precomputed every job will create a matrix of shape
    # (n_clusters, n_samples). To stop KMeans from eating up memory we only
    # activate this if the created matrix is guaranteed to be under 100MB. 12
    # million entries consume a little under 100MB if they are of type double.
    if precompute_distances == 'auto':
        n_samples = X.shape[0]
        precompute_distances = (n_clusters * n_samples) < 12e6
    elif isinstance(precompute_distances, bool):
        pass
    else:
        raise ValueError("precompute_distances should be 'auto' or True/False"
                         ", but a value of %r was passed" %
                         precompute_distances)

    # subtract mean of X for more accurate distance computations
    if not sp.issparse(X) or hasattr(init, '__array__'):
        X_mean = X.mean(axis=0)
    if not sp.issparse(X):
        # The copy was already done above
        X -= X_mean

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=np.float64, copy=True)
        _validate_center_shape(X, n_clusters, init)

        init -= X_mean
        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d' %
                n_init,
                RuntimeWarning,
                stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None
    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _kmeans_single(
                X,
                n_clusters,
                max_iter=max_iter,
                init=init,
                verbose=verbose,
                precompute_distances=precompute_distances,
                tol=tol,
                x_squared_norms=x_squared_norms,
                random_state=random_state)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_kmeans_single)(
                X,
                n_clusters,
                max_iter=max_iter,
                init=init,
                verbose=verbose,
                tol=tol,
                precompute_distances=precompute_distances,
                x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
                random_state=seed) for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if not sp.issparse(X):
        if not copy_x:
            X += X_mean
        best_centers += X_mean

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
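A hypothetical usage sketch for k_means as defined above; with n_jobs=2 the n_init restarts run as separate parallel jobs (this assumes the module-level helpers such as _kmeans_single and _tolerance are available):

import numpy as np

rng = np.random.RandomState(0)
# two well-separated blobs
X = np.vstack([rng.randn(50, 2) + 5, rng.randn(50, 2) - 5])

centers, labels, inertia = k_means(X, n_clusters=2, n_init=4,
                                   random_state=0, n_jobs=2)
print(centers.shape)  # (2, 2)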
Exemplo n.º 44
0
    return all_args


def run_grid(paths):
    for path in paths:
        os.chdir(path)
        os.system("grid -i grid.in")


def run_dock6(paths):
    for path in paths:
        os.chdir(path)
        os.system("dock6 -i anchor_grow_dock.in")


n_jobs = 4
total_n_paths = 500
base_path = "/Users/tud51931/projects/murA/MurA-dock-MSMs"
#rec_path = "/Users/tud51931/projects/murA/MurA-MSM-mol2"

#setup_working_dirs(base_path)

indices_args = gen_paths(n_jobs, total_n_paths, base_path)
#print indices_args
#print len(indices_args)

Parallel(n_jobs=n_jobs,
         verbose=True)(delayed(run_grid)(indices) for indices in indices_args)
Parallel(n_jobs=n_jobs,
         verbose=True)(delayed(run_dock6)(indices) for indices in indices_args)
Exemplo n.º 45
0
def monkeypatch_fit(self, X, y=None, groups=None, **fit_params):
    if self.fit_params is not None:
        warnings.warn('"fit_params" as a constructor argument was '
                      'deprecated in version 0.19 and will be removed '
                      'in version 0.21. Pass fit parameters to the '
                      '"fit" method instead.', DeprecationWarning)
        if fit_params:
            warnings.warn('Ignoring fit_params passed as a constructor '
                          'argument in favor of keyword arguments to '
                          'the "fit" method.', RuntimeWarning)
        else:
            fit_params = self.fit_params
    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

    scorers, self.multimetric_ = _check_multimetric_scoring(
        self.estimator, scoring=self.scoring)

    if self.multimetric_:
        if self.refit is not False and (
                not isinstance(self.refit, six.string_types) or
                # This will work for both dict / list (tuple)
                self.refit not in scorers):
            raise ValueError("For multi-metric scoring, the parameter "
                             "refit must be set to a scorer key "
                             "to refit an estimator with the best "
                             "parameter setting on the whole data and "
                             "make the best_* attributes "
                             "available for that metric. If this is not "
                             "needed, refit should be set to False "
                             "explicitly. %r was passed." % self.refit)
        else:
            refit_metric = self.refit
    else:
        refit_metric = 'score'

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)
    # Regenerate parameter iterable for each fit
    candidate_params = list(self._get_param_iterator())
    n_candidates = len(candidate_params)
    if self.verbose > 0:
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    # ===================================================================
    # BEGIN MONKEYPATCH MODIFICATION
    # ===================================================================

    parallel_cv = cv.split(X, y, groups)

    if isinstance(self.pipeline_split_idx, int) and isinstance(base_estimator,
                                                               Pipeline):
        split_idx = self.pipeline_split_idx

        pre_pipe_steps = base_estimator.steps[:split_idx]
        new_pipe_steps = base_estimator.steps[split_idx:]
        memory = base_estimator.memory

        pre_pipe = Pipeline(pre_pipe_steps, memory)

        if len(new_pipe_steps) == 1:
            est_name, base_estimator = new_pipe_steps[0]
        else:
            est_name = None
            base_estimator = Pipeline(new_pipe_steps, memory)

        fit_params_pre_pipe = {}
        steps_pre_pipe = [tup[0] for tup in pre_pipe_steps]
        fit_param_keys = fit_params.keys()

        for pname in fit_param_keys:
            step, param = pname.split('__', 1)

            if step in steps_pre_pipe:
                fit_params_pre_pipe[pname] = fit_params.pop(pname)
            elif step == est_name:
                fit_params[param] = fit_params.pop(pname)

        if est_name is not None:
            for dic in candidate_params:
                for k in dic:
                    step, param = k.split('__', 1)

                    if step == est_name:
                        dic.update({param: dic.pop(k)})

        try:
            X = pre_pipe.fit_transform(X, **fit_params_pre_pipe)
        except TypeError:
            raise RuntimeError('Pipeline before pipeline_split_idx requires '
                               'fitting to y. Please initialize with an '
                               'earlier index.')

    if self.transform_before_grid and isinstance(base_estimator, Pipeline):
        pipe = base_estimator
        est_name, base_estimator = pipe.steps.pop()
        X_cv, y_cv, parallel_cv = [], [], []
        sample_count = 0

        fit_params_est = {}
        fit_param_keys = fit_params.keys()

        for pname in fit_param_keys:
            step, param = pname.split('__', 1)
            if step == est_name:
                fit_params_est[param] = fit_params.pop(pname)

        for dic in candidate_params:
            for k in dic:
                step, param = k.split('__', 1)

                if step == est_name:
                    dic.update({param: dic.pop(k)})

        for (train, test) in cv.split(X, y, groups):
            if y is not None:
                if isinstance(X, pd.DataFrame):
                    pipe.fit(X.iloc[train], y.iloc[train], **fit_params)
                else:
                    pipe.fit(X[train], y[train], **fit_params)
                y_cv.append(y)
            else:
                if isinstance(X, pd.DataFrame):
                    pipe.fit(X.iloc[train], **fit_params)
                else:
                    pipe.fit(X[train], **fit_params)

            X_cv.append(pipe.transform(X))

            train = train + sample_count
            test = test + sample_count
            sample_count += len(train)
            sample_count += len(test)

            parallel_cv.append((train, test))

        if isinstance(X, pd.DataFrame):
            X = pd.concat(tuple(X_cv))
        else:
            X = np.vstack(tuple(X_cv))

        if y is not None:
            if isinstance(y, pd.Series):
                y = pd.concat(tuple(y_cv))
            else:
                y = np.hstack(tuple(y_cv))

            if 'sample_weight' in fit_params_est:
                samp_weight = fit_params_est['sample_weight']
                fit_params_est['sample_weight'] = np.tile(samp_weight,
                                                          len(y_cv))

        fit_params = fit_params_est

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(delayed(monkeypatch_fit_and_score)
      (clone(base_estimator), X, y, scorers, train,
                              test, self.verbose, parameters,
                              fit_params=fit_params,
                              return_train_score=self.return_train_score,
                              return_n_test_samples=True,
                              return_times=True, return_parameters=False,
                              error_score=self.error_score)
      for parameters, (train, test) in product(candidate_params,
                                               parallel_cv))

    # ===================================================================
    # END MONKEYPATCH MODIFICATION
    # ===================================================================

    # if one chooses to see train scores, "out" will contain train score info
    if self.return_train_score:
        (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
         score_time) = zip(*out)
    else:
        (test_score_dicts, test_sample_counts, fit_time,
         score_time) = zip(*out)

    # test_score_dicts and train_score dicts are lists of dictionaries and
    # we make them into dict of lists
    test_scores = _aggregate_score_dicts(test_score_dicts)
    if self.return_train_score:
        train_scores = _aggregate_score_dicts(train_score_dicts)

    # TODO: replace by a dict in 0.21
    results = (DeprecationDict() if self.return_train_score == 'warn'
               else {})

    def _store(key_name, array, weights=None, splits=False, rank=False):
        """A small helper to store the scores/times to the cv_results_"""
        # When iterated first by splits, then by parameters
        # We want `array` to have `n_candidates` rows and `n_splits` cols.
        array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                          n_splits)
        if splits:
            for split_i in range(n_splits):
                # Uses closure to alter the results
                results["split%d_%s"
                        % (split_i, key_name)] = array[:, split_i]

        array_means = np.average(array, axis=1, weights=weights)
        results['mean_%s' % key_name] = array_means
        # Weighted std is not directly available in numpy
        array_stds = np.sqrt(np.average((array -
                                         array_means[:, np.newaxis]) ** 2,
                                        axis=1, weights=weights))
        results['std_%s' % key_name] = array_stds

        if rank:
            results["rank_%s" % key_name] = np.asarray(
                rankdata(-array_means, method='min'), dtype=np.int32)

    _store('fit_time', fit_time)
    _store('score_time', score_time)
    # Use one MaskedArray and mask all the places where the param is not
    # applicable for that candidate. Use defaultdict as each candidate may
    # not contain all the params
    param_results = defaultdict(partial(MaskedArray,
                                        np.empty(n_candidates,),
                                        mask=True,
                                        dtype=object))
    for cand_i, params in enumerate(candidate_params):
        for name, value in params.items():
            # An all-masked empty array gets created for the key
            # `"param_%s" % name` at the first occurrence of `name`.
            # Setting the value at an index also unmasks that index
            param_results["param_%s" % name][cand_i] = value

    results.update(param_results)
    # Store a list of param dicts at the key 'params'
    results['params'] = candidate_params

    # NOTE test_sample counts (weights) remain the same for all candidates
    test_sample_counts = np.array(test_sample_counts[:n_splits],
                                  dtype=np.int)
    for scorer_name in scorers.keys():
        # Computed the (weighted) mean and std for test scores alone
        _store('test_%s' % scorer_name, test_scores[scorer_name],
               splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            prev_keys = set(results.keys())
            _store('train_%s' % scorer_name, train_scores[scorer_name],
                   splits=True)

            if self.return_train_score == 'warn':
                for key in set(results.keys()) - prev_keys:
                    message = (
                        'You are accessing a training score ({!r}), '
                        'which will not be available by default '
                        'any more in 0.21. If you need training scores, '
                        'please set return_train_score=True').format(key)
                    # warn on key access
                    results.add_warning(key, message, FutureWarning)

    # For multi-metric evaluation, store the best_index_, best_params_ and
    # best_score_ iff refit is one of the scorer names
    # In single metric evaluation, refit_metric is "score"
    if self.refit or not self.multimetric_:
        self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
        self.best_params_ = candidate_params[self.best_index_]
        self.best_score_ = results["mean_test_%s" % refit_metric][
            self.best_index_]

    if self.refit:
        self.best_estimator_ = clone(base_estimator).set_params(
            **self.best_params_)
        if y is not None:
            self.best_estimator_.fit(X, y, **fit_params)
        else:
            self.best_estimator_.fit(X, **fit_params)

    # Store the only scorer not as a dict for single metric evaluation
    self.scorer_ = scorers if self.multimetric_ else scorers['score']

    self.cv_results_ = results
    self.n_splits_ = n_splits

    return self
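A hedged sketch of how such a monkeypatch is typically applied: rebind fit on the class, then set the extra attributes the patched method reads. This assumes the sklearn 0.19-era internals imported by the snippet; pipeline_split_idx and transform_before_grid are attributes introduced by the patch itself, not part of sklearn.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

GridSearchCV.fit = monkeypatch_fit  # rebind the patched method

X, y = make_classification(n_samples=60, random_state=0)
pipe = Pipeline([('sc', StandardScaler()), ('clf', LogisticRegression())])
search = GridSearchCV(pipe, {'clf__C': [0.1, 1.0]}, cv=3)
search.pipeline_split_idx = 1          # attributes the patched fit expects
search.transform_before_grid = False
search.fit(X, y)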
Exemplo n.º 46
0
    def transform(self, X):
        """Apply the encoding to a dataframe.

        This method will encode the features in the test frame with the
        levels discovered in the ``fit`` computation.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``,
            and the result set is returned.
        """
        check_is_fitted(self, 'ohe_')
        X, _ = check_dataframe(X, cols=self.cols)

        # validate that fit cols in test set
        cols = self.fit_cols_
        validate_test_set_columns(cols, X.columns)

        # fit params that we need
        ohe = self.ohe_
        lenc = self.le_
        sep = self.sep
        drop = self.drop_one_level

        # Do transformations in parallel
        transformations = list(Parallel(n_jobs=self.n_jobs)(
            delayed(_le_transform)(
                col=col, vec=X[col].values, le=lenc[col],
                handle=self.handle_unknown, sep=sep)
            for col in cols))

        # This is another O(N) pass, but it's not performing any incremental
        # transformations of any sort. It just traverses the list of affected
        # columns, extending the column order list and tracking the columns to
        # drop. All of the heavy lifting for the transformations was handled
        # in parallel above.
        col_order = []
        drops = []
        for col, vec_trans, classes in transformations:
            X[col] = vec_trans
            col_order.extend(classes)

            # if we want to drop one, just drop the last
            if drop and len(classes) > 1:
                drops.append(classes[-1])

        # now we can get the transformed OHE
        ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
                                              columns=col_order)

        # set the index to be equal to X's for a smooth concat
        ohe_trans.index = X.index

        # if we're dropping one level, do so now
        if drops:
            ohe_trans = ohe_trans.drop(drops, axis=1)

        # drop the original columns from X
        X = X.drop(cols, axis=1)

        # We might have dropped ALL columns from X. And if that's the case, we
        # can just return the encoded columns
        if not X.columns.tolist():
            return dataframe_or_array(ohe_trans, self.as_df)

        # otherwise concat the new columns
        X = pd.concat([X, ohe_trans], axis=1)  # type: pd.DataFrame
        return dataframe_or_array(X, self.as_df)
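A hedged sketch (hypothetical helper) of the parallel per-column encoding step used in transform above: since each column is label-encoded independently, the columns can be dispatched as separate joblib tasks.

import pandas as pd
from joblib import Parallel, delayed
from sklearn.preprocessing import LabelEncoder


def _encode_col(col, vec, le):
    # one independent task per column
    return col, le.transform(vec)


X = pd.DataFrame({'a': ['x', 'y', 'x'], 'b': ['u', 'u', 'v']})
encoders = {c: LabelEncoder().fit(X[c]) for c in X.columns}
out = Parallel(n_jobs=2)(
    delayed(_encode_col)(c, X[c].values, encoders[c]) for c in X.columns)
for col, enc in out:
    X[col] = enc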
Exemplo n.º 47
0
    def _evaluate_individuals(self,
                              individuals,
                              features,
                              target,
                              sample_weight=None,
                              groups=None):
        """Determine the fit of the provided individuals.

        Parameters
        ----------
        individuals: a list of DEAP individual
            One individual is a list of pipeline operators and model parameters that can be
            compiled by DEAP into a callable function
        features: numpy.ndarray {n_samples, n_features}
            A numpy matrix containing the training and testing features for the individual's evaluation
        target: numpy.ndarray {n_samples}
            A numpy matrix containing the training and testing target for the individual's evaluation
        sample_weight: array-like {n_samples}, optional
            List of sample weights to balance (or un-balance) the dataset target as needed
        groups: array-like {n_samples, }, optional
            Group labels for the samples used while splitting the dataset into train/test set

        Returns
        -------
        fitnesses_ordered: float
            Returns a list of tuple value indicating the individual's fitness
            according to its performance on the provided data

        """
        if self.max_time_mins:
            total_mins_elapsed = (datetime.now() -
                                  self._start_datetime).total_seconds() / 60.
            if total_mins_elapsed >= self.max_time_mins:
                raise KeyboardInterrupt(
                    '{} minutes have elapsed. TPOT will close down.'.format(
                        total_mins_elapsed))

        # Check we do not evaluate twice the same individual in one pass.
        _, unique_individual_indices = np.unique(
            [str(ind) for ind in individuals], return_index=True)
        unique_individuals = [
            ind for i, ind in enumerate(individuals)
            if i in unique_individual_indices
        ]

        # return fitness scores
        operator_counts = {}
        # lists of DEAP individual strings and their sklearn pipelines,
        # plus operator counts, for parallel computing
        eval_individuals_str = []
        sklearn_pipeline_list = []

        for individual in unique_individuals:
            # Disallow certain combinations of operators because they will take too long or take up too much RAM
            # This is a fairly hacky way to prevent TPOT from getting stuck on bad pipelines and should be improved in a future release
            individual_str = str(individual)
            sklearn_pipeline_str = generate_pipeline_code(
                expr_to_tree(individual, self._pset), self.operators)
            if sklearn_pipeline_str.count('PolynomialFeatures') > 1:
                if self.verbosity > 2:
                    self._pbar.write(
                        'Invalid pipeline encountered. Skipping its evaluation.'
                    )
                self.evaluated_individuals_[individual_str] = (5000.,
                                                               -float('inf'))
                if not self._pbar.disable:
                    self._pbar.update(1)
            # Check if the individual was evaluated before
            elif individual_str in self.evaluated_individuals_:
                if self.verbosity > 2:
                    self._pbar.write(
                        'Pipeline encountered that has previously been evaluated during the '
                        'optimization process. Using the score from the previous evaluation.'
                    )
                if not self._pbar.disable:
                    self._pbar.update(1)
            else:
                try:
                    # Transform the tree expression into an sklearn pipeline
                    sklearn_pipeline = self._toolbox.compile(expr=individual)

                    # Fix random state when the operator allows
                    self._set_param_recursive(sklearn_pipeline.steps,
                                              'random_state', 42)
                    # Setting the seed is needed for XGBoost support because XGBoost currently stores
                    # both a seed and random_state, and they're not synced correctly.
                    # XGBoost will raise an exception if random_state != seed.
                    if 'XGB' in sklearn_pipeline_str:
                        self._set_param_recursive(sklearn_pipeline.steps,
                                                  'seed', 42)

                    # Count the number of pipeline operators as a measure of pipeline complexity
                    operator_count = self._operator_count(individual)
                    operator_counts[individual_str] = max(1, operator_count)
                except Exception:
                    self.evaluated_individuals_[individual_str] = (
                        5000., -float('inf'))
                    if not self._pbar.disable:
                        self._pbar.update(1)
                    continue
                eval_individuals_str.append(individual_str)
                sklearn_pipeline_list.append(sklearn_pipeline)

        # evaluate pipelines
        resulting_score_list = []
        # dispatch in chunks of n_jobs * 4 jobs so the progress bar can update
        # between batches
        for chunk_idx in range(0, len(sklearn_pipeline_list), self.n_jobs * 4):
            jobs = []
            for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx +
                                                          self.n_jobs * 4]:
                job = delayed(_wrapped_cross_val_score)(
                    sklearn_pipeline=sklearn_pipeline,
                    features=features,
                    target=target,
                    cv=self.cv,
                    scoring_function=self.scoring_function,
                    sample_weight=sample_weight,
                    max_eval_time_mins=self.max_eval_time_mins,
                    groups=groups)
                jobs.append(job)
            parallel = Parallel(n_jobs=self.n_jobs,
                                verbose=0,
                                pre_dispatch='2*n_jobs')
            tmp_result_score = parallel(jobs)

            # update pbar
            for val in tmp_result_score:
                if not self._pbar.disable:
                    self._pbar.update(1)
                if val == 'Timeout':
                    if self.verbosity > 2:
                        self._pbar.write(
                            'Skipped pipeline #{0} due to time out. '
                            'Continuing to the next pipeline.'.format(
                                self._pbar.n))
                    resulting_score_list.append(-float('inf'))
                else:
                    resulting_score_list.append(val)

        for resulting_score, individual_str in zip(resulting_score_list,
                                                   eval_individuals_str):
            if type(resulting_score) in [float, np.float64, np.float32]:
                self.evaluated_individuals_[individual_str] = (
                    operator_counts[individual_str], resulting_score)
            else:
                raise ValueError('Scoring function does not return a float.')

        return [
            self.evaluated_individuals_[str(individual)]
            for individual in individuals
        ]
Exemplo n.º 48
0
    def fit(self, X, Y, constraints=None, warm_start=None, initialize=True):
        """Learn parameters using cutting plane method.

        Parameters
        ----------
        X : iterable
            Training instances. Contains the structured input objects.
            No requirement on the particular form of entries of X is made.

        Y : iterable
            Training labels. Contains the structured labels for inputs in X.
            Needs to have the same length as X.

        constraints : iterable
            Known constraints for warm-starts. List of same length as X.
            Each entry is itself a list of constraints for a given instance x.
            Each constraint is of the form [y_hat, delta_psi, loss], where
            y_hat is a labeling, ``delta_psi = psi(x, y) - psi(x, y_hat)``
            and loss is the loss for predicting y_hat instead of the true label
            y.

        initialize : boolean, default=True
            Whether to initialize the model for the data.
            Leave this set to True unless you know exactly what you are doing.
        """
        print("Training n-slack dual structural SVM")
        cvxopt.solvers.options['show_progress'] = self.verbose > 3
        if initialize:
            self.model.initialize(X, Y)
        self.w = np.zeros(self.model.size_psi)
        n_samples = len(X)
        stopping_criterion = False
        if constraints is None:
            # fresh start
            constraints = [[] for i in xrange(n_samples)]
            self.last_active = [[] for i in xrange(n_samples)]
            self.objective_curve_ = []
            self.primal_objective_curve_ = []
            self.timestamps_ = [time()]
        else:
            # warm start
            objective = self._solve_n_slack_qp(constraints, n_samples)
        try:
            # catch ctrl+c to stop training
            # we have to update at least once after going through the dataset
            for iteration in xrange(self.max_iter):
                # main loop
                self.timestamps_.append(time() - self.timestamps_[0])
                if self.verbose > 0:
                    print("iteration %d" % iteration)
                if self.verbose > 2:
                    print(self)
                new_constraints = 0
                # generate slices through dataset from batch_size
                if self.batch_size < 1 and self.batch_size != -1:
                    raise ValueError("batch_size should be an integer >= 1 or "
                                     "-1, got %s." % str(self.batch_size))
                batch_size = (self.batch_size
                              if self.batch_size != -1 else len(X))
                n_batches = int(np.ceil(float(len(X)) / batch_size))
                slices = gen_even_slices(n_samples, n_batches)
                indices = np.arange(n_samples)
                slack_sum = 0
                for batch in slices:
                    new_constraints_batch = 0
                    verbose = max(0, self.verbose - 3)
                    X_b = X[batch]
                    Y_b = Y[batch]
                    indices_b = indices[batch]
                    candidate_constraints = Parallel(
                        n_jobs=self.n_jobs, verbose=verbose)(
                            delayed(find_constraint)(self.model, x, y, self.w)
                            for x, y in zip(X_b, Y_b))

                    # for each batch, gather new constraints
                    for i, x, y, constraint in zip(indices_b, X_b, Y_b,
                                                   candidate_constraints):
                        # loop over samples in batch
                        y_hat, delta_psi, slack, loss = constraint
                        slack_sum += slack

                        if self.verbose > 3:
                            print("current slack: %f" % slack)

                        if not loss > 0:
                            # y != y_hat can still have loss == 0 in a latent
                            # SVM; skip here, since delta_psi may be nonzero
                            continue

                        if self._check_bad_constraint(y_hat, slack,
                                                      constraints[i]):
                            continue

                        constraints[i].append([y_hat, delta_psi, loss])
                        new_constraints_batch += 1

                    # after processing the slice, solve the qp
                    if new_constraints_batch:
                        objective = self._solve_n_slack_qp(
                            constraints, n_samples)
                        new_constraints += new_constraints_batch

                self.objective_curve_.append(objective)
                self._compute_training_loss(X, Y, iteration)

                primal_objective = (self.C * slack_sum + np.sum(self.w**2) / 2)
                self.primal_objective_curve_.append(primal_objective)

                if self.verbose > 0:
                    print("new constraints: %d, "
                          "cutting plane objective: %f primal objective: %f" %
                          (new_constraints, objective, primal_objective))

                if new_constraints == 0:
                    print("no additional constraints")
                    stopping_criterion = True

                if (iteration > 1 and self.objective_curve_[-1] -
                        self.objective_curve_[-2] < self.tol):
                    print("objective converged.")
                    stopping_criterion = True

                if stopping_criterion:
                    if (self.switch_to is not None
                            and self.model.inference_method != self.switch_to):
                        print("Switching to %s inference" %
                              str(self.switch_to))
                        self.model.inference_method_ = \
                            self.model.inference_method
                        self.model.inference_method = self.switch_to
                        stopping_criterion = False
                        continue
                    else:
                        break

                if self.verbose > 5:
                    print(self.w)

                if self.logger is not None:
                    self.logger(self, iteration)
        except KeyboardInterrupt:
            pass
        if self.logger is not None:
            self.logger(self, 'final')

        self.constraints_ = constraints
        if self.verbose and self.n_jobs == 1:
            print("calls to inference: %d" % self.model.inference_calls)
        return self
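
The inner loop above batches the dataset, runs the separation oracle for one
batch in parallel, and re-solves the QP only when the batch produced new
constraints. A minimal sketch of that batch pattern, with a hypothetical
find_violated() standing in for find_constraint():

import numpy as np
from joblib import Parallel, delayed
from sklearn.utils import gen_even_slices


def find_violated(x, y, w):
    # hypothetical stand-in for find_constraint: hinge slack of a linear model
    slack = max(0.0, 1.0 - y * np.dot(w, x))
    return slack, slack > 0


X = np.random.randn(20, 3)
y = np.sign(np.random.randn(20))
w = np.zeros(3)

for batch in gen_even_slices(len(X), 4):
    out = Parallel(n_jobs=2)(
        delayed(find_violated)(x, t, w) for x, t in zip(X[batch], y[batch]))
    if any(violated for _, violated in out):
        pass  # re-solve the QP over the accumulated working set here
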
Exemplo n.º 49
0
    # featdict['behavior_timestamp_click_month_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.month.nunique()
    # featdict['behavior_timestamp_click_day_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.day.nunique()
    # featdict['behavior_timestamp_click_hour_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.hour.nunique()
    # featdict['behavior_timestamp_click_minute_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.minute.nunique()

    return featdict


train_id = pd.read_csv('./train_id.csv')
test_id = pd.read_csv('./test_id.csv')

train_feat = Parallel(n_jobs=30)(
    delayed(feature_agg)(i, './train/' + id + '.hdf')
    for i, id in enumerate(train_id['user_id'].iloc[:]))
test_feat = Parallel(n_jobs=30)(
    delayed(feature_agg)(i, './test/' + id + '.hdf')
    for i, id in enumerate(test_id['user_id'].iloc[:]))
train_feat = pd.DataFrame(train_feat)
test_feat = pd.DataFrame(test_feat)

train_feat = pd.merge(train_feat, train_id, on='user_id', how='left')
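
The fan-out above runs one feature_agg(i, path) call per user id and collects
the returned dicts into a DataFrame. A minimal sketch of the same shape, with
a hypothetical toy_feature_agg that skips the HDF read:

import pandas as pd
from joblib import Parallel, delayed


def toy_feature_agg(i, user_id):
    # hypothetical: the real function reads './train/{id}.hdf' and aggregates
    return {'user_id': user_id, 'n_events': i + 1}


ids = ['u1', 'u2', 'u3']
feats = Parallel(n_jobs=2)(
    delayed(toy_feature_agg)(i, uid) for i, uid in enumerate(ids))
feat_df = pd.DataFrame(feats)  # one row per user, one column per feature
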

params = {
    'learning_rate': 0.01,
    'min_child_samples': 5,
    'max_depth': -1,
    'lambda_l1': 2,
    'boosting': 'gbdt',
Exemplo n.º 50
0
def _cpu_map(fun, param_grid, n_jobs, verbose):
    return Parallel(
        n_jobs=n_jobs,
        verbose=verbose,
        backend="threading",  # any sklearn backend should work here
    )(delayed(fun)(params) for params in param_grid)
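
A usage sketch for _cpu_map, assuming a grid of dicts as produced by
sklearn.model_selection.ParameterGrid; the threading backend keeps everything
in-process, so the mapped function need not be picklable:

from sklearn.model_selection import ParameterGrid


def score(params):
    return params['a'] * params['b']


grid = list(ParameterGrid({'a': [1, 2], 'b': [10, 20]}))
print(_cpu_map(score, grid, n_jobs=2, verbose=0))
# [10, 20, 20, 40]
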
Exemplo n.º 51
0
    def fit(self, X, y):
        """Build a Bagging ensemble of estimators from the training
           set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).


        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Convert data
        X, y = check_X_y(X, y, ['csr', 'csc'])

        # Remap output
        n_samples, self.n_features_ = X.shape
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        if isinstance(self.max_samples, (numbers.Integral, np.integer)):
            max_samples = self.max_samples
        else:  # float
            max_samples = int(self.max_samples * X.shape[0])

        if not (0 < max_samples <= X.shape[0]):
            raise ValueError("max_samples must be in (0, n_samples]")

        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        if self.warm_start and self.oob_score:
            raise ValueError("Out of bag estimate only available"
                             " if warm_start=False")

        if hasattr(self, "oob_score_") and self.warm_start:
            del self.oob_score_

        if not self.warm_start or len(self.estimators_) == 0:
            # Free allocated memory, if any
            self.estimators_ = []
            self.estimators_samples_ = []
            self.estimators_features_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger than or equal to '
                             'len(estimators_)=%d when warm_start==True' %
                             (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
            return self

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(
            n_more_estimators, self.n_jobs)

        # Advance random state to state after training
        # the first n_estimators
        if self.warm_start and len(self.estimators_) > 0:
            random_state.randint(MAX_INT, size=len(self.estimators_))

        seeds = random_state.randint(MAX_INT, size=n_more_estimators)

        all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            # TEF: changed following call to balanced procedure:
            delayed(_parallel_build_balanced_estimators)(
                n_estimators[i],
                self,
                X,
                y,
                seeds[starts[i]:starts[i + 1]],
                verbose=self.verbose) for i in range(n_jobs))

        # Reduce
        self.estimators_ += list(
            itertools.chain.from_iterable(t[0] for t in all_results))
        self.estimators_samples_ += list(
            itertools.chain.from_iterable(t[1] for t in all_results))
        self.estimators_features_ += list(
            itertools.chain.from_iterable(t[2] for t in all_results))

        if self.oob_score:
            self._set_oob_score(X, y)

        return self
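
The fit above splits n_more_estimators across jobs with _partition_estimators,
hands each job its own slice of seeds, and chains the per-job lists back
together. A minimal sketch of that partition-and-reduce pattern, with a
hypothetical build_group in place of the real builder:

import itertools

import numpy as np
from joblib import Parallel, delayed


def partition(n_estimators, n_jobs):
    # even split, mirroring sklearn's _partition_estimators
    counts = np.full(n_jobs, n_estimators // n_jobs, dtype=int)
    counts[:n_estimators % n_jobs] += 1
    starts = np.concatenate([[0], np.cumsum(counts)])
    return counts, starts


def build_group(count, seeds):
    # hypothetical: would fit `count` estimators; here it just echoes the seeds
    return list(seeds)


n_estimators, n_jobs = 10, 3
counts, starts = partition(n_estimators, n_jobs)
seeds = np.arange(n_estimators)
results = Parallel(n_jobs=n_jobs)(
    delayed(build_group)(counts[i], seeds[starts[i]:starts[i + 1]])
    for i in range(n_jobs))
estimators = list(itertools.chain.from_iterable(results))  # 10 items
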
Exemplo n.º 52
0
File: build.py Project: tgadf/pymva
def grid_search_early_stopping(estimator,
                               param_grid,
                               verbose,
                               scoring,
                               cv,
                               X,
                               y,
                               early_stopping_rounds,
                               eval_set_size,
                               n_jobs=1,
                               iid=True,
                               refit=True,
                               pre_dispatch='2*n_jobs',
                               error_score='raise'):
    ''' Grid search with early stopping; adapted from the scikit-learn package.
    '''

    parameter_iterable = ParameterGrid(param_grid)
    scorer_ = check_scoring(estimator, scoring=scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)' %
                             (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(estimator)

    out = Parallel(
        n_jobs=n_jobs,
        verbose=2 if verbose > 0 else 0,
        pre_dispatch=pre_dispatch)(delayed(_fit_and_score)(
            clone(base_estimator),
            X,
            y,
            scorer_,
            train,
            test,
            2 if verbose > 0 else 0,
            parameters, {
                "early_stopping_rounds": early_stopping_rounds,
                "eval_metric": get_xgboost_eval_metric(scoring),
                "eval_set": [_safe_split(estimator, X, y, test, train)],
                "verbose": True if verbose > 1 else False
            },
            return_parameters=True,
            error_score=error_score) for parameters in parameter_iterable
                                   for train, test in cv)

    # Out is a list of quadruples: (score, n_test_samples, time, parameters)
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores,
                  key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    best_score_ = best.mean_validation_score

    if refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)

        if y is not None:
            best_estimator, _, _ = fit_estimator_early_stopping(
                best_estimator, X, y, scoring, early_stopping_rounds,
                eval_set_size, verbose)
        else:
            raise ValueError('y is required.')

    return best_estimator, best.parameters, grid_scores
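
The aggregation above weights each fold's score by its test-set size when
iid=True, then divides by the total number of test samples; otherwise it takes
a plain mean over folds. A minimal sketch showing where the two differ:

import numpy as np

fold_scores = np.array([0.8, 0.9, 0.7])
fold_sizes = np.array([100, 100, 50])  # uneven folds make the weighting matter

iid_score = np.dot(fold_scores, fold_sizes) / fold_sizes.sum()  # 0.82
plain_mean = fold_scores.mean()                                 # 0.80
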
Exemplo n.º 53
0
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit on the estimator with randomly drawn parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        self._random_state = check_random_state(self.random_state)
        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        R = list(self.cost_parameter_max.values())[0]

        if self.cost_parameter_min is None:
            Rmin = 1
        else:
            Rmin = list(self.cost_parameter_min.values())[0]

        n_candidates = hyperband_num_per_run(self.eta, R, Rmin)
        log.debug(
            "Fitting %d folds for each of %d candidates, totalling "
            "%d fits.", n_splits, n_candidates, n_candidates * n_splits)
        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        cv_iter = list(cv.split(X, y, groups))

        out = []
        smax = int(np.floor(np.log(R / Rmin) / np.log(self.eta)))
        B = (smax + 1.0) * R

        # This code implements hyperband, but with the order of the inner and
        # outer loops swapped to expose more parallelism.
        Ts = []
        ns = []
        rs = []
        for s in range(smax, -1, -1):
            ns.append(int(np.ceil(B / R * np.power(self.eta, s) / (s + 1.0))))
            rs.append(int(R / np.power(self.eta, s)))
            Ts.append(
                list(
                    ParameterSampler(self.param_distributions,
                                     ns[-1],
                                     random_state=self._random_state)))
        nums = copy.copy(ns)
        # these are the offsets to the hyperparameter configurations for
        # each value of s in the loop above
        # they get updated as the loop over the different rounds get run
        # below
        offsets = [0] + list(np.cumsum(
            np.array(nums) * n_splits).astype(int))[:-1]

        # iterate the maximum number of times for each resource budget
        # configuration.
        # If we should skip an iteration, T will be an empty list
        for rnd in range(0, smax + 1):
            # set the costs for this round
            r_rnd = []
            for ind, s in enumerate(range(smax, -1, -1)):
                _r = int(rs[ind] * np.power(self.eta, rnd))
                r_rnd += [_r] * nums[ind]

            # run the jobs
            _jobs = []
            for parameters, _r in zip(itertools.chain.from_iterable(Ts),
                                      r_rnd):
                _parameters = copy.deepcopy(parameters)
                _parameters.update(
                    {list(self.cost_parameter_max.keys())[0]: _r})
                for train, test in cv_iter:
                    _jobs.append(
                        delayed(_fit_and_score)(
                            clone(base_estimator),
                            X,
                            y,
                            self.scorer_,
                            train,
                            test,
                            self.verbose,
                            _parameters,
                            fit_params=fit_params,
                            return_train_score=self.return_train_score,
                            return_n_test_samples=True,
                            return_times=True,
                            return_parameters=True,
                            error_score=self.error_score))
            _out = Parallel(n_jobs=self.n_jobs,
                            verbose=self.verbose,
                            pre_dispatch=pre_dispatch)(_jobs)
            out += _out

            # now post-process
            new_Ts = []
            new_nums = []
            for ind, s in enumerate(range(smax, -1, -1)):
                n_i = int(np.floor(ns[ind] / np.power(self.eta, rnd)))
                num_to_keep = int(np.floor(n_i / self.eta))
                # keep for the next round only if num_to_keep > 0 AND the
                # round after this one will be executed; in other words, the
                # configurations only need to be cut down by eta if they will
                # be tested in the next round
                if num_to_keep > 0 and rnd < s:
                    _out_s = _out[offsets[ind]:(offsets[ind] +
                                                nums[ind] * n_splits)]
                    results, _ = self._process_outputs(_out_s, n_splits)
                    sind = np.argsort(results["rank_test_score"])
                    msk = np.zeros(len(results['rank_test_score']))
                    msk[sind[0:num_to_keep]] = 1
                    msk = msk.astype(bool)
                    new_Ts.append(
                        [p for k, p in enumerate(results['params']) if msk[k]])
                    new_nums.append(num_to_keep)
                else:
                    new_Ts.append([])
                    new_nums.append(0)

            Ts = new_Ts
            nums = new_nums
            offsets = [0] + list(
                np.cumsum(np.array(nums) * n_splits).astype(int))[:-1]

        results, best_index = self._process_outputs(out, n_splits)
        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits
        self.multimetric_ = False
        if not hasattr(self, 'best_score_'):
            self.best_score_ = results['mean_test_score'][best_index]
        if not hasattr(self, 'best_params_'):
            self.best_params_ = results['params'][best_index]

        if self.refit:
            best_estimator = clone(self.estimator).set_params(
                **self.cv_results_['params'][self.best_index_])

            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)

            self.best_estimator_ = best_estimator

        return self
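
The bracket bookkeeping above can be checked in isolation: for each bracket s
from smax down to 0, hyperband starts n = ceil(B/R * eta**s / (s + 1))
configurations at initial resource r = R / eta**s. A minimal sketch with the
textbook setting eta=3, R=81, Rmin=1:

import numpy as np

eta, R, Rmin = 3, 81, 1
smax = int(np.floor(np.log(R / Rmin) / np.log(eta)))  # 4
B = (smax + 1.0) * R

for s in range(smax, -1, -1):
    n = int(np.ceil(B / R * eta**s / (s + 1.0)))
    r = int(R / eta**s)
    print(s, n, r)
# (s, n, r): (4, 81, 1), (3, 34, 3), (2, 15, 9), (1, 8, 27), (0, 5, 81)
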
Exemplo n.º 54
0
    def fit(self, X, y):
        """Fit model according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : classifier
            Returns self.
        """
        rs = self._get_random_state()

        # Create dataset
        ds = get_dataset(X, order="fortran")
        n_samples = ds.get_n_samples()
        n_features = ds.get_n_features()

        if self.penalty != "l1/l2" and self.multiclass:
            raise NotImplementedError(
                "True multiclass options not implemented "
                "for non group-lasso(l1/l2) penalties.")

        # Create label transformers
        #neg_label = 0 if self.penalty == "nn" else -1
        reencode = self.penalty == "l1/l2"
        y, n_classes, n_vectors = self._set_label_transformers(y,
                                                               reencode,
                                                               neg_label=-1)
        Y = np.asfortranarray(self.label_binarizer_.transform(y),
                              dtype=np.float64)

        # Initialize coefficients
        if not self.warm_start or self.coef_ is None:
            self.C_init = self.C
            self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
            self._init_errors(Y)

        self.intercept_ = np.zeros(n_vectors, dtype=np.float64)
        indices = np.arange(n_features, dtype=np.int32)

        max_steps = self._get_max_steps()

        # Learning
        if self.penalty == "l1/l2":
            tol = self.tol
            #n_min = np.min(np.sum(Y == 1, axis=0))
            #tol *= max(n_min, 1) / n_samples

            vinit = self.violation_init_.get(0, 0) * self.C / self.C_init
            model = _primal_cd(self, self.coef_, self.errors_, ds, y, Y,
                               -1, self.multiclass, indices, 12,
                               self._get_loss(), self.selection, self.permute,
                               self.termination, self.C, self.alpha,
                               self.max_iter, max_steps, self.shrinking, vinit,
                               rs, tol, self.callback, self.n_calls,
                               self.verbose)
            viol = model[0]
            if self.warm_start and len(self.violation_init_) == 0:
                self.violation_init_[0] = viol

        elif self.penalty in ("l1", "l2", "nn"):
            penalty = self._get_penalty()

            n_pos = np.zeros(n_vectors)
            vinit = self.C / self.C_init * np.ones_like(n_pos)
            for k in xrange(n_vectors):
                n_pos[k] = np.sum(Y[:, k] == 1)
                vinit[k] *= self.violation_init_.get(k, 0)
            n_neg = n_samples - n_pos
            tol = self.tol * np.maximum(np.minimum(n_pos, n_neg),
                                        1) / n_samples

            jobs = (delayed(_primal_cd)(self, self.coef_, self.errors_, ds, y,
                                        Y, k, False, indices, penalty,
                                        self._get_loss(), self.selection,
                                        self.permute, self.termination, self.C,
                                        self.alpha, self.max_iter, max_steps,
                                        self.shrinking, vinit[k], rs, tol[k],
                                        self.callback, self.n_calls,
                                        self.verbose)
                    for k in xrange(n_vectors))
            model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
            viol, coefs, errors = zip(*model)
            self.coef_ = np.asarray(coefs)
            self.errors_ = np.asarray(errors)

            for k in range(n_vectors):
                if self.warm_start and k not in self.violation_init_:
                    self.violation_init_[k] = viol[k]

        if self.debiasing:
            nz = self.coef_ != 0

            if not self.warm_debiasing:
                self.coef_ = np.zeros((n_vectors, n_features),
                                      dtype=np.float64)
                self._init_errors(Y)

            indices = np.arange(n_features, dtype=np.int32)
            jobs = (delayed(_primal_cd)(self, self.coef_, self.errors_, ds, y,
                                        Y, k, False, indices[nz[k]], 2,
                                        self._get_loss(), "cyclic",
                                        self.permute, "violation_sum", self.Cd,
                                        1.0, self.max_iter, max_steps, False,
                                        0, rs, self.tol, self.callback,
                                        self.n_calls, self.verbose)
                    for k in xrange(n_vectors))
            model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
            viol, coefs, errors = zip(*model)
            self.coef_ = np.asarray(coefs)
            self.errors_ = np.asarray(errors)

        return self
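
The per-label dispatch above runs one coordinate-descent solve per binary
subproblem and regroups the results with zip(*model). A minimal sketch of
that one-job-per-output pattern, with a hypothetical solve_one:

import numpy as np
from joblib import Parallel, delayed


def solve_one(k, Y):
    # hypothetical per-column solver: returns (violation, coef, errors)
    return 0.0, np.zeros(3), Y[:, k]


Y = np.eye(4)  # 4 samples, 4 binary columns
model = Parallel(n_jobs=2)(delayed(solve_one)(k, Y) for k in range(Y.shape[1]))
viol, coefs, errors = zip(*model)
coefs = np.asarray(coefs)    # shape (n_vectors, n_features)
errors = np.asarray(errors)  # shape (n_vectors, n_samples)
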
Exemplo n.º 55
0
def plot_partial_dependence(gbrt,
                            X,
                            features,
                            feature_names=None,
                            label=None,
                            n_cols=3,
                            grid_resolution=100,
                            percentiles=(0.05, 0.95),
                            n_jobs=1,
                            verbose=0,
                            ax=None,
                            line_kw=None,
                            contour_kw=None,
                            **fig_kw):
    """Partial dependence plots for ``features``.
    The ``len(features)`` plots are arranged in a grid with ``n_cols``
    columns. Two-way partial dependence plots are plotted as contour
    plots.
    Read more in the :ref:`User Guide <partial_dependence>`.
    Parameters
    ----------
    gbrt : BaseGradientBoosting
        A fitted gradient boosting model.
    X : array-like, shape=(n_samples, n_features)
        The data on which ``gbrt`` was trained.
    features : seq of tuples or ints
        If seq[i] is an int or a tuple with one int value, a one-way
        PDP is created; if seq[i] is a tuple of two ints, a two-way
        PDP is created.
    feature_names : seq of str
        Name of each feature; feature_names[i] holds
        the name of the feature with index i.
    label : object
        The class label for which the PDPs should be computed.
        Only if gbrt is a multi-class model. Must be in ``gbrt.classes_``.
    n_cols : int
        The number of columns in the grid plot (default: 3).
    percentiles : (low, high), default=(0.05, 0.95)
        The lower and upper percentile used to create the extreme values
        for the PDP axes.
    grid_resolution : int, default=100
        The number of equally spaced points on the axes.
    n_jobs : int
        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
        Defaults to 1.
    verbose : int
        Verbose output during PD computations. Defaults to 0.
    ax : Matplotlib axis object, default None
        An axis object onto which the plots will be drawn.
    line_kw : dict
        Dict with keywords passed to the ``pylab.plot`` call.
        For one-way partial dependence plots.
    contour_kw : dict
        Dict with keywords passed to the ``pylab.plot`` call.
        For two-way partial dependence plots.
    fig_kw : dict
        Dict with keywords passed to the figure() call.
        Note that all keywords not recognized above will be automatically
        included here.
    Returns
    -------
    fig : figure
        The Matplotlib Figure object.
    axs : seq of Axis objects
        A seq of Axis objects, one for each subplot.
    Examples
    --------
    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.ensemble import GradientBoostingRegressor
    >>> X, y = make_friedman1()
    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
    ...
    """
    import matplotlib.pyplot as plt
    from matplotlib import transforms
    from matplotlib.ticker import MaxNLocator
    from matplotlib.ticker import ScalarFormatter

    # if not isinstance(gbrt, BaseGradientBoosting):
    #     raise ValueError('gbrt has to be an instance of BaseGradientBoosting')
    if gbrt.estimators_.shape[0] == 0:
        raise ValueError('Call %s.fit before partial_dependence' %
                         gbrt.__class__.__name__)

    # set label_idx for multi-class GBRT
    if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2:
        if label is None:
            raise ValueError('label is not given for multi-class PDP')
        label_idx = np.searchsorted(gbrt.classes_, label)
        if gbrt.classes_[label_idx] != label:
            raise ValueError('label %s not in ``gbrt.classes_``' % str(label))
    else:
        # regression and binary classification
        label_idx = 0

    X = check_array(X, dtype=DTYPE, order='C')
    if gbrt.n_features != X.shape[1]:
        raise ValueError('X.shape[1] does not match gbrt.n_features')

    if line_kw is None:
        line_kw = {'color': 'green'}
    if contour_kw is None:
        contour_kw = {}

    # convert feature_names to list
    if feature_names is None:
        # if not feature_names use fx indices as name
        feature_names = [str(i) for i in range(gbrt.n_features)]
    elif isinstance(feature_names, np.ndarray):
        feature_names = feature_names.tolist()

    def convert_feature(fx):
        if isinstance(fx, six.string_types):
            try:
                fx = feature_names.index(fx)
            except ValueError:
                raise ValueError('Feature %s not in feature_names' % fx)
        return fx

    # convert features into a seq of int tuples
    tmp_features = []
    for fxs in features:
        if isinstance(fxs, (numbers.Integral, ) + six.string_types):
            fxs = (fxs, )
        try:
            fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32)
        except TypeError:
            raise ValueError('features must be either int, str, or tuple '
                             'of int/str')
        if not (1 <= np.size(fxs) <= 2):
            raise ValueError('target features must be either one or two')

        tmp_features.append(fxs)

    features = tmp_features

    names = []
    try:
        for fxs in features:
            l = []
            # explicit loop so "i" is bound for exception below
            for i in fxs:
                l.append(feature_names[i])
            names.append(l)
    except IndexError:
        raise ValueError('features[i] must be in [0, n_features) '
                         'but was %d' % i)

    # compute PD functions
    pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)(delayed(
        partial_dependence
    )(gbrt, fxs, X=X, grid_resolution=grid_resolution, percentiles=percentiles)
                                                         for fxs in features)

    # get global min and max values of PD grouped by plot type
    pdp_lim = {}
    for pdp, axes in pd_result:
        min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max()
        n_fx = len(axes)
        old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd))
        min_pd = min(min_pd, old_min_pd)
        max_pd = max(max_pd, old_max_pd)
        pdp_lim[n_fx] = (min_pd, max_pd)

    # create contour levels for two-way plots
    if 2 in pdp_lim:
        Z_level = np.linspace(*pdp_lim[2], num=8)

    if ax is None:
        fig = plt.figure(**fig_kw)
    else:
        fig = ax.get_figure()
        fig.clear()

    n_cols = min(n_cols, len(features))
    n_rows = int(np.ceil(len(features) / float(n_cols)))
    axs = []
    for i, fx, name, (pdp, axes) in zip(count(), features, names, pd_result):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)

        if len(axes) == 1:
            ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw)
        else:
            # make contour plot
            assert len(axes) == 2
            XX, YY = np.meshgrid(axes[0], axes[1])
            Z = pdp[label_idx].reshape(list(map(np.size, axes))).T
            CS = ax.contour(XX,
                            YY,
                            Z,
                            levels=Z_level,
                            linewidths=0.5,
                            colors='k')
            ax.contourf(XX,
                        YY,
                        Z,
                        levels=Z_level,
                        vmax=Z_level[-1],
                        vmin=Z_level[0],
                        alpha=0.75,
                        **contour_kw)
            ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)

        # plot data deciles + axes labels
        deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1))
        trans = transforms.blended_transform_factory(ax.transData,
                                                     ax.transAxes)
        ylim = ax.get_ylim()
        ax.vlines(deciles, [0], 0.05, transform=trans, color='k')
        ax.set_xlabel(name[0])
        ax.set_ylim(ylim)

        # prevent x-axis ticks from overlapping
        ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower'))
        tick_formatter = ScalarFormatter()
        tick_formatter.set_powerlimits((-3, 4))
        ax.xaxis.set_major_formatter(tick_formatter)

        if len(axes) > 1:
            # two-way PDP - y-axis deciles + labels
            deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1))
            trans = transforms.blended_transform_factory(
                ax.transAxes, ax.transData)
            xlim = ax.get_xlim()
            ax.hlines(deciles, [0], 0.05, transform=trans, color='k')
            ax.set_ylabel(name[1])
            # hline erases xlim
            ax.set_xlim(xlim)
        else:
            ax.set_ylabel('Partial dependence')

        if len(axes) == 1:
            ax.set_ylim(pdp_lim[1])
        axs.append(ax)

    fig.subplots_adjust(bottom=0.15,
                        top=0.7,
                        left=0.1,
                        right=0.95,
                        wspace=0.4,
                        hspace=0.3)
    return fig, axs
Exemplo n.º 56
0
    def fit(self, X, y):
        """Fit model according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_targets]
            Target values.

        Returns
        -------
        self : regressor
            Returns self.
        """
        rs = self._get_random_state()

        # Create dataset
        ds = get_dataset(X, order="fortran")
        n_features = ds.get_n_features()

        self.outputs_2d_ = len(y.shape) == 2
        if self.outputs_2d_:
            Y = y
        else:
            Y = y.reshape(-1, 1)
        Y = np.asfortranarray(Y, dtype=np.float64)
        y = np.empty(0, dtype=np.int32)
        n_vectors = Y.shape[1]

        # Initialize coefficients
        if not self.warm_start or self.coef_ is None:
            self.C_init = self.C
            self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
            self._init_errors(Y)

        self.intercept_ = np.zeros(n_vectors, dtype=np.float64)
        indices = np.arange(n_features, dtype=np.int32)

        if self.penalty == "l1/l2":
            vinit = self.violation_init_.get(0, 0) * self.C / self.C_init
            model = _primal_cd(self, self.coef_, self.errors_,
                               ds, y, Y, -1, False, indices, 12,
                               self._get_loss(), self.selection, self.permute,
                               self.termination, self.C, self.alpha,
                               self.max_iter, self.max_steps, self.shrinking,
                               vinit, rs, self.tol, self.callback,
                               self.n_calls, self.verbose)
            viol = model[0]
            if self.warm_start and len(self.violation_init_) == 0:
                self.violation_init_[0] = viol
        else:
            penalty = self._get_penalty()
            vinit = np.asarray(
                [self.violation_init_.get(k, 0)
                 for k in xrange(n_vectors)]) * self.C / self.C_init

            jobs = (delayed(_primal_cd)(self, self.coef_, self.errors_, ds, y,
                                        Y, k, False, indices, penalty,
                                        self._get_loss(), self.selection,
                                        self.permute, self.termination, self.C,
                                        self.alpha, self.max_iter,
                                        self.max_steps, self.shrinking,
                                        vinit[k], rs, self.tol, self.callback,
                                        self.n_calls, self.verbose)
                    for k in xrange(n_vectors))

            model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
            viol, coefs, errors = zip(*model)
            self.coef_ = np.asarray(coefs)
            self.errors_ = np.asarray(errors)

            for k in xrange(n_vectors):
                if self.warm_start and k not in self.violation_init_:
                    self.violation_init_[k] = viol[k]

        return self
Exemplo n.º 57
0
    def transform_imgs(self, imgs_list, confounds=None, copy=True, n_jobs=1):
        """Prepare multi subject data in parallel

        Parameters
        ----------

        imgs_list: list of Niimg-like objects
            See http://nilearn.github.io/manipulating_visualizing/manipulating_images.html#niimg.
            List of imgs file to prepare. One item per subject.

        confounds: list of confounds, optional
            List of confounds (2D arrays or filenames pointing to CSV
            files). Must be of the same length as imgs_list.

        copy: boolean, optional
            If True, guarantees that output array has no memory in common with
            input array.

        n_jobs: integer, optional
            The number of cpus to use to do the computation. -1 means
            'all cpus'.
        
        Returns
        -------
        region_signals: list of 2D numpy.ndarray
            List of signal for each element per subject.
            shape: list of (number of scans, number of elements)
        """

        if not hasattr(self, 'mask_img_'):
            raise ValueError(
                'It seems that %s has not been fitted. '
                'You must call fit() before calling transform().' %
                self.__class__.__name__)
        target_fov = None
        if self.target_affine is None:
            # Force resampling on first image
            target_fov = 'first'

        niimg_iter = _iter_check_niimg(imgs_list,
                                       ensure_ndim=None,
                                       atleast_4d=False,
                                       target_fov=target_fov,
                                       memory=self.memory,
                                       memory_level=self.memory_level,
                                       verbose=self.verbose)

        if confounds is None:
            confounds = itertools.repeat(None, len(imgs_list))

        # Ignore the mask-computing params: they are not useful and will
        # just invalidate the cache for no good reason
        # target_shape and target_affine are conveyed implicitly in mask_img
        params = get_params(
            self.__class__,
            self,
            ignore=['mask_img', 'mask_args', 'mask_strategy', 'copy'])

        func = self._cache(
            filter_and_mask,
            ignore=['verbose', 'memory', 'memory_level', 'copy'])
        data = Parallel(n_jobs=n_jobs)(
            delayed(func)(imgs,
                          self.mask_img_,
                          params,
                          memory_level=self.memory_level,
                          memory=self.memory,
                          verbose=self.verbose,
                          confounds=cfs,
                          copy=copy)
            for imgs, cfs in izip(niimg_iter, confounds))
        return [d[0] for d in data]
Exemplo n.º 58
0
    def applyAugmentation(self):
        self.readImagesAndAnnotations()
        Parallel(n_jobs=-1)(delayed(readAndGenerateImageSegmentation)(
            self.outputPath, self.generators, self.labelsExtension, x)
                            for x in enumerate(self.imagePaths))
Exemplo n.º 59
0
    def fit(self, X, y):
        """Fit estimators from the training set (X, y).

        Returns
        -------
        self : object
            Returns self.
        """

        if not isinstance(X, dict):
            raise ValueError("X has to be a dict")

        if self.base_estimator._estimator_type == 'classifier':
            self.classes_ = np.unique(y)

        self.set_random_state()

        estimators = dict()
        for roi_id, x in X.items():
            estimator = clone(self.base_estimator)
            estimator.roi_id = roi_id
            if self.base_estimator._estimator_type == 'searchlight_ensemble':
                estimator.set_params(process_mask_img=x[1])
            estimators[roi_id] = estimator

        if self.vote_graded:
            y_pred = {k: np.full(len(y), np.nan) for k in X.keys()}
            for f, (train_index, test_index) in enumerate(LeaveOneOut()):
                y_train = [y[i] for i in train_index]

                if self.base_estimator._estimator_type == 'searchlight_ensemble':
                    estimators_fit = Parallel(
                        n_jobs=self.n_jobs,
                        verbose=self.verbose,
                        backend="threading")(
                            delayed(_parallel_build_estimator)(
                                e, [X[roi_id][0][i]
                                    for i in train_index], y_train)
                            for roi_id, e in estimators.items())
                    estimators_fit = {e.roi_id: e for e in estimators_fit}
                    y_pred_ = Parallel(
                        n_jobs=self.n_jobs,
                        verbose=self.verbose,
                        backend="threading")(
                            delayed(_vote)(
                                e, [X[roi_id][0][i]
                                    for i in test_index], False)
                            for roi_id, e in estimators_fit.items())
                else:
                    estimators_fit = Parallel(
                        n_jobs=self.n_jobs,
                        verbose=self.verbose,
                        backend="threading")(
                            delayed(_parallel_build_estimator)(
                                e, [X[roi_id][i]
                                    for i in train_index], y_train)
                            for roi_id, e in estimators.items())
                    estimators_fit = {e.roi_id: e for e in estimators_fit}
                    y_pred_ = Parallel(
                        n_jobs=self.n_jobs,
                        verbose=self.verbose,
                        backend="threading")(
                            delayed(_vote)(
                                e, [X[roi_id][i] for i in test_index], False)
                            for roi_id, e in estimators_fit.items())
                for i, roi_id in enumerate(X.keys()):
                    y_pred[roi_id][test_index] = y_pred_[i]

            self.vote_weighting = [
                np.mean(v == np.array(y)) for v in y_pred.values()
            ]
            if not np.any(self.vote_weighting):
                self.vote_weighting = 1e-10 * np.ones(len(self.vote_weighting))
        else:
            self.vote_weighting = np.ones(len(X.keys())) / len(X.keys())

        if self.base_estimator._estimator_type == 'searchlight_ensemble':
            estimators = Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")(
                    delayed(_parallel_build_estimator)(e, X[roi_id][0], y)
                    for roi_id, e in estimators.items())
        else:
            estimators = Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")(
                    delayed(_parallel_build_estimator)(e, X[roi_id], y)
                    for roi_id, e in estimators.items())

        self.estimators_ = {e.roi_id: e for e in estimators}

        return self
Exemplo n.º 60
0
    exp_prefix = experiment_name + '/lambda=' + str(
        rho) + '__' + 'beta=' + str(beta) + '__' + 'metric=' + str(metric)
    print("-" * 30, exp_prefix)
    os.makedirs('./%s/%s/%s' % (res_folder, data_prefix, exp_prefix),
                exist_ok=True)
    lml = LaundryML(data_prefix, test_prefix, exp_prefix, res_folder, k, opt,
                    rho, beta, metric, maj_pos, min_pos, sensitve_attr,
                    non_sensitve_attr, decision_attr)
    lml.run()


#create_rules(data_prefix=params['data_prefix'], original_dataset_path=params['original_dataset_path'])
#time.sleep(35)

# searching for rationalization models
for _metric in params['metrics']:
    for _lambdak in params['lambdas']:
        Parallel(n_jobs=-1)(
            delayed(bench)(beta=_beta, rho=_lambdak, metric=_metric)
            for _beta in params['betas'])

# plotting results
for _metric in params['metrics']:
    for _lambdak in params['lambdas']:
        for _beta in params['betas']:
            exp_prefix = params['experiment_name'] + '/lambda=' + str(
                _lambdak) + '__' + 'beta=' + str(
                    _beta) + '__' + 'metric=' + str(_metric)
            print("=" * 30, exp_prefix)
            enumplot(params['res_folder'], params['data_prefix'], exp_prefix)
            #get_audit(data_prefix, exp_prefix)