def cross_val_predict(estimator, X, y, cv=5, n_jobs=1, refit=False,
                      predict_fun="predict"):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    pred = Parallel(n_jobs=n_jobs)(
        delayed(_cross_val_predict)(
            clone(estimator), X, y, train, test, predict_fun)
        for train, test in cv)
    pred = np.concatenate(pred)
    if cv.indices:
        index = np.concatenate([test for _, test in cv])
    else:
        index = np.concatenate([np.where(test)[0] for _, test in cv])
    # pred[index] = pred doesn't work as expected: the right-hand side would
    # be read while it is being overwritten, so assign from a copy instead.
    pred[index] = pred.copy()
    if refit:
        return pred, clone(estimator).fit(X, y)
    else:
        return pred
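# Minimal usage sketch for the helper above, assuming the legacy
# scikit-learn utilities it relies on (check_arrays, check_cv, clone and a
# _cross_val_predict worker) are importable in this module.
# make_classification and LogisticRegression are standard scikit-learn APIs;
# the variable names are illustrative.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=100, random_state=0)
# out-of-fold predictions, one per sample, in the original sample order
y_hat = cross_val_predict(LogisticRegression(), X_demo, y_demo, cv=5)
# with refit=True, also get a final estimator fitted on all the data
y_hat, final_model = cross_val_predict(LogisticRegression(), X_demo, y_demo,
                                       cv=5, refit=True)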
def _cpu_map(fun, param_grid, n_jobs, verbose=True):
    return Parallel(
        n_jobs=n_jobs,
        verbose=verbose,
        backend="threading",  # any joblib backend should work here
    )(delayed(fun)(params) for params in param_grid)
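# Hypothetical usage sketch for _cpu_map: evaluate a cheap scoring function
# over a small parameter grid on two threads. `score_params` and `grid` are
# illustrative stand-ins, not part of the original code; only joblib's
# Parallel/delayed (already imported by this module) are assumed.
def score_params(params):
    # toy "score": just sum the parameter values
    return sum(params.values())

grid = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
results = _cpu_map(score_params, grid, n_jobs=2)  # -> [3, 7]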
def mean_img(imgs, target_affine=None, target_shape=None,
             verbose=0, n_jobs=1):
    """Compute the mean of the images (in the time dimension or 4th dimension)

    Note that if a list of 4D images is given, the mean of each 4D image is
    computed separately, and the resulting means are then averaged together.

    Parameters
    ==========
    imgs: Niimg-like object or iterable of Niimg-like objects
        See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
        Images to mean.

    target_affine: numpy.ndarray, optional
        If specified, the image is resampled corresponding to this new affine.
        target_affine can be a 3x3 or a 4x4 matrix.

    target_shape: tuple or list, optional
        If specified, the image will be resized to match this new shape.
        len(target_shape) must be equal to 3. A target_affine has to be
        specified jointly with target_shape.

    verbose: int, optional
        Controls the amount of verbosity: higher numbers give more messages
        (0 means no messages).

    n_jobs: integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    Returns
    =======
    mean: nibabel.Nifti1Image
        Mean image.
    """
    if (isinstance(imgs, _basestring) or
            not isinstance(imgs, collections.Iterable)):
        imgs = [imgs, ]

    imgs_iter = iter(imgs)
    first_img = check_niimg(next(imgs_iter))

    # Compute the first mean to retrieve the reference
    # target_affine and target_shape if needed
    n_imgs = 1
    running_mean, first_affine = _compute_mean(
        first_img, target_affine=target_affine, target_shape=target_shape)

    if target_affine is None or target_shape is None:
        target_affine = first_affine
        target_shape = running_mean.shape[:3]

    for this_mean in Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_compute_mean)(n, target_affine=target_affine,
                                   target_shape=target_shape)
            for n in imgs_iter):
        n_imgs += 1
        # _compute_mean returns (mean_img, affine)
        this_mean = this_mean[0]
        running_mean += this_mean

    running_mean = running_mean / float(n_imgs)
    return new_img_like(first_img, running_mean, target_affine)
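# Usage sketch for mean_img: build two small 4D images in memory with
# nibabel (a real dependency of this code) and average them. Shapes and
# values are arbitrary demo data, assuming the surrounding nilearn helpers
# (check_niimg, _compute_mean, new_img_like) are in scope.
import nibabel
import numpy as np

rng = np.random.RandomState(0)
affine = np.eye(4)
img_a = nibabel.Nifti1Image(rng.rand(4, 4, 4, 10), affine)
img_b = nibabel.Nifti1Image(rng.rand(4, 4, 4, 10), affine)
mean = mean_img([img_a, img_b], n_jobs=1)  # 3D nibabel image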
def data_summary(table_schema, table, fname, sample_size=1.0, sample_rows=100,
                 output_root='', keep_images=False, n_jobs=1):
    """Summarize basic information of all columns in a data table
    based on the provided data schema.

    Parameters
    ----------
    table_schema: pandas DataFrame
        schema of the table, should contain data types of each column
    table: pandas DataFrame
        the data table
    fname: string
        the output file name
    sample_size: integer or float(<=1.0), default=1.0
        int: number of sample rows to do the summary (useful for large tables)
        float: sample size in percentage
    sample_rows: integer
        number of rows to get data samples
    output_root: string
        the root directory for the output file
    keep_images: boolean
        whether to keep all generated images
    n_jobs: int
        the number of jobs to run in parallel
    """

    # check sample_size
    if sample_size > 1:
        if int(sample_size) != sample_size:
            raise ValueError('sample_size: only accept integer when it is > 1.0')
        if sample_size > table.shape[0]:
            print("sample_size: %d is larger than the data size: %d"
                  % (sample_size, table.shape[0]))

    # check output_root
    if output_root != '':
        if not os.path.isdir(output_root):
            raise ValueError('output_root: directory does not exist')

    # get data samples before sampling down to sample_size
    data_sample = table.sample(sample_rows).reset_index(drop=True)

    # calculate the sample size
    if sample_size <= 1.0:
        sample_size = int(table.shape[0] * sample_size)
    if sample_size < table.shape[0]:
        table = table.sample(sample_size).reset_index(drop=True)

    exclude_features, check_features = _check_features(table_schema)

    # temp dir to store all the images generated
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # list of results
    all_results = []

    # key features
    key_features = check_features['key']
    if len(key_features) > 0:
        # get the check result
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_string)(col, table[[col]]) for col in key_features)
        all_results += key_results
        ws = wb.create_sheet(title=u'key')
        # write the final result to work sheet
        _insert_string_results(key_results, ws, 25)

    # numeric features
    numeric_features = check_features['numeric']
    if len(numeric_features) > 0:
        # get the check result
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_numeric)(col, table[[col]], img_dir)
            for col in numeric_features)
        all_results += numeric_results
        ws = wb.create_sheet(title=u'numeric')
        # write the final result to work sheet
        _insert_numeric_results(numeric_results, ws, 35, img_dir)

    # string features
    string_features = check_features['str']
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_string)(col, table[[col]])
            for col in string_features)
        all_results += string_results
        ws = wb.create_sheet(title=u'string')
        # write the final result to work sheet
        _insert_string_results(string_results, ws, 25)

    # date features
    date_features = check_features['date']
    if len(date_features) > 0:
        # get the current time
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            table['%s_numeric' % (col)] = (
                pd.to_datetime(snapshot_date_now) -
                pd.to_datetime(table[col], errors='coerce')
            ).astype('timedelta64[M]', errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_date)('%s_numeric' % col,
                                 table[['%s_numeric' % col, col]], img_dir)
            for col in date_features)
        all_results += date_results
        ws = wb.create_sheet(title=u'date')
        # write the final result to work sheet
        _insert_numeric_results(date_results, ws, 35, img_dir, date_flag=True)

    # write schema
    ws = wb['Sheet']
    ws.title = 'summary'
    out_schema = table_schema[['column', 'type']]
    out_schema['check'] = 'Ok'

    # output error features
    error_indices = []
    if len(exclude_features) > 0:
        out_schema['check'] = out_schema.apply(
            lambda x: 'exclude' if x['column'] in exclude_features
            else x['check'], axis=1)
        error_indices += list(
            out_schema[out_schema['column'].isin(exclude_features)].index.values)

    # tidy up the output
    error_msg_dict = {}
    correct_info = []
    for result in all_results:
        if 'error_msg' in result.keys():
            error_msg_dict[result['column']] = result['error_msg']
        else:
            if type(result['result_df']) == list:
                result_df = result['result_df'][0]
            else:
                result_df = result['result_df']
            info = pd.Series(result_df['value'].values,
                             index=result_df['feature']).to_dict()
            correct_info.append(info)
    correct_info_df = pd.DataFrame(correct_info)
    out_schema = out_schema.merge(correct_info_df, on='column', how='left')

    if len(error_msg_dict) > 0:
        out_schema['check'] = out_schema.apply(
            lambda x: error_msg_dict[x['column']]
            if x['column'] in error_msg_dict.keys() else x['check'], axis=1)
        error_indices += list(
            out_schema[out_schema['column'].isin(error_msg_dict.keys())].index.values)

    # check for columns that are not present
    for c in ['value_min', 'value_mean', 'value_median', 'value_max']:
        if c not in out_schema.columns:
            out_schema[c] = np.nan

    if 'date_min' in out_schema.columns.values:
        order_columns = ['column', 'type', 'check', 'sample_value', 'nan_rate',
                         'num_uni', 'value_min', 'value_mean', 'value_median',
                         'value_max', 'date_min', 'date_max']
    else:
        order_columns = ['column', 'type', 'check', 'sample_value', 'nan_rate',
                         'num_uni', 'value_min', 'value_mean', 'value_median',
                         'value_max']
    _ = _insert_df(out_schema[order_columns], ws, header=True)
    if len(error_indices) > 0:
        for idx in error_indices:
            ws['C%d' % (idx + 2)].style = 'Bad'
    _adjust_ws(ws=ws, row_height=25)

    # write data samples
    ws = wb.create_sheet(title=u'sample')
    _ = _insert_df(data_sample, ws, header=True, head_color=True,
                   bold_first_column=False)
    _adjust_ws(ws=ws, row_height=20)

    wb.save(filename=os.path.join(output_root, 'data_summary_%s.xlsx' % (fname)))

    # remove all temp images
    if not keep_images:
        shutil.rmtree(img_dir)
def fit(self, X=None, y=None):
    n_alpha_grid_points = 4
    self.error_fro_ = np.zeros((n_alpha_grid_points, self.n_grid_points))
    self.error_supp_ = np.zeros((n_alpha_grid_points, self.n_grid_points))
    self.error_fp_ = np.zeros((n_alpha_grid_points, self.n_grid_points))
    self.error_fn_ = np.zeros((n_alpha_grid_points, self.n_grid_points))
    self.grid_ = np.linspace(5, 200, self.n_grid_points)
    # self.grid_ = np.logspace(np.log10(2), np.log10(200), self.n_grid_points)
    if self.adj_type == 'erdos-renyi':
        self.alphas_ = np.logspace(-2.3, np.log10(.025),
                                   n_alpha_grid_points)[::1]
        # self.alphas_ = np.linspace(0.95, 0.99, n_alpha_grid_points)[::-1]
    else:
        self.alphas_ = np.logspace(np.log(.15), np.log10(.4),
                                   n_alpha_grid_points)[::1]

    self.ks_ = []

    for aidx, alpha in enumerate(self.alphas_):
        if self.verbose:
            print('at alpha {} ({}/{})'.format(
                alpha, aidx, n_alpha_grid_points,
            ))

        # draw a new fixed graph for alpha
        cov, prec, adj = new_graph(self.n_features, alpha,
                                   adj_type=self.adj_type,
                                   random_sign=False, seed=1)
        n_nonzero_prec = np.count_nonzero(np.triu(adj, 1).flat)
        self.ks_.append(n_nonzero_prec)
        mcmc_prng = np.random.RandomState(2)
        # cov, prec = _new_graph(self.n_features, alpha)
        # n_nonzero_prec = np.count_nonzero(prec.flat)
        # self.ks_.append(n_nonzero_prec)

        if self.verbose:
            print('   Graph has {} nonzero entries'.format(n_nonzero_prec))

        for sidx, sample_grid in enumerate(self.grid_):
            n_samples = int(sample_grid * self.n_features)

            # Debugging
            # print alpha, n_samples

            # model selection (once)
            X = mvn(n_samples, self.n_features, cov, random_state=mcmc_prng)
            ms_estimator = clone(self.model_selection_estimator)
            ms_estimator.fit(X)
            lam = getattr(ms_estimator, self.penalty_)

            if self.verbose:
                display_lam = lam
                if isinstance(lam, np.ndarray):
                    display_lam = np.linalg.norm(lam)
                print('   ({}/{}), n_samples = {}, selected lambda = {}'.format(
                    sidx, self.n_grid_points, n_samples, display_lam))

            # set up default trial estimator
            trial_estimator = QuicGraphLasso(lam=lam, mode='default',
                                             init_method='corrcoef')

            # estimate statistical power
            errors = Parallel(
                n_jobs=self.n_jobs,
                verbose=False,
                backend='threading',
                # max_nbytes=None,
                # batch_size=1,
            )(
                delayed(ae_trial)(
                    trial_estimator, n_samples, self.n_features, cov, adj,
                    random_state=mcmc_prng
                )
                for nn in range(self.n_trials))

            error_fro, error_supp, error_fp, error_fn, _ = zip(*errors)
            self.error_fro_[aidx, sidx] = np.mean(error_fro)
            self.error_supp_[aidx, sidx] = np.mean(error_supp)
            self.error_fp_[aidx, sidx] = np.mean(error_fp)
            self.error_fn_[aidx, sidx] = np.mean(error_fn)

        if self.verbose:
            print('Results at this row:')
            print('   fro = {}'.format(self.error_fro_[aidx, :]))
            print('   supp = {}'.format(self.error_supp_[aidx, :]))
            print('   fp = {}'.format(self.error_fp_[aidx, :]))
            print('   fn = {}'.format(self.error_fn_[aidx, :]))

    self.is_fitted = True
    return self
def run_glm(Y, X, noise_model='ar1', bins=100, n_jobs=1, verbose=0):
    """GLM fit for an fMRI data matrix

    Parameters
    ----------
    Y : array of shape (n_time_points, n_voxels)
        The fMRI data.

    X : array of shape (n_time_points, n_regressors)
        The design matrix.

    noise_model : {'ar1', 'ols'}, optional
        The temporal variance model. Defaults to 'ar1'.

    bins : int, optional
        Maximum number of discrete bins for the AR(1) coef histogram.

    n_jobs : int, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : int, optional
        The verbosity level. Default is 0.

    Returns
    -------
    labels : array of shape (n_voxels,),
        A map of values on voxels used to identify the corresponding model.

    results : dict,
        Keys correspond to the different labels values, values are
        RegressionResults instances corresponding to the voxels.
    """
    acceptable_noise_models = ['ar1', 'ols']
    if noise_model not in acceptable_noise_models:
        raise ValueError(
            "Acceptable noise models are {0}. You provided "
            "'noise_model={1}'".format(acceptable_noise_models, noise_model))

    if Y.shape[0] != X.shape[0]:
        raise ValueError(
            'The number of rows of Y should match the number of rows of X.'
            ' You provided X with shape {0} and Y with shape {1}'.format(
                X.shape, Y.shape))

    # Create the model
    ols_result = OLSModel(X).fit(Y)

    if noise_model == 'ar1':
        # compute and discretize the AR1 coefs
        ar1 = ((ols_result.resid[1:] * ols_result.resid[:-1]).sum(axis=0) /
               (ols_result.resid ** 2).sum(axis=0))
        del ols_result
        ar1 = (ar1 * bins).astype(np.int) * 1. / bins
        # Fit the AR model according to current AR(1) estimates
        results = {}
        labels = ar1
        # Parallelize by creating a job per ARModel
        vals = np.unique(ar1)
        ar_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_ar_model_fit)(X, val, Y[:, labels == val])
            for val in vals)
        for val, result in zip(vals, ar_result):
            results[val] = result
        del vals
        del ar_result
    else:
        labels = np.zeros(Y.shape[1])
        results = {0.0: ols_result}

    return labels, results
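# Usage sketch for run_glm with random demo data, assuming the OLSModel and
# _ar_model_fit dependencies used above are available in this module.
# Shapes follow the docstring (time points x voxels for Y, time points x
# regressors for X); values are arbitrary.
import numpy as np

rng = np.random.RandomState(42)
Y_demo = rng.randn(50, 10)   # 50 scans, 10 voxels
X_demo = rng.randn(50, 3)    # 3 regressors
labels_demo, results_demo = run_glm(Y_demo, X_demo, noise_model='ols',
                                    n_jobs=1)
# labels_demo is all zeros for 'ols'; results_demo[0.0] holds the single fit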
def fit(self, X, y):
    """Fit estimators from the training set (X, y).

    Returns
    -------
    self : object
        Returns self.
    """
    if not isinstance(X, dict):
        raise ValueError("X has to be a dict")

    if self.base_estimator._estimator_type == "classifier":
        self.classes_ = np.unique(y)

    self.set_random_state()

    estimators = dict()
    for roi_id, x in X.items():
        estimator = clone(self.base_estimator)
        estimator.roi_id = roi_id
        if self.base_estimator._estimator_type == "searchlight_ensemble":
            estimator.set_params(process_mask_img=x[1])
        estimators[roi_id] = estimator

    if self.vote_graded:
        y_pred = {k: np.full(len(y), np.nan) for k in X.keys()}
        for f, (train_index, test_index) in enumerate(LeaveOneOut(len(y))):
            y_train = [y[i] for i in train_index]
            if self.base_estimator._estimator_type == "searchlight_ensemble":
                estimators_fit = Parallel(
                    n_jobs=self.n_jobs, verbose=self.verbose,
                    backend="threading")(
                    delayed(_parallel_build_estimator)(
                        e, [X[roi_id][0][i] for i in train_index], y_train)
                    for roi_id, e in estimators.items())
                estimators_fit = {e.roi_id: e for e in estimators_fit}
                y_pred_ = Parallel(
                    n_jobs=self.n_jobs, verbose=self.verbose,
                    backend="threading")(
                    delayed(_vote)(
                        e, [X[roi_id][0][i] for i in test_index], False)
                    for roi_id, e in estimators_fit.items())
            else:
                estimators_fit = Parallel(
                    n_jobs=self.n_jobs, verbose=self.verbose,
                    backend="threading")(
                    delayed(_parallel_build_estimator)(
                        e, [X[roi_id][i] for i in train_index], y_train)
                    for roi_id, e in estimators.items())
                estimators_fit = {e.roi_id: e for e in estimators_fit}
                y_pred_ = Parallel(
                    n_jobs=self.n_jobs, verbose=self.verbose,
                    backend="threading")(
                    delayed(_vote)(
                        e, [X[roi_id][i] for i in test_index], False)
                    for roi_id, e in estimators_fit.items())
            for i, roi_id in enumerate(X.keys()):
                y_pred[roi_id][test_index] = y_pred_[i]
        self.vote_weighting = [np.mean(v == np.array(y))
                               for v in y_pred.values()]
        if not np.any(self.vote_weighting):
            self.vote_weighting = 1e-10 * np.ones(len(self.vote_weighting))
    else:
        self.vote_weighting = np.ones(len(X.keys())) / len(X.keys())

    if self.base_estimator._estimator_type == "searchlight_ensemble":
        estimators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              backend="threading")(
            delayed(_parallel_build_estimator)(e, X[roi_id][0], y)
            for roi_id, e in estimators.items())
    else:
        estimators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              backend="threading")(
            delayed(_parallel_build_estimator)(e, X[roi_id], y)
            for roi_id, e in estimators.items())
    self.estimators_ = {e.roi_id: e for e in estimators}

    return self
    #                     paradigm=paradigm, frametimes=frametimes,
    #                     drift_model=drift_model, hrf_model=hrf_model)
    # ProgressReport().finish_dir(subject_output_dir)
    return dict(subject_id=subject_id, mask=mask_path,
                effects_maps=effects_maps, z_maps=z_maps,
                contrasts=contrasts)


# first-level GLM
mem = Memory(os.path.join(output_dir, "cache_dir"))
n_jobs = min(n_jobs, len(subject_ids))
first_levels = Parallel(n_jobs=n_jobs)(
    delayed(mem.cache(do_subject_glm))(subject_id)
    for subject_id in subject_ids)

# run second-level GLM
group_zmaps = group_one_sample_t_test(
    [subject_data["mask"] for subject_data in first_levels],
    [subject_data["effects_maps"] for subject_data in first_levels],
    first_levels[0]["contrasts"],
    output_dir,
    threshold=2.)
plot_prob_atlas([zmap for zmap in group_zmaps.values() if "_minus_" in zmap],
                threshold=1.2, view_type="filled_contours")
plt.savefig("group_zmaps.png")
show()
        s['lat'] = results['geometry']['location']['lat']
        s['lng'] = results['geometry']['location']['lng']
        return 'ok', s
    elif jsondict['status'] == 'OVER_QUERY_LIMIT':
        return 'keyIncrement', _get_coordinate(url, keys, keyI + 1,
                                               attempt_time)
    elif jsondict['status'] == 'ZERO_RESULTS':
        return 'zero', None
    elif jsondict['status'] == 'UNKNOWN_ERROR':
        return _get_coordinate(url, keys, keyI, attempt_time + 1)
    else:
        return 'keyError,parameterError', None


if __name__ == '__main__':
    adrsTable = pd.read_csv('fmtedAddress.csv')
    keys = []
    n_jobs = 10
    l = []
    try:
        table = Parallel(n_jobs=n_jobs)(
            delayed(task_distribute)(
                adrsTable.Address[adrsTable.lat.isnull()].dropna().unique(),
                keys, start_index, n_jobs)
            for start_index in range(n_jobs))
        for s_list in table:
            l.extend(s_list)
    finally:
        if len(l) > 0:
            pd.concat(l, axis=1).T.to_csv('coordinate.csv', index=False)
def fit(self, X, y=None, groups=None, **fit_params):
    if self.fit_params is not None:
        warnings.warn('"fit_params" as a constructor argument was '
                      'deprecated in version 0.19 and will be removed '
                      'in version 0.21. Pass fit parameters to the '
                      '"fit" method instead.', DeprecationWarning)
        if fit_params:
            warnings.warn('Ignoring fit_params passed as a constructor '
                          'argument in favor of keyword arguments to '
                          'the "fit" method.', RuntimeWarning)
        else:
            fit_params = self.fit_params
    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

    scorers, self.multimetric_ = _check_multimetric_scoring(
        self.estimator, scoring=self.scoring)

    if self.multimetric_:
        if self.refit is not False and (
                not isinstance(self.refit, six.string_types) or
                # This will work for both dict / list (tuple)
                self.refit not in scorers):
            raise ValueError("For multi-metric scoring, the parameter "
                             "refit must be set to a scorer key "
                             "to refit an estimator with the best "
                             "parameter setting on the whole data and "
                             "make the best_* attributes "
                             "available for that metric. If this is not "
                             "needed, refit should be set to False "
                             "explicitly. %r was passed." % self.refit)
        else:
            refit_metric = self.refit
    else:
        refit_metric = 'score'

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)
    # Regenerate parameter iterable for each fit
    candidate_params = list(self._get_param_iterator())
    n_candidates = len(candidate_params)

    if self.verbose > 0:
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
        delayed(_fit_and_score)(
            clone(base_estimator), X, y, scorers, train, test,
            self.verbose, parameters, fit_params=fit_params,
            return_train_score=self.return_train_score,
            return_n_test_samples=True, return_times=True,
            return_parameters=False, error_score=self.error_score,
            return_estimator=True)
        for parameters, (train, test) in product(
            candidate_params, cv.split(X, y, groups)))

    n_candidates = len(candidate_params)
    n_folds = cv.get_n_splits()
    self.cv_estimators = []
    for i in range(n_candidates):
        current_slice = out[(i * n_folds):((i + 1) * n_folds)]
        self.cv_estimators.append(
            ('model_%d' % (i + 1),
             [info[-1]['estimator'] for info in current_slice]))
    out = [info[:-1] for info in out]
    self.folds = list(cv.split(X, y, groups))

    # if one chooses to see the train score, "out" will contain train
    # score info
    if self.return_train_score:
        (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
         score_time) = zip(*out)
    else:
        (test_score_dicts, test_sample_counts, fit_time,
         score_time) = zip(*out)

    # test_score_dicts and train_score_dicts are lists of dictionaries and
    # we make them into dicts of lists
    test_scores = _aggregate_score_dicts(test_score_dicts)
    if self.return_train_score:
        train_scores = _aggregate_score_dicts(train_score_dicts)

    results = dict()

    def _store(key_name, array, weights=None, splits=False, rank=False):
        """A small helper to store the scores/times to the cv_results_"""
        # When iterated first by splits, then by parameters
        # We want `array` to have `n_candidates` rows and `n_splits` cols.
        array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                          n_splits)
        if splits:
            for split_i in range(n_splits):
                # Uses closure to alter the results
                results["split%d_%s" % (split_i, key_name)] = array[:, split_i]

        array_means = np.average(array, axis=1, weights=weights)
        results['mean_%s' % key_name] = array_means
        # Weighted std is not directly available in numpy
        array_stds = np.sqrt(
            np.average((array - array_means[:, np.newaxis]) ** 2,
                       axis=1, weights=weights))
        results['std_%s' % key_name] = array_stds

        if rank:
            results["rank_%s" % key_name] = np.asarray(
                rankdata(-array_means, method='min'), dtype=np.int32)

    _store('fit_time', fit_time)
    _store('score_time', score_time)
    # Use one MaskedArray and mask all the places where the param is not
    # applicable for that candidate. Use defaultdict as each candidate may
    # not contain all the params
    param_results = defaultdict(partial(MaskedArray,
                                        np.empty(n_candidates,),
                                        mask=True,
                                        dtype=object))
    for cand_i, params in enumerate(candidate_params):
        for name, value in params.items():
            # An all-masked empty array gets created for the key
            # `"param_%s" % name` at the first occurrence of `name`.
            # Setting the value at an index also unmasks that index
            param_results["param_%s" % name][cand_i] = value

    results.update(param_results)
    # Store a list of param dicts at the key 'params'
    results['params'] = candidate_params

    # NOTE test_sample counts (weights) remain the same for all candidates
    test_sample_counts = np.array(test_sample_counts[:n_splits],
                                  dtype=np.int)

    for scorer_name in scorers.keys():
        # Compute the (weighted) mean and std for test scores alone
        _store('test_%s' % scorer_name, test_scores[scorer_name],
               splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_%s' % scorer_name, train_scores[scorer_name],
                   splits=True)

    # For multi-metric evaluation, store the best_index_, best_params_ and
    # best_score_ iff refit is one of the scorer names
    # In single metric evaluation, refit_metric is "score"
    if self.refit or not self.multimetric_:
        self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
        self.best_params_ = candidate_params[self.best_index_]
        self.best_score_ = results["mean_test_%s" % refit_metric][
            self.best_index_]

    if self.refit:
        self.best_estimator_ = clone(base_estimator).set_params(
            **self.best_params_)
        if y is not None:
            self.best_estimator_.fit(X, y, **fit_params)
        else:
            self.best_estimator_.fit(X, **fit_params)

    # Store the only scorer not as a dict for single metric evaluation
    self.scorer_ = scorers if self.multimetric_ else scorers['score']

    self.cv_results_ = results
    self.n_splits_ = n_splits

    return self
def fit(self, imgs, y=None, confounds=None):
    """Compute the mask and the components

    Parameters
    ----------
    imgs: list of Niimg-like objects
        See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
        Data on which the PCA must be calculated. If this is a list,
        the affine is considered the same for all.
    """
    # Hack to support single-subject data:
    if isinstance(imgs, (_basestring, nibabel.Nifti1Image)):
        imgs = [imgs]
        # This is a very incomplete hack, as it won't work right for
        # a single-subject list of 3D filenames
    if len(imgs) == 0:
        # Common error that arises from a null glob. Capture
        # it early and raise a helpful message
        raise ValueError('Need one or more Niimg-like objects as input, '
                         'an empty list was given.')
    if confounds is None:
        confounds = itertools.repeat(None, len(imgs))

    # First, learn the mask
    if not isinstance(self.mask, (NiftiMasker, MultiNiftiMasker)):
        self.masker_ = MultiNiftiMasker(mask_img=self.mask,
                                        smoothing_fwhm=self.smoothing_fwhm,
                                        target_affine=self.target_affine,
                                        target_shape=self.target_shape,
                                        standardize=self.standardize,
                                        low_pass=self.low_pass,
                                        high_pass=self.high_pass,
                                        mask_strategy='epi',
                                        t_r=self.t_r,
                                        memory=self.memory,
                                        memory_level=self.memory_level,
                                        n_jobs=self.n_jobs,
                                        verbose=max(0, self.verbose - 1))
    else:
        try:
            self.masker_ = clone(self.mask)
        except TypeError as e:
            # Workaround for a joblib bug: in joblib 0.6, a Memory object
            # with cachedir = None cannot be cloned.
            masker_memory = self.mask.memory
            if masker_memory.cachedir is None:
                self.mask.memory = None
                self.masker_ = clone(self.mask)
                self.mask.memory = masker_memory
                self.masker_.memory = Memory(cachedir=None)
            else:
                # The error was raised for another reason
                raise e

        for param_name in ['target_affine', 'target_shape',
                           'smoothing_fwhm', 'low_pass', 'high_pass',
                           't_r', 'memory', 'memory_level']:
            our_param = getattr(self, param_name)
            if our_param is None:
                # Default value
                continue
            if getattr(self.masker_, param_name) is not None:
                warnings.warn('Parameter %s of the masker overridden'
                              % param_name)
            setattr(self.masker_, param_name, our_param)

    # Masker warns if it has a mask_img and is passed
    # imgs to fit(). Avoid the warning by being careful
    # when calling fit.
    if self.masker_.mask_img is None:
        self.masker_.fit(imgs)
    else:
        self.masker_.fit()
    self.mask_img_ = self.masker_.mask_img_

    parameters = get_params(MultiNiftiMasker, self)
    # Remove non-specific and redundant parameters
    for param_name in ['memory', 'memory_level', 'confounds', 'verbose',
                       'n_jobs']:
        parameters.pop(param_name, None)
    parameters['detrend'] = True

    # Now do the subject-level signal extraction (i.e. data loading + PCA)
    subject_pcas = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
        delayed(session_pca)(img, self.masker_.mask_img_, parameters,
                             n_components=self.n_components,
                             memory=self.memory,
                             memory_level=self.memory_level,
                             confounds=confound,
                             verbose=self.verbose,
                             random_state=self.random_state)
        for img, confound in zip(imgs, confounds))
    subject_pcas, subject_svd_vals = zip(*subject_pcas)

    if len(imgs) > 1:
        if not self.do_cca:
            for subject_pca, subject_svd_val in \
                    zip(subject_pcas, subject_svd_vals):
                subject_pca *= subject_svd_val[:, np.newaxis]
        data = np.empty((len(imgs) * self.n_components,
                         subject_pcas[0].shape[1]),
                        dtype=subject_pcas[0].dtype)
        for index, subject_pca in enumerate(subject_pcas):
            if self.n_components > subject_pca.shape[0]:
                raise ValueError('You asked for %i components. '
                                 'This is larger than the single-subject '
                                 'data size (%d).'
                                 % (self.n_components, subject_pca.shape[0]))
            data[index * self.n_components:
                 (index + 1) * self.n_components] = subject_pca
        data, variance, _ = self._cache(
            randomized_svd, func_memory_level=3)(
            data.T, n_components=self.n_components, transpose=True,
            random_state=self.random_state)
        # as_ndarray is to get rid of memmapping
        data = as_ndarray(data.T)
    else:
        data = subject_pcas[0]
        variance = subject_svd_vals[0]
    self.components_ = data
    self.variance_ = variance

    return self
        f1.append(f1_score(y_test, y_test_pred))
    return scores_train, scores_test, precision, f1


print('{:<13} {:<16} {:<13} {:<16} {:<13} {:<16} {:<13} {:<16} {:<}'.format(
    '~|Acc@Train', 'IQR|Acc@Train', '~|Acc@Test', 'IQR|Acc@Test',
    '~|Prec@Test', 'IQR|Prec@Test', '~|F1@Test', 'IQR|F1@Test', 'Config'))
for func, funcname in funcs:
    try:
        func.set_params(n_jobs=1)
    except Exception:
        pass
    result = Parallel(n_jobs=n_jobs, verbose=0)(
        delayed(parallel_fit)(func,
                              seeds[seeds_per_job[i]:seeds_per_job[i + 1]],
                              X, y, test_size)
        for i in range(n_jobs))
    scores_train, scores_test, precision, f1 = zip(*result)
    scores_train = list(itertools.chain.from_iterable(scores_train))
    scores_test = list(itertools.chain.from_iterable(scores_test))
    precision = list(itertools.chain.from_iterable(precision))
    f1 = list(itertools.chain.from_iterable(f1))
    if funcname is None:
        funcname = str(func)
        funcname = funcname[:funcname.find('(')]
    print('{:<13.3f} {:<16.5f} {:<13.3f} {:<16.5f} {:<13.3f} {:<16.5f} '
          '{:<13.3f} {:<16.5f} {:<}'
          .format(np.median(scores_train),
                  np.subtract(*np.percentile(scores_train, [75, 25])),
                  np.median(scores_test),
                  np.subtract(*np.percentile(scores_test, [75, 25])),
def fit(self, X, y):
    """Build a Bagging ensemble of estimators from the training set (X, y).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape = [n_samples, n_features]
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    y : array-like, shape = [n_samples]
        The target values (class labels in classification, real numbers
        in regression).

    Returns
    -------
    self : object
        Returns self.
    """
    random_state = check_random_state(self.random_state)

    # Convert data
    X, y = check_X_y(X, y, ['csr', 'csc'])

    # Remap output
    n_samples, self.n_features_ = X.shape
    y = self._validate_y(y)

    # Check parameters
    self._validate_estimator()

    if isinstance(self.max_samples, (numbers.Integral, np.integer)):
        max_samples = self.max_samples
    else:  # float
        max_samples = int(self.max_samples * X.shape[0])

    if not (0 < max_samples <= X.shape[0]):
        raise ValueError("max_samples must be in (0, n_samples]")

    if isinstance(self.max_features, (numbers.Integral, np.integer)):
        max_features = self.max_features
    else:  # float
        max_features = int(self.max_features * self.n_features_)

    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    if self.warm_start and self.oob_score:
        raise ValueError("Out of bag estimate only available"
                         " if warm_start=False")

    if hasattr(self, "oob_score_") and self.warm_start:
        del self.oob_score_

    if not self.warm_start or len(self.estimators_) == 0:
        # Free allocated memory, if any
        self.estimators_ = []
        self.estimators_samples_ = []
        self.estimators_features_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError('n_estimators=%d must be larger or equal to '
                         'len(estimators_)=%d when warm_start==True'
                         % (self.n_estimators, len(self.estimators_)))

    elif n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
        return self

    # Parallel loop
    n_jobs, n_estimators, starts = _partition_estimators(
        n_more_estimators, self.n_jobs)

    # Advance random state to state after training
    # the first n_estimators
    if self.warm_start and len(self.estimators_) > 0:
        random_state.randint(MAX_INT, size=len(self.estimators_))

    seeds = random_state.randint(MAX_INT, size=n_more_estimators)

    all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
        # TEF: changed following call to balanced procedure:
        delayed(_parallel_build_balanced_estimators)(
            n_estimators[i], self, X, y, seeds[starts[i]:starts[i + 1]],
            verbose=self.verbose)
        for i in range(n_jobs))

    # Reduce
    self.estimators_ += list(
        itertools.chain.from_iterable(t[0] for t in all_results))
    self.estimators_samples_ += list(
        itertools.chain.from_iterable(t[1] for t in all_results))
    self.estimators_features_ += list(
        itertools.chain.from_iterable(t[2] for t in all_results))

    if self.oob_score:
        self._set_oob_score(X, y)

    return self
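# Hedged usage sketch: assuming this fit belongs to a bagging-style ensemble
# class (the balanced variant suggested by the comment in the Parallel call),
# it is driven like any scikit-learn estimator. The class name below is a
# hypothetical stand-in, not the original project's name.
#
#   clf = BalancedBaggingEnsemble(n_estimators=10, n_jobs=-1, random_state=0)
#   clf.fit(X_train, y_train)  # trains the 10 members in parallel
#   len(clf.estimators_)       # -> 10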
X_sg_p, v_sg = preprocess.preprocess_data(X_sg, suffix="$", n=5,
                                          return_vect=True, binarize=False)
X_pl_p, v_pl = preprocess.preprocess_data(X_pl, suffix="$", n=5,
                                          return_vect=True, binarize=False)

X_sg_n_clean = preprocess.load_data("data/singular_n.txt", labels=False)
X_sg_n = v_sg.transform(X_sg_n_clean)
# X_sg_n = Binarizer(copy=False).transform(v_sg.transform(X_sg_n_clean))
X_pl_n_clean = preprocess.load_data("data/plural_n.txt", labels=False)
X_pl_n = v_pl.transform(X_pl_n_clean)
# X_pl_n = Binarizer(copy=False).transform(v_pl.transform(X_pl_n_clean))

scores = []
n_steps = 100
print "size \tratio\tsg_score\tpl_score\tscore \tsg_std \tpl_std \tstd"
for train_proportion in np.linspace(0.1, 1, 10):
    train_size = len(X_sg) * train_proportion
    steps = [shuffle(X_sg_p, y_sg, X_pl_p, y_pl, n_samples=train_size)
             for k in xrange(n_steps)]
    step_scores = Parallel(n_jobs=-1, verbose=False)(
        delayed(nouns_score)(*step, X_sg_test=X_sg_n, X_pl_test=X_pl_n)
        for step in steps
    )
    step_scores = np.array(step_scores)
    score = np.r_[train_size, train_proportion,
                  step_scores.mean(axis=0), step_scores.std(axis=0)]
    print "%d\t%.2f\t%.6f\t%.6f\t%.6f\t%.4e\t%.4e\t%.4e" % tuple(score)
    scores.append(score)

print "Pickling scores..."
scores = np.array(scores)
plot(scores)
np.save("train_size_i", scores)
def fit(self, X, y=None):
    """Fit (estimates) the clusters.

    Parameters
    ----------
    X : ndarray, shape (n_trials, n_channels, n_channels)
        ndarray of SPD matrices.
    y : ndarray | None (default None)
        Not used, here for compatibility with sklearn API.

    Returns
    -------
    self : Kmeans instance
        The Kmeans instance.
    """
    if (self.init != 'random') or (self.n_init == 1):
        # no need to iterate if init is not random
        labels, inertia, mdm = _fit_single(X, y,
                                           n_clusters=self.n_clusters,
                                           init=self.init,
                                           random_state=self.seed,
                                           metric=self.metric,
                                           max_iter=self.max_iter,
                                           tol=self.tol,
                                           n_jobs=self.n_jobs)
    else:
        numpy.random.seed(self.seed)
        seeds = numpy.random.randint(numpy.iinfo(numpy.int32).max,
                                     size=self.n_init)
        if self.n_jobs == 1:
            res = []
            for i in range(self.n_init):
                res.append(_fit_single(X, y,
                                       n_clusters=self.n_clusters,
                                       init=self.init,
                                       random_state=seeds[i],
                                       metric=self.metric,
                                       max_iter=self.max_iter,
                                       tol=self.tol))
            labels, inertia, mdm = zip(*res)
        else:
            res = Parallel(n_jobs=self.n_jobs, verbose=0)(
                delayed(_fit_single)(X, y,
                                     n_clusters=self.n_clusters,
                                     init=self.init,
                                     random_state=seed,
                                     metric=self.metric,
                                     max_iter=self.max_iter,
                                     tol=self.tol,
                                     n_jobs=1)
                for seed in seeds)
            labels, inertia, mdm = zip(*res)

        best = numpy.argmin(inertia)
        mdm = mdm[best]
        labels = labels[best]
        inertia = inertia[best]

    self.mdm_ = mdm
    self.inertia_ = inertia
    self.labels_ = labels

    return self
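# Hedged usage sketch, assuming this fit belongs to a pyriemann-style Kmeans
# estimator with n_clusters / n_init / seed parameters. The SPD matrices are
# random demo data: a.dot(a.T) plus a ridge keeps them positive definite.
#
#   import numpy
#   rng = numpy.random.RandomState(7)
#   A = rng.randn(20, 4, 4)
#   X_spd = numpy.array([a.dot(a.T) + 4 * numpy.eye(4) for a in A])
#   km = Kmeans(n_clusters=2, n_init=4, n_jobs=2).fit(X_spd)
#   km.labels_  # cluster assignment per trial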
if __name__ == "__main__": # classifier = BipartiteRankBoost(n_estimators=50, verbose=1) classifier = GradientBoostingClassifier( n_estimators=800, subsample=0.9, learning_rate=0.05, max_depth=3, random_state=1, verbose=0 ) feature_list = ["nauthors", "npapers", "year", "nattrib", "ncoauthor", "paperrank", "globalpaperrank", "nappear"] trainfeatures = loadFeatures(feature_list, mode="train") trainlabels = cPickle.load(open("labels.train", "rb")) cv_authors = KFold(len(trainlabels), n_folds=5, indices=True, shuffle=True, random_state=1) score = Parallel(n_jobs=-1)( delayed(crossValidation)(trainlabels, trainfeatures, classifier, train_authors, test_authors, pairwise=False) for train_authors, test_authors in cv_authors ) score = np.array(score) print "score mean, std, mean-std:", score.mean(), score.std(), score.mean() - score.std() # testfeatures = loadFeatures(feature_list, mode='test') # testlabels = cPickle.load(open('labels.test', 'rb')) # trainAndPredict(trainlabels, trainfeatures, testlabels, testfeatures, classifier, pairwise=False) # shuffleCrossValidation(trainlabels, trainfeatures, classifier, n_iter=5, verbose=0, pairwise=False)
def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
            pre_dispatch='2*n_jobs', confidence=-1.):
    return (m
            for c in Parallel(n_jobs=n_jobs, batch_size=1,
                              pre_dispatch=pre_dispatch)(
                delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
                for chunk in _chunks(reads, chunk_size))
            for m in c)
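# Usage sketch: predict returns a lazy generator of per-read results, so
# labels stream out as each chunk finishes. `trained_pipeline` is a
# placeholder for whatever fitted estimator the module's _predict_chunk
# helper expects, and the reads are illustrative.
#
#   labels = predict(["ACGT", "GGCA"], trained_pipeline, chunk_size=1,
#                    n_jobs=2)
#   for label in labels:
#       print(label)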
# first setting
print("\nBinacox vs. Auto Cutoff computing times")
n_features = 1
n_cut_points = 2
cov_corr = .5
sparsity = .2
N_simu = 100
n_samples_grid = [300, 500, 1000, 2000, 4000]

result_ = pd.DataFrame(columns=["n_samples", "time_bina", "time_ac_all",
                                "time_ac_grid"])
for i, n_samples in enumerate(n_samples_grid):
    print("n_samples: %d/%d " % ((i + 1), len(n_samples_grid)))
    result_n = Parallel(n_jobs=10)(
        delayed(get_times1)(n_simu, n_samples, n_features, n_cut_points)
        for n_simu in range(N_simu))
    result_n = pd.DataFrame(result_n, columns=["n_samples", "time_bina",
                                               "time_ac_all", "time_ac_grid"])
    result_ = result_.append(result_n, ignore_index=True)

result = pd.DataFrame(columns=["n", "method", "time"])
tmp = pd.DataFrame(columns=["n", "method", "time"])
tmp.n = result_.n_samples
tmp.method = "Binacox"
tmp.time = result_.time_bina
result = result.append(tmp, ignore_index=True)

tmp.n = result_.n_samples
tmp.method = "AC all"
dir_imgs = r'F:\Avinash\Ablations & Behavior\RS neurons\M homologs\20190308\20190309_behavior\f3_abl_vibAmpOnly_amp_3\fastDir_03-14-19-065345'
headDiam = 1  # Approximate head diameter in mm (for determining head position by weighted average)

#%% Compute background image
print('Computing background...')
img_back = fsb.track.computeBackground(dir_imgs)
print('Estimating pixel size...')
pxlSize = fsb.getPxlSize(img_back)[0]

#%% Find fish position
imgNames = ft.findAndSortFilesInDir(dir_imgs, ext='bmp')
r = int(0.5 * headDiam / pxlSize)
print('Estimating fish position...')
from sklearn.externals.joblib import Parallel, delayed
from skimage.io import imread
fp = Parallel(n_jobs=32, verbose=1)(
    delayed(fsb.track.findFish)(imread(os.path.join(dir_imgs, imgName)),
                                back_img=img_back, r=r)
    for imgName in imgNames)
fp = np.array(fp)

#%% Sanity check - Look at fish position trajectories
nFramesInTrl = 750
fp_trl = ft.sublistsFromList(fp, nFramesInTrl)
plt.figure(figsize=(16, 16))
plt.imshow(img_back, cmap='gray')
for trl, fp_ in enumerate(fp_trl):
    fp_ = np.array(fp_)
    plt.plot(fp_[:, 0], fp_[:, 1], '.-', markersize=4,
             color=plt.cm.tab20(trl))
def fit(self, X=None, y=None):
    n_alpha_grid_points = 4
    self.results_ = np.zeros((n_alpha_grid_points, self.n_grid_points))
    self.grid_ = np.logspace(0, np.log10(200), self.n_grid_points)
    if self.adj_type == 'erdos-renyi':
        self.alphas_ = np.logspace(-2.3, np.log10(.025),
                                   n_alpha_grid_points)[::1]
    else:
        self.alphas_ = np.logspace(np.log(.1), np.log10(.3),
                                   n_alpha_grid_points)[::1]

    self.ks_ = []

    for aidx, alpha in enumerate(self.alphas_):
        if self.verbose:
            print('at alpha {} ({}/{})'.format(
                alpha, aidx, n_alpha_grid_points,
            ))

        # draw a new fixed graph for alpha
        cov, prec, adj = new_graph(self.n_features, alpha,
                                   adj_type=self.adj_type,
                                   random_sign=False, seed=1)
        n_nonzero_prec = np.count_nonzero(np.triu(adj, 1).flat)
        self.ks_.append(n_nonzero_prec)
        mcmc_prng = np.random.RandomState(2)

        if self.verbose:
            print('   Graph has {} nonzero entries'.format(n_nonzero_prec))

        for sidx, sample_grid in enumerate(self.grid_):
            n_samples = int(sample_grid * self.n_features)

            # Debugging
            # print alpha, n_samples

            # model selection (once)
            X = mvn(n_samples, self.n_features, cov, random_state=mcmc_prng)
            ms_estimator = clone(self.model_selection_estimator)
            ms_estimator.fit(X)
            lam = getattr(ms_estimator, self.penalty_)

            if self.verbose:
                display_lam = lam
                if isinstance(lam, np.ndarray):
                    display_lam = np.linalg.norm(lam)
                print('   ({}/{}), n_samples = {}, selected lambda = {}'.format(
                    sidx, self.n_grid_points, n_samples, display_lam))

            # set up default trial estimator
            if self.trial_estimator is None:
                trial_estimator = QuicGraphLasso(lam=lam,
                                                 mode='default',
                                                 init_method='corrcoef')
            elif self.trial_estimator == 'Adaptive':
                trial_estimator = AdaptiveGraphLasso(
                    estimator=QuicGraphLasso(lam=lam,
                                             mode='default',
                                             init_method='corrcoef'),
                    method='inverse_squared')
            else:
                trial_estimator = self.trial_estimator

            # patch trial estimator with this lambda
            if self.trial_estimator == 'Adaptive':
                trial_estimator.estimator_.set_params(**{
                    self.penalty: lam,
                })
            else:
                trial_estimator.set_params(**{
                    self.penalty: lam,
                })

            # estimate statistical power
            exact_support_counts = Parallel(
                n_jobs=self.n_jobs,
                verbose=False,
                backend='threading',
                # max_nbytes=None,
                # batch_size=1,
            )(
                delayed(sp_trial)(
                    trial_estimator, n_samples, self.n_features, cov, adj,
                    mcmc_prng
                )
                for nn in range(self.n_trials))

            self.results_[aidx, sidx] = (1. * np.sum(exact_support_counts)
                                         / self.n_trials)

        if self.verbose:
            print('Results at this row: {}'.format(self.results_[aidx, :]))

    self.is_fitted = True
    return self
def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
    """Perform feature selection and learn model from training data.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.
        New in v 0.13.0: pandas DataFrames are now also accepted as
        argument for X.
    y : array-like, shape = [n_samples]
        Target values.
    custom_feature_names : None or tuple (default: tuple)
        Custom feature names for `self.k_feature_names` and
        `self.subsets_[i]['feature_names']`.
        (new in v 0.13.0)
    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set. Passed to the fit method of the cross-validator.
    fit_params : dict of string -> object, optional
        Parameters to pass to the fit method of the classifier.

    Returns
    -------
    self : object

    """

    # reset from a potential previous fit run
    self.subsets_ = {}
    self.fitted = False
    self.interrupted_ = False
    self.best_idx_ = None
    self.best_feature_names_ = None
    self.best_score_ = None

    if hasattr(X, 'loc'):
        X_ = X.values
    else:
        X_ = X

    if (custom_feature_names is not None
            and len(custom_feature_names) != X.shape[1]):
        raise ValueError('If custom_feature_names is not None, '
                         'the number of elements in custom_feature_names '
                         'must equal the number of columns in X.')

    if (not isinstance(self.max_features, int) or
            (self.max_features > X.shape[1] or self.max_features < 1)):
        raise AttributeError('max_features must be'
                             ' smaller than %d and larger than 0' %
                             (X.shape[1] + 1))

    if (not isinstance(self.min_features, int) or
            (self.min_features > X.shape[1] or self.min_features < 1)):
        raise AttributeError('min_features must be'
                             ' smaller than %d and larger than 0' %
                             (X.shape[1] + 1))

    if self.max_features < self.min_features:
        raise AttributeError('min_features must be <= max_features')

    candidates = chain(*((combinations(range(X_.shape[1]), r=i))
                         for i in range(self.min_features,
                                        self.max_features + 1)))

    def ncr(n, r):
        """Return the number of combinations of length r from n items.

        Parameters
        ----------
        n : {integer}
            Total number of items
        r : {integer}
            Number of items to select from n

        Returns
        -------
        Number of combinations, integer
        """
        r = min(r, n - r)
        if r == 0:
            return 1
        numer = reduce(op.mul, range(n, n - r, -1))
        denom = reduce(op.mul, range(1, r + 1))
        return numer // denom

    all_comb = np.sum([ncr(n=X_.shape[1], r=i)
                       for i in range(self.min_features,
                                      self.max_features + 1)])

    n_jobs = min(self.n_jobs, all_comb)
    parallel = Parallel(n_jobs=n_jobs, pre_dispatch=self.pre_dispatch)
    work = enumerate(parallel(delayed(_calc_score)(self, X_, y, c,
                                                   groups=groups,
                                                   **fit_params)
                              for c in candidates))

    try:
        for iteration, (c, cv_scores) in work:
            self.subsets_[iteration] = {'feature_idx': c,
                                        'cv_scores': cv_scores,
                                        'avg_score': np.mean(cv_scores)}

            if self.print_progress:
                sys.stderr.write('\rFeatures: %d/%d' %
                                 (iteration + 1, all_comb))
                sys.stderr.flush()

            if self._TESTING_INTERRUPT_MODE:
                self.subsets_, self.best_feature_names_ = \
                    _get_featurenames(self.subsets_,
                                      self.best_idx_,
                                      custom_feature_names,
                                      X)
                raise KeyboardInterrupt

    except KeyboardInterrupt as e:
        self.interrupted_ = True
        sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')

    max_score = float('-inf')
    for c in self.subsets_:
        if self.subsets_[c]['avg_score'] > max_score:
            max_score = self.subsets_[c]['avg_score']
            best_subset = c
    score = max_score
    idx = self.subsets_[best_subset]['feature_idx']

    self.best_idx_ = idx
    self.best_score_ = score
    self.fitted = True
    self.subsets_, self.best_feature_names_ = \
        _get_featurenames(self.subsets_,
                          self.best_idx_,
                          custom_feature_names,
                          X)
    return self
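# Hedged usage sketch, assuming this fit belongs to an mlxtend-style
# exhaustive feature selector wrapping a scikit-learn estimator; the class
# name is taken from that convention, not confirmed by this snippet:
#
#   from sklearn.neighbors import KNeighborsClassifier
#   efs = ExhaustiveFeatureSelector(KNeighborsClassifier(n_neighbors=3),
#                                   min_features=1, max_features=3)
#   efs.fit(X_train, y_train)
#   efs.best_idx_, efs.best_score_   # best subset and its CV score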
def fit(self, X, y, X_val=None, y_val=None, **kwargs):
    """Fit underlying estimators.

    If the number of classes = 2, only one model is trained to predict
    the class 1 (second column)

    Parameters
    ----------
    X : (sparse) array-like, shape = [n_samples, n_classes]
        Data.

    y : (sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
        Multi-class targets. An indicator matrix turns on multilabel
        classification.

    Returns
    -------
    self
    """
    # A sparse LabelBinarizer, with sparse_output=True, has been shown to
    # outperform or match a dense label binarizer in all cases and has
    # also resulted in less or equal memory consumption in the fit_ovr
    # function overall.
    if X.shape[1] == 2:
        x_columns = (X[:, 1].ravel().T, )
    else:
        x_columns = (col.ravel() for col in X.T)
    self.label_binarizer_ = LabelBinarizer(sparse_output=True)
    Y = self.label_binarizer_.fit_transform(y)
    Y = Y.tocsc()
    self.classes_ = self.label_binarizer_.classes_
    y_columns = (col.toarray().ravel() for col in Y.T)

    if 'X_val' in inspect.getargspec(self.estimator.fit).args and \
            X_val is not None:
        if X_val.shape[1] == 2:
            x_val_columns = (X_val[:, 1].ravel().T, )
        else:
            x_val_columns = (col.ravel() for col in X_val.T)
        Y_val = self.label_binarizer_.transform(y_val)
        Y_val = Y_val.tocsc()
        y_val_columns = (col.toarray().ravel() for col in Y_val.T)
    else:
        x_val_columns = [None] * np.shape(Y)[0]
        y_val_columns = [None] * np.shape(Y)[0]

    # In cases where individual estimators are very fast to train, setting
    # n_jobs > 1 can result in slower performance due to the overhead
    # of spawning threads. See joblib issue #112.
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_binary)(
            self.estimator, x_column, y_column, x_val_column, y_val_column,
            classes=["not %s" % self.label_binarizer_.classes_[i],
                     self.label_binarizer_.classes_[i]])
        for i, (x_column, y_column, x_val_column, y_val_column)
        in enumerate(zip(x_columns, y_columns,
                         x_val_columns, y_val_columns)))

    return self
def extract_dataset(self, dataset, n_jobs=-1, verbosity=2,
                    calc4train_set=False):
    if verbosity > 1:
        print("   Calculating Histograms %s, %s" % (
            colorspace_name[self._colorspace], str(self._original_bins)))

    if calc4train_set:
        images = dataset.probe.images_train + dataset.probe.images_test
        images += dataset.gallery.images_train + dataset.gallery.images_test
    else:
        images = dataset.probe.images_test
        images += dataset.gallery.images_test

    if dataset.probe.masks_test:
        if calc4train_set:
            masks = dataset.probe.masks_train + dataset.probe.masks_test
            masks += dataset.gallery.masks_train + dataset.gallery.masks_test
        else:
            masks = dataset.probe.masks_test
            masks += dataset.gallery.masks_test
    else:
        masks = [None] * (len(images))

    if dataset.probe.regions_test:
        if calc4train_set:
            regions = dataset.probe.regions_train + dataset.probe.regions_test
            regions += (dataset.gallery.regions_train +
                        dataset.gallery.regions_test)
        else:
            regions = dataset.probe.regions_test
            regions += dataset.gallery.regions_test
    else:
        regions = [None] * (len(images))

    if dataset.probe.maps_test:
        if calc4train_set:
            maps = dataset.probe.maps_train + dataset.probe.maps_test
            maps += dataset.gallery.maps_train + dataset.gallery.maps_test
        else:
            maps = dataset.probe.maps_test
            maps += dataset.gallery.maps_test
    else:
        maps = [None] * (len(images))

    args = ((im, mask, region, m)
            for im, mask, region, m in zip(images, masks, regions, maps))

    results = Parallel(n_jobs)(
        delayed(_parallel_transform)(self, im, mask, reg, m)
        for im, mask, reg, m in args)

    test_len = dataset.test_size
    if calc4train_set:
        train_len = dataset.train_size
        dataset.probe.fe_train = np.asarray(results[:train_len])
        dataset.probe.fe_test = np.asarray(
            results[train_len:train_len + test_len])
        dataset.gallery.fe_train = np.asarray(
            results[train_len + test_len:-test_len])
        dataset.gallery.fe_test = np.asarray(results[-test_len:])
    else:
        dataset.probe.fe_test = np.asarray(results[:test_len])
        dataset.gallery.fe_test = np.asarray(results[-test_len:])
# generate cross-validation values for leave-one-value-out or k-fold
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
    headers = load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
else:
    fold_values = range(int(p['foldCount']))
nested_fold_values = range(int(p['nestedFoldCount']))
bag_count = int(p['bagCount'])
bag_values = range(bag_count) if bag_count > 1 else [0]

# ensure java's classpath is set
classpath = environ['CLASSPATH']

# command for cluster execution if enabled
use_cluster = False if 'useCluster' not in p else p['useCluster'] == 'true'
cluster_cmd = 'rc.py --cores 1 --walltime 06:00:00 --queue small --allocation acc_9'

# load classifiers from file, skip commented lines
classifiers = filter(lambda x: not x.startswith('#'),
                     open(classifiers_fn).readlines())
classifiers = [_.strip() for _ in classifiers]

working_dir = dirname(abspath(argv[0]))
n_jobs = 1 if use_cluster else -1  # 3
all_parameters = list(product([working_dir], [project_path], classifiers,
                              fold_values, bag_values))
Parallel(n_jobs=n_jobs, verbose=50)(
    delayed(classify)(parameters) for parameters in all_parameters)
def compute_multi_epi_mask(epi_imgs, lower_cutoff=0.2, upper_cutoff=0.9,
                           connected=True, opening=2, threshold=0.5,
                           target_affine=None, target_shape=None,
                           exclude_zeros=False, n_jobs=1,
                           memory=None, verbose=0):
    """Compute a common mask for several sessions or subjects of fMRI data.

    Uses the mask-finding algorithms to extract masks for each session
    or subject, and then keeps only the main connected component of the
    thresholded intersection of all the masks.

    Parameters
    ----------
    epi_imgs: list of Niimgs
        A list of arrays, each item being a subject or a session.
        3D and 4D images are accepted.
        If 3D images are given, we suggest using the mean image of each
        session.

    threshold: float, optional
        The inter-session threshold: the fraction of the total number of
        sessions for which a voxel must be in the mask to be kept in the
        common mask.
        threshold=1 corresponds to keeping the intersection of all masks,
        whereas threshold=0 is the union of all masks.

    lower_cutoff: float, optional
        lower fraction of the histogram to be discarded.

    upper_cutoff: float, optional
        upper fraction of the histogram to be discarded.

    connected: boolean, optional
        if connected is True, only the largest connected component is kept.

    exclude_zeros: boolean, optional
        Consider zeros as missing values for the computation of the
        threshold. This option is useful if the images have been resliced
        with a large padding of zeros.

    target_affine: 3x3 or 4x4 matrix, optional
        This parameter is passed to image.resample_img. Please see the
        related documentation for details.

    target_shape: 3-tuple of integers, optional
        This parameter is passed to image.resample_img. Please see the
        related documentation for details.

    memory: instance of joblib.Memory or string
        Used to cache the function call.

    n_jobs: integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    Returns
    -------
    mask : 3D nifti-like image
        The brain mask.
    """
    if len(epi_imgs) == 0:
        raise TypeError('An empty object - %r - was passed instead of an '
                        'image or a list of images' % epi_imgs)
    masks = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(compute_epi_mask)(epi_img,
                                  lower_cutoff=lower_cutoff,
                                  upper_cutoff=upper_cutoff,
                                  connected=connected,
                                  opening=opening,
                                  exclude_zeros=exclude_zeros,
                                  target_affine=target_affine,
                                  target_shape=target_shape,
                                  memory=memory)
        for epi_img in epi_imgs)

    mask = intersect_masks(masks, connected=connected, threshold=threshold)
    return mask
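# Usage sketch, assuming nilearn-style EPI inputs; the file names below are
# hypothetical. Each subject's mask is computed in parallel, then combined
# with the 0.5 intersection threshold described in the docstring.
#
#   mask = compute_multi_epi_mask(['subj1_epi.nii.gz', 'subj2_epi.nii.gz'],
#                                 threshold=0.5, n_jobs=2)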
def smacof(similarities, metric=True, n_components=2, init=None, n_init=8,
           n_jobs=1, max_iter=300, verbose=0, eps=1e-3, random_state=None,
           return_n_iter=False):
    """
    Computes multidimensional scaling using the SMACOF (Scaling by
    Majorizing a Complicated Function) algorithm.

    The SMACOF algorithm is a multidimensional scaling algorithm: it
    minimizes an objective function, the *stress*, using a majorization
    technique. The Stress Majorization, also known as the Guttman
    Transform, guarantees a monotone convergence of Stress, and is more
    powerful than traditional techniques such as gradient descent.

    The SMACOF algorithm for metric MDS can be summarized by the following
    steps:

    1. Set an initial start configuration, randomly or not.
    2. Compute the stress
    3. Compute the Guttman Transform
    4. Iterate 2 and 3 until convergence.

    The nonmetric algorithm adds a monotonic regression step before
    computing the stress.

    Parameters
    ----------
    similarities : symmetric ndarray, shape (n_samples, n_samples)
        similarities between the points

    metric : boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm

    n_components : int, optional, default: 2
        number of dimensions in which to immerse the similarities;
        overridden if an initial array is provided.

    init : {None or ndarray of shape (n_samples, n_components)}, optional
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array

    n_init : int, optional, default: 8
        Number of times the SMACOF algorithm will be run with different
        initialisations. The final results will be the best output of the
        n_init consecutive runs in terms of stress.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. This works by
        breaking down the pairwise matrix into n_jobs even slices and
        computing them in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single
        run

    verbose : int, optional, default: 0
        level of verbosity

    eps : float, optional, default: 1e-3
        relative tolerance w.r.t stress to declare convergence

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    return_n_iter : bool
        Whether or not to return the number of iterations.

    Returns
    -------
    X : ndarray (n_samples, n_components)
        Coordinates of the n_samples points in a n_components-space

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points)

    n_iter : int
        The number of iterations corresponding to the best stress.
        Returned only if `return_n_iter` is set to True.

    Notes
    -----
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29, (1964)
    """
    similarities = check_array(similarities)
    random_state = check_random_state(random_state)

    if hasattr(init, '__array__'):
        init = np.asarray(init).copy()
        if not n_init == 1:
            warnings.warn(
                'Explicit initial positions passed: '
                'performing only one init of the MDS instead of %d' % n_init)
            n_init = 1

    best_pos, best_stress = None, None
    if n_jobs == 1:
        for it in range(n_init):
            pos, stress, n_iter_ = function_library._smacof_single(
                similarities, metric=metric,
                n_components=n_components, init=init,
                max_iter=max_iter, verbose=verbose,
                eps=eps, random_state=random_state)
            if best_stress is None or stress < best_stress:
                best_stress = stress
                best_pos = pos.copy()
                best_iter = n_iter_
    else:
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
            delayed(function_library._smacof_single)(
                similarities, metric=metric, n_components=n_components,
                init=init, max_iter=max_iter, verbose=verbose, eps=eps,
                random_state=seed)
            for seed in seeds)
        positions, stress, n_iters = zip(*results)
        best = np.argmin(stress)
        best_stress = stress[best]
        best_pos = positions[best]
        best_iter = n_iters[best]

    if return_n_iter:
        return best_pos, best_stress, best_iter
    else:
        return best_pos, best_stress
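# Usage sketch: embed a tiny symmetric dissimilarity matrix in 2-D, assuming
# the function_library module used above is importable. The matrix is
# arbitrary demo data; with n_init=4 and n_jobs=2 the restarts run on two
# workers and the lowest-stress solution is kept.
import numpy as np

D_demo = np.array([[0., 1., 2.],
                   [1., 0., 1.],
                   [2., 1., 0.]])
pos, stress = smacof(D_demo, n_components=2, n_init=4, n_jobs=2,
                     random_state=0)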
if split > 0:
    np.random.shuffle(new_order)
y_shfl[new_order] = np.copy(y)
Xm_shfl[new_order, :, :] = np.copy(Xm)
sw_shfl[new_order] = np.copy(sample_weight)
cv = StratifiedKFold(y_shfl, k=n_folds)  # old scikit-learn API

# Run the cross-validation folds in parallel
out = Parallel(n_jobs=n_cores)(delayed(my_pipeline)(
    train=train, test=test, Xm_shfl=Xm_shfl, y_shfl=y_shfl,
    sw_shfl=sw_shfl, Xmg=Xmg, dims=dims, fs=fs, scaler=scaler, clf=clf,
    n_samples=n_samples, n_dims=n_dims, n_dims_tg=n_dims_tg,
    n_classes=n_classes)
    for train, test in cv)

# Reorder the per-fold results for this split
for fold, (train, test) in enumerate(cv):
    all_folds[split, fold, train] = 1
    all_folds[split, fold, test] = 0
    coef[split, fold, :, :, :] = out[fold]['coef']
    if compute_predict:
        predict[split, test, :, :] = \
            out[fold]['predict'][new_order[test], :, :]
def data_consist(_table1, _table2, _key1, _key2, _schema1, _schema2, fname,
                 sample_size=1.0, feature_colname1='column',
                 feature_colname2='column', dtype_colname1='type',
                 dtype_colname2='type', output_root='', keep_images=False,
                 n_jobs=1):
    """
    Check consistency between two tables.

    Parameters
    ----------
    _table1: pandas DataFrame
        one of the two tables to compare
    _table2: pandas DataFrame
        one of the two tables to compare
    _key1: string
        key column for _table1
    _key2: string
        key column for _table2
    _schema1: pandas DataFrame
        data schema (contains column names and corresponding data types)
        for _table1
    _schema2: pandas DataFrame
        data schema (contains column names and corresponding data types)
        for _table2
    fname: string
        the output file name
    sample_size: integer or float (<= 1.0), default=1.0
        if int: number of rows to sample for the comparison (useful for
        large tables); if float: fraction of rows to sample
    feature_colname1: string, default='column'
        name of the column holding feature names in _schema1
    feature_colname2: string, default='column'
        name of the column holding feature names in _schema2
    dtype_colname1: string, default='type'
        name of the column holding data types in _schema1
    dtype_colname2: string, default='type'
        name of the column holding data types in _schema2
    output_root: string, default=''
        the root directory for the output file
    keep_images: boolean, default=False
        whether to keep all generated images
    n_jobs: int, default=1
        the number of jobs to run in parallel
    """
    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # prepare directory for generated images
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # copy data tables
    table1 = _table1.copy()
    table2 = _table2.copy()

    # calculate the sample size: an integer sample_size is a row count,
    # a float is a fraction of the rows
    both_keys = list(
        set(table1[_key1].values).intersection(set(table2[_key2].values)))
    if sample_size <= 1.0:
        sample_size = np.min([int(table1.shape[0] * sample_size),
                              int(table2.shape[0] * sample_size),
                              len(both_keys)])
    else:
        sample_size = np.min([int(sample_size), len(both_keys)])
    sample_keys = np.random.choice(both_keys, sample_size, replace=False)
    table1 = table1[table1[_key1].isin(sample_keys)].reset_index(drop=True)
    table2 = table2[table2[_key2].isin(sample_keys)].reset_index(drop=True)

    # copy both schemas
    schema1 = _schema1.copy()[[feature_colname1, dtype_colname1]].rename(
        columns={feature_colname1: 'column_1', dtype_colname1: 'type_1'})
    schema2 = _schema2.copy()[[feature_colname2, dtype_colname2]].rename(
        columns={feature_colname2: 'column_2', dtype_colname2: 'type_2'})

    # merge the two schemas
    schema = schema1.merge(schema2, left_on='column_1',
                           right_on='column_2', how='outer')

    # if data types differ between schema1 and schema2, move to error
    schema_error = schema[schema['type_1'] != schema['type_2']].reset_index(
        drop=True)
    schema_error['error'] = "inconsistent data types"
    schema_error.loc[schema_error['column_1'].isnull(),
                     'error'] = "column not in table1"
    schema_error.loc[schema_error['column_2'].isnull(),
                     'error'] = "column not in table2"
    schema_correct = schema[schema['type_1'] == schema['type_2']].reset_index(
        drop=True)

    # classify the features to compare
    key_features = schema_correct[
        schema_correct['type_1'] == 'key']['column_1'].values
    numeric_features = schema_correct[
        schema_correct['type_1'] == 'numeric']['column_1'].values
    string_features = schema_correct[
        schema_correct['type_1'] == 'str']['column_1'].values
    date_features = schema_correct[
        schema_correct['type_1'] == 'date']['column_1'].values

    corr_results = []

    # for key features: only check features present in both tables
    key_features = [
        feat for feat in key_features
        if (feat in table1.columns.values) and (feat in table2.columns.values)
    ]
    if len(key_features) > 0:
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_key)(col, table1[[col]], table2[[col]], img_dir)
            for col in key_features)

        for key_result in key_results:
            if 'corr' in key_result.keys():
                corr_results.append(key_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title='key')
        _insert_numeric_results(key_results, ws, 40, img_dir)

    # for numeric features: only check features present in both tables
    numeric_features = [
        feat for feat in numeric_features
        if (feat in table1.columns.values) and (feat in table2.columns.values)
    ]
    if len(numeric_features) > 0:
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(
            delayed(_consist_numeric)(col, table1[[_key1, col]],
                                      table2[[_key2, col]], _key1, _key2,
                                      img_dir)
            for col in numeric_features)

        for numeric_result in numeric_results:
            if 'corr' in numeric_result.keys():
                corr_results.append(numeric_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title='numeric')
        _insert_numeric_results(numeric_results, ws, 45, img_dir)

    # for string features: only check features present in both tables
    string_features = [
        feat for feat in string_features
        if (feat in table1.columns.values) and (feat in table2.columns.values)
    ]
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(
            delayed(_consist_string)(col, table1[[_key1, col]],
                                     table2[[_key2, col]], _key1, _key2)
            for col in string_features)

        for string_result in string_results:
            if 'corr' in string_result.keys():
                corr_results.append(string_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title='string')
        _insert_string_results(string_results, ws, 25)

    # for date features: only check features present in both tables
    date_features = [
        feat for feat in date_features
        if (feat in table1.columns.values) and (feat in table2.columns.values)
    ]
    if len(date_features) > 0:
        # turn dates into ages in months relative to the current date
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            table1[col] = (pd.to_datetime(snapshot_date_now) - pd.to_datetime(
                table1[col], errors='coerce')).astype('timedelta64[M]',
                                                      errors='ignore')
            table2[col] = (pd.to_datetime(snapshot_date_now) - pd.to_datetime(
                table2[col], errors='coerce')).astype('timedelta64[M]',
                                                      errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_consist_numeric)(col, table1[[_key1, col]],
                                      table2[[_key2, col]], _key1, _key2,
                                      img_dir, date_flag=True)
            for col in date_features)

        for date_result in date_results:
            if 'corr' in date_result.keys():
                corr_results.append(date_result['corr'])

        # write all results to worksheet
        ws = wb.create_sheet(title='date')
        _insert_numeric_results(date_results, ws, 45, img_dir,
                                date_flag=True)

    # insert the summary
    ws = wb['Sheet']
    ws.title = 'summary'
    summary_df = schema_correct[['column_1', 'type_1']].rename(
        columns={'column_1': 'column', 'type_1': 'type'})
    corr_df = pd.DataFrame(corr_results)
    summary_df = summary_df.merge(corr_df, on='column', how='left')
    summary_df['corr'] = summary_df['corr'].fillna('error')
    summary_df['error_flg'] = summary_df['corr'].apply(
        lambda x: 1 if x == 'error' else 0)
    error_rows = summary_df[summary_df['error_flg'] == 1].index.values

    _ = _insert_df(summary_df[['column', 'type', 'corr']], ws, header=True)
    for r_idx in error_rows:
        ws['C%d' % (r_idx + 2)].style = 'Bad'
    _adjust_ws(ws=ws, row_height=25)

    # if there are schema errors, write them to a separate sheet
    if len(schema_error) > 0:
        ws = wb.create_sheet(title='error')
        _ = _insert_df(schema_error, ws, header=True)
        _adjust_ws(ws=ws, row_height=25)

    wb.save(filename=os.path.join(output_root,
                                  'data_consist_%s.xlsx' % (fname)))
    if not keep_images:
        shutil.rmtree(img_dir)
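# A minimal usage sketch for data_consist() above. The two DataFrames, the
# schema layout, and the file name are made up for illustration; each schema
# must label every column with one of the types the function understands
# ('key', 'numeric', 'str', 'date').
table_a = pd.DataFrame({'id': [1, 2, 3], 'amount': [10., 20., 30.]})
table_b = pd.DataFrame({'id': [1, 2, 3], 'amount': [10., 20., 31.]})
schema_a = pd.DataFrame({'column': ['id', 'amount'],
                         'type': ['key', 'numeric']})
schema_b = schema_a.copy()
data_consist(table_a, table_b, 'id', 'id', schema_a, schema_b,
             fname='demo', sample_size=1.0, n_jobs=2)
# -> writes data_consist_demo.xlsx with 'summary', 'key' and 'numeric' sheets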
def _tell(self, x, y, constraints=None, fit=True):
    """Perform the actual work of incorporating one or more new points.
    See `tell()` for the full description.

    This method exists to give access to the internals of adding points
    by side stepping all input validation and transformation."""
    if "ps" in self.acq_func:
        if is_2Dlistlike(x):
            self.Xi.extend(x)
            self.yi.extend(y)
            self._n_initial_points -= len(y)
        elif is_listlike(x):
            self.Xi.append(x)
            self.yi.append(y)
            self._n_initial_points -= 1
    # if y isn't a scalar it means we have been handed a batch of points
    elif is_listlike(y) and is_2Dlistlike(x):
        self.Xi.extend(x)
        self.yi.extend(y)
        if constraints is not None:
            self.constraints.extend(constraints)
        self._n_initial_points -= len(y)
    elif is_listlike(x):
        self.Xi.append(x)
        self.yi.append(y)
        if constraints is not None:
            self.constraints.append(constraints)
        self._n_initial_points -= 1
    else:
        raise ValueError("Type of arguments `x` (%s) and `y` (%s) "
                         "not compatible." % (type(x), type(y)))

    # optimizer learned something new - discard cache
    self.cache_ = {}

    # after being "told" n_initial_points we switch from sampling
    # random points to using a surrogate model
    if (fit and self._n_initial_points <= 0 and
            self.base_estimator_ is not None):
        transformed_bounds = np.array(self.space.transformed_bounds)
        est = clone(self.base_estimator_)
        if constraints is not None:
            est_c = clone(self.constraint_estimator_)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            est.fit(self.space.transform(self.Xi), self.yi)
            if constraints is not None:
                est_c.fit(self.space.transform(self.Xi), self.constraints)

        if hasattr(self, "next_xs_") and self.acq_func == "gp_hedge":
            self.gains_ -= est.predict(np.vstack(self.next_xs_))
        self.models.append(est)
        if constraints is not None:
            # store the constraint surrogate, not the objective surrogate
            self.constraint_models.append(est_c)

        # even with BFGS as optimizer we want to sample a large number
        # of points and then pick the best ones as starting points
        X = self.space.transform(self.space.rvs(
            n_samples=self.n_points, random_state=self.rng))

        if self.solution_processor is not None:
            for i in range(len(X)):
                x = self.solution_processor(X[i])
                X[i] = list(np.concatenate(x))

        self.next_xs_ = []
        for cand_acq_func in self.cand_acq_funcs_:
            if self.constraint_estimator_ is not None:
                mask = np.array(self.constraints) >= 0
                if np.any(mask):
                    y_opt = np.min(np.array(self.yi)[mask])
                    values = _gaussian_acquisition(
                        X=X, model=est, y_opt=y_opt,
                        acq_func=cand_acq_func,
                        acq_func_kwargs=self.acq_func_kwargs)
                else:
                    values = np.ones(X.shape[0])
            else:
                values = _gaussian_acquisition(
                    X=X, model=est, y_opt=np.min(self.yi),
                    acq_func=cand_acq_func,
                    acq_func_kwargs=self.acq_func_kwargs)
            if self.constraint_estimator_ is not None:
                # weight the acquisition by the probability that the
                # constraint is satisfied
                (means, stds) = est_c.predict(X, return_std=True)
                scaled = np.divide(means, stds)
                constraint_values = norm.cdf(scaled)
                values = np.multiply(values, constraint_values)

            # Find the minimum of the acquisition function by randomly
            # sampling points from the space, preferring points that have
            # not been evaluated yet
            if self.acq_optimizer == "sampling":
                order = np.argsort(values)
                for i in range(order.size):
                    next_x = X[order[i]]
                    if list(next_x) not in self.Xi:
                        break
            # Use BFGS to find the minimum of the acquisition function, the
            # minimization starts from `n_restarts_optimizer` different
            # points and the best minimum is used
            elif self.acq_optimizer == "lbfgs":
                x0 = X[np.argsort(values)[:self.n_restarts_optimizer]]

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    results = Parallel(n_jobs=self.n_jobs)(
                        delayed(fmin_l_bfgs_b)(
                            gaussian_acquisition_1D, x,
                            args=(est, np.min(self.yi), cand_acq_func,
                                  self.acq_func_kwargs),
                            bounds=self.space.transformed_bounds,
                            approx_grad=False,
                            maxiter=20)
                        for x in x0)

                cand_xs = np.array([r[0] for r in results])
                cand_acqs = np.array([r[1] for r in results])
                next_x = cand_xs[np.argmin(cand_acqs)]

            # lbfgs should handle this but just in case there are
            # precision errors.
            if not self.space.is_categorical:
                next_x = np.clip(
                    next_x, transformed_bounds[:, 0],
                    transformed_bounds[:, 1])
            self.next_xs_.append(next_x)

        if self.acq_func == "gp_hedge":
            logits = np.array(self.gains_)
            logits -= np.max(logits)
            exp_logits = np.exp(self.eta * logits)
            probs = exp_logits / np.sum(exp_logits)
            next_x = self.next_xs_[
                np.argmax(self.rng.multinomial(1, probs))]
        else:
            next_x = self.next_xs_[0]

        # note the need for [0] at the end
        self._next_x = self.space.inverse_transform(
            next_x.reshape((1, -1)))[0]

    # Pack results
    return create_result(self.Xi, self.yi, self.space, self.rng,
                         models=self.models,
                         constraints=self.constraints)
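# A small self-contained demo (not part of the optimizer above) of the
# gp_hedge selection rule used in _tell(): the running gains are turned into
# a softmax distribution with temperature eta, and one candidate per
# acquisition function is sampled in proportion to its past performance.
import numpy as np

rng = np.random.RandomState(0)
gains = np.array([0.5, 1.5, 1.0])      # one running gain per acquisition fn
eta = 1.0
logits = gains - np.max(gains)         # shift for numerical stability
probs = np.exp(eta * logits) / np.sum(np.exp(eta * logits))
choice = np.argmax(rng.multinomial(1, probs))
print(probs)    # approx. [0.186, 0.506, 0.307]
print(choice)   # index of the acquisition function to trust this round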
def run_perm_analysis(save_folder, domains='all', n_jobs=10, use_summary=False, type_of_analysis='any_anxiety', n_perm=1000, seed=None, n_jobs_rf=2, cat_encoding=None): if seed is None: seed = int(time()) target_col = ['persistance_anxiety', 'pureanxiety'] df, df_dtype, y = get_data( modality_name=domains, load_df=NESDA_FILE_MISSING, load_df_dtypes=NESDA_FILE_MISSING_DTYPE, load_df_summary=NESDA_FILE_MISSING_SUMMARY, load_df_dtypes_summary=NESDA_FILE_MISSING_SUMMARY_DTYPE, load_df_labels=NESDA_FILE_LABELS, use_summary=use_summary, target_col=target_col) y, multiclass = create_labels(y, type_of_analysis) df, cat_vars = impute_data(df, df_dtype) X, var_names = categorical_encoding(df, y, cat_vars, np.arange(df.shape[0]), method=cat_encoding) n_subj, n_features = X.shape estimator = get_classifier(n_subj, random_state=seed, n_jobs_rf=n_jobs_rf, multiclass=multiclass) estimator.fit(X, y) feat_imp_true = estimator.feature_importances_ perm_col = ['perm_{}'.format(i_perm + 1) for i_perm in range(n_perm)] df_feat_imp = pd.DataFrame(index=var_names, columns=['true_feature_importances'] + perm_col) df_feat_imp['true_feature_importances'] = feat_imp_true for i_feature in range(X.shape[1]): print('{}/{}; Feature: {}'.format(i_feature + 1, X.shape[1], var_names[i_feature])) X_perm = X.copy() res = Parallel(n_jobs=n_jobs, verbose=1, pre_dispatch='2*n_jobs', max_nbytes='50M')(delayed(permute_feature)(clone( estimator), X_perm, y, i_feature) for _ in range(n_perm)) df_feat_imp.loc[var_names[i_feature], perm_col] = res df_feat_imp.to_csv( osp.join( save_folder, 'permuted_variable_importances_domains_{}.csv'.format(domains))) np.save( osp.join( save_folder, 'permuted_variable_importances_domains_{}_seed.npy'.format( domains)), np.array([seed]))
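# permute_feature() is not shown above; a plausible sketch, assuming the
# permutation-importance scheme the loop implies: shuffle one column, refit
# the (already cloned) estimator, and return that feature's importance once
# its association with the target has been broken.
def permute_feature(estimator, X, y, i_feature, random_state=None):
    rng = np.random.RandomState(random_state)
    X_perm = X.copy()
    X_perm[:, i_feature] = rng.permutation(X_perm[:, i_feature])
    estimator.fit(X_perm, y)
    return estimator.feature_importances_[i_feature]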
def fit(self, niimgs=None, y=None, confounds=None):
    """Compute the mask and the components

    Parameters
    ----------
    niimgs: list of filenames or NiImages
        Data on which the PCA must be calculated. If this is a list,
        the affine is considered the same for all.
    """
    # Hack to support single-subject data:
    if isinstance(niimgs, (basestring, nibabel.Nifti1Image)):
        niimgs = [niimgs]
        # This is a very incomplete hack, as it won't work right for
        # single-subject list of 3D filenames
    # First, learn the mask
    if not isinstance(self.mask, MultiNiftiMasker):
        self.masker_ = MultiNiftiMasker(mask=self.mask,
                                        smoothing_fwhm=self.smoothing_fwhm,
                                        target_affine=self.target_affine,
                                        target_shape=self.target_shape,
                                        low_pass=self.low_pass,
                                        high_pass=self.high_pass,
                                        t_r=self.t_r,
                                        memory=self.memory,
                                        memory_level=self.memory_level)
    else:
        try:
            self.masker_ = clone(self.mask)
        except TypeError as e:
            # Workaround for a joblib bug: in joblib 0.6, a Memory object
            # with cachedir = None cannot be cloned.
            masker_memory = self.mask.memory
            if masker_memory.cachedir is None:
                self.mask.memory = None
                self.masker_ = clone(self.mask)
                self.mask.memory = masker_memory
                self.masker_.memory = Memory(cachedir=None)
            else:
                # The error was raised for another reason
                raise e

        for param_name in ['target_affine', 'target_shape',
                           'smoothing_fwhm', 'low_pass', 'high_pass',
                           't_r', 'memory', 'memory_level']:
            if getattr(self.masker_, param_name) is not None:
                warnings.warn('Parameter %s of the masker overriden'
                              % param_name)
            setattr(self.masker_, param_name, getattr(self, param_name))
    if self.masker_.mask is None:
        self.masker_.fit(niimgs)
    else:
        self.masker_.fit()
    self.mask_img_ = self.masker_.mask_img_

    parameters = get_params(MultiNiftiMasker, self)
    parameters['detrend'] = True
    parameters['standardize'] = True

    # Now do the subject-level signal extraction (i.e. data-loading + PCA)
    subject_pcas = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
        delayed(session_pca)(niimg, self.masker_.mask_img_, parameters,
                             n_components=self.n_components,
                             memory=self.memory,
                             ref_memory_level=self.memory_level,
                             confounds=confounds,
                             verbose=self.verbose)
        for niimg in niimgs)
    subject_pcas, subject_svd_vals = zip(*subject_pcas)

    if len(niimgs) > 1:
        if not self.do_cca:
            for subject_pca, subject_svd_val in \
                    zip(subject_pcas, subject_svd_vals):
                subject_pca *= subject_svd_val[:, np.newaxis]
        data = np.empty((len(niimgs) * self.n_components,
                         subject_pcas[0].shape[1]),
                        dtype=subject_pcas[0].dtype)
        for index, subject_pca in enumerate(subject_pcas):
            if self.n_components > subject_pca.shape[0]:
                raise ValueError('You asked for %i components. '
                                 'This is larger than the single-subject '
                                 'data size (%i).'
                                 % (self.n_components,
                                    subject_pca.shape[0]))
            data[index * self.n_components:
                 (index + 1) * self.n_components] = subject_pca
        data, variance, _ = randomized_svd(
            data.T, n_components=self.n_components)
        data = data.T
    else:
        data = subject_pcas[0]
    self.components_ = data
    return self
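# A usage sketch for the fit() above, assuming it belongs to a MultiPCA-style
# estimator (the class name `MultiPCA`, its constructor arguments, and the
# file names are assumptions here; only the fit() contract shown above is
# relied on).
pca = MultiPCA(n_components=20, smoothing_fwhm=6., do_cca=True, n_jobs=2)
pca.fit(['subject_01.nii.gz', 'subject_02.nii.gz'])
print(pca.components_.shape)   # (n_components, n_voxels)
# map the group-level components back to brain space
components_img = pca.masker_.inverse_transform(pca.components_)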
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    from collections import Sized

    estimator = self.estimator
    foldsForEstimator = {}
    cv = self.cv

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
    n_samples = _num_samples(X)
    X, y = indexable(X, y)
    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))

    # Split the data based on the provided cross-validation strategy.
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, "
                  "totalling {2} fits".format(len(cv), n_candidates,
                                              n_candidates * len(cv)))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    # Change from original scikit code: adding a new argument,
    # foldsForEstimator, to the _fit_and_score function to track metadata
    # for each estimator, for each fold.
    # _fit_and_score fits the estimator and computes the score for a given
    # data split, for given parameters.
    out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                   pre_dispatch=pre_dispatch)(
        delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                train, test, self.verbose, parameters,
                                self.fit_params, foldsForEstimator,
                                return_parameters=True,
                                error_score=self.error_score)
        for parameters in parameter_iterable
        for train, test in cv)

    # out is a list of (score, n_test_samples, scoring_time, parameters)
    # tuples, one per (parameter setting, fold) pair
    n_fits = len(out)
    n_folds = len(cv)

    # Compute the score for each fold, for all candidate parameters, and
    # store them in grid_scores.
    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            CVScoreTuple(parameters, score, np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    else:
        # If refit is False, best_estimator_ is unavailable and no further
        # predictions can be made on this instance
        raise Warning(
            "Refit has been set to False, which makes it impossible to "
            "make predictions using this GridSearchCV instance after "
            "fitting. Set refit to True to enable this.")

    # Change from original scikit code:
    # Populate a new field with the attributes needed for storing the
    # cross-validation event
    self.grid_cv_event = [X, foldsForEstimator, 0, type_of_target(y),
                          self.best_estimator_, self.best_estimator_,
                          n_folds]

    return self
def fit(self, X, y): result = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( delayed(_avgest_fit_est)(est, i, X, y, self.verbose) for i, est in enumerate(self.estimators)) self.estimators = result return self
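# _avgest_fit_est() is not shown; a plausible sketch, assuming the fit()
# above expects each worker to fit one sub-estimator of an averaging
# ensemble on the full data and return it (the verbose message format is an
# assumption). Returning the fitted estimator matters: with process-based
# joblib backends, in-place mutation in the worker would not propagate back.
def _avgest_fit_est(est, i, X, y, verbose):
    if verbose:
        print('fitting estimator %d: %s' % (i, type(est).__name__))
    return est.fit(X, y)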
feature_list = ['user_attributes']

# trainfeatures: features of user attributes, where each user attribute maps
# to a list of relevant products.
# trainlabels: list (loaded from a pickle file) of products, each a
# 0/1-encoded vector of user attributes.
trainfeatures = loadFeatures(feature_list, mode='train')
print len(trainfeatures)
#print trainfeatures
trainlabels = cPickle.load(open(trainPath + 'labels.train', 'rb'))
print len(trainlabels)
print "Loaded train features and train labels"

cv_products = KFold(len(trainlabels), n_folds=5, indices=True, shuffle=True,
                    random_state=1)
print "Set up KFold."

score = Parallel(n_jobs=-1)(
    delayed(crossValidation)(trainlabels, trainfeatures, classifier,
                             train_products, test_products, pairwise=False)
    for train_products, test_products in cv_products)
score = np.array(score)
print 'score mean, std, mean-std:', score.mean(), score.std(), \
    score.mean() - score.std()
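# crossValidation() is not defined above; a plausible sketch, assuming each
# call trains `classifier` on one fold's training products and returns a
# scalar test-fold score (the mean/std printed above implies a scalar).
# `pairwise` is accepted but ignored in this sketch.
def crossValidation(labels, features, classifier, train_idx, test_idx,
                    pairwise=False):
    X_train = [features[i] for i in train_idx]
    y_train = [labels[i] for i in train_idx]
    X_test = [features[i] for i in test_idx]
    y_test = [labels[i] for i in test_idx]
    clf = classifier.fit(X_train, y_train)
    return clf.score(X_test, y_test)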
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    im_features, np.array(image_classes), test_size=0.26, random_state=0)
clf = SVC(C=100, kernel='rbf').fit(X_train, y_train)
scores = clf.score(X_test, y_test)
print scores

# Grid search over SVM hyperparameters, one parameter setting per job
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    im_features, np.array(image_classes), test_size=0.05, random_state=0)

gamma_range = np.power(10., np.arange(-5, 5, 0.5))
C_range = np.power(10., np.arange(-5, 5))
grid_search_params = [
    {'kernel': ['rbf'], 'C': C_range, 'gamma': gamma_range},
    {'kernel': ['linear'], 'C': C_range}]
classifier = svm.SVC

grid_search_ans = Parallel(n_jobs=-1)(
    delayed(run_gridSearch)(classifier, args, X_train, y_train,
                            X_test, y_test)
    for args in list(grid_search.ParameterGrid(grid_search_params)))
best_params = list(grid_search.ParameterGrid(grid_search_params))[
    grid_search_ans.index(max(grid_search_ans))]

clf = classifier(**best_params).fit(X_train, y_train)
pred = clf.predict(X_test)
# classification_report and accuracy_score expect (y_true, y_pred)
print metrics.classification_report(y_test, pred)
print 'accuracy: ', metrics.accuracy_score(y_test, pred)

# Save the SVM
#joblib.dump((clf, training_names, stdSlr, k, voc), "surf_fm_trained.pkl", compress=3)
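# run_gridSearch() is not defined above; a plausible sketch, assuming each
# call fits the classifier with one parameter setting and returns its
# held-out accuracy (which is what the argmax over grid_search_ans implies).
def run_gridSearch(classifier, params, X_train, y_train, X_test, y_test):
    clf = classifier(**params).fit(X_train, y_train)
    return clf.score(X_test, y_test)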