def _delete_orientation(self): """ Delete orientation metadata. Garbage orientation metadata can lead to severe mis-registration trouble. """ # prepare for smart caching if self.scratch is None: self.scratch = self.output_dir if self.caching: cache_dir = os.path.join(self.scratch, 'cache_dir') if not os.path.exists(cache_dir): os.makedirs(cache_dir) mem = Memory(cachedir=cache_dir, verbose=5) else: mem = Memory(None, verbose=0) # deleteorient for func for attr in ['n_sessions', 'session_output_dirs']: if getattr(self, attr) is None: warnings.warn("'%s' attribute of is None! Skipping" % attr) break else: self.func = [mem.cache(delete_orientation)( self.func[sess], self.session_output_dirs[sess]) for sess in range(self.n_sessions)] # deleteorient for anat if self.anat is not None: self.anat = mem.cache(delete_orientation)( self.anat, self.anat_output_dir)
def set_option(physical_cache_path: str = None) -> None:
    """
    Set global options for the package.

    :param physical_cache_path: Caching across Python interpreter sessions can
        save a lot of time. This option enables on-disk caching when a path is
        specified. Passing an empty string ("") switches disk caching off
        explicitly; use "." for the current working directory.
        Kaggle kernels "seemed to like" on-disk caching as long as I didn't
        try to commit the notebook; then things ended up with a "Code: 0"
        error failing the publishing attempt. The most convenient workflow
        there may be to comment out a set_option("") while experimenting, and
        to uncomment it just before committing the kernel.
        Leaving it at the default of None leaves the options unchanged.
        (There's likely more to come.)
    :return: None
    """
    if physical_cache_path is not None:
        global _mem
        if physical_cache_path != "":
            _mem = Memory(physical_cache_path, verbose=0)
            digit_correlations._cached_get_digit_correlation_data = \
                _mem.cache(digit_correlations._uncached_get_digit_correlation_data)
            digit_entropy_distribution.cached_generate_sample = \
                _mem.cache(digit_entropy_distribution._uncached_generate_sample)
        else:
            _mem = None
            digit_correlations.cached = \
                digit_correlations._lru_cached_get_digit_correlation_data
            digit_entropy_distribution.cached_generate_sample = \
                digit_entropy_distribution._lru_cached_generate_sample
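# A minimal usage sketch for set_option (the module name "benford_pkg" is a
# placeholder; the snippet above does not show the real package name):
import benford_pkg

benford_pkg.set_option(physical_cache_path="./benford_cache")  # cache on disk
# ... run analyses; repeated calls are now served from ./benford_cache ...
benford_pkg.set_option(physical_cache_path="")  # switch disk caching back off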
def load_adni_longitudinal_rs_fmri(dirname='ADNI_longitudinal_rs_fmri',
                                   prefix='wr*.nii'):
    """ Returns paths of ADNI rs-fMRI """
    # get file paths and description
    images, subject_paths, description = _get_subjects_and_description(
        base_dir=dirname, prefix='I[0-9]*')
    images = np.array(images)

    # get func files
    func_files = list(map(lambda x: _glob_subject_img(
        x, suffix='func/' + prefix, first_img=True), subject_paths))
    func_files = np.array(func_files)

    # get motion files
    # motions = None
    motions = list(map(lambda x: _glob_subject_img(
        x, suffix='func/' + 'rp_*.txt', first_img=True), subject_paths))

    # get phenotype from csv
    dx = pd.read_csv(os.path.join(_get_data_base_dir('ADNI_csv'),
                                  'DXSUM_PDXCONV_ADNIALL.csv'))
    roster = pd.read_csv(os.path.join(_get_data_base_dir('ADNI_csv'),
                                      'ROSTER.csv'))
    df = description[description['Image_ID'].isin(images)]
    df = df.sort_values(by='Image_ID')
    dx_group = np.array(df['DX_Group'])
    subjects = np.array(df['Subject_ID'])
    exams = np.array(df['EXAM_DATE'])
    exams = [date(int(e[:4]), int(e[5:7]), int(e[8:])) for e in exams]

    # caching dataframe extraction functions
    CACHE_DIR = _get_cache_base_dir()
    cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    memory = Memory(cachedir=cache_dir, verbose=0)

    def _get_ridsfmri(subjects):
        return [_ptid_to_rid(s, roster) for s in subjects]
    rids = np.array(memory.cache(_get_ridsfmri)(subjects))

    def _get_examdatesfmri(rids):
        return [_get_dx(rids[i], dx, exams[i], viscode=None, return_code=True)
                for i in range(len(rids))]
    exam_dates = np.array(memory.cache(_get_examdatesfmri)(rids))

    def _get_viscodesfmri(rids):
        return [_get_vcodes(rids[i], str(exam_dates[i]), dx)
                for i in range(len(rids))]
    viscodes = np.array(memory.cache(_get_viscodesfmri)(rids))
    vcodes, vcodes2 = viscodes[:, 0], viscodes[:, 1]

    return Bunch(func=func_files, dx_group=dx_group, exam_codes=vcodes,
                 exam_dates=exam_dates, exam_codes2=vcodes2,
                 motion=motions, subjects=subjects, images=images)
def main():
    idir = 'data/tweet_sent'
    location = './cache'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    bert_name = 'bert-base-uncased'

    memory = Memory(location, verbose=0)
    train_dataloader, validation_dataloader = \
        memory.cache(prepare_input_data)(idir, bert_name)
    model = memory.cache(train_model)(train_dataloader, bert_name, device)
    model.eval()
    estimate_model(model, validation_dataloader)
def _niigz2nii(self): """ Convert .nii.gz to .nii (crucial for SPM). """ cache_dir = os.path.join(self.output_dir, 'cache_dir') mem = Memory(cache_dir, verbose=100) self.func = mem.cache(do_niigz2nii)(self.func, output_dir=self.output_dir) if not self.anat is None: self.anat = mem.cache(do_niigz2nii)(self.anat, output_dir=self.output_dir)
def load_adni_longitudinal_hippocampus_volume():
    """ Returns longitudinal hippocampus measures """
    BASE_DIR = _get_data_base_dir('ADNI_csv')

    roster = pd.read_csv(os.path.join(BASE_DIR, 'ROSTER.csv'))
    dx = pd.read_csv(os.path.join(BASE_DIR, 'DXSUM_PDXCONV_ADNIALL.csv'))
    fs = pd.read_csv(os.path.join(BASE_DIR, 'UCSFFSX51_05_20_15.csv'))

    # extract hippocampus numerical values
    column_idx = np.arange(131, 147)
    cols = ['ST' + str(c) + 'HS' for c in column_idx]
    hipp = fs[cols].values
    idx_num = np.array([~np.isnan(h).all() for h in hipp])
    hipp = hipp[idx_num, :]

    # extract roster id
    rids = fs['RID'].values[idx_num]

    # caching dataframe extraction functions
    CACHE_DIR = _get_cache_base_dir()
    cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    memory = Memory(cachedir=cache_dir, verbose=0)

    # get subject id
    def _getptidshippo(rids):
        return [_rid_to_ptid(rid, roster) for rid in rids]
    ptids = memory.cache(_getptidshippo)(rids)

    # extract exam date
    exams = fs['EXAMDATE'].values[idx_num]
    vcodes = fs['VISCODE'].values[idx_num]
    vcodes2 = fs['VISCODE2'].values[idx_num]
    exams = list(map(
        lambda e: date(int(e[:4]), int(e[5:7]), int(e[8:])), exams))
    exams = np.array(exams)

    # extract diagnosis
    def _getdxhippo(rids, exams):
        return np.array(list(map(_get_dx, rids, [dx] * len(rids), exams)))
    dx_ind = memory.cache(_getdxhippo)(rids, exams)
    dx_group = DX_LIST[dx_ind]

    return Bunch(dx_group=np.array(dx_group), subjects=np.array(ptids),
                 hipp=np.array(hipp), exam_dates=np.array(exams),
                 exam_codes=np.array(vcodes), exam_codes2=np.array(vcodes2))
def test_multioutput(self):
    cache = Memory(location=tempfile.gettempdir())
    cached_func = cache.cache(sklearn.datasets.make_regression)
    X, Y = cached_func(
        n_samples=250,
        n_features=20,
        n_informative=9,
        n_targets=4,
        bias=0.5,
        effective_rank=10,
        tail_strength=0.4,
        noise=0.3,
        shuffle=True,
        coef=False,
        random_state=1
    )
    X_train = X[:200, :]
    Y_train = Y[:200, :]
    X_test = X[200:, :]
    Y_test = Y[200:, :]

    data = {'X_train': X_train, 'Y_train': Y_train,
            'X_test': X_test, 'Y_test': Y_test}

    dataset_properties = {'multioutput': True}
    cs = SimpleRegressionPipeline(dataset_properties=dataset_properties).\
        get_hyperparameter_search_space()
    self._test_configurations(cs, data=data,
                              dataset_properties=dataset_properties)
def __init__(self, gmm_ubm, feature=None, cache=False):

    super(SpeakerIdentification, self).__init__()
    self.gmm_ubm = gmm_ubm

    # default features for speaker identification are MFCC
    # 13 coefs + delta coefs  + delta delta coefs
    #          + delta energy + delta delta energy
    if feature is None:
        from pyannote.feature.yaafe import YaafeMFCC
        feature = YaafeMFCC(
            e=False, De=True, DDe=True,
            coefs=13, D=True, DD=True
        )
    self.feature = feature

    if cache:
        # initialize cache
        from joblib import Memory
        from tempfile import mkdtemp
        memory = Memory(cachedir=mkdtemp(), verbose=0)

        # cache feature extraction method
        self.get_features = memory.cache(self.get_features)
def pipeline(input_csv, output_csv, n_cores=1, cache="/tmp"):
    """
    Find the first commit hash in which each identifier appears in a file.

    :param input_csv: Path to the input csv.
    :param output_csv: Path to store the result csv.
    :param n_cores: How many cores to use.
    :param cache: Cache location. If empty, no caching is done.
    """
    if cache:
        memory = Memory(cache, verbose=0)
        parallel_comp = memory.cache(func=_parallel_comp)
    else:
        parallel_comp = _parallel_comp

    df = pd.read_csv(input_csv, header=None)
    df.columns = COLUMNS
    args = [
        tuple(getattr(line, col) for col in COLUMNS)
        for i, line in df.iterrows()
    ]
    res = Parallel(n_jobs=n_cores)(tqdm(
        (delayed(parallel_comp)(arg) for arg in args), total=len(args) - 1))
    new_args = [arg + (h, filename) for arg, (h, filename) in zip(args, res)]

    to_df = defaultdict(list)
    for arg in new_args:
        for col in NEW_COLUMNS:
            to_df[col].append(arg[NEW_COL2IND[col]])
    new_df = pd.DataFrame.from_dict(to_df)
    new_df = new_df[NEW_COLUMNS]
    new_df.to_csv(output_csv, index=False, header=False, compression="gzip")
def parse_all(basedir, mem: Memory = None, with_paths=False, from_cache=False):
    _parse = parse if mem is None else mem.cache(parse, ignore=['from_cache'])
    for f in Path(basedir).rglob("*.html"):
        text = _parse(f, from_cache)
        if text is not None:
            yield (f, text) if with_paths else text
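# A minimal usage sketch for parse_all (assumes a local "pages/" directory of
# HTML files and that parse() lives in the same module; both are assumptions):
from joblib import Memory

mem = Memory("./parse_cache", verbose=0)
for path, text in parse_all("pages", mem=mem, with_paths=True):
    print(path, len(text))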
def test_multilabel(self):
    cache = Memory(location=tempfile.gettempdir())
    cached_func = cache.cache(
        sklearn.datasets.make_multilabel_classification
    )
    X, Y = cached_func(
        n_samples=150,
        n_features=20,
        n_classes=5,
        n_labels=2,
        length=50,
        allow_unlabeled=True,
        sparse=False,
        return_indicator=True,
        return_distributions=False,
        random_state=1
    )
    X_train = X[:100, :]
    Y_train = Y[:100, :]
    X_test = X[101:, :]
    Y_test = Y[101:, ]

    data = {'X_train': X_train, 'Y_train': Y_train,
            'X_test': X_test, 'Y_test': Y_test}
    dataset_properties = {'multilabel': True}
    cs = SimpleClassificationPipeline(dataset_properties=dataset_properties).\
        get_hyperparameter_search_space()
    self._test_configurations(configurations_space=cs, data=data)
def test_store_standard_types(capsys, tmpdir, compress, arg):
    """Test that standard types can be cached in s3fs store."""
    def func(arg):
        """Dummy function."""
        print("executing function")
        return arg

    register_s3fs_store_backend()

    mem = Memory(location=tmpdir.strpath, backend='s3', verbose=0,
                 compress=compress, backend_options=dict(bucket="test"))

    assert mem.store_backend.location == os.path.join("s3:/",
                                                      tmpdir.strpath,
                                                      "joblib")

    cached_func = mem.cache(func)
    result = cached_func(arg)
    assert result == arg

    out, err = capsys.readouterr()
    assert out == "executing function\n"
    assert not err

    # Second call should also return the cached result
    result2 = cached_func(arg)
    assert result2 == arg

    out, err = capsys.readouterr()
    assert not out
    assert not err
def test_clear_cache(capsys, tmpdir):
    """Check clearing the cache."""
    def func(arg):
        """Dummy function."""
        print("executing function")
        return arg

    register_s3fs_store_backend()

    mem = Memory(location=tmpdir.strpath, backend='s3', verbose=0,
                 backend_options=dict(bucket="test"))
    cached_func = mem.cache(func)
    cached_func("test")

    out, _ = capsys.readouterr()
    assert out == "executing function\n"

    mem.clear()
    cached_func("test")
    out, _ = capsys.readouterr()
    assert out == "executing function\n"

    mem.clear()
    print(mem.store_backend.location)
    assert not os.listdir(mem.store_backend.location)
def cacheSetup():
    global rollDiceCached
    global memory
    location = './.cache'
    memory = Memory(location, verbose=0)
    rng = np.random.RandomState(42)
    rollDiceCached = memory.cache(rollDice)
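# A minimal sketch of how the cached roll might be used (the rollDice
# signature with a single argument is an assumption, not shown above):
cacheSetup()
first = rollDiceCached(10)   # computed once and written to ./.cache
second = rollDiceCached(10)  # same argument -> read back from the cache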
def extract_group_components(subject_components, variances,
                             ccs_threshold=None, n_group_components=None,
                             cachedir=None):
    # Use asarray to cast to a non memmapped array
    subject_components = np.asarray(subject_components)
    if len(subject_components) == 1:
        # We are in a single subject case
        return subject_components[0, :n_group_components].T, \
            variances[0][:n_group_components]

    # The group components (concatenated subject components)
    group_components = subject_components.T
    group_components = np.reshape(group_components,
                                  (group_components.shape[0], -1))
    # Save memory
    del subject_components

    # Inter-subject CCA
    memory = Memory(cachedir=cachedir, mmap_mode='r')
    svd = memory.cache(linalg.svd)
    cca_maps, ccs, _ = svd(group_components, full_matrices=False)
    # Save memory
    del group_components

    if n_group_components is None:
        n_group_components = np.argmin(ccs > ccs_threshold)
    cca_maps = cca_maps[:, :n_group_components]
    ccs = ccs[:n_group_components]
    return cca_maps, ccs
def test_get_cache_items(tmpdir):
    """Test cache items listing."""
    def func(arg):
        """Dummy function."""
        return arg

    register_hdfs_store_backend()

    mem = Memory(location=tmpdir.strpath, host=__namenode__, backend='hdfs',
                 user='******', verbose=100, compress=False)
    assert not mem.store.get_cache_items()

    cached_func = mem.cache(func)
    for arg in ["test1", "test2", "test3"]:
        cached_func(arg)

    # the cache now holds one item per distinct argument
    assert len(mem.store.get_cache_items()) == 3

    mem.clear()
    assert not mem.store.get_cache_items()
def __init__(self, hmm=None, n_components=16, covariance_type='diag',
             min_duration=0.250, feature=None, cache=False):

    super(SpeechActivityDetection, self).__init__()

    self.hmm = hmm
    self.hmm.min_duration = min_duration

    # default features for speech activity detection
    # are MFCC (12 coefficients + delta coefficient + delta energy)
    if feature is None:
        from pyannote.feature.yaafe import YaafeMFCC
        feature = YaafeMFCC(e=False, coefs=12, De=True, D=True)
    self.feature = feature

    if cache:
        # initialize cache
        from joblib import Memory
        from tempfile import mkdtemp
        memory = Memory(cachedir=mkdtemp(), verbose=0)

        # cache feature extraction method
        self.get_features = memory.cache(self.get_features)
def parallel_func(fcn, n_jobs=-1, verbose=None, total=None, mesg=None,
                  cache_dir=None, **kwargs):
    """Get an instance of parallel and delayed function.

    This function is inspired by MNE's one.

    Parameters
    ----------
    fcn : callable
        A function.
    n_jobs : int
        Number of jobs to run in parallel.
    total : int | None
        If int, use a progress bar to display the progress of dispatched
        jobs. This should only be used when directly iterating, not when
        using ``split_list`` or :func:`np.array_split`. If None (default),
        do not add a progress bar.
    mesg : string | None
        Message to display on the progress bar.
    cache_dir : string | None
        If this is the path to an existing directory, the function's
        computations are cached there.
    kwargs : dict | {}
        Additional arguments are sent to the joblib.Parallel function.

    Returns
    -------
    parallel : instance of joblib.Parallel or list
        The parallel object.
    my_func : callable
        ``fcn`` if not run in parallel, otherwise ``delayed(fcn)``.
    """
    from frites.config import CONFIG
    # set_log_level(verbose)

    # manually merge inputs inside the default config
    for k, v in CONFIG["JOBLIB_CFG"].copy().items():
        kwargs[k] = v
    # verbosity level of joblib
    kwargs['verbose'] = 1 if verbose in ['debug', True] else 0

    # caching option
    if isinstance(cache_dir, str) and os.path.isdir(cache_dir):
        logger.info(f'Caching computations to {cache_dir}')
        memory = Memory(cache_dir, verbose=kwargs['verbose'])
        fcn = memory.cache(fcn)

    # parallel functions
    para_fcn = delayed(fcn)
    parallel = Parallel(n_jobs, **kwargs)

    if total is not None:
        def parallel_progress(op_iter):
            return parallel(ProgressBar(iterable=op_iter, max_value=total,
                                        mesg=mesg))
        parallel_out = parallel_progress
    else:
        parallel_out = parallel

    return parallel_out, para_fcn
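# A minimal usage sketch for parallel_func (square() and ./frites_cache are
# illustrative assumptions, not part of the function above):
import os

os.makedirs('./frites_cache', exist_ok=True)

def square(x):
    return x * x

parallel, p_fun = parallel_func(square, n_jobs=2, cache_dir='./frites_cache')
results = parallel(p_fun(x) for x in range(10))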
def run_dmri_pipeline(subject_session, do_topup=True, do_edc=True):
    subject, session = subject_session
    data_dir = os.path.join(source_dir, subject, session, 'dwi')
    write_dir = os.path.join(derivatives_dir, subject, session)
    dwi_dir = os.path.join(write_dir, 'dwi')

    # Apply topup to the images
    input_imgs = sorted(glob.glob('%s/sub*.nii.gz' % data_dir))
    dc_imgs = sorted(glob.glob(os.path.join(dwi_dir, 'dcsub*run*.nii.gz')))
    mem = Memory('/neurospin/tmp/bthirion/cache_dir')
    if len(dc_imgs) < len(input_imgs):
        se_maps = [
            os.path.join(source_dir, subject, session, 'fmap',
                         '%s_%s_dir-ap_epi.nii.gz' % (subject, session)),
            os.path.join(source_dir, subject, session, 'fmap',
                         '%s_%s_dir-pa_epi.nii.gz' % (subject, session))]

        if do_topup:
            fsl_topup(se_maps, input_imgs, mem, write_dir, 'dwi')

    # Then proceed with Eddy current correction
    # get the images
    dc_imgs = sorted(glob.glob(os.path.join(dwi_dir, 'dc*run*.nii.gz')))
    dc_img = os.path.join(dwi_dir, 'dc%s_%s_dwi.nii.gz' % (subject, session))
    concat_images(dc_imgs, dc_img)

    # get the bvals/bvec
    file_bvals = sorted(glob.glob('%s/sub*.bval' % data_dir))
    bvals = np.concatenate([np.loadtxt(fbval) for fbval in sorted(file_bvals)])
    bvals_file = os.path.join(dwi_dir, 'dc%s_%s_dwi.bval' % (subject, session))
    np.savetxt(bvals_file, bvals)
    file_bvecs = sorted(glob.glob('%s/sub*.bvec' % data_dir))
    bvecs = np.hstack([np.loadtxt(fbvec) for fbvec in sorted(file_bvecs)])
    bvecs_file = os.path.join(dwi_dir, 'dc%s_%s_dwi.bvec' % (subject, session))
    np.savetxt(bvecs_file, bvecs)

    # Get eddy-preprocessed images
    # eddy_img = nib.load(glob.glob(os.path.join(dwi_dir, 'eddc*.nii*'))[-1])
    eddy_img = mem.cache(eddy_current_correction)(
        dc_img, bvals_file, bvecs_file, dwi_dir, mem)

    # load the data
    gtab = gradient_table(bvals, bvecs, b0_threshold=10)
    # Create a brain mask
    from dipy.segment.mask import median_otsu
    b0_mask, mask = median_otsu(eddy_img.get_data(), 2, 1)
    if subject == 'sub-13':
        from nilearn.masking import compute_epi_mask
        from nilearn.image import index_img
        imgs_ = [index_img(eddy_img, i) for i in range(len(bvals))
                 if bvals[i] < 50]
        mask_img = compute_epi_mask(imgs_, upper_cutoff=.8)
        mask_img.to_filename('/tmp/mask.nii.gz')
        mask = mask_img.get_data()

    # do the tractography
    streamlines = tractography(eddy_img, gtab, mask, dwi_dir)
    return streamlines
def main():
    np.random.seed(10)

    n = 100
    m = 100
    sampling = 0.1
    eps = 1

    x_sampler = Sampler(mean=np.array([[1.], [2], [3]]),
                        cov=np.array([[[.1]], [[.1]], [[.1]]]),
                        p=np.ones(3) / 3)
    y_sampler = Sampler(mean=np.array([[0.], [3], [5]]),
                        cov=np.array([[[.1]], [[.1]], [[.4]]]),
                        p=np.ones(3) / 3)

    x = x_sampler(n)
    y = y_sampler(m)

    mem = Memory(expanduser('~/cache'), verbose=0)
    fref, gref, refrecords = mem.cache(sinkhorn)(x, y, n_iter=1000,
                                                 sampling=1., eps=eps)
    for pin_potential in [False]:
        for avg_step_size in ['1/sqrt(t)']:
            f, g, records = mem.cache(sinkhorn)(
                x, y, n_iter=int(1e6), record_every=1000,
                step_size='1/sqrt(t)', avg_step_size=avg_step_size,
                sampling=sampling, pin_potential=pin_potential, eps=eps)
            norm = var_norm(refrecords['f'][-1][None, :] - records['f'],
                            axis=1)
            plt.plot(range(len(norm)), norm,
                     label=f'pin_pot{pin_potential} avg{avg_step_size}')
    plt.xscale('log')
    # plt.yscale('log')
    plt.legend()
    plt.show()
def multiprocess_with_cache(inputs):
    """
    Run the computation over several processes while sharing a single
    memory-mapped on-disk cache.
    """
    location = "./cachedir"
    memory = Memory(location, verbose=0)
    f_memoized = memory.cache(f)
    return Parallel(n_jobs=-1)(delayed(f_memoized)(x) for x in inputs)
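# A minimal, self-contained sketch of the same pattern (f here is a stand-in
# for the real, expensive function defined elsewhere in that module):
from joblib import Memory, Parallel, delayed

memory = Memory("./cachedir", verbose=0)

@memory.cache
def f(x):
    return x ** 2  # placeholder for the expensive computation

results = Parallel(n_jobs=2)(delayed(f)(x) for x in range(8))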
def run(aspect, word2vec, trained_model, gpu, out, test, batchsize,
        sparsity_coef, coherent_coef, dependent, order):
    """
    Train "Rationalizing Neural Predictions" for one specified aspect.

    Please refer to README.md for details.
    """
    memory = Memory(cachedir='.', verbose=1)
    if os.path.splitext(test)[-1] == '.json':
        w2v, vocab, _, _, test_dataset = \
            memory.cache(prepare_data)(None, word2vec, aspect, annotation=test)
    elif os.path.splitext(test)[-1] == '.gz':
        w2v, vocab, test_dataset, _, _ = \
            memory.cache(prepare_data)(test, word2vec, aspect)
    else:
        raise ValueError(
            "Input 'test' must be either a json file or a gzipped text file"
            " with the appropriate extension."
        )

    encoder = rationale.models.Encoder(
        w2v.shape[1], order, 200, 2, dropout=0.1
    )
    generator_cls = (rationale.models.GeneratorDependent
                     if dependent else rationale.models.Generator)
    # Original impl. uses two layers to model bi-directional LSTM
    generator = generator_cls(w2v.shape[1], order, 200, dropout=0.1)
    model = rationale.models.RationalizedRegressor(
        generator, encoder, w2v.shape[0], w2v.shape[1], initialEmb=w2v,
        dropout_emb=0.1,
        sparsity_coef=sparsity_coef, coherent_coef=coherent_coef
    )
    # Resume from a snapshot
    chainer.serializers.load_npz(trained_model, model)

    if gpu >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    inv_vocab = {v: k for k, v in vocab.items()}
    results = rationale.prediction.test(model, test_dataset, inv_vocab,
                                        device=gpu, batchsize=batchsize)
    with open(os.path.join(out, 'output.json'), 'w') as fout:
        json.dump(results, fout)
def load_adni_longitudinal_csf_biomarker():
    """ Returns longitudinal csf measures """
    BASE_DIR = _get_data_base_dir('ADNI_csv')

    roster = pd.read_csv(os.path.join(BASE_DIR, 'ROSTER.csv'))
    dx = pd.read_csv(os.path.join(BASE_DIR, 'DXSUM_PDXCONV_ADNIALL.csv'))
    csf_files = ['UPENNBIOMK.csv', 'UPENNBIOMK2.csv', 'UPENNBIOMK3.csv',
                 'UPENNBIOMK4_09_06_12.csv', 'UPENNBIOMK5_10_31_13.csv',
                 'UPENNBIOMK6_07_02_13.csv', 'UPENNBIOMK7.csv',
                 'UPENNBIOMK8.csv']
    cols = ['RID', 'VISCODE', 'ABETA', 'PTAU', 'TAU']
    # 3,4,5,7,8
    csf = pd.DataFrame()
    for csf_file in csf_files[2:]:
        fs = pd.read_csv(os.path.join(BASE_DIR, csf_file))
        csf = csf.append(fs[cols])

    # remove nans from csf values
    biom = csf[cols[2:]].values
    idx = np.array([~np.isnan(v).any() for v in biom])
    biom = biom[idx]
    # get phenotype
    vcodes = csf['VISCODE'].values[idx]
    rids = csf['RID'].values[idx]

    # caching dataframe extraction functions
    CACHE_DIR = _get_cache_base_dir()
    cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    memory = Memory(cachedir=cache_dir, verbose=0)

    def _getptidscsf(rids):
        return list(map(lambda x: _rid_to_ptid(x, roster), rids))
    ptids = memory.cache(_getptidscsf)(rids)

    # get diagnosis
    def _getdxcsf(rids, vcodes):
        return list(map(lambda x, y: DX_LIST[_get_dx(x, dx, viscode=y)],
                        rids, vcodes))
    dx_group = memory.cache(_getdxcsf)(rids, vcodes)

    return Bunch(dx_group=np.array(dx_group), subjects=np.array(ptids),
                 csf=np.array(biom), exam_codes=np.array(vcodes),
                 exam_codes2=np.array(vcodes))
def construct_and_attach_filename_data(self):
    synsets = self.synset_list
    num_per_synset = self.data['num_per_synset']
    seed = self.data['seed']
    folder = self.local_home('PrecomputedDicts')
    mem = Memory(folder)
    compute_filename_dict = mem.cache(self.compute_filename_dict)
    filenames, filenames_dict = compute_filename_dict(synsets,
                                                      num_per_synset, seed)
    self.filenames_dict = filenames_dict
def __init__(self, data_source='yahoo'):
    '''
    Creates the object and sets up caching.

    :param data_source: A data source such as `yahoo` or `google`.
    '''
    self.data_source = data_source

    # caching
    memory = Memory(cachedir='.')
    self.get = memory.cache(self.get)
def get_multilabel(self):
    cache = Memory(location=tempfile.gettempdir())
    cached_func = cache.cache(make_multilabel_classification)
    return cached_func(n_samples=100, n_features=10, n_classes=5, n_labels=5,
                       return_indicator=True, random_state=1)
def add_caching_to_funcs(obj, funcNames):
    mem = Memory('../.add_caching_to_funcs', verbose=11)
    if obj is None or funcNames is None:
        return
    if isScalar(funcNames):
        funcNames = [funcNames]
    for name in funcNames:
        func = getattr(obj, name, None)
        if func is not None:
            setattr(obj, name, mem.cache(func))
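# A minimal usage sketch for add_caching_to_funcs (the Loader class and its
# load_all method are illustrative assumptions, not from the snippet above):
class Loader(object):
    def load_all(self):
        return list(range(1000))  # stand-in for an expensive load

loader = Loader()
add_caching_to_funcs(loader, 'load_all')
data = loader.load_all()  # first call computes and caches; later calls reuse it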
def _forward(self, im, indices):
    memory = Memory(location=self.memory, verbose=0)
    _apply_transform_cached = memory.cache(_apply_xform)

    logger.info('Applying forward transformations in pipeline')
    for xform in self.xforms:
        im = _apply_transform_cached(xform, im, indices, False)
    logger.info('All forward transformations applied')
    return im
def __init__(self):
    self.name = self.__class__.__name__

    try:
        from joblib import Memory
        mem = Memory(cachedir=self.home('cache'), verbose=False)
        self._get_meta = mem.cache(self._get_meta)
    except ImportError:
        pass
def load_adni_longitudinal_mmse_score():
    """ Returns longitudinal mmse scores """
    BASE_DIR = _get_data_base_dir('ADNI_csv')
    roster = pd.read_csv(os.path.join(BASE_DIR, 'ROSTER.csv'))
    dx = pd.read_csv(os.path.join(BASE_DIR, 'DXSUM_PDXCONV_ADNIALL.csv'))
    fs = pd.read_csv(os.path.join(BASE_DIR, 'MMSE.csv'))

    # extract nans free mmse
    mmse = fs['MMSCORE'].values
    idx_num = fs['MMSCORE'].notnull().values
    mmse = mmse[idx_num]

    # extract roster id
    rids = fs['RID'].values[idx_num]

    # caching dataframe extraction functions
    CACHE_DIR = _get_cache_base_dir()
    cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    memory = Memory(cachedir=cache_dir, verbose=0)

    def _getptidsmmse(rids):
        return [_rid_to_ptid(rid, roster) for rid in rids]

    # get subject id
    ptids = memory.cache(_getptidsmmse)(rids)
    # extract visit code (don't use EXAMDATE ; null for GO/2)
    vcodes = fs['VISCODE'].values
    vcodes = vcodes[idx_num]
    vcodes2 = fs['VISCODE2'].values
    vcodes2 = vcodes2[idx_num]

    def _getdxmmse(rids, vcodes2):
        return list(map(
            lambda x, y: DX_LIST[_get_dx(x, dx, viscode=y)], rids, vcodes2))

    # get diagnosis
    dx_group = memory.cache(_getdxmmse)(rids, vcodes2)

    return Bunch(dx_group=np.array(dx_group), subjects=np.array(ptids),
                 mmse=mmse, exam_codes=vcodes, exam_codes2=vcodes2)
def _adjoint(self, im, indices):
    memory = Memory(location=self.memory, verbose=0)
    _apply_transform_cached = memory.cache(_apply_xform)

    logger.info('Applying adjoint transformations in pipeline')
    for xform in self.xforms[::-1]:
        im = _apply_transform_cached(xform, im, indices, True)
    logger.info('All adjoint transformations applied')
    return im
def _run_suject_level1_glm(subject_data_dir, subject_output_dir, **kwargs):
    """
    Just another wrapper.

    """
    mem = Memory(os.path.join(subject_output_dir, "cache_dir"))
    return mem.cache(run_suject_level1_glm)(subject_data_dir,
                                            subject_output_dir,
                                            **kwargs)
def __init__(self, meta=None):
    if meta is not None:
        self._meta = meta

    self.name = self.__class__.__name__

    try:
        from joblib import Memory
        mem = Memory(cachedir=self.home('cache'))
        self._get_meta = mem.cache(self._get_meta)
    except ImportError:
        pass
def load_features_cached(cache_dir, participant_label, feature_base_dir,
                         clinical_feature_file, modality='clinical'):
    memory = Memory(cache_dir / "load", verbose=0)

    allowed_modalities = [
        "clinical", "structural", "structGlobScort", "functional", "fullcon"
    ]
    modality_parts = modality.split("+")
    for m in modality_parts:
        if m not in allowed_modalities:
            raise ValueError(m, modality)

    features, feature_files = memory.cache(stack_subjects_features)(
        participant_label, feature_base_dir, clinical_feature_file, modality)
    stacked_features = memory.cache(concat_modality_features)(features,
                                                              modality)
    return stacked_features
def single(n_players, n_actions, n_matrices, _seed, conditioning, skewness,
           l1_penalty, gaussian_noise, stochastic_noise, _run):
    mem = Memory(location=expanduser('~/cache'))
    H = mem.cache(make_positive_matrices)(n_players, n_actions, n_matrices,
                                          conditioning, skewness,
                                          stochastic_noise, _seed)
    game = MatrixGame(H, l1_penalty=l1_penalty, gaussian_noise=gaussian_noise)
    values, policies = compute_nash(game)
    _run.info['policies'] = policies.tolist()
    _run.info['values'] = values.tolist()
def getagreement(tpl, datadir, task_type='all'):
    """Get agreement values for the annotators in the ``tpl`` list.

    Args:
        tpl (list): combination group of annotators
        datadir (str): cache data directory used by joblib

    Returns:
        namedtuple defined as
        ``Agree = collections.namedtuple('Agree', ['kappa', 'alpha', 'avg_ao'], verbose=True)``
    """
    mem = Memory(cachedir=datadir)
    readjson = mem.cache(json2taskdata.readjson, mmap_mode='r')
    create_task_data = mem.cache(json2taskdata.create_task_data)
    count_occurrances = mem.cache(json2taskdata.count_occurrances)
    count_labels = mem.cache(json2taskdata.count_labels)

    annotators = set()
    lectask = []
    # -------------------------------------------------------------------------
    # for each annotator in group tpl
    # -------------------------------------------------------------------------
    for stditem in tpl:
        aname = stditem.split('.')[0][3:][-2:]
        annotators.add(aname)
        lecdict = readjson(stditem)
        newlectask = create_task_data(lecdict, task_type=task_type,
                                      annotator=aname)
        label_data = json2taskdata.create_labels_list(newlectask)
        abscount = count_occurrances(str(label_data))
        yaml.dump(abscount, open(os.path.join(
            datadir, 'abscount-' + aname + '.yaml'), 'w'))

        setcount = count_labels(newlectask)
        yaml.dump(setcount, open(os.path.join(
            datadir, 'setcount-' + aname + '.yaml'), 'w'))

        lectask = lectask + newlectask

    task = AnnotationTask(data=lectask,
                          distance=nltk.metrics.distance.masi_distance_mod)

    return {frozenset(annotators): Agree(task.kappa(), task.alpha(),
                                         task.avg_Ao())}
def make_dictionary(X, n_components=20, alpha=5., write_dir='/tmp/',
                    contrasts=[], method='multitask', l1_ratio=.5,
                    n_subjects=13):
    """Create dictionary + encoding"""
    from sklearn.decomposition import dict_learning_online, sparse_encode
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import MultiTaskLasso, MultiTaskElasticNet

    mem = Memory(write_dir, verbose=0)
    dictionary = mem.cache(initial_dictionary)(n_components, X)
    np.savez(os.path.join(write_dir, 'dictionary.npz'),
             loadings=dictionary, contrasts=contrasts)
    if method == 'online':
        components, dictionary = dict_learning_online(
            X.T, n_components, alpha=alpha, dict_init=dictionary,
            batch_size=200, method='cd', return_code=True, shuffle=True,
            n_jobs=1, positive_code=True)
        np.savez(os.path.join(write_dir, 'dictionary.npz'),
                 loadings=dictionary, contrasts=contrasts)
    elif method == 'sparse':
        components = sparse_encode(X.T, dictionary, alpha=alpha, max_iter=10,
                                   n_jobs=1, check_input=True, verbose=0,
                                   positive=True)
    elif method == 'multitask':
        # too many hard-typed parameters !!!
        n_voxels = X.shape[1] // n_subjects
        components = np.zeros((X.shape[1], n_components))
        clf = MultiTaskLasso(alpha=alpha)
        clf = MultiTaskElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        for i in range(n_voxels):
            x = X[:, i:i + n_subjects * n_voxels:n_voxels]
            components[i: i + n_subjects * n_voxels: n_voxels] = \
                clf.fit(dictionary.T, x).coef_
    return dictionary, components
def init_phd_lab(cls, directory: phd_lab_directory) -> None:
    if cls.phd_lab_directory is not None:
        raise RuntimeError("Cannot initialize PhdLabWrapper "
                           "a second time!")
    cls.phd_lab_directory = directory
    memcache = os.path.join(cls.phd_lab_directory, 'memcache')
    if not os.path.exists(memcache):
        os.makedirs(memcache)
    memory = Memory(".memcache", verbose=True)
    global fit_with_cache
    fit_with_cache = memory.cache(fit_with_cache)
def main():
##    subsdir = r'E:\elan projects\L2\submissions\extracted'
##    dstdir = os.path.join(subsdir, r'passed')
##    copypassedfiles(dstdir, subsdir)
    dstdir = r'E:\elan projects\L2\resubmission\full'

    import glob
    jsonflist = glob.glob(dstdir + '\\' + r'*.379.json')

    mem = Memory(cachedir=dstdir)
    json2agreementmatrix_cached = mem.cache(json2agreementmatrix)

    c = json2agreementmatrix_cached(jsonflist, task_type='all')
    print c
def __init__(self, use_cache=True, cachedir=None):
    """Inits TpsSolverFactory

    Args:
        use_cache: whether to cache solver matrices in file
        cachedir: cache directory. If not specified, the .cache directory
            in the parent directory of the top-level package is used.
    """
    if use_cache:
        if cachedir is None:
            # .cache directory in parent directory of top-level package
            cachedir = os.path.join(
                __import__(__name__.split('.')[0]).__path__[0],
                os.path.pardir, ".cache")
        memory = Memory(cachedir=cachedir, verbose=0)
        self.get_solver_mats = memory.cache(self.get_solver_mats)
def _load_data(root_dir="/",
               data_set="ds107",
               cache_dir="/volatile/storage/workspace/parietal_retreat/" +
                         "covariance_learn/cache/",
               n_jobs=1):
    from joblib import Memory
    mem = Memory(cachedir=cache_dir)
    load_data_ = mem.cache(setup_data_paths.run)

    df = setup_data_paths.get_all_paths(root_dir=root_dir, data_set=data_set)
    # region_signals = joblib.load(os.path.join(root_dir, dump_file))
    region_signals = load_data_(root_dir=root_dir, data_set=data_set,
                                n_jobs=n_jobs,
                                dump_dir=os.path.join(cache_dir, data_set))
    return df, region_signals
def _delete_orientation(self): """ Delete orientation metadata. Garbage orientation metadata can lead to severe mis-registration trouble. """ # prepare for smart caching cache_dir = os.path.join(self.output_dir, 'cache_dir') if not os.path.exists(cache_dir): os.makedirs(cache_dir) mem = Memory(cachedir=cache_dir, verbose=5) # deleteorient for func self.func = [mem.cache(delete_orientation)( self.func[j], self.tmp_output_dir, output_tag=self.session_id[j]) for j in xrange(len(self.session_id))] # deleteorient for anat if not self.anat is None: self.anat = mem.cache(delete_orientation)( self.anat, self.tmp_output_dir)
def __init__(self, caching=False):
    """Create a new CompatIdFetcher object.

    Args:
        caching: Whether to cache setup from run to run. See
            PrebuiltCompatibilityTest.CACHING for details.
    """
    self.compat_ids = None
    if caching:
        # This import occurs here rather than at the top of the file because
        # we don't want to force developers to install joblib. The caching
        # argument is only set to True if PrebuiltCompatibilityTest.CACHING
        # is hand-edited (for testing purposes).
        # pylint: disable=import-error
        from joblib import Memory
        memory = Memory(cachedir=tempfile.gettempdir(), verbose=0)
        self.FetchCompatIds = memory.cache(self.FetchCompatIds)
def ica_step(group_maps, group_variance, cachedir=None):
    memory = Memory(cachedir=cachedir, mmap_mode='r')
    # We do a spatial ICA: the arrays are transposed in the following,
    # axis1 = component, and axis2 is voxel number.
    _, ica_maps = memory.cache(fastica)(group_maps.T, whiten=False)

    # Project the ICAs on the group maps to give a 'cross-subject
    # reproducibility' score.
    proj = np.dot(ica_maps, group_maps)
    reproducibility_score = (np.abs(proj) * group_variance).sum(axis=-1)

    order = np.argsort(reproducibility_score)[::-1]

    ica_maps = ica_maps[order, :]

    return ica_maps.T
class _DiskCache(object):
    cached_methods = methods

    def __init__(self, *args, **kwargs):
        from tempfile import mkdtemp
        from joblib import Memory

        self.cachedir = cachedir or mkdtemp()
        self.memory = Memory(cachedir=self.cachedir)
        for method in self.cached_methods:
            setattr(self, method, self.memory.cache(getattr(self, method)))

        if not os.path.isdir(self.cachedir):
            raise OSError("Non-existent directory: ", self.cachedir)

        super(_DiskCache, self).__init__(*args, **kwargs)
def __init__(self, meta=None, seed=0, ntrain=15, ntest=15, num_splits=10):
    self.seed = seed
    self.ntrain = ntrain
    self.ntest = ntest
    self.num_splits = num_splits

    if meta is not None:
        self._meta = meta

    self.name = self.__class__.__name__

    try:
        from joblib import Memory
        mem = Memory(cachedir=self.home('cache'))
        self._get_meta = mem.cache(self._get_meta)
    except ImportError:
        pass
def __init__(self, root,
             filter_species_ids=None,
             required_attributes=None,
             transform=None,
             is_training=False,
             cachedir=CACHE_DIR):
    super(GogglesDataset, self).__init__()

    mem = Memory(cachedir)
    metadata_loader = mem.cache(self._load_metadata)

    self.is_training = is_training
    self._data_dir = root
    required_species, \
        self.attributes, \
        self._image_data = metadata_loader(root)  # _load_metadata(root) cached

    if filter_species_ids is not None:
        assert type(filter_species_ids) is list
        filter_species_ids = set(filter_species_ids)
        required_species = list(filter(
            lambda s: s.id in filter_species_ids, required_species))
        self._image_data = list(filter(
            lambda d: d.species.id in filter_species_ids, self._image_data))
    self._species_labels = {species: label
                            for label, species in enumerate(required_species)}

    if is_training is not None:
        self._image_data = list(filter(
            lambda d: d.is_for_training == is_training,
            self._image_data))

    if required_attributes is not None:
        assert type(required_attributes) is list
        self.attributes = required_attributes
    elif filter_species_ids is not None:
        attributes = set()
        for species in required_species:
            attributes = attributes.union(species.attributes)
        self.attributes = list(sorted(attributes, key=lambda a: a.id))
    self.num_attributes = len(self.attributes)

    if transform is not None:
        self._transform = transform
    else:
        self._transform = transforms.Compose([transforms.ToTensor()])
def __init__(self, meta=None, seed=0, ntrain=10, ntest=10, num_splits=5):
    self.seed = seed
    self.ntrain = ntrain
    self.ntest = ntest
    self.num_splits = num_splits
    self.names = ["Face", "Body", "Object"]

    if meta is not None:
        self._meta = meta

    self.name = self.__class__.__name__

    try:
        from joblib import Memory
        mem = Memory(cachedir=self.home("cache"))
        self._get_meta = mem.cache(self._get_meta)
    except ImportError:
        pass
def test_cached(self):
    try:
        from joblib import Memory
        mem = Memory(self.cache_dir)
        dep_tree = {
            'a': 5,
            'b': 6,
            'c': mem.cache(slow_func),
        }
        data = Pipeline(dep_tree)

        t0 = time.time()
        data.resolve()
        delta = time.time() - t0

        t0 = time.time()
        data.resolve()
        delta = time.time() - t0
        assert delta < .1
    except:
        pass
def compute_confidence_par(allLearners, dada):
    lab_confidence = np.zeros([dada.shape[0], len(allLearners)])
    tic = time.time()
    #import ipdb;ipdb.set_trace()
    print 'producing weighted outputs IN PARALLEL'

    mem = Memory(cachedir='tmp')
    classif_RBF2 = mem.cache(confidence_par)
    c = l_c[0]

    r = Parallel(n_jobs=N_JOBS)(delayed(confidence_par)(allLearners, ii, dada)
                                for ii in enumerate(allLearners))
    res, iis = zip(*r)
    for t, y in enumerate(iis):
        lab_confidence[:, y] = res[t]

    print "time taken to produce confidence:", round(time.time() - tic, 2), "seconds"
    #import ipdb;ipdb.set_trace()
    return lab_confidence
def __init__(self, segmentation=None, duration=1., step=0.1, gap=0.,
             threshold=0., feature=None, cache=False):

    super(SpeechTurnSegmentation, self).__init__()

    if segmentation is None:
        self.segmentation = SegmentationGaussianDivergence(
            duration=duration, step=step, gap=gap, threshold=threshold
        )
    else:
        self.segmentation = segmentation

    # default features for segmentation
    # are MFCC (energy + 12 coefficients)
    if feature is None:
        from pyannote.feature.yaafe import YaafeMFCC
        feature = YaafeMFCC(
            e=True, De=False, DDe=False,
            coefs=12, D=False, DD=False
        )
    self.feature = feature

    if cache:
        # initialize cache
        from joblib import Memory
        from tempfile import mkdtemp
        memory = Memory(cachedir=mkdtemp(), verbose=0)

        # cache feature extraction method
        self.get_features = memory.cache(self.get_features)
def svm_cla_sklearn_feat_sel(features_train, features_test, labels_train,
                             labels_test):
    from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif, RFECV
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.metrics import zero_one_loss

    features_train = sp.array(features_train, dtype='uint8')
    features_test = sp.array(features_test, dtype='uint8')

    print "zscore features"
    tic = time.time()
    features_train, mean_f, std_f = features_preprocessing(features_train)
    features_test, mean_f, std_f = features_preprocessing(features_test,
                                                          mean_f, std_f)
    print "time taken to zscore data is:", round(time.time() - tic), "seconds"

    featSize = np.shape(features_train)
    selector = LinearSVC(C=0.0007, penalty="l1", dual=False).fit(
        features_train, labels_train)
    print 'Starting with %d samp, %d feats, keeping %d' % (
        featSize[0], featSize[1],
        (np.shape(selector.transform(features_train)))[1])

    print 'classifying'
    features_train = selector.transform(features_train)
    features_test = selector.transform(features_test)
    #import ipdb; ipdb.set_trace()

    mem = Memory(cachedir='tmp')
    classif_RBF2 = mem.cache(classif_RBF)
    c = l_c[0]
    Parallel(n_jobs=8)(delayed(classif_RBF2)(features_train, features_test,
                                             labels_train, labels_test, g, c)
                       for g in l_g)
    #import ipdb; ipdb.set_trace()

    print "Starting CONTROL classification for c = ", c
    tic = time.time()
    clf = SVC(C=c)
    clf.fit(features_train, labels_train)  # [:1960][:]
    score = clf.score(features_test, labels_test)  # [:13841][:]
    print "selected CONTROL score for c = ", c, "is: ", score
    print "time taken:", time.time() - tic, "seconds"
test_func = mem.cache(test_func)
Parallel(n_jobs=1)(delayed(test_func)(i) for i in [a, a, a])
Parallel(n_jobs=2)(delayed(test_func)(i) for i in [a, a, a])

### Can use with latest version on github
from joblib import Parallel, delayed
import numpy as np

a = np.memmap('/tmp/memmaped', dtype=np.float32, mode='w+', shape=(3, 5))
b = np.memmap('/tmp/memmaped', dtype=np.float32, mode='r', shape=(3, 5))
Parallel(n_jobs=2)(delayed(np.mean)(x) for x in np.array_split(b, 3))

cachedir2 = mkdtemp()
memory2 = Memory(cachedir=cachedir2, mmap_mode='r')
square = memory2.cache(np.square)
a = np.vander(np.arange(3)).astype(np.float)
square(a)

import joblib
import numpy as np

testarray = {}
for i in xrange(5):
    testarray[i] = convert2memmap(np.array(range(500 * 100)))

filepath = "/tmp/test.joblib"
res = joblib.dump(testarray, filepath)
testarray = joblib.load(filepath, mmap_mode="r+")
for key in testarray:
def cfunc(*fargs, **fkwargs):
    return Memory.cache(self, func, *args, **kwargs).__call__(*fargs,
                                                              **fkwargs)
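# The snippets above all follow the same joblib pattern: build a Memory that
# points at a directory, wrap the expensive callable with mem.cache(...), and
# call the wrapper as usual. A minimal self-contained sketch of that pattern
# (the directory handling and costly() are illustrative, not from any snippet
# above):
import shutil
import tempfile
from joblib import Memory

cache_dir = tempfile.mkdtemp()
mem = Memory(cache_dir, verbose=0)

def costly(x):
    return sum(i * i for i in range(x))

costly_cached = mem.cache(costly)
costly_cached(10000)  # computed and written to cache_dir
costly_cached(10000)  # same argument -> loaded from disk, not recomputed

mem.clear()           # drop all cached results
shutil.rmtree(cache_dir, ignore_errors=True)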