    metricsMatrixEntire = []
    global metricsMatrix
    metricsMatrix = []
    global metricsMatrixSel
    metricsMatrixSel = []
    global metricsMatrixEntireSel
    metricsMatrixEntireSel = []
    return 'Reset'


location = './cachedir'
memory = Memory(location, verbose=0)


# NOTE: Only works with labeled data
def neighborhood_hit(X, y, k, selected=None):
    # Add 1 to k because the nearest neighbor is always the point itself
    k += 1
    y = np.array(y)

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)

    if selected:
        X = X[selected, :]
import sys

import librosa
import logging
import numpy as np
from torch.utils.data import Dataset
from joblib import Memory
import tempfile
import shutil
import specaug

logger = logging.getLogger('root')
FORMAT = "[%(asctime)s %(filename)s:%(lineno)s - %(funcName)s()] %(message)s"
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format=FORMAT)
logger.setLevel(logging.DEBUG)

CACHE_DIR = './cache'
memory = Memory(CACHE_DIR, verbose=0)

PAD = 0
N_FFT = 320
N_MFCC = 40
N_MELS = 80
SAMPLE_RATE = 16000


def clear_joblib_cache():
    logger.info('Clear Joblib cache at: %s' % CACHE_DIR)
    shutil.rmtree(CACHE_DIR)


def augment_audio_with_sox(path, sample_rate, tempo, gain):
    """Changes tempo and gain of the recording with sox and loads it."""
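# A hedged sketch of how such a sox-based augmentation could be implemented
# (the original body is truncated above; the exact sox flags here are an
# assumption, not the project's actual command line).
def _augment_audio_with_sox_sketch(path, sample_rate, tempo, gain):
    import os
    import subprocess
    # Write a tempo/gain-adjusted copy to a temporary wav file, then load it.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        tmp_path = tmp.name
    try:
        subprocess.check_call([
            'sox', path, '-r', str(sample_rate), tmp_path,
            'tempo', str(tempo), 'gain', str(gain),
        ])
        y, _ = librosa.load(tmp_path, sr=sample_rate)
    finally:
        os.remove(tmp_path)
    return y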
import functools
import numpy as np
import pprint
import scipy
import time
import zstandard as zstd  # pip install zstandard

from . import amm
from . import matmul_datasets as md
from . import pyience as pyn
from . import compress
from . import amm_methods as methods

from joblib import Memory
_memory = Memory('.', verbose=0)

# NUM_TRIALS = 1
NUM_TRIALS = 10


# @_memory.cache
def _estimator_for_method_id(method_id, **method_hparams):
    return methods.METHOD_TO_ESTIMATOR[method_id](**method_hparams)


def _hparams_for_method(method_id):
    if method_id in methods.SKETCH_METHODS:
        # dvals = [2, 4, 6, 8, 12, 16, 24, 32, 48, 64]  # d=1 undef on fd methods
        # dvals = [1, 2, 4, 8, 16, 32, 64, 128]
        dvals = [1, 2, 4, 8, 16, 32, 64]
from joblib import Memory
import pandas as pd
from os.path import expanduser

# Recent joblib versions use `location` instead of the removed `cachedir` argument.
memory = Memory(location=expanduser('~/.cache/myapp'), verbose=0)


@memory.cache
def load(filename):
    df = pd.read_csv(filename, parse_dates=['start_time', 'end_time'])
    # ...
    return df


@memory.cache
def pre_process(df):
    df['temp'].fillna(0, inplace=True)
    # ...
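# Hedged usage sketch: 'measurements.csv' is a hypothetical file with the
# start_time/end_time/temp columns assumed above, and the truncated
# pre_process body is assumed to eventually return the cleaned frame.
if __name__ == '__main__':
    raw = load('measurements.csv')   # recomputed only when the file argument changes
    clean = pre_process(raw)         # the cache key includes the DataFrame contents
    print(raw.shape)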
import shutil
import sys
import time
from joblib import Parallel, delayed
from all_members_ensemble import gen_members
import os
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"  # Also affect subprocesses

from joblib import Memory
cachedir = './parallel_brute_force_search_exec_tmpmemory' + '_' + time.strftime(
    "%H_%M_%S", time.localtime(time.time()))
memory = Memory(cachedir, verbose=0)


class Estimator:
    def __init__(self, classifier=None, random_state=None, accuracy=0):
        self.classifier = classifier
        self.random_state = random_state
        self.accuracy = accuracy

    def fit(self, X, y):
        is_fitted = True
        self.classifier.fit(X, y)

    def predict(self, X):
        return self.classifier.predict(X)
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

# #############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(location=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                            memory=mem)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
import os
from os.path import join

import mne
import pandas as pd
import numpy as np
from joblib import Memory

from . import preprocessing

memory = Memory(location=os.environ['PYMEG_CACHE_DIR'], verbose=0)
subjects_dir = os.environ['SUBJECTS_DIR']


def set_fs_subjects_dir(directory):
    """Set freesurfer SUBJECTS_DIR environment variable."""
    global subjects_dir
    os.environ['SUBJECTS_DIR'] = directory
    subjects_dir = directory


def check_bems(subjects):
    """Create a plot of all BEM segmentations."""
    for sub in subjects:
        fig = mne.viz.plot_bem(subject=sub, subjects_dir=subjects_dir,
                               brain_surfaces='white',
                               orientation='coronal')


@memory.cache
def get_source_space(subject, sdir=None):
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from joblib import Memory
import numpy as np

memory = Memory(location="/tmp", verbose=0)


@memory.cache
def sdf_to_desc(sdf_file):
    fps = []
    targets = []
    nfps = []
    for mol in Chem.SDMolSupplier(sdf_file):
        fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2))  # fingerprint
        targets.append(float(mol.GetProp("ACTIVITY")))             # activity
    for fp in fps:
        nfp = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(fp, nfp)
        nfps.append(nfp)
    return (np.array(nfps), np.array(targets))


if __name__ == '__main__':
    descs, targets = sdf_to_desc("CHEMBL3301363.sdf")
    print(descs.shape)
from joblib import Memory

memory = Memory("cache", verbose=0)
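# Hedged usage sketch (the function below is hypothetical, not part of the
# original module): results are written under ./cache and reused on later
# calls with identical arguments.
@memory.cache
def expensive_square(x):
    print("computing", x)
    return x * x

expensive_square(3)  # computed and written to the cache
expensive_square(3)  # loaded from ./cache; nothing is printed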
import xgboost as xgb
import numpy as np
import time
from sklearn import datasets
from joblib import Memory
import pandas as pd
import argparse

memory = Memory('./cachedir', verbose=0)


# Contains a dataset in numpy format as well as the relevant objective and metric
class TestDataset:
    def __init__(self, name, Xy, objective):
        self.name = name
        self.objective = objective
        self.X, self.y = Xy

    def set_params(self, params_in):
        params_in['objective'] = self.objective
        if self.objective == "multi:softmax":
            params_in["num_class"] = int(np.max(self.y) + 1)
        return params_in

    def get_dmat(self):
        return xgb.DMatrix(self.X, self.y)

    def get_test_dmat(self, num_rows):
        rs = np.random.RandomState(432)
        return xgb.DMatrix(self.X[rs.randint(0, self.X.shape[0], size=num_rows), :])
resources = [
    (redirects_url, redirects_filename),
    (page_links_url, page_links_filename),
]

for url, filename in resources:
    if not os.path.exists(filename):
        print("Downloading data from '%s', please wait..." % url)
        opener = urlopen(url)
        open(filename, 'wb').write(opener.read())
        print()

# #############################################################################
# Loading the redirect files

memory = Memory(location=".")


def index(redirects, index_map, k):
    """Find the index of an article name after redirect resolution"""
    k = redirects.get(k, k)
    return index_map.setdefault(k, len(index_map))


DBPEDIA_RESOURCE_PREFIX_LEN = len("http://dbpedia.org/resource/")
SHORTNAME_SLICE = slice(DBPEDIA_RESOURCE_PREFIX_LEN + 1, -1)


def short_name(nt_uri):
    """Remove the < and > URI markers and the common URI prefix"""
    return nt_uri[SHORTNAME_SLICE]
import os

import pylab as pl
import bcolz
from psi.data.io import Recording
from matplotlib.backends.backend_pdf import PdfPages
from tqdm import tqdm

from . import util

from joblib import Memory
tmpdir = '/tmp/lbhb'
if not os.path.exists(tmpdir):
    os.makedirs(tmpdir, exist_ok=True)
memory = Memory(location=tmpdir)


def load_trial_log(dirname):
    try:
        tl_dirname = os.path.join(dirname, 'trial_log')
        trial_log = bcolz.ctable(rootdir=tl_dirname).todataframe()
        experiment_info = util.parse_filename(dirname)
        for k, v in experiment_info.items():
            trial_log[k] = v
        trial_log.index.name = 'trial'
        trial_log.reset_index(inplace=True)
        trial_log.set_index(['datetime', 'animal', 'trial'], inplace=True)
        return trial_log
    except Exception as e:
from joblib import Memory
import networkx as nx
import dimod

data_cache = './Data'
mem = Memory(data_cache)


def regular_graph_bqm(degree: int, variable_count: int, copy_count: int):
    bqm_arr = []
    for _ in range(copy_count):
        problem_graph = nx.random_regular_graph(degree, variable_count)
        bqm_arr.append(
            dimod.generators.uniform(problem_graph, dimod.SPIN,
                                     low=0.5, high=1.0))
    return bqm_arr


regular_graph_bqm = mem.cache(regular_graph_bqm)
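# Hedged usage sketch; the parameter values are illustrative only.  The first
# call builds three 3-regular spin BQMs on 16 variables and persists them
# under ./Data; calling again with the same arguments loads them from disk.
bqms = regular_graph_bqm(degree=3, variable_count=16, copy_count=3)
print(len(bqms), bqms[0].num_variables)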
POWER_ITR = 'power_iteration'
MATH = 'math_method'

mapping = {
    "without_acs": 0,
    "with_acs": 1,
    "male": 0,
    "female": 1,
    "very fair": 0,
    "fair": 1,
    "medium": 2,
    "dark": 3,
}

CACHE_DIR = Path(settings.path_for(settings.PPR.FEEDBACK.CACHE_DIR))
CACHE = Memory(str(CACHE_DIR), verbose=1)


def math_method(adj_matrix, alpha):
    return np.linalg.inv(
        np.identity(adj_matrix.shape[0]) - (alpha * adj_matrix))


def power_iteration(adj_matrix, alpha, seed):
    n = 0
    pi_1 = None
    pi_2 = np.copy(seed)
    while True:
        pi_1 = np.matmul(np.dot(alpha, adj_matrix), pi_2) + np.dot(
            (1 - alpha), seed)
def one_dimensional_exp():
    eps = 1e-1

    grid = torch.linspace(-4, 12, 500)[:, None]
    C = compute_distance(grid, grid)

    x_sampler = Sampler(mean=torch.tensor([[1.], [2], [3]]),
                        cov=torch.tensor([[[.1]], [[.1]], [[.1]]]),
                        p=torch.ones(3) / 3)
    y_sampler = Sampler(mean=torch.tensor([[0.], [3], [5]]),
                        cov=torch.tensor([[[.1]], [[.1]], [[.4]]]),
                        p=torch.ones(3) / 3)

    torch.manual_seed(100)
    np.random.seed(100)

    lpx = x_sampler.log_prob(grid)
    lpy = y_sampler.log_prob(grid)
    lpx -= torch.logsumexp(lpx, dim=0)
    lpy -= torch.logsumexp(lpy, dim=0)
    px = torch.exp(lpx)
    py = torch.exp(lpy)

    fevals = []
    gevals = []
    labels = []
    plans = []

    mem = Memory(location=expanduser('~/cache'))
    n_samples = 5000
    x, loga = x_sampler(n_samples)
    y, logb = y_sampler(n_samples)
    f, g = mem.cache(sinkhorn)(x.numpy(), y.numpy(), eps=eps, n_iter=100)
    f = torch.from_numpy(f).float()
    g = torch.from_numpy(g).float()
    distance = compute_distance(grid, y)
    feval = - eps * torch.logsumexp((- distance + g[None, :]) / eps
                                    + logb[None, :], dim=1)
    distance = compute_distance(grid, x)
    geval = - eps * torch.logsumexp((- distance + f[None, :]) / eps
                                    + loga[None, :], dim=1)
    plan = (lpx[:, None] + feval[:, None] / eps
            + lpy[None, :] + geval[None, :] / eps - C / eps)
    plans.append((plan, grid, grid))
    fevals.append(feval)
    gevals.append(geval)
    labels.append('True potential')

    m = 50
    hatf, posx, hatg, posy, sto_fevals, sto_gevals = sampling_sinkhorn(
        x_sampler, y_sampler, m=m, eps=eps, n_iter=100, grid=grid)
    feval = evaluate_potential(hatg, posy, grid, eps)
    geval = evaluate_potential(hatf, posx, grid, eps)
    plan = (lpx[:, None] + feval[:, None] / eps
            + lpy[None, :] + geval[None, :] / eps - C / eps)
    plans.append((plan, grid, grid))
    fevals.append(feval)
    gevals.append(geval)
    labels.append('Online Sinkhorn')

    alpha, posx, posy, _, _ = rkhs_sinkhorn(x_sampler, y_sampler,
                                            m=m, eps=eps, n_iter=100,
                                            sigma=1, grid=grid)
    feval = evaluate_kernel(alpha, posx, grid, sigma=1)
    geval = evaluate_kernel(alpha, posy, grid, sigma=1)
    plan = (lpx[:, None] + feval[:, None] / eps
            + lpy[None, :] + geval[None, :] / eps - C / eps)
    plans.append((plan, grid, grid))
    fevals.append(feval)
    gevals.append(geval)
    labels.append('RKHS')

    fevals = torch.cat([feval[None, :] for feval in fevals], dim=0)
    gevals = torch.cat([geval[None, :] for geval in gevals], dim=0)

    fig = plt.figure(figsize=(width, .21 * width))
    gs = gridspec.GridSpec(ncols=6, nrows=1,
                           width_ratios=[1, 1.5, 1.5, .8, .8, .8], figure=fig)
    plt.subplots_adjust(right=0.97, left=0.05, bottom=0.27, top=0.85)
    ax0 = fig.add_subplot(gs[0])
    ax0.plot(grid, px, label=r'$\alpha$')
    ax0.plot(grid, py, label=r'$\beta$')
    ax0.legend(frameon=False)
    # ax0.axis('off')
    ax1 = fig.add_subplot(gs[1])
    ax2 = fig.add_subplot(gs[2])
    ax3 = fig.add_subplot(gs[3])
    ax4 = fig.add_subplot(gs[4])
    ax5 = fig.add_subplot(gs[5])
    for i, (label, feval, geval, (plan, x, y)) in enumerate(
            zip(labels, fevals, gevals, plans)):
        if label == 'True potential':
            ax1.plot(grid, feval, label=label,
                     zorder=100 if label == 'True potential' else 1,
                     linewidth=3 if label == 'True potential' else 1,
                     color='C3')
            ax2.plot(grid, geval, label=None,
                     zorder=100 if label == 'True potential' else 1,
                     linewidth=3 if label == 'True potential' else 1,
                     color='C3')
            plan = plan.numpy()
            ax3.contour(y[:, 0], x[:, 0], plan, levels=30)
        elif label == 'RKHS':
            ax1.plot(grid, feval, label=label, linewidth=2, color='C4')
            ax2.plot(grid, geval, label=None, linewidth=2, color='C4')
            plan = plan.numpy()
            ax4.contour(y[:, 0], x[:, 0], plan, levels=30)
        else:
            plan = plan.numpy()
            ax5.contour(y[:, 0], x[:, 0], plan, levels=30)
    ax0.set_title('Distributions')
    ax1.set_title('Estimated $f$')
    ax2.set_title('Estimated $g$')
    ax3.set_title('True OT plan')
    ax4.set_title('RKHS')
    ax5.set_title('O-S')
    # ax4.set_title('Estimated OT plan')
    colors = plt.cm.get_cmap('Blues')(np.linspace(0.2, 1, len(sto_fevals[::2])))
    for i, eval in enumerate(sto_fevals[::2]):
        ax1.plot(grid, eval, color=colors[i], linewidth=2,
                 label=f'O-S $n_t={i * 10 * 2 * 50}$' if i % 2 == 0 else None,
                 zorder=1)
    for i, eval in enumerate(sto_gevals[::2]):
        ax2.plot(grid, eval, color=colors[i], linewidth=2,
                 label=None, zorder=1)
    for ax in (ax1, ax2, ax3):
        ax.tick_params(axis='both', which='major', labelsize=5)
        ax.tick_params(axis='both', which='minor', labelsize=5)
        ax.minorticks_on()
    ax1.legend(frameon=False, bbox_to_anchor=(-1, -0.53), ncol=5,
               loc='lower left')
    # ax2.legend(frameon=False, bbox_to_anchor=(0., 1), loc='upper left')
    sns.despine(fig)
    for ax in [ax3, ax4, ax5]:
        ax.axis('off')
    ax0.axes.get_yaxis().set_visible(False)
    plt.savefig(join(get_output_dir(), 'continuous.pdf'))
    plt.show()
from sklearn.linear_model import LassoLarsCV, LassoCV, Lasso
from sklearn.model_selection import cross_val_predict, LeaveOneOut
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from hashlib import md5
from hdmedians import medoid
from functools import partial
import pandas as pd
import numpy as np
from joblib import Memory

mem = Memory('../tasks-joblib', verbose=0)


def hash(s):
    m = md5()
    m.update(s.encode('utf-8'))
    return m.hexdigest()


@mem.cache()
def label_clusters_bow(socd, clusters):
    # Top 5 positive words for each cluster via tfidf logistic regression
    tfidf_vectorizer = TfidfVectorizer()
    tfidf = tfidf_vectorizer.fit_transform(socd.task)
    # penalty='l1' requires the liblinear (or saga) solver in recent scikit-learn
    logistic = LogisticRegression(penalty='l1', C=2, solver='liblinear')
    logistic.fit(tfidf, clusters)
    rev_lookup = dict([(idx, w) for w, idx in tfidf_vectorizer.vocabulary_.items()])
    a = np.array([[rev_lookup[j] for j in i]
FN = args.graph
data = pd.read_csv(FN, delimiter=" ")

G = nx.Graph()
for i, s in data.iterrows():
    u = s[0]
    v = s[1]
    G.add_edge(u, v)

# In[ ]:

memory = Memory(location='./cached_bri', verbose=0)


@memory.cache
def cached_bridgeness(G):
    print("Computing centrality metric (cache miss), please be patient.")
    return bridgeness.bridgeness_centrality(G)


bri = cached_bridgeness(G)
# bri = bridgeness.bridgeness_centrality(G)
# e_bri = bridgeness.edge_bridgeness_centrality(G)
# bet = bridgeness.betweenness_centrality(G)

# In[ ]:

phi = args.phi
# lmax = 200
# Nside = 64
# m = 2
# lmax = 2000
# Nside = 1024
# lmax = 1000
# Nside = 512

m = 1

a_l = np.zeros(lmax + 1 - m)
a_l[1 - m] = 1
# a_l[4 - m] = -1
# a_l[15] = 0.1
# a_l = (-1) ** np.zeros(lmax + 1)

from joblib import Memory
memory = Memory('joblib')


@memory.cache
def getroots(l, m):
    return associated_legendre_roots(lmax + 1, m)


if 1:
    roots = getroots(lmax + 1, m)
roots = get_ring_thetas(Nside, positive_only=True)
print(roots.shape[0])

P = compute_normalized_associated_legendre(m, roots, lmax)
print(butterfly_compress(P).get_stats())
print(butterfly_compress(P.T).get_stats())
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Disk-based caching with joblib.

Tutorial: https://joblib.readthedocs.io/en/latest/memory.html
Install:  pip install joblib
"""
from joblib import Memory

memory = Memory(location="./cachedir")


@memory.cache
def sum2(a, b):
    print(f"computing {a} + {b} ...")
    return a + b


print(sum2(2, 3))
print(sum2(2, 3))
print(sum2(4, 7))
print(sum2(4, 7))
print(sum2(2, 3))
print(sum2(4, 7))

"""
Repeated calls with the same arguments are read back from the on-disk cache,
so the function body is not executed again.
"""
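# Follow-up sketch (not part of the original tutorial): the whole cache can be
# wiped with Memory.clear(), after which the next call recomputes the result.
memory.clear(warn=False)
print(sum2(2, 3))  # prints "computing 2 + 3 ..." again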
    status = -1
    print("Error: Enter the peplen param\n")
    sys.exit(status)

# Extract data from the CSV format
data = readCSV(dirname, peplen)
# print(data[0])
# print(data[1])
# print(list(set(data[0]) & set(data[1])))
# print(distance.jaccard(data[0], data[1]))
# print(data)
# print(data.shape)

# Construct the clustering model
mem = Memory(location='c:/tmp')
model = hdbscan.HDBSCAN(algorithm='best', allow_single_cluster=False,
                        alpha=1.0, approx_min_span_tree=True,
                        core_dist_n_jobs=2, gen_min_span_tree=True,
                        memory=mem, metric='euclidean',
                        min_cluster_size=3, min_samples=None, p=None)
# algorithm='best', metric='euclidean', min_cluster_size=3)

# Construct clusters and display the clusters list
    def __init__(self, label='Run', tooltip='Run Simulation',
                 start_func=None, done_func=None, outcb=None,
                 show_progress=True, width='auto', cachename=None,
                 cachecb=None, showcache=True):
        self.label = label
        self.tooltip = tooltip
        self.start_func = start_func
        self.done_func = done_func
        self.outcb = outcb
        self.cachename = cachename
        self.q = Queue()
        self.thread = 0
        self.status = None
        self.output = None
        self.show_progress = show_progress
        self.progress = None
        self.make_rname = None
        self.width = width
        self.cachecb = cachecb
        self.showcache = showcache
        self.cbuf = collections.deque(maxlen=200)  # circular buffer

        if start_func is None:
            print("start_func is required.", file=sys.stderr)
            return

        if cachename:
            # set up cache
            cachedir = os.path.join(Submit.CACHEDIR, cachename)
            cachetabdir = os.path.join(Submit.CACHETABDIR, cachename)
            if not os.path.isdir(cachedir):
                os.makedirs(cachedir)
            memory = Memory(location=cachetabdir, verbose=0)

            @memory.cache
            def make_rname(*args):
                # uuid should be unique, but check just in case
                while True:
                    fname = str(uuid.uuid4()).replace('-', '')
                    if not os.path.isdir(os.path.join(cachedir, fname)):
                        break
                return fname

            self.make_rname = make_rname

        self.but = w.Button(
            description=self.label,
            tooltip=self.tooltip,
            button_style='success'
        )
        self.output = w.Textarea(layout={'width': '100%', 'height': '400px'})
        self.acc = w.Accordion(children=[self.output], width=self.width)
        self.acc.set_title(0, 'Output')
        self.acc.selected_index = None
        self.but.on_click(self._but_cb)
        self.disabled = False
        _layout = w.Layout(
            flex_flow='column',
            justify_content='flex-start',
            width=self.width
        )
        self.w = w.VBox([self.but], layout=_layout)
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), "mnist_benchmark_data"),
                mmap_mode="r")


@memory.cache
def load_data(dtype=np.float32, order="F"):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_openml("mnist_784")
    X = check_array(data["data"], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
from joblib import Memory

# The main script will override this as necessary.
# By default, no caching is performed (backend=None).
memory = Memory(backend=None)
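# Hedged illustration (not part of the original module): with no location the
# decorated function simply runs every time, whereas pointing a Memory at a
# directory turns persistent caching on.  The names below are hypothetical.
_disabled = Memory(backend=None)          # no caching, as above
_enabled = Memory(location="./cachedir")  # results persisted on disk

@_enabled.cache
def _slow_add(a, b):
    return a + b

print(_slow_add(1, 2))  # computed once, then served from ./cachedir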
# Statistical model to extrapolate Covid-19 active cases

Here we build the statistical model behind https://covid19-dash.github.io/

The model is built by fitting a weighted least squares on the log of the
number of cases over a window of the last few days.

This is a computational notebook: it is the actual code that is run to
build the model.
"""

# %%
# Use joblib, to speed up interactions
from joblib import Memory
mem = Memory(location='.')

# %%
# ## Load and plot the data

# %%
# Download the data
import data_input
data = mem.cache(data_input.get_data)()

# We model only the active cases
active = data['active']

# %%
# First plot the time course of the most affected countries
last_day = active.iloc[-1]
import os

import mne
import numpy as np
from joblib import Memory
from scipy.signal import tukey

try:
    from ..utils import check_random_state
except ValueError:
    from alphacsc.utils import check_random_state

mem = Memory(location='.', verbose=0)


@mem.cache()
def load_data(n_trials=40, n_channels=1, n_times=6, sigma=.05, sfreq=300,
              f_noise=True, random_state=None, n_jobs=4):
    """Simulate data with 10Hz mu-wave and 10Hz oscillations.

    Parameters
    ----------
    n_trials : int
        Number of simulated signals
    n_channels : int
        Number of channels in the signals
def test_pipeline_memory_sampler():
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma='scale', probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma='scale', probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
And run the benchmark:

    LAPJV_OLD=lapjv-old bench.sh
''')
    lapjv_old = None

from centrosome.lapjv import lapjv as lapjv_centrosome
from lap.tests.test_utils import (get_dense_int, get_cost_CS,
                                  sparse_from_dense_CS, sparse_from_dense)

max_time_per_benchmark = 20
szs = [10, 100, 200, 500, 1000, 2000, 5000]
rngs = [100, 1000, 10000, 100000]
seeds = [1299821, 15485867, 32452867, 49979693]
cachedir = '/tmp/lapjv-cache'
memory = Memory(location=cachedir, verbose=1)


@memory.cache
def get_hard_data(sz, rng, seed):
    cost = get_dense_int(sz, 100, hard=True, seed=seed)
    opt = lapjv(cost)[0]
    return cost, opt


if lapjv_old is not None:
    @mark.timeout(max_time_per_benchmark)
    @mark.parametrize('sz,rng,seed',
                      [(sz, rng, seed)
                       for sz in szs for rng in rngs for seed in seeds])
    def test_JV_old(benchmark, sz, rng, seed):
import sys

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from joblib import Memory

matplotlib.use('TkAgg')

# Auto-detect terminal width.
pd.options.display.width = None
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 200

# Initialize a persistent memcache.
mem_hist = Memory(location='./.cached_plot_hist', verbose=0)
mem_sim = Memory(location='./.cached_plot_sim', verbose=0)

PRINT_BASELINE = True
PRINT_DELTA_ONLY = True

BETWEEN_START = pd.to_datetime('09:30').time()
BETWEEN_END = pd.to_datetime('09:30:00.000001').time()

# Linewidth for plots.
LW = 2

# Used to read and cache simulated quotes.
# Doesn't actually pay attention to symbols yet.
# @mem_sim.cache
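# Hedged sketch of the kind of reader the commented-out decorator above would
# wrap; the column names and file layout are assumptions, not the project's
# actual quote format.
@mem_sim.cache
def read_simulated_quotes(filename):
    df = pd.read_csv(filename, parse_dates=['timestamp'])
    return df.set_index('timestamp')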
import time

import numpy as np
import pandas as pd
from scipy import sparse
from joblib import Memory
import matplotlib.pyplot as plt
from scipy.stats.mstats import gmean

from alphacsc.cython import _fast_sparse_convolve_multi
from alphacsc.cython import _fast_sparse_convolve_multi_uv

memory = Memory(location='', verbose=0)


def _dense_convolve_multi(z_i, ds):
    """Convolve z_i[k] and ds[k] for each atom k, and return the sum

    z_i : array, shape(n_atoms, n_times_valid)
        Activations
    ds : array, shape(n_atoms, n_channels, n_times_atom)
        Dictionary

    Returns
    -------
    res : array, shape(n_channels, n_times)
        Result of the convolution
    """
    return np.sum([[np.convolve(zik, dkp) for dkp in dk]
                   for zik, dk in zip(z_i, ds)], 0)
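# Hedged usage sketch with toy shapes (values are illustrative only): two
# atoms, three channels, activations of length 100 and atoms of length 8 give
# a (3, 107) result, since a full convolution has length
# n_times_valid + n_times_atom - 1.
z_i = np.random.rand(2, 100)
ds = np.random.rand(2, 3, 8)
print(_dense_convolve_multi(z_i, ds).shape)  # (3, 107)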
def one_dimensional_exp_rkhs():
    eps = 1e-1

    grid = torch.linspace(-4, 12, 500)[:, None]
    C = compute_distance(grid, grid)

    x_sampler = Sampler(mean=torch.tensor([[1.], [2], [3]]),
                        cov=torch.tensor([[[.1]], [[.1]], [[.1]]]),
                        p=torch.ones(3) / 3)
    y_sampler = Sampler(mean=torch.tensor([[0.], [3], [5]]),
                        cov=torch.tensor([[[.1]], [[.1]], [[.4]]]),
                        p=torch.ones(3) / 3)

    torch.manual_seed(100)
    np.random.seed(100)

    lpx = x_sampler.log_prob(grid)
    lpy = y_sampler.log_prob(grid)
    lpx -= torch.logsumexp(lpx, dim=0)
    lpy -= torch.logsumexp(lpy, dim=0)
    px = torch.exp(lpx)
    py = torch.exp(lpy)

    fevals = []
    gevals = []
    labels = []
    plans = []

    mem = Memory(location=expanduser('~/cache'))
    n_samples = 2000
    x, loga = x_sampler(n_samples)
    y, logb = y_sampler(n_samples)
    f, g = mem.cache(sinkhorn)(x.numpy(), y.numpy(), eps=eps, n_iter=100)
    f = torch.from_numpy(f).float()
    g = torch.from_numpy(g).float()
    distance = compute_distance(grid, y)
    feval = -eps * torch.logsumexp(
        (-distance + g[None, :]) / eps + logb[None, :], dim=1)
    distance = compute_distance(grid, x)
    geval = -eps * torch.logsumexp(
        (-distance + f[None, :]) / eps + loga[None, :], dim=1)
    plan = (lpx[:, None] + feval[:, None] / eps
            + lpy[None, :] + geval[None, :] / eps - C / eps)
    plans.append((plan, grid, grid))
    fevals.append(feval)
    gevals.append(geval)
    labels.append('True potential')

    m = 50
    sigma = 1
    alpha, posx, posy, sto_fevals, sto_gevals = rkhs_sinkhorn(
        x_sampler, y_sampler, m=m, eps=eps, n_iter=1000,
        sigma=sigma, grid=grid)
    feval = evaluate_kernel(alpha, posx, grid, sigma=sigma)
    geval = evaluate_kernel(alpha, posy, grid, sigma=sigma)
    plan = (lpx[:, None] + feval[:, None] / eps
            + lpy[None, :] + geval[None, :] / eps - C / eps)
    plans.append((plan, grid, grid))
    fevals.append(feval)
    gevals.append(geval)
    labels.append('RKHS descent')

    hatf, posx, hatg, posy, sto_fevals_, sto_gevals_ = sampling_sinkhorn(
        x_sampler, y_sampler, m=m, eps=eps, n_iter=1000, grid=grid)
    feval = evaluate_potential(hatg, posy, grid, eps)
    geval = evaluate_potential(hatf, posx, grid, eps)
    plan = (lpx[:, None] + feval[:, None] / eps
            + lpy[None, :] + geval[None, :] / eps - C / eps)
    plans.append((plan, grid, grid))
    fevals.append(feval)
    gevals.append(geval)
    labels.append('Online Sinkhorn')

    fevals = torch.cat([feval[None, :] for feval in fevals], dim=0)
    gevals = torch.cat([geval[None, :] for geval in gevals], dim=0)

    fig = plt.figure(figsize=(8 * 4 / 5, 2.2))
    gs = gridspec.GridSpec(ncols=4, nrows=1,
                           width_ratios=[1.2, 1.2, 1, 1], figure=fig)
    plt.subplots_adjust(right=0.97, left=0.08, top=0.8)
    ax1 = fig.add_subplot(gs[0])
    ax2 = fig.add_subplot(gs[1])
    ax3 = fig.add_subplot(gs[2])
    ax4 = fig.add_subplot(gs[3])
    for i, (label, feval, geval, (plan, x, y)) in enumerate(
            zip(labels, fevals, gevals, plans)):
        ax1.plot(grid, feval, label=label,
                 zorder=0 if label == 'True potential' else 1,
                 linewidth=4 if label == 'True potential' else 2)
        ax2.plot(grid, geval, label=label,
                 zorder=0 if label == 'True potential' else 1,
                 linewidth=4 if label == 'True potential' else 2)
        plan = plan.numpy()
        if label == 'RKHS descent':
            ax3.contour(y[:, 0], x[:, 0], plan, levels=30)
        elif label == 'Online Sinkhorn':
            ax4.contour(y[:, 0], x[:, 0], plan, levels=30)
    ax1.set_title('Estimated $f$')
    ax2.set_title('Estimated $g$')
    ax4.set_title('Online Sinkhorn')
    ax3.set_title('OT plan: RKHS')
    ax2.legend(frameon=False, loc='upper left', bbox_to_anchor=(0, 1))
    # ax2.legend(frameon=False, bbox_to_anchor=(0., 1), loc='upper left')
    sns.despine(fig)
    for ax in [ax3, ax4]:
        ax.axis('off')
    plt.savefig('continuous_rkhs.pdf')
    plt.show()