def run_on_real_dataset(dataset_dir):
    """Run the meta-learner comparison on a real dataset stored on disk.

    `dataset_dir` must be a directory containing exactly one `*.data` file
    readable by `np.loadtxt`. The loaded values are binarized and written
    over the `perfs` of the DAMatrix loaded from the same directory. The
    experiment name is the part of the file name before its first dot.

    Raises:
        ValueError: if `dataset_dir` is not a directory, or if it does not
            contain exactly one `.data` file.
    """
    if not os.path.isdir(dataset_dir):
        raise ValueError("Not a directory: {}".format(dataset_dir))

    candidates = [
        fname for fname in os.listdir(dataset_dir) if fname.endswith('.data')
    ]
    if len(candidates) != 1:
        raise ValueError("The dataset directory {} ".format(dataset_dir) +
                         "should contain one `.data` file but got " +
                         "{}.".format(candidates))

    (data_file,) = candidates
    experiment = data_file.split('.')[0]

    # Load the real dataset and binarize it
    binary_perfs = binarize(np.loadtxt(os.path.join(dataset_dir, data_file)))

    da_matrix = DAMatrix.load(dataset_dir)
    da_matrix.perfs = binary_perfs

    # The first returned meta-learner is the random one: leave it out
    learners = get_the_meta_learners(exclude_optimal=True)[1:]

    run_expe(da_matrix,
             meta_learners=learners,
             name_expe=experiment,
             with_once_random=True,
             show_legend=True)
def run_nfl():
    """Run the 'nfl' experiment on independent fair-coin performances.

    Each entry of the 20000 x 5 performance matrix is drawn independently
    and equals 1 when a uniform sample falls below 0.5, otherwise 0. All
    meta-learners returned by `get_the_meta_learners()` are evaluated.
    """
    experiment = 'nfl'
    shape = (20000, 5)  # (n_datasets, n_algos)

    coin_flips = np.random.rand(*shape) < 0.5
    da_matrix = DAMatrix(perfs=coin_flips.astype(int), name=experiment)

    run_expe(da_matrix, get_the_meta_learners(), name_expe=experiment)
def run_3a():
    """Run experiment '3a-repeated-columns'.

    A single random 0/1 column (1 when a uniform sample is below 0.5) is
    repeated four times, so all four columns of the 20000-row performance
    matrix are identical.
    """
    experiment = '3a-repeated-columns'
    n_datasets, n_algos = 20000, 4

    base_column = (np.random.rand(n_datasets, 1) < 0.5).astype(int)
    perfs = np.concatenate([base_column for _ in range(n_algos)], axis=1)

    da_matrix = DAMatrix(perfs=perfs, name=experiment)
    learners = get_the_meta_learners()
    run_expe(da_matrix, learners, name_expe=experiment, ylim=(0.45, 1.05))
def run_3g():
    """Run experiment '3g': two complementary groups of three columns.

    The first three columns share one random 0/1 column that equals 1 when
    a uniform sample is below 0.5 + epsilon (epsilon = 0.1); the last three
    columns are its complement. Meta-learners are obtained with
    `exclude_greedy_plus=True` and the legend is hidden.
    """
    experiment = '3g'
    n_datasets = 20000
    epsilon = 1e-1
    threshold = 0.5 + epsilon

    favoured = (np.random.rand(n_datasets, 1) < threshold).astype(int)
    complement = 1 - favoured
    perfs = np.concatenate([favoured] * 3 + [complement] * 3, axis=1)

    da_matrix = DAMatrix(perfs=perfs, name=experiment)
    learners = get_the_meta_learners(exclude_greedy_plus=True)
    run_expe(da_matrix, learners, name_expe=experiment, show_legend=False)
def run_3b():
    """Run experiment '3b-complementary-2-algos'.

    The 20000-row performance matrix has four columns: two copies of a
    random 0/1 column (1 when a uniform sample is below 0.5) followed by
    two copies of its complement. Meta-learners are obtained with
    `exclude_greedy_plus=True`.
    """
    n_datasets = 20000
    name_expe = '3b-complementary-2-algos'

    X1 = (np.random.rand(n_datasets, 1) < 0.5).astype(int)
    X2 = 1 - X1
    # Four columns in total: X1 twice, then its complement twice
    perfs = np.concatenate([X1, X1, X2, X2], axis=1)

    da_matrix = DAMatrix(perfs=perfs, name=name_expe)
    meta_learners = get_the_meta_learners(exclude_greedy_plus=True)
    run_expe(da_matrix, meta_learners, name_expe=name_expe)
def test_unit_TopkRankMetaLearner():
    """Check which column TopkRankMetaLearner reveals first.

    The base matrix has its largest value of each row in column 1. Its
    columns are shuffled ten times with a random permutation; after each
    `meta_fit`, the first entry of `indices_algo_to_reveal` must map back
    (through the permutation) to original column 1.
    """
    meta_learner = TopkRankMetaLearner()
    base_perfs = np.array([
        [1, 3, 2],
        [4, 6, 5],
    ])

    for _ in range(10):
        perm = np.random.permutation(base_perfs.shape[1])
        da_matrix = DAMatrix(perfs=base_perfs[:, perm])
        meta_learner.meta_fit(da_matrix)

        first_revealed = meta_learner.indices_algo_to_reveal[0]
        # Column j of the shuffled matrix is original column perm[j]
        assert perm[first_revealed] == 1
def run_3f_old():
    """Run the earlier, rejection-sampling variant of experiment '3f'.

    Four independent 0/1 columns A, B, C, D are drawn for twice the target
    number of rows, equal to 1 when a uniform sample is below 0.5 + 2e,
    0.5 + e, 0.5 - e and 0.5 - 2e respectively (e = 0.1). Rows in which A,
    B and C are all 0 are discarded and the first 20000 remaining rows are
    kept; an assertion fails when fewer rows survive.
    """
    experiment = '3f'
    n_datasets = 20000
    epsilon = 1e-1
    n_candidates = n_datasets * 2

    # Thresholds for A, B, C, D, in drawing order
    thresholds = (
        0.5 + 2 * epsilon,
        0.5 + epsilon,
        0.5 - epsilon,
        0.5 - 2 * epsilon,
    )
    columns = [(np.random.rand(n_candidates, 1) < threshold).astype(int)
               for threshold in thresholds]
    candidates = np.concatenate(columns, axis=1)

    # Keep a row as soon as at least one of A, B, C is non-zero
    keep = (candidates[:, :3] != 0).any(axis=1)
    perfs = candidates[keep][:n_datasets]
    assert len(perfs) == n_datasets

    da_matrix = DAMatrix(perfs=perfs, name=experiment)
    learners = get_the_meta_learners()
    run_expe(da_matrix, learners, name_expe=experiment)
def test_nfldamatrix():
    """Save an NFLDAMatrix, reload it with `DAMatrix.load`, print both perfs.

    The directory returned by `save()` is passed straight to `DAMatrix.load`.
    Nothing is asserted: the two matrices are only printed, original first.
    """
    original = NFLDAMatrix()
    reloaded = DAMatrix.load(original.save())

    for matrix in (original, reloaded):
        print(matrix.perfs)
from mlt.data import DAMatrix
from mlt.metric import AccuracyMetric
from mlt.metric import ArgmaxMeanMetric
from mlt.metric import EmpArgmaxMetric
from mlt.metric import AverageRankMetric

import numpy as np

### Test cases ###
# Shared fixture for the metric tests below: a 2x2 performance matrix whose
# first column is all zeros and whose second column is all ones.
# NOTE(review): presumably rows are datasets and columns are algorithms --
# confirm against DAMatrix.
perfs = np.array([
    [0, 1],
    [0, 1],
])
da_te = DAMatrix(perfs=perfs)
# Explicitly record index 1 (the all-ones column) as the best algorithm.
da_te.best_algo = 1
# Predicted distribution: 0.4 on index 0 and 0.6 on index 1. The tests below
# expect each metric to return 0.6, the weight placed on index 1.
dist_pred = np.array([0.4, 0.6])


def test_AccuracyMetric():
    """AccuracyMetric must evaluate to 0.6 on the shared fixture."""
    accuray_metric = AccuracyMetric()
    assert accuray_metric(dist_pred, da_te) == 0.6


def test_ArgmaxMeanMetric():
    """ArgmaxMeanMetric must evaluate to 0.6 on the shared fixture."""
    argmax_mean_metric = ArgmaxMeanMetric()
    assert argmax_mean_metric(dist_pred, da_te) == 0.6


def test_EmpArgmaxMetric():
def get_da_matrix_3f():
    """Build a DAMatrix from the performances saved in a text file.

    The matrix is read with `np.loadtxt` from '../results/da_matrix_4f.txt',
    a path relative to the current working directory.

    Returns:
        A DAMatrix built from the loaded performances.
    """
    loaded_perfs = np.loadtxt('../results/da_matrix_4f.txt')
    return DAMatrix(perfs=loaded_perfs)
def _states_with_zero_bits(zero_positions, n_bits):
    """List the states in ``range(2**n_bits)`` whose selected bits are all 0.

    Bit position 0 is the most significant bit, so with four bits the
    positions 0, 1, 2, 3 are the columns A, B, C, D of the state table in
    `get_multivariate_bernoulli_3f`. States are returned in increasing order.
    """
    mask = 0
    for position in zero_positions:
        mask |= 1 << (n_bits - 1 - position)
    return [state for state in range(2**n_bits) if not state & mask]


def get_multivariate_bernoulli_3f(epsilon=1e-1,
                                  n_datasets=20000,
                                  use_cvxopt=USE_CVXOPT):
    """Sample a 4-algorithm 0/1 performance matrix from a fitted joint law.

    A probability vector `x` over the 16 joint outcomes of (A, B, C, D) is
    fitted by least squares to a set of linear constraints ``A x = B`` on
    marginal and conditional probabilities, then `n_datasets` rows are
    sampled from it. Outcome `i` is the 4-bit binary expansion of `i`, with
    A as the most significant bit:

         ABCD
    x0:  0000
    x1:  0001
    x2:  0010
    x3:  0011
    x4:  0100
    x5:  0101
    x6:  0110
    x7:  0111
    x8:  1000
    x9:  1001
    x10: 1010
    x11: 1011
    x12: 1100
    x13: 1101
    x14: 1110
    x15: 1111

    Diagnostics (fitted vector, residual, two realised conditional
    probabilities) are printed to stdout.

    Args:
        epsilon: offset `e` used in the target probabilities.
        n_datasets: number of rows to sample.
        use_cvxopt: when true, solve the least-squares problem as a
            box-constrained QP with CVXOPT (`matrix`, `solvers`); otherwise
            use `minimize(..., method='SLSQP')`. Both are expected to be
            imported at module level.

    Returns:
        A DAMatrix named '3f' whose `perfs` has one row per sampled dataset
        and the columns A, B, C, D.

    Raises:
        ValueError: if the fitted vector has no positive mass and therefore
            cannot be normalised into a probability distribution.
    """
    n_algos = 4
    n_states = 2**n_algos
    e = epsilon

    # Right-hand side, one entry per row of A. Conditional constraints are
    # written as P(joint) - target * P(condition) = 0, hence the zeros.
    B = [
        0.5 - 2 * e,  # P(A=0)
        0.5 - e,  # P(B=0)
        0.5 + e,  # P(C=0)
        0.5 + 2 * e,  # P(D=0)
        0,  # P(B=0|A=0) = 0.5 + 2e
        0,  # P(C=0|A=0) = 0.5 + 2e
        0,  # P(D=0|A=0) = 0.5
        0,  # P(C=0|A=0,B=0) = 0
        0,  # P(C=0|A=0,D=0) = 0.5
        0,  # P(B=0|A=0,D=0) = 0.5
        1,  # P(all) = 1
    ]
    A = np.zeros(shape=(len(B), n_states))

    # Bit positions of the four algorithms (0 = most significant bit).
    pos_a, pos_b, pos_c, pos_d = range(n_algos)

    def zeros_at(*positions):
        return _states_with_zero_bits(positions, n_algos)

    indices_A0 = zeros_at(pos_a)
    indices_B0 = zeros_at(pos_b)
    indices_C0 = zeros_at(pos_c)
    indices_D0 = zeros_at(pos_d)
    indices_A0B0 = zeros_at(pos_a, pos_b)
    indices_A0C0 = zeros_at(pos_a, pos_c)
    indices_A0D0 = zeros_at(pos_a, pos_d)
    indices_A0B0C0 = zeros_at(pos_a, pos_b, pos_c)
    indices_A0C0D0 = zeros_at(pos_a, pos_c, pos_d)
    indices_A0B0D0 = zeros_at(pos_a, pos_b, pos_d)

    # Rows 0-3: marginal failure probabilities P(A=0), ..., P(D=0).
    marginals = [indices_A0, indices_B0, indices_C0, indices_D0]
    for row, indices in enumerate(marginals):
        A[row, indices] += 1

    factor_importance = 1
    fi = factor_importance

    # Rows 4-9: (row, joint event, conditioning event, target, row weight).
    conditional_rows = [
        (4, indices_A0B0, indices_A0, 0.5 + 2 * e, fi**2),
        (5, indices_A0C0, indices_A0, 0.5 + 2 * e, fi**2),
        # We want Greedy to choose D at step 2.
        # The encoded target is 0.5 (a variant with 0.5 + e was disabled).
        (6, indices_A0D0, indices_A0, 0.5, fi),
        # We want Mean to be perfect at step 3: P(C=0|A=0,B=0) = 0.
        (7, indices_A0B0C0, indices_A0B0, 0.0, fi),
        # We want Greedy to be bad at step 3: P(C=0|A=0,D=0) = 0.5.
        (8, indices_A0C0D0, indices_A0D0, 0.5, fi),
        # P(B=0|A=0,D=0) = 0.5.
        (9, indices_A0B0D0, indices_A0D0, 0.5, fi),
    ]
    for row, joint, given, target, weight in conditional_rows:
        A[row, joint] += 1
        A[row, given] -= target
        A[row] *= weight

    # Row 10: the probabilities of all outcomes sum to one.
    A[10, :] += 1

    if not use_cvxopt:
        # Minimise ||A x - B||^2 subject to sum(x) = 1 and 0 <= x <= 1.
        def f(x):
            y = np.dot(A, x) - B
            return np.dot(y, y)

        cons = [
            {
                'type': 'eq',
                'fun': lambda p: p.sum() - 1
            },
            LinearConstraint(A=np.eye(n_states),
                             lb=np.zeros(n_states),
                             ub=np.ones(n_states)),
        ]
        res = minimize(f,
                       np.zeros(n_states),
                       method='SLSQP',
                       constraints=cons,
                       options={'disp': False})
        x = np.array(res['x'])
    else:
        # Use CVXOPT: minimise 1/2 x'Px + q'x subject to G x <= h, i.e. the
        # same least-squares objective under the box 0 <= x <= 1. The
        # sum-to-one condition is not a hard constraint here; it only
        # enters through the last row of A and the renormalisation below.
        print(A)
        print(A.shape)
        print("np.linalg.matrix_rank(A)", np.linalg.matrix_rank(A))
        P = matrix(A.T.dot(A))
        q = matrix(-A.T.dot(B))
        G = matrix(np.concatenate([np.eye(n_states), -np.eye(n_states)]))
        h = matrix([1.0] * n_states + [0.0] * n_states)
        sol = solvers.qp(P, q, G, h)
        x = np.array(sol['x']).reshape(n_states)

    # Whatever the solver, turn the solution into a valid probability
    # vector: numerical noise can leave tiny negative entries or a sum
    # slightly off one, which np.random.choice rejects.
    x = np.where(x >= 0, x, 0.0)
    total = x.sum()
    if not total > 0:
        raise ValueError(
            "The fitted distribution has no positive mass: {}".format(x))
    x = x / total

    residu = A.dot(x) - B
    print("Ax:", A.dot(x))
    print("B:", B)
    print("x.sum()", x.sum())
    print(x >= 0)
    print(x <= 1)
    print("residu.shape:", residu.shape)
    print("residu:", residu)
    print("residu norm:", residu.dot(residu))
    print("x:", x)

    # Draw all outcomes at once, then expand each outcome index into its
    # bits, most significant (A) first.
    states = np.random.choice(n_states, size=n_datasets, p=x)
    shifts = np.arange(n_algos - 1, -1, -1)
    perfs = ((states[:, None] >> shifts) & 1).astype(int)

    name = '3f'
    da_matrix = DAMatrix(perfs=perfs, name=name)

    # Report the conditional probabilities actually realised by x.
    PA0 = sum(x[i] for i in indices_A0)
    PA0C0 = sum(x[i] for i in indices_A0C0)
    print("Real P(C=0|A=0)={}".format(PA0C0 / PA0))

    PA0D0 = sum(x[i] for i in indices_A0D0)
    print("Real P(D=0|A=0)={}".format(PA0D0 / PA0))

    return da_matrix