def gemm_recompute(A, B, thresh, s3_key):
    """
    Compute A * B.T via speculative execution (i.e., recompute straggling workers).

    Params
    ======
    A : numpywren.matrix.BigMatrix
        First input matrix.

    B : numpywren.matrix.BigMatrix
        Second input matrix.

    thresh : float (in [0, 1])
        Fraction of workers that should finish before recomputing.

    s3_key : str
        Storage key for output matrix.

    Returns
    =======
    C : matrix.BigMatrix
        Resultant matrix product.

    t_comp : float
        Time for thresh percentage of the workers to finish.

    t_straggle : float
        Time for the remaining 1 - thresh percentage of the workers to finish
        after we begin recomputing.
    """
    if not (0 <= thresh <= 1):
        raise ValueError("thresh must be in the interval [0, 1]")

    """Initialize output matrix"""
    num_col_blocks = A.shape[1] // A.shard_sizes[1]
    shard_sizes = (A.shard_sizes[0], B.shard_sizes[0])
    C = matrix.BigMatrix(s3_key,
                         shape=(A.shape[0], B.shape[0]),
                         shard_sizes=shard_sizes,
                         autosqueeze=False,
                         write_header=True)
    C.delete()  # Only needed if you reuse the same s3_key (if the blocks already exist, no work will be done here)

    """Stage 1: Compute "thresh" percentage of the results"""
    t_comp_start = time.time()
    pwex = pywren.lambda_executor()
    futures = pwex.map(lambda x: pywren_gemm(x, A, B, C, num_col_blocks), C.block_idxs)
    num_done = 0
    while num_done < thresh * len(futures):
        fs_dones, _ = pywren.wait(futures, return_when=ANY_COMPLETED)
        num_done = len(fs_dones)
    t_comp = time.time() - t_comp_start  # Total stage 1 time

    """Stage 2: Recompute straggling workers (the last 1 - thresh percent of jobs)"""
    t_straggle_start = time.time()
    futures_stragglers = pwex.map(lambda x: pywren_gemm(x, A, B, C, num_col_blocks),
                                  C.block_idxs_not_exist)
    while len(C.block_idxs_not_exist) > 0:
        pywren.wait(futures, return_when=ALWAYS)
        pywren.wait(futures_stragglers, return_when=ALWAYS)
    t_straggle = time.time() - t_straggle_start  # Total stage 2 time

    return C, t_comp, t_straggle
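# Usage sketch (added for illustration; not part of the original script). It
# assumes a configured pywren/S3 environment, the gemm_recompute and
# pywren_gemm definitions above, and hypothetical key names.
def _demo_gemm_recompute():
    import numpy as np
    from numpywren import matrix
    from numpywren.matrix_init import shard_matrix

    n, shard = 4096, 1024
    X_loc = np.random.rand(n, n)
    X = matrix.BigMatrix("gemm_recompute_demo_X", shape=(n, n),
                         shard_sizes=(shard, n), write_header=True)
    shard_matrix(X, X_loc, overwrite=True)
    # Start recomputing stragglers once 90% of the block products are in S3.
    C, t_comp, t_straggle = gemm_recompute(X, X, thresh=0.9,
                                           s3_key="gemm_recompute_demo_out")
    print("t_comp = %.1f s, t_straggle = %.1f s" % (t_comp, t_straggle))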
parser.add_argument('test_key', type=str, help="train_key") parser.add_argument('--train_labels', type=str, help="train_labels", default="y_train_fishervector.npy") parser.add_argument('--test_labels', type=str, help="test_labels", default="y_test_fishervector.npy") args = parser.parse_args() y_train = np.load(args.train_labels) y_test = np.load(args.test_labels) K_train = matrix.BigSymmetricMatrix(args.train_key, bucket="pictureweb") K_test = matrix.BigMatrix(args.test_key, bucket="pictureweb") model = matrix.BigMatrix(args.model_key, bucket="pictureweb", shape=(K_train.shape[0], int(np.max(y_train) + 1)), shard_sizes=(4096, 1000), write_header=True) config = wc.default() config['runtime']['s3_bucket'] = 'pictureweb' config['runtime'][ 's3_key'] = 'pywren.runtime/pywren_runtime-3.6-pictureweb.tar.gz' config['standalone']['sqs_queue_name'] = 'pictureweb' print("please launch some standalone instances for this script....") pwex = pywren.standalone_executor(config=config) print("Evaluating Train")
def code_2D(A, num_parity_blocks, thres=1):
    assert len(A._block_idxs(0)) % num_parity_blocks == 0
    shard_size = A.shard_sizes[0]
    coded_shape = (A.shape[0] + num_parity_blocks * A.shard_sizes[0], A.shape[1])
    coding_length = int(np.ceil(len(A._block_idxs(0)) / num_parity_blocks))
    coding_fn2D = make_coding_function2D(A, coding_length)
    coded_2D_shape = (A.shape[0] +
                      (coding_length + 1 + num_parity_blocks) * A.shard_sizes[0],
                      A.shape[1])
    A_coded_2D = matrix.BigMatrix(A.key + "CODED2D_{0}_{1}_{2}".format(
                                      A.shape[0], shard_size, num_parity_blocks),
                                  shape=coded_2D_shape,
                                  shard_sizes=A.shard_sizes,
                                  write_header=True,
                                  parent_fn=coding_fn2D)
    # if list(set(A_coded_2D.block_idxs_not_exist) - set(A.block_idxs_exist)) == []:
    #     return A_coded_2D
    last_block = max(A._block_idxs(0))
    columns = A_coded_2D._block_idxs(1)
    rows = A_coded_2D._block_idxs(0)
    to_read = []
    blocks_exist = A_coded_2D.block_idxs_exist
    for row in rows:
        if row <= last_block:
            continue
        for column in columns:
            if (row, column) in blocks_exist:
                continue
            to_read.append((row, column))
    print("Number of parity blocks", len(to_read))

    num_parities_1D = coding_length * len(A._block_idxs(1))
    to_read_phase1 = to_read[0:num_parities_1D]
    to_read_phase2 = to_read[num_parities_1D:]

    def get_block_wrapper(x):
        A_coded_2D.get_block(*x)
        return 0

    # 2D encoding of A: materialize the parity blocks in two phases
    pwex = pywren.lambda_executor()
    t_enc1 = time.time()
    futures2 = pwex.map(get_block_wrapper, to_read_phase1)
    result_count = 0
    fs_dones = []
    while result_count < thres * len(to_read_phase1):
        fs_dones, fs_notdones = pywren.wait(futures2, 2)
        result_count = len(fs_dones)
        print(result_count)
        time.sleep(3)
    for f in fs_dones:
        try:
            f.result()
        except Exception as e:
            print(e)
    t_enc1 = time.time() - t_enc1
    print("Encoding phase 1 time", t_enc1)

    t_enc2 = time.time()
    futures2 = pwex.map(get_block_wrapper, to_read_phase2)
    result_count = 0
    while result_count < thres * len(to_read_phase2):
        fs_dones, fs_notdones = pywren.wait(futures2, 2)
        result_count = len(fs_dones)
        print(result_count)
        time.sleep(3)
    for f in fs_dones:
        try:
            f.result()
        except Exception as e:
            print(e)
    t_enc2 = time.time() - t_enc2
    print("Encoding phase 2 time", t_enc2)
    print("Total ENCODING time", t_enc1 + t_enc2)
    # a = list(set(A_coded_2D.block_idxs_not_exist) - set(A.block_idxs_exist))
    # print("Still to encode", a)
    return A_coded_2D
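# Illustration (added; pure numpy, independent of the S3 machinery above). The
# 2D code arranges the row blocks of A into a grid of coding_length by
# num_parity_blocks and appends parity blocks along both grid dimensions (plus
# one parity of the parities), so a block lost to a straggler can be rebuilt
# from either its row parity or its column parity. A minimal sketch of that
# recovery, with scalars standing in for blocks:
def _demo_2d_parity_recovery():
    import numpy as np
    grid = np.random.rand(3, 4)            # 3 x 4 grid of data "blocks"
    row_parity = grid.sum(axis=1)          # one parity per grid row
    col_parity = grid.sum(axis=0)          # one parity per grid column
    lost = grid[1, 2]                      # pretend block (1, 2) never arrived
    from_row = row_parity[1] - np.delete(grid[1, :], 2).sum()
    from_col = col_parity[2] - np.delete(grid[:, 2], 1).sum()
    assert np.allclose(from_row, lost) and np.allclose(from_col, lost)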
def start_encode_mtx(M, blocks_per_parity, s3_key):
    """
    Apply a (blocks_per_parity + 1, blocks_per_parity) MDS code to the matrix M
    every blocks_per_parity rows by summing up the previous blocks_per_parity rows.

    Params
    ======
    M : numpywren.matrix.BigMatrix
        The matrix to encode.

    blocks_per_parity : int
        The number of input blocks summed up in creating each parity block.
        Note that as this number increases, less redundancy is provided.

    s3_key : str
        The storage key for Amazon S3.

    Returns
    =======
    M_coded : numpywren.matrix.BigMatrix
        The encoded matrix.

    futures : list
        List of the pywren futures.

    num_workers : int
        The number of workers.
    """
    # Useful definitions
    num_row_blocks = M.shape[0] // M.shard_sizes[0]
    num_col_blocks = M.shape[1] // M.shard_sizes[1]
    num_parity = num_row_blocks // blocks_per_parity  # Total number of parity blocks that will be added
    coded_shape = (M.shape[0] + num_parity * M.shard_sizes[0], M.shape[1])

    # Ensure no blocks will go uncoded
    if num_row_blocks % blocks_per_parity != 0:
        raise ValueError("Number of row blocks ({0}) is not divisible "
                         "by number of blocks per parity ({1})".format(
                             num_row_blocks, blocks_per_parity))

    # Create the coded matrix object
    coding_fn = make_coding_function(M, blocks_per_parity)
    M_coded = matrix.BigMatrix(s3_key,
                               shape=coded_shape,
                               shard_sizes=M.shard_sizes,
                               write_header=True,
                               parent_fn=coding_fn)
    M_coded.delete()  # Only needed if you reuse the same s3_key (if the blocks already exist, no work will be done here)

    # Generate encoding indices: one parity block after every
    # blocks_per_parity data blocks in the coded layout
    encode_idx = []
    for j in range(num_col_blocks):
        for i in range(1, num_parity + 1):
            encode_idx.append((i * (blocks_per_parity + 1) - 1, j))
    num_workers = len(encode_idx)

    # Encode the matrix
    pwex = pywren.lambda_executor()
    futures = pwex.map(lambda x: get_block_wrapper(M_coded, x), encode_idx)
    return M_coded, futures, num_workers
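# Worked example (added for clarity): the coded layout interleaves
# blocks_per_parity data blocks followed by one parity block, so the parity
# rows produced by encode_idx sit at block indices i * (blocks_per_parity + 1) - 1.
# With blocks_per_parity = 2 and num_parity = 3 that is blocks 2, 5, and 8:
def _demo_parity_indices(num_parity=3, blocks_per_parity=2):
    idx = [i * (blocks_per_parity + 1) - 1 for i in range(1, num_parity + 1)]
    assert idx == [2, 5, 8]
    return idx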
## Define number of processors to use while calculating the gradient
n_procs = 60

## Define number of parity blocks to use for coded computation
num_parity_blocks = 6  # Make num_parity_blocks close to sqrt(n_procs) for efficiency
"""
Define numpywren BigMatrix and make sure the data has been uploaded to S3
cloud storage (done using the script upload_data_to_s3)
"""
X_s3_conv = matrix.BigMatrix("logistic_synthetic_data_{0}_{1}_{2}".format(
                                 n_samples, n_features, n_procs),
                             shape=(n_samples, n_features),
                             shard_sizes=(n_samples // n_procs, n_features),
                             write_header=True)
X_s3_test = matrix.BigMatrix("logistic_epsilon_test_data_{0}_{1}".format(
                                 n_samples_test, n_features),
                             shape=(n_samples_test, n_features),
                             shard_sizes=(n_samples_test, n_features),
                             write_header=True)
y_s3_conv = matrix.BigMatrix("logistic_synthetic_data_y_{0}_{1}".format(
                                 n_samples, n_procs),
                             shape=(n_samples, ),
                             shard_sizes=(n_samples // n_procs, ),
                             write_header=True)
n_features = 3000
n_samples = 300000
n_samples_test = int(0.25 * n_samples)
X, X_test, y2, y_test = toy_logistic_data(n_samples, n_samples_test, n_features)
y = y2

## Define number of processors to use while calculating the gradient
n_procs = 60

## Define numpywren BigMatrix and upload data to S3 cloud storage
X_s3_conv = matrix.BigMatrix("logistic_synthetic_data_{0}_{1}_{2}".format(
                                 n_samples, n_features, n_procs),
                             shape=(n_samples, n_features),
                             shard_sizes=(n_samples // n_procs, n_features),
                             write_header=True)
shard_matrix(X_s3_conv, X, overwrite=True)
# Use a distinct key so the column-sharded layout does not collide with X_s3_conv
X_s3_unconv = matrix.BigMatrix("logistic_synthetic_data_unconv_{0}_{1}_{2}".format(
                                   n_samples, n_features, n_procs),
                               shape=(n_samples, n_features),
                               shard_sizes=(n_samples, int(np.ceil(n_features / n_procs))),
                               write_header=True)
shard_matrix(X_s3_unconv, X, overwrite=True)
X_s3_test = matrix.BigMatrix("logistic_epsilon_test_data_{0}_{1}".format(
                                 n_samples_test, n_features),
                             shape=(n_samples_test, n_features),
                             shard_sizes=(n_samples_test, n_features),
                             write_header=True)
pca_dpi = args.pca_sample_descs_per_image
num_sample_descs = pca_dpi * pca_sample_images

sifts_hash = utils.hash_string(
    utils.hash_args((train_keys, args.pca_dim, pca_sample_images,
                     args.pca_sample_descs_per_image, args.random_seed,
                     args.pca_dim)) +
    utils.hash_function(calculate_sifts) + utils.hash_function(sift.sift))

lcs_hash = utils.hash_string(
    utils.hash_args((train_keys, args.pca_dim, pca_sample_images,
                     args.pca_sample_descs_per_image, args.random_seed,
                     args.pca_dim)) +
    utils.hash_function(calculate_lcs) + utils.hash_function(lcs.lcs))

sift_sample_descs = matrix.BigMatrix(sifts_hash,
                                     shape=(num_sample_descs, SIFT_DESC_LENGTH),
                                     shard_sizes=(pca_dpi**2, SIFT_DESC_LENGTH),
                                     write_header=True)
lcs_sample_descs = matrix.BigMatrix(lcs_hash,
                                    shape=(num_sample_descs, LCS_DESC_LENGTH),
                                    shard_sizes=(pca_dpi**2, LCS_DESC_LENGTH),
                                    write_header=True)

block_idxs_not_exist = sift_sample_descs.block_idxs_not_exist
print("Sample Descs Blocks not exist", len(block_idxs_not_exist))
print("Sample Descs Blocks total", len(sift_sample_descs.block_idxs))
pca_sample_train_keys = train_keys[idxs_sample]
chunked_train_keys = list(utils.chunk(pca_sample_train_keys, pca_dpi))
import numpy as np
from numpywren import matrix, matrix_utils
from numpywren import binops
from numpywren.matrix_init import shard_matrix
from OverSketch import OverSketchFunc

m = 2000
n = 10000
b = 1000
l = 3000
d = int(4 * b)

A_loc = np.asarray(range(m * n)).reshape(m, n)
B_loc = np.random.rand(n, l)

A = matrix.BigMatrix("oversketch_A_{0}_{1}_{2}".format(m, n, b),
                     shape=(m, n),
                     shard_sizes=(b, n),
                     write_header=True)
shard_matrix(A, A_loc)

B = matrix.BigMatrix("oversketch_B_{0}_{1}_{2}".format(n, l, b),
                     shape=(n, l),
                     shard_sizes=(n, b),
                     write_header=True)
shard_matrix(B, B_loc)
print("A and B done")

AB = OverSketchFunc(A, B, d)
print("OverSketch done")
c = AB.numpy()
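# Sanity check (added; optional). OverSketch returns an approximation of A * B,
# so compare against the exact local product with a relative Frobenius-norm
# error rather than exact equality.
c_exact = A_loc.dot(B_loc)
rel_err = np.linalg.norm(c - c_exact) / np.linalg.norm(c_exact)
print("relative Frobenius error:", rel_err)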
def OverSketchFunc(A, B, d, thres=0.95):
    m = A.shape[0]
    n = A.shape[1]
    l = B.shape[1]
    b = A.shard_sizes[0]
    assert d % b == 0
    assert m % b == 0
    assert l % b == 0
    assert b == B.shard_sizes[1]
    N = int(d / b)  # Number of independent count-sketches

    sketch_A = matrix.BigMatrix("sketch_A_{0}_{1}".format(m, d),
                                shape=(m, d),
                                shard_sizes=(b, b))
    sketch_BT = matrix.BigMatrix("sketch_B_{0}_{1}".format(l, d),
                                 shape=(l, d),
                                 shard_sizes=(b, b))

    hashes = np.random.randint(0, b, size=(N, n))
    flips = np.random.choice([-1, 1], size=(N, n))

    def OverSketchMatrix(id, X, hashes, flips, b, sketch):
        """Calculates OverSketch AS for a row-block of a fat matrix A with block-size b"""
        x, y = id
        A = X.get_block(x, 0)
        m, n = A.shape
        hash_local = hashes[y, :]
        flip_local = flips[y, :]
        sketch_block = np.zeros((m, b))
        for i in range(n):
            sketch_block[:, hash_local[i]] += flip_local[i] * A[:, i]
        sketch.put_block(sketch_block, x, y)
        return 0

    pwex = pywren.lambda_executor()
    t1 = time.time()
    futuresA = pwex.map(lambda x: OverSketchMatrix(x, A, hashes, flips, b, sketch_A),
                        sketch_A.block_idxs)
    futuresB = pwex.map(lambda x: OverSketchMatrix(x, B.T, hashes, flips, b, sketch_BT),
                        sketch_BT.block_idxs)
    fs_donesA = pywren.wait(futuresA, 2)[0]
    fs_donesB = pywren.wait(futuresB, 2)[0]
    # Keep polling until *both* sketches are at least thres complete
    while len(fs_donesA) < thres * len(futuresA) or len(fs_donesB) < thres * len(futuresB):
        fs_donesA = pywren.wait(futuresA, 2)[0]
        fs_donesB = pywren.wait(futuresB, 2)[0]
    print("Sketching time", time.time() - t1)

    ## Computation phase
    def blockMatMul(A, B, tensorAB, id):
        """Multiplies A and B.T in a blocked fashion"""
        i, j, k = id
        X = A.get_block(i, k)
        Y = B.get_block(j, k)
        tensorAB[k].put_block(X.dot(Y.T), i, j)
        return 0

    tensorAB = []
    for x in range(N):
        tensorAB.append(matrix.BigMatrix("AxB_outer_{0}_{1}_{2}".format(m, l, x),
                                         shape=(m, l),
                                         shard_sizes=(b, b)))

    computeArr = [(i, j, k) for (i, k) in sketch_A.block_idxs
                  for j in sketch_BT._block_idxs(0)]

    t1 = time.time()
    futures = pwex.map(lambda x: blockMatMul(sketch_A, sketch_BT, tensorAB, x),
                       computeArr)
    fs_dones = pywren.wait(futures, 2)[0]
    while len(fs_dones) < thres * len(futures):
        fs_dones = pywren.wait(futures, 2)[0]
    print("Computation time", time.time() - t1)

    ## Reduction phase
    def blockMatMulReduction(tensorAB, AB, id):
        """
        Reduces the output from the computation phase to get A*B.
        Variable 'count' keeps track of the number of blocks that have returned.
        """
        i, j = id
        X = None
        count = 1  # Accounts for the first block that returns successfully
        for k in range(N):
            if X is None:
                try:
                    X = tensorAB[k].get_block(i, j)
                except Exception as e:
                    print(e)
            else:
                try:
                    X = X + tensorAB[k].get_block(i, j)
                    count = count + 1
                except Exception as e:
                    print(e)
        AB.put_block(X / count, i, j)  # Average the sketches that returned
        return 0

    AB = matrix.BigMatrix("AxB_{0}_{1}".format(m, l),
                          shape=(m, l),
                          shard_sizes=(b, b))
    reduceArr = [(i, j) for i in sketch_A._block_idxs(0)
                 for j in sketch_BT._block_idxs(0)]

    t1 = time.time()
    futures_red = pwex.map(lambda x: blockMatMulReduction(tensorAB, AB, x),
                           reduceArr)
    fs_dones = pywren.wait(futures_red)[0]
    print("Reduction time", time.time() - t1)
    return AB
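# Illustration (added; pure numpy). Each (hashes, flips) row above defines a
# count-sketch: column i of the input is added into bucket hashes[i] with sign
# flips[i], which is exactly right-multiplication by a sparse n x b matrix S
# with one +/-1 entry per row. A single sketch (A S)(B S)^T is an unbiased
# estimate of A B^T, and blockMatMulReduction averages the N copies that return.
def _demo_count_sketch(m=20, n=500, l=15, b=50, seed=0):
    import numpy as np
    rng = np.random.RandomState(seed)
    A = rng.rand(m, n)
    B = rng.rand(l, n)
    hashes = rng.randint(0, b, size=n)
    flips = rng.choice([-1, 1], size=n)
    S = np.zeros((n, b))
    S[np.arange(n), hashes] = flips        # one signed entry per row
    approx = (A @ S) @ (B @ S).T
    exact = A @ B.T
    print("single-sketch relative error:",
          np.linalg.norm(approx - exact) / np.linalg.norm(exact))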
def gemm_coded(A,
               B,
               blocks_per_parity,
               s3_key,
               completion_pct=.7,
               encode_A=True,
               encode_B=True,
               np_A=-1,
               np_B=-1):
    """
    Compute A * B.T using a locally recoverable product code for redundancy.

    Params
    ======
    A : numpywren.matrix.BigMatrix
        First input matrix.

    B : numpywren.matrix.BigMatrix
        Second input matrix.

    blocks_per_parity : int
        Number of blocks to sum up when creating each parity block.

    s3_key : str
        Storage key for output matrix.

    completion_pct : float
        The fraction of multiplication workers that must finish before moving
        on to decoding.

    encode_A : bool
        Whether or not A needs to be encoded. Allows the user to pre-encode A
        if it will be used multiple times.

    encode_B : bool
        Whether or not B needs to be encoded. Allows the user to pre-encode B
        if it will be used multiple times.

    np_A : int
        Number of parity blocks in the matrix A. Should be provided if and
        only if encode_A is set to False.

    np_B : int
        Number of parity blocks in the matrix B. Should be provided if and
        only if encode_B is set to False.

    Returns
    =======
    C : numpywren.matrix.BigMatrix
        Resultant matrix product.

    t_enc : float
        Encoding time.

    t_comp : float
        Computation time.

    t_dec : float
        Decoding time.
    """
    if (not encode_A) and np_A == -1:
        raise ValueError("You must provide the number of parity blocks in A if you pre-encoded it.")
    if (not encode_B) and np_B == -1:
        raise ValueError("You must provide the number of parity blocks in B if you pre-encoded it.")

    """Stage 1: Encoding"""
    start = time.time()
    if encode_A or encode_B:
        # Spin up encoding workers
        num_workers = 0
        if encode_A:
            A_coded, futures_encode_A, num_workers_A = start_encode_mtx(A, blocks_per_parity, "A_coded")
            num_workers += num_workers_A
        if encode_B:
            B_coded, futures_encode_B, num_workers_B = start_encode_mtx(B, blocks_per_parity, "B_coded")
            num_workers += num_workers_B

        # Wait until enough encoding workers are done to move on
        num_done = 0
        while num_done < MIN_ENCODING_COMPLETION_PCT * num_workers:
            fs_A, fs_B = [], []
            if encode_A:
                fs_A, _ = pywren.wait(futures_encode_A, return_when=ANY_COMPLETED)
            if encode_B:
                fs_B, _ = pywren.wait(futures_encode_B, return_when=ANY_COMPLETED)
            num_done = len(fs_A) + len(fs_B)
    if not encode_A:
        A_coded = A
    if not encode_B:
        B_coded = B
    t_enc = time.time() - start  # Total encoding time

    """Intermediate step: Initialize output matrix (untimed for consistency with gemm_recompute)."""
    # Determine coded dimensions of A, B
    if encode_A:
        num_parity_A = (A.shape[0] // A.shard_sizes[0]) // blocks_per_parity
        coded_shape_A = (A.shape[0] + num_parity_A * A.shard_sizes[0], A.shape[1])
    else:
        num_parity_A = np_A
        coded_shape_A = A_coded.shape
    if encode_B:
        num_parity_B = (B.shape[0] // B.shard_sizes[0]) // blocks_per_parity
        coded_shape_B = (B.shape[0] + num_parity_B * B.shard_sizes[0], B.shape[1])
    else:
        num_parity_B = np_B
        coded_shape_B = B_coded.shape

    # Create (encoded) output matrix
    shard_sizes_C = (A.shard_sizes[0], B.shard_sizes[0])
    C_coded = matrix.BigMatrix(s3_key + "coded",
                               shape=(A_coded.shape[0], B_coded.shape[0]),
                               shard_sizes=shard_sizes_C,
                               autosqueeze=False,
                               write_header=True)
    C_coded.delete()  # Only needed if you reuse the same s3_key (if the blocks already exist, no work will be done here)

    # Generate indices for the output matrix
    num_row_blocks_C = C_coded.shape[0] // C_coded.shard_sizes[0]
    num_col_blocks_C = C_coded.shape[1] // C_coded.shard_sizes[1]
    num_cols_coded = A_coded.shape[1] // A_coded.shard_sizes[1]  # Inner dimension of the coded multiplication
    block_idx_C = C_coded.block_idxs
    num_workers = len(block_idx_C)
    np.random.shuffle(block_idx_C)  # Randomize jobs to avoid bad straggler locality

    """Stage 2: Multiply"""
    t_comp_start = time.time()
    pwex = pywren.lambda_executor()
    futures_matmul = pwex.map(lambda x: pywren_gemm(x, A_coded, B_coded, C_coded, num_cols_coded),
                              block_idx_C)
    fs_done_matmul, num_done = [], 0
    while num_done < completion_pct * num_workers:
        fs_done_matmul, _ = pywren.wait(futures_matmul, return_when=ANY_COMPLETED)
        num_done = len(fs_done_matmul)
    t_comp = time.time() - t_comp_start  # Total stage 2 time

    """Stage 3: Decoding"""
    t_dec_start = time.time()
    decode_idx = [(i, j) for i in range(num_parity_A) for j in range(num_parity_B)]
    num_workers = len(decode_idx)
    futures_decode = pwex.map(lambda x: decode_gemm(num_row_blocks_C, num_parity_A, C_coded, x),
                              decode_idx)
    fs_done_decode, num_done = [], 0
    while num_done < num_workers and len(C_coded.block_idxs_not_exist) > 0:
        fs_done_decode, _ = pywren.wait(futures_decode, return_when=ANY_COMPLETED)
        num_done = len(fs_done_decode)
    t_dec = time.time() - t_dec_start  # Total stage 3 time

    """Final step: Specify the systematic part (i.e., all non-parity blocks) of the result"""
    # Determine output dimensions
    if encode_A:
        C_num_rows = A.shape[0]
    else:
        C_num_rows = A.shape[0] - np_A * A.shard_sizes[0]
    if encode_B:
        C_num_cols = B.shape[0]
    else:
        C_num_cols = B.shape[0] - np_B * B.shard_sizes[0]

    # Create the output matrix containing only the systematic part of the result
    get_systematic_part = systematicize(C_coded, blocks_per_parity)
    C_shard_sizes = (A.shard_sizes[0], B.shard_sizes[0])
    C = matrix.BigMatrix(s3_key,
                         shape=(C_num_rows, C_num_cols),
                         shard_sizes=C_shard_sizes,
                         parent_fn=get_systematic_part)
    C.delete()  # Only needed if you reuse the same s3_key (if the blocks already exist, no work will be done here)

    return C, t_enc, t_comp, t_dec
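# Usage sketch (added for illustration; assumes a configured pywren/S3
# environment, the helpers above, and hypothetical key names).
def _demo_gemm_coded():
    import numpy as np
    from numpywren import matrix
    from numpywren.matrix_init import shard_matrix

    n, shard = 4096, 512
    X_loc = np.random.rand(n, n)
    X = matrix.BigMatrix("gemm_coded_demo_X", shape=(n, n),
                         shard_sizes=(shard, n), write_header=True)
    shard_matrix(X, X_loc, overwrite=True)
    # 8 row blocks with 4 blocks per parity -> 2 parity blocks per input.
    C, t_enc, t_comp, t_dec = gemm_coded(X, X, blocks_per_parity=4,
                                         s3_key="gemm_coded_demo_out")
    print("encode %.1f s, multiply %.1f s, decode %.1f s" % (t_enc, t_comp, t_dec))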