def __init__(self, args):
    """Assemble the training graph, session, checkpoint saver and log files.

    ``args`` is stored on the instance and is also mutated in place:
    ``sample_size`` is reduced by half of the receptive field, and an
    ``l2_regularization_strength`` of 0 is replaced by ``None``.

    If ``validate_directories`` raises ``ValueError`` the message is printed
    and construction stops early, leaving every later attribute unset.
    Any failure while restoring a checkpoint is reported and re-raised.
    """
    self.down_sampling_rates = [16, 16]
    self.kernel_size = 2
    self.args = args

    # Two back-to-back copies of the dilation ladder 1, 2, ..., 2**wl.
    single_stack = [2 ** exponent for exponent in range(args.wl + 1)]
    self.dilation_rates = single_stack + single_stack
    self.receptive_field = self.calc_pad(self.args)

    try:
        resolved_dirs = self.validate_directories(self.args)
    except ValueError as err:
        print("Some arguments are wrong:")
        print(str(err))
        return
    self.logdir = resolved_dirs['logdir']

    # Shrink the sample size by half of the receptive field.
    self.args.sample_size -= self.receptive_field // 2

    # Create network.
    self.net_train = SampleTransformer(
        self.down_sampling_rates,
        self.dilation_rates,
        self.kernel_size,
        self.receptive_field,
        self.args,
    )

    # Input pipeline.
    with tf.name_scope('create_inputs'):
        # A threshold that does not exceed EPSILON switches silence
        # trimming off (None is handed to the reader instead).
        if self.args.silence_threshold > EPSILON:
            trim_threshold = self.args.silence_threshold
        else:
            trim_threshold = None
        uses_global_condition = self.args.gc_channels is not None
        self.reader = AudioReader(
            args.data_dir,
            sample_rate=0,
            batch_size=self.args.batch_size,
            gc_enabled=uses_global_condition,
            receptive_field=self.receptive_field,  # TODO: change receptive field
            sample_size=self.args.sample_size,
            silence_threshold=trim_threshold)
        self.audio_batch, self.begin = self.reader.get_input_placeholder()
        self.trainData_iter = self.reader.get_data_iterator('train')
        self.valData_iter = self.reader.get_data_iterator('val')

    # A strength of exactly 0 means "no L2 regularization".
    if args.l2_regularization_strength == 0:
        args.l2_regularization_strength = None

    self.g_step = tf.placeholder(dtype=tf.int32, shape=None, name='step')
    self.lr = tf.placeholder(
        dtype=tf.float32, shape=None, name='learning_rate')

    # Training loss (fourth positional argument True).
    self.loss_train = self.net_train.loss(
        self.audio_batch,
        self.begin,
        self.g_step,
        True,
        l2_regularization_strength=args.l2_regularization_strength)

    bs.clear_bst_constants()
    trainables = tf.trainable_variables()
    gradients = bs.gradients(self.loss_train, trainables)
    self.global_norm, self.norm_scale = bs.clip_by_global_norm(
        gradients, grad_scale=1.0, clip_norm=1.0)
    optimizer = bs.AdamOptimizer(
        learning_rate=self.lr,
        norm_scale=self.norm_scale,
        grad_scale=1.0,
        fp16=False)
    self.train_op = optimizer.apply_gradients(zip(gradients, trainables))

    # Validation loss: same network object, fourth argument False.
    self.loss_val = self.net_train.loss(
        self.audio_batch,
        self.begin,
        self.g_step,
        False,
        l2_regularization_strength=args.l2_regularization_strength)

    # Fetch the already-created 'mem' variable so it is checkpointed too.
    # The scope name (spelled 'memroy') has to match the one it was created
    # under, so it must not be "corrected" here alone.
    with tf.variable_scope('memroy', reuse=True):
        mem_variable = tf.get_variable('mem')

    self.sess = tf.Session(
        config=tf.ConfigProto(log_device_placement=False))
    self.sess.run(tf.global_variables_initializer())

    checkpointed_vars = tf.trainable_variables() + [mem_variable]
    self.saver = tf.train.Saver(
        var_list=checkpointed_vars, max_to_keep=args.max_checkpoints)

    try:
        self.saved_global_step, self.best_val_loss = load(
            self.saver, self.sess, self.logdir, self.args.load_type)
        if self.saved_global_step is None:
            # Nothing was restored: start counting from step 0 with no
            # best validation loss recorded yet.
            self.saved_global_step = 0
            self.best_val_loss = np.inf
    except:
        # Deliberately catch-all: the error is always re-raised below.
        print(
            "Something went wrong while restoring checkpoint. "
            "We will terminate training to avoid accidentally overwriting "
            "the previous model.")
        raise

    self.summary_writer = tf.summary.FileWriter(
        os.path.join(self.logdir, STARTED_DATESTRING))

    # Both text files use the same mode, chosen by whether log.txt exists.
    log_path = self.logdir + '/log.txt'
    config_path = self.logdir + '/config.txt'
    if os.path.exists(log_path):
        file_mode = 'a'
    else:
        file_mode = 'w'

    # The log file stays open for the lifetime of the instance.
    self.log_file = open(log_path, file_mode)
    with open(config_path, file_mode) as config_file:
        config_file.write(STARTED_DATESTRING + '\n\n')
        config_file.writelines(
            '{}: {}\n'.format(option, getattr(self.args, option))
            for option in vars(self.args))
def model(xs, ys, loss_scale=None, train=False):
    """Build the transformer graph and return its loss (and training ops).

    Hyper-parameters are read from the module-level ``hps`` object and the
    MPI world size from the module-level ``mpi_size``.

    Args:
        xs: input indices, looked up in the ``embed/x`` embedding table.
        ys: target labels for the softmax cross-entropy.
        loss_scale: loss scaling factor. Only read when ``train`` is true and
            ``hps.float16`` is set; in that case it must not be ``None``.
        train: when true, variables are created (``reuse=False``), dropout and
            recompute are enabled, and optimizer ops are built. When false,
            existing variables are reused and only the loss is built.

    Returns:
        ``loss`` when ``train`` is false. Otherwise the tuple
        ``(loss, train_and_step_op, global_norm, norm_scale)``.
    """
    with tf.variable_scope("model", reuse=not train):

        with tf.device("/cpu:0"):
            if train:
                grad_scale = tf.reciprocal(loss_scale) if hps.float16 else 1.0
                global_step = tf.get_variable(
                    "global_step", [],
                    initializer=tf.ones_initializer(),
                    trainable=False)
                # Linear warm-up from lr / warmup_iters up to lr.
                learning_rate = tf.minimum(
                    global_step * (1.0 / hps.warmup_iters), 1.0) * hps.lr
            mpi_scale = tf.constant(1.0 / mpi_size)

        with tf.device("/gpu:0"):

            # Contains scope/var_name substrings we use to group gradients
            # for all reduce. You'll want to find groupings that are
            # scheduled uniquely by tensorflow, otherwise bs.allreduce could
            # hang. The groups should be ordered in which the all-reduce is
            # called. Any gradients not matching the substrings will get
            # appended to the last group.
            grad_groups = []

            # Embed discrete inputs to continuous space and add learned
            # position embeddings.
            with tf.variable_scope('embed'):
                x_embed = tf.get_variable(
                    "x", [hps.n_vocab, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.02))
                p_embed = tf.get_variable(
                    'pos', [1, hps.n_timesteps, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.01))

                if hps.float16:
                    x_embed = bs.float_cast(
                        x_embed, dtype=tf.float16, dx_dtype=tf.float16)
                    p_embed = bs.float_cast(
                        p_embed, dtype=tf.float16, dx_dtype=tf.float16)

                # bs.embedding_lookup can be much faster than tf version for
                # low entropy indexes or small vocabs.
                x = bs.embedding_lookup(x_embed, xs)

                if train and hps.embed_pdrop > 0.0:
                    # This part of the code is not recomputed so no need to
                    # remember the generated mask returned by bs.dropout.
                    x, _ = bs.dropout(x, keep_prob=1.0 - hps.embed_pdrop)
                    p_embed, _ = bs.dropout(
                        p_embed, keep_prob=1.0 - hps.embed_pdrop)

                h = x + p_embed
                # Groups are prepended, so the list ends up ordered from the
                # last layer back to the embedding.
                grad_groups.insert(0, 'embed')

            for l in range(hps.n_layer):
                layer_name = 'layer_%d' % l
                # Enable the recompute decorator in training; see
                # blocksparse/grads.py if you want to understand how this
                # works.
                h = transformer_block(
                    h, layer_name, train=train,
                    recompute=train and hps.recompute)
                grad_groups.insert(0, layer_name)

            # Flatten to [-1, n_state] and project onto the (transposed)
            # input embedding matrix, i.e. output weights are tied to it.
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            if hps.float16:
                # Much faster and more memory efficient (but currently only
                # implemented in fp16).
                loss = bs.softmax_cross_entropy(logits=logits, labels=ys)
            else:
                labels = tf.cast(tf.reshape(ys, [-1]), tf.int32)
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=labels)

            loss = tf.reduce_mean(loss)

            if not train:
                if mpi_size > 1:
                    loss = bs.allreduce(bs.scale_tensor(loss, mpi_scale))
                return loss

            # Apply loss scaling in fp16 mode.
            if hps.float16:
                grad_loss = bs.scale_tensor(loss, loss_scale)
            else:
                grad_loss = loss

            # Use bs.gradients to allow bs.recomputable decorators to work.
            params = tf.trainable_variables()
            grads = bs.gradients(grad_loss, params)

            if mpi_size > 1:
                # Apply (1.0 / mpi_size) scaling prior to all_reduce to allow
                # greater utilization of fp16 dynamic range. That is, we're
                # ok with flushing some small values to zero to allow growth
                # of large values in allreduce (without hitting inf).
                loss = bs.scale_tensor(loss, mpi_scale)
                grads = [bs.scale_tensor(g, mpi_scale) for g in grads]

                # Allreduce in an mpi context. Bias and gain grads will be in
                # fp32, but have them fp16 cast prior to allreduce.
                # Fixed: this previously read `H.float16`, the only access in
                # this function that did not go through `hps`.
                cast_all = tf.float16 if hps.float16 else None
                loss = bs.allreduce(loss)
                grads = bs.group_allreduce(
                    grads, params,
                    search_strings=grad_groups, cast_all=cast_all)

            # This does not actually perform the clipping, only measures the
            # norm_scale needed to be applied. norm_scale is then later
            # applied in the fused optimizer ops (eliminating an extra pass
            # over the gradients). norm_scale is also used to detect inf/nan
            # values in any of the gradients so the whole update can be
            # skipped and tried again with a new loss_scale.
            global_norm, norm_scale = bs.clip_by_global_norm(
                grads, grad_scale=grad_scale, clip_norm=hps.clip_norm)

            # Apply AdamOptimizer. fp16 mode is a special feature to store
            # running mean and variance variables in custom fp16 formats.
            # Using this mode should incur no loss in accuracy and save a
            # lot of memory in your model. For further memory savings
            # consider using bs.AdafactorOptimizer.
            adam = bs.AdamOptimizer(
                learning_rate=learning_rate,
                norm_scale=norm_scale,
                grad_scale=grad_scale,
                fp16=hps.float16)
            train_op = adam.apply_gradients(zip(grads, params))

            # Update global step after we're done using it for this update.
            with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
                update_op = tf.assign_add(global_step, 1.0)

            return loss, tf.group(train_op, update_op), global_norm, norm_scale
def testBlocksparseMatMul(self):
    """Compare bs.BlocksparseMatMul on the GPU with its NumPy reference.

    For every (block size, feature axis), minibatch size N and dtype, the
    op is applied ``depth`` times and its output plus the gradients w.r.t.
    the input and the weights are fetched. Unless ``bench`` is set, the same
    quantities are recomputed with the ``*_test`` reference methods and the
    relative max and L2 errors are printed.

    Nothing is asserted (the ``assertAllClose`` call is commented out), so
    the method only fails if an op raises.

    Reads names not defined in this method: ``conf``, ``one``, ``dtypes``,
    ``l2norm``, ``depth``, ``bench`` and ``out`` (presumably module-level
    switches; confirm against the rest of the file).
    """
    # layout = np.zeros((2,2), dtype=np.int32)
    # layout[0,0] = 1

    # Block layout: adjacency matrix of a Barabasi-Albert graph plus the
    # diagonal, with the top-left m x m corner forced dense.
    n, m = 160, 5
    layout = networkx.generators.barabasi_albert_graph(n, m)
    #layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .5)
    layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
    layout[0:m,0:m] = 1
    #layout[0:60,0:60] = 1

    #layout = np.zeros((4,4), dtype=np.int32)
    #layout = np.ones((4,4), dtype=np.int32)
    #layout[0,0] = 1
    #layout = np.ones((1,1), dtype=np.int32)

    blocks = layout.sum()
    n = layout.shape[0]
    # Percentage of blocks present, then max/min blocks per column.
    print(100 * blocks / n**2)
    print(layout.sum(axis=0).max(), layout.sum(axis=0).min())
    #exit()

    with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

        for bsize, axis in ( (32,0), (16,0), (8,0), ): # (32,1), (32,0), (16,0), (8,0)

            bsmm = bs.BlocksparseMatMul(layout, block_size=bsize, feature_axis=axis, name="test")

            if one:
                # Deterministic weights: every block is an identity matrix.
                W = np.ones(bsmm.w_shape, dtype=np.float32)
                for w in range(bsmm.blocks):
                    #c, k = bsmm.block_coord(w)
                    #if c == k:
                    W[w] = np.eye(bsmm.bsize, dtype=np.float32)
                # W = np.ones(bsmm.w_shape, dtype=np.float32)
                # W[:] += np.arange(32, dtype=np.float32).reshape(1,1,32)
            else:
                # Random weights, rounded through float16 so the float32
                # copy holds values that are exactly representable in fp16.
                # W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float16).astype(np.float32)
                W = np.random.normal(loc=0.0, scale=0.01, size=bsmm.w_shape).astype(np.float16).astype(np.float32)

            # WW = np.zeros((bsmm.C, bsmm.K), dtype=np.float32)
            # for w, (c, k) in enumerate(bsmm.updat_list):
            #     WW[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] = W[w,:,:]

            # Note: `w` is rebound here from the loop index to the tensor.
            w = tf.constant(W)

            # s1 = sess.run( bsmm.identity_init(gpu=True)(bsmm.w_shape) )
            # s2 = bsmm.identity_init(gpu=False)(bsmm.w_shape)
            # print("identity_init: ", (s1 - s2).max())
            # exit()

            for N in (256,128,64,32,16,8,): # 128,64,32,16,1, 256,512,1024,2048,4096, 256,1024,4096,16384

                if one:
                    X = np.ones(bsmm.i_shape(N), dtype=np.float32)
                    E = np.ones(bsmm.o_shape(N), dtype=np.float32)
                    # X = np.eye(bsmm.bsize, dtype=np.float32)
                    # E = np.arange(X.size, dtype=np.float32).reshape(X.shape)
                    # X[:] += np.arange(X.size, dtype=np.float32).reshape(X.shape)
                    # X[:] += np.arange(32, dtype=np.float32).reshape(32,1)
                    # E[:] += np.arange(16, dtype=np.float32).reshape(1,32)
                    # X[:] += np.arange(64, dtype=np.float32).reshape(1,64)
                    # E[:] += np.arange(64, dtype=np.float32).reshape(1,64)
                else:
                    # X = np.random.uniform(0.0, 10.0, bsmm.i_shape(N)).astype(np.float16).astype(np.float32)
                    # E = np.random.uniform(0.0, 10.0, bsmm.o_shape(N)).astype(np.float16).astype(np.float32)
                    X = np.random.normal(loc=0.0, scale=0.1, size=bsmm.i_shape(N)).astype(np.float16).astype(np.float32)
                    E = np.random.normal(loc=0.0, scale=0.1, size=bsmm.o_shape(N)).astype(np.float16).astype(np.float32)

                # X is the input, E the gradient fed in at the output.
                x = tf.constant(X)
                e = tf.constant(E)

                for dtype in dtypes:

                    print("Axis:%d Bsize:%2d N:%d dtype:%s Params:%d" % (axis, bsize, N, dtype.name, bsize*bsize*blocks))

                    # compute in tensorflow
                    if l2norm:
                        w2 = bsmm.l2_normalize(w, dtype=dtype)
                    else:
                        w2 = bs.float_cast(w, dtype=dtype)

                    y = bs.float_cast(x, dtype=dtype)

                    # Chain the op `depth` times; only the last application
                    # receives a non-zero bench repeat count.
                    for j in range(depth):
                        repeat = bench if bench and j==depth-1 else 0
                        y = bsmm(y, w2, bench=repeat) # (bench and j==depth-1) (bench and j==0)

                    y = bs.float_cast(y, dtype=tf.float32)

                    #if bench: sess.run( y )
                    #y = sess.run( y )

                    # Build the gradient ops under a control dependency on
                    # the forward op.
                    with tf.control_dependencies([y.op]):
                        d = bs.gradients(y, [x, w], e)
                    # NOTE(review): the original indentation was lost; this
                    # `if` is assumed to sit outside the `with` above.
                    if depth > 1:
                        d[1] = bs.group_param_grads(d[1], 8)

                    sess.run(tf.global_variables_initializer())
                    #y, = sess.run( [y] )
                    # `y` is rebound from the tensor to its fetched value.
                    y, (dx, dw) = sess.run( [y, d ] )

                    if not bench:
                        # compute in numpy
                        if l2norm:
                            W2 = bsmm.l2_normalize_test(W)
                        else:
                            W2 = W

                        # Forward: keep every intermediate activation so the
                        # backward pass can pop them in reverse order.
                        Ys = [X]
                        for j in range(depth):
                            Ys.append(bsmm.fprop_test(Ys[-1], W2))
                        Y = Ys.pop()

                        # Backward: accumulate weight gradients over all
                        # applications and propagate the error to the input.
                        DW = np.zeros(bsmm.w_shape, dtype=np.float32)
                        DX = E
                        for j in range(depth):
                            DW += bsmm.updat_test(Ys.pop(), DX)
                            DX = bsmm.bprop_test(DX, W2)
                        if l2norm:
                            DW = bsmm.l2_normalize_grad_test(W, DW)

                        for op, cpuA, devA in (
                            (" y:", Y, y),
                            ("dx:", DX, dx),
                            ("dw:", DW, dw),
                        ):
                            # max_err: largest absolute difference divided by
                            # the mean absolute reference value (left
                            # undivided when that mean is 0).
                            # l2_err: ratio of L2 norms; no zero guard here.
                            difA = abs(cpuA - devA)
                            avgval = np.average(abs(cpuA))
                            maxdif = difA.max()
                            max_err = maxdif if avgval == 0 else maxdif / avgval
                            l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt(np.square(cpuA).sum())

                            #print("max_err: %5.3f, max_val: %7.3f, l1_err: %7.5f, l2_err: %7.5f" % (difO.max(), cpuO.max(), l1_err, l2_err))
                            print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100*max_err, l2_err))

                            # rtol = 1e-4 if dtF is tf.float32 else 1e-1
                            # self.assertAllClose(devA, cpuA, rtol=rtol, atol=rtol)

                            # Debug dump: write the first compared tensor to
                            # text files and terminate the process.
                            if out:
                                np.savetxt("out.txt", difA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                np.savetxt("outC.txt", cpuA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                np.savetxt("outD.txt", devA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                exit()
                        # NOTE(review): indentation was lost in this file;
                        # this blank-line separator is assumed to follow the
                        # comparison loop.
                        print("")
def atestSparseProj(self):
    """Print the error of bs.SparseProj ops against NumPy references.

    Covers ``gather``, ``scatter_add`` and ``scatter_mul`` plus their
    gradients. Nothing is asserted; per-tensor relative max and L2 errors
    are printed. When the external flag ``out`` is truthy, the first
    compared tensor is dumped to text files and the process exits.

    Reads ``conf``, ``one`` and ``out`` from outside this method.
    """
    nhidden = 1024*8
    nproj = 1024
    N = 64
    hidden_shape = (nhidden, N)
    proj_shape = (nproj, N)

    def make_input(shape):
        # All-ones in `one` mode, otherwise uniform in [-1, 1).
        if one:
            return np.ones(shape, dtype=np.float32)
        return np.random.uniform(-1.0, 1.0, shape).astype(np.float32)

    def report(label, expected, actual):
        # Relative max error (left undivided when the mean is 0), L2 ratio.
        abs_diff = abs(expected - actual)
        mean_abs = np.average(abs(expected))
        worst = abs_diff.max()
        if mean_abs == 0:
            max_err = worst
        else:
            max_err = worst / mean_abs
        l2_err = np.sqrt(np.square(abs_diff).sum()) / np.sqrt(np.square(expected).sum())
        print("%s max_err%%:%11.8f L2_err: %12.10f" % (label, 100*max_err, l2_err))
        if out:
            np.savetxt("out.txt", abs_diff, fmt='%5.1f')
            np.savetxt("outC.txt", expected, fmt='%5.1f')
            np.savetxt("outD.txt", actual, fmt='%5.1f')
            exit()

    with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

        # Draw order (x, y, error-x, error-y) is kept so the random stream
        # is consumed exactly as before.
        host_x = make_input(hidden_shape)
        host_y = make_input(proj_shape)
        host_ex = make_input(hidden_shape)
        host_ey = make_input(proj_shape)

        dev_x = tf.constant(host_x)
        dev_y = tf.constant(host_y)
        dev_ex = tf.constant(host_ex)
        dev_ey = tf.constant(host_ey)

        sproj = bs.SparseProj(nhidden, nproj)
        rows = sproj.gather_lut

        # NumPy references for the forward results.
        ref_slc = host_x[rows, :]
        ref_add = host_x.copy()
        ref_add[rows, :] += host_y
        ref_mul = host_x.copy()
        ref_mul[rows, :] *= host_y

        # NumPy references for the gradients.
        ref_slc_dx = np.zeros(dev_x.shape)
        ref_slc_dx[rows, :] = host_ey
        ref_add_dx = host_ex
        ref_add_dy = host_ex[rows, :]
        ref_mul_dx = host_ex.copy()
        ref_mul_dx[rows, :] *= host_y
        ref_mul_dy = host_ex[rows, :] * host_x[rows, :]

        gather_op = sproj.gather(dev_x)
        scatter_mul_op = sproj.scatter_mul(dev_x, dev_y)
        scatter_add_op = sproj.scatter_add(dev_x, dev_y)

        # Run order matters: per the original notes, scatter_add overwrites
        # x and the scatter_mul gradient overwrites ex, so each runs last
        # within its group.
        got_slc = sess.run(gather_op)
        got_mul = sess.run(scatter_mul_op)
        got_add = sess.run(scatter_add_op)

        [got_slc_dx] = sess.run(bs.gradients(gather_op, [dev_x], dev_ey))
        got_add_dx, got_add_dy = sess.run(
            bs.gradients(scatter_add_op, [dev_x, dev_y], dev_ex))
        got_mul_dx, got_mul_dy = sess.run(
            bs.gradients(scatter_mul_op, [dev_x, dev_y], dev_ex))

        comparisons = (
            ("slc:", ref_slc, got_slc),
            ("add:", ref_add, got_add),
            ("mul:", ref_mul, got_mul),
            ("slc_dx:", ref_slc_dx, got_slc_dx),
            ("add_dx:", ref_add_dx, got_add_dx),
            ("add_dy:", ref_add_dy, got_add_dy),
            ("mul_dx:", ref_mul_dx, got_mul_dx),
            ("mul_dy:", ref_mul_dy, got_mul_dy),
        )
        for label, expected, actual in comparisons:
            report(label, expected, actual)
def atestBlocksparseMatMulGated(self):
    """Print the error of a gated bs.BlocksparseMatMul against NumPy.

    The op is called with a per-block gate, and its output and gradients
    are compared with ``fprop_test`` / ``bprop_test`` / ``updat_test``
    given the same gate. Nothing is asserted; relative max and L2 errors
    are printed. When the external flag ``out`` is truthy, the first
    compared tensor is dumped to text files and the process exits.

    Reads ``conf``, ``one`` and ``out`` from outside this method.
    """
    with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

        N = 128
        K = 8*56*2*4
        n = K//8
        m = 30
        dtype = tf.float32
        repeat = 0
        dw_gated = False
        block_size = 8

        # Barabasi-Albert adjacency plus the diagonal, dense m x m corner.
        graph = networkx.generators.barabasi_albert_graph(n, m)
        layout = networkx.adjacency_matrix(graph).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
        layout[0:m, 0:m] = 1

        blocks = layout.sum()
        n = layout.shape[0]
        print(100 * blocks / n**2)
        print(layout.sum(axis=0).max())

        bsmm = bs.BlocksparseMatMul(layout, block_size=block_size, feature_axis=0, name="test")

        def make_array(shape, low):
            # All-ones in `one` mode, otherwise uniform in [low, 1).
            if one:
                return np.ones(shape, dtype=np.float32)
            return np.random.uniform(low, 1.0, shape).astype(np.float32)

        # Same draw order as before: input, error, weights, gate.
        host_x = make_array(bsmm.i_shape(N), -1.0)
        host_e = make_array(bsmm.o_shape(N), -1.0)
        host_w = make_array(bsmm.w_shape, -1.0)
        gate = make_array(bsmm.blocks, 0.0)

        # The gate drawn above is discarded: each block's gate becomes 1
        # when its two coordinates have equal parity and 0 otherwise.
        gate = np.ones(bsmm.blocks, dtype=np.float32)
        for block_idx, (c, k) in enumerate(bsmm.updat_list):
            gate[block_idx] = ((c ^ k) & 1) ^ 1

        dev_x = tf.constant(host_x)
        dev_e = tf.constant(host_e)
        dev_w = tf.constant(host_w)
        dev_g = tf.constant(gate)

        w_cast = bs.float_cast(dev_w, dtype=dtype)
        x_cast = bs.float_cast(dev_x, dtype=dtype)

        out_op = bsmm(x_cast, w_cast, gate=dev_g, gate_grad=True, dw_gated=dw_gated, bench=repeat)
        out_op = bs.float_cast(out_op, dtype=tf.float32)

        grad_ops = bs.gradients(out_op, [dev_x, dev_w], dev_e)

        sess.run(tf.global_variables_initializer())

        got_y, (got_dx, got_dw) = sess.run([out_op, grad_ops])

        # NumPy references computed with the same gate.
        ref_y = bsmm.fprop_test(host_x, host_w, gate=gate)
        ref_dx = bsmm.bprop_test(host_e, host_w, gate=gate)
        ref_dw = bsmm.updat_test(host_x, host_e, gate=gate, dw_gated=dw_gated)

        comparisons = (
            (" y:", ref_y, got_y),
            ("dx:", ref_dx, got_dx),
            ("dw:", ref_dw, got_dw),
        )
        for label, expected, actual in comparisons:
            abs_diff = abs(expected - actual)
            mean_abs = np.average(abs(expected))
            worst = abs_diff.max()
            if mean_abs == 0:
                max_err = worst
            else:
                max_err = worst / mean_abs
            # The 1e-12 keeps the ratio finite for an all-zero reference.
            l2_err = np.sqrt(np.square(abs_diff).sum()) / np.sqrt(np.square(expected).sum() + 1e-12)

            print("%s max_err%%:%11.8f L2_err: %12.10f" % (label, 100*max_err, l2_err))

            if out:
                # Weight gradients are dumped K columns wide, the rest N.
                if label == "dw:":
                    dim = K
                else:
                    dim = N
                np.savetxt("out.txt", abs_diff.reshape((-1, dim)), fmt='%5.1f')
                np.savetxt("outC.txt", expected.reshape((-1, dim)), fmt='%5.1f')
                np.savetxt("outD.txt", actual.reshape((-1, dim)), fmt='%5.1f')
                exit()