def generate_data(args):
    dense_variables = generate_dense_variables(
        args.slot_num * args.nnz_per_slot * args.embedding_vec_size,
        [args.num_dense_units for _ in range(args.num_dense_layers)])
    vocabulary_tensors = generate_vocabulary_table(
        args.max_vocabulary_size_per_gpu, args.embedding_vec_size, hvd.size())
    samples, labels = utils.generate_random_samples(
        num_of_samples=args.global_batch_size,
        vocabulary_size=args.max_vocabulary_size_per_gpu * hvd.size(),
        slot_num=args.slot_num,
        max_nnz=args.nnz_per_slot,
        use_sparse_mask=False)
    samples, labels = tf.convert_to_tensor(samples), tf.convert_to_tensor(labels)

    for i in range(args.num_dense_layers):
        # dense_variables[0] means weight, dense_variables[1] means bias
        dense_variables[0][i] = hvd.broadcast(dense_variables[0][i], root_rank=0)
        dense_variables[1][i] = hvd.broadcast(dense_variables[1][i], root_rank=0)
    for i in range(hvd.size()):
        vocabulary_tensors[i] = hvd.broadcast(vocabulary_tensors[i], root_rank=0)
    samples = hvd.broadcast(samples, root_rank=0)
    labels = hvd.broadcast(labels, root_rank=0)

    return dense_variables, vocabulary_tensors, samples, labels
def test_horovod_broadcast_grad_gpu(self):
    """Test the correctness of the broadcast gradient on GPU."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    if os.environ.get('HOROVOD_MIXED_INSTALL'):
        # Skip if compiled with CUDA but without HOROVOD_GPU_BROADCAST.
        return

    hvd.init()
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # As of TensorFlow v1.9, gradients are not supported on
    # integer tensors.
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        if _executing_eagerly():
            tensor = self.tfe.Variable(tf.ones([5] * dim) * rank)
        else:
            tensor = tf.ones([5] * dim) * rank
        if dtype == tf.bool:
            tensor = tensor % 2
        if _executing_eagerly():
            with tf.GradientTape() as tape:
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            with tf.device("/gpu:%d" % local_rank):
                grad_out = tape.gradient(broadcasted_tensor, tensor)
        else:
            tensor = tf.cast(tensor, dtype=dtype)
            broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            grad_ys = tf.ones([5] * dim)
            with tf.device("/gpu:%d" % local_rank):
                grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
            grad_out = self.evaluate(grad)

        c = size if rank == root_rank else 0
        expected = np.ones([5] * dim) * c
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def _add_broadcast_ops():
    bcast_global_variables_ops = []
    for var in tf.global_variables():
        bcast_global_variables_ops.append(tf.assign(var, hvd.broadcast(var, 0)))
    with tf.control_dependencies(bcast_global_variables_ops):
        tf.no_op(name='auto_parallel_bcast_global_vars')
def test_horovod_broadcast_grad(self):
    """Test the correctness of the broadcast gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors.
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
            tensor = tf.ones([5] * dim) * rank
            if dtype == tf.bool:
                tensor = tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            grad_ys = tf.ones([5] * dim)
            grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
            grad_out = session.run(grad)

            c = size if rank == root_rank else 0
            expected = np.ones([5] * dim) * c
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
def test_horovod_broadcast(self):
    """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                  tf.int32, tf.int64, tf.float16, tf.float32,
                  tf.float64, tf.bool]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
            tensor = tf.ones([17] * dim) * rank
            root_tensor = tf.ones([17] * dim) * root_rank
            if dtype == tf.bool:
                tensor = tensor % 2
                root_tensor = root_tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            root_tensor = tf.cast(root_tensor, dtype=dtype)
            broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            self.assertTrue(
                session.run(
                    tf.reduce_all(
                        tf.equal(tf.cast(root_tensor, tf.int32),
                                 tf.cast(broadcasted_tensor, tf.int32)))),
                "hvd.broadcast produces incorrect broadcasted tensor")
def test_horovod_broadcast_cpu(self):
    """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors on CPU."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")

    dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
              tf.int32, tf.int64, tf.float16, tf.float32,
              tf.float64, tf.bool]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        with tf.device("/cpu:0"):
            tensor = tf.ones([17] * dim) * rank
            root_tensor = tf.ones([17] * dim) * root_rank
            if dtype == tf.bool:
                tensor = tensor % 2
                root_tensor = root_tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            root_tensor = tf.cast(root_tensor, dtype=dtype)
            broadcasted_tensor = hvd.broadcast(tensor, root_rank)
        self.assertTrue(
            self.evaluate(
                tf.reduce_all(
                    tf.equal(tf.cast(root_tensor, tf.int32),
                             tf.cast(broadcasted_tensor, tf.int32)))),
            "hvd.broadcast produces incorrect broadcasted tensor")
def test_horovod_broadcast(self):
    """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                  tf.int32, tf.int64, tf.float32, tf.float64,
                  tf.bool]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
            try:
                tensor = tf.ones([17] * dim) * rank
                root_tensor = tf.ones([17] * dim) * root_rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                    root_tensor = root_tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                root_tensor = tf.cast(root_tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                self.assertTrue(
                    session.run(tf.reduce_all(tf.equal(
                        tf.cast(root_tensor, tf.int32),
                        tf.cast(broadcasted_tensor, tf.int32)))),
                    "hvd.broadcast produces incorrect broadcasted tensor")
            except Exception:
                import traceback
                traceback.print_exc()
def _add_broadcast_ops(target, worker_id):
    bcast_global_variables_ops = []
    with tf.device('/job:worker/task:%d' % worker_id):
        for var in target:
            bcast_global_variables_ops.append(
                tf.assign(var, hvd.broadcast(var, 0)))
        with tf.control_dependencies(bcast_global_variables_ops):
            tf.no_op(name='auto_parallel_bcast_global_vars')
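# Note: the two helpers above hand-roll one broadcast-and-assign per variable.
# Horovod also ships a convenience op that builds the same group of assigns
# for all global variables. A minimal TF1 graph-mode sketch, assuming
# hvd.init() has already been called and the graph's variables exist:
import tensorflow as tf
import horovod.tensorflow as hvd

bcast_op = hvd.broadcast_global_variables(0)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(bcast_op)  # every rank now holds rank 0's initial values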
def test_horovod_broadcast_grad_cpu(self):
    """Test the correctness of the broadcast gradient on CPU."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # As of TensorFlow v1.9, gradients are not supported on
    # integer tensors.
    dtypes = [tf.float32, tf.float64]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        if _executing_eagerly():
            tensor = self.tfe.Variable(tf.ones([5] * dim) * rank)
        else:
            tensor = tf.ones([5] * dim) * rank
        if dtype == tf.bool:
            tensor = tensor % 2
        if _executing_eagerly():
            with tf.GradientTape() as tape:
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            with tf.device("/cpu:0"):
                grad_out = tape.gradient(broadcasted_tensor, tensor)
        else:
            tensor = tf.cast(tensor, dtype=dtype)
            broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            grad_ys = tf.ones([5] * dim)
            with tf.device("/cpu:0"):
                grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
            grad_out = self.evaluate(grad)

        c = size if rank == root_rank else 0
        expected = np.ones([5] * dim) * c
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def batch_shuffle(tensor):  # nx...
    total, rank = hvd.size(), hvd.rank()
    batch_size = tf.shape(tensor)[0]
    with tf.device('/cpu:0'):
        all_idx = tf.range(total * batch_size)
        shuffle_idx = tf.random.shuffle(all_idx)
        shuffle_idx = hvd.broadcast(shuffle_idx, 0)
        my_idxs = tf.slice(shuffle_idx, [rank * batch_size], [batch_size])

    all_tensor = allgather(tensor, 'batch_shuffle_key')  # gn x ...
    return tf.gather(all_tensor, my_idxs), shuffle_idx
def send_receive(self, tensors, ctx):
    decompressed_tensors = []
    for ranki in range(self.world_size):
        ranki_tensors = [broadcast(tensor, root_rank=ranki) for tensor in tensors]
        ranki_decompressed = self.compressor.decompress(ranki_tensors, ctx)
        decompressed_tensors.append(ranki_decompressed)

    aggregated_tensor = self.compressor.aggregate(decompressed_tensors)
    return aggregated_tensor
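# The loop above emulates an all-to-all exchange with one broadcast per root
# rank: every worker contributes its (compressed) tensors and receives each
# other rank's copy. A standalone sketch of the same pattern with plain
# Horovod ops; it assumes hvd.init() and 2+ processes under horovodrun, and
# the tensor contents are purely illustrative:
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
local = tf.constant([float(hvd.rank())])
# After the loop, gathered[r] holds rank r's tensor on every worker.
gathered = [hvd.broadcast(local, root_rank=r, name='gather_%d' % r)
            for r in range(hvd.size())]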
def broadcast(value, root_rank, name=None):
    """
    Perform a broadcast on a tensor-compatible value.

    Arguments:
        value: A tensor-compatible value to broadcast.
            The shape of the input must be identical across all ranks.
        root_rank: Rank of the process from which global variables will be
            broadcasted to all other processes.
        name: Optional name for the constants created by this operation.
    """
    bcast_op = hvd.broadcast(tf.constant(value, name=name), root_rank)
    return K.get_session().run(bcast_op)
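# A minimal usage sketch for the helper above; it assumes Horovod and the
# Keras session are already initialized and 2+ processes are launched with
# horovodrun. `resume_epoch` is an illustrative local value, not part of the
# original code:
resume_epoch = 3 if hvd.rank() == 0 else 0
start_epoch = broadcast(resume_epoch, root_rank=0, name='resume_epoch')
# On every rank, start_epoch now equals rank 0's value (3).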
def broadcasting_dataloader_wrapper(self):
    if hvd.rank() == 0:
        (numerical_features, categorical_features), labels = self.pipe.get_next()

        # Bitcasting to float32 before broadcast and back to int32 right afterwards
        # is necessary, otherwise TensorFlow performs a spurious D2H and H2D transfer
        # on this tensor. Without this call, the columnwise-split mode gets about
        # 2x slower.
        categorical_features = tf.bitcast(categorical_features, type=tf.float32)
    else:
        # Using random uniform instead of e.g. tf.zeros is necessary here.
        # tf.zeros would be placed on CPU, causing a device clash in the broadcast.
        numerical_features = tf.random.uniform(
            shape=[self.dlrm.batch_size, self.dlrm.num_numerical_features],
            dtype=tf.float16)
        categorical_features = tf.random.uniform(
            maxval=1, dtype=tf.float32,
            shape=[self.dlrm.batch_size, len(self.dlrm.table_sizes)])
        labels = tf.random.uniform(maxval=1, shape=[self.dlrm.batch_size],
                                   dtype=tf.int32)
        labels = tf.cast(labels, dtype=tf.int8)

    numerical_features = hvd.broadcast(numerical_features, root_rank=0,
                                       name='numerical_broadcast')
    categorical_features = hvd.broadcast(categorical_features, root_rank=0,
                                         name='cat_broadcast')
    labels = hvd.broadcast(labels, root_rank=0, name='labels_broadcast')

    categorical_features = tf.bitcast(categorical_features, type=tf.int32)
    return (numerical_features, categorical_features), labels
def test_horovod_broadcast_rank_error(self):
    """Test that the broadcast returns an error if different ranks
    specify different root rank."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")

    tensor = tf.ones([17] * 3, dtype=tf.float32)
    with self.assertRaises(tf.errors.FailedPreconditionError):
        self.evaluate(hvd.broadcast(tensor, rank))
def test_horovod_broadcast_rank_error(self):
    """Test that the broadcast returns an error if different ranks
    specify different root rank."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        tensor = tf.ones([17] * 3, dtype=tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, rank))
def test_horovod_broadcast_type_error(self):
    """Test that the broadcast returns an error if the types being broadcasted
    differ among the processes."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")

    tensor_size = [17] * 3
    dtype = tf.int32 if rank % 2 == 0 else tf.float32
    tensor = tf.ones(tensor_size, dtype=dtype) * rank
    with self.assertRaises(tf.errors.FailedPreconditionError):
        self.evaluate(hvd.broadcast(tensor, 0))
def test_horovod_broadcast_error(self):
    """Test that the broadcast returns an error if any dimension besides
    the first is different among the tensors being broadcasted."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")

    tensor_size = [17] * 3
    tensor_size[1] = 10 * (rank + 1)
    tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
    with self.assertRaises(tf.errors.FailedPreconditionError):
        self.evaluate(hvd.broadcast(tensor, 0))
def test_horovod_broadcast_type_error(self):
    """Test that the broadcast returns an error if the types being broadcasted
    differ among the processes."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        tensor_size = [17] * 3
        dtype = tf.int32 if rank % 2 == 0 else tf.float32
        tensor = tf.ones(tensor_size, dtype=dtype) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, 0))
def test_horovod_broadcast_error(self):
    """Test that the broadcast returns an error if any dimension besides
    the first is different among the tensors being broadcasted."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session() as session:
        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, 0))
def test_horovod_broadcast_gpu(self):
    """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors on GPU."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        self.skipTest("No GPUs available")

    if os.environ.get('HOROVOD_MIXED_INSTALL'):
        # Skip if compiled with CUDA but without HOROVOD_GPU_ALLREDUCE.
        self.skipTest("Not compiled with HOROVOD_GPU_ALLREDUCE")

    hvd.init()
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")

    dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
              tf.int32, tf.int64, tf.float16, tf.float32,
              tf.float64, tf.bool]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        with tf.device("/gpu:%d" % local_rank):
            tensor = tf.ones([17] * dim) * rank
            root_tensor = tf.ones([17] * dim) * root_rank
            if dtype == tf.bool:
                tensor = tensor % 2
                root_tensor = root_tensor % 2
            tensor = tf.cast(tensor, dtype=dtype)
            root_tensor = tf.cast(root_tensor, dtype=dtype)
            broadcasted_tensor = hvd.broadcast(tensor, root_rank)
        self.assertTrue(
            self.evaluate(
                tf.reduce_all(
                    tf.equal(tf.cast(root_tensor, tf.int32),
                             tf.cast(broadcasted_tensor, tf.int32)))),
            "hvd.broadcast produces incorrect broadcasted tensor")
# Parameters:
#   eps     -- time resolution
#   damping -- wave damping
eps = tf.placeholder(tf.float32, shape=())
damping = tf.placeholder(tf.float32, shape=())

# Create variables for simulation state
U = tf.Variable(u_init)
Ut = tf.Variable(ut_init)

# The complete (N/2 + 3) x N matrices on which we'll perform calculations
U_full = tf.Variable(np.zeros([(N // 2) + 3, N], dtype=np.float32))
Ut_full = tf.Variable(np.zeros([(N // 2) + 3, N], dtype=np.float32))

rank_bcast = tf.group(
    # Send the first 3 rows of rank 1 to rank 0 for U
    tf.assign(U_full[N // 2:], hvd.broadcast(U[:3], 1)),
    # Send the first 3 rows of rank 1 to rank 0 for Ut
    tf.assign(Ut_full[N // 2:], hvd.broadcast(Ut[:3], 1)),
    # Send the last 3 rows of rank 0 to rank 1 for U
    tf.assign(U_full[:3], hvd.broadcast(U[-3:], 0)),
    # Send the last 3 rows of rank 0 to rank 1 for Ut
    tf.assign(Ut_full[:3], hvd.broadcast(Ut[-3:], 0)))

# Copy the rest of U and Ut for rank 0
U_full_rank0_group = tf.group(U_full[:N // 2].assign(U),
                              Ut_full[:N // 2].assign(Ut))
# Copy the rest of U and Ut for rank 1
U_full_rank1_group = tf.group(U_full[3:].assign(U),
                              Ut_full[3:].assign(Ut))
def broadcast(tensor, name=None):
    # Forward the optional name to the underlying Horovod op.
    return hvd.broadcast(tensor, root_rank=0, name=name)
# Print initial state
print("Rank " + str(hvd.rank()) + " send initial: " + str(send_buf))
if hvd.rank() == 0:
    print("Rank " + str(hvd.rank()) + " recv initial: " + str(recv0_buf))
else:
    print("Rank " + str(hvd.rank()) + " recv initial: " + str(recv1_buf))

# Create TensorFlow variables
Send_Buffer = tf.Variable(send_buf, name='Send_Buffer')
Recv0_Buffer = tf.Variable(recv0_buf, name='Recv0_Buffer')
Recv1_Buffer = tf.Variable(recv1_buf, name='Recv1_Buffer')

# Communicate
bcast = tf.group(
    # Rank 0's send buffer to rank 1's receive buffer
    tf.assign(Recv1_Buffer, hvd.broadcast(Send_Buffer, 0)),
    # Rank 1's send buffer to rank 0's receive buffer
    tf.assign(Recv0_Buffer, hvd.broadcast(Send_Buffer, 1)))

# Initialize state to initial conditions
tf.global_variables_initializer().run()
bcast.run()

# Print final state
if hvd.rank() == 0:
    print("Rank " + str(hvd.rank()) + " recv final: " + str(Recv0_Buffer.eval()))
else:
    print("Rank " + str(hvd.rank()) + " recv final: " + str(Recv1_Buffer.eval()))
    U.assign(tf.concat(values=[tf.slice(U, [0, 0], [N, N]), tf_recv_buf_0], axis=0)),
    U.assign(U_),
    Ut.assign(Ut_),
    tf_send_buf.assign(send_buf))

# Update the state for Rank 1
r1_step = tf.group(
    U.assign(tf.concat(values=[tf_recv_buf_1, tf.slice(U, [2, 0], [N, N])], axis=0)),
    U.assign(U_),
    Ut.assign(Ut_),
    tf_send_buf.assign(send_buf))

# Broadcast the two rows
broadcast = tf.group(
    tf.assign(tf_recv_buf_1, hvd.broadcast(tf_send_buf, 0)),
    tf.assign(tf_recv_buf_0, hvd.broadcast(tf_send_buf, 1)))

# Initialize state to initial conditions
tf.global_variables_initializer().run()

# Run num_iter steps of PDE
start = time.time()
for i in range(num_iter):
    broadcast.run()
    # Step simulation
    if hvd.rank() == 0:
        r0_step.run({eps: 0.06, damping: 0.03})
    else:
        r1_step.run({eps: 0.06, damping: 0.03})
    U.assign(U_),
    Ut.assign(Ut_))

# Create send and receive buffers
send_buf = np.zeros([2, N], dtype=np.float32)
recv0_buf = np.zeros([2, N], dtype=np.float32)
recv1_buf = np.zeros([2, N], dtype=np.float32)

Send_Buffer = tf.Variable(send_buf, name='Send_Buffer')
Recv0_Buffer = tf.Variable(recv0_buf, name='Recv0_Buffer')
Recv1_Buffer = tf.Variable(recv1_buf, name='Recv1_Buffer')

bcast = tf.group(
    tf.assign(Recv1_Buffer, hvd.broadcast(Send_Buffer, 0)),
    tf.assign(Recv0_Buffer, hvd.broadcast(Send_Buffer, 1)))

fill_row = None
if hvd.rank() == 0:
    # Fill the send buffer with the values of the bottom 2 rows
    fill_row = tf.scatter_update(Send_Buffer, [0, 1], Ut[N - 2:N, :])
else:
    # Fill the send buffer with the values of the top 2 rows
    fill_row = tf.scatter_update(Send_Buffer, [0, 1], Ut[2:4, :])

update_row = None
if hvd.rank() == 0:
    # Copy the received rows into the bottom 2 rows of Ut
    update_row = tf.scatter_update(Ut, [N, N + 1], Recv0_Buffer[:, :])
    airlineData['IsDelayed'], test_size=0.25, random_state=42)
xTrain, xTest, yTrain, yTest = np.array(X_train), np.array(X_test), \
    np.array(y_train), np.array(y_test)

indices = splitData(xTrain, yTrain, numTrees)

currSess = tf.InteractiveSession()
indexBC = tf.get_variable(initializer=tf.constant(indices), dtype=tf.int32,
                          name="IndexBC")
### broadcast only for rank 0
indicesBroadCast = hvd.broadcast(indexBC, 0)


def tree_fit_predict(xTrain, yTrain, xTest, index):
    model = decisionTree.DecisionTree(maxDepth=maxDepth, verbose=True)
    # new = tf.gather(xTrain, index)
    # print(new)
    # # index = np.array(index)
    # # index = index.astype(int)
    # # print(index.dtype)
    result = model.fit(xTrain[index], yTrain[index]).predict(xTest)
    return tf.convert_to_tensor(result)


xTrainTensor = tf.placeholder(tf.float32)
yTrainTensor = tf.placeholder(tf.float32)
def worker(rank, size, input_file_specs, batch_size=256, warmup_sec=10.0, run_sec=60 * 60 * 4, num_threads=0, sync=False, warn_latency_sec=4.0, report_period_sec=2.0, round_robin_files=True, throttle_sleep_sec=0.01, throttle_total_rate_bytes_per_sec=0): if rank == 0: print('storage_benchmark_tensorflow: BEGIN') print(datetime.datetime.utcnow()) metrics_file_name = '/imagenet-scratch/logs/storage_benchmark_tensorflow_metrics-%d.log' % rank with open(metrics_file_name, 'a') as metrics_file: hostname = socket.gethostname() # Set random seed to have deterministic behavior. tf.set_random_seed(rank + 1) # Round robin the input file spec. This allows multiple mount points to be used. input_file_spec = input_file_specs[hvd.local_rank() % len(input_file_specs)] print('rank=%3d: %s: input_file_spec=%s' % (rank, hostname, input_file_spec)) if round_robin_files: # Distribute sets of file names evenly over all processes and without overlap. all_input_filenames = sorted(glob.glob(input_file_spec)) num_files = len(all_input_filenames) i = rank input_filenames = [] while i < num_files: input_filenames.append(all_input_filenames[i]) i += size print( 'rank=%3d: Found %d total files. %d files assigned to this process.' % (rank, len(all_input_filenames), len(input_filenames))) if len(input_filenames) == 0: raise ValueError('Not enough matching files.') input_file_spec = None else: # This will use tf.data.TFRecordDataset.list_files to randomly distribute files. input_filenames = None # # Build execution graph. # ds_iterator = create_iterator(batch_size, num_threads, input_file_spec=input_file_spec, input_filenames=input_filenames) # num_bytes_tensor is an int64 tensor of shape (batch_size). num_bytes_tensor = ds_iterator.get_next() # When num_bytes_for_step_tensor is evaluated, it reads the TFRecord files. num_bytes_for_step_tensor = tf.reduce_sum(num_bytes_tensor) # The following operations are used to synchronize the processes when running in sync mode. if sync: stop_flag_placeholder = tf.placeholder(tf.bool, shape=()) stop_flag_broadcast_tensor = hvd.broadcast(stop_flag_placeholder, 0, 'stop_flag_broadcast') num_bytes_for_step_placeholder = tf.placeholder(tf.int64, shape=()) total_bytes_for_step_tensor = hvd.allreduce( num_bytes_for_step_placeholder, average=False) # # Start the TensorFlow session and execute the graph. # config = tf.ConfigProto() config.device_count['GPU'] = 0 config.intra_op_parallelism_threads = 1 config.inter_op_parallelism_threads = 1 print('rank=%3d: Creating session' % rank) with tf.Session(config=config) as session: print('rank=%3d: Session created' % rank) session.run( [tf.initializers.global_variables(), tf.tables_initializer()]) print('rank=%3d: Initialized variables' % rank) # Run first step. This can take 30 seconds for 100,000 files. print('rank=%3d: Running first step' % rank) _ = session.run(num_bytes_for_step_tensor) print('rank=%3d: First step complete' % rank) # Wait for barrier so we know when all processes have finished the first step. print('rank=%3d: Waiting for barrier' % rank) session.run(hvd.allreduce(tf.constant(0))) if rank == 0: print('rank=%3d: Completed waiting for barrier' % rank) # To ensure that all processes finish warmup and stop at exactly the same time, # the rank 0 node broadcasts its time to all other ranks. # This also serves as a synchronization barrier. 
local_t0 = time.time() t0_tensor = tf.constant(local_t0, tf.float64) t0_tensor = hvd.broadcast(t0_tensor, 0, 't0') t0 = session.run(t0_tensor) start_time = t0 + warmup_sec stop_time = start_time + run_sec step = 0 warmed_up = False num_records = 0 num_bytes = 0 total_bytes = 0 next_report_time = time.time() + report_period_sec if throttle_total_rate_bytes_per_sec: throttle_rate_bytes_per_sec = throttle_total_rate_bytes_per_sec / size burst_sec = 1.0 throttle = TokenBucket(tokens=throttle_rate_bytes_per_sec * burst_sec, fill_rate=throttle_rate_bytes_per_sec) else: throttle = None while True: # Reset all counters when warmup completes. t = time.time() if not warmed_up and t >= start_time: print('rank=%3d: warmup complete at step %d' % (rank, step)) warmed_up = True t0 = start_time step = 0 num_records = 0 num_bytes = 0 total_bytes = 0 # Run a single step of batch_size records per process. run_options = tf.RunOptions() # run_options.timeout_in_ms = 10000 num_bytes_for_step = np.int64(0) try: num_bytes_for_step = session.run(num_bytes_for_step_tensor, options=run_options) except Exception as e: print('rank=%3d: %s: ERROR: %s' % (rank, hostname, e)) step_dt = time.time() - t if (warmed_up or step >= 1) and step_dt > warn_latency_sec: print('rank=%3d: %s: WARNING: step %d took %0.3f seconds' % (rank, hostname, step, step_dt)) next_report_time = 0.0 # Calculate local stop flag. In sync mode, this is broadcast from rank 0. stop_flag = time.time() >= stop_time # Use Horovod to aggregate the byte counter across all processes. # This also acts as a synchronization barrier, much like gradient descent when # it shares gradients. # Also coordinate the stop flag so all processes stop at the same step. sync_dt = 0.0 if sync: t = time.time() total_bytes_for_step, stop_flag = session.run( [ total_bytes_for_step_tensor, stop_flag_broadcast_tensor ], feed_dict={ num_bytes_for_step_placeholder: num_bytes_for_step, stop_flag_placeholder: stop_flag, }, ) total_bytes += total_bytes_for_step sync_dt = time.time() - t if warmed_up and sync_dt > 30.0: print( 'rank=%3d: %s: WARNING: sync after step %d took %0.3f seconds' % (rank, hostname, step, sync_dt)) next_report_time = 0.0 num_records += batch_size num_bytes += num_bytes_for_step t = time.time() metrics = { '@timestamp': datetime.datetime.utcnow().isoformat() + 'Z', 'batch_size': batch_size, 'rank': rank, 'hostname': hostname, 'step': step, 'num_bytes': int(num_bytes_for_step), 'latency_sec': step_dt, 'sync_latency_sec': sync_dt, } json.dump(metrics, metrics_file) metrics_file.write("\n") metrics_file.flush() if t >= next_report_time: dt = t - t0 if not sync: records_per_sec = num_records / dt bytes_per_sec = num_bytes / dt MB_per_sec = bytes_per_sec / 1e6 print( 'rank=%3d: warmed_up=%d, step=%6d, records/sec=%8.0f, MB/sec=%11.3f, records=%10d, bytes=%15d, dt=%9.3f' % (rank, warmed_up, step, records_per_sec, MB_per_sec, num_records, num_bytes, dt)) if sync: if rank == 0: total_records = num_records * size records_per_sec = total_records / dt bytes_per_sec = total_bytes / dt MB_per_sec = bytes_per_sec / 1e6 print( 'TOTAL: warmed up=%d, step=%6d, records/sec=%8.0f, MB/sec=%11.3f, records=%10d, bytes=%15d, dt=%9.3f' % (warmed_up, step, records_per_sec, MB_per_sec, total_records, total_bytes, dt)) next_report_time = t + report_period_sec # Throttle byte rate. 
if throttle: while not throttle.consume(num_bytes_for_step): # print('sleeping') time.sleep(throttle_sleep_sec) if stop_flag: print('rank=%3d: %s: complete at step %d' % (rank, hostname, step)) break step += 1 # Use Horovod to aggregate the final counters across all processes. num_steps_tensor = tf.constant(step) num_bytes_tensor = tf.constant(num_bytes) total_steps_tensor = hvd.allreduce(num_steps_tensor, average=False) total_bytes_tensor = hvd.allreduce(num_bytes_tensor, average=False) total_steps, total_bytes = session.run( [total_steps_tensor, total_bytes_tensor]) if rank == 0: dt = stop_time - start_time num_records = total_steps * batch_size records_per_sec = num_records / dt total_GB = total_bytes / 1e9 bytes_per_sec = total_bytes / dt MB_per_sec = bytes_per_sec / 1e6 print('FINAL: number of processes: %12d' % size) print('FINAL: batch size: %12d' % batch_size) print('FINAL: sync: %12s' % sync) print('FINAL: round robin files: %12s' % round_robin_files) print('FINAL: number of records: %12d' % num_records) print('FINAL: GB: %12.3f' % total_GB) print('FINAL: elapsed sec: %12.3f' % dt) print('FINAL: records/sec: %12.0f' % records_per_sec) print('FINAL: MB/sec: %12.3f' % MB_per_sec) if rank == 0: print('storage_benchmark_tensorflow: END')
rank = hvd.rank()
print('rank', rank)

points_per_device = int(num_points / hvd.size())
np_points = np_points[rank * points_per_device:
                      rank * points_per_device + points_per_device - 1]

# points = tf.constant(np_points, dtype=tf.float32)
points = tf.placeholder(dtype=tf.float32, name='global_sum_place_hold')
centroids = tf.get_variable(name='centroids', shape=[num_cluster, dim],
                            initializer=tf.initializers.random_uniform(
                                minval=0, maxval=10.0, seed=123))

bcast_result = hvd.broadcast(centroids, 0)
init_centroids_sync = tf.assign(centroids, bcast_result)

expanded_points = tf.expand_dims(points, 0)
expanded_centroids = tf.expand_dims(centroids, 1)

distances = tf.reduce_sum(tf.square(tf.subtract(points, expanded_centroids)), 2)
assignments = tf.argmin(distances, 0)
loss_op = tf.reduce_sum(tf.reduce_min(distances, 0))

tf_sum = tf.unsorted_segment_sum(points, assignments, num_cluster)
tf_count = tf.unsorted_segment_sum(tf.ones_like(points), assignments, num_cluster)
Ut_Send = tf.Variable(ut_init, name='Ut_Send')

# Create TensorFlow variables to receive into
Ur0 = tf.Variable(np.zeros([3, N], dtype=np.float32))
Utr0 = tf.Variable(np.zeros([3, N], dtype=np.float32))
Ur1 = tf.Variable(np.zeros([3, N], dtype=np.float32))
Utr1 = tf.Variable(np.zeros([3, N], dtype=np.float32))

# Used for calculations
U_main = tf.Variable(np.zeros([N + 3, N], dtype=np.float32))
Ut_main = tf.Variable(np.zeros([N + 3, N], dtype=np.float32))

# Communicate 3 rows
rank_bcast = tf.group(
    # Rank 1's send buffer to rank 0's recv for U
    tf.assign(Ur0, hvd.broadcast(U[:3], 1)),
    # Rank 1's send buffer to rank 0's recv for Ut
    tf.assign(Utr0, hvd.broadcast(Ut[:3], 1)),
    # Rank 0's send buffer to rank 1's recv for U
    tf.assign(Ur1, hvd.broadcast(U[-3:], 0)),
    # Rank 0's send buffer to rank 1's recv for Ut
    tf.assign(Utr1, hvd.broadcast(Ut[-3:], 0)))

rank0_join = tf.group(U_main.assign(tf.concat([U, Ur0], 0)),
                      Ut_main.assign(tf.concat([Ut, Utr0], 0)))
rank1_join = tf.group(U_main.assign(tf.concat([Ur1, U], 0)),
                      Ut_main.assign(tf.concat([Utr1, Ut], 0)))
def get_image_labels(self): if self.is_all_shared: ### ALL SHARED ### img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, is_training=True) with tf.device("/cpu:0"): with tf.name_scope("reading"): data_provider = slim.dataset_data_provider.DatasetDataProvider( self.dataset, num_readers=self.FLAGS.num_data_readers, common_queue_capacity=20*self.FLAGS.batch_size, common_queue_min=10*self.FLAGS.batch_size, seed=self.rank) [image, label] = data_provider.get(['image', 'label']) with tf.name_scope("to-preprocessing"): capacity = 20 * self.FLAGS.batch_size to_pre_queue = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[image.dtype, label.dtype], shapes=None, name="to_pre_queue") to_pre_op = to_pre_queue.enqueue([image, label]) queue_runner.add_queue_runner(queue_runner.QueueRunner(to_pre_queue, [to_pre_op] * Pipeline.QR_THREADS)) tf.summary.scalar("to_pre_fraction_of_%d_full" % capacity, math_ops.to_float(to_pre_queue.size()) * (1. / capacity)) image, label = to_pre_queue.dequeue() with tf.name_scope("preprocessing"):#TODO image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode) with tf.name_scope("to-allgather"): capacity = 20 * self.FLAGS.batch_size to_allg_queue = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[image.dtype, label.dtype], shapes=[[self.train_image_size, self.train_image_size, 3], []], name="to_allgather_queue")#[image.get_shape(), label.get_shape()]) queue_runner.add_queue_runner(queue_runner.QueueRunner(to_allg_queue, [to_allg_queue.enqueue([image, label])] * Pipeline.QR_THREADS)) tf.summary.scalar("to_allgather_fraction_of_%d_full" % capacity, math_ops.to_float(to_allg_queue.size()) * (1. / capacity)) # num_preprocessors = tf.placeholder(tf.int32, shape=[], name="num_preprocessors) # self.num_hvd_send_tensor = send_images, send_labels = to_allg_queue.dequeue_many(self.num_hvd_send) # if rank == #TODO all_images = hvd.allgather(send_images, name="hvd_allgather") all_labels = hvd.allgather(send_labels, name="hvd_allgather") #TODO: Remove extra queues with tf.name_scope("to-compute"): capacity = 30 * self.FLAGS.batch_size to_compute_queue = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[image.dtype, label.dtype], shapes=[[self.train_image_size, self.train_image_size, 3], []],#TODO name="to_compute_queue")#[image.get_shape(), label.get_shape()]) queue_runner.add_queue_runner(queue_runner.QueueRunner(to_compute_queue, [to_compute_queue.enqueue_many([all_images, all_labels])]))#1 thread! tf.summary.scalar("to_compute_fraction_of_%d_full" % capacity, math_ops.to_float(to_compute_queue.size()) * (1. 
/ capacity)) image, label = to_compute_queue.dequeue() elif self.is_single_bcast: ### SINGLE BROADCAST ### img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, is_training=True) allg_images_name = "allgather-images-op" allg_labels_name = "allgather-labels-op" bcast_images_name = "bcast-images-op" bcast_labels_name = "bcast-labels-op" if 0 in self.member_of_group: #If we belong to group 0, initialize the reading and preprocessing pipeline with tf.device("/cpu:0"): with tf.name_scope("reading"): data_provider = slim.dataset_data_provider.DatasetDataProvider( self.dataset, num_readers=self.FLAGS.num_data_readers, common_queue_capacity=20*self.FLAGS.batch_size, common_queue_min=10*self.FLAGS.batch_size, seed=self.rank) [image, label] = data_provider.get(['image', 'label']) image, label = create_qr("to-pre", 10 * self.FLAGS.batch_size, [image, label], None, [image.dtype, label.dtype], Pipeline.QR_THREADS, False, False) with tf.name_scope("preprocessing"): image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode) send_images, send_labels = create_qr("to-allg", 10 * self.FLAGS.batch_size, [image, label], [[self.train_image_size, self.train_image_size, 3], []], [image.dtype, label.dtype], Pipeline.QR_THREADS, False, True, self.num_hvd_send) all_images = hvd.allgather(send_images, group=0, name=allg_images_name) all_labels = hvd.allgather(send_labels, group=0, name=allg_labels_name) all_images, all_labels = create_qr("to-bcast", 20 * self.FLAGS.batch_size, [all_images, all_labels], [[self.train_image_size, self.train_image_size, 3], []], [post_pre_image_dtype, post_pre_label_dtype], 1, True, True, self.images_per_bcast) if 1 in self.member_of_group: # For the middle man rank, reset all_images and all_labels # names to their broadcasted tensors so that the bcast is # performed. Note that the bcast root is rank 0 since the # group1 sent to init had this rank listed first, meaning that # the resulting mpi group comm has this rank has rank 0 if len(self.member_of_group) == 1: # Then not middle man, so construct holder variable WITH CORRECT NAME! # tf.Variable(self.num_hvd_send? 
all_images = tf.zeros([self.images_per_bcast, self.train_image_size, self.train_image_size, 3], dtype=post_pre_image_dtype) all_labels = tf.zeros([self.images_per_bcast] , dtype=post_pre_label_dtype) #shape of [] turns into 1D instead of 0D all_images = hvd.broadcast(all_images, 0, group=1, name=bcast_images_name) all_labels = hvd.broadcast(all_labels, 0, group=1, name=bcast_labels_name) image, label = create_qr("to-compute", 20 * self.FLAGS.batch_size, [all_images, all_labels], [[self.train_image_size, self.train_image_size, 3], []], [post_pre_image_dtype, post_pre_label_dtype], 1, True, False) elif self.is_multi_bcast: ### MULTIPLE BROADCAST # print("Rank:", rank, member_of_group, group_rank_list) img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, is_training=True) # allg_image_name = "allgathered-image" # need some naming commonalities # allg_label_name = "allgathered-label" allg_images_name = "allgather-images-op" allg_labels_name = "allgather-labels-op" bcast_images_name = "bcast-images-op" bcast_labels_name = "bcast-labels-op" # if 0 in member_of_group: #If we belong to group 0, initialize the reading and preprocessing pipeline if self.rank < self.FLAGS.num_pre: with tf.device("/cpu:0"): with tf.name_scope("reading"): data_provider = slim.dataset_data_provider.DatasetDataProvider( self.dataset, num_readers=self.FLAGS.num_data_readers, common_queue_capacity=20*self.FLAGS.batch_size, common_queue_min=10*self.FLAGS.batch_size, seed=self.rank) [image, label] = data_provider.get(['image', 'label']) image, label = create_qr("to-pre", 10 * self.FLAGS.batch_size, [image, label], None, [image.dtype, label.dtype], Pipeline.QR_THREADS, False, False) with tf.name_scope("preprocessing"): image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode) # image = tf.Print(image, ["using preprocessed image"]) send_images, send_labels = create_qr("to-bcast", 20 * self.FLAGS.batch_size, [image, label], [[self.train_image_size, self.train_image_size, 3], []], [image.dtype, label.dtype], 2 * Pipeline.QR_THREADS, False, True, self.images_per_bcast) else: send_images = tf.zeros([self.images_per_bcast, self.train_image_size, self.train_image_size, 3], dtype=post_pre_image_dtype) send_labels = tf.zeros([self.images_per_bcast] , dtype=post_pre_label_dtype) with tf.device("/cpu:0"): bcast_images_root = "broadcast-images-" bcast_labels_root = "broadcast-labels-" bcast_images_per_group = [hvd.broadcast(send_images, i, group=i, name=bcast_images_root + str(i)) for i in range(self.FLAGS.num_pre)] bcast_labels_per_group = [hvd.broadcast(send_labels, i, group=i, name=bcast_labels_root + str(i)) for i in range(self.FLAGS.num_pre)] with tf.name_scope("to-compute"): capacity = 30 * self.FLAGS.batch_size to_compute_q = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[post_pre_image_dtype, post_pre_label_dtype], shapes=[[self.train_image_size, self.train_image_size, 3], []], name="to-compute-queue") to_comp_ops = [to_compute_q.enqueue_many([bcast_images_per_group[i], bcast_labels_per_group[i]]) for i in range(self.FLAGS.num_pre)] queue_runner.add_queue_runner(queue_runner.QueueRunner(to_compute_q, to_comp_ops)) tf.summary.scalar("to_compute_fraction_of_%d_full" % capacity, math_ops.to_float(to_compute_q.size()) * (1. / capacity)) image, label = to_compute_q.dequeue() return image, label
if hvd.rank() == 0:
    u_init[a, b] = np.random.uniform()
else:
    u_init[a + 3, b] = np.random.uniform()

# Parameters:
#   eps     -- time resolution
#   damping -- wave damping
eps = tf.placeholder(tf.float32, shape=())
damping = tf.placeholder(tf.float32, shape=())

# Create variables for simulation state
U = tf.Variable(u_init)
Ut = tf.Variable(ut_init)

# Communicate rows for calculations
bcast = tf.group(tf.assign(U[0:3], hvd.broadcast(U[N - 3:N], 0)),
                 tf.assign(Ut[0:3], hvd.broadcast(Ut[N - 3:N], 0)),
                 tf.assign(U[N - 3:N], hvd.broadcast(U[0:3], 1)),
                 tf.assign(Ut[N - 3:N], hvd.broadcast(Ut[0:3], 1)))

# Discretized PDE update rules
U_ = U + eps * Ut
Ut_ = Ut + eps * (laplace(U) - damping * Ut)

# Operation to update the state
step = tf.group(U.assign(U_), Ut.assign(Ut_))

# Sliced output, N x N
U_slice0 = tf.group(U[0:N].assign(tf.slice(U, [0, 0], [N, N])))
Ut_slice0 = tf.group(Ut[0:N].assign(tf.slice(Ut, [0, 0], [N, N])))
def broadcast(backend, value, root_rank, name):
    bcast_op = hvd.broadcast(tf.constant(value, name=name), root_rank)
    return backend.get_session().run(bcast_op)
def broadcast(backend, value, root_rank, name):
    return _eval(backend, hvd.broadcast(tf.constant(value, name=name), root_rank))
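# A minimal call sketch for the backend-parameterized variants above. It
# assumes Horovod is initialized and that `K` is whatever Keras backend module
# the surrounding code normally passes in (e.g. tf.compat.v1.keras.backend);
# the learning-rate value is illustrative:
initial_lr = broadcast(K, 0.001 * hvd.size(), root_rank=0, name='initial_lr')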
def __call__(self, *args, **kwargs):
    weights = self.wrapped(*args, **kwargs)
    weights = hvd.broadcast(weights, root_rank=0,
                            name='BroadcastingInitializer')
    return weights
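# A minimal sketch of how an initializer wrapper like the __call__ above can
# be used so every rank starts from rank 0's random weights. It assumes TF2 +
# horovod.tensorflow and that the surrounding class stores the wrapped
# initializer as self.wrapped; the class and layer below are illustrative,
# not the original code:
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

class BroadcastingInitializer(tf.keras.initializers.Initializer):
    """Wraps another initializer and broadcasts its output from rank 0."""

    def __init__(self, wrapped):
        self.wrapped = wrapped

    def __call__(self, *args, **kwargs):
        weights = self.wrapped(*args, **kwargs)
        return hvd.broadcast(weights, root_rank=0, name='BroadcastingInitializer')

# All ranks must build layers in the same order so the broadcasts match up.
dense = tf.keras.layers.Dense(
    16, kernel_initializer=BroadcastingInitializer(tf.keras.initializers.GlorotUniform()))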