def end(self, session):
    """Session-end hook: dump the IPU event trace both as a text report and via gcprofile.

    The raw trace is fetched from ``self._report_op``, its string events are
    written to ``<write_dir>/<name>_report.txt``, and the same raw trace is
    handed to gcprofile's ``save_tf_report`` for tool-based analysis.
    """
    import os
    trace_events = session.run(self._report_op)
    report_path = os.path.join(self._write_dir, f'{self._name}_report.txt')
    # Human-readable text report.
    with open(report_path, 'w') as report_file:
        text_report = ipu.utils.extract_all_strings_from_event_trace(trace_events)
        report_file.write(text_report)
    # Machine-readable report for the gcprofile tooling.
    from gcprofile import save_tf_report
    save_tf_report(trace_events)
    print(f"Wrote profiling report to {report_path}")
def get_report(loop_op: tf.Operation,
               infeed_queue_initializer: tf.Operation,
               outfeed_op: tf.Operation,
               report_dest: str,
               available_memory_proportion: Optional[float] = 0.6) -> None:
    """Generate report from running model on IPU and save to disk.

    Args:
        loop_op: Inference op to generate report on.
        infeed_queue_initializer: Initializer for the infeed queue.
        outfeed_op: Outfeed operator.
        report_dest: Location to store report.
        available_memory_proportion: Proportion of tile memory available as
            temporary memory for matmul and convolution execution.
    """
    # Set compile and device options.
    # FIX: `os.environ[...] += ...` raises KeyError when TF_POPLAR_FLAGS is
    # unset; append via .get() with an empty default instead.
    os.environ["TF_POPLAR_FLAGS"] = (os.environ.get("TF_POPLAR_FLAGS", "") +
                                     " --use_ipu_model")
    # NOTE(review): `report_mode` is a module-level name not defined in this
    # function — presumably a global/CLI option; verify it exists at call time.
    use_poplar_text_report = report_mode == 'text'
    opts = ipu_utils.create_ipu_config(
        profiling=True,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=True)
    opts = ipu_utils.set_matmul_options(opts,
                                        matmul_options={
                                            "availableMemoryProportion":
                                            str(available_memory_proportion)
                                        })
    opts = ipu_utils.set_convolution_options(
        opts,
        convolution_options={
            "availableMemoryProportion": str(available_memory_proportion)
        })
    # FIX: keep the returned (updated) config, consistent with the other
    # call sites in this file that do `opts = utils.auto_select_ipus(...)`.
    opts = ipu_utils.auto_select_ipus(opts, [1])
    ipu_utils.configure_ipu_system(opts)

    with tf.device('cpu'):
        report = gen_ipu_ops.ipu_event_trace()

    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    # FIX: use the session as a context manager so it is always closed.
    with tf.Session() as session:
        session.run(infeed_queue_initializer)
        session.run(loop_op, options=run_options)
        session.run(outfeed_op, options=run_options)
        out = session.run(report)

    if report_mode == 'text':
        # Extract the human-readable report and write it to report_dest.
        rep = ipu_utils.extract_all_strings_from_event_trace(out)
        logging.info("Writing profiling report to %s" % report_dest)
        with open(report_dest, "w") as f:
            f.write(rep)
    else:
        # Binary report for the gcprofile tooling.
        save_tf_report(out)
# Warm-up pass: the first sess.run triggers XLA/Poplar compilation, so it is
# timed separately from the performance runs below.
print("Compiling and Warmup...")
start = time.time()
sess.run(inference_output)
convolution_predictions = sess.run(outfeed)
# convolution_predictions = sess.run(inference_output, feed_dict={input_image: np_image})
# Post-process on the host: decode detections from the first outfeed result.
# NOTE(review): assumes the outfeed dequeue returns a sequence whose first
# element is the detection tensor — confirm against the outfeed definition.
raw_output = sess.run(
    decoder, feed_dict={input_detection: convolution_predictions[0]})
filtered_output = process_detections(raw_output)
draw_detections(original_image, original_image_dims[0],
                original_image_dims[1], filtered_output)
print("Done running inference.")
duration = time.time() - start
print("Duration: {:.3f} seconds\n".format(duration))

# Optionally dump the IPU profiling trace captured during the warm-up run,
# both via gcprofile and as a text file named after the image size/IPU count.
if REPORT:
    rep_out = sess.run(report)
    save_tf_report(rep_out)
    rep = utils.extract_all_strings_from_event_trace(rep_out)
    with open(
            str(WIDTH) + "x" + str(HEIGHT) + "_ipus" + str(NUM_IPUS) +
            "_ssd_report.txt", "w") as f:
        f.write(rep)

# Performance runs (post-compilation, so timings reflect steady-state speed).
print("Executing...")
for iter_count in range(N_ITERATIONS):
    print("Running iteration: ", iter_count)
    # Run
    start = time.time()
    sess.run(inference_output)
    convolution_predictions = sess.run(outfeed)
    raw_output = sess.run(
        decoder, feed_dict={input_detection: convolution_predictions[0]})
def train(replication_factor, batch_size, batch_per_step, profile, num_iter,
          time_steps):
    """Launch training.

    Builds an infeed-driven training loop, compiles it for the IPU, runs it
    for `num_iter` outer iterations, and reports average items/sec.  When
    `profile` is set, a single iteration's event trace is captured and saved,
    then the loop exits early.

    Args:
        profile: When truthy, enable IPU profiling and dump one report.
        num_iter: Number of outer `sess.run(total_loss)` calls.
    NOTE(review): `use_poplar_text_report` and `report_dest` are read from
    enclosing/module scope — confirm they are defined where this is called.
    """
    # Set up in-feeds for the data.
    with tf.device('cpu'):
        data_generator = EnvGenerator(batch_size, time_steps)
        # Peek one batch to derive dtypes/shapes for Dataset.from_generator.
        items = next(data_generator)
        output_types = tuple((tf.dtypes.as_dtype(i.dtype) for i in items))
        output_shapes = tuple((tf.TensorShape(i.shape) for i in items))
        total_bytes = 0
        for i in items:
            total_bytes += i.nbytes
        print(f'Input data size = {total_bytes/1000000} MB/batch')
        dataset = tf.data.Dataset.from_generator(data_generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "InfeedQueue", replication_factor=replication_factor)
        data_init = infeed_queue.initializer

    # Compile loss op: repeat build_train_op batch_per_step times on-device,
    # threading a scalar loss accumulator initialised to 0.0.
    with ipu_scope("/device:IPU:0"):
        total_loss = ipu_compiler.compile(
            lambda: loops.repeat(batch_per_step,
                                 build_train_op,
                                 infeed_queue=infeed_queue,
                                 inputs=[tf.constant(0.0, dtype=DTYPE)]))

    # Set up report op optionally.
    if profile:
        with tf.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

    # Set up session on IPU.
    opts = utils.create_ipu_config(
        profiling=profile,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=profile,
        merge_infeed_io_copies=True)
    opts = utils.set_optimization_options(
        opts, max_cross_replica_sum_buffer_size=10000000)
    opts = utils.auto_select_ipus(opts, [replication_factor])
    utils.configure_ipu_system(opts)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=True))

    # Initialize variables.
    utils.move_variable_initialization_to_cpu()
    sess.run([tf.global_variables_initializer(), data_init])

    # Run training and time.
    total_time = 0.0
    total_samples = 0
    skip_iterations = 5  # Initially the infeed may buffer extra input data and
    # first run for IPU includes XLA compile, so skipping these iterations for calculating items/sec.
    for iters in range(num_iter):
        data_generator.reset_counter()
        t0 = time.perf_counter()
        sess.run(total_loss)
        t1 = time.perf_counter()

        if profile:
            # One profiled iteration is enough; save the report and stop.
            raw_reports = sess.run(report)
            if use_poplar_text_report:
                # Extract the report as text.
                rep = utils.extract_all_strings_from_event_trace(raw_reports)
                print("Writing profiling report to %s" % report_dest)
                with open(report_dest, "w") as f:
                    f.write(rep)
            else:
                os.makedirs('profile_rl', exist_ok=True)
                save_tf_report(raw_reports, log_dir='profile_rl')
                print("Writing profiling report to profile_rl")
            break

        # Skip warm-up iterations (compile + infeed priming) in the average.
        if iters > skip_iterations:
            total_time += (t1 - t0)
            total_samples += (batch_size * batch_per_step * replication_factor)

    print("Average %.1f items/sec" % (total_samples / total_time))
def train_with_session(input_fn, cosmoflow_config):
    """Train the CosmoFlow model with a raw TF session on IPU.

    Builds infeed/outfeed queues, compiles an on-device training loop of
    `iterations_per_loop` steps, runs it `num_loops` times, and prints
    per-loop and aggregate throughput.  Optionally captures one IPU event
    trace after the warm-up run when profiling is enabled in the config.
    """
    with tf.device('cpu'):
        infeed_queue = ipu.ipu_infeed_queue.IPUInfeedQueue(
            input_fn(),  # difference in tf.dataset construction changes throughput
            feed_name="training_infeed",
            replication_factor=cosmoflow_config['ipu_config']['num_ipus'])
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            'outfeed',
            replication_factor=cosmoflow_config['ipu_config']['num_ipus'])

    def cosmoflow_training_loop():
        # One training step: forward pass, loss, optimizer update, and
        # enqueue of the loss on the outfeed for host-side logging.
        def body(loss, features, labels):
            with tf.variable_scope("MainGraph"):
                model = get_model(**cosmoflow_config['model'])
                outputs = model(features, training=True)
            train_config = cosmoflow_config['train']
            loss_name = train_config['loss']
            if loss_name == "mse":
                loss = tf.losses.mean_squared_error(labels=labels,
                                                    predictions=outputs)
            else:
                raise NotImplementedError("loss: %s" % loss_name)
            optimizer = tf.train.GradientDescentOptimizer(
                cosmoflow_config['optimizer']['lr'])
            # Average gradients across replicas when running multi-IPU.
            if cosmoflow_config['ipu_config']['num_ipus'] > 1:
                optimizer = CrossReplicaOptimizer(optimizer)
            train_op = optimizer.minimize(loss=loss)
            # Force the update to run before the loss is returned/enqueued.
            with tf.control_dependencies([train_op]):
                return loss, outfeed_queue.enqueue(loss)

        loss = 0.0
        return ipu.loops.repeat(
            cosmoflow_config['ipu_config']['iterations_per_loop'], body,
            [loss], infeed_queue)

    # Compile model
    with ipu.scopes.ipu_scope('/device:IPU:0'):
        res = ipu.ipu_compiler.compile(cosmoflow_training_loop, inputs=[])

    dequeue_outfeed = outfeed_queue.dequeue()
    ipu_options = get_ipu_options(cosmoflow_config)
    ipu.utils.configure_ipu_system(ipu_options)
    ipu.utils.move_variable_initialization_to_cpu()

    data_config = cosmoflow_config['data']
    # remember that effective batch-size is batch-size X num_ipus
    # also note that num_loops is different from num_steps given to IPUEstimator
    num_loops = ((data_config["n_epochs"] * data_config["n_train"]) //
                 (data_config["batch_size"] *
                  cosmoflow_config['ipu_config']['num_ipus'] *
                  cosmoflow_config['ipu_config']['iterations_per_loop']))

    if cosmoflow_config['ipu_config']['profiling']:
        with tf.device('cpu'):
            from tensorflow.compiler.plugin.poplar.ops import gen_ipu_ops
            # Event trace
            trace = gen_ipu_ops.ipu_event_trace()

    with tf.Session() as sess:
        sess.run(infeed_queue.initializer)
        sess.run(tf.global_variables_initializer())

        # Warm up: the first run includes compilation, timed separately.
        print("Compiling and Warmup...")
        start = time.time()
        sess.run(res)
        if cosmoflow_config['ipu_config']['profiling']:
            report = sess.run(trace)
            from gcprofile import save_tf_report
            save_tf_report(report)
        duration = time.time() - start
        print("Duration: {:.3f} seconds\n".format(duration))

        print("Executing...")
        losses = []
        average_batches_per_sec = []
        start = time.time()
        for i in range(num_loops):
            t0 = time.time()
            sess.run(res)
            local_losses = sess.run(dequeue_outfeed)
            duration = time.time() - t0
            average_batches_per_sec.append(
                cosmoflow_config['ipu_config']['iterations_per_loop'] /
                duration)
            report_string = "{:<7.3} sec/itr.".format(duration)
            print(report_string)
            losses.append(local_losses)
        t1 = time.time()

        duration_seconds = t1 - start
        logging.info("Took {:.2f} minutes".format(duration_seconds / 60))
        print('Iteration, Batches/Second, Samples/Second')
        for loop_idx, bps in enumerate(average_batches_per_sec):
            print('{}, {}, {}'.format(
                loop_idx, bps, bps * data_config["batch_size"] *
                cosmoflow_config['ipu_config']['num_ipus']))
        # Effective samples/sec = mean batches/sec * batch_size * num_ipus.
        samples_per_second = np.mean(average_batches_per_sec) * data_config[
            "batch_size"] * cosmoflow_config['ipu_config']['num_ipus']
        print(
            "Took {:.2f} minutes, i.e. {:.0f} samples per second for batch-size {} and no. IPUs = {}"
            .format(duration_seconds / 60, samples_per_second,
                    cosmoflow_config['data']['batch_size'],
                    cosmoflow_config['ipu_config']['num_ipus']))

    # Finalize
    logging.info('All done!')
    return
def end(self, session):
    """Session-end hook: fetch the IPU event trace and save it via gcprofile."""
    trace_events = session.run(self._report)
    # Imported lazily so gcprofile is only required when profiling is on.
    from gcprofile import save_tf_report
    save_tf_report(trace_events)
def train_process(model, LR_Class, opts):
    """Run the full training process for `model` driven by the `opts` dict.

    Builds the training (and optional validation) graphs, optionally restores
    the latest checkpoint, then runs the training loop with periodic logging,
    checkpointing and deferred validation.  When `opts["profile"]` is set, a
    gcprofile report is saved after training.
    """
    # --------------- OPTIONS ---------------------
    epochs = opts["epochs"]
    iterations_per_epoch = DATASET_CONSTANTS[
        opts['dataset']]['NUM_IMAGES'] // opts["total_batch_size"]
    if not opts['iterations']:
        # Derive iteration count and log frequency from epochs.
        iterations = epochs * iterations_per_epoch
        log_freq = iterations_per_epoch // opts['logs_per_epoch']
    else:
        iterations = opts['iterations']
        log_freq = opts['log_freq']

    # Choose a step size that divides log_freq as evenly as possible while
    # staying close to batches_per_step.
    if log_freq < opts['batches_per_step']:
        iterations_per_step = log_freq
    else:
        iterations_per_step = log_freq // int(
            round(log_freq / opts['batches_per_step']))

    iterations_per_valid = iterations_per_epoch
    # np.inf disables checkpointing when ckpts_per_epoch is falsy.
    iterations_per_ckpt = iterations_per_epoch // opts[
        'ckpts_per_epoch'] if opts['ckpts_per_epoch'] else np.inf

    LR = LR_Class(opts, iterations)

    # Rolling windows (one epoch long) for smoothed statistics.
    batch_accs = deque(maxlen=iterations_per_epoch // iterations_per_step)
    batch_losses = deque(maxlen=iterations_per_epoch // iterations_per_step)
    batch_times = deque(maxlen=iterations_per_epoch // iterations_per_step)
    start_all = None

    # -------------- BUILD TRAINING GRAPH ----------------
    train = training_graph(
        model, opts, iterations_per_step * opts["gradients_to_accumulate"])
    train.session.run(train.init)
    train.session.run(train.iterator.initializer)

    # -------------- BUILD VALIDATION GRAPH ----------------
    if opts['validation']:
        valid = validation.initialise_validation(model, opts)

    # -------------- SAVE AND RESTORE --------------
    if opts['ckpts_per_epoch']:
        # Save an initial checkpoint at step 0.
        filepath = train.saver.save(train.session,
                                    opts["checkpoint_path"],
                                    global_step=0)
        print("Saved checkpoint to {}".format(filepath))

    if opts.get('restoring'):
        # Find all "*.ckpt-<N>.index" files and restore the highest N.
        filename_pattern = re.compile(".*ckpt-[0-9]+$")
        ckpt_pattern = re.compile(".*ckpt-([0-9]+)$")
        filenames = sorted(
            [
                os.path.join(opts['logs_path'], f[:-len(".index")])
                for f in os.listdir(opts['logs_path'])
                if filename_pattern.match(f[:-len(".index")])
                and f[-len(".index"):] == ".index"
            ],
            key=lambda x: int(ckpt_pattern.match(x).groups()[0]))
        latest_checkpoint = filenames[-1]
        logging.print_to_file_and_screen(
            "Restoring training from latest checkpoint: {}".format(
                latest_checkpoint), opts)
        ckpt_pattern = re.compile(".*ckpt-([0-9]+)$")
        # Resume one iteration past the restored checkpoint's counter.
        i = int(ckpt_pattern.match(latest_checkpoint).groups()[0]) + 1
        train.saver.restore(train.session, latest_checkpoint)
        epoch = float(opts["total_batch_size"] *
                      (i + iterations_per_step)) / DATASET_CONSTANTS[
                          opts['dataset']]['NUM_IMAGES']
    else:
        i = 0

    # ------------- TRAINING LOOP ----------------
    print_format = (
        "step: {step:6d}, iteration: {iteration:6d}, epoch: {epoch:6.2f}, lr: {lr:6.4g}, loss: {loss_avg:6.3f}, top-1 accuracy: {train_acc_avg:6.3f}%"
        ", img/sec: {img_per_sec:6.2f}, time: {it_time:8.6f}, total_time: {total_time:8.1f}"
    )

    step = 0
    start_all = time.time()

    while i < iterations:
        step += opts["gradients_to_accumulate"]
        # Fire each periodic action when this step crosses its period
        # boundary, on the first step, or near the end of training.
        log_this_step = ((i // log_freq) <
                         ((i + iterations_per_step) // log_freq) or (i == 0)
                         or ((i + (2 * iterations_per_step)) >= iterations))
        ckpt_this_step = (opts["ckpts_per_epoch"] and (
            (i // iterations_per_ckpt) <
            ((i + iterations_per_step) // iterations_per_ckpt) or (i == 0) or
            ((i + (2 * iterations_per_step)) >= iterations)))
        valid_this_step = (opts['validation'] and (
            (i // iterations_per_valid) <
            ((i + iterations_per_step) // iterations_per_valid) or (i == 0) or
            ((i + (2 * iterations_per_step)) >= iterations)))

        # Run Training
        try:
            batch_loss, batch_acc, batch_time, current_lr = training_step(
                train, i + 1, LR.feed_dict_lr(i))
            # NOTE(review): with pipelining the reported LR is rescaled by the
            # loss-scaling factor — confirm this matches training_step's units.
            if opts['pipeline_depth'] > 1:
                current_lr *= opts["loss_scaling"]
        except tf.errors.OpError as e:
            # Surface any graph error as a ResourceExhaustedError for callers.
            raise tf.errors.ResourceExhaustedError(e.node_def, e.op, e.message)

        # Per-iteration time (the step covered iterations_per_step iterations).
        batch_time /= iterations_per_step

        # Calculate Stats
        batch_accs.append([batch_acc])
        batch_losses.append([batch_loss])

        # First step's time includes compilation; exclude it from averages.
        if i != 0:
            batch_times.append([batch_time])

        # Print loss
        if log_this_step:
            train_acc = np.mean(batch_accs)
            train_loss = np.mean(batch_losses)

            if len(batch_times) != 0:
                avg_batch_time = np.mean(batch_times)
            else:
                avg_batch_time = batch_time

            # flush times every time it is reported
            batch_times.clear()

            total_time = time.time() - start_all
            epoch = float(opts["total_batch_size"] *
                          (i + iterations_per_step)) / DATASET_CONSTANTS[
                              opts['dataset']]['NUM_IMAGES']

            stats = OrderedDict([
                ('step', step),
                ('iteration', i + iterations_per_step),
                ('epoch', epoch),
                ('lr', current_lr),
                ('loss_batch', batch_loss),
                ('loss_avg', train_loss),
                ('train_acc_batch', batch_acc),
                ('train_acc_avg', train_acc),
                ('it_time', avg_batch_time),
                ('img_per_sec', opts['total_batch_size'] / avg_batch_time),
                ('total_time', total_time),
            ])

            logging.print_to_file_and_screen(print_format.format(**stats),
                                             opts)
            logging.write_to_csv(stats, i == 0, True, opts)

        if ckpt_this_step:
            filepath = train.saver.save(train.session,
                                        opts["checkpoint_path"],
                                        global_step=i + iterations_per_step)
            print("Saved checkpoint to {}".format(filepath))

        # Eval: validation runs are deferred until after the training loop;
        # only the (iteration, epoch, first_run, checkpoint) tuple is queued.
        if valid_this_step and opts['validation']:
            if 'validation_points' not in locals():
                validation_points = []
            validation_points.append(
                (i + iterations_per_step, epoch, i == 0, filepath))

        i += iterations_per_step

    # ------------ COLLECT PROFILE -----------
    if opts["profile"]:
        from gcprofile import save_tf_report
        save_tf_report(train.session.run(train.profile))

    # ------------ RUN VALIDATION ------------
    if 'validation_points' in locals() and opts['validation']:
        for iteration, epoch, first_run, filepath in validation_points:
            validation.validation_run(valid, filepath, iteration, epoch,
                                      first_run, opts)

    # --------------- CLEANUP ----------------
    train.session.close()
def end(self, session):
    """Session-end hook: save the IPU event trace held in ``self._report``."""
    save_tf_report(session.run(self._report))
# With pipelining, IPU-level profiling is needed to correctly visualise the execution trace. # For pipelined models either SNAKE or HOOF IPU selection orders are advised; # the latter works best when the first and last stage live on the same IPU. # For more info, check ipu.utils.py file or the TensorFlow document: # https://www.graphcore.ai/docs/targeting-the-ipu-from-tensorflow#tensorflow.python.ipu.utils.SelectionOrder. cfg = ipu.utils.create_ipu_config( profiling=args.profile, profile_execution=ipu.utils.ExecutionProfileType.IPU_PROFILE if args.profile else False, selection_order=ipu.utils.SelectionOrder.SNAKE) # Auto select as many IPUs as we want to pipeline across cfg = ipu.utils.auto_select_ipus(cfg, 2) ipu.utils.configure_ipu_system(cfg) with tf.Session() as sess: # Initialize sess.run(init_op) sess.run(infeed_queue.initializer) # Run for step in range(steps): sess.run(compiled_model, {lr: args.learning_rate}) if args.profile and gcprofile_present: raw_reports = sess.run(report) save_tf_report(raw_reports) break # Read the outfeed for the training losses losses = sess.run(outfeed_op) epoch = float(examples_per_step * step / n_examples) print("Epoch {:.1f}, Mean loss: {:.3f}".format( epoch, np.mean(losses)))