def main():
    """Build the graph and run the test, predict or train loop.

    All options are read from the module-level ``a`` namespace.  In order,
    the function:

    1. seeds TensorFlow, NumPy and ``random`` from ``a.seed`` (drawing a
       random seed first when none was supplied);
    2. creates ``a.output_dir`` and writes the options to ``options.json``;
    3. loads the examples, builds the model and registers the PNG-encoding
       fetches plus image, scalar and histogram summaries;
    4. restores the latest checkpoint from ``a.checkpoint`` when one is set;
    5. runs the loop selected by ``a.mode`` ("test", "predict" or "train").

    NOTE: relies on TensorFlow 1.x graph-mode APIs (``tf.set_random_seed``,
    ``tf.train.Supervisor``).

    Raises:
        ValueError: if ``a.checkpoint`` is set but holds no checkpoint.
    """
    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    for k, v in a._get_kwargs():
        print(k, "=", v)

    options_path = os.path.join(a.output_dir, "options.json")
    with open(options_path, "w") as options_file:
        options_file.write(json.dumps(vars(a), sort_keys=True, indent=4))

    examples = load_examples()
    model = create_model(examples.inputs, examples.targets)

    def encode_pngs(name, value):
        """PNG-encode every image of a deprocessed batch tensor."""
        return tf.map_fn(tf.image.encode_png,
                         deprocess(value),
                         dtype=tf.string,
                         name=name + "_pngs")

    # encoding images for saving
    with tf.name_scope("encode_images"):
        display_fetches = {}
        for name, value in examples._asdict().items():
            if "path" in name:
                display_fetches[name] = value
            elif tf.is_numeric_tensor(value):
                display_fetches[name] = encode_pngs(name, value)
        for name, value in model._asdict().items():
            if tf.is_numeric_tensor(value) and "predict_" not in name:
                display_fetches[name] = encode_pngs(name, value)

    # progress report: every model field that is neither a numeric tensor,
    # a grads_and_vars list nor the train op
    with tf.name_scope("progress_summary"):
        progress_fetches = {}
        for name, value in model._asdict().items():
            if (not tf.is_numeric_tensor(value)
                    and "grads_and_vars" not in name
                    and name != "train"):
                progress_fetches[name] = value

    # summaries for model: images, scalars, histograms
    for name, value in examples._asdict().items():
        if tf.is_numeric_tensor(value):
            with tf.name_scope(name + "_summary"):
                tf.summary.image(name, deprocess(value))
    for name, value in model._asdict().items():
        if tf.is_numeric_tensor(value):
            with tf.name_scope(name + "_summary"):
                if "predict_" in name:
                    # discriminators produce values in [0, 1]
                    tf.summary.image(
                        name,
                        tf.image.convert_image_dtype(value, dtype=tf.uint8))
                else:
                    # generators produce values in [-1, 1]
                    tf.summary.image(name, deprocess(value))
        elif "grads_and_vars" in name:
            for grad, var in value:
                tf.summary.histogram(var.op.name + "/gradients", grad)
        elif name != "train":
            tf.summary.scalar(name, value)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    # The Supervisor only gets a log directory when something will be
    # written to it (traces or summaries).
    logdir = a.output_dir if (a.trace_freq > 0
                              or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    with sv.managed_session() as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            if checkpoint is None:
                # latest_checkpoint returns None when the directory holds no
                # checkpoint; fail with a message naming the directory
                # instead of the opaque error saver.restore would raise.
                raise ValueError("no checkpoint found in %s" % a.checkpoint)
            saver.restore(sess, checkpoint)

        # a.max_steps, when given, takes precedence over a.max_epochs
        max_steps = 2**32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            # testing
            # at most, process the test data once
            max_steps = min(examples.steps_per_epoch, max_steps)
            index_path = None
            for _ in range(max_steps):
                results = sess.run(display_fetches)
                filesets = save_images(results)
                for fileset in filesets:
                    print("evaluated image", fileset["name"])
                index_path = append_index(filesets)

            # index_path is only meaningful once at least one step has run
            if max_steps > 0:
                print("wrote index at %s" % index_path)

        elif a.mode == "predict":
            # predicting
            # at most, process the test data once
            max_steps = min(examples.steps_per_epoch, max_steps)
            for _ in range(max_steps):
                results = sess.run(display_fetches)
                fileset = save_predicted_images(results)
                for filename in fileset:
                    print("predicted image", filename)

            print("wrote predicted labels at %s" % a.output_dir)

        elif a.mode == "train":
            # training
            start = time.time()

            for step in range(max_steps):

                def should(freq):
                    """True every `freq` steps and on the final step."""
                    return freq > 0 and ((step + 1) % freq == 0
                                         or step == max_steps - 1)

                options = None
                run_metadata = None
                if should(a.trace_freq):
                    options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                if should(a.progress_freq):
                    fetches["progress"] = progress_fetches

                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches,
                                   options=options,
                                   run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"],
                                                  results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"],
                                           step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(
                        run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we
                    # resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] /
                                            examples.steps_per_epoch)
                    train_step = (results["global_step"] -
                                  1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print(
                        "progress epoch %d step %d image/sec %0.1f remaining %d min"
                        % (train_epoch, train_step, rate, remaining / 60))
                    for name, value in results["progress"].items():
                        print(name, value)

                if should(a.save_freq):
                    print("saving model")
                    saver.save(sess,
                               os.path.join(a.output_dir, "model"),
                               global_step=sv.global_step)

                if sv.should_stop():
                    break
def main():
    """Build the graph and run patch-stitched testing or training.

    All options are read from the module-level ``a`` namespace.

    * In "test" mode a checkpoint is mandatory.  A few architecture options
      are reloaded from its ``options.json``, the input is reflect-padded,
      the model is applied to CROP_SIZE x CROP_SIZE patches taken every
      CROP_SIZE / 2 pixels, and the overlapping patch outputs are averaged
      back into one IMAGE_HEIGHT x IMAGE_WIDTH image.
    * In any other mode the model is built once on the full batch and
      trained.

    Side effects: pins the process to GPU 0 through CUDA_VISIBLE_DEVICES,
    creates ``a.output_dir`` and writes ``options.json`` into it.

    Raises:
        Exception: if test mode is requested without ``a.checkpoint``.
        ValueError: if ``a.checkpoint`` is set but holds no checkpoint.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    if a.mode == "test":
        if a.checkpoint is None:
            raise Exception("checkpoint required for test mode")

        # load some options from the checkpoint
        options = {"which_direction", "ngf", "ndf", "lab_colorization"}
        with open(os.path.join(a.checkpoint, "options.json")) as f:
            for key, val in json.loads(f.read()).items():
                if key in options:
                    print("loaded", key, "=", val)
                    setattr(a, key, val)
        # disable these features in test mode
        a.flip = False

    for k, v in a._get_kwargs():
        print(k, "=", v)

    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    examples = load_examples()
    print("examples count = %d" % examples.count)

    # inputs and targets are [batch_size, height, width, channels]
    if a.mode == "test":
        patch_h_cnt, padding_h = find_patch_and_padding(
            IMAGE_HEIGHT, CROP_SIZE)
        patch_w_cnt, padding_w = find_patch_and_padding(IMAGE_WIDTH, CROP_SIZE)
        paddings = [[0, 0], [padding_h, padding_h], [padding_w, padding_w],
                    [0, 0]]
        inputs_pad = tf.pad(examples.inputs, paddings, "REFLECT")
        targets_pad = tf.pad(examples.targets, paddings, "REFLECT")
        padded_height = IMAGE_HEIGHT + 2 * padding_h
        padded_width = IMAGE_WIDTH + 2 * padding_w
        stitched = tf.zeros([1, padded_height, padded_width, 1],
                            dtype=tf.float32)

        # combine patches into images: run the model on every patch and add
        # its output into `stitched` at the patch position
        first = True
        for row in range(patch_h_cnt):
            for col in range(patch_w_cnt):
                row_index = int(row * CROP_SIZE / 2)
                col_index = int(col * CROP_SIZE / 2)
                patch_begin = [0, row_index, col_index, 0]
                patch_size = [1, CROP_SIZE, CROP_SIZE, 1]
                # The first patch creates the variables; every later patch
                # reuses them (reuse=None is the default, non-reusing mode).
                with tf.variable_scope("create_model",
                                       reuse=None if first else True):
                    model = create_model(
                        tf.slice(inputs_pad, patch_begin, patch_size),
                        tf.slice(targets_pad, patch_begin, patch_size))
                first = False
                paddings = [
                    [0, 0],
                    [row_index, padded_height - CROP_SIZE - row_index],
                    [col_index, padded_width - CROP_SIZE - col_index],
                    [0, 0],
                ]
                stitched = stitched + tf.pad(model.outputs, paddings,
                                             "CONSTANT")

        half = int(CROP_SIZE / 2)

        def region(top, left, height, width):
            """Return `stitched` zeroed everywhere outside one rectangle."""
            piece = tf.slice(stitched, [0, top, left, 0],
                             [1, height, width, 1])
            return tf.pad(piece, [
                [0, 0],
                [top, padded_height - top - height],
                [left, padded_width - left - width],
                [0, 0],
            ], "CONSTANT")

        inner_height = padded_height - 2 * half
        inner_width = padded_width - 2 * half
        bottom = padded_height - half
        right = padded_width - half

        # four half-patch corners (kept as-is)
        o_11 = region(0, 0, half, half)
        o_12 = region(0, right, half, half)
        o_13 = region(bottom, 0, half, half)
        o_14 = region(bottom, right, half, half)
        # four border strips (halved)
        o_21 = region(0, half, half, inner_width)
        o_22 = region(half, 0, inner_height, half)
        o_23 = region(bottom, half, half, inner_width)
        o_24 = region(half, right, inner_height, half)
        # interior (quartered)
        o_4 = region(half, half, inner_height, inner_width)

        # NOTE(review): the divisors 1 / 2 / 4 presumably match how many
        # half-overlapping patches cover each region; that depends on the
        # counts returned by find_patch_and_padding - confirm.
        outputs = o_11 + o_12 + o_13 + o_14 + (o_21 + o_22 + o_23 +
                                               o_24) / 2 + o_4 / 4
        # crop the reflect padding away again
        outputs = tf.slice(outputs, [0, padding_h, padding_w, 0],
                           [1, IMAGE_HEIGHT, IMAGE_WIDTH, 1])
        outputs = deprocess(outputs)
    else:
        with tf.variable_scope("create_model"):
            model = create_model(examples.inputs, examples.targets)
        outputs = deprocess(model.outputs)

    inputs = deprocess(examples.inputs)
    targets = deprocess(examples.targets)

    def convert(image):
        return tf.image.convert_image_dtype(image,
                                            dtype=tf.uint8,
                                            saturate=True)

    # reverse any processing on images so they can be written to disk or
    # displayed to user
    with tf.name_scope("convert_inputs"):
        converted_inputs = convert(inputs)

    with tf.name_scope("convert_targets"):
        converted_targets = convert(targets)

    with tf.name_scope("convert_outputs"):
        converted_outputs = convert(outputs)

    with tf.name_scope("encode_images"):
        display_fetches = {
            "paths": examples.paths,
            "inputs": tf.map_fn(tf.image.encode_png,
                                converted_inputs,
                                dtype=tf.string,
                                name="input_pngs"),
            "targets": tf.map_fn(tf.image.encode_png,
                                 converted_targets,
                                 dtype=tf.string,
                                 name="target_pngs"),
            "outputs": tf.map_fn(tf.image.encode_png,
                                 converted_outputs,
                                 dtype=tf.string,
                                 name="output_pngs"),
        }

    # summaries
    with tf.name_scope("inputs_summary"):
        tf.summary.image("inputs", converted_inputs)

    with tf.name_scope("targets_summary"):
        tf.summary.image("targets", converted_targets)

    with tf.name_scope("outputs_summary"):
        tf.summary.image("outputs", converted_outputs)

    tf.summary.scalar("generator_loss_L1", model.gen_loss_L1)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    if a.mode == "train":
        for grad, var in model.gen_grads_and_vars:
            tf.summary.histogram(var.op.name + "/gradients", grad)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    logdir = a.output_dir if (a.trace_freq > 0
                              or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    with sv.managed_session() as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            if checkpoint is None:
                # latest_checkpoint returns None when the directory holds no
                # checkpoint; report the directory instead of letting
                # saver.restore fail on a None path.
                raise ValueError("no checkpoint found in %s" % a.checkpoint)
            saver.restore(sess, checkpoint)

        # a.max_steps, when given, takes precedence over a.max_epochs
        max_steps = 2**32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            # testing
            # at most, process the test data once
            start = time.time()
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                filesets = save_images_test(results, step)
                for fileset in filesets:
                    print("evaluated image", fileset["name"])
            # average seconds per step; undefined when nothing was run
            if max_steps > 0:
                print("rate", (time.time() - start) / max_steps)
        else:
            # training
            start = time.time()

            for step in range(max_steps):

                def should(freq):
                    """True every `freq` steps and on the final step."""
                    return freq > 0 and ((step + 1) % freq == 0
                                         or step == max_steps - 1)

                options = None
                run_metadata = None
                if should(a.trace_freq):
                    options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                if should(a.progress_freq):
                    fetches["gen_loss_L1"] = model.gen_loss_L1

                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches,
                                   options=options,
                                   run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"],
                                                  results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"],
                                           step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(
                        run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we
                    # resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] /
                                            examples.steps_per_epoch)
                    train_step = (results["global_step"] -
                                  1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print(
                        "progress epoch %d step %d image/sec %0.1f remaining %dm"
                        % (train_epoch, train_step, rate, remaining / 60))
                    print("gen_loss_L1", results["gen_loss_L1"])

                if should(a.save_freq):
                    print("saving model")
                    saver.save(sess,
                               os.path.join(a.output_dir, "model"),
                               global_step=sv.global_step)

                if sv.should_stop():
                    break
def benchmark_model(self,
                    warmup_runs,
                    bm_runs,
                    num_threads,
                    trace_filename=None):
    """Benchmark model.

    Builds the model in a fresh graph, feeds it one random input of shape
    `self.inputs_shape`, runs `warmup_runs` untimed-for-statistics warm-up
    steps and then `bm_runs` timed steps, and prints latency statistics and
    the resulting images-per-second figure.

    Args:
      warmup_runs: number of warm-up runs executed before timing starts.
      bm_runs: number of timed runs.  When more than four runs are timed,
        the two fastest and two slowest are discarded before the statistics
        are computed.
      num_threads: intra-op thread count for the session; values <= 0 keep
        the default session configuration.
      trace_filename: optional path; when set, one extra fully traced run is
        executed halfway through the benchmark and written there as a Chrome
        trace.
    """
    graphdef = None
    if self.tensorrt:
        print('Using tensorrt ', self.tensorrt)
        self.build_and_save_model()
        graphdef = self.freeze_model()

    if num_threads > 0:
        print('num_threads for benchmarking: {}'.format(num_threads))
        sess_config = tf.ConfigProto(
            intra_op_parallelism_threads=num_threads,
            inter_op_parallelism_threads=1)
    else:
        sess_config = tf.ConfigProto()

    # rewriter_config_pb2.RewriterConfig.OFF
    sess_config.graph_options.rewrite_options.dependency_optimization = 2
    if self.use_xla:
        sess_config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_2)

    with tf.Graph().as_default(), tf.Session(config=sess_config) as sess:
        inputs = tf.placeholder(tf.float32,
                                name='input',
                                shape=self.inputs_shape)
        output = self.build_model(inputs, is_training=False)

        img = np.random.uniform(size=self.inputs_shape)

        sess.run(tf.global_variables_initializer())
        if self.tensorrt:
            fetches = [inputs.name] + [i.name for i in output]
            goutput = self.convert_tr(graphdef, fetches)
            inputs, output = goutput[0], goutput[1:]

        if not self.use_xla:
            # Don't use tf.group because XLA removes the whole graph for
            # tf.group.
            output = tf.group(*output)

        for i in range(warmup_runs):
            start_time = time.time()
            sess.run(output, feed_dict={inputs: img})
            print('Warm up: {} {:.4f}s'.format(i, time.time() - start_time))

        print('Start benchmark runs total={}'.format(bm_runs))
        timev = []
        for i in range(bm_runs):
            if trace_filename and i == (bm_runs // 2):
                # One additional traced run; it is not added to `timev`.
                run_options = tf.RunOptions()
                run_options.trace_level = tf.RunOptions.FULL_TRACE
                run_metadata = tf.RunMetadata()
                sess.run(output,
                         feed_dict={inputs: img},
                         options=run_options,
                         run_metadata=run_metadata)
                logging.info('Dumping trace to %s', trace_filename)
                trace_dir = os.path.dirname(trace_filename)
                # dirname is '' for a bare filename; there is nothing to
                # create in that case.
                if trace_dir and not tf.io.gfile.exists(trace_dir):
                    tf.io.gfile.makedirs(trace_dir)
                with tf.io.gfile.GFile(trace_filename, 'w') as trace_file:
                    from tensorflow.python.client import timeline  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
                    trace = timeline.Timeline(
                        step_stats=run_metadata.step_stats)
                    trace_file.write(
                        trace.generate_chrome_trace_format(show_memory=True))

            start_time = time.time()
            sess.run(output, feed_dict={inputs: img})
            timev.append(time.time() - start_time)

        timev.sort()
        # Discard the two fastest and two slowest runs as outliers, but only
        # when samples remain afterwards; trimming four or fewer runs would
        # leave nothing to compute statistics on.
        if len(timev) > 4:
            timev = timev[2:-2]
        if not timev:
            print('No benchmark runs were timed; skipping statistics.')
            return

        print(
            '{} {}runs {}threads: mean {:.4f} std {:.4f} min {:.4f} max {:.4f}'
            .format(self.model_name, len(timev), num_threads, np.mean(timev),
                    np.std(timev), np.min(timev), np.max(timev)))
        print('Images per second FPS = {:.1f}'.format(
            self.batch_size / float(np.mean(timev))))
def train(train_phases,model,minibatch,\
            sess,train_stat,ph_misc_stat,summary_writer):
    """Iterate over the training phases and epochs, timing each stage.

    NOTE: the local flag ``BYPASS`` is hard-coded to ``True``.  With it set,
    every minibatch is still prepared via ``minibatch.feed_dict`` but the
    loop ``continue``s before ``sess.run``, and each epoch ``continue``s
    before evaluation, checkpointing and TensorBoard logging.  As a result
    ``time_train`` stays 0 and the function effectively only exercises the
    minibatch sampler, whose ``sampling_time`` is printed at the end.

    Args:
        train_phases: iterable of phase configs; each is passed to
            ``minibatch.set_sampler`` and its ``'end'`` entry gives the
            (exclusive) last epoch of the phase.
        model: model object providing ``opt_op``, ``loss``, ``preds`` and
            ``sigmoid_loss``.
        minibatch: minibatch sampler/iterator (``set_sampler``, ``shuffle``,
            ``end``, ``feed_dict``, ``batch_num``, ``size_subgraph``,
            ``sampling_time``).
        sess: TensorFlow session used for training.
        train_stat: sequence of summary ops; index 0 is fetched with each
            training step, index 1 with the per-epoch statistics.
        ph_misc_stat: dict of placeholders fed with the per-epoch statistics.
        summary_writer: TensorBoard summary writer.

    Returns:
        None.
    """
    import time
    # Hard-coded switch: when True, skip all TensorFlow work (see docstring).
    BYPASS = True
    # saver = tf.train.Saver(var_list=tf.trainable_variables())
    saver = tf.train.Saver()

    epoch_ph_start = 0
    f1mic_best, e_best = 0, 0
    time_calc_f1, time_train, time_prepare = 0, 0, 0
    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE, report_tensor_allocations_upon_oom=True)
    run_metadata = tf.RunMetadata()
    many_runs_timeline = []  # only used when TF timeline is enabled
    for ip, phase in enumerate(train_phases):
        # We normally only have a single phase of training (see README for defn of 'phase').
        # On the other hand, our implementation does support multi-phase training.
        # e.g., you can use smaller subgraphs during initial epochs and larger subgraphs
        # when closer to convergence. -- This might speed up convergence.
        minibatch.set_sampler(phase)
        num_batches = minibatch.num_training_batches()
        #printf('START PHASE {:4d}'.format(ip),style='underline')
        # Epoch numbering continues across phases via epoch_ph_start.
        for e in range(epoch_ph_start, int(phase['end'])):
            #printf('Epoch {:4d}'.format(e),style='bold')
            minibatch.shuffle()
            l_loss_tr, l_f1mic_tr, l_f1mac_tr, l_size_subg = [], [], [], []
            time_train_ep, time_prepare_ep = 0, 0
            while not minibatch.end():
                t0 = time.time()
                feed_dict, labels = minibatch.feed_dict(mode='train')
                t1 = time.time()
                # With BYPASS set, only the minibatch preparation above runs;
                # the per-epoch timers below are never updated.
                if BYPASS:
                    continue
                if args_global.timeline:  # profile the code with Tensorflow Timeline
                    _,__,loss_train,pred_train = sess.run([train_stat[0], \
                            model.opt_op, model.loss, model.preds], feed_dict=feed_dict, \
                            options=options, run_metadata=run_metadata)
                    fetched_timeline = timeline.Timeline(
                        run_metadata.step_stats)
                    chrome_trace = fetched_timeline.generate_chrome_trace_format(
                    )
                    many_runs_timeline.append(chrome_trace)
                else:
                    _,__,loss_train,pred_train = sess.run([train_stat[0], \
                            model.opt_op, model.loss, model.preds], feed_dict=feed_dict, \
                            options=tf.RunOptions(report_tensor_allocations_upon_oom=True))
                t2 = time.time()
                time_train_ep += t2 - t1
                time_prepare_ep += t1 - t0
                # Record training metrics every eval_train_every batches.
                if not minibatch.batch_num % args_global.eval_train_every:
                    f1_mic, f1_mac = calc_f1(labels, pred_train, model.sigmoid_loss)
                    l_loss_tr.append(loss_train)
                    l_f1mic_tr.append(f1_mic)
                    l_f1mac_tr.append(f1_mac)
                    l_size_subg.append(minibatch.size_subgraph)
            # With BYPASS set, skip evaluation, checkpointing and logging.
            if BYPASS:
                continue
            time_train += time_train_ep
            time_prepare += time_prepare_ep
            if args_global.cpu_eval:  # Full batch evaluation using CPU
                # we have to start a new session so that CPU can perform full-batch eval.
                # current model params are communicated to the new session via tmp.chkpt
                saver.save(sess, './tmp.chkpt')
                with tf.device('/cpu:0'):
                    # NOTE(review): a new CPU session is created on every
                    # epoch and never closed here - confirm this is intended.
                    sess_cpu = tf.Session(config=tf.ConfigProto(
                        device_count={'GPU': 0}))
                    sess_cpu.run(tf.global_variables_initializer())
                    saver = tf.train.Saver()
                    saver.restore(sess_cpu, './tmp.chkpt')
                    sess_eval = sess_cpu
            else:
                sess_eval = sess
            loss_val,f1mic_val,f1mac_val,time_eval = \
                evaluate_full_batch(sess_eval,model,minibatch,many_runs_timeline,mode='val')
            #printf(' TRAIN (Ep avg): loss = {:.4f}\tmic = {:.4f}\tmac = {:.4f}\ttrain time = {:.4f} sec'.format(f_mean(l_loss_tr),f_mean(l_f1mic_tr),f_mean(l_f1mac_tr),time_train_ep))
            #printf(' VALIDATION: loss = {:.4f}\tmic = {:.4f}\tmac = {:.4f}'.format(loss_val,f1mic_val,f1mac_val),style='yellow')
            # Checkpoint whenever the validation micro-F1 improves.
            if f1mic_val > f1mic_best:
                f1mic_best, e_best = f1mic_val, e
                if not os.path.exists(args_global.dir_log + '/models'):
                    os.makedirs(args_global.dir_log + '/models')
                #print(' Saving models ...')
                savepath = saver.save(sess, '{}/models/saved_model_{}.chkpt'.format(
                    args_global.dir_log, timestamp).replace(' ', '_'),
                    write_meta_graph=False, write_state=False)

            if args_global.tensorboard:
                misc_stat = sess.run([train_stat[1]],feed_dict={\
                                        ph_misc_stat['val_f1_micro']: f1mic_val,
                                        ph_misc_stat['val_f1_macro']: f1mac_val,
                                        ph_misc_stat['train_f1_micro']: f_mean(l_f1mic_tr),
                                        ph_misc_stat['train_f1_macro']: f_mean(l_f1mac_tr),
                                        ph_misc_stat['time_per_epoch']: time_train_ep+time_prepare_ep,
                                        ph_misc_stat['size_subgraph']: f_mean(l_size_subg)})
                # tensorboard visualization
                # `_` is the train_stat[0] value fetched by the last training step
                summary_writer.add_summary(_, e)
                summary_writer.add_summary(misc_stat[0], e)
        epoch_ph_start = int(phase['end'])
    printf("Optimization Finished!", style='yellow')
    # Merge the per-step Chrome traces (empty unless the timeline branch ran).
    timelines = TimeLiner()
    for tl in many_runs_timeline:
        timelines.update_timeline(tl)
    timelines.save('timeline.json')
    # The string literal below is disabled code (final validation/test pass).
    ''' saver.restore(sess_eval, '{}/models/saved_model_{}.chkpt'.format(args_global.dir_log,timestamp).replace(' ','_')) loss_val, f1mic_val, f1mac_val, duration = evaluate_full_batch(sess_eval,model,minibatch,many_runs_timeline,mode='val') printf("Full validation (Epoch {:4d}): \n F1_Micro = {:.4f}\tF1_Macro = {:.4f}".format(e_best,f1mic_val,f1mac_val),style='red') loss_test, f1mic_test, f1mac_test, duration = evaluate_full_batch(sess_eval,model,minibatch,many_runs_timeline,mode='test') printf("Full test stats: \n F1_Micro = {:.4f}\tF1_Macro = {:.4f}".format(f1mic_test,f1mac_test),style='red') '''
    printf('Total training time: {:6.2f} sec'.format(time_train), style='red')
    #ret = {'loss_val_opt':loss_val,'f1mic_val_opt':f1mic_val,'f1mac_val_opt':f1mac_val,\
    #        'loss_test_opt':loss_test,'f1mic_test_opt':f1mic_test,'f1mac_test_opt':f1mac_test,\
    #        'epoch_best':e_best,
    #        'time_train': time_train}
    print("sampling_time (graphsaint)", minibatch.sampling_time)
    print("training_time:", time_train)
    return  # everything is logged by TF. no need to return anything
def word2vec_run(config: Config.ConfigCls, identifier: str, doenload: bool,
                 sim_chk: bool):
    """Build, train and visualise a skip-gram word2vec model.

    The run writes its log, TensorBoard summaries, vocabulary metadata,
    model checkpoint and embedding plot to locations taken from `config`;
    the output directory is `config.OutDirGet()` joined with `identifier`
    and is stored back into `config` via `OutDirSet`.

    Args:
        config: project configuration object; only the accessor methods
            called below are relied on.
        identifier: name of this run; used as the output sub-directory and
            in the log header.
        doenload: (sic - the misspelling is part of the public signature)
            when true the corpus is fetched with `Base.maybe_download`,
            otherwise the file is expected at DownloadDir()/DownloadFile().
        sim_chk: when true, the nearest neighbours of the validation words
            are logged every `config.RepSimStep()` steps.

    Returns:
        None.
    """
    out_dir = os.path.join(config.OutDirGet(), identifier)
    config.OutDirSet(out_dir)
    Base.SaveDir(out_dir)
    data_index = 0

    LogWrite(config, """\n\n\n\nWord to Vec """ + identifier)
    LogWrite(
        config,
        """Example of building, training and visualizing a word2vec model.""")

    ###########################################################################
    LogWrite(config, '\n\nStep 1: Download the data.')
    url = config.DownloadUrl()
    download_dir = config.DownloadDir()
    download_file = config.DownloadFile()
    expected_size = config.DownloadSize()
    if doenload:
        filename = Base.maybe_download(download_dir, url, download_file,
                                       expected_size)
    else:
        filename = os.path.join(download_dir, download_file)

    LogWrite(config, 'Read the data into a list of strings.')
    vocabulary = Base.read_data(filename)
    LogWrite(config, 'Data size', len(vocabulary))

    ###########################################################################
    LogWrite(
        config,
        '\n\nStep 2: Build the dictionary and replace rare words with UNK token.'
    )
    vocabulary_size = config.SessionVocSizeGet()
    data, count, unused_dictionary, reverse_dictionary = DataSet.build_dataset(
        vocabulary, vocabulary_size)

    del vocabulary  # Hint to reduce memory.
    LogWrite(config, 'Most common words (+UNK)', count[:5])
    LogWrite(config, 'Sample data', data[:10],
             [reverse_dictionary[i] for i in data[:10]])

    ###########################################################################
    LogWrite(
        config,
        '\n\nStep 3: Function to generate a training batch for the skip-gram model.'
    )
    # Demo batch only; note that it advances data_index, so training below
    # continues from where this batch stopped.
    batch, labels, data_index = DataSet.generate_batch(data=data,
                                                       data_index=data_index,
                                                       batch_size=8,
                                                       num_skips=2,
                                                       skip_window=1)
    for i in range(8):
        LogWrite(config, batch[i], reverse_dictionary[batch[i]], '->',
                 labels[i, 0], reverse_dictionary[labels[i, 0]])

    ###########################################################################
    LogWrite(config, '\n\nStep 4: Build and train a skip-gram model.')
    batch_size = config.ModelBatchSizeGet()
    # Dimension of the embedding vector.
    embedding_size = config.ModelEmbedSizeGet()
    # How many words to consider left and right.
    skip_window = config.ModelSkipWindowGet()
    # How many times to reuse an input to generate a label.
    num_skips = config.ModelNumSkipsGet()
    # Number of negative examples to sample.
    num_sampled = config.ModelNumSampledGet()

    # We pick a random validation set to sample nearest neighbors. Here we
    # limit the validation samples to the words that have a low numeric ID,
    # which by construction are also the most frequent. These 3 variables are
    # used only for displaying model accuracy, they don't affect calculation.
    # Random set of words to evaluate similarity on.
    valid_size = config.ValidationSize()
    # Only pick dev samples in the head of the distribution.
    valid_window = config.ValidationWindow()
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    graph = tf.Graph()

    with graph.as_default():

        # Input data.
        with tf.name_scope('inputs'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # Ops and variables pinned to the CPU because of missing GPU
        # implementation
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0,
                                      1.0))
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # Construct the variables for the NCE loss
            with tf.name_scope('weights'):
                nce_weights = tf.Variable(
                    tf.truncated_normal([vocabulary_size, embedding_size],
                                        stddev=1.0 /
                                        math.sqrt(embedding_size)))
            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels
        # each time we evaluate the loss.
        # Explanation of the meaning of NCE loss:
        #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=nce_weights,
                               biases=nce_biases,
                               labels=train_labels,
                               inputs=embed,
                               num_sampled=num_sampled,
                               num_classes=vocabulary_size))

        # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)

        # Construct the SGD optimizer using a learning rate of 1.0.
        with tf.name_scope('optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all
        # embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               normalized_embeddings,
                               transpose_b=True)

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver.
        saver = tf.train.Saver()

    ###########################################################################
    LogWrite(config, '\n\nStep 5: Begin training.')

    with tf.Session(graph=graph) as session:
        # Open a writer to write summaries.
        writer = tf.summary.FileWriter(out_dir, session.graph)
        try:
            # We must initialize all variables before we use them.
            init.run()
            LogWrite(config, 'Initialized')

            average_loss = 0
            num_steps = config.SessionStepsGet()
            for step in range(num_steps):
                batch_inputs, batch_labels, data_index = DataSet.generate_batch(
                    data, data_index, batch_size, num_skips, skip_window)
                feed_dict = {
                    train_inputs: batch_inputs,
                    train_labels: batch_labels
                }

                # Define metadata variable.
                run_metadata = tf.RunMetadata()

                # We perform one update step by evaluating the optimizer op
                # (including it in the list of returned values for
                # session.run()). Also, evaluate the merged op to get all
                # summaries from the returned "summary" variable. Feed
                # metadata variable to session for visualizing the graph in
                # TensorBoard.
                _, summary, loss_val = session.run([optimizer, merged, loss],
                                                   feed_dict=feed_dict,
                                                   run_metadata=run_metadata)
                average_loss += loss_val

                # Add returned summaries to writer in each step.
                writer.add_summary(summary, step)
                # Add metadata to visualize the graph for the last run.
                if step == (num_steps - 1):
                    writer.add_run_metadata(run_metadata, 'step%d' % step)

                loss_step = config.RepLossStep()
                if step % loss_step == 0:
                    if step > 0:
                        average_loss /= loss_step
                    # The average loss is an estimate of the loss over the
                    # last `loss_step` batches.
                    LogWrite(config, 'Average loss at step ', step, ': ',
                             average_loss)
                    average_loss = 0

                # Note that this is expensive (~20% slowdown if computed
                # every 500 steps)
                if sim_chk:
                    sim_eval_step = config.RepSimStep()
                    if step % sim_eval_step == 0:
                        sim = similarity.eval()
                        for i in range(valid_size):
                            valid_word = reverse_dictionary[valid_examples[i]]
                            top_k = 8  # number of nearest neighbors
                            # Index 0 is the word itself, so skip it.  Iterate
                            # over what argsort returned so a vocabulary
                            # smaller than top_k + 1 cannot index out of
                            # range.
                            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                            neighbours = ''.join(
                                ' %s,' % reverse_dictionary[word_id]
                                for word_id in nearest)
                            LogWrite(config,
                                     ('Nearest to %s:' % valid_word) +
                                     neighbours)

            final_embeddings = normalized_embeddings.eval()

            # Write corresponding labels for the embeddings.
            meta_file = config.OutMetaFile()
            with open(meta_file, 'w') as f:
                for i in range(vocabulary_size):
                    f.write(reverse_dictionary[i] + '\n')

            # Save the model for checkpoints.
            mode_file = config.OutModelFile()
            saver.save(session, mode_file)

            # Create a configuration for visualizing embeddings with the
            # labels in TensorBoard.
            vis_config = projector.ProjectorConfig()
            embedding_conf = vis_config.embeddings.add()
            embedding_conf.tensor_name = embeddings.name
            embedding_conf.metadata_path = meta_file
            projector.visualize_embeddings(writer, vis_config)
        finally:
            # Close the writer even when training fails so buffered events
            # are flushed and the file handle is released.
            writer.close()

    ###########################################################################
    LogWrite(config, '\n\nStep 6: Visualize the embeddings.')
    plot_file = config.OutPlotFile()
    Plotter.PlotGraph(final_embeddings, reverse_dictionary, plot_file)
def main():
    """Export, test, or train the image-to-image model selected by ``a.mode``.

    Steps, in order:

    1. Seed the TensorFlow, NumPy and ``random`` generators from ``a.seed``
       (a random seed is drawn first when ``a.seed`` is None) and make sure
       ``a.output_dir`` exists.
    2. In "test" and "export" modes, require ``a.checkpoint`` and copy a
       fixed set of options from the checkpoint's ``options.json`` onto
       ``a``; scaling and flipping are disabled.
    3. Print all options and write them to ``<output_dir>/options.json``.
    4. In "export" mode, build a stand-alone generator graph that maps a
       base64-encoded image string to a base64-encoded PNG/JPEG string,
       restore the weights, write ``export.meta`` plus the ``export``
       checkpoint, and return.
    5. Otherwise build the input pipeline, the model, display fetches and
       summaries, then either run one pass over the test data ("test") or
       run the training loop (any other mode).

    Raises:
        Exception: if a checkpoint is missing in test/export mode, if export
            is requested together with ``lab_colorization``, if
            ``a.output_filetype`` is neither "png" nor "jpeg", or if
            ``a.which_direction`` is neither "AtoB" nor "BtoA" when
            ``lab_colorization`` is enabled.
    """
    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    # Use the same seed for every random source used by this script.
    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    if a.mode == "test" or a.mode == "export":
        if a.checkpoint is None:
            raise Exception("checkpoint required for test mode")

        # load some options from the checkpoint
        options = {"which_direction", "ngf", "ndf", "lab_colorization"}
        with open(os.path.join(a.checkpoint, "options.json")) as f:
            for key, val in json.loads(f.read()).items():
                if key in options:
                    print("loaded", key, "=", val)
                    setattr(a, key, val)
        # disable these features in test mode
        a.scale_size = CROP_SIZE
        a.flip = False

    for k, v in a._get_kwargs():
        print(k, "=", v)

    # Persist the effective options next to the outputs.
    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    if a.mode == "export":
        # export the generator to a meta graph that can be imported later for standalone generation
        if a.lab_colorization:
            raise Exception("export not supported for lab_colorization")

        # NOTE: `input` shadows the builtin of the same name inside this function.
        input = tf.placeholder(tf.string, shape=[1])
        input_data = tf.decode_base64(input[0])
        input_image = tf.image.decode_png(input_data)

        # remove alpha channel if present
        input_image = tf.cond(tf.equal(tf.shape(input_image)[2], 4), lambda: input_image[:,:,:3], lambda: input_image)
        # convert grayscale to RGB
        input_image = tf.cond(tf.equal(tf.shape(input_image)[2], 1), lambda: tf.image.grayscale_to_rgb(input_image), lambda: input_image)

        input_image = tf.image.convert_image_dtype(input_image, dtype=tf.float32)
        # The static shape is pinned to CROP_SIZE x CROP_SIZE x 3 before batching.
        input_image.set_shape([CROP_SIZE, CROP_SIZE, 3])
        batch_input = tf.expand_dims(input_image, axis=0)

        with tf.variable_scope("generator"):
            batch_output = deprocess(create_generator(preprocess(batch_input), 3))

        output_image = tf.image.convert_image_dtype(batch_output, dtype=tf.uint8)[0]
        if a.output_filetype == "png":
            output_data = tf.image.encode_png(output_image)
        elif a.output_filetype == "jpeg":
            output_data = tf.image.encode_jpeg(output_image, quality=80)
        else:
            raise Exception("invalid filetype")
        output = tf.convert_to_tensor([tf.encode_base64(output_data)])

        # Tensor names are published as JSON in the "inputs"/"outputs"
        # graph collections so that a consumer of the meta graph can find them.
        key = tf.placeholder(tf.string, shape=[1])
        inputs = {
            "key": key.name,
            "input": input.name
        }
        tf.add_to_collection("inputs", json.dumps(inputs))
        outputs = {
            "key": tf.identity(key).name,
            "output": output.name,
        }
        tf.add_to_collection("outputs", json.dumps(outputs))

        init_op = tf.global_variables_initializer()
        restore_saver = tf.train.Saver()
        export_saver = tf.train.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth=True
        with tf.Session(config=config) as sess:
            sess.run(init_op)
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            restore_saver.restore(sess, checkpoint)
            print("exporting model")
            export_saver.export_meta_graph(filename=os.path.join(a.output_dir, "export.meta"))
            export_saver.save(sess, os.path.join(a.output_dir, "export"), write_meta_graph=False)

        return

    examples = load_examples()
    print("examples count = %d" % examples.count)

    # inputs and targets are [batch_size, height, width, channels]
    model = create_model(examples.inputs, examples.targets)

    # undo colorization splitting on images that we use for display/output
    if a.lab_colorization:
        if a.which_direction == "AtoB":
            # inputs is brightness, this will be handled fine as a grayscale image
            # need to augment targets and outputs with brightness
            targets = augment(examples.targets, examples.inputs)
            outputs = augment(model.outputs, examples.inputs)
            # inputs can be deprocessed normally and handled as if they are single channel
            # grayscale images
            inputs = deprocess(examples.inputs)
        elif a.which_direction == "BtoA":
            # inputs will be color channels only, get brightness from targets
            inputs = augment(examples.inputs, examples.targets)
            targets = deprocess(examples.targets)
            outputs = deprocess(model.outputs)
        else:
            raise Exception("invalid direction")
    else:
        inputs = deprocess(examples.inputs)
        targets = deprocess(examples.targets)
        outputs = deprocess(model.outputs)

    def convert(image):
        """Resize `image` to the configured aspect ratio (if not 1.0) and cast it to uint8."""
        if a.aspect_ratio != 1.0:
            # upscale to correct aspect ratio
            size = [CROP_SIZE, int(round(CROP_SIZE * a.aspect_ratio))]
            image = tf.image.resize_images(image, size=size, method=tf.image.ResizeMethod.BICUBIC)

        return tf.image.convert_image_dtype(image, dtype=tf.uint8, saturate=True)

    # reverse any processing on images so they can be written to disk or displayed to user
    with tf.name_scope("convert_inputs"):
        converted_inputs = convert(inputs)

    with tf.name_scope("convert_targets"):
        converted_targets = convert(targets)

    with tf.name_scope("convert_outputs"):
        converted_outputs = convert(outputs)

    # PNG-encode every image of the batch; these tensors are what gets
    # fetched whenever images have to be written to disk.
    with tf.name_scope("encode_images"):
        display_fetches = {
            "paths": examples.paths,
            "inputs": tf.map_fn(tf.image.encode_png, converted_inputs, dtype=tf.string, name="input_pngs"),
            "targets": tf.map_fn(tf.image.encode_png, converted_targets, dtype=tf.string, name="target_pngs"),
            "outputs": tf.map_fn(tf.image.encode_png, converted_outputs, dtype=tf.string, name="output_pngs"),
        }

    # summaries
    with tf.name_scope("inputs_summary"):
        tf.summary.image("inputs", converted_inputs)

    with tf.name_scope("targets_summary"):
        tf.summary.image("targets", converted_targets)

    with tf.name_scope("outputs_summary"):
        tf.summary.image("outputs", converted_outputs)

    with tf.name_scope("predict_real_summary"):
        tf.summary.image("predict_real", tf.image.convert_image_dtype(model.predict_real, dtype=tf.uint8))

    with tf.name_scope("predict_fake_summary"):
        tf.summary.image("predict_fake", tf.image.convert_image_dtype(model.predict_fake, dtype=tf.uint8))

    tf.summary.scalar("discriminator_loss", model.discrim_loss)
    tf.summary.scalar("generator_loss_GAN", model.gen_loss_GAN)
    tf.summary.scalar("generator_loss_L1", model.gen_loss_L1)
    tf.summary.scalar("generator_loss_L1_1", model.gen_loss_L1_1)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    for grad, var in model.discrim_grads_and_vars + model.gen_grads_and_vars:
        tf.summary.histogram(var.op.name + "/gradients", grad)

    # Total number of scalar parameters over all trainable variables.
    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    # The Supervisor only gets a log directory when traces or summaries are
    # requested; checkpoints are written explicitly through `saver` below
    # (the Supervisor itself is created with saver=None).
    logdir = a.output_dir if (a.trace_freq > 0 or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth=True
    #os.environ["CUDA_VISIBLE_DEVICES"] = "2"
    with sv.managed_session(config=config) as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            saver.restore(sess, checkpoint)

        # a.max_steps, when given, takes precedence over a.max_epochs.
        max_steps = 2**32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            # testing
            # at most, process the test data once
            start = time.time()
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                filesets = save_images(results)
                for i, f in enumerate(filesets):
                    print("evaluated image", f["name"])
                index_path = append_index(filesets)
            print("wrote index at", index_path)
            # Average wall-clock seconds per step.
            print("rate", (time.time() - start) / max_steps)
        else:
            # training
            start = time.time()

            for step in range(max_steps):
                def should(freq):
                    # True every `freq` steps and on the very last step;
                    # a non-positive frequency disables the action.
                    return freq > 0 and ((step + 1) % freq == 0 or step == max_steps - 1)

                options = None
                run_metadata = None
                if should(a.trace_freq):
                    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                # Only fetch what this particular step needs.
                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                if should(a.progress_freq):
                    fetches["discrim_loss"] = model.discrim_loss
                    fetches["gen_loss_GAN"] = model.gen_loss_GAN
                    fetches["gen_loss_L1"] = model.gen_loss_L1
                    fetches["gen_loss_L1_1"]= model.gen_loss_L1_1

                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches, options=options, run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"], results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"], step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] / examples.steps_per_epoch)
                    train_step = (results["global_step"] - 1) % examples.steps_per_epoch + 1
                    # Rate and remaining time are based on the steps of this run only.
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print("progress  epoch %d  step %d  image/sec %0.1f  remaining %dm" % (train_epoch, train_step, rate, remaining / 60))
                    print("discrim_loss", results["discrim_loss"])
                    print("gen_loss_GAN", results["gen_loss_GAN"])
                    print("gen_loss_L1", results["gen_loss_L1"])
                    print("gen_loss_L1_1", results["gen_loss_L1_1"])

                if should(a.save_freq):
                    print("saving model")
                    saver.save(sess, os.path.join(a.output_dir, "model"), global_step=sv.global_step)

                if sv.should_stop():
                    break
def benchmark(self, ckpt_dir, outer_steps=100, inner_steps=1000):
    """Run repeatedly on dummy data to benchmark inference.

    The model is evaluated on an all-zero "targets" tensor inside a
    ``tf.while_loop`` of ``inner_steps`` iterations; that loop is executed
    ``outer_steps`` times and the throughput of every execution is logged.

    Args:
        ckpt_dir: passed to ``self.restore_model`` to load the weights.
        outer_steps: number of timed ``session.run`` calls. The first one is
            excluded from the summary statistics, so at least two are
            needed to get a summary.
        inner_steps: number of model invocations per ``session.run`` call.

    Returns:
        None. Results are reported through ``logging``.
    """
    # Turn off Grappler optimizations.
    options = {"disable_meta_optimizer": True}
    tf.config.optimizer.set_experimental_options(options)

    # Create the model outside the loop body.
    hparams = registry.hparams(self.hparams_set)
    hparams_lib.add_problem_hparams(hparams, self.problem_name)
    model_cls = registry.model(self.model_name)
    model = model_cls(hparams, tf.estimator.ModeKeys.EVAL)

    # Run only the model body (no data pipeline) on device.
    # Width of one dummy example; also used for the throughput computation so
    # that the reported numbers always match the tensor that is actually fed.
    tokens_per_example = 3 * self.image_size * self.image_size
    feature_shape = [hparams.batch_size, tokens_per_example]
    features = {"targets": tf.zeros(feature_shape, dtype=tf.int32)}

    # Call the model once to initialize the variables. Note that
    # this should never execute.
    with tf.variable_scope(self.model_name) as vso:
        transformed_features = model.bottom(features)
        with tf.variable_scope("body") as vsi:
            body_out = model.body(transformed_features)
        logits = model.top(body_out, features)
        model.loss(logits, features)

    def call_model(features):
        """Re-run bottom/body/top/loss inside the captured scopes, reusing variables."""
        with tf.variable_scope(vso, reuse=tf.AUTO_REUSE):
            transformed_features = model.bottom(features)
            with tf.variable_scope(vsi, reuse=tf.AUTO_REUSE):
                body_out = model.body(transformed_features)
            logits = model.top(body_out, features)
            return model.loss(logits, features)

    # Run the function body in a loop to amortize session overhead.
    loop_index = tf.zeros([], dtype=tf.int32)
    initial_loss = (tf.zeros([]), tf.zeros([]))

    def loop_cond(idx, _):
        return tf.less(idx, tf.constant(inner_steps, dtype=tf.int32))

    def loop_body(idx, _):
        return idx + 1, call_model(features)

    benchmark_op = tf.while_loop(
        loop_cond,
        loop_body,
        [loop_index, initial_loss],
        parallel_iterations=1,
        back_prop=False)

    session_config = tf.ConfigProto(gpu_options=tf.GPUOptions(
        allow_growth=False, per_process_gpu_memory_fraction=0.95))
    run_metadata = tf.RunMetadata()
    with tf.Session(config=session_config) as sess:
        self.restore_model(sess, ckpt_dir)
        tokens_per_run = inner_steps * hparams.batch_size * tokens_per_example
        tps = []
        for idx in range(outer_steps):
            start_time = time.time()
            sess.run(benchmark_op, run_metadata=run_metadata)
            elapsed_time = time.time() - start_time
            tps.append(tokens_per_run / elapsed_time)
            logging.error("Iterations %d processed %f TPS.", idx, tps[-1])

    # Skip the first iteration where all the setup and allocation happens.
    tps = np.asarray(tps[1:])
    if tps.size == 0:
        # max()/min() of an empty array raise; report the situation instead.
        logging.error(
            "Need outer_steps >= 2 to report throughput statistics (got %d).",
            outer_steps)
        return
    logging.error("Mean/Std/Max/Min throughput = %f / %f / %f / %f",
                  np.mean(tps), np.std(tps), tps.max(), tps.min())
def _train_step(self, learning_rate, cliprange, obs, returns, masks, actions, values, neglogpacs, update,
                writer, states=None, cliprange_vf=None):
    """
    Run one PPO2 optimisation step on a minibatch.

    :param learning_rate: (float) learning rate
    :param cliprange: (float) Clipping factor
    :param obs: (np.ndarray) The current observation of the environment
    :param returns: (np.ndarray) the returns; fed to the rewards placeholder
    :param masks: (np.ndarray) The last masks for done episodes (used in recurrent policies)
    :param actions: (np.ndarray) the actions
    :param values: (np.ndarray) the values
    :param neglogpacs: (np.ndarray) Negative Log-likelihood probability of Actions
    :param update: (int) the current step iteration
    :param writer: (TensorFlow Summary.writer) the writer for tensorboard, or None to skip summaries
    :param states: (np.ndarray) For recurrent policies, the internal state of the recurrent model
    :param cliprange_vf: (float) Clipping factor for the value function; only fed when it is
        not None and non-negative
    :return: policy gradient loss, value function loss, policy entropy,
        approximation of kl divergence, clip fraction
    """
    # Advantages are normalised per minibatch; the epsilon guards against a zero std.
    advantages = returns - values
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    feed = {
        self.train_model.obs_ph: obs,
        self.action_ph: actions,
        self.advs_ph: advantages,
        self.rewards_ph: returns,
        self.learning_rate_ph: learning_rate,
        self.clip_range_ph: cliprange,
        self.old_neglog_pac_ph: neglogpacs,
        self.old_vpred_ph: values
    }
    recurrent = states is not None
    if recurrent:
        feed[self.train_model.states_ph] = states
        feed[self.train_model.dones_ph] = masks

    if cliprange_vf is not None and cliprange_vf >= 0:
        feed[self.clip_range_vf_ph] = cliprange_vf

    # Factor used to turn the update index into the step passed to the writer.
    steps_per_update = self.n_batch // self.nminibatches // self.noptepochs
    if recurrent:
        steps_per_update = steps_per_update // self.n_steps
    update_fac = max(steps_per_update, 1)

    if writer is None:
        # No tensorboard logging: just evaluate the losses and apply the update.
        policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run(
            [self.pg_loss, self.vf_loss, self.entropy, self.approxkl, self.clipfrac, self._train], feed)
        return policy_loss, value_loss, policy_entropy, approxkl, clipfrac

    # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...)
    trace_this_update = self.full_tensorboard_log and (1 + update) % 10 == 0
    if trace_this_update:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        run_kwargs = {"options": run_options, "run_metadata": run_metadata}
    else:
        run_kwargs = {}

    summary, policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run(
        [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.approxkl, self.clipfrac, self._train],
        feed, **run_kwargs)

    global_update = update * update_fac
    if trace_this_update:
        writer.add_run_metadata(run_metadata, 'step%d' % global_update)
    writer.add_summary(summary, global_update)

    return policy_loss, value_loss, policy_entropy, approxkl, clipfrac
def train(args, data, show_loss, show_topk):
    """Train a KGCN model while profiling every training step.

    Each epoch shuffles the training data in place, runs all complete
    minibatches with full tracing enabled, then prints the training time and
    the AUC/F1 scores on the train, eval and test splits. After the last
    epoch the collected profile is reported by name scope (trainable
    parameters) and by operation (time and occurrence).

    Args:
        args: options object; ``dataset``, ``n_epochs`` and ``batch_size`` are
            read here and the whole object is forwarded to ``KGCN``.
        data: indexable sequence laid out as (n_user, n_item, n_entity,
            n_relation, train_data, eval_data, test_data, adj_entity,
            adj_relation).
        show_loss: accepted for interface compatibility; not used by this
            function.
        show_topk: forwarded to ``topk_settings``.
    """
    n_user, n_item, n_entity, n_relation = data[0], data[1], data[2], data[3]
    train_data, eval_data, test_data = data[4], data[5], data[6]
    adj_entity, adj_relation = data[7], data[8]

    model = KGCN(args, n_user, n_entity, n_relation, adj_entity, adj_relation)

    # top-K evaluation settings
    user_list, train_record, test_record, item_set, k_list = topk_settings(
        show_topk, train_data, test_data, n_item)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # monitor the usage of memory while training the model
        profiler = model_analyzer.Profiler(graph=sess.graph)
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        # tensor-board
        writer = tf.summary.FileWriter('../data/' + args.dataset + '/logs',
                                       tf.get_default_graph())
        # Close the event file even when training or profiling raises.
        try:
            for step in range(args.n_epochs):
                # training
                t = time.time()
                np.random.shuffle(train_data)
                start = 0
                # skip the last incomplete minibatch if its size < batch size
                while start + args.batch_size <= train_data.shape[0]:
                    _, loss = model.train(
                        sess,
                        get_feed_dict(model, train_data, start,
                                      start + args.batch_size),
                        run_options, run_metadata)
                    # add the data into tfprofiler
                    profiler.add_step(step=step, run_meta=run_metadata)
                    # Only the first minibatch of an epoch is written to
                    # TensorBoard, tagged with the epoch index.
                    if start == 0:
                        writer.add_run_metadata(run_metadata, 'step %d' % step)
                    start += args.batch_size

                # CTR evaluation
                train_auc, train_f1 = ctr_eval(sess, model, train_data,
                                               args.batch_size)
                eval_auc, eval_f1 = ctr_eval(sess, model, eval_data,
                                             args.batch_size)
                test_auc, test_f1 = ctr_eval(sess, model, test_data,
                                             args.batch_size)

                # NOTE: the reported time covers training and the three
                # evaluations above.
                train_time = time.time() - t
                print(
                    'epoch %d  training time: %.5f  train auc: %.4f  f1: %.4f    eval auc: %.4f  f1: %.4f    test auc: %.4f  f1: %.4f'
                    % (step, train_time, train_auc, train_f1, eval_auc,
                       eval_f1, test_auc, test_f1))

            # Report the size of the model: trainable-parameter statistics.
            profile_scope_opt_builder = option_builder.ProfileOptionBuilder(
                option_builder.ProfileOptionBuilder.
                trainable_variables_parameter())
            # The displayed field is 'params', i.e. the parameter count.
            profile_scope_opt_builder.select(['params'])
            # Sort the displayed results by parameter count.
            profile_scope_opt_builder.order_by('params')
            # Show the result as a name-scope view.
            profiler.profile_name_scope(profile_scope_opt_builder.build())

            # ------------------------------------
            # Most time-consuming operations.
            profile_op_opt_builder = option_builder.ProfileOptionBuilder()
            # Displayed fields: op execution time and the number of nodes
            # using the op. The execution time of an op is the sum over all
            # nodes that use it.
            profile_op_opt_builder.select(['micros', 'occurrence'])
            # Sort the displayed results by op execution time.
            profile_op_opt_builder.order_by('micros')
            # Limit how much is displayed.
            # NOTE(review): the original comments mention "top 5" and "top 7"
            # while the depth is set to 6 - confirm the intended value.
            profile_op_opt_builder.with_max_depth(6)
            # Show the result as an op view.
            profiler.profile_operations(profile_op_opt_builder.build())
            # ------------------------------------
        finally:
            writer.close()
# Event writer for the evaluation curves; the matching `train_writer` is
# created earlier in the script.
test_writer = tf.summary.FileWriter(log_dir + '/test')

# Run the initializer for all variables.
tf.global_variables_initializer().run()

# How the merged summaries are recorded:
#   * every 10th step      -> evaluate on test data and log summary + accuracy
#   * steps 99, 199, ...   -> train with full tracing and log the run metadata
#   * every other step     -> train and log the summary
for i in range(max_steps):
    if i % 10 == 0:
        # Record the summary and the accuracy on the test set.
        summary, acc = sess.run([merged, accuracy],
                                feed_dict=feed_dict(False))
        test_writer.add_summary(summary, i)
        print('Accuracy at step %s: %s' % (i, acc))
        continue

    # From here on: record the summary on the training set.
    if i % 100 != 99:
        # Record a summary
        summary, _ = sess.run([merged, train_step],
                              feed_dict=feed_dict(True))
        train_writer.add_summary(summary, i)
        continue

    # Record execution stats
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    summary, _ = sess.run([merged, train_step],
                          feed_dict=feed_dict(True),
                          options=run_options,
                          run_metadata=run_metadata)
    train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
    train_writer.add_summary(summary, i)
    print('Adding run metadata for', i)

train_writer.close()
test_writer.close()

# Quantities that will be shown under the scalars tab:
#
def train(sv, sess, data, max_steps, display_fetches, display_fetches_test, dataTest, saver, loss, output_dir=a.output_dir):
    """Run the training loop, with periodic logging, checkpoints and tests.

    The training iterator is initialised, then ``max_steps`` training steps
    are run. Depending on the frequencies configured on ``a`` a step may also
    record a trace, a summary, a progress report, a checkpoint or a test
    pass. Whatever happens inside the loop, a final checkpoint is written and
    a final test is run into ``<a.output_dir>/final``.

    Args:
        sv: supervisor providing ``global_step``, ``summary_op``,
            ``summary_writer`` and ``should_stop()``.
        sess: session used for every run call.
        data: training data; ``iterator.initializer`` and ``stepsPerEpoch``
            are read.
        max_steps: number of loop iterations; also forwarded to ``test``.
        display_fetches: tensors fetched together with the train op.
        display_fetches_test: forwarded to ``test``.
        dataTest: forwarded to ``test``.
        saver: saver used for the checkpoints.
        loss: object exposing ``trainOp``, ``lossValue`` and the ``lr``
            placeholder.
        output_dir: directory receiving the checkpoints. Test results are
            written below ``a.output_dir`` regardless of this argument.
    """
    sess.run(data.iterator.initializer)
    try:
        # training
        start_time = time.time()

        for step in range(max_steps):
            options = None
            run_metadata = None
            if helpers.should(a.trace_freq, max_steps, step):
                options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()

            fetches = {
                "train": loss.trainOp,
                "global_step": sv.global_step,
            }

            if helpers.should(a.progress_freq, max_steps, step) or step <= 1:
                fetches["loss_value"] = loss.lossValue

            if helpers.should(a.summary_freq, max_steps, step):
                fetches["summary"] = sv.summary_op

            # NOTE(review): the display tensors are fetched on every step
            # although nothing below reads results["display"] - confirm this
            # is still needed.
            fetches["display"] = display_fetches
            try:
                currentLrValue = a.lr
                if a.checkpoint is None and step < 500:
                    # Linear warm-up when training from scratch: reaches a.lr
                    # after the first 500 iterations (500 * 0.002 == 1), so
                    # that the very first gradients do not have too much
                    # impact.
                    currentLrValue = step * (0.002) * a.lr
                results = sess.run(fetches,
                                   feed_dict={loss.lr: currentLrValue},
                                   options=options,
                                   run_metadata=run_metadata)
            except tf.errors.OutOfRangeError:
                print(
                    "training fails in OutOfRangeError, probably a problem with the iterator"
                )
                continue

            global_step = results["global_step"]

            #helpers.saveInputs(a.output_dir, results["display"], step)

            if helpers.should(a.summary_freq, max_steps, step):
                sv.summary_writer.add_summary(results["summary"], global_step)

            if helpers.should(a.trace_freq, max_steps, step):
                print("recording trace")
                sv.summary_writer.add_run_metadata(run_metadata,
                                                   "step_%d" % global_step)

            if helpers.should(a.progress_freq, max_steps, step):
                # global_step will have the correct step count if we resume from a checkpoint
                train_epoch = math.ceil(global_step / data.stepsPerEpoch)
                # Throughput and remaining time describe THIS run, so they
                # are based on the local step counter: after resuming from a
                # checkpoint global_step does not start at zero while
                # start_time does.
                steps_done = step + 1
                imagesPerSecond = steps_done * a.batch_size / (time.time() - start_time)
                remainingMinutes = ((max_steps - steps_done) * a.batch_size) / (imagesPerSecond * 60)
                print("progress  epoch %d  step %d  image/sec %0.1f" %
                      (train_epoch, global_step, imagesPerSecond))
                print("Remaining %0.1f minutes" % (remainingMinutes))
                print("loss_value", results["loss_value"])

            if helpers.should(a.save_freq, max_steps, step):
                print("saving model")
                try:
                    saver.save(sess,
                               os.path.join(output_dir, "model"),
                               global_step=sv.global_step)
                except Exception as e:
                    # Best effort: a failed checkpoint must not stop training.
                    print(
                        "Didn't manage to save model (trainining continues): "
                        + str(e))

            if helpers.should(a.test_freq, max_steps, step) or global_step == 1:
                outputTestDir = os.path.join(a.output_dir, str(global_step))
                try:
                    test(sess, dataTest, max_steps, display_fetches_test,
                         outputTestDir)
                except Exception as e:
                    # Best effort: a failed intermediate test must not stop training.
                    print(
                        "Didn't manage to do a recurrent test (trainining continues): "
                        + str(e))

            if sv.should_stop():
                break
    finally:
        saver.save(sess,
                   os.path.join(output_dir, "model"),
                   global_step=sv.global_step)  #Does the saver saves everything still ?
        sess.run(data.iterator.initializer)
        outputTestDir = os.path.join(a.output_dir, "final")
        test(sess, dataTest, max_steps, display_fetches_test, outputTestDir)
def run(self, sess):
    """Restore or initialise the model, then train until ``self.max_steps``.

    Every ``profile_step`` steps (at offset ``profile_offset``) the step is
    run with full tracing and profiler reports are written into
    ``self.train_dir``. Checkpoints are saved on a wall-clock period
    (``self.ckpt_period`` seconds) and model snapshots every
    ``self.save_steps`` steps. When the loop ends the profiler's advice is
    requested.

    :param sess: the session in which all operations are run
    """
    import time

    if self.restore and os.path.exists(os.path.join(self.train_dir, 'checkpoint')):
        # restore from checkpoint
        resume_from = tf.train.latest_checkpoint(self.train_dir, 'checkpoint')
        self.saver_ckpt.restore(sess, resume_from)
    else:
        # otherwise, initialize from start
        sess.run((tf.initializers.global_variables(),
                  tf.initializers.local_variables()))
        # restore pre-trained model
        if self.pretrain_dir:
            pretrained = tf.train.latest_checkpoint(self.pretrain_dir, 'checkpoint')
            self.saver_pt.restore(sess, pretrained)

    # profiler
    # profile_offset = -1
    profile_offset = 100 + self.log_frequency // 2
    profile_step = 10000
    option_builder = tf.profiler.ProfileOptionBuilder
    profiler = tf.profiler.Profiler(sess.graph)

    # initialization
    self.log_last = time.time()
    last_ckpt_time = time.time()

    # dataset generator
    global_step = tf.train.global_step(sess, self.global_step)
    data_gen = self.data.gen_main(global_step)

    # run training session
    while True:
        # global step
        global_step = tf.train.global_step(sess, self.global_step)
        if global_step >= self.max_steps:
            eprint('Training finished at step={}'.format(global_step))
            break

        # run session
        if global_step % profile_step != profile_offset:
            # regular training step
            self.run_sess(sess, global_step, data_gen)
        else:
            # profiling every few steps
            trace_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_meta = tf.RunMetadata()
            self.run_sess(sess, global_step, data_gen, trace_options, run_meta)
            profiler.add_step(global_step, run_meta)

            # profile the parameters (only on the first profiled step)
            if global_step == profile_offset:
                params_log = os.path.join(self.train_dir, 'parameters.log')
                params_options = option_builder(
                    option_builder.trainable_variables_parameter())
                profiler.profile_name_scope(
                    params_options.with_file_output(params_log).build())

            # profile the timing of model operations
            timing_log = os.path.join(
                self.train_dir,
                'time_and_memory_{:0>7}.log'.format(global_step))
            timing_options = option_builder(option_builder.time_and_memory())
            profiler.profile_operations(
                timing_options.with_file_output(timing_log).build())

            # generate a timeline
            timeline = os.path.join(self.train_dir, 'timeline')
            timeline_options = option_builder(option_builder.time_and_memory())
            profiler.profile_graph(
                timeline_options.with_step(global_step)
                .with_timeline_output(timeline).build())

        # save checkpoints periodically or when training finished
        if self.ckpt_period > 0:
            now = time.time()
            period_elapsed = now - last_ckpt_time >= self.ckpt_period
            if period_elapsed or global_step + 1 >= self.max_steps:
                last_ckpt_time = now
                self.saver_ckpt.save(
                    sess, os.path.join(self.train_dir, 'model.ckpt'),
                    global_step, 'checkpoint')

        # save model every few steps
        if self.save_steps > 0 and global_step % self.save_steps == 0:
            snapshot = os.path.join(self.train_dir,
                                    'model_{:0>7}'.format(global_step))
            self.saver.save(sess, snapshot,
                            write_meta_graph=False, write_state=False)

    # auto detect problems and generate advice
    advice_checkers = {
        'ExpensiveOperationChecker': {},
        'AcceleratorUtilizationChecker': {},
        'JobChecker': {},
        'OperationChecker': {}
    }
    profiler.advise(advice_checkers)
def train(flags):
    """Training entry point.

    Builds the few-shot training graph (classification loss plus a separate
    confidence loss that is only trained on variables whose name contains
    'fc_variance'), then runs the training loop under a ``tf.train.Supervisor``
    with periodic summaries, checkpoints and evaluation.

    Side effects on ``flags``: ``pretrained_model_dir`` is set to
    ``flags.log_dir`` and ``eval_interval_secs`` is set to 0.

    Args:
        flags: options object read throughout graph construction and training.

    Raises:
        ValueError: if ``flags.dataset`` has no augmentation defined here.
        Exception: if ``flags.lr_anneal`` is not one of 'const', 'pwc', 'exp'.
    """
    log_dir = flags.log_dir
    flags.pretrained_model_dir = log_dir
    log_dir = os.path.join(log_dir, 'train')
    flags.eval_interval_secs = 0
    with tf.Graph().as_default():
        global_step = tf.Variable(0,
                                  trainable=False,
                                  name='global_step',
                                  dtype=tf.int64)
        global_step_confidence = tf.Variable(0,
                                             trainable=False,
                                             name='global_step_confidence',
                                             dtype=tf.int64)

        model = build_model(flags)
        (images_query_pl, labels_query_pl,
         images_support_pl, labels_support_pl) = build_episode_placeholder(flags)

        # Augments the input.
        if flags.dataset == 'cifar10' or flags.dataset == 'cifar100':
            images_query_pl_aug = data_loader.augment_cifar(images_query_pl,
                                                            is_training=True)
            images_support_pl_aug = data_loader.augment_cifar(
                images_support_pl, is_training=True)
        elif flags.dataset == 'tinyimagenet':
            images_query_pl_aug = data_loader.augment_tinyimagenet(
                images_query_pl, is_training=True)
            images_support_pl_aug = data_loader.augment_tinyimagenet(
                images_support_pl, is_training=True)
        else:
            # Fail here with a clear message instead of a NameError on the
            # undefined *_aug tensors a few lines further down.
            raise ValueError('Unsupported dataset: %s' % flags.dataset)

        logits, logits_z = build_proto_train_graph(
            images_query=images_query_pl_aug,
            images_support=images_support_pl_aug,
            flags=flags,
            is_training=True,
            model=model)
        # Losses and optimizer
        ## Classification loss
        loss_classification = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=logits,
                labels=tf.one_hot(labels_query_pl, flags.num_classes_train)))

        # Confidence loss
        # Cross entropy of logits_z, restricted to the queries that the
        # classifier currently gets wrong.
        _, top_k_indices = tf.nn.top_k(logits, k=1)
        pred = tf.squeeze(top_k_indices)
        incorrect_mask = tf.math.logical_not(
            tf.math.equal(pred, labels_query_pl))
        incorrect_logits_z = tf.boolean_mask(logits_z, incorrect_mask)
        incorrect_labels_z = tf.boolean_mask(labels_query_pl, incorrect_mask)
        # Number of misclassified queries in the batch.
        signal_variance = tf.math.reduce_sum(tf.cast(incorrect_mask, tf.int32))
        loss_variance_incorrect = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=incorrect_logits_z,
                labels=tf.one_hot(incorrect_labels_z,
                                  flags.num_classes_train)))
        loss_variance_zero = 0.0
        # Fall back to a zero loss when nothing was misclassified.
        loss_confidence = tf.cond(tf.greater(signal_variance, 0),
                                  lambda: loss_variance_incorrect,
                                  lambda: loss_variance_zero)

        regu_losses = tf.losses.get_regularization_losses()
        loss = tf.add_n([loss_classification] + regu_losses)

        # Learning rate
        if flags.lr_anneal == 'const':
            learning_rate = flags.init_learning_rate
        elif flags.lr_anneal == 'pwc':
            learning_rate = get_pwc_learning_rate(global_step, flags)
        elif flags.lr_anneal == 'exp':
            lr_decay_step = flags.number_of_steps // flags.n_lr_decay
            learning_rate = tf.train.exponential_decay(
                flags.init_learning_rate,
                global_step,
                lr_decay_step,
                1.0 / flags.lr_decay_rate,
                staircase=True)
        else:
            raise Exception('Not implemented')

        # Optimizer
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=0.9)
        optimizer_confidence = tf.train.MomentumOptimizer(
            learning_rate=learning_rate, momentum=0.9)

        train_op = contrib_slim.learning.create_train_op(
            total_loss=loss,
            optimizer=optimizer,
            global_step=global_step,
            clip_gradient_norm=flags.clip_gradient_norm)
        # The confidence loss only updates the 'fc_variance' variables.
        variable_variance = [
            v for v in tf.trainable_variables() if 'fc_variance' in v.name
        ]
        train_op_confidence = contrib_slim.learning.create_train_op(
            total_loss=loss_confidence,
            optimizer=optimizer_confidence,
            global_step=global_step_confidence,
            clip_gradient_norm=flags.clip_gradient_norm,
            variables_to_train=variable_variance)

        tf.summary.scalar('loss', loss)
        tf.summary.scalar('loss_classification', loss_classification)
        tf.summary.scalar('loss_variance', loss_confidence)
        tf.summary.scalar('regu_loss', tf.add_n(regu_losses))
        tf.summary.scalar('learning_rate', learning_rate)
        # Merges all summaries except for pretrain
        summary = tf.summary.merge(
            tf.get_collection('summaries', scope='(?!pretrain).*'))

        # Gets datasets
        few_shot_data_train, test_dataset, train_dataset = get_train_datasets(
            flags)
        # Defines session and logging
        summary_writer_train = tf.summary.FileWriter(log_dir, flush_secs=1)
        saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
        print(saver.saver_def.filename_tensor_name)
        print(saver.saver_def.restore_op_name)
        # pylint: disable=unused-variable
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        supervisor = tf.train.Supervisor(
            logdir=log_dir,
            init_feed_dict=None,
            summary_op=None,
            init_op=tf.global_variables_initializer(),
            summary_writer=summary_writer_train,
            saver=saver,
            global_step=global_step,
            save_summaries_secs=flags.save_summaries_secs,
            save_model_secs=0)

        with supervisor.managed_session() as sess:
            # When resuming, continue with the step after the restored one.
            checkpoint_step = sess.run(global_step)
            if checkpoint_step > 0:
                checkpoint_step += 1

            eval_interval_steps = flags.eval_interval_steps
            for step in range(checkpoint_step, flags.number_of_steps):
                # Computes the classification loss using a batch of data.
                # The timer brackets the batch sampling, so dt_batch reports
                # the real data-loading time.
                t_batch = time.time()
                (images_query, labels_query,
                 images_support, labels_support) = \
                    few_shot_data_train.next_few_shot_batch(
                        query_batch_size_per_task=flags.train_batch_size,
                        num_classes_per_task=flags.num_classes_train,
                        num_supports_per_class=flags.num_shots_train,
                        num_tasks=flags.num_tasks_per_batch)
                dt_batch = time.time() - t_batch

                feed_dict = {
                    images_query_pl: images_query.astype(dtype=np.float32),
                    labels_query_pl: labels_query,
                    images_support_pl: images_support.astype(dtype=np.float32),
                    labels_support_pl: labels_support
                }

                t_train = time.time()
                # Separate names keep the graph tensors `loss` and
                # `loss_confidence` from being overwritten by their values.
                loss_value, _ = sess.run([train_op, train_op_confidence],
                                         feed_dict=feed_dict)
                dt_train = time.time() - t_train

                if step % 100 == 0:
                    summary_str = sess.run(summary, feed_dict=feed_dict)
                    summary_writer_train.add_summary(summary_str, step)
                    summary_writer_train.flush()
                    logging.info(
                        'step %d, loss : %.4g, dt: %.3gs, dt_batch: %.3gs',
                        step, loss_value, dt_train, dt_batch)

                # Switch to the fine evaluation interval in the second half
                # of training.
                if float(step) / flags.number_of_steps > 0.5:
                    eval_interval_steps = flags.eval_interval_fine_steps

                if eval_interval_steps > 0 and step % eval_interval_steps == 0:
                    saver.save(sess,
                               os.path.join(log_dir, 'model'),
                               global_step=step)
                    eval(flags=flags,
                         train_dataset=train_dataset,
                         test_dataset=test_dataset)

                # Early stop a fixed number of steps after the half-way point.
                if float(step) > (0.5 * flags.number_of_steps +
                                  flags.number_of_steps_to_early_stop):
                    break
def main():
    """Build the image-translation graph and run it in test or training mode.

    Every setting is read from the module-level options object ``a``. The
    function seeds the random number generators, writes the effective
    options to ``<a.output_dir>/options.json``, builds the model together
    with its image/scalar/histogram summaries, and then either evaluates
    the examples once (``a.mode == "test"``) or runs the training loop
    (every other mode, including "export").

    Training stops early when either of these holds on a step where
    progress is reported (``a.progress_freq``); a checkpoint is saved first:

    * the patience counter reaches ``a.patience_epochs``. The counter is
      incremented whenever the discriminator loss did not increase while
      the GAN generator loss did not decrease since the previous report,
      and it is never reset;
    * the L1 generator loss falls below ``a.desired_l1_loss``.

    Raises:
        Exception: if the mode is "test" or "export" and no checkpoint was
            given, or if ``a.lab_colorization`` is set and
            ``a.which_direction`` is neither "AtoB" nor "BtoA".
    """
    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    if a.mode == "test" or a.mode == "export":
        if a.checkpoint is None:
            raise Exception("checkpoint required for test mode")

        # load some options from the checkpoint
        options = {"which_direction", "ngf", "ndf", "lab_colorization"}
        with open(os.path.join(a.checkpoint, "options.json")) as f:
            for key, val in json.loads(f.read()).items():
                if key in options:
                    print("loaded", key, "=", val)
                    setattr(a, key, val)
        # disable these features in test mode
        a.scale_size = CROP_SIZE
        a.flip = False

    for k, v in a._get_kwargs():
        print(k, "=", v)

    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    examples = load_examples()
    print("examples count = %d" % examples.count)

    # inputs and targets are [batch_size, height, width, channels]
    model = create_model(examples.inputs, examples.targets)

    # undo colorization splitting on images that we use for display/output
    if a.lab_colorization:
        if a.which_direction == "AtoB":
            # inputs is brightness, this will be handled fine as a grayscale image
            # need to augment targets and outputs with brightness
            targets = augment(examples.targets, examples.inputs)
            outputs = augment(model.outputs, examples.inputs)
            # inputs can be deprocessed normally and handled as if they are
            # single channel grayscale images
            inputs = deprocess(examples.inputs)
        elif a.which_direction == "BtoA":
            # inputs will be color channels only, get brightness from targets
            inputs = augment(examples.inputs, examples.targets)
            targets = deprocess(examples.targets)
            outputs = deprocess(model.outputs)
        else:
            raise Exception("invalid direction")
    else:
        inputs = deprocess(examples.inputs)
        targets = deprocess(examples.targets)
        outputs = deprocess(model.outputs)

    def convert(image):
        """Optionally fix the aspect ratio, then convert to saturated uint8."""
        if a.aspect_ratio != 1.0:
            # upscale to correct aspect ratio
            size = [CROP_SIZE, int(round(CROP_SIZE * a.aspect_ratio))]
            image = tf.image.resize_images(
                image, size=size, method=tf.image.ResizeMethod.BICUBIC)

        return tf.image.convert_image_dtype(image,
                                            dtype=tf.uint8,
                                            saturate=True)

    # reverse any processing on images so they can be written to disk or
    # displayed to user
    with tf.name_scope("convert_inputs"):
        converted_inputs = convert(inputs)

    with tf.name_scope("convert_targets"):
        converted_targets = convert(targets)

    with tf.name_scope("convert_outputs"):
        converted_outputs = convert(outputs)

    with tf.name_scope("encode_images"):
        display_fetches = {
            "paths": examples.paths,
            "inputs": tf.map_fn(tf.image.encode_png,
                                converted_inputs,
                                dtype=tf.string,
                                name="input_pngs"),
            "targets": tf.map_fn(tf.image.encode_png,
                                 converted_targets,
                                 dtype=tf.string,
                                 name="target_pngs"),
            "outputs": tf.map_fn(tf.image.encode_png,
                                 converted_outputs,
                                 dtype=tf.string,
                                 name="output_pngs"),
        }

    # summaries
    with tf.name_scope("inputs_summary"):
        tf.summary.image("inputs", converted_inputs)

    with tf.name_scope("targets_summary"):
        tf.summary.image("targets", converted_targets)

    with tf.name_scope("outputs_summary"):
        tf.summary.image("outputs", converted_outputs)

    with tf.name_scope("predict_real_summary"):
        tf.summary.image(
            "predict_real",
            tf.image.convert_image_dtype(model.predict_real, dtype=tf.uint8))

    with tf.name_scope("predict_fake_summary"):
        tf.summary.image(
            "predict_fake",
            tf.image.convert_image_dtype(model.predict_fake, dtype=tf.uint8))

    tf.summary.scalar("discriminator_loss", model.discrim_loss)
    tf.summary.scalar("generator_loss_GAN", model.gen_loss_GAN)
    tf.summary.scalar("generator_loss_L1", model.gen_loss_L1)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    for grad, var in model.discrim_grads_and_vars + model.gen_grads_and_vars:
        tf.summary.histogram(var.op.name + "/gradients", grad)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    # The supervisor only gets a log directory when tracing or summaries are
    # enabled; checkpoints are written explicitly through `saver` below.
    logdir = a.output_dir if (a.trace_freq > 0 or
                              a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    with sv.managed_session() as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("#############################")
            print(
                "loading model from checkpoint for continue training or test phase"
            )
            print("#############################")
            try:
                checkpoint = tf.train.latest_checkpoint(a.checkpoint)
                saver.restore(sess, checkpoint)
            except Exception as err:
                # Best effort: keep the freshly initialised weights, but do
                # not swallow KeyboardInterrupt/SystemExit and do report why
                # the restore failed.
                print("loading was unsuccessful-it will train from scratch")
                print("reason:", err)
                print("#############################")

        max_steps = 2**32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            # testing
            # at most, process the test data once
            start = time.time()
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                filesets = save_images(results)
                for fileset in filesets:
                    print("evaluated image", fileset["name"])
                index_path = append_index(filesets)
            # With zero steps there is no index and no meaningful rate.
            if max_steps > 0:
                print("wrote index at", index_path)
                print("rate", (time.time() - start) / max_steps)
        else:
            # training
            start = time.time()
            discrim_loss_pre = 10000
            gan_loss_pre = 10000
            patience_counter = 0
            for step in range(max_steps):

                def should(freq):
                    """True every `freq` steps and on the very last step."""
                    return freq > 0 and ((step + 1) % freq == 0 or
                                         step == max_steps - 1)

                options = None
                run_metadata = None
                if should(a.trace_freq):
                    options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                if should(a.progress_freq):
                    fetches["discrim_loss"] = model.discrim_loss
                    fetches["gen_loss_GAN"] = model.gen_loss_GAN
                    fetches["gen_loss_L1"] = model.gen_loss_L1

                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches,
                                   options=options,
                                   run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"],
                                                  results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"],
                                           step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(
                        run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we
                    # resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] /
                                            examples.steps_per_epoch)
                    train_step = (results["global_step"] -
                                  1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print(
                        "progress  epoch %d  step %d  image/sec %0.1f  remaining %dm"
                        % (train_epoch, train_step, rate, remaining / 60))
                    print("discrim_loss", results["discrim_loss"])
                    print("gen_loss_GAN", results["gen_loss_GAN"])
                    print("gen_loss_L1", results["gen_loss_L1"])

                    # Early-stop bookkeeping. The loss values are only
                    # fetched on progress steps, so it has to live here.
                    if discrim_loss_pre >= results["discrim_loss"]:
                        if gan_loss_pre <= results["gen_loss_GAN"]:
                            patience_counter = patience_counter + 1
                    discrim_loss_pre = results["discrim_loss"]
                    gan_loss_pre = results["gen_loss_GAN"]

                    if patience_counter >= float(a.patience_epochs):
                        print("###################")
                        print("early stop, disc is winning")
                        print(
                            "progress  epoch %d  step %d  image/sec %0.1f  remaining %dm"
                            % (train_epoch, train_step, rate, remaining / 60))
                        print("saving model")
                        saver.save(sess,
                                   os.path.join(a.output_dir, "model"),
                                   global_step=sv.global_step)
                        break

                    if results["gen_loss_L1"] < float(a.desired_l1_loss):
                        print("###################")
                        print("Reached desired error")
                        print(
                            "progress  epoch %d  step %d  image/sec %0.1f  remaining %dm"
                            % (train_epoch, train_step, rate, remaining / 60))
                        print("saving model")
                        saver.save(sess,
                                   os.path.join(a.output_dir, "model"),
                                   global_step=sv.global_step)
                        break

                if should(a.save_freq):
                    print("saving model")
                    saver.save(sess,
                               os.path.join(a.output_dir, "model"),
                               global_step=sv.global_step)

                if sv.should_stop():
                    break
def log_to_tensorboard(self, test_filename, psnr, save_meta_data=True):
    """Write summaries for one test image plus training scalars.

    Does nothing when ``self.enable_log`` is exactly ``False``. Otherwise
    the image at ``test_filename`` is loaded, downscaled by ``self.scale``
    and upscaled again, the three variants are fed through the graph, and
    the resulting summary is added to the train writer. The given ``psnr``
    is logged to the test writer.

    Args:
        test_filename: Path of the image to load via ``util.load_image``.
        psnr: Value logged under the 'PSNR' tag of the test writer.
        save_meta_data: Currently ignored; it is overwritten with ``False``
            right after the early-return check, so the tracing branch
            below never runs.
    """
    if self.enable_log is False:
        return

    # todo
    save_meta_data = False

    def _with_batch_axis(image):
        # Reshape to [1, shape[0], shape[1], shape[2]].
        dim0, dim1, dim2 = image.shape[0], image.shape[1], image.shape[2]
        return image.reshape([1, dim0, dim1, dim2])

    reference = util.set_image_alignment(
        util.load_image(test_filename, print_console=False), self.scale)

    # A 3-channel image is reduced via convert_rgb_to_y when the model
    # works on a single channel.
    has_three_channels = len(reference.shape) >= 3 and reference.shape[2] == 3
    if has_three_channels and self.channels == 1:
        reference = util.convert_rgb_to_y(reference)

    low_res = util.resize_image_by_pil(
        reference, 1.0 / self.scale,
        resampling_method=self.resampling_method)
    upscaled = util.resize_image_by_pil(
        low_res, self.scale, resampling_method=self.resampling_method)

    # Rescale from the 0..255 range to 0..max_value when they differ.
    if self.max_value != 255.0:
        low_res = np.multiply(low_res, self.max_value / 255.0)  # type: np.ndarray
        upscaled = np.multiply(upscaled, self.max_value / 255.0)  # type: np.ndarray
        reference = np.multiply(reference, self.max_value / 255.0)  # type: np.ndarray

    feed = {
        self.x: _with_batch_axis(low_res),
        self.x2: _with_batch_axis(upscaled),
        self.y: _with_batch_axis(reference),
        self.dropout: 1.0,
        self.is_training: 0
    }
    fetches = [self.summary_op, self.loss]

    if not save_meta_data:
        summary_str, _ = self.sess.run(fetches, feed_dict=feed)
    else:
        # Traced run: record run metadata, dump it as text and print the
        # tfprof timing/memory analysis.
        trace_metadata = tf.RunMetadata()
        trace_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        summary_str, _ = self.sess.run(fetches,
                                       feed_dict=feed,
                                       options=trace_options,
                                       run_metadata=trace_metadata)
        self.test_writer.add_run_metadata(trace_metadata,
                                          "step%d" % self.epochs_completed)

        filename = self.checkpoint_dir + "/" + self.name + "_metadata.txt"
        with open(filename, "w") as out:
            out.write(str(trace_metadata))

        tf.contrib.tfprof.model_analyzer.print_model_analysis(
            tf.get_default_graph(),
            run_meta=trace_metadata,
            tfprof_options=tf.contrib.tfprof.model_analyzer.
            PRINT_ALL_TIMING_MEMORY)

    epoch = self.epochs_completed
    self.train_writer.add_summary(summary_str, epoch)

    # Average training PSNR is only logged when not using L1 loss and at
    # least one training step has been counted.
    if not self.use_l1_loss and self.training_step != 0:
        util.log_scalar_value(self.train_writer, 'PSNR',
                              self.training_psnr_sum / self.training_step,
                              self.epochs_completed)
    util.log_scalar_value(self.train_writer, 'LR', self.lr,
                          self.epochs_completed)
    self.train_writer.flush()

    util.log_scalar_value(self.test_writer, 'PSNR', psnr,
                          self.epochs_completed)
    self.test_writer.flush()
def train(config):
    """Train a ``HandwritingVRNNGmmModel`` as described by ``config``.

    Builds the input queues, the training/sampling (and optionally
    validation) graphs, the optimizer with gradient clipping, restores or
    creates the model directory, and then runs the epoch/step loop with
    periodic checkpoints, summaries, image summaries and validation. A final
    checkpoint is written before the queues and the session are shut down.

    Args:
        config (dict): Experiment configuration. Keys read here include
            'training_data', 'validation_data', 'use_bow_labels',
            'batch_size', 'num_epochs', 'loss_weights' (its 'kld_loss'
            entry), 'learning_rate_type', 'learning_rate',
            'learning_rate_decay_steps', 'learning_rate_decay_rate',
            'grad_clip_by_norm', 'grad_clip_by_value', 'model_dir',
            'checkpoint_id', 'model_save_dir', 'experiment_name',
            'tensorboard_verbose', 'checkpoint_every_step',
            'img_summary_every_step', 'print_every_step' and
            'validate_every_step'; optional flags read with ``.get`` are
            'use_bucket_feeder', 'use_staging_area', 'validate_model' and
            'create_timeline'.

            The dict is modified: 'model_dir' is filled in on a fresh
            start, and 'loss_weights'/'kld_loss' is temporarily replaced by
            a schedule tensor while the graph is built (the original value
            is restored before the configuration is saved).

    Raises:
        Exception: if 'learning_rate_type' is neither 'exponential' nor
            'fixed'.
    """
    Model_cls = HandwritingVRNNGmmModel
    Dataset_cls = HandWritingDatasetConditionalTF

    # Dataset
    training_dataset = Dataset_cls(config['training_data'],
                                   use_bow_labels=config['use_bow_labels'])
    num_training_iterations = int(training_dataset.num_samples /
                                  config['batch_size'])
    print("# training steps per epoch: " + str(num_training_iterations))

    # Create a tensorflow sub-graph that loads batches of samples.
    if config.get('use_bucket_feeder', True) and training_dataset.is_dynamic:
        bucket_edges = training_dataset.get_seq_len_histogram(
            num_bins=15, collapse_first_and_last_bins=[2, -2])
        data_feeder = DataFeederTF(training_dataset,
                                   config['num_epochs'],
                                   config['batch_size'],
                                   queue_capacity=1024)

        sequence_length, inputs, targets = data_feeder.batch_queue_bucket(
            bucket_edges,
            dynamic_pad=training_dataset.is_dynamic,
            queue_capacity=300,
            queue_threads=4)
    else:
        # Training data
        data_feeder = DataFeederTF(training_dataset,
                                   config['num_epochs'],
                                   config['batch_size'],
                                   queue_capacity=1024)
        sequence_length, inputs, targets = data_feeder.batch_queue(
            dynamic_pad=training_dataset.is_dynamic,
            queue_capacity=512,
            queue_threads=4)

    use_staging_area = config.get('use_staging_area', False)
    validate_model = config.get('validate_model', False)

    if use_staging_area:
        staging_area = TFStagingArea([sequence_length, inputs, targets],
                                     device_name="/gpu:0")
        sequence_length, inputs, targets = staging_area.tensors

    # Create step counter (used by optimization routine and learning rate
    # function.)
    global_step = tf.compat.v1.get_variable(name='global_step',
                                            trainable=False,
                                            initializer=1)

    # Annealing KL-divergence loss.
    kld_loss_weight_backup = config['loss_weights']['kld_loss']
    if isinstance(config['loss_weights']['kld_loss'], np.ndarray):
        # Create a piecewise increasing kld loss weight.
        num_steps = len(config['loss_weights']['kld_loss'])
        values = np.linspace(0, 1, num_steps + 1).tolist()
        boundaries = (config['loss_weights']['kld_loss'] *
                      num_training_iterations).tolist()

        config['loss_weights']['kld_loss'] = tf.train.piecewise_constant(
            global_step, boundaries=boundaries, values=values)
        tf.summary.scalar('training/kld_loss_weight',
                          config['loss_weights']['kld_loss'],
                          collections=["training_status"])

    # Create training graph.
    with tf.name_scope("training"):
        model = Model_cls(config,
                          reuse=False,
                          input_op=inputs,
                          target_op=targets,
                          input_seq_length_op=sequence_length,
                          input_dims=training_dataset.input_dims,
                          target_dims=training_dataset.target_dims,
                          mode="training",
                          data_processor=training_dataset)
        model.build_graph()
        model.create_image_summary(training_dataset.prepare_for_visualization)

    # Create sampling graph.
    with tf.name_scope("sampling"):
        sampling_input_op = tf.compat.v1.placeholder(
            tf.float32,
            shape=[
                1, training_dataset.sequence_length,
                sum(training_dataset.input_dims)
            ])
        sampling_sequence_length_op = tf.compat.v1.placeholder(tf.int32,
                                                               shape=[1])
        sampling_model = Model_cls(
            config,
            reuse=True,
            input_op=sampling_input_op,
            target_op=None,
            input_seq_length_op=sampling_sequence_length_op,
            input_dims=training_dataset.input_dims,
            target_dims=training_dataset.target_dims,
            batch_size=1,
            mode="sampling",
            data_processor=training_dataset)
        sampling_model.build_graph()
        sampling_model.create_image_summary(
            training_dataset.prepare_for_visualization)

    # Validation model.
    if validate_model:
        validation_dataset = Dataset_cls(
            config['validation_data'],
            use_bow_labels=config['use_bow_labels'])
        num_validation_iterations = int(validation_dataset.num_samples /
                                        config['batch_size'])
        print("# validation steps per epoch: " +
              str(num_validation_iterations))

        valid_data_feeder = DataFeederTF(validation_dataset,
                                         config['num_epochs'],
                                         config['batch_size'],
                                         queue_capacity=1024,
                                         shuffle=False)
        valid_sequence_length, valid_inputs, valid_targets = valid_data_feeder.batch_queue(
            dynamic_pad=validation_dataset.is_dynamic,
            queue_capacity=512,
            queue_threads=4)

        if use_staging_area:
            valid_staging_area = TFStagingArea(
                [valid_sequence_length, valid_inputs, valid_targets],
                device_name="/gpu:0")
            valid_sequence_length, valid_inputs, valid_targets = valid_staging_area.tensors

        with tf.name_scope("validation"):
            valid_model = Model_cls(
                config,
                reuse=True,
                input_op=valid_inputs,
                target_op=valid_targets,
                input_seq_length_op=valid_sequence_length,
                input_dims=validation_dataset.input_dims,
                target_dims=validation_dataset.target_dims,
                mode="training",
                data_processor=validation_dataset)
            valid_model.build_graph()

    # Create a session object and initialize parameters.
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                            allow_soft_placement=True))

    if config['learning_rate_type'] == 'exponential':
        learning_rate = tf.train.exponential_decay(
            config['learning_rate'],
            global_step=global_step,
            decay_steps=config['learning_rate_decay_steps'],
            decay_rate=config['learning_rate_decay_rate'],
            staircase=False)
        tf.summary.scalar('training/learning_rate',
                          learning_rate,
                          collections=["training_status"])
    elif config['learning_rate_type'] == 'fixed':
        learning_rate = config['learning_rate']
    else:
        raise Exception("Invalid learning rate type")

    optimizer = tf.train.AdamOptimizer(learning_rate)

    # Gradient clipping and a sanity check.
    trainable_variables = tf.trainable_variables()
    grads = list(
        zip(tf.gradients(model.loss, trainable_variables),
            trainable_variables))
    grads_clipped = []
    with tf.name_scope("grad_clipping"):
        for grad, var in grads:
            # Variables that do not contribute to the loss have no gradient
            # and are left out of the update.
            if grad is None:
                continue
            if config['grad_clip_by_norm'] > 0:
                grads_clipped.append(
                    (tf.clip_by_norm(grad, config['grad_clip_by_norm']), var))
            elif config['grad_clip_by_value'] > 0:
                # Clip to the symmetric interval [-c, +c]. Passing -c as the
                # upper bound as well would force every element to -c.
                grads_clipped.append(
                    (tf.clip_by_value(grad, -config['grad_clip_by_value'],
                                      config['grad_clip_by_value']), var))
            else:
                grads_clipped.append((grad, var))
    train_op = optimizer.apply_gradients(grads_and_vars=grads_clipped,
                                         global_step=global_step)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)

    run_opts = None
    run_opts_metadata = None
    if config.get('create_timeline', False):
        run_opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE,
                                 timeout_in_ms=100000)
        run_opts_metadata = tf.RunMetadata()

    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
    if config['model_dir']:
        # If model directory already exists, continue training by restoring
        # computation graph.
        # Restore variables.
        if config['checkpoint_id'] is None:
            checkpoint_path = tf.train.latest_checkpoint(config['model_dir'])
        else:
            checkpoint_path = os.path.join(config['model_dir'],
                                           config['checkpoint_id'])

        print("Continue training with model " + checkpoint_path)
        saver.restore(sess, checkpoint_path)

        step = tf.train.global_step(sess, global_step)
        start_epoch = round(
            step / (training_dataset.num_samples / config['batch_size']))
    else:
        # Fresh start
        # Create a unique output directory for this experiment.
        config['model_dir'] = get_model_dir_timestamp(
            base_path=config['model_save_dir'],
            prefix="tf",
            suffix=config['experiment_name'],
            connector="-")
        print("Saving to {}\n".format(config['model_dir']))
        start_epoch = 1
        step = 1

    coord = tf.train.Coordinator()
    # Enqueue threads must be initialized after definition of train_op.
    data_feeder.init(sess, coord)
    if validate_model:
        valid_data_feeder.init(sess, coord)
    queue_threads = tf.train.start_queue_runners(coord=coord, sess=sess)
    queue_threads.append(data_feeder.enqueue_threads)

    # Register and create summary ops.
    summary_dir = os.path.join(config['model_dir'], "summary")
    summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)

    # Create summaries to visualize weights and gradients.
    if config['tensorboard_verbose'] > 1:
        for grad, var in grads:
            tf.summary.histogram(var.name,
                                 var,
                                 collections=["training_status"])
            tf.summary.histogram(var.name + '/gradient',
                                 grad,
                                 collections=["training_status"])

    if config['tensorboard_verbose'] > 1:
        tf.summary.scalar(
            "training/queue",
            math_ops.cast(data_feeder.input_queue.size(), dtypes.float32) *
            (1. / data_feeder.queue_capacity),
            collections=["training_status"])

    # Save configuration
    # Put the plain kld weight back first: the schedule tensor created above
    # is not what should be stored with the experiment.
    config['loss_weights']['kld_loss'] = kld_loss_weight_backup
    try:
        # Pickle and json dump.
        with open(os.path.join(config['model_dir'], 'config.pkl'),
                  'wb') as pkl_file:
            pickle.dump(config, pkl_file)
        with open(os.path.join(config['model_dir'], 'config.json'),
                  'w') as json_file:
            json.dump(config, json_file, indent=4, sort_keys=True)
    except Exception as err:
        # Saving the configuration is best effort; training goes on.
        print("Warning: could not save configuration: {}".format(err))

    training_summary = tf.compat.v1.summary.merge_all('training_status')

    # Output positions used below: 0 = loss summary, 1 = training status
    # summary, 2 = loss values, and (image variant only) 3 = image ops.
    training_run_ops = [
        model.loss_summary, training_summary, model.ops_loss, train_op
    ]
    training_run_ops_with_img_summary = [
        model.loss_summary, training_summary, model.ops_loss,
        model.ops_img_summary, train_op
    ]
    if validate_model:
        validation_run_ops = [valid_model.ops_loss]

    if use_staging_area:
        training_run_ops.append(staging_area.preload_op)
        training_run_ops_with_img_summary.append(staging_area.preload_op)
        # Fill staging area first.
        for _ in range(256):
            sess.run(staging_area.preload_op,
                     feed_dict={},
                     options=run_opts,
                     run_metadata=run_opts_metadata)

        if validate_model:
            validation_run_ops.append(valid_staging_area.preload_op)
            # Fill staging area first.
            for _ in range(256):
                sess.run(valid_staging_area.preload_op,
                         feed_dict={},
                         options=run_opts,
                         run_metadata=run_opts_metadata)

    for epoch in range(start_epoch, config['num_epochs'] + 1):
        for epoch_step in range(num_training_iterations):
            start_time = time.perf_counter()
            step = tf.train.global_step(sess, global_step)

            if (step % config['checkpoint_every_step']) == 0:
                ckpt_save_path = saver.save(
                    sess, os.path.join(config['model_dir'], 'model'),
                    global_step)
                print("Model saved in file: %s" % ckpt_save_path)

            is_img_summary_step = (
                config['img_summary_every_step'] > 0 and
                step % config['img_summary_every_step'] == 0)

            if is_img_summary_step:
                run_training_output = sess.run(
                    training_run_ops_with_img_summary,
                    feed_dict={},
                    options=run_opts,
                    run_metadata=run_opts_metadata)

                img_summary = model.get_image_summary(
                    sess,
                    ops_img_summary_evaluated=run_training_output[3],
                    seq_len=500)
                summary_writer.add_summary(img_summary, step)
            else:
                run_training_output = sess.run(
                    training_run_ops,
                    feed_dict={},
                    options=run_opts,
                    run_metadata=run_opts_metadata)

            # Loss summary
            summary_writer.add_summary(run_training_output[0], step)
            # Training status summary.
            summary_writer.add_summary(run_training_output[1], step)

            if step % config['print_every_step'] == 0:
                time_elapsed = (time.perf_counter() -
                                start_time) / config['print_every_step']
                model.log_loss(run_training_output[2], step, epoch,
                               time_elapsed)

            if is_img_summary_step:
                sampling_img_summary = sampling_model.get_image_summary(
                    sess, ops_img_summary_evaluated=None, seq_len=500)
                summary_writer.add_summary(sampling_img_summary, step)

            if validate_model and step % config['validate_every_step'] == 0:
                start_time = time.perf_counter()
                for _ in range(num_validation_iterations):
                    run_validation_output = sess.run(
                        validation_run_ops,
                        feed_dict={},
                        options=run_opts,
                        run_metadata=run_opts_metadata)
                    valid_model.update_validation_loss(
                        run_validation_output[0])

                valid_summary, valid_eval_loss = valid_model.get_validation_summary(
                    session=sess)
                # Validation loss summary
                summary_writer.add_summary(valid_summary, step)

                time_elapsed = (time.perf_counter() -
                                start_time) / num_validation_iterations
                valid_model.log_loss(valid_eval_loss,
                                     step,
                                     epoch,
                                     time_elapsed,
                                     prefix="VALID: ")
                valid_model.reset_validation_loss()

            if config.get('create_timeline', False):
                create_tf_timeline(config['model_dir'], run_opts_metadata)

    print("End-of-Training.")
    ckpt_save_path = saver.save(sess,
                                os.path.join(config['model_dir'], 'model'),
                                global_step)
    print("Model saved in file: %s" % ckpt_save_path)
    print('Model is trained for %d epochs, %d steps.' %
          (config['num_epochs'], step))

    try:
        sess.run(data_feeder.input_queue.close(cancel_pending_enqueues=True))
        coord.request_stop()
        coord.join(queue_threads, stop_grace_period_secs=5)
    except Exception as err:
        # Shutdown is best effort; the session is closed regardless.
        print("Warning: queue shutdown failed: {}".format(err))

    sess.close()
def benchmark_model(self,
                    warmup_runs,
                    bm_runs,
                    num_threads,
                    trace_filename=None):
    """Benchmark model.

    Builds the model once on a random input of shape ``self.inputs_shape``,
    freezes the graph to constants, re-imports it into a fresh graph and
    times ``bm_runs`` executions after ``warmup_runs`` warm-up executions.
    The per-batch time and the derived FPS are printed.

    Args:
        warmup_runs (int): Number of untimed (individually printed) runs.
        bm_runs (int): Number of timed runs the average is taken over.
        num_threads (int): If > 0, used as intra-op parallelism with a
            single inter-op thread; otherwise the default session config
            is used.
        trace_filename (str): If set, one extra fully traced run is made
            and its Chrome trace is written to this path.
    """
    if self.tensorrt:
        print('Using tensorrt ', self.tensorrt)
        graphdef = self.freeze_model()

    if num_threads > 0:
        print('num_threads for benchmarking: {}'.format(num_threads))
        sess_config = tf.ConfigProto(
            intra_op_parallelism_threads=num_threads,
            inter_op_parallelism_threads=1)
    else:
        sess_config = tf.ConfigProto()

    # rewriter_config_pb2.RewriterConfig.OFF
    sess_config.graph_options.rewrite_options.dependency_optimization = 2
    if self.use_xla:
        sess_config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_2)

    with tf.Graph().as_default(), tf.Session(config=sess_config) as sess:
        inputs = tf.placeholder(tf.float32,
                                name='input',
                                shape=self.inputs_shape)
        output = self.build_model(inputs, is_training=False)

        img = np.random.uniform(size=self.inputs_shape)

        sess.run(tf.global_variables_initializer())
        if self.tensorrt:
            fetches = [inputs.name] + [i.name for i in output]
            goutput = self.convert_tr(graphdef, fetches)
            inputs, output = goutput[0], goutput[1:]

        if not self.use_xla:
            # Don't use tf.group because XLA removes the whole graph for
            # tf.group.
            output = tf.group(*output)
        else:
            output = tf.add_n([tf.reduce_sum(x) for x in output])

        output_name = [output.name]
        input_name = inputs.name
        graphdef = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, output_name)

    with tf.Graph().as_default(), tf.Session(config=sess_config) as sess:
        tf.import_graph_def(graphdef, name='')

        for i in range(warmup_runs):
            start_time = time.time()
            sess.run(output_name, feed_dict={input_name: img})
            print('Warm up: {} {:.4f}s'.format(i, time.time() - start_time))

        print('Start benchmark runs total={}'.format(bm_runs))
        start = time.perf_counter()
        for _ in range(bm_runs):
            sess.run(output_name, feed_dict={input_name: img})
        end = time.perf_counter()

        # Average over the runs actually executed (this used to divide by a
        # fixed 10); max() keeps bm_runs == 0 from dividing by zero.
        inference_time = (end - start) / max(bm_runs, 1)
        print('Per batch inference time: ', inference_time)
        print('FPS: ', self.batch_size / inference_time)

        if trace_filename:
            run_options = tf.RunOptions()
            run_options.trace_level = tf.RunOptions.FULL_TRACE
            run_metadata = tf.RunMetadata()
            sess.run(output_name,
                     feed_dict={input_name: img},
                     options=run_options,
                     run_metadata=run_metadata)
            logging.info('Dumping trace to %s', trace_filename)
            trace_dir = os.path.dirname(trace_filename)
            if not tf.io.gfile.exists(trace_dir):
                tf.io.gfile.makedirs(trace_dir)
            with tf.io.gfile.GFile(trace_filename, 'w') as trace_file:
                from tensorflow.python.client import timeline  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
                trace = timeline.Timeline(step_stats=run_metadata.step_stats)
                trace_file.write(
                    trace.generate_chrome_trace_format(show_memory=True))
def train(self,
          log_dir=None,
          max_epoch=10000,
          learning_rate=0.001,
          batch_size=None,
          interval_sec=300,
          restore_step=None,
          run_metadata=False):
    """Train model.

    The corpus is read as ``batch_size`` parallel windows of
    ``self.time_size`` tokens whose start offsets are ``jump`` tokens
    apart; labels are the same windows shifted by one token. The hidden
    and cell states returned by each step are fed into the next step.

    Args:
        log_dir (str): Log directory handed to ``open_writer``. Defaults
            to a UTC-timestamped folder under ``tf_logs`` next to this
            file.
        max_epoch (int): Exclusive upper bound of the epoch loop.
        learning_rate (float): Learning rate fed on every training step.
        batch_size (int): Number of parallel corpus windows per step.
            ``None`` is treated as 1.
        interval_sec (float): Passed through to ``open_session``
            (presumably its logging interval in seconds; confirm there).
            Default is 300.
        restore_step (int): Passed through to ``open_session`` and used as
            the initial step counter. When given, the initial summary
            write is skipped.
        run_metadata (bool): If true, run every training step with full
            tracing and write the run metadata to the writer.
    """
    if log_dir is None:
        log_dir = os.path.join(os.path.dirname(__file__), 'tf_logs',
                               datetime.utcnow().strftime('%Y%m%d%H%M%S'))
    if batch_size is None:
        batch_size = 1
    n_batches = len(self.corpus) // (batch_size * self.time_size)
    jump = (len(self.corpus) - 1) // batch_size

    if run_metadata:
        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        metadata = tf.RunMetadata()
    else:
        options = None
        metadata = None

    with self.open_writer(log_dir) as writer:
        with self.open_session(interval_sec=interval_sec,
                               per_step=n_batches,
                               restore_step=restore_step) as sess:
            incomes = np.empty([batch_size, self.time_size], dtype=int)
            labels = np.empty([batch_size, self.time_size], dtype=int)
            for b in range(batch_size):
                offset = b * jump
                incomes[b] = self.corpus[offset:offset + self.time_size]
                labels[b] = self.corpus[offset + 1:offset + self.time_size + 1]

            step = restore_step or 0
            next_h = np.zeros([batch_size, self.hidden_size])
            next_c = np.zeros([batch_size, self.hidden_size])

            # On a fresh run, write the summaries once before any training.
            if restore_step is None:
                for summary in sess.run(
                        self.los_summaries,
                        feed_dict={
                            self.incomes: incomes[:batch_size],
                            self.labels: labels[:batch_size],
                            self.prev_h: next_h,
                            self.prev_c: next_c
                        },
                ):
                    writer.add_summary(summary, step)

            # Stays None when no training step runs (max_epoch already
            # reached or n_batches == 0), so the final record can be skipped
            # instead of failing on an undefined feed dict.
            fd = None
            for epoch_i in range(step // self.data_size, max_epoch):
                for batch_i in range(n_batches):
                    inc, lab = self.fetch_batch(epoch_i, batch_i, batch_size,
                                                jump, incomes, labels)
                    fd = {
                        self.incomes: inc,
                        self.labels: lab,
                        self.prev_h: next_h,
                        self.prev_c: next_c,
                        self.learning_rate: learning_rate,
                    }
                    _, next_h, next_c = sess.run(
                        [self.training_op, self.next_h, self.next_c],
                        feed_dict=fd,
                        options=options,
                        run_metadata=metadata,
                    )
                    step += 1
                    if run_metadata:
                        writer.add_run_metadata(metadata, f'step: {step}')
                    self.record(sess, writer, step, feed_dict=fd)
                print(f'epock {epoch_i}: finished.')

            if fd is not None:
                self.record(sess, writer, step, feed_dict=fd,
                            force_write=True)