def mesh_to_example(codebase_root_dir, mesh_path, dirpath, skip_existing,
                    log_level):
    # Logging level must be specified because mesh_to_example is an entry point
    # for a subprocess call.
    log.set_level(log_level)
    ldif_path = path_util.get_path_to_ldif_root()
    if not skip_existing or not os.path.isfile(
            f'{dirpath}/depth_and_normals.npz'):
        sp.check_output(
            f'{codebase_root_dir}/scripts/process_mesh_local.sh {mesh_path} {dirpath} {ldif_path}',
            shell=True)
        write_depth_and_normals_npz(dirpath, f'{dirpath}/depth_and_normals.npz')
    else:
        log.verbose(f'Skipping shell script processing for {dirpath},'
                    ' the output already exists.')

    # Precompute the dodeca samples for later:
    e = example.InferenceExample.from_directory(dirpath)
    sample_path = e.precomputed_surface_samples_from_dodeca_path
    if not skip_existing or not os.path.isfile(sample_path):
        e.surface_sample_count = 100000
        precomputed_samples = e.surface_samples_from_dodeca
        assert precomputed_samples.shape[0] == 100000
        assert precomputed_samples.shape[1] == 6
        file_util.write_points(sample_path, precomputed_samples)
    else:
        log.verbose(f'Skipping surface sample precomputation for {dirpath},'
                    " it's already done.")
def mesh_to_example(codebase_root_dir, mesh_path, dirpath, skip_existing,
                    log_level):
    # Logging level must be specified because mesh_to_example is an entry point
    # for a subprocess call.
    log.set_level(log_level)
    ldif_path = path_util.get_path_to_ldif_root()
    if not skip_existing or not os.path.isfile(
            f'{dirpath}/depth_and_normals.npz'):
        sp.check_output(
            f'{codebase_root_dir}/scripts/process_mesh_local.sh {mesh_path} {dirpath} {ldif_path}',
            shell=True)
        # write_depth_and_normals_npz(dirpath, f'{dirpath}/depth_and_normals.npz')
    else:
        log.verbose(f'Skipping shell script processing for {dirpath},'
                    ' the output already exists.')
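# Illustrative only: a minimal sketch of invoking mesh_to_example directly for
# a single mesh, using only the names defined in this module. All paths below
# are hypothetical placeholders, and the log_level value is assumed to be one
# accepted by log.set_level.
def _example_mesh_to_example_call():
    mesh_to_example(
        codebase_root_dir='/path/to/ldif',                    # hypothetical checkout root
        mesh_path='/path/to/meshes/train/chairs/model.ply',   # hypothetical input mesh
        dirpath='/path/to/dataset/train/chairs/model',        # hypothetical output directory
        skip_existing=True,
        log_level='verbose')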
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    tf.disable_v2_behavior()
    log.set_level(FLAGS.log_level)

    log.info('Making dataset...')
    if not FLAGS.dataset_directory:
        raise ValueError('A dataset directory must be provided.')
    # TODO(kgenova) This batch size should match.
    dataset = local_inputs.make_dataset(FLAGS.dataset_directory,
                                        mode='train',
                                        batch_size=FLAGS.batch_size,
                                        split=FLAGS.split)

    # Sets up the hyperparameters and tf.Dataset
    model_config = build_model_config(dataset)

    # Generates the graph for a single train step, including summaries
    shared_launcher.sif_transcoder(model_config)
    summary_op = tf.summary.merge_all()
    global_step_op = tf.compat.v1.train.get_global_step()

    saver = tf.train.Saver(max_to_keep=5,
                           pad_step_number=False,
                           save_relative_paths=True)

    init_op = tf.initialize_all_variables()

    model_root = get_model_root()
    experiment_dir = f'{model_root}/sif-transcoder-{FLAGS.experiment_name}'
    checkpoint_dir = f'{experiment_dir}/1-hparams/train/'

    if FLAGS.reserve_memory_for_inference_kernel and sys.platform != "darwin":
        current_free = gpu_util.get_free_gpu_memory(0)
        allowable = current_free - (1024 + 512)  # ~1GB
        allowable_fraction = allowable / current_free
        if allowable_fraction <= 0.0:
            raise ValueError(
                f"Can't leave 1GB over for the inference kernel, because"
                f" there is only {allowable} total free GPU memory.")
        log.info(
            f'TensorFlow can use up to {allowable_fraction*100}% of the total'
            ' GPU memory.')
    else:
        allowable_fraction = 1.0
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=allowable_fraction)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
        writer = tf.summary.FileWriter(f'{experiment_dir}/log', session.graph)
        log.info('Initializing variables...')
        session.run([init_op])

        if FLAGS.visualize:
            visualize_data(session, model_config.inputs['dataset'])

        # Check whether the checkpoint directory already exists (resuming) or
        # needs to be created (new model).
        if not os.path.isdir(checkpoint_dir):
            log.info('No previous checkpoint detected, training from scratch.')
            os.makedirs(checkpoint_dir)
            # Serialize hparams so eval can load them:
            hparam_path = f'{checkpoint_dir}/hparam_pickle.txt'
            if not file_util.exists(hparam_path):
                hparams.write_hparams(model_config.hparams, hparam_path)
            initial_index = 0
        else:
            log.info(
                f'Checkpoint root {checkpoint_dir} exists, attempting to resume.')
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
            log.info(f'Latest checkpoint: {latest_checkpoint}')
            saver.restore(session, latest_checkpoint)
            initial_index = session.run(global_step_op)
            log.info(f'The global step is {initial_index}')
            initial_index = int(initial_index)
            log.info(f'Parsed to {initial_index}')

        for i in range(initial_index, FLAGS.train_step_count):
            start_time = time.time()
            log.info(f'Step {i}')
            is_summary_step = i % FLAGS.summary_step_interval == 0
            if is_summary_step:
                _, summaries, loss = session.run(
                    [model_config.train_op, summary_op, model_config.loss])
                writer.add_summary(summaries, i)
            else:
                _, loss = session.run(
                    [model_config.train_op, model_config.loss])
            end_time = time.time()
            steps_per_second = 1.0 / (end_time - start_time)
            log.info(f'Loss: {loss}\tSteps/second: {steps_per_second}')

            is_checkpoint_step = i % FLAGS.checkpoint_interval == 0
            if is_checkpoint_step or i == FLAGS.train_step_count - 1:
                ckpt_path = os.path.join(checkpoint_dir, 'model.ckpt')
                log.info(f'Writing checkpoint to {ckpt_path}...')
                saver.save(session, ckpt_path, global_step=i)
        log.info('Done training!')
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    random.seed(2077)
    log.set_level(FLAGS.log_level)

    n_jobs = os.cpu_count()
    assert FLAGS.max_threads != 0
    if FLAGS.max_threads > 0:
        n_jobs = FLAGS.max_threads

    mesh_directory = FLAGS.mesh_directory
    if mesh_directory[-1] == '/':
        mesh_directory = mesh_directory[:-1]

    files = glob.glob(f'{mesh_directory}/*/*/*.ply')
    if not files and not FLAGS.optimize_only:
        raise ValueError(f"Didn't find any ply files in {mesh_directory}. "
                         "Please make sure the directory structure is "
                         "[mesh_directory]/[splits]/[class names]/[ply files]")

    # Make the directories first because it's not threadsafe and also might fail.
    if files and not FLAGS.optimize_only:
        log.info('Creating directories...')
        for i, f in tqdm.tqdm(enumerate(files)):
            relpath = f.replace(mesh_directory, '')
            # log.info(f'Relpath: {relpath}')
            assert relpath[0] == '/'
            relpath = relpath[1:]
            split, synset = relpath.split('/')[:2]
            if not os.path.isdir(f'{FLAGS.dataset_directory}/{split}'):
                os.makedirs(f'{FLAGS.dataset_directory}/{split}')
            if not os.path.isdir(f'{FLAGS.dataset_directory}/{split}/{synset}'):
                os.mkdir(f'{FLAGS.dataset_directory}/{split}/{synset}')
        log.info('Making dataset...')
        # Flags can't be pickled:
        output_dirs = Parallel(n_jobs=n_jobs)(
            delayed(process_one)(f, mesh_directory, FLAGS.dataset_directory,
                                 FLAGS.skip_existing, FLAGS.log_level)
            for f in tqdm.tqdm(files))
        log.info('Making dataset registry...')
    else:
        output_dirs = glob.glob(
            f'{FLAGS.dataset_directory}/*/*/*/surface_samples_from_dodeca.pts')
        output_dirs = [os.path.dirname(f) + '/' for f in output_dirs]
    # Sort so that randomizing with a fixed seed always results in the same order.
    output_dirs.sort()

    splits = {x.split('/')[-4] for x in output_dirs}
    if 'optimized' in splits:
        raise ValueError('The keyword "optimized" cannot be used for a split'
                         ' name, it is reserved.')
    for split in splits:
        elements_of_split = [x for x in output_dirs if x.split('/')[-4] == split]
        with open(f'{FLAGS.dataset_directory}/{split}.txt', 'wt') as f:
            f.write('\n'.join(elements_of_split) + '\n')
    log.info('Done!')

    if FLAGS.optimize:
        log.info('Precomputing optimized tfrecord files...')
        opt_dir = f'{FLAGS.dataset_directory}/optimized'
        if FLAGS.trample_optimized and os.path.isdir(opt_dir):
            for f in os.listdir(opt_dir):
                if f.endswith('.tfrecords'):
                    os.remove(os.path.join(opt_dir, f))
        if not os.path.isdir(opt_dir):
            os.mkdir(opt_dir)
        for split in splits:
            log.info(f'Optimizing split {split}...')
            elements_of_split = [
                x for x in output_dirs if x.split('/')[-4] == split
            ]
            examples_per_shard = 64
            # Make sure shards are totally random:
            random.shuffle(elements_of_split)
            n_shards = int(len(elements_of_split) / examples_per_shard)
            if len(elements_of_split) % examples_per_shard:
                n_shards += 1
            shard_dir = f'{FLAGS.dataset_directory}/optimized/{split}'
            if not os.path.isdir(shard_dir):
                os.mkdir(shard_dir)
            for shard_idx in tqdm.tqdm(range(n_shards)):
                shard_name = f'{shard_dir}/{split}-%.5d-of-%.5d.tfrecords' % (
                    shard_idx, n_shards)
                if not FLAGS.trample_optimized and os.path.isfile(shard_name):
                    continue
                start_idx = shard_idx * examples_per_shard
                end_idx = (shard_idx + 1) * examples_per_shard
                options = tf.io.TFRecordOptions(
                    tf.compat.v1.io.TFRecordCompressionType.GZIP)
                with tf.io.TFRecordWriter(shard_name, options=options) as writer:
                    to_process = elements_of_split[start_idx:end_idx]
                    serialized = Parallel(n_jobs=n_jobs)(
                        delayed(serialize)(d, FLAGS.log_level)
                        for d in to_process)
                    for s in serialized:
                        writer.write(s)
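# Illustrative only: a minimal sketch of reading back one optimized GZIP shard
# and counting its serialized examples, mirroring the writer options used
# above. The shard path is a hypothetical placeholder.
def _example_count_records_in_shard(
        shard_path='/path/to/dataset/optimized/train/train-00000-of-00001.tfrecords'):
    options = tf.io.TFRecordOptions(
        tf.compat.v1.io.TFRecordCompressionType.GZIP)
    count = 0
    # tf_record_iterator yields the raw serialized protos written by the writer.
    for _ in tf.compat.v1.io.tf_record_iterator(shard_path, options=options):
        count += 1
    return count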
def load_example_dict(example_directory, log_level=None):
    """Loads an example from disk and makes a str:numpy dictionary out of it."""
    if log_level:
        log.set_level(log_level)
    entry_t = time.time()
    # Keep the function entry time around for a cumulative print.
    start_t = entry_t
    e = example.InferenceExample.from_directory(example_directory, verbose=False)
    end_t = time.time()
    log.verbose(f'Make example: {end_t - start_t}')
    start_t = end_t

    # The from_directory method should probably optionally take in a synset.
    bounding_box_samples = e.uniform_samples
    end_t = time.time()
    log.verbose(f'Bounding box: {end_t - start_t}')
    start_t = end_t

    # TODO(kgenova) There is a pitfall here where the depth is divided by 1000,
    # after this. So if some other depth images are provided, they would either
    # need to also be stored in the GAPS format or be artificially multiplied
    # by 1000.
    depth_renders = e.depth_images  # [20, 224, 224, 1]. 1 or 1000? trailing 1?
    assert depth_renders.shape[0] == 1
    depth_renders = depth_renders[0, ...]
    end_t = time.time()
    log.verbose(f'Depth renders: {end_t - start_t}')
    start_t = end_t

    mesh_name = e.mesh_name
    end_t = time.time()
    log.verbose(f'Mesh name: {end_t - start_t}')
    start_t = end_t

    log.verbose(f'Loading {mesh_name} from split {e.split}')
    near_surface_samples = e.near_surface_samples
    end_t = time.time()
    log.verbose(f'NSS: {end_t - start_t}')
    start_t = end_t

    grid = e.grid
    end_t = time.time()
    log.verbose(f'Grid: {end_t - start_t}')
    start_t = end_t

    world2grid = e.world2grid
    end_t = time.time()
    log.verbose(f'world2grid: {end_t - start_t}')
    start_t = end_t

    surface_point_samples = e.precomputed_surface_samples_from_dodeca
    end_t = time.time()
    log.verbose(f'surface points: {end_t - start_t}')
    log.verbose(f'load_example_dict total time: {end_t - entry_t}')

    return {
        'bounding_box_samples': bounding_box_samples,
        'depth_renders': depth_renders,
        'mesh_name': mesh_name,
        'near_surface_samples': near_surface_samples,
        'grid': grid,
        'world2grid': world2grid,
        'surface_point_samples': surface_point_samples,
    }
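# Illustrative only: a minimal sketch of inspecting the dictionary returned by
# load_example_dict for one (hypothetical) example directory. Shapes are not
# asserted here because they depend on the preprocessing configuration.
def _example_inspect_example_dict(
        example_directory='/path/to/dataset/train/chairs/model'):
    d = load_example_dict(example_directory, log_level='verbose')
    for key, value in d.items():
        # mesh_name is a plain string; everything else is a numpy array.
        shape = getattr(value, 'shape', None)
        print(key, shape if shape is not None else type(value))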
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    log.set_level(FLAGS.log_level)

    tf.disable_v2_behavior()
    gpu_util.get_free_gpu_memory(0)
    if FLAGS.use_gpu_for_tensorflow and FLAGS.use_inference_kernel:
        log.info('Limiting TensorFlow memory by 1GB so the inference kernel'
                 ' has enough left over to run.')

    if not FLAGS.dataset_directory:
        raise ValueError('A dataset directory must be provided.')
    if not FLAGS.result_directory:
        if FLAGS.save_results or FLAGS.save_meshes or FLAGS.save_ldifs:
            raise ValueError(
                'A result directory must be provided to save results.')
    else:
        if not os.path.isdir(FLAGS.result_directory):
            os.makedirs(FLAGS.result_directory)
    if not FLAGS.use_gpu_for_tensorflow:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    log.info('Loading model...')
    # Try to detect the most common error early for a good warning message:
    if not os.path.isdir(get_model_root()):
        raise ValueError(f"Couldn't find a trained model at {get_model_root()}")
    encoder, decoder = load_newest_model()

    log.info('Evaluating metrics...')
    splits = [x for x in FLAGS.split.split(',') if x]
    log.info(f'Will evaluate on splits: {splits}')
    for split in splits:
        log.info(f'Starting evaluation for split {split}.')
        dataset_items = get_evaluation_directories(split)
        log.info(f'The split has {len(dataset_items)} elements.')
        results = []
        to_eval = filter_by_class(dataset_items)
        to_eval = filter_by_eval_frac(to_eval)
        for path in tqdm.tqdm(to_eval):
            e = examples.InferenceExample.from_directory(path)
            embedding = encoder.run_example(e)
            iou = decoder.iou(embedding, e)
            gt_mesh = e.gt_mesh
            mesh = decoder.extract_mesh(embedding, resolution=FLAGS.resolution)
            if FLAGS.visualize:
                # Visualize in the normalized_coordinate frame, so the camera is
                # always reasonable. Metrics are computed in the original frame.
                gaps_util.mshview([e.normalized_gt_mesh, mesh])

            # TODO(kgenova) gaps2occnet is poorly named, it is really normalized ->
            # unnormalized (where 'gaps' is the normalized training frame and
            # 'occnet' is whatever the original frame of the input mesh was)
            post_extract_start = time.time()
            mesh.apply_transform(e.gaps2occnet)

            if FLAGS.save_meshes:
                path = (f'{FLAGS.result_directory}/meshes/{split}/{e.cat}/'
                        f'{e.mesh_hash}.ply')
                if not os.path.isdir(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                mesh.export(path)
            if FLAGS.save_ldifs:
                path = (f'{FLAGS.result_directory}/ldifs/{split}/{e.cat}/'
                        f'{e.mesh_hash}.txt')
                if not os.path.isdir(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                decoder.savetxt(embedding, path)

            nc, fst, fs2t, chamfer = metrics.all_mesh_metrics(mesh, gt_mesh)
            log.verbose(f'Mesh: {e.mesh_name}')
            log.verbose(f'IoU: {iou}.')
            log.verbose(f'F-Score (tau): {fst}')
            log.verbose(f'Chamfer: {chamfer}')
            log.verbose(f'F-Score (2*tau): {fs2t}')
            log.verbose(f'Normal Consistency: {nc}')
            results.append({
                'key': e.mesh_name,
                'Normal Consistency': nc,
                'F-Score (tau)': fst,
                'F-Score (2*tau)': fs2t,
                'Chamfer': chamfer,
                'IoU': iou
            })
            post_extract_end = time.time()
            log.verbose(
                f'Time post extract: {post_extract_end - post_extract_start}')
        results = pd.DataFrame(results)
        if FLAGS.save_results:
            complete_csv = results.to_csv()
            result_path = f'{FLAGS.result_directory}/full_results_{split}.csv'
            file_util.writetxt(result_path, complete_csv)
        final_results = metrics.aggregate_extracted(results)
        if FLAGS.save_results:
            summary_out_path = f'{FLAGS.result_directory}/result_summary_{split}.csv'
            file_util.writetxt(summary_out_path, final_results.to_csv())
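# Illustrative only: a minimal pandas sketch, independent of
# metrics.aggregate_extracted (whose exact behavior is not shown here), for
# summarizing a saved full_results_<split>.csv by per-metric means. The CSV
# path is a hypothetical placeholder.
def _example_summarize_results(
        csv_path='/path/to/results/full_results_test.csv'):
    df = pd.read_csv(csv_path)
    metric_columns = ['Normal Consistency', 'F-Score (tau)',
                      'F-Score (2*tau)', 'Chamfer', 'IoU']
    # Mean of each metric over all evaluated meshes in the split.
    return df[metric_columns].mean()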
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    tf.disable_v2_behavior()
    log.set_level(FLAGS.log_level)

    log.info('Making dataset...')
    if not FLAGS.dataset_directory:
        raise ValueError('A dataset directory must be provided.')
    if not os.path.isdir(FLAGS.dataset_directory):
        raise ValueError(
            f'No dataset directory found at {FLAGS.dataset_directory}')
    # TODO(kgenova) This batch size should match.
    dataset = local_inputs.make_dataset(FLAGS.dataset_directory,
                                        mode='train',
                                        batch_size=FLAGS.batch_size,
                                        split=FLAGS.split)

    # Sets up the hyperparameters and tf.Dataset
    model_config = build_model_config(dataset)
    #print('[HERE: In train] ******* Printing model_config, right after building model config')
    #print(type(model_config))
    #print(dir(model_config))
    #print('[HERE: In train] ******* Printing model_config done, right after building model config')

    # Generates the graph for a single train step, including summaries
    # shared_launcher.sif_transcoder sets more configs of model_config
    shared_launcher.sif_transcoder(model_config)
    print('[HERE: In train] ******* Printing model_config, right after running shared_launcher')
    print(type(model_config))
    print(dir(model_config))
    print('Type of model_config.train_op:', type(model_config.train_op))
    print('Type of model_config.loss:', type(model_config.loss))
    print('Losses used:', model_config.hparams.loss)
    print('Hparams:', model_config.hparams)
    # train_op is a tensor!
    print('[HERE: In train] ******* Printing model_config done, right after running shared_launcher')

    summary_op = tf.summary.merge_all()
    global_step_op = tf.compat.v1.train.get_global_step()

    saver = tf.train.Saver(max_to_keep=5,
                           pad_step_number=False,
                           save_relative_paths=True)

    init_op = tf.initialize_all_variables()

    model_root = get_model_root()
    experiment_dir = f'{model_root}/sif-transcoder-{FLAGS.experiment_name}'
    checkpoint_dir = f'{experiment_dir}/1-hparams/train/'

    if FLAGS.reserve_memory_for_inference_kernel and sys.platform != "darwin":
        print('[HERE: In train] --reserve_memory_for_inference_kernel specified.')
        current_free = gpu_util.get_free_gpu_memory(2)
        allowable = current_free - (1024 + 512)  # ~1GB
        allowable = min(allowable, 10000)
        allowable_fraction = allowable / current_free
        print('[HERE: In train] GPU memory usage planning:')
        #print('[HERE: In train] | allowable is limited to = 5000')
        print('[HERE: In train] | current_free = %d, allowable = %d' %
              (current_free, allowable))
        if allowable_fraction <= 0.0:
            raise ValueError(
                f"Can't leave 1GB over for the inference kernel, because"
                f" there is only {allowable} total free GPU memory.")
        log.info(
            f'TensorFlow can use up to {allowable_fraction*100}% of the total'
            ' GPU memory.')
    else:
        allowable_fraction = 1.0
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=allowable_fraction)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
        #print('[HERE: In train] ******* Printing model_config, right after session creation')
        #print(type(model_config))
        #print(dir(model_config))
        #print('[HERE: In train] ******* Printing model_config done, right after session creation')
        writer = tf.summary.FileWriter(f'{experiment_dir}/log', session.graph)
        log.info('Initializing variables...')
        session.run([init_op])
        #print('[HERE: In train] ******* Printing model_config, right after session init')
        #print(type(model_config))
        #print(dir(model_config))
        #print('[HERE: In train] ******* Printing model_config done, right after session init')

        if FLAGS.visualize:
            visualize_data(session, model_config.inputs['dataset'])

        # Check whether the checkpoint directory already exists (resuming) or
        # needs to be created (new model).
        if not os.path.isdir(checkpoint_dir):
            log.info('No previous checkpoint detected, training from scratch.')
            os.makedirs(checkpoint_dir)
            # Serialize hparams so eval can load them:
            hparam_path = f'{checkpoint_dir}/hparam_pickle.txt'
            if not file_util.exists(hparam_path):
                hparams.write_hparams(model_config.hparams, hparam_path)
            initial_index = 0
        else:
            log.info(
                f'Checkpoint root {checkpoint_dir} exists, attempting to resume.')
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
            log.info(f'Latest checkpoint: {latest_checkpoint}')
            saver.restore(session, latest_checkpoint)
            initial_index = session.run(global_step_op)
            log.info(f'The global step is {initial_index}')
            initial_index = int(initial_index)
            log.info(f'Parsed to {initial_index}')

        print('[HERE: In train] Starting training...')
        start_time = time.time()
        log_every = 10

        print('[HERE: In train] ******* Printing model_config, right before training loop starts')
        print(type(model_config))
        print(dir(model_config))
        print('[HERE: In train] ******* Printing model_config done, right before training loop starts')

        for i in range(initial_index, FLAGS.train_step_count):
            print('[HERE: In train] Starting training, within loop, before log verbose...')
            log.verbose(f'Starting step {i}...')
            print(f'[HERE: In train] Starting step {i}...')
            print('[HERE: In train] Starting training, within loop, after verbose...')
            is_summary_step = i % FLAGS.summary_step_interval == 0
            # running the session to get the results
            if is_summary_step:
                #print('[HERE: In train] This is a summary step. Computing summaries and loss...')
                _, summaries, loss = session.run(
                    [model_config.train_op, summary_op, model_config.loss])
                writer.add_summary(summaries, i)
                print('[HERE: In train] This is a summary step. Done writing summaries and loss...')
            else:
                print('[HERE: In train] This is not a summary step. Computing loss...')
                _, loss = session.run(
                    [model_config.train_op, model_config.loss])
                print('[HERE: In train] This is not a summary step. Done computing loss...')

            if not (i % log_every):
                print('[HERE: In train] This is a log step. Logging...')
                end_time = time.time()
                steps_per_second = float(log_every) / (end_time - start_time)
                start_time = end_time
                log.info(
                    f'Step: {i}\tLoss: {loss}\tSteps/second: {steps_per_second}')
                print('[HERE: In train] This is a log step. Logging done...')

            is_checkpoint_step = i % FLAGS.checkpoint_interval == 0
            if is_checkpoint_step or i == FLAGS.train_step_count - 1:
                print('[HERE: In train] This is a saving checkpoint step. Saving model...')
                ckpt_path = os.path.join(checkpoint_dir, 'model.ckpt')
                log.info(f'Writing checkpoint to {ckpt_path}...')
                saver.save(session, ckpt_path, global_step=i)
                print('[HERE: In train] This is a saving checkpoint step. Done saving model...')
            print('[HERE: In train] This step done. Starting a new step...')
        log.info('Done training!')