def train(self, do_monitoring):
    loss_function = 'categorical_crossentropy'
    metrics = ['accuracy']
    initial_epoch = self.get_initial_epoch()

    steps_per_epoch = int(
        self.nb_training_samples / self.options.batch_size)
    validation_steps = int(
        self.nb_validation_samples / self.options.batch_size)
    # Useful for testing
    if self.options.short_epoch:
        steps_per_epoch = 1
        validation_steps = 1

    callbacks = self.make_callbacks(do_monitoring)

    self.model.compile(self.optimizer, loss_function, metrics=metrics)

    if do_monitoring:
        tensorboard_process = Popen(
            ['tensorboard', '--logdir={}'.format(self.tf_logs_path)])
        terminate_at_exit(tensorboard_process)

    self.model.fit_generator(
        self.training_gen,
        initial_epoch=initial_epoch,
        steps_per_epoch=steps_per_epoch,
        epochs=self.options.nb_epochs,
        validation_data=self.validation_gen,
        validation_steps=validation_steps,
        callbacks=callbacks)

    if do_monitoring:
        tensorboard_process.terminate()
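# Side note on the steps arithmetic above (illustrative, not from the
# original source): int() truncates, so any partial final batch is dropped
# each epoch. For example, 1050 training samples with a batch size of 32
# give int(1050 / 32) == 32 steps, covering 32 * 32 == 1024 samples.
assert int(1050 / 32) == 32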
def _run_experiment(self, command_dag):
    tmp_dir = self.tmp_dir or RVConfig.get_tmp_dir().name
    make_dir(tmp_dir)
    makefile_name = os.path.join(tmp_dir, 'Makefile')
    with open(makefile_name, 'w') as makefile:
        command_ids = command_dag.get_sorted_command_ids()

        # .PHONY: 0 1 2 3 4 5
        makefile.write('.PHONY:')
        for command_id in command_ids:
            makefile.write(' {}'.format(command_id))
        makefile.write('\n\n')

        # all: 0 1 2 3 4 5
        makefile.write('all:')
        for command_id in command_ids:
            makefile.write(' {}'.format(command_id))
        makefile.write('\n\n')

        for command_id in command_ids:
            # 0: 1 2
            makefile.write('{}:'.format(command_id))
            for upstream_id in command_dag.get_upstream_command_ids(
                    command_id):
                makefile.write(' {}'.format(upstream_id))
            makefile.write('\n')

            # \t rastervision ...
            command_def = command_dag.get_command_definition(command_id)
            command_config = command_def.command_config
            command_root_uri = command_config.root_uri
            command_basename = 'command-config-{}.json'.format(
                command_config.split_id)
            command_uri = os.path.join(command_root_uri, command_basename)
            print('Saving command configuration to {}...'.format(
                command_uri))
            save_json_config(command_config.to_proto(), command_uri)

            run_command = make_command(command_uri, self.tmp_dir)
            makefile.write('\t{}\n\n'.format(run_command))

    process = Popen(['make', '-j', '-f', makefile_name])
    terminate_at_exit(process)
    exitcode = process.wait()
    if exitcode != 0:
        sys.exit(exitcode)
    else:
        return 0
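# Illustrative sketch (not part of the original source): for sorted command
# ids [0, 1, 2] where command 2 has upstream commands 0 and 1, the loop
# above emits a Makefile shaped like the following. Each recipe line, shown
# here as a <run command N> placeholder, is produced by make_command() and
# must be tab-indented (hence the '\t' in the write above):
#
#     .PHONY: 0 1 2
#
#     all: 0 1 2
#
#     0:
#             <run command 0>
#
#     1:
#             <run command 1>
#
#     2: 0 1
#             <run command 2>
#
# Because every command is a .PHONY target whose prerequisites are its
# upstream commands, `make -j` runs independent commands in parallel while
# still respecting the DAG's ordering.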
def train(config_path, output_dir, num_steps, model_main_py=None,
          do_monitoring=True):
    output_train_dir = join(output_dir, 'train')
    output_eval_dir = join(output_dir, 'eval')

    model_main_py = (
        model_main_py or '/opt/tf-models/object_detection/model_main.py')
    train_cmd = [
        'python', model_main_py, '--alsologtostderr',
        '--pipeline_config_path={}'.format(config_path),
        '--model_dir={}'.format(output_train_dir),
        '--num_train_steps={}'.format(num_steps),
        '--sample_1_of_n_eval_examples={}'.format(1)
    ]
    log.info('Running train command: {}'.format(' '.join(train_cmd)))
    train_process = Popen(train_cmd, stdout=PIPE, stderr=STDOUT)
    terminate_at_exit(train_process)

    if do_monitoring:
        eval_cmd = [
            'python', model_main_py, '--alsologtostderr',
            '--pipeline_config_path={}'.format(config_path),
            '--checkpoint_dir={}'.format(output_train_dir),
            '--model_dir={}'.format(output_eval_dir)
        ]
        log.info('Running eval command: {}'.format(' '.join(eval_cmd)))
        # Don't let the eval process take up GPU space.
        env = deepcopy(os.environ)
        env['CUDA_VISIBLE_DEVICES'] = '-1'
        eval_process = Popen(eval_cmd, env=env)

        tensorboard_process = Popen(
            ['tensorboard', '--logdir={}'.format(output_dir)])
        terminate_at_exit(eval_process)
        terminate_at_exit(tensorboard_process)

    with train_process:
        for line in train_process.stdout:
            log.info(line.decode('utf-8'))

    log.info('-----DONE TRAINING----')

    if do_monitoring:
        eval_process.terminate()
        tensorboard_process.terminate()
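# A minimal standalone sketch (illustrative; stream_to_log is not part of
# the original source) of the log-streaming pattern used above: stderr is
# merged into stdout via stderr=STDOUT, and each line is forwarded to the
# logger until the child exits.
import logging
from subprocess import Popen, PIPE, STDOUT

log = logging.getLogger(__name__)


def stream_to_log(cmd):
    process = Popen(cmd, stdout=PIPE, stderr=STDOUT)
    # Popen is a context manager; leaving the block closes the pipe and
    # waits for the child to finish.
    with process:
        for line in process.stdout:
            log.info(line.decode('utf-8').rstrip())
    return process.returncode


# Example: stream_to_log(['python', '--version'])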
def train(self, tmp_dir):
    """Train a model.

    This downloads any previous output saved to the train_uri, starts
    training (or resumes from a checkpoint), periodically syncs the
    contents of train_dir to train_uri, and syncs once more after
    training finishes.

    Args:
        tmp_dir: (str) path to temp directory
    """
    self.log_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    batch_size = self.train_opts.batch_size
    chip_size = self.task_config.chip_size
    class_names = self.class_map.get_class_names()
    databunch = build_databunch(chip_dir, chip_size, batch_size,
                                class_names)
    log.info(databunch)
    num_labels = len(databunch.label_names)
    if self.train_opts.debug:
        make_debug_chips(databunch, self.class_map, tmp_dir, train_uri)

    # Setup model.
    model = get_model(
        self.train_opts.model_arch, num_labels, pretrained=True)
    model = model.to(self.device)
    model_path = join(train_dir, 'model')

    # Load weights from a pretrained model.
    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        log.info('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        model.load_state_dict(
            torch.load(pretrained_path, map_location=self.device))

    # Possibly resume training from checkpoint.
    start_epoch = 0
    train_state_path = join(train_dir, 'train_state.json')
    if isfile(train_state_path):
        log.info('Resuming from checkpoint: {}\n'.format(model_path))
        train_state = file_to_json(train_state_path)
        start_epoch = train_state['epoch'] + 1
        model.load_state_dict(
            torch.load(model_path, map_location=self.device))

    # Write header of log CSV file.
    metric_names = ['precision', 'recall', 'f1']
    log_path = join(train_dir, 'log.csv')
    if not isfile(log_path):
        with open(log_path, 'w') as log_file:
            log_writer = csv.writer(log_file)
            row = ['epoch', 'time', 'train_loss'] + metric_names
            log_writer.writerow(row)

    # Setup Tensorboard logging.
    if self.train_opts.log_tensorboard:
        log_dir = join(train_dir, 'tb-logs')
        make_dir(log_dir)
        tb_writer = SummaryWriter(log_dir=log_dir)
        if self.train_opts.run_tensorboard:
            log.info('Starting tensorboard process')
            tensorboard_process = Popen(
                ['tensorboard', '--logdir={}'.format(log_dir)])
            terminate_at_exit(tensorboard_process)

    # Setup optimizer, loss, and LR scheduler.
    loss_fn = torch.nn.CrossEntropyLoss()
    lr = self.train_opts.lr
    opt = optim.Adam(model.parameters(), lr=lr)
    step_scheduler, epoch_scheduler = None, None
    num_epochs = self.train_opts.num_epochs

    if self.train_opts.one_cycle and num_epochs > 1:
        steps_per_epoch = len(databunch.train_ds) // batch_size
        total_steps = num_epochs * steps_per_epoch
        step_size_up = (num_epochs // 2) * steps_per_epoch
        step_size_down = total_steps - step_size_up
        step_scheduler = CyclicLR(
            opt,
            base_lr=lr / 10,
            max_lr=lr,
            step_size_up=step_size_up,
            step_size_down=step_size_down,
            cycle_momentum=False)
        # Fast-forward the scheduler when resuming mid-cycle.
        for _ in range(start_epoch * steps_per_epoch):
            step_scheduler.step()

    # Training loop.
    for epoch in range(start_epoch, num_epochs):
        # Train one epoch.
        log.info('-----------------------------------------------------')
        log.info('epoch: {}'.format(epoch))
        start = time.time()
        train_loss = train_epoch(model, self.device, databunch.train_dl,
                                 opt, loss_fn, step_scheduler)
        if epoch_scheduler:
            epoch_scheduler.step()
        log.info('train loss: {}'.format(train_loss))

        # Validate one epoch.
        metrics = validate_epoch(model, self.device, databunch.valid_dl,
                                 num_labels)
        log.info('validation metrics: {}'.format(metrics))

        # Print elapsed time for epoch.
        end = time.time()
        epoch_time = datetime.timedelta(seconds=end - start)
        log.info('epoch elapsed time: {}'.format(epoch_time))

        # Save model and state.
        torch.save(model.state_dict(), model_path)
        train_state = {'epoch': epoch}
        json_to_file(train_state, train_state_path)

        # Append to log CSV file.
        with open(log_path, 'a') as log_file:
            log_writer = csv.writer(log_file)
            row = [epoch, epoch_time, train_loss]
            row += [metrics[k] for k in metric_names]
            log_writer.writerow(row)

        # Write to Tensorboard log.
        if self.train_opts.log_tensorboard:
            for key, val in metrics.items():
                tb_writer.add_scalar(key, val, epoch)
            tb_writer.add_scalar('train_loss', train_loss, epoch)
            for name, param in model.named_parameters():
                tb_writer.add_histogram(name, param, epoch)

        if (train_uri.startswith('s3://')
                and ((epoch + 1) % self.train_opts.sync_interval == 0)):
            sync_to_dir(train_dir, train_uri)

    # Close Tensorboard.
    if self.train_opts.log_tensorboard:
        tb_writer.close()
        if self.train_opts.run_tensorboard:
            tensorboard_process.terminate()

    # Since the model is saved every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)
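# A minimal sketch (illustrative; the toy model and hyperparameter values
# are assumptions, not from the original source) of the one-cycle resume
# logic above: CyclicLR has no built-in notion of epochs, so on resume the
# scheduler is stepped start_epoch * steps_per_epoch times to replay the
# learning-rate trajectory up to the checkpoint.
import torch
from torch.optim.lr_scheduler import CyclicLR

model = torch.nn.Linear(4, 2)  # toy model
lr = 1e-3
opt = torch.optim.Adam(model.parameters(), lr=lr)

steps_per_epoch, num_epochs, start_epoch = 100, 10, 3
total_steps = num_epochs * steps_per_epoch
step_size_up = (num_epochs // 2) * steps_per_epoch
step_scheduler = CyclicLR(
    opt, base_lr=lr / 10, max_lr=lr, step_size_up=step_size_up,
    step_size_down=total_steps - step_size_up, cycle_momentum=False)

# Fast-forward to the start of epoch 3 of 10.
for _ in range(start_epoch * steps_per_epoch):
    step_scheduler.step()
print(step_scheduler.get_last_lr())  # LR the resumed epoch starts from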
def train(self, tmp_dir: str) -> None:
    """Train a DeepLab model using the task and backend config.

    Args:
        tmp_dir: (str) temporary directory to use

    Returns:
        None
    """
    train_py = self.backend_config.script_locations.train_py
    eval_py = self.backend_config.script_locations.eval_py
    export_py = self.backend_config.script_locations.export_py

    # Setup local input and output directories
    log.info('Setting up local input and output directories')
    train_logdir = self.backend_config.training_output_uri
    train_logdir_local = get_local_path(train_logdir, tmp_dir)
    dataset_dir = get_record_dir(self.backend_config.training_data_uri,
                                 TRAIN)
    dataset_dir_local = get_local_path(dataset_dir, tmp_dir)
    make_dir(tmp_dir)
    make_dir(train_logdir_local)
    make_dir(dataset_dir_local)

    # Download training data
    log.info('Downloading training data')
    for record_file in list_paths(dataset_dir):
        download_if_needed(record_file, tmp_dir)

    # Download and untar initial checkpoint.
    log.info('Downloading and untarring initial checkpoint')
    tf_initial_checkpoints_uri = self.backend_config.pretrained_model_uri
    download_if_needed(tf_initial_checkpoints_uri, tmp_dir)
    tfic_tarball = get_local_path(tf_initial_checkpoints_uri, tmp_dir)
    tfic_dir = os.path.dirname(tfic_tarball)
    with tarfile.open(tfic_tarball, 'r:gz') as tar:
        tar.extractall(tfic_dir)
    tfic_ckpt = glob.glob('{}/*/*.index'.format(tfic_dir))[0]
    tfic_ckpt = tfic_ckpt[0:-len('.index')]

    # Restart support
    train_restart_dir = self.backend_config.train_options.train_restart_dir
    if type(train_restart_dir) is not str or len(train_restart_dir) == 0:
        train_restart_dir = train_logdir

    # Get output from potential previous run so we can resume training.
    if (type(train_restart_dir) is str and len(train_restart_dir) > 0
            and not self.backend_config.train_options.replace_model):
        sync_from_dir(train_restart_dir, train_logdir_local)
    else:
        if self.backend_config.train_options.replace_model:
            if os.path.exists(train_logdir_local):
                shutil.rmtree(train_logdir_local)
            make_dir(train_logdir_local)

    # Periodically synchronize with remote
    sync = start_sync(
        train_logdir_local,
        train_logdir,
        sync_interval=self.backend_config.train_options.sync_interval)

    with sync:
        # Setup TFDL config
        tfdl_config = json_format.ParseDict(
            self.backend_config.tfdl_config, TrainingParametersMsg())
        log.info('tfdl_config={}'.format(tfdl_config))
        log.info('Training steps={}'.format(
            tfdl_config.training_number_of_steps))

        # Additional training options
        max_class = max(
            list(map(lambda c: c.id, self.class_map.get_items())))
        num_classes = len(self.class_map.get_items())
        num_classes = max(max_class, num_classes) + 1
        (train_args, train_env) = get_training_args(
            train_py, train_logdir_local, tfic_ckpt, dataset_dir_local,
            num_classes, tfdl_config)

        # Start training
        log.info('Starting training process')
        log.info(' '.join(train_args))
        train_process = Popen(train_args, env=train_env)
        terminate_at_exit(train_process)

        if self.backend_config.train_options.do_monitoring:
            # Start tensorboard
            log.info('Starting tensorboard process')
            tensorboard_process = Popen(
                ['tensorboard', '--logdir={}'.format(train_logdir_local)])
            terminate_at_exit(tensorboard_process)

        if self.backend_config.train_options.do_eval:
            # Start eval script
            log.info('Starting eval script')
            eval_logdir = train_logdir_local
            eval_args = get_evaluation_args(eval_py, train_logdir_local,
                                            dataset_dir_local, eval_logdir,
                                            tfdl_config)
            eval_process = Popen(eval_args, env=train_env)
            terminate_at_exit(eval_process)

        # Wait for training and tensorboard
        log.info('Waiting for training and tensorboard processes')
        train_process.wait()
        if self.backend_config.train_options.do_monitoring:
            tensorboard_process.terminate()

        # Export frozen graph
        log.info(
            'Exporting frozen graph ({}/model)'.format(train_logdir_local))
        export_args = get_export_args(export_py, train_logdir_local,
                                      num_classes, tfdl_config)
        export_process = Popen(export_args)
        terminate_at_exit(export_process)
        export_process.wait()

        # Package up the model files for usage as fine tuning checkpoints
        fine_tune_checkpoint_name = \
            self.backend_config.fine_tune_checkpoint_name
        latest_checkpoints = get_latest_checkpoint(train_logdir_local)
        model_checkpoint_files = glob.glob(
            '{}*'.format(latest_checkpoints))
        inference_graph_path = os.path.join(train_logdir_local, 'model')

        with RVConfig.get_tmp_dir() as tmp_dir:
            model_dir = os.path.join(tmp_dir, fine_tune_checkpoint_name)
            make_dir(model_dir)
            model_tar = os.path.join(
                train_logdir_local,
                '{}.tar.gz'.format(fine_tune_checkpoint_name))
            shutil.copy(inference_graph_path,
                        '{}/frozen_inference_graph.pb'.format(model_dir))
            for path in model_checkpoint_files:
                shutil.copy(path, model_dir)
            with tarfile.open(model_tar, 'w:gz') as tar:
                tar.add(model_dir, arcname=os.path.basename(model_dir))

        # Perform final sync
        sync_to_dir(train_logdir_local, train_logdir, delete=False)
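# Quick check of the num_classes computation above (illustrative values):
# with a class map whose ids are {1, 2, 5}, max_class is 5 and there are 3
# classes, so num_classes = max(5, 3) + 1 = 6, which leaves room for every
# label index from 0 through 5 even when the ids are sparse.
ids = [1, 2, 5]
assert max(max(ids), len(ids)) + 1 == 6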
def train(self, tmp_dir):
    """Train a model.

    This downloads any previous output saved to the train_uri, starts
    training (or resumes from a checkpoint), periodically syncs the
    contents of train_dir to train_uri, and syncs once more after
    training finishes.

    Args:
        tmp_dir: (str) path to temp directory
    """
    self.log_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    def get_label_path(im_path):
        return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name

    size = self.task_config.chip_size
    class_map = self.task_config.class_map
    classes = class_map.get_class_names()
    if 0 not in class_map.get_keys():
        classes = ['nodata'] + classes
    num_workers = 0 if self.train_opts.debug else 4

    data = (SegmentationItemList.from_folder(chip_dir)
            .split_by_folder(train='train-img', valid='val-img'))
    train_count = None
    if self.train_opts.train_count is not None:
        train_count = min(len(data.train), self.train_opts.train_count)
    elif self.train_opts.train_prop != 1.0:
        train_count = int(
            round(self.train_opts.train_prop * len(data.train)))
    train_items = data.train.items
    if train_count is not None:
        train_inds = np.random.permutation(
            np.arange(len(data.train)))[0:train_count]
        train_items = train_items[train_inds]
    items = np.concatenate([train_items, data.valid.items])

    data = (SegmentationItemList(items, chip_dir)
            .split_by_folder(train='train-img', valid='val-img')
            .label_from_func(get_label_path, classes=classes)
            .transform(
                get_transforms(flip_vert=self.train_opts.flip_vert),
                size=size,
                tfm_y=True)
            .databunch(bs=self.train_opts.batch_sz,
                       num_workers=num_workers))
    print(data)

    # Setup learner.
    ignore_idx = 0
    metrics = [
        Precision(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        Recall(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        FBeta(average='weighted', clas_idx=1, beta=1,
              ignore_idx=ignore_idx)
    ]
    model_arch = getattr(models, self.train_opts.model_arch)
    learn = unet_learner(
        data,
        model_arch,
        metrics=metrics,
        wd=self.train_opts.weight_decay,
        bottle=True,
        path=train_dir)
    learn.unfreeze()

    if self.train_opts.mixed_prec and torch.cuda.is_available():
        # This loss_scale works for Resnet 34 and 50. You might need to
        # adjust this for other models.
        learn = learn.to_fp16(loss_scale=256)

    # Setup callbacks and train model.
    model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        print('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        learn.model.load_state_dict(
            torch.load(pretrained_path, map_location=learn.data.device),
            strict=False)

    # Save every epoch so that resume functionality provided by
    # TrackEpochCallback will work.
    callbacks = [
        TrackEpochCallback(learn),
        MySaveModelCallback(learn, every='epoch'),
        MyCSVLogger(learn, filename='log'),
        ExportCallback(learn, model_path, monitor='f_beta'),
        SyncCallback(train_dir, self.backend_opts.train_uri,
                     self.train_opts.sync_interval)
    ]

    oversample = self.train_opts.oversample
    if oversample:
        weights = get_oversampling_weights(
            data.train_ds, oversample['rare_class_ids'],
            oversample['rare_target_prop'])
        oversample_callback = OverSamplingCallback(learn, weights=weights)
        callbacks.append(oversample_callback)

    if self.train_opts.debug:
        if oversample:
            oversample_callback.on_train_begin()
        make_debug_chips(data, class_map, tmp_dir, train_uri)

    if self.train_opts.log_tensorboard:
        callbacks.append(TensorboardLogger(learn, 'run'))

    if self.train_opts.run_tensorboard:
        log.info('Starting tensorboard process')
        log_dir = join(train_dir, 'logs', 'run')
        tensorboard_process = Popen(
            ['tensorboard', '--logdir={}'.format(log_dir)])
        terminate_at_exit(tensorboard_process)

    lr = self.train_opts.lr
    num_epochs = self.train_opts.num_epochs
    if self.train_opts.one_cycle:
        if lr is None:
            learn.lr_find()
            learn.recorder.plot(suggestion=True, return_fig=True)
            lr = learn.recorder.min_grad_lr
            print('lr_find() found lr: {}'.format(lr))
        learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)
    else:
        learn.fit(num_epochs, lr, callbacks=callbacks)

    if self.train_opts.run_tensorboard:
        tensorboard_process.terminate()

    # Since the model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)
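# Quick illustration of the get_label_path helper defined inside train()
# above: it strips the trailing '-img' from a chip's parent directory and
# looks up the mask of the same file name under the matching '-labels'
# directory.
from pathlib import Path


def get_label_path(im_path):
    return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name


assert (get_label_path(Path('chips/train-img/0.png')) ==
        Path('chips/train-labels/0.png'))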
def run_command(cmd, shell=False):
    process = subprocess.Popen(cmd, shell=shell)
    terminate_at_exit(process)
    exitcode = process.wait()
    if exitcode != 0:
        sys.exit(exitcode)
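# Hypothetical usage (illustrative): run_command accepts either an argv
# list or, with shell=True, a shell string; a non-zero child exit code
# makes the parent exit with the same code.
run_command(['echo', 'hello'])
run_command('echo hello | wc -c', shell=True)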