def cached_download(uris, data_dir):
    """Return the local path for every URI, downloading the missing ones.

    Args:
        uris: iterable of URIs to resolve.
        data_dir: directory passed to get_local_path / download_if_needed.

    Returns:
        list of local paths (as computed by get_local_path), in the same
        order as uris.
    """
    local_paths = []
    for remote in uris:
        local = get_local_path(remote, data_dir)
        local_paths.append(local)
        if isfile(local):
            # Already present locally: nothing to fetch.
            continue
        download_if_needed(remote, data_dir)
    return local_paths
def test_download_if_needed_local(self):
    """A local URI is unreadable before it exists and returned as-is after."""
    src = self.local_path
    download_root = self.temp_dir.name

    # The file has not been written yet, so reading it must fail.
    with self.assertRaises(NotReadableError):
        download_if_needed(src, download_root)

    str_to_file(self.content_str, src)
    upload_or_copy(src, src)

    resolved = download_if_needed(src, download_root)
    self.assertEqual(resolved, self.local_path)
def test_copy_from_http(self):
    """Downloading an https URI places the file under <tmp>/http/<host>/..."""
    http_path = ('https://raw.githubusercontent.com/tensorflow/models/'
                 '17fa52864bfc7a7444a8b921d8a8eb1669e14ebd/README.md')
    download_root = self.temp_dir.name
    path_parts = ('http', 'raw.githubusercontent.com', 'tensorflow/models',
                  '17fa52864bfc7a7444a8b921d8a8eb1669e14ebd/README.md')
    expected = os.path.join(download_root, *path_parts)

    download_if_needed(http_path, download_root)

    self.assertTrue(file_exists(expected))
    # Clean up the downloaded file.
    os.remove(expected)
def test_download_if_needed_s3(self):
    """Round-trip a file through S3 and check an invalid bucket is rejected."""
    download_root = self.temp_dir.name
    remote = self.s3_path

    # Nothing has been uploaded yet, so the download must fail.
    with self.assertRaises(NotReadableError):
        download_if_needed(remote, download_root)

    str_to_file(self.content_str, self.local_path)
    upload_or_copy(self.local_path, remote)
    local_path = download_if_needed(remote, download_root)
    downloaded = file_to_str(local_path)
    self.assertEqual(self.content_str, downloaded)

    wrong_path = 's3://wrongpath/x.txt'
    with self.assertRaises(NotWritableError):
        upload_or_copy(local_path, wrong_path)
def load_model(self, tmp_dir):
    """Load and memoize the inference learner; no-op if already loaded.

    Args:
        tmp_dir: directory passed to download_if_needed for the model file.
    """
    if self.inf_learner is not None:
        return
    local_model = download_if_needed(self.config['model_uri'], tmp_dir)
    # load_learner takes the containing directory and the file name.
    self.inf_learner = load_learner(
        dirname(local_model), basename(local_model))
def load_model(self, tmp_dir):
    """Build and memoize the Keras model; no-op if already loaded.

    Args:
        tmp_dir: directory passed to download_if_needed for the model file.
    """
    from rastervision.backend.keras_classification.builders \
        import model_builder

    if self.model is not None:
        return
    local_model = download_if_needed(self.config.model_uri, tmp_dir)
    self.model = model_builder.build_from_path(local_model)
    self.model._make_predict_function()
def load_model(self, tmp_dir):
    """Load the frozen detection graph and open a TF session on it.

    Both are memoized on the instance; nothing happens when
    self.detection_graph is already set.

    Args:
        tmp_dir: directory passed to download_if_needed for the model file.
    """
    import tensorflow as tf

    if self.detection_graph is not None:
        return
    frozen_graph_path = download_if_needed(self.config.model_uri, tmp_dir)
    self.detection_graph = load_frozen_graph(frozen_graph_path)
    self.session = tf.Session(graph=self.detection_graph)
def load_model(self, tmp_dir):
    """Load and memoize the inference learner; no-op if already loaded.

    Args:
        tmp_dir: directory passed to download_if_needed for the model file.
    """
    if self.inf_learner is not None:
        return
    self.log_options()
    local_model = download_if_needed(self.backend_opts.model_uri, tmp_dir)
    # load_learner takes the containing directory and the file name.
    model_dir, model_file = dirname(local_model), basename(local_model)
    self.inf_learner = load_learner(model_dir, model_file)
def from_model_bundle(model_bundle_uri, tmp_dir):
    """Instantiate a learner from a zipped model bundle.

    The bundle is downloaded, unzipped into <tmp_dir>/model-bundle, and its
    config.json is turned into a config via build_config.

    Args:
        model_bundle_uri: URI of the bundle archive.
        tmp_dir: working directory for the download and extraction.

    Returns:
        The object produced by calling cfg.get_learner() with the config,
        tmp_dir and the path of the bundle's model.pth.
    """
    bundle_zip = download_if_needed(model_bundle_uri, tmp_dir)
    bundle_dir = join(tmp_dir, 'model-bundle')
    unzip(bundle_zip, bundle_dir)

    cfg = build_config(file_to_json(join(bundle_dir, 'config.json')))
    weights_path = join(bundle_dir, 'model.pth')
    learner_factory = cfg.get_learner()
    return learner_factory(cfg, tmp_dir, model_path=weights_path)
def load_model(self, tmp_dir):
    """Load and memoize the inference learner and pick the torch device.

    Nothing happens when self.inf_learner is already set.

    Args:
        tmp_dir: directory passed to download_if_needed for the model file.
    """
    if self.inf_learner is not None:
        return
    self.print_options()
    local_model = download_if_needed(self.backend_opts.model_uri, tmp_dir)
    self.inf_learner = load_learner(
        dirname(local_model), basename(local_model))
    # Use the first GPU when CUDA is available, otherwise the CPU.
    if torch.cuda.is_available():
        device_name = "cuda:0"
    else:
        device_name = "cpu"
    self.device = torch.device(device_name)
def _download_data(self, temp_dir):
    """Fetch the data behind this Raster Source into temp_dir.

    Returns:
        A single local path: the downloaded file when there is exactly one
        URI, otherwise whatever download_and_build_vrt returns for all URIs.
    """
    uris = self.uris
    if len(uris) != 1:
        return download_and_build_vrt(uris, temp_dir)
    return download_if_needed(uris[0], temp_dir)
def download_if_needed(self, uri):
    """Resolve uri to a local file using this object's temp directory.

    Args:
        uri: (string) URI of file, possibly remote

    Returns:
        (string) local path returned by the module-level download_if_needed
    """
    local_path = download_if_needed(uri, self.temp_dir)
    return local_path
def load_model(self, tmp_dir):
    """Read the raster function expression from the model file and compile it.

    The compiled function is memoized in self.raster_func; nothing happens
    when it is already set.

    Args:
        tmp_dir: directory passed to download_if_needed for the model file.
    """
    if self.raster_func is not None:
        return
    self.print_options()
    local_model = download_if_needed(self.backend_opts.model_uri, tmp_dir)
    with open(local_model, 'r') as func_file:
        func_str = func_file.read()
    print('FUNC', func_str)
    self.raster_func = self._toolbox.compile(expr=func_str)
def load_model(self, tmp_dir, ctx=None):
    """Build the model and load its trained weights, memoizing the result.

    Nothing happens when self.model is already set. The model is only
    stored on the instance after its parameters have been loaded, so a
    failed download or load leaves self.model as None and a later call can
    retry instead of silently reusing an uninitialised model.

    Args:
        tmp_dir: temporary directory used for the downloaded weights
        ctx: passed through to build_model (defaults to None)

    Returns:
        None
    """
    if self.model is not None:
        return

    model = build_model(ctx, len(self.class_map))
    model_files = ModelFiles(self.config.training_output_uri, tmp_dir)
    model_path = download_if_needed(model_files.model_uri, tmp_dir)
    model.load_parameters(model_path)
    # Publish only a fully initialised model.
    self.model = model
def load_model(self, tmp_dir):
    """Build the model, load its saved state dict, and memoize it.

    Nothing happens when self.model is already set.

    Args:
        tmp_dir: directory passed to download_if_needed for the weights file.
    """
    if self.model is not None:
        return

    weights_path = download_if_needed(self.backend_opts.model_uri, tmp_dir)
    # NOTE(review): pretrained=True is requested even though the state dict
    # is overwritten just below -- confirm whether that is intentional.
    net = get_model(
        self.train_opts.model_arch, len(self.class_map), pretrained=True)
    net = net.to(self.device)
    state = torch.load(weights_path, map_location=self.device)
    net.load_state_dict(state)
    self.model = net
def load_model(self, tmp_dir):
    """Build the Faster R-CNN model, load its state dict, and memoize it.

    Nothing happens when self.model is already set.

    Args:
        tmp_dir: directory passed to download_if_needed for the weights file.
    """
    if self.model is not None:
        return

    weights_path = download_if_needed(self.backend_opts.model_uri, tmp_dir)

    # One extra class is reserved for the background.
    num_classes = 1 + len(self.task_config.class_map)
    net = MyFasterRCNN(
        self.train_opts.model_arch,
        num_classes,
        self.task_config.chip_size,
        pretrained=False)
    net = net.to(self.device)
    state = torch.load(weights_path, map_location=self.device)
    net.load_state_dict(state)
    self.model = net
def load_model(self, tmp_dir: str):
    """Import the frozen graph and open a TF session on it (memoized).

    Nothing happens when self.sess is already set.

    Args:
        tmp_dir: (str) temporary directory to use
    """
    # noqa Courtesy of https://github.com/tensorflow/models/blob/cbbb2ffcde66e646d4a47628ffe2ece2322b64e8/research/deeplab/deeplab_demo.ipynb
    import tensorflow as tf

    if self.sess is not None:
        return

    frozen_graph_path = download_if_needed(self.backend_config.model_uri,
                                           tmp_dir)
    graph = tf.Graph()
    with open(frozen_graph_path, 'rb') as graph_file:
        serialized = graph_file.read()
        graph_def = tf.GraphDef.FromString(serialized)
    with graph.as_default():
        tf.import_graph_def(graph_def, name='')
    self.sess = tf.Session(graph=graph)
def _postprocess(pred_uri, experiment_id, root_uri):
    """Rewrite a prediction raster with value 2 mapped to 0 and upload it.

    The prediction is downloaded under /opt/data/predict/, the modified
    raster is written to the matching path under /postprocess/, uploaded to
    <root_uri>/postprocess/<experiment_id>/<basename of pred_uri>, and both
    local files are then removed.

    Args:
        pred_uri: URI of the prediction raster.
        experiment_id: path component used in the output URI.
        root_uri: root of the output URI.
    """
    local_pred = download_if_needed(pred_uri, "/opt/data/predict/")
    local_post = local_pred.replace("/predict/", "/postprocess/")
    os.makedirs(dirname(local_post), exist_ok=True)
    out_uri = join(root_uri, "postprocess", experiment_id, basename(pred_uri))

    with rasterio.open(local_pred) as src:
        pixels = src.read()
        pixels = np.where(pixels == 2, 0, pixels)
        profile = src.profile

    with rasterio.open(local_post, "w", **profile) as dst:
        dst.write(pixels)

    upload_or_copy(local_post, out_uri)

    # Remove the local working copies.
    os.remove(local_pred)
    os.remove(local_post)
def load_model(self, tmp_dir):
    """Build the tilenet, load its weights, and switch it to eval mode.

    Nothing happens when self.model is already set.

    Args:
        tmp_dir: directory passed to download_if_needed for the weights file.
    """
    import torch
    from tile2vec.tilenet import make_tilenet

    if self.model is not None:
        return

    weights_path = download_if_needed(self.backend_config.model_uri, tmp_dir)

    # TODO: config
    first_scene = self.backend_config.scenes[0]
    in_channels = len(first_scene.raster_source.channel_order)
    z_dim = 512

    self.model = make_tilenet(in_channels=in_channels, z_dim=z_dim)
    if self.cuda:
        self.model.cuda()
    self.model.load_state_dict(torch.load(weights_path))
    self.model.eval()
def _load_from_files(self, plugin_paths): if not plugin_paths: return self.plugin_sources = [] plugin_base = PluginBase(package='rastervision.plugins') for uri in plugin_paths: plugin_name = os.path.splitext(os.path.basename(uri))[0] plugin_path = os.path.join(self.plugin_root_dir, plugin_name) fs = rv._registry.get_file_system(uri, search_plugins=False) local_path = download_if_needed(uri, plugin_path, fs=fs) local_dir = os.path.dirname(local_path) plugin_source = plugin_base.make_plugin_source( searchpath=[local_dir]) # We're required to hang onto the source # to keep it from getting GC'd. self.plugin_sources.append(plugin_source) self._load_plugin(plugin_source.load_plugin(plugin_name), uri)
def create_local(self, tmp_dir):
    """Return a copy of this config whose URI points at a local download.

    Args:
        tmp_dir: directory passed to download_if_needed.
    """
    local_uri = download_if_needed(self.uri, tmp_dir)
    builder = self.to_builder()
    builder = builder.with_uri(local_uri)
    return builder.build()
def train(self, tmp_dir):
    """Train a model.

    This downloads any previous output saved to the train_uri,
    starts training (or resumes from a checkpoint), periodically
    syncs contents of train_dir to train_uri and after training finishes.

    Per epoch, the model weights and a train_state.json holding the epoch
    number are saved in train_dir, a row is appended to log.csv, and
    (optionally) scalars and parameter histograms are written to
    Tensorboard.

    Args:
        tmp_dir: (str) path to temp directory
    """
    self.log_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    batch_size = self.train_opts.batch_size
    chip_size = self.task_config.chip_size
    class_names = self.class_map.get_class_names()
    databunch = build_databunch(chip_dir, chip_size, batch_size,
                                class_names)
    log.info(databunch)
    if self.train_opts.debug:
        make_debug_chips(databunch, self.class_map, tmp_dir, train_uri)

    # Setup model
    num_labels = len(databunch.label_names)
    model = get_model(
        self.train_opts.model_arch, num_labels, pretrained=True)
    model = model.to(self.device)
    model_path = join(train_dir, 'model')

    # Load weights from a pretrained model.
    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        log.info('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        model.load_state_dict(
            torch.load(pretrained_path, map_location=self.device))

    # Possibly resume training from checkpoint.
    start_epoch = 0
    train_state_path = join(train_dir, 'train_state.json')
    if isfile(train_state_path):
        log.info('Resuming from checkpoint: {}\n'.format(model_path))
        train_state = file_to_json(train_state_path)
        start_epoch = train_state['epoch'] + 1
        model.load_state_dict(
            torch.load(model_path, map_location=self.device))

    # Write header of log CSV file.
    # newline='' lets the csv module control line endings itself (required
    # by the csv docs; avoids blank rows on platforms that translate '\n').
    metric_names = ['precision', 'recall', 'f1']
    log_path = join(train_dir, 'log.csv')
    if not isfile(log_path):
        with open(log_path, 'w', newline='') as log_file:
            log_writer = csv.writer(log_file)
            row = ['epoch', 'time', 'train_loss'] + metric_names
            log_writer.writerow(row)

    # Setup Tensorboard logging.
    if self.train_opts.log_tensorboard:
        log_dir = join(train_dir, 'tb-logs')
        make_dir(log_dir)
        tb_writer = SummaryWriter(log_dir=log_dir)
        if self.train_opts.run_tensorboard:
            log.info('Starting tensorboard process')
            tensorboard_process = Popen(
                ['tensorboard', '--logdir={}'.format(log_dir)])
            terminate_at_exit(tensorboard_process)

    # Setup optimizer, loss, and LR scheduler.
    loss_fn = torch.nn.CrossEntropyLoss()
    lr = self.train_opts.lr
    opt = optim.Adam(model.parameters(), lr=lr)
    step_scheduler, epoch_scheduler = None, None
    num_epochs = self.train_opts.num_epochs

    if self.train_opts.one_cycle and num_epochs > 1:
        steps_per_epoch = len(databunch.train_ds) // batch_size
        total_steps = num_epochs * steps_per_epoch
        step_size_up = (num_epochs // 2) * steps_per_epoch
        step_size_down = total_steps - step_size_up
        step_scheduler = CyclicLR(
            opt,
            base_lr=lr / 10,
            max_lr=lr,
            step_size_up=step_size_up,
            step_size_down=step_size_down,
            cycle_momentum=False)
        # When resuming, advance the scheduler past the steps already run.
        for _ in range(start_epoch * steps_per_epoch):
            step_scheduler.step()

    # Training loop.
    for epoch in range(start_epoch, num_epochs):
        # Train one epoch.
        log.info('-----------------------------------------------------')
        log.info('epoch: {}'.format(epoch))
        start = time.time()
        train_loss = train_epoch(model, self.device, databunch.train_dl,
                                 opt, loss_fn, step_scheduler)
        if epoch_scheduler:
            epoch_scheduler.step()
        log.info('train loss: {}'.format(train_loss))

        # Validate one epoch.
        metrics = validate_epoch(model, self.device, databunch.valid_dl,
                                 num_labels)
        log.info('validation metrics: {}'.format(metrics))

        # Print elapsed time for epoch.
        end = time.time()
        epoch_time = datetime.timedelta(seconds=end - start)
        log.info('epoch elapsed time: {}'.format(epoch_time))

        # Save model and state.
        torch.save(model.state_dict(), model_path)
        train_state = {'epoch': epoch}
        json_to_file(train_state, train_state_path)

        # Append to log CSV file.
        with open(log_path, 'a', newline='') as log_file:
            log_writer = csv.writer(log_file)
            row = [epoch, epoch_time, train_loss]
            row += [metrics[k] for k in metric_names]
            log_writer.writerow(row)

        # Write to Tensorboard log.
        if self.train_opts.log_tensorboard:
            for key, val in metrics.items():
                tb_writer.add_scalar(key, val, epoch)
            tb_writer.add_scalar('train_loss', train_loss, epoch)
            for name, param in model.named_parameters():
                tb_writer.add_histogram(name, param, epoch)

        # Periodic sync only happens for S3 training URIs.
        if (train_uri.startswith('s3://')
                and (((epoch + 1) % self.train_opts.sync_interval) == 0)):
            sync_to_dir(train_dir, train_uri)

    # Close Tensorboard.
    if self.train_opts.log_tensorboard:
        tb_writer.close()
        if self.train_opts.run_tensorboard:
            tensorboard_process.terminate()

    # Since model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)
def build_data(self):
    """Download the zipped image folders and build datasets and loaders.

    Each zip is cached in self.data_cache_dir and extracted to
    <self.tmp_dir>/data/<index>. The 'train' and 'valid' sub-directories
    found there are wrapped in ImageFolder datasets and concatenated; the
    test set is built from the same 'valid' directories. The resulting
    datasets and DataLoaders are stored on self as train_ds/valid_ds/test_ds
    and train_dl/valid_dl/test_dl.
    """
    cfg = self.cfg
    batch_sz = cfg.solver.batch_sz
    num_workers = cfg.data.num_workers
    label_names = cfg.data.labels

    # download and unzip data
    if cfg.data.data_format == 'image_folder':
        # S3 and absolute paths are used as-is; others are relative to
        # cfg.base_uri.
        if cfg.data.uri.startswith(('s3://', '/')):
            data_uri = cfg.data.uri
        else:
            data_uri = join(cfg.base_uri, cfg.data.uri)

        data_dirs = []
        if data_uri.endswith('.zip'):
            zip_uris = [data_uri]
        else:
            zip_uris = list_paths(data_uri, 'zip')

        for zip_ind, zip_uri in enumerate(zip_uris):
            zip_path = get_local_path(zip_uri, self.data_cache_dir)
            if not isfile(zip_path):
                zip_path = download_if_needed(zip_uri, self.data_cache_dir)
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                data_dir = join(self.tmp_dir, 'data', str(zip_ind))
                data_dirs.append(data_dir)
                zipf.extractall(data_dir)

    train_ds, valid_ds, test_ds = [], [], []
    for data_dir in data_dirs:
        train_dir = join(data_dir, 'train')
        valid_dir = join(data_dir, 'valid')

        # build datasets
        img_shape = (cfg.data.img_sz, cfg.data.img_sz)
        transform = Compose([Resize(img_shape), ToTensor()])
        aug_transform = Compose([
            RandomHorizontalFlip(),
            RandomVerticalFlip(),
            ColorJitter(0.1, 0.1, 0.1, 0.1),
            Resize(img_shape),
            ToTensor()
        ])

        if isdir(train_dir):
            # Augmentation is skipped when overfitting.
            train_transform = (transform
                               if cfg.overfit_mode else aug_transform)
            train_ds.append(
                ImageFolder(
                    train_dir,
                    transform=train_transform,
                    classes=label_names))

        if isdir(valid_dir):
            for split in (valid_ds, test_ds):
                split.append(
                    ImageFolder(
                        valid_dir, transform=transform,
                        classes=label_names))

    train_ds = ConcatDataset(train_ds)
    valid_ds = ConcatDataset(valid_ds)
    test_ds = ConcatDataset(test_ds)

    if cfg.overfit_mode:
        # Train, validate and test on the same single batch.
        train_ds = Subset(train_ds, range(batch_sz))
        valid_ds = train_ds
        test_ds = train_ds
    elif cfg.test_mode:
        train_ds = Subset(train_ds, range(batch_sz))
        valid_ds = Subset(valid_ds, range(batch_sz))
        test_ds = Subset(test_ds, range(batch_sz))

    def _make_loader(dataset):
        return DataLoader(
            dataset,
            shuffle=True,
            batch_size=batch_sz,
            num_workers=num_workers,
            pin_memory=True)

    train_dl = _make_loader(train_ds)
    valid_dl = _make_loader(valid_ds)
    test_dl = _make_loader(test_ds)

    self.train_ds, self.valid_ds, self.test_ds = train_ds, valid_ds, test_ds
    self.train_dl, self.valid_dl, self.test_dl = train_dl, valid_dl, test_dl
def create_local(self, tmp_dir):
    """Return a copy of this config whose URIs point at local downloads.

    Args:
        tmp_dir: directory passed to download_if_needed for every URI.
    """
    local_uris = []
    for uri in self.uris:
        local_uris.append(download_if_needed(uri, tmp_dir))
    builder = self.to_builder()
    builder = builder.with_uris(local_uris)
    return builder.build()
def _zxy2geotiff(tile_schema, zoom, bounds, output_uri, make_cog=False): """Generates a GeoTIFF of a bounded region from a ZXY tile server. Args: tile_schema: (str) the URI schema for zxy tiles (ie. a slippy map tile server) of the form /tileserver-uri/{z}/{x}/{y}.png. If {-y} is used, the tiles are assumed to be indexed using TMS coordinates, where the y axis starts at the southernmost point. The URI can be for http, S3, or the local file system. zoom: (int) the zoom level to use when retrieving tiles bounds: (list) a list of length 4 containing min_lat, min_lng, max_lat, max_lng output_uri: (str) where to save the GeoTIFF. The URI can be for http, S3, or the local file system """ min_lat, min_lng, max_lat, max_lng = bounds if min_lat >= max_lat: raise ValueError('min_lat must be < max_lat') if min_lng >= max_lng: raise ValueError('min_lng must be < max_lng') is_tms = False if '{-y}' in tile_schema: tile_schema = tile_schema.replace('{-y}', '{y}') is_tms = True tmp_dir_obj = tempfile.TemporaryDirectory() tmp_dir = tmp_dir_obj.name # Get range of tiles that cover bounds. output_path = get_local_path(output_uri, tmp_dir) tile_sz = 256 t = mercantile.tile(min_lng, max_lat, zoom) xmin, ymin = t.x, t.y t = mercantile.tile(max_lng, min_lat, zoom) xmax, ymax = t.x, t.y # The supplied bounds are contained within the "tile bounds" -- ie. the # bounds of the set of tiles that covers the supplied bounds. Therefore, # we need to crop out the imagery that lies within the supplied bounds. # We do this by computing a top, bottom, left, and right offset in pixel # units of the supplied bounds against the tile bounds. Getting the offsets # in pixel units involves converting lng/lat to web mercator units since we # assume that is the CRS of the tiles. These offsets are then used to crop # individual tiles and place them correctly into the output raster. 
nw_merc_x, nw_merc_y = lnglat2merc(min_lng, max_lat) left_pix_offset, top_pix_offset = merc2pixel(xmin, ymin, zoom, nw_merc_x, nw_merc_y) se_merc_x, se_merc_y = lnglat2merc(max_lng, min_lat) se_left_pix_offset, se_top_pix_offset = merc2pixel(xmax, ymax, zoom, se_merc_x, se_merc_y) right_pix_offset = tile_sz - se_left_pix_offset bottom_pix_offset = tile_sz - se_top_pix_offset uncropped_height = tile_sz * (ymax - ymin + 1) uncropped_width = tile_sz * (xmax - xmin + 1) height = uncropped_height - top_pix_offset - bottom_pix_offset width = uncropped_width - left_pix_offset - right_pix_offset transform = rasterio.transform.from_bounds(nw_merc_x, se_merc_y, se_merc_x, nw_merc_y, width, height) with rasterio.open(output_path, 'w', driver='GTiff', height=height, width=width, count=3, crs='epsg:3857', transform=transform, dtype=rasterio.uint8) as dataset: out_x = 0 for xi, x in enumerate(range(xmin, xmax + 1)): tile_xmin, tile_xmax = 0, tile_sz - 1 if x == xmin: tile_xmin += left_pix_offset if x == xmax: tile_xmax -= right_pix_offset window_width = tile_xmax - tile_xmin + 1 out_y = 0 for yi, y in enumerate(range(ymin, ymax + 1)): tile_ymin, tile_ymax = 0, tile_sz - 1 if y == ymin: tile_ymin += top_pix_offset if y == ymax: tile_ymax -= bottom_pix_offset window_height = tile_ymax - tile_ymin + 1 # Convert from xyz to tms if needed. # https://gist.github.com/tmcw/4954720 if is_tms: y = (2**zoom) - y - 1 tile_uri = tile_schema.format(x=x, y=y, z=zoom) tile_path = download_if_needed(tile_uri, tmp_dir) img = np.array(Image.open(tile_path)) img = img[tile_ymin:tile_ymax + 1, tile_xmin:tile_xmax + 1, :] window = Window(out_x, out_y, window_width, window_height) dataset.write(np.transpose(img[:, :, 0:3], (2, 0, 1)), window=window) out_y += window_height out_x += window_width if make_cog: create_cog(output_path, output_uri, tmp_dir) else: upload_or_copy(output_path, output_uri)
def train(self, tmp_dir):
    """Train a model.

    Syncs any previous output from train_uri, unzips the training and
    validation chips, builds a fastai cnn_learner, optionally loads
    pretrained weights, fits the model, writes the "done" marker and syncs
    train_dir back to train_uri.

    Args:
        tmp_dir: temporary working directory for downloads, chips and
            training output.
    """
    self.print_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir in a
    # way that works well with FastAI.
    #
    # The resulting directory structure would be:
    # <chip_dir>/
    #     train/
    #         training-<uuid1>/
    #             <class1>/
    #                 ...
    #             <class2>/
    #                 ...
    #             ...
    #         training-<uuid2>/
    #             <class1>/
    #                 ...
    #             <class2>/
    #                 ...
    #             ...
    #         ...
    #     val/
    #         validation-<uuid1>/
    #             <class1>/
    #                 ...
    #             <class2>/
    #                 ...
    #             ...
    #         validation-<uuid2>/
    #             <class1>/
    #                 ...
    #             <class2>/
    #                 ...
    #             ...
    #         ...
    chip_dir = join(tmp_dir, 'chips/')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_name = Path(zip_uri).name
        if zip_name.startswith('train'):
            extract_dir = chip_dir + 'train/'
        elif zip_name.startswith('val'):
            extract_dir = chip_dir + 'val/'
        else:
            # Zips that are neither training nor validation are ignored.
            continue
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(extract_dir)

    # Setup data loader.
    size = self.task_config.chip_size
    class_map = self.task_config.class_map
    # No worker processes in debug mode.
    num_workers = 0 if self.train_opts.debug else 4
    tfms = get_transforms(flip_vert=self.train_opts.flip_vert)

    data = (ImageList.from_folder(chip_dir)
            .split_by_folder(train='train', valid='val')
            .label_from_folder()
            .transform(tfms, size=size)
            .databunch(
                bs=self.train_opts.batch_sz,
                num_workers=num_workers,
            ))

    if self.train_opts.debug:
        make_debug_chips(data, class_map, tmp_dir, train_uri)

    # Setup learner.
    ignore_idx = -1
    metrics = [
        Precision(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        Recall(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        FBeta(
            average='weighted', clas_idx=1, beta=1, ignore_idx=ignore_idx)
    ]
    model_arch = getattr(models, self.train_opts.model_arch)
    learn = cnn_learner(
        data,
        model_arch,
        metrics=metrics,
        wd=self.train_opts.weight_decay,
        path=train_dir)

    learn.unfreeze()

    if self.train_opts.fp16 and torch.cuda.is_available():
        # This loss_scale works for Resnet 34 and 50. You might need to
        # adjust this for other models.
        learn = learn.to_fp16(loss_scale=256)

    # Setup callbacks and train model.
    model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        print('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        # strict=False: keys that do not match the model are tolerated.
        learn.model.load_state_dict(
            torch.load(pretrained_path, map_location=learn.data.device),
            strict=False)

    # Save every epoch so that resume functionality provided by
    # TrackEpochCallback will work.
    callbacks = [
        TrackEpochCallback(learn),
        MySaveModelCallback(learn, every='epoch'),
        MyCSVLogger(learn, filename='log'),
        ExportCallback(learn, model_path, monitor='f_beta'),
        SyncCallback(train_dir, self.backend_opts.train_uri,
                     self.train_opts.sync_interval)
    ]

    lr = self.train_opts.lr
    num_epochs = self.train_opts.num_epochs
    if self.train_opts.one_cycle:
        if lr is None:
            # No learning rate configured: take the one suggested by the
            # learning-rate finder.
            learn.lr_find()
            learn.recorder.plot(suggestion=True, return_fig=True)
            lr = learn.recorder.min_grad_lr
            print('lr_find() found lr: {}'.format(lr))
        learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)
    else:
        learn.fit(num_epochs, lr, callbacks=callbacks)

    # Since model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)
def download_if_needed(self, uri):
    """Resolve uri to a local file using this object's tmp_dir.

    Args:
        uri: URI of the file to fetch.

    Returns:
        Whatever the module-level download_if_needed returns for
        (uri, self.tmp_dir).
    """
    local_path = download_if_needed(uri, self.tmp_dir)
    return local_path
def train(self, tmp_dir: str) -> None:
    """Train a DeepLab model using the task and backend config.

    The method downloads the training records and the initial checkpoint,
    optionally restores output from a previous run, launches the training
    script (plus optional tensorboard and eval processes) as subprocesses,
    exports the frozen graph, packages the latest checkpoint files into a
    tarball, and syncs the local training directory to the output URI.

    Args:
        tmp_dir: (str) temporary directory to use

    Returns:
        None
    """
    # Paths of the external scripts that are run as subprocesses below.
    train_py = self.backend_config.script_locations.train_py
    eval_py = self.backend_config.script_locations.eval_py
    export_py = self.backend_config.script_locations.export_py

    # Setup local input and output directories
    log.info('Setting up local input and output directories')
    train_logdir = self.backend_config.training_output_uri
    train_logdir_local = get_local_path(train_logdir, tmp_dir)
    dataset_dir = get_record_dir(self.backend_config.training_data_uri,
                                 TRAIN)
    dataset_dir_local = get_local_path(dataset_dir, tmp_dir)
    make_dir(tmp_dir)
    make_dir(train_logdir_local)
    make_dir(dataset_dir_local)

    # Download training data
    log.info('Downloading training data')
    for i, record_file in enumerate(list_paths(dataset_dir)):
        download_if_needed(record_file, tmp_dir)

    # Download and untar initial checkpoint.
    log.info('Downloading and untarring initial checkpoint')
    tf_initial_checkpoints_uri = self.backend_config.pretrained_model_uri
    download_if_needed(tf_initial_checkpoints_uri, tmp_dir)
    tfic_tarball = get_local_path(tf_initial_checkpoints_uri, tmp_dir)
    tfic_dir = os.path.dirname(tfic_tarball)
    with tarfile.open(tfic_tarball, 'r:gz') as tar:
        tar.extractall(tfic_dir)
    # Use the first '.index' file found one directory below the extraction
    # directory and strip the suffix to get the checkpoint prefix.
    # NOTE(review): raises IndexError if the tarball contains no such file.
    tfic_ckpt = glob.glob('{}/*/*.index'.format(tfic_dir))[0]
    tfic_ckpt = tfic_ckpt[0:-len('.index')]

    # Restart support
    # Fall back to the training output URI when no restart dir is configured.
    train_restart_dir = self.backend_config.train_options.train_restart_dir
    if type(train_restart_dir) is not str or len(train_restart_dir) == 0:
        train_restart_dir = train_logdir

    # Get output from potential previous run so we can resume training.
    if type(train_restart_dir) is str and len(
            train_restart_dir
    ) > 0 and not self.backend_config.train_options.replace_model:
        sync_from_dir(train_restart_dir, train_logdir_local)
    else:
        # replace_model: start from an empty local training directory.
        if self.backend_config.train_options.replace_model:
            if os.path.exists(train_logdir_local):
                shutil.rmtree(train_logdir_local)
            make_dir(train_logdir_local)

    # Periodically synchronize with remote
    sync = start_sync(
        train_logdir_local,
        train_logdir,
        sync_interval=self.backend_config.train_options.sync_interval)

    with sync:
        # Setup TFDL config
        tfdl_config = json_format.ParseDict(
            self.backend_config.tfdl_config, TrainingParametersMsg())
        log.info('tfdl_config={}'.format(tfdl_config))
        log.info('Training steps={}'.format(
            tfdl_config.training_number_of_steps))

        # Additional training options
        # num_classes is one more than the larger of the highest class id
        # and the number of class items.
        max_class = max(
            list(map(lambda c: c.id, self.class_map.get_items())))
        num_classes = len(self.class_map.get_items())
        num_classes = max(max_class, num_classes) + 1
        (train_args, train_env) = get_training_args(
            train_py, train_logdir_local, tfic_ckpt, dataset_dir_local,
            num_classes, tfdl_config)

        # Start training
        log.info('Starting training process')
        log.info(' '.join(train_args))
        train_process = Popen(train_args, env=train_env)
        terminate_at_exit(train_process)

        if self.backend_config.train_options.do_monitoring:
            # Start tensorboard
            log.info('Starting tensorboard process')
            tensorboard_process = Popen(
                ['tensorboard', '--logdir={}'.format(train_logdir_local)])
            terminate_at_exit(tensorboard_process)

        if self.backend_config.train_options.do_eval:
            # Start eval script
            log.info('Starting eval script')
            eval_logdir = train_logdir_local
            eval_args = get_evaluation_args(eval_py, train_logdir_local,
                                            dataset_dir_local, eval_logdir,
                                            tfdl_config)
            eval_process = Popen(eval_args, env=train_env)
            terminate_at_exit(eval_process)

        # Wait for training and tensorboard
        log.info('Waiting for training and tensorboard processes')
        train_process.wait()
        if self.backend_config.train_options.do_monitoring:
            tensorboard_process.terminate()

        # Export frozen graph
        log.info(
            'Exporting frozen graph ({}/model)'.format(train_logdir_local))
        export_args = get_export_args(export_py, train_logdir_local,
                                      num_classes, tfdl_config)
        export_process = Popen(export_args)
        terminate_at_exit(export_process)
        export_process.wait()

        # Package up the model files for usage as fine tuning checkpoints
        fine_tune_checkpoint_name = self.backend_config.fine_tune_checkpoint_name
        latest_checkpoints = get_latest_checkpoint(train_logdir_local)
        model_checkpoint_files = glob.glob(
            '{}*'.format(latest_checkpoints))
        inference_graph_path = os.path.join(train_logdir_local, 'model')

        # Note: this rebinds tmp_dir (the method argument) to a fresh
        # temporary directory for the rest of the block.
        with RVConfig.get_tmp_dir() as tmp_dir:
            model_dir = os.path.join(tmp_dir, fine_tune_checkpoint_name)
            make_dir(model_dir)
            model_tar = os.path.join(
                train_logdir_local,
                '{}.tar.gz'.format(fine_tune_checkpoint_name))
            shutil.copy(inference_graph_path,
                        '{}/frozen_inference_graph.pb'.format(model_dir))
            for path in model_checkpoint_files:
                shutil.copy(path, model_dir)
            with tarfile.open(model_tar, 'w:gz') as tar:
                tar.add(model_dir, arcname=os.path.basename(model_dir))

    # Perform final sync
    sync_to_dir(train_logdir_local, train_logdir, delete=False)
def load_init_weights(self):
    """Load initial weights into self.model when cfg.model.init_weights is set.

    Does nothing when cfg.model.init_weights is empty or None.
    """
    init_weights_uri = self.cfg.model.init_weights
    if not init_weights_uri:
        return
    weights_path = download_if_needed(init_weights_uri, self.tmp_dir)
    state = torch.load(weights_path, map_location=self.device)
    self.model.load_state_dict(state)
def train(self, tmp_dir):
    """Train a model.

    Syncs any previous output from train_uri, unzips the chips, builds a
    fastai databunch labelled from the JSON annotation files, trains a
    RetinaNet learner, writes the "done" marker and syncs train_dir back to
    train_uri.

    Args:
        tmp_dir: temporary working directory for downloads, chips and
            training output.
    """
    self.print_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    def _collect_annotations(pattern):
        # Gather image names and their boxes from every matching JSON file.
        names, boxes = [], []
        for annotation_path in glob.glob(join(chip_dir, pattern)):
            file_names, file_boxes = get_annotations(annotation_path)
            names += file_names
            boxes += file_boxes
        return names, boxes

    train_images, train_lbl_bbox = _collect_annotations('train/*.json')
    val_images, val_lbl_bbox = _collect_annotations('valid/*.json')

    # Map each image name to its labelled boxes (training first, then
    # validation).
    img2bbox = dict(
        zip(train_images + val_images, train_lbl_bbox + val_lbl_bbox))

    def get_y_func(o):
        return img2bbox[o.name]

    # No worker processes in debug mode.
    num_workers = 0 if self.train_opts.debug else 4
    data = (ObjectItemList.from_folder(chip_dir)
            .split_by_folder()
            .label_from_func(get_y_func)
            .transform(
                get_transforms(),
                size=self.task_config.chip_size,
                tfm_y=True)
            .databunch(
                bs=self.train_opts.batch_sz,
                collate_fn=bb_pad_collate,
                num_workers=num_workers))
    print(data)

    if self.train_opts.debug:
        make_debug_chips(data, self.task_config.class_map, tmp_dir,
                         train_uri)

    # Setup callbacks and train model.
    ratios = [1/2, 1, 2]
    scales = [1, 2**(-1/3), 2**(-2/3)]
    model_arch = getattr(models, self.train_opts.model_arch)
    encoder = create_body(model_arch, cut=-2)
    model = RetinaNet(encoder, data.c, final_bias=-4)
    crit = RetinaNetFocalLoss(scales=scales, ratios=ratios)
    learn = Learner(data, model, loss_func=crit, path=train_dir)
    learn = learn.split(retina_net_split)

    model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        print('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        # The last four characters of the path are dropped before loading
        # (presumably a '.pth' extension -- verify against learn.load).
        learn.load(pretrained_path[:-4])

    callbacks = [
        TrackEpochCallback(learn),
        SaveModelCallback(learn, every='epoch'),
        MyCSVLogger(learn, filename='log'),
        ExportCallback(learn, model_path),
        SyncCallback(train_dir, self.backend_opts.train_uri,
                     self.train_opts.sync_interval)
    ]
    learn.unfreeze()
    learn.fit(
        self.train_opts.num_epochs, self.train_opts.lr, callbacks=callbacks)

    # Since model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)