def save(self, output_uri):
    """Save this Evaluation to a file.

    Args:
        output_uri: string URI for the file to write.
    """
    json_str = json.dumps(self.to_json(), indent=4)
    str_to_file(json_str, output_uri)

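# Aside: a minimal, self-contained sketch of what save() produces, using a
# stub in place of the real Evaluation class (the stub name and metric
# values are hypothetical). The written file is just the pretty-printed
# JSON of to_json(), so it round-trips through json.loads.
import json

class _StubEvaluation:
    def to_json(self):
        return {'overall': [{'class_name': 'building', 'f1': 0.9}]}

json_str = json.dumps(_StubEvaluation().to_json(), indent=4)
assert json.loads(json_str)['overall'][0]['f1'] == 0.9
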
def test_file_exists_local_true(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)
    str_to_file(self.lorem, path)
    self.assertTrue(file_exists(path))

def test_file_to_str_local(self):
    str_to_file(self.content_str, self.local_path)
    content_str = file_to_str(self.local_path)
    self.assertEqual(self.content_str, content_str)

    wrong_path = '/wrongpath/x.txt'
    with self.assertRaises(NotReadableError):
        file_to_str(wrong_path)

def test_check_empty(self):
    path = os.path.join(self.temp_dir.name, 'hello', 'hello.txt')
    dir = os.path.dirname(path)
    str_to_file('hello', path)

    make_dir(dir, check_empty=False)
    with self.assertRaises(Exception):
        make_dir(dir, check_empty=True)

def test_download_if_needed_local(self):
    with self.assertRaises(NotReadableError):
        download_if_needed(self.local_path, self.temp_dir.name)

    str_to_file(self.content_str, self.local_path)
    upload_or_copy(self.local_path, self.local_path)
    local_path = download_if_needed(self.local_path, self.temp_dir.name)
    self.assertEqual(local_path, self.local_path)

def test_copy_to_http(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    dst = 'http://localhost/'
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)
    str_to_file(self.lorem, path)
    self.assertRaises(NotWritableError, lambda: upload_or_copy(path, dst))
    os.remove(path)

def test_force_empty(self):
    path = os.path.join(self.temp_dir.name, 'hello', 'hello.txt')
    dir = os.path.dirname(path)
    str_to_file('hello', path)

    make_dir(dir, force_empty=False)
    self.assertTrue(os.path.isfile(path))

    make_dir(dir, force_empty=True)
    is_empty = len(os.listdir(dir)) == 0
    self.assertTrue(is_empty)

def test_invalid_json(self):
    invalid_json_str = '''
        {
            "taskType": "CHIP_CLASSIFICATION
        }
    '''
    str_to_file(invalid_json_str, self.file_path)
    with self.assertRaises(ProtobufParseException):
        load_json_config(self.file_path, TaskConfigMsg())

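# For contrast, a sketch of the well-formed version of the config above;
# the test only breaks the JSON by dropping the closing quote on the
# taskType value. Plain json is used here to check well-formedness, not
# the real load_json_config/protobuf parser.
import json

valid_json_str = '''
    {
        "taskType": "CHIP_CLASSIFICATION"
    }
'''
assert json.loads(valid_json_str)['taskType'] == 'CHIP_CLASSIFICATION'
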
def test_file_exists_s3_true(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)
    str_to_file(self.lorem, path)

    s3_path = 's3://{}/lorem.txt'.format(self.bucket_name)
    upload_or_copy(path, s3_path)
    self.assertTrue(file_exists(s3_path))

def test_last_modified(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum1.txt')
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    fs = FileSystem.get_file_system(path, 'r')
    str_to_file(self.lorem, path)
    stamp = fs.last_modified(path)
    self.assertTrue(isinstance(stamp, datetime.datetime))

def test_file_to_str_s3(self):
    wrong_path = 's3://wrongpath/x.txt'
    with self.assertRaises(NotWritableError):
        str_to_file(self.content_str, wrong_path)

    str_to_file(self.content_str, self.s3_path)
    content_str = file_to_str(self.s3_path)
    self.assertEqual(self.content_str, content_str)

    with self.assertRaises(NotReadableError):
        file_to_str(wrong_path)

def test_copy_to_local(self):
    path1 = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    path2 = os.path.join(self.temp_dir.name, 'yyy', 'ipsum.txt')
    dir1 = os.path.dirname(path1)
    dir2 = os.path.dirname(path2)
    make_dir(dir1, check_empty=False)
    make_dir(dir2, check_empty=False)
    str_to_file(self.lorem, path1)
    upload_or_copy(path1, path2)
    self.assertEqual(len(list_paths(dir2)), 1)

def test_list_paths_s3(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    s3_path = 's3://{}/xxx/lorem.txt'.format(self.bucket_name)
    s3_directory = 's3://{}/xxx/'.format(self.bucket_name)
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)
    str_to_file(self.lorem, path)
    upload_or_copy(path, s3_path)
    self.assertEqual(len(list_paths(s3_directory)), 1)

def test_remote(self):
    with patch(
            'rastervision.utils.files.download_if_needed',
            side_effect=download_if_needed) as patched_download:
        s3_path = 's3://{}/{}'.format(self.bucket_name, self.file_name)
        str_to_file(self.content_str, s3_path)
        path = get_cached_file(self.cache_dir, s3_path)
        self.assertTrue(os.path.isfile(path))

        # Check that calling it again doesn't invoke the download method
        # again.
        path = get_cached_file(self.cache_dir, s3_path)
        self.assertTrue(os.path.isfile(path))
        self.assertEqual(patched_download.call_count, 1)

def test_last_modified_s3(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum1.txt')
    s3_path = 's3://{}/lorem1.txt'.format(self.bucket_name)
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    fs = FileSystem.get_file_system(s3_path, 'r')
    str_to_file(self.lorem, path)
    upload_or_copy(path, s3_path)
    stamp = fs.last_modified(s3_path)
    self.assertTrue(isinstance(stamp, datetime.datetime))

def test_download_if_needed_s3(self):
    with self.assertRaises(NotReadableError):
        download_if_needed(self.s3_path, self.temp_dir.name)

    str_to_file(self.content_str, self.local_path)
    upload_or_copy(self.local_path, self.s3_path)
    local_path = download_if_needed(self.s3_path, self.temp_dir.name)
    content_str = file_to_str(local_path)
    self.assertEqual(self.content_str, content_str)

    wrong_path = 's3://wrongpath/x.txt'
    with self.assertRaises(NotWritableError):
        upload_or_copy(local_path, wrong_path)

def test_file_exists(self):
    fs = FileSystem.get_file_system(self.temp_dir.name, 'r')

    path1 = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    dir1 = os.path.dirname(path1)
    make_dir(dir1, check_empty=False)
    str_to_file(self.lorem, path1)

    self.assertTrue(fs.file_exists(dir1, include_dir=True))
    self.assertTrue(fs.file_exists(path1, include_dir=False))
    self.assertFalse(fs.file_exists(dir1, include_dir=False))
    self.assertFalse(
        fs.file_exists(dir1 + 'NOTPOSSIBLE', include_dir=False))

def test_file_exists(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    s3_path = 's3://{}/xxx/lorem.txt'.format(self.bucket_name)
    s3_directory = 's3://{}/xxx/'.format(self.bucket_name)
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)
    str_to_file(self.lorem, path)
    upload_or_copy(path, s3_path)

    self.assertTrue(file_exists(s3_directory, include_dir=True))
    self.assertTrue(file_exists(s3_path, include_dir=False))
    self.assertFalse(file_exists(s3_directory, include_dir=False))
    self.assertFalse(
        file_exists(s3_directory + 'NOTPOSSIBLE', include_dir=False))

def filter_geojson(labels_uri, output_uri, class_names):
    """Remove features that aren't in class_names and remove class_ids."""
    labels_str = file_to_str(labels_uri)
    labels = json.loads(labels_str)

    filtered_features = []
    for feature in labels['features']:
        feature = copy.deepcopy(feature)
        properties = feature.get('properties')
        if properties:
            class_name = (properties.get('class_name')
                          or properties.get('label'))
            if class_name in class_names:
                del properties['class_id']
                filtered_features.append(feature)

    new_labels = {'features': filtered_features}
    str_to_file(json.dumps(new_labels), output_uri)

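# A self-contained sketch of the filtering behavior above, run on an
# in-memory GeoJSON-like dict instead of URIs. The property names
# ('class_name', 'label', 'class_id') come from filter_geojson itself;
# the feature values are hypothetical.
import copy

labels = {
    'features': [
        {'properties': {'class_name': 'building', 'class_id': 1}},
        {'properties': {'class_name': 'car', 'class_id': 2}},
        {'properties': None},
    ]
}
kept = []
for feature in labels['features']:
    feature = copy.deepcopy(feature)
    props = feature.get('properties')
    if props and (props.get('class_name') or props.get('label')) in ['building']:
        del props['class_id']
        kept.append(feature)
assert kept == [{'properties': {'class_name': 'building'}}]
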
def train(self, tmp_dir):
    """Train a model.

    This downloads any previous output saved to the train_uri,
    starts training (or resumes from a checkpoint), periodically
    syncs contents of train_dir to train_uri, and syncs again after
    training finishes.

    Args:
        tmp_dir: (str) path to temp directory
    """
    self.log_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    def get_label_path(im_path):
        return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name

    size = self.task_config.chip_size
    class_map = self.task_config.class_map
    classes = class_map.get_class_names()
    if 0 not in class_map.get_keys():
        classes = ['nodata'] + classes
    num_workers = 0 if self.train_opts.debug else 4

    data = (SegmentationItemList.from_folder(chip_dir)
            .split_by_folder(train='train-img', valid='val-img'))

    train_count = None
    if self.train_opts.train_count is not None:
        train_count = min(len(data.train), self.train_opts.train_count)
    elif self.train_opts.train_prop != 1.0:
        train_count = int(
            round(self.train_opts.train_prop * len(data.train)))

    train_items = data.train.items
    if train_count is not None:
        train_inds = np.random.permutation(
            np.arange(len(data.train)))[0:train_count]
        train_items = train_items[train_inds]
    items = np.concatenate([train_items, data.valid.items])

    data = (SegmentationItemList(items, chip_dir)
            .split_by_folder(train='train-img', valid='val-img')
            .label_from_func(get_label_path, classes=classes)
            .transform(
                get_transforms(flip_vert=self.train_opts.flip_vert),
                size=size, tfm_y=True)
            .databunch(bs=self.train_opts.batch_sz,
                       num_workers=num_workers))
    print(data)

    # Setup learner.
    ignore_idx = 0
    metrics = [
        Precision(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        Recall(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        FBeta(average='weighted', clas_idx=1, beta=1,
              ignore_idx=ignore_idx)
    ]
    model_arch = getattr(models, self.train_opts.model_arch)
    learn = unet_learner(
        data, model_arch, metrics=metrics,
        wd=self.train_opts.weight_decay, bottle=True, path=train_dir)
    learn.unfreeze()

    if self.train_opts.mixed_prec and torch.cuda.is_available():
        # This loss_scale works for Resnet 34 and 50. You might need to
        # adjust this for other models.
        learn = learn.to_fp16(loss_scale=256)

    # Setup callbacks and train model.
    model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        print('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        learn.model.load_state_dict(
            torch.load(pretrained_path, map_location=learn.data.device),
            strict=False)

    # Save every epoch so that resume functionality provided by
    # TrackEpochCallback will work.
    callbacks = [
        TrackEpochCallback(learn),
        MySaveModelCallback(learn, every='epoch'),
        MyCSVLogger(learn, filename='log'),
        ExportCallback(learn, model_path, monitor='f_beta'),
        SyncCallback(train_dir, self.backend_opts.train_uri,
                     self.train_opts.sync_interval)
    ]

    oversample = self.train_opts.oversample
    if oversample:
        weights = get_oversampling_weights(
            data.train_ds, oversample['rare_class_ids'],
            oversample['rare_target_prop'])
        oversample_callback = OverSamplingCallback(learn, weights=weights)
        callbacks.append(oversample_callback)

    if self.train_opts.debug:
        if oversample:
            oversample_callback.on_train_begin()
        make_debug_chips(data, class_map, tmp_dir, train_uri)

    if self.train_opts.log_tensorboard:
        callbacks.append(TensorboardLogger(learn, 'run'))

    if self.train_opts.run_tensorboard:
        log.info('Starting tensorboard process')
        log_dir = join(train_dir, 'logs', 'run')
        tensorboard_process = Popen(
            ['tensorboard', '--logdir={}'.format(log_dir)])
        terminate_at_exit(tensorboard_process)

    lr = self.train_opts.lr
    num_epochs = self.train_opts.num_epochs
    if self.train_opts.one_cycle:
        if lr is None:
            learn.lr_find()
            learn.recorder.plot(suggestion=True, return_fig=True)
            lr = learn.recorder.min_grad_lr
            print('lr_find() found lr: {}'.format(lr))
        learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)
    else:
        learn.fit(num_epochs, lr, callbacks=callbacks)

    if self.train_opts.run_tensorboard:
        tensorboard_process.terminate()

    # Since the model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)

def train(self, tmp_dir):
    """Train a model."""
    self.print_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    train_images = []
    train_lbl_bbox = []
    for annotation_path in glob.glob(join(chip_dir, 'train/*.json')):
        images, lbl_bbox = get_annotations(annotation_path)
        train_images += images
        train_lbl_bbox += lbl_bbox

    val_images = []
    val_lbl_bbox = []
    for annotation_path in glob.glob(join(chip_dir, 'valid/*.json')):
        images, lbl_bbox = get_annotations(annotation_path)
        val_images += images
        val_lbl_bbox += lbl_bbox

    images = train_images + val_images
    lbl_bbox = train_lbl_bbox + val_lbl_bbox
    img2bbox = dict(zip(images, lbl_bbox))
    get_y_func = lambda o: img2bbox[o.name]
    num_workers = 0 if self.train_opts.debug else 4

    data = ObjectItemList.from_folder(chip_dir)
    data = data.split_by_folder()
    data = data.label_from_func(get_y_func)
    data = data.transform(
        get_transforms(), size=self.task_config.chip_size, tfm_y=True)
    data = data.databunch(
        bs=self.train_opts.batch_sz, collate_fn=bb_pad_collate,
        num_workers=num_workers)
    print(data)

    if self.train_opts.debug:
        make_debug_chips(
            data, self.task_config.class_map, tmp_dir, train_uri)

    # Setup callbacks and train model.
    ratios = [1 / 2, 1, 2]
    scales = [1, 2 ** (-1 / 3), 2 ** (-2 / 3)]
    model_arch = getattr(models, self.train_opts.model_arch)
    encoder = create_body(model_arch, cut=-2)
    model = RetinaNet(encoder, data.c, final_bias=-4)
    crit = RetinaNetFocalLoss(scales=scales, ratios=ratios)
    learn = Learner(data, model, loss_func=crit, path=train_dir)
    learn = learn.split(retina_net_split)

    model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        print('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        learn.load(pretrained_path[:-4])

    callbacks = [
        TrackEpochCallback(learn),
        SaveModelCallback(learn, every='epoch'),
        MyCSVLogger(learn, filename='log'),
        ExportCallback(learn, model_path),
        SyncCallback(train_dir, self.backend_opts.train_uri,
                     self.train_opts.sync_interval)
    ]
    learn.unfreeze()
    learn.fit(self.train_opts.num_epochs, self.train_opts.lr,
              callbacks=callbacks)

    # Since model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)

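# Sanity-check sketch of the anchor configuration above: 3 aspect ratios
# x 3 scales is the standard RetinaNet setup of 9 anchor shapes per
# feature-map cell, with scales spaced by a constant factor of 2^(-1/3).
ratios = [1 / 2, 1, 2]
scales = [1, 2 ** (-1 / 3), 2 ** (-2 / 3)]
assert len(ratios) * len(scales) == 9
assert abs(scales[1] / scales[0] - scales[2] / scales[1]) < 1e-9
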
def save(self, labels):
    """Save.

    Args:
        labels: (SemanticSegmentationLabels) labels to be saved
    """
    local_path = get_local_path(self.uri, self.tmp_dir)
    make_dir(local_path, use_dirname=True)

    transform = self.crs_transformer.get_affine_transform()
    crs = self.crs_transformer.get_image_crs()

    band_count = 1
    dtype = np.uint8
    if self.class_trans:
        band_count = 3

    if self.vector_output:
        # We need to store the whole output mask to run feature
        # extraction. If the raster is large, this will result in running
        # out of memory, so more work will be needed to get this to work
        # in a scalable way. But this is complicated because of the need
        # to merge features that are split across windows.
        mask = np.zeros(
            (self.extent.ymax, self.extent.xmax), dtype=np.uint8)
    else:
        mask = None

    # https://github.com/mapbox/rasterio/blob/master/docs/quickstart.rst
    # https://rasterio.readthedocs.io/en/latest/topics/windowed-rw.html
    with rasterio.open(
            local_path, 'w', driver='GTiff', height=self.extent.ymax,
            width=self.extent.xmax, count=band_count, dtype=dtype,
            transform=transform, crs=crs) as dataset:
        for window in labels.get_windows():
            class_labels = labels.get_label_arr(
                window, clip_extent=self.extent)
            clipped_window = (
                (window.ymin, window.ymin + class_labels.shape[0]),
                (window.xmin, window.xmin + class_labels.shape[1]))
            if mask is not None:
                mask[clipped_window[0][0]:clipped_window[0][1],
                     clipped_window[1][0]:clipped_window[1][1]] = \
                    class_labels
            if self.class_trans:
                rgb_labels = self.class_trans.class_to_rgb(class_labels)
                for chan in range(3):
                    dataset.write_band(
                        chan + 1, rgb_labels[:, :, chan],
                        window=clipped_window)
            else:
                img = class_labels.astype(dtype)
                dataset.write_band(1, img, window=clipped_window)

    upload_or_copy(local_path, self.uri)

    if self.vector_output:
        import mask_to_polygons.vectorification as vectorification
        import mask_to_polygons.processing.denoise as denoise

        for vo in self.vector_output:
            denoise_radius = vo['denoise']
            uri = vo['uri']
            mode = vo['mode']
            class_id = vo['class_id']
            class_mask = np.array(mask == class_id, dtype=np.uint8)

            def transform(x, y):
                return self.crs_transformer.pixel_to_map((x, y))

            if denoise_radius > 0:
                class_mask = denoise.denoise(class_mask, denoise_radius)

            if uri and mode == 'buildings':
                options = vo['building_options']
                geojson = vectorification.geojson_from_mask(
                    mask=class_mask, transform=transform, mode=mode,
                    min_aspect_ratio=options['min_aspect_ratio'],
                    min_area=options['min_area'],
                    width_factor=options['element_width_factor'],
                    thickness=options['element_thickness'])
            elif uri and mode == 'polygons':
                geojson = vectorification.geojson_from_mask(
                    mask=class_mask, transform=transform, mode=mode)

            str_to_file(geojson, uri)

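# A sketch of the vector_output entries this save() expects, inferred
# from the keys read above; the URIs and numeric values are hypothetical.
# 'denoise' is a radius (0 disables denoising), and 'building_options'
# is only consulted in 'buildings' mode.
vector_output = [
    {
        'mode': 'polygons',
        'class_id': 1,
        'denoise': 0,
        'uri': 's3://my-bucket/predict/polygons.json',
    },
    {
        'mode': 'buildings',
        'class_id': 2,
        'denoise': 3,
        'uri': 's3://my-bucket/predict/buildings.json',
        'building_options': {
            'min_aspect_ratio': 1.618,
            'min_area': 0.0,
            'element_width_factor': 0.5,
            'element_thickness': 0.001,
        },
    },
]
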
def train(self, tmp_dir):
    """Train a model."""
    self.print_options()

    # Sync output of previous training run from cloud. This will either
    # be local or S3, which allows restarting the job if it has been
    # shut down.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    self.chip_dir = join(tmp_dir, 'chips')
    make_dir(self.chip_dir)

    train_chip_dir = self.chip_dir + '/train-img'
    train_truth_dir = self.chip_dir + '/train-labels'
    fitness_func = partial(fitness, train_chip_dir, train_truth_dir,
                           self._toolbox.compile)
    self._toolbox.register("evaluate", fitness_func)

    # This is the key part -- this is how it knows where to get the
    # chips from. backend_opts comes from RV, and train_opts is where
    # you can define backend-specific stuff.
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(self.chip_dir)

    # Setup data loader.
    def get_label_path(im_path):
        return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name

    class_map = self.task_config.class_map
    classes = class_map.get_class_names()
    if 0 not in class_map.get_keys():
        classes = ['nodata'] + classes

    # Evolve. Set up hall of fame to track the best individual.
    hof = tools.HallOfFame(1)

    # Set up debugging stats.
    mstats = None
    if self.train_opts.debug:
        stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
        stats_size = tools.Statistics(len)
        mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
        mstats.register("avg", np.mean)
        mstats.register("std", np.std)
        mstats.register("min", np.min)
        mstats.register("max", np.max)

    pop = self._toolbox.population(n=self.train_opts.pop_size)
    pop, log = algorithms.eaMuPlusLambda(
        pop, self._toolbox, self.train_opts.num_individuals,
        self.train_opts.num_offspring, self.train_opts.crossover_rate,
        self.train_opts.mutation_rate, self.train_opts.num_generations,
        stats=mstats, halloffame=hof, verbose=self.train_opts.debug)

    # RV uses file-presence based caching to figure out whether a stage
    # has completed (kind of like Makefiles). Since this backend's model
    # is just a string, we write it out as a text file, and also write it
    # to the done-file to signal that training is finished.
    if self.train_opts.debug:
        print(str(hof[0]))
    str_to_file(str(hof[0]), self.backend_opts.train_done_uri)
    str_to_file(str(hof[0]), self.backend_opts.model_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)

def write_config_file(self, config):
    file_contents = json.dumps(config)
    str_to_file(file_contents, self.file_path)

def train(self, tmp_dir):
    """Train a model."""
    self.print_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    '''
    Get zip file for each group, and unzip them into chip_dir in a
    way that works well with FastAI. The resulting directory structure
    would be:
    <chip_dir>/
        train/
            training-<uuid1>/
                <class1>/
                    ...
                <class2>/
                    ...
            training-<uuid2>/
                <class1>/
                    ...
                <class2>/
                    ...
            ...
        val/
            validation-<uuid1>/
                <class1>/
                    ...
                <class2>/
                    ...
            validation-<uuid2>/
                <class1>/
                    ...
                <class2>/
                    ...
            ...
    '''
    chip_dir = join(tmp_dir, 'chips/')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_name = Path(zip_uri).name
        if zip_name.startswith('train'):
            extract_dir = chip_dir + 'train/'
        elif zip_name.startswith('val'):
            extract_dir = chip_dir + 'val/'
        else:
            continue
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(extract_dir)

    # Setup data loader.
    def get_label_path(im_path):
        return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name

    size = self.task_config.chip_size
    class_map = self.task_config.class_map
    classes = class_map.get_class_names()
    num_workers = 0 if self.train_opts.debug else 4
    tfms = get_transforms(flip_vert=self.train_opts.flip_vert)

    def get_data(train_sampler=None):
        data = (ImageList.from_folder(chip_dir)
                .split_by_folder(train='train', valid='val')
                .label_from_folder()
                .transform(tfms, size=size)
                .databunch(bs=self.train_opts.batch_sz,
                           num_workers=num_workers))
        return data

    data = get_data()

    if self.train_opts.debug:
        make_debug_chips(data, class_map, tmp_dir, train_uri)

    # Setup learner.
    ignore_idx = -1
    metrics = [
        Precision(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        Recall(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        FBeta(average='weighted', clas_idx=1, beta=1,
              ignore_idx=ignore_idx)
    ]
    model_arch = getattr(models, self.train_opts.model_arch)
    learn = cnn_learner(
        data, model_arch, metrics=metrics,
        wd=self.train_opts.weight_decay, path=train_dir)
    learn.unfreeze()

    if self.train_opts.fp16 and torch.cuda.is_available():
        # This loss_scale works for Resnet 34 and 50. You might need to
        # adjust this for other models.
        learn = learn.to_fp16(loss_scale=256)

    # Setup callbacks and train model.
    model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        print('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        learn.model.load_state_dict(
            torch.load(pretrained_path, map_location=learn.data.device),
            strict=False)

    # Save every epoch so that resume functionality provided by
    # TrackEpochCallback will work.
    callbacks = [
        TrackEpochCallback(learn),
        MySaveModelCallback(learn, every='epoch'),
        MyCSVLogger(learn, filename='log'),
        ExportCallback(learn, model_path, monitor='f_beta'),
        SyncCallback(train_dir, self.backend_opts.train_uri,
                     self.train_opts.sync_interval)
    ]

    lr = self.train_opts.lr
    num_epochs = self.train_opts.num_epochs
    if self.train_opts.one_cycle:
        if lr is None:
            learn.lr_find()
            learn.recorder.plot(suggestion=True, return_fig=True)
            lr = learn.recorder.min_grad_lr
            print('lr_find() found lr: {}'.format(lr))
        learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)
    else:
        learn.fit(num_epochs, lr, callbacks=callbacks)

    # Since model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)

def test_local(self):
    local_path = os.path.join(self.temp_dir.name, self.file_name)
    str_to_file(self.content_str, local_path)
    path = get_cached_file(self.cache_dir, local_path)
    self.assertTrue(os.path.isfile(path))

def save(self, stats_uri):
    # Ensure lists (rather than numpy arrays) so they can be
    # JSON-serialized.
    means = list(self.means)
    stds = list(self.stds)
    stats = {'means': means, 'stds': stds}
    str_to_file(json.dumps(stats), stats_uri)

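# The stats file written above is a two-key JSON document, one mean and
# one std per band; e.g. for 3-band imagery (numbers are hypothetical):
import json

stats_json = json.dumps({
    'means': [71.2, 82.5, 74.3],
    'stds': [30.5, 27.8, 26.1],
})
assert sorted(json.loads(stats_json)) == ['means', 'stds']
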
def test_write_str_http(self):
    self.assertRaises(NotWritableError,
                      lambda: str_to_file('xxx', 'http://localhost/'))

def train(self, tmp_dir):
    """Train a model.

    This downloads any previous output saved to the train_uri,
    starts training (or resumes from a checkpoint), periodically
    syncs contents of train_dir to train_uri, and syncs again after
    training finishes.

    Args:
        tmp_dir: (str) path to temp directory
    """
    self.log_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    batch_size = self.train_opts.batch_size
    chip_size = self.task_config.chip_size
    class_names = self.class_map.get_class_names()
    databunch = build_databunch(chip_dir, chip_size, batch_size,
                                class_names)
    log.info(databunch)
    num_labels = len(databunch.label_names)
    if self.train_opts.debug:
        make_debug_chips(databunch, self.class_map, tmp_dir, train_uri)

    # Setup model.
    model = get_model(self.train_opts.model_arch, num_labels,
                      pretrained=True)
    model = model.to(self.device)
    model_path = join(train_dir, 'model')

    # Load weights from a pretrained model.
    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        log.info('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        model.load_state_dict(
            torch.load(pretrained_path, map_location=self.device))

    # Possibly resume training from checkpoint.
    start_epoch = 0
    train_state_path = join(train_dir, 'train_state.json')
    if isfile(train_state_path):
        log.info('Resuming from checkpoint: {}\n'.format(model_path))
        train_state = file_to_json(train_state_path)
        start_epoch = train_state['epoch'] + 1
        model.load_state_dict(
            torch.load(model_path, map_location=self.device))

    # Write header of log CSV file.
    metric_names = ['precision', 'recall', 'f1']
    log_path = join(train_dir, 'log.csv')
    if not isfile(log_path):
        with open(log_path, 'w') as log_file:
            log_writer = csv.writer(log_file)
            row = ['epoch', 'time', 'train_loss'] + metric_names
            log_writer.writerow(row)

    # Setup Tensorboard logging.
    if self.train_opts.log_tensorboard:
        log_dir = join(train_dir, 'tb-logs')
        make_dir(log_dir)
        tb_writer = SummaryWriter(log_dir=log_dir)
        if self.train_opts.run_tensorboard:
            log.info('Starting tensorboard process')
            tensorboard_process = Popen(
                ['tensorboard', '--logdir={}'.format(log_dir)])
            terminate_at_exit(tensorboard_process)

    # Setup optimizer, loss, and LR scheduler.
    loss_fn = torch.nn.CrossEntropyLoss()
    lr = self.train_opts.lr
    opt = optim.Adam(model.parameters(), lr=lr)
    step_scheduler, epoch_scheduler = None, None
    num_epochs = self.train_opts.num_epochs

    if self.train_opts.one_cycle and num_epochs > 1:
        steps_per_epoch = len(databunch.train_ds) // batch_size
        total_steps = num_epochs * steps_per_epoch
        step_size_up = (num_epochs // 2) * steps_per_epoch
        step_size_down = total_steps - step_size_up
        step_scheduler = CyclicLR(
            opt, base_lr=lr / 10, max_lr=lr, step_size_up=step_size_up,
            step_size_down=step_size_down, cycle_momentum=False)
        # Fast-forward the scheduler when resuming mid-training.
        for _ in range(start_epoch * steps_per_epoch):
            step_scheduler.step()

    # Training loop.
    for epoch in range(start_epoch, num_epochs):
        # Train one epoch.
        log.info('-----------------------------------------------------')
        log.info('epoch: {}'.format(epoch))
        start = time.time()
        train_loss = train_epoch(model, self.device, databunch.train_dl,
                                 opt, loss_fn, step_scheduler)
        if epoch_scheduler:
            epoch_scheduler.step()
        log.info('train loss: {}'.format(train_loss))

        # Validate one epoch.
        metrics = validate_epoch(model, self.device, databunch.valid_dl,
                                 num_labels)
        log.info('validation metrics: {}'.format(metrics))

        # Print elapsed time for epoch.
        end = time.time()
        epoch_time = datetime.timedelta(seconds=end - start)
        log.info('epoch elapsed time: {}'.format(epoch_time))

        # Save model and state.
        torch.save(model.state_dict(), model_path)
        train_state = {'epoch': epoch}
        json_to_file(train_state, train_state_path)

        # Append to log CSV file.
        with open(log_path, 'a') as log_file:
            log_writer = csv.writer(log_file)
            row = [epoch, epoch_time, train_loss]
            row += [metrics[k] for k in metric_names]
            log_writer.writerow(row)

        # Write to Tensorboard log.
        if self.train_opts.log_tensorboard:
            for key, val in metrics.items():
                tb_writer.add_scalar(key, val, epoch)
            tb_writer.add_scalar('train_loss', train_loss, epoch)
            for name, param in model.named_parameters():
                tb_writer.add_histogram(name, param, epoch)

        if (train_uri.startswith('s3://')
                and ((epoch + 1) % self.train_opts.sync_interval == 0)):
            sync_to_dir(train_dir, train_uri)

    # Close Tensorboard.
    if self.train_opts.log_tensorboard:
        tb_writer.close()
        if self.train_opts.run_tensorboard:
            tensorboard_process.terminate()

    # Since the model is saved every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)

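# A self-contained sketch of the one-cycle step accounting used above:
# the scheduler steps once per batch, ramping the LR up for (roughly)
# the first half of training and back down for the rest. The numbers
# are hypothetical; the split mirrors step_size_up/step_size_down above.
num_epochs, steps_per_epoch = 10, 100
total_steps = num_epochs * steps_per_epoch
step_size_up = (num_epochs // 2) * steps_per_epoch
step_size_down = total_steps - step_size_up
assert step_size_up == 500 and step_size_down == 500
assert step_size_up + step_size_down == total_steps
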
def train(self, tmp_dir):
    """Train a model."""
    self.print_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    def get_label_path(im_path):
        return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name

    size = self.task_config.chip_size
    class_map = self.task_config.class_map
    classes = class_map.get_class_names()
    if 0 not in class_map.get_keys():
        classes = ['nodata'] + classes
    num_workers = 0 if self.train_opts.debug else 4

    train_img_dir = self.subset_training_data(chip_dir)

    def get_data(train_sampler=None):
        data = (SegmentationItemList.from_folder(chip_dir)
                .split_by_folder(train=train_img_dir, valid='val-img')
                .label_from_func(get_label_path, classes=classes)
                .transform(
                    get_transforms(flip_vert=self.train_opts.flip_vert),
                    size=size, tfm_y=True)
                .databunch(bs=self.train_opts.batch_sz,
                           num_workers=num_workers,
                           train_sampler=train_sampler))
        return data

    data = get_data()

    oversample = self.train_opts.oversample
    if oversample:
        sampler = get_weighted_sampler(
            data.train_ds, oversample['rare_class_ids'],
            oversample['rare_target_prop'])
        data = get_data(train_sampler=sampler)

    if self.train_opts.debug:
        make_debug_chips(data, class_map, tmp_dir, train_uri)

    # Setup learner.
    ignore_idx = 0
    metrics = [
        Precision(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        Recall(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        FBeta(average='weighted', clas_idx=1, beta=1,
              ignore_idx=ignore_idx)
    ]
    model_arch = getattr(models, self.train_opts.model_arch)
    learn = unet_learner(
        data, model_arch, metrics=metrics,
        wd=self.train_opts.weight_decay, bottle=True, path=train_dir)
    learn.unfreeze()

    if self.train_opts.fp16 and torch.cuda.is_available():
        # This loss_scale works for Resnet 34 and 50. You might need to
        # adjust this for other models.
        learn = learn.to_fp16(loss_scale=256)

    # Setup callbacks and train model.
    model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        print('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        learn.model.load_state_dict(
            torch.load(pretrained_path, map_location=learn.data.device),
            strict=False)

    # Save every epoch so that resume functionality provided by
    # TrackEpochCallback will work.
    callbacks = [
        TrackEpochCallback(learn),
        MySaveModelCallback(learn, every='epoch'),
        MyCSVLogger(learn, filename='log'),
        ExportCallback(learn, model_path, monitor='f_beta'),
        SyncCallback(train_dir, self.backend_opts.train_uri,
                     self.train_opts.sync_interval)
    ]

    lr = self.train_opts.lr
    num_epochs = self.train_opts.num_epochs
    if self.train_opts.one_cycle:
        if lr is None:
            learn.lr_find()
            learn.recorder.plot(suggestion=True, return_fig=True)
            lr = learn.recorder.min_grad_lr
            print('lr_find() found lr: {}'.format(lr))
        learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)
    else:
        learn.fit(num_epochs, lr, callbacks=callbacks)

    # Since model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)