def collect_eval_dir(root_uri):
    """Print the last overall F1 score of every eval.json under root_uri/eval.

    For each matching file this prints the name of the directory that
    contains it, then the 'f1' value of the last entry in the file's
    'overall' list, then a blank line.

    Args:
        root_uri: URI of the directory that holds the 'eval' directory
    """
    for uri in list_paths(join(root_uri, 'eval'), ext='eval.json'):
        contents = file_to_json(uri)
        # The parent directory name identifies which evaluation this is.
        experiment_name = basename(dirname(uri))
        print(experiment_name)
        last_overall = contents['overall'][-1]
        print(last_overall['f1'])
        print()
def collect_experiment(key, root_uri, output_dir, get_pred_package=False):
    """Copy the outputs of a single experiment into output_dir/key.

    Looks for exactly one predict_package.zip under root_uri/key/bundle and
    exactly one eval.json under root_uri/key/eval. If either is missing, or
    more than one of either is found, a message is printed and nothing is
    copied. Otherwise eval.json (and optionally the predict package) is
    fetched into output_dir/key and the 'overall' section of the eval is
    pretty-printed.

    Args:
        key: name of the experiment directory under root_uri
        root_uri: root location of experiment output; URIs starting with
            's3://' are listed with list_paths, anything else with glob
        output_dir: local directory that receives the collected files
        get_pred_package: if True, also fetch predict_package.zip
    """
    print('\nCollecting experiment {}...\n'.format(key))

    experiment_root = join(root_uri, key)
    bundle_root = join(experiment_root, 'bundle')
    eval_root = join(experiment_root, 'eval')
    if root_uri.startswith('s3://'):
        bundles = list_paths(bundle_root, ext='predict_package.zip')
        evals = list_paths(eval_root, ext='eval.json')
    else:
        # Local layout has one sub-directory level below bundle/ and eval/.
        bundles = glob.glob(join(bundle_root, '*', 'predict_package.zip'))
        evals = glob.glob(join(eval_root, '*', 'eval.json'))

    # The "multiple" check intentionally runs before the "missing" check.
    if max(len(bundles), len(evals)) > 1:
        print('Cannot collect from key with multiple experiments!!!')
        return
    if min(len(bundles), len(evals)) == 0:
        print('Missing output!!!')
        return

    dest_dir = join(output_dir, key)
    make_dir(dest_dir)
    if get_pred_package:
        download_or_copy(bundles[0], dest_dir)
    download_or_copy(evals[0], dest_dir)

    overall = file_to_json(join(dest_dir, 'eval.json'))['overall']
    pprint.pprint(overall, indent=4)
def __init__(self, img_dir, annotation_uris, transforms=None):
    """Index images, boxes and labels from a set of annotation JSON files.

    Each annotation file must contain an 'images' list (entries with
    'file_name' and 'id') and an 'annotations' list (entries with
    'image_id', 'bbox' and 'category_id'). After construction, id2boxes
    maps an image id to a float tensor with one row per box and id2labels
    maps an image id to a tensor of category ids. Image ids that have no
    annotations get no entry in either mapping.

    Args:
        img_dir: directory containing the images (stored as-is)
        annotation_uris: iterable of URIs of annotation JSON files
        transforms: optional transforms object (stored as-is)
    """
    self.img_dir = img_dir
    self.annotation_uris = annotation_uris
    self.transforms = transforms

    self.imgs = []
    self.img2id = {}
    self.id2img = {}
    # Accumulate per-image lists first; they are packed into tensors below.
    self.id2boxes = defaultdict(list)
    self.id2labels = defaultdict(list)
    self.label2name = {}

    for uri in annotation_uris:
        contents = file_to_json(uri)

        for image in contents['images']:
            file_name = image['file_name']
            self.imgs.append(file_name)
            self.img2id[file_name] = image['id']
            self.id2img[image['id']] = file_name

        for annotation in contents['annotations']:
            image_id = annotation['image_id']
            bbox = annotation['bbox']
            category = annotation['category_id']
            # Reorder [b0, b1, b2, b3] into [b1, b0, b1 + b3, b0 + b2]
            # (presumably COCO x/y/width/height -> ymin/xmin/ymax/xmax).
            corners = [
                bbox[1], bbox[0], bbox[1] + bbox[3], bbox[0] + bbox[2]
            ]
            self.id2boxes[image_id].append(torch.tensor([corners]))
            self.id2labels[image_id].append(category)

    self.id2boxes = {
        image_id: torch.cat(box_rows).float()
        for image_id, box_rows in self.id2boxes.items()
    }
    self.id2labels = {
        image_id: torch.tensor(categories)
        for image_id, categories in self.id2labels.items()
    }
def get_label_names(coco_path):
    """Return class names ordered by category id, preceded by 'background'.

    Reads the 'categories' list of the JSON file at coco_path and returns
    ['background', name of id 1, name of id 2, ...]. Category ids are
    looked up for 1..N where N is the number of distinct ids, so a gap in
    the ids raises KeyError.

    Args:
        coco_path: path or URI of the JSON file to read

    Returns:
        list of label names, with 'background' at index 0
    """
    id_to_name = {
        category['id']: category['name']
        for category in file_to_json(coco_path)['categories']
    }
    names = ['background']
    names.extend(id_to_name[i] for i in range(1, len(id_to_name) + 1))
    return names
def from_model_bundle(model_bundle_uri, tmp_dir):
    """Build a learner from a zipped model bundle.

    The bundle is downloaded if needed, unzipped into tmp_dir/model-bundle,
    and its config.json is turned into a config object. The learner class
    obtained from that config is then instantiated with the bundled
    model.pth as its model_path.

    Args:
        model_bundle_uri: URI of the model bundle zip file
        tmp_dir: local directory used for the download and extraction

    Returns:
        the object returned by calling cfg.get_learner() with the config,
        tmp_dir and the path to the bundled model
    """
    bundle_zip = download_if_needed(model_bundle_uri, tmp_dir)
    bundle_dir = join(tmp_dir, 'model-bundle')
    unzip(bundle_zip, bundle_dir)

    cfg = build_config(file_to_json(join(bundle_dir, 'config.json')))
    learner_cls = cfg.get_learner()
    return learner_cls(
        cfg, tmp_dir, model_path=join(bundle_dir, 'model.pth'))
def _run_command(cfg_json_uri, command, split_ind, num_splits):
    """Run one command of the pipeline described by a JSON config file.

    A fresh temporary directory is created under /opt/data/tmp for the
    duration of the command and removed afterwards, including when the
    command raises.

    Args:
        cfg_json_uri: URI of the JSON file holding the pipeline config
        command: name of the pipeline method to call
        split_ind: index of the split to run. If None, it is read from the
            AWS_BATCH_JOB_ARRAY_INDEX environment variable (default 0).
        num_splits: total number of splits. The command is called with
            split_ind/num_splits only when this is greater than 1; None is
            treated as "not split".
    """
    tmp_root_dir = '/opt/data/tmp'
    make_dir(tmp_root_dir)

    # Use a context manager so the directory is cleaned up deterministically
    # rather than whenever the TemporaryDirectory object is finalized.
    with tempfile.TemporaryDirectory(dir=tmp_root_dir) as tmp_dir:
        pipeline_cfg_dict = file_to_json(cfg_json_uri)
        cfg = build_config(pipeline_cfg_dict)
        pipeline = cfg.get_pipeline()(cfg, tmp_dir)

        # TODO generalize this to work outside batch
        if split_ind is None:
            split_ind = int(os.environ.get('AWS_BATCH_JOB_ARRAY_INDEX', 0))

        command_fn = getattr(pipeline, command)

        # Guard against None so an unsplit run does not fail on `None > 1`.
        if num_splits is not None and num_splits > 1:
            print('Running {} command split {}/{}...'.format(
                command, split_ind + 1, num_splits))
            command_fn(split_ind=split_ind, num_splits=num_splits)
        else:
            print('Running {} command...'.format(command))
            command_fn()
def train(self, tmp_dir):
    """Train a model.

    This downloads any previous output saved to the train_uri, starts
    training (or resumes from a checkpoint), syncs the contents of
    train_dir to train_uri every sync_interval epochs (only when train_uri
    is an s3:// URI), and syncs once more after training finishes.

    Outputs written to train_dir: the model state dict ('model'),
    'train_state.json' holding the last completed epoch, 'log.csv' with
    per-epoch loss and metrics, and optionally Tensorboard logs
    ('tb-logs'). A marker is written to train_done_uri when training ends.

    Args:
        tmp_dir: (str) path to temp directory
    """
    self.log_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    batch_size = self.train_opts.batch_size
    chip_size = self.task_config.chip_size
    class_names = self.class_map.get_class_names()
    databunch = build_databunch(chip_dir, chip_size, batch_size,
                                class_names)
    log.info(databunch)
    num_labels = len(databunch.label_names)
    if self.train_opts.debug:
        make_debug_chips(databunch, self.class_map, tmp_dir, train_uri)

    # Setup model
    # NOTE(review): num_labels was already computed above; this second
    # assignment is redundant.
    num_labels = len(databunch.label_names)
    model = get_model(self.train_opts.model_arch, num_labels,
                      pretrained=True)
    model = model.to(self.device)
    model_path = join(train_dir, 'model')

    # Load weights from a pretrained model.
    # NOTE(review): torch.load may unpickle arbitrary objects -- confirm
    # that pretrained_uri always points at a trusted file.
    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        log.info('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        model.load_state_dict(
            torch.load(pretrained_path, map_location=self.device))

    # Possibly resume training from checkpoint.
    # This runs after the pretrained weights are loaded, so checkpoint
    # weights take precedence. The stored epoch is the last one that
    # finished, hence training restarts at the following epoch.
    start_epoch = 0
    train_state_path = join(train_dir, 'train_state.json')
    if isfile(train_state_path):
        log.info('Resuming from checkpoint: {}\n'.format(model_path))
        train_state = file_to_json(train_state_path)
        start_epoch = train_state['epoch'] + 1
        model.load_state_dict(
            torch.load(model_path, map_location=self.device))

    # Write header of log CSV file.
    # Skipped when the file already exists so a resumed run keeps
    # appending to the earlier log.
    metric_names = ['precision', 'recall', 'f1']
    log_path = join(train_dir, 'log.csv')
    if not isfile(log_path):
        with open(log_path, 'w') as log_file:
            log_writer = csv.writer(log_file)
            row = ['epoch', 'time', 'train_loss'] + metric_names
            log_writer.writerow(row)

    # Setup Tensorboard logging.
    if self.train_opts.log_tensorboard:
        log_dir = join(train_dir, 'tb-logs')
        make_dir(log_dir)
        tb_writer = SummaryWriter(log_dir=log_dir)
        # Launching the tensorboard process is nested here because it
        # needs log_dir.
        if self.train_opts.run_tensorboard:
            log.info('Starting tensorboard process')
            tensorboard_process = Popen(
                ['tensorboard', '--logdir={}'.format(log_dir)])
            terminate_at_exit(tensorboard_process)

    # Setup optimizer, loss, and LR scheduler.
    loss_fn = torch.nn.CrossEntropyLoss()
    lr = self.train_opts.lr
    opt = optim.Adam(model.parameters(), lr=lr)
    # epoch_scheduler is never reassigned in this method, so the
    # per-epoch scheduler step in the loop below never runs.
    step_scheduler, epoch_scheduler = None, None
    num_epochs = self.train_opts.num_epochs
    if self.train_opts.one_cycle and num_epochs > 1:
        # NOTE(review): assumes the train data loader yields
        # len(train_ds) // batch_size batches per epoch -- confirm.
        steps_per_epoch = len(databunch.train_ds) // batch_size
        total_steps = num_epochs * steps_per_epoch
        # LR rises for the first half of the epochs and falls for the
        # remaining steps.
        step_size_up = (num_epochs // 2) * steps_per_epoch
        step_size_down = total_steps - step_size_up
        step_scheduler = CyclicLR(
            opt,
            base_lr=lr / 10,
            max_lr=lr,
            step_size_up=step_size_up,
            step_size_down=step_size_down,
            cycle_momentum=False)
        # Fast-forward the scheduler past the steps of already completed
        # epochs when resuming from a checkpoint.
        for _ in range(start_epoch * steps_per_epoch):
            step_scheduler.step()

    # Training loop.
    for epoch in range(start_epoch, num_epochs):
        # Train one epoch.
        log.info('-----------------------------------------------------')
        log.info('epoch: {}'.format(epoch))
        start = time.time()
        train_loss = train_epoch(model, self.device, databunch.train_dl,
                                 opt, loss_fn, step_scheduler)
        if epoch_scheduler:
            epoch_scheduler.step()
        log.info('train loss: {}'.format(train_loss))

        # Validate one epoch.
        metrics = validate_epoch(model, self.device, databunch.valid_dl,
                                 num_labels)
        log.info('validation metrics: {}'.format(metrics))

        # Print elapsed time for epoch.
        end = time.time()
        epoch_time = datetime.timedelta(seconds=end - start)
        log.info('epoch elapsed time: {}'.format(epoch_time))

        # Save model and state.
        # Both are overwritten every epoch; train_state.json records the
        # epoch that just finished.
        torch.save(model.state_dict(), model_path)
        train_state = {'epoch': epoch}
        json_to_file(train_state, train_state_path)

        # Append to log CSV file.
        with open(log_path, 'a') as log_file:
            log_writer = csv.writer(log_file)
            row = [epoch, epoch_time, train_loss]
            row += [metrics[k] for k in metric_names]
            log_writer.writerow(row)

        # Write to Tensorboard log.
        if self.train_opts.log_tensorboard:
            for key, val in metrics.items():
                tb_writer.add_scalar(key, val, epoch)
            tb_writer.add_scalar('train_loss', train_loss, epoch)
            for name, param in model.named_parameters():
                tb_writer.add_histogram(name, param, epoch)

        # Periodic upload: only for S3 destinations, every sync_interval
        # epochs.
        if (train_uri.startswith('s3://')
                and (((epoch + 1) % self.train_opts.sync_interval) == 0)):
            sync_to_dir(train_dir, train_uri)

    # Close Tensorboard.
    if self.train_opts.log_tensorboard:
        tb_writer.close()
        if self.train_opts.run_tensorboard:
            tensorboard_process.terminate()

    # Since model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)