Example #1
    def train(self, do_monitoring):
        loss_function = 'categorical_crossentropy'
        metrics = ['accuracy']
        initial_epoch = self.get_initial_epoch()
        steps_per_epoch = int(
            self.nb_training_samples / self.options.batch_size)
        validation_steps = int(
            self.nb_validation_samples / self.options.batch_size)

        # Useful for testing
        if self.options.short_epoch:
            steps_per_epoch = 1
            validation_steps = 1

        callbacks = self.make_callbacks(do_monitoring)

        self.model.compile(self.optimizer, loss_function, metrics=metrics)

        if do_monitoring:
            tensorboard_process = Popen(
                ['tensorboard', '--logdir={}'.format(self.tf_logs_path)])
            terminate_at_exit(tensorboard_process)

        self.model.fit_generator(
            self.training_gen,
            initial_epoch=initial_epoch,
            steps_per_epoch=steps_per_epoch,
            epochs=self.options.nb_epochs,
            validation_data=self.validation_gen,
            validation_steps=validation_steps,
            callbacks=callbacks)

        if do_monitoring:
            tensorboard_process.terminate()
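
The examples on this page all pair a long-running Popen subprocess (TensorBoard, eval scripts, training runs) with terminate_at_exit, so the child process does not outlive the parent. The helper's source is not shown here; below is a minimal sketch of how such a helper could be implemented with Python's atexit module. The body is an assumption, not the library's actual code:

import atexit

def terminate_at_exit(process):
    """Terminate `process` when the interpreter exits.

    Minimal sketch, assuming the real helper simply registers an
    atexit hook; it may also log or escalate to kill().
    """
    def _terminate():
        if process.poll() is None:  # child is still running
            process.terminate()

    atexit.register(_terminate)
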
Example #2
    def _run_experiment(self, command_dag):
        tmp_dir = self.tmp_dir or RVConfig.get_tmp_dir().name
        make_dir(tmp_dir)
        makefile_name = os.path.join(tmp_dir, 'Makefile')
        with open(makefile_name, 'w') as makefile:
            command_ids = command_dag.get_sorted_command_ids()

            # .PHONY: 0 1 2 3 4 5
            makefile.write('.PHONY:')
            for command_id in command_ids:
                makefile.write(' {}'.format(command_id))
            makefile.write('\n\n')

            # all: 0 1 2 3 4 5
            makefile.write('all:')
            for command_id in command_ids:
                makefile.write(' {}'.format(command_id))
            makefile.write('\n\n')

            for command_id in command_ids:
                # 0: 1 2
                makefile.write('{}:'.format(command_id))
                for upstream_id in command_dag.get_upstream_command_ids(
                        command_id):
                    makefile.write(' {}'.format(upstream_id))
                makefile.write('\n')

                # \t rastervision ...
                command_def = command_dag.get_command_definition(command_id)
                command_config = command_def.command_config
                command_root_uri = command_config.root_uri
                command_basename = 'command-config-{}.json'.format(
                    command_config.split_id)
                command_uri = os.path.join(command_root_uri, command_basename)
                print('Saving command configuration to {}...'.format(
                    command_uri))
                save_json_config(command_config.to_proto(), command_uri)
                run_command = make_command(command_uri, tmp_dir)
                makefile.write('\t{}\n\n'.format(run_command))

        process = Popen(['make', '-j', '-f', makefile_name])
        terminate_at_exit(process)
        exitcode = process.wait()
        if exitcode != 0:
            sys.exit(exitcode)
        return 0
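
The inline comments above (.PHONY: 0 1 2 3 4 5, all: ..., 0: 1 2) sketch the shape of the generated file. For a hypothetical DAG of three commands where command 2 depends on 0 and 1, the emitted Makefile would look roughly like this (recipe lines invoke the rastervision CLI, per the '# \t rastervision ...' comment, and must be tab-indented):

.PHONY: 0 1 2

all: 0 1 2

0:
	rastervision ...

1:
	rastervision ...

2: 0 1
	rastervision ...

Running it with make -j -f then executes commands 0 and 1 in parallel, and command 2 only once both of its upstream commands have finished.
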
Example #3
def train(config_path,
          output_dir,
          num_steps,
          model_main_py=None,
          do_monitoring=True):
    output_train_dir = join(output_dir, 'train')
    output_eval_dir = join(output_dir, 'eval')

    model_main_py = model_main_py or '/opt/tf-models/object_detection/model_main.py'

    train_cmd = [
        'python', model_main_py, '--alsologtostderr',
        '--pipeline_config_path={}'.format(config_path),
        '--model_dir={}'.format(output_train_dir),
        '--num_train_steps={}'.format(num_steps),
        '--sample_1_of_n_eval_examples=1'
    ]

    log.info('Running train command: {}'.format(' '.join(train_cmd)))

    train_process = Popen(train_cmd, stdout=PIPE, stderr=STDOUT)
    terminate_at_exit(train_process)

    if do_monitoring:
        eval_cmd = [
            'python', model_main_py, '--alsologtostderr',
            '--pipeline_config_path={}'.format(config_path),
            '--checkpoint_dir={}'.format(output_train_dir),
            '--model_dir={}'.format(output_eval_dir)
        ]
        log.info('Running eval command: {}'.format(' '.join(eval_cmd)))

        # Don't let the eval process take up GPU space
        env = deepcopy(os.environ)
        env['CUDA_VISIBLE_DEVICES'] = '-1'
        eval_process = Popen(eval_cmd, env=env)

        tensorboard_process = Popen(
            ['tensorboard', '--logdir={}'.format(output_dir)])
        terminate_at_exit(eval_process)
        terminate_at_exit(tensorboard_process)

    with train_process:
        for line in train_process.stdout:
            log.info(line.decode('utf-8'))

    log.info('-----DONE TRAINING-----')
    if do_monitoring:
        eval_process.terminate()
        tensorboard_process.terminate()
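
One detail worth noting: the eval process is deliberately hidden from the GPU so it does not compete with training for GPU memory. The parent environment is copied and CUDA_VISIBLE_DEVICES is set to -1, which CUDA treats as "no visible devices". A self-contained sketch of the same pattern (the child command is a placeholder):

import os
from copy import deepcopy
from subprocess import Popen

# Copy the parent environment and blank out the GPU list so the
# child process falls back to the CPU.
env = deepcopy(os.environ)
env['CUDA_VISIBLE_DEVICES'] = '-1'
child = Popen(
    ['python', '-c', 'import os; print(os.environ["CUDA_VISIBLE_DEVICES"])'],
    env=env)
child.wait()  # the child sees CUDA_VISIBLE_DEVICES=-1 and prints it
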
Example #4
    def train(self, tmp_dir):
        """Train a model.

        This downloads any previous output saved to the train_uri,
        starts training (or resumes from a checkpoint), periodically
        syncs the contents of train_dir to train_uri, and syncs again
        after training finishes.

        Args:
            tmp_dir: (str) path to temp directory
        """
        self.log_options()

        # Sync output of previous training run from cloud.
        train_uri = self.backend_opts.train_uri
        train_dir = get_local_path(train_uri, tmp_dir)
        make_dir(train_dir)
        sync_from_dir(train_uri, train_dir)

        # Get zip file for each group, and unzip them into chip_dir.
        chip_dir = join(tmp_dir, 'chips')
        make_dir(chip_dir)
        for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
            zip_path = download_if_needed(zip_uri, tmp_dir)
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                zipf.extractall(chip_dir)

        # Setup data loader.
        batch_size = self.train_opts.batch_size
        chip_size = self.task_config.chip_size
        class_names = self.class_map.get_class_names()
        databunch = build_databunch(chip_dir, chip_size, batch_size,
                                    class_names)
        log.info(databunch)
        if self.train_opts.debug:
            make_debug_chips(databunch, self.class_map, tmp_dir, train_uri)

        # Setup model
        num_labels = len(databunch.label_names)
        model = get_model(self.train_opts.model_arch,
                          num_labels,
                          pretrained=True)
        model = model.to(self.device)
        model_path = join(train_dir, 'model')

        # Load weights from a pretrained model.
        pretrained_uri = self.backend_opts.pretrained_uri
        if pretrained_uri:
            log.info('Loading weights from pretrained_uri: {}'.format(
                pretrained_uri))
            pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
            model.load_state_dict(
                torch.load(pretrained_path, map_location=self.device))

        # Possibly resume training from checkpoint.
        start_epoch = 0
        train_state_path = join(train_dir, 'train_state.json')
        if isfile(train_state_path):
            log.info('Resuming from checkpoint: {}\n'.format(model_path))
            train_state = file_to_json(train_state_path)
            start_epoch = train_state['epoch'] + 1
            model.load_state_dict(
                torch.load(model_path, map_location=self.device))

        # Write header of log CSV file.
        metric_names = ['precision', 'recall', 'f1']
        log_path = join(train_dir, 'log.csv')
        if not isfile(log_path):
            with open(log_path, 'w') as log_file:
                log_writer = csv.writer(log_file)
                row = ['epoch', 'time', 'train_loss'] + metric_names
                log_writer.writerow(row)

        # Setup Tensorboard logging.
        if self.train_opts.log_tensorboard:
            log_dir = join(train_dir, 'tb-logs')
            make_dir(log_dir)
            tb_writer = SummaryWriter(log_dir=log_dir)
            if self.train_opts.run_tensorboard:
                log.info('Starting tensorboard process')
                tensorboard_process = Popen(
                    ['tensorboard', '--logdir={}'.format(log_dir)])
                terminate_at_exit(tensorboard_process)

        # Setup optimizer, loss, and LR scheduler.
        loss_fn = torch.nn.CrossEntropyLoss()
        lr = self.train_opts.lr
        opt = optim.Adam(model.parameters(), lr=lr)
        step_scheduler, epoch_scheduler = None, None
        num_epochs = self.train_opts.num_epochs

        if self.train_opts.one_cycle and num_epochs > 1:
            steps_per_epoch = len(databunch.train_ds) // batch_size
            total_steps = num_epochs * steps_per_epoch
            step_size_up = (num_epochs // 2) * steps_per_epoch
            step_size_down = total_steps - step_size_up
            step_scheduler = CyclicLR(opt,
                                      base_lr=lr / 10,
                                      max_lr=lr,
                                      step_size_up=step_size_up,
                                      step_size_down=step_size_down,
                                      cycle_momentum=False)
            for _ in range(start_epoch * steps_per_epoch):
                step_scheduler.step()

        # Training loop.
        for epoch in range(start_epoch, num_epochs):
            # Train one epoch.
            log.info('-----------------------------------------------------')
            log.info('epoch: {}'.format(epoch))
            start = time.time()
            train_loss = train_epoch(model, self.device, databunch.train_dl,
                                     opt, loss_fn, step_scheduler)
            if epoch_scheduler:
                epoch_scheduler.step()
            log.info('train loss: {}'.format(train_loss))

            # Validate one epoch.
            metrics = validate_epoch(model, self.device, databunch.valid_dl,
                                     num_labels)
            log.info('validation metrics: {}'.format(metrics))

            # Print elapsed time for epoch.
            end = time.time()
            epoch_time = datetime.timedelta(seconds=end - start)
            log.info('epoch elapsed time: {}'.format(epoch_time))

            # Save model and state.
            torch.save(model.state_dict(), model_path)
            train_state = {'epoch': epoch}
            json_to_file(train_state, train_state_path)

            # Append to log CSV file.
            with open(log_path, 'a') as log_file:
                log_writer = csv.writer(log_file)
                row = [epoch, epoch_time, train_loss]
                row += [metrics[k] for k in metric_names]
                log_writer.writerow(row)

            # Write to Tensorboard log.
            if self.train_opts.log_tensorboard:
                for key, val in metrics.items():
                    tb_writer.add_scalar(key, val, epoch)
                tb_writer.add_scalar('train_loss', train_loss, epoch)
                for name, param in model.named_parameters():
                    tb_writer.add_histogram(name, param, epoch)

            if (train_uri.startswith('s3://')
                    and (((epoch + 1) % self.train_opts.sync_interval) == 0)):
                sync_to_dir(train_dir, train_uri)

        # Close Tensorboard.
        if self.train_opts.log_tensorboard:
            tb_writer.close()
            if self.train_opts.run_tensorboard:
                tensorboard_process.terminate()

        # Since model is exported every epoch, we need some other way to
        # show that training is finished.
        str_to_file('done!', self.backend_opts.train_done_uri)

        # Sync output to cloud.
        sync_to_dir(train_dir, self.backend_opts.train_uri)
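
The one-cycle block above splits the total step budget into an up phase and a down phase, and on restart fast-forwards the scheduler so the learning-rate curve stays aligned with the resumed epoch counter. A standalone sketch of that logic with made-up sizes (the model and all numbers are placeholders):

import torch
from torch import optim
from torch.optim.lr_scheduler import CyclicLR

model = torch.nn.Linear(4, 2)  # stand-in model
lr = 1e-3
opt = optim.Adam(model.parameters(), lr=lr)

steps_per_epoch, num_epochs, start_epoch = 100, 10, 3
step_size_up = (num_epochs // 2) * steps_per_epoch
step_size_down = num_epochs * steps_per_epoch - step_size_up
sched = CyclicLR(opt, base_lr=lr / 10, max_lr=lr,
                 step_size_up=step_size_up,
                 step_size_down=step_size_down,
                 cycle_momentum=False)  # Adam has no momentum to cycle

# Fast-forward past the steps already taken before the restart, so
# resuming at epoch 3 continues the cycle rather than restarting it.
for _ in range(start_epoch * steps_per_epoch):
    sched.step()
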
Example #5
    def train(self, tmp_dir: str) -> None:
        """Train a DeepLab model the task and backend config.

        Args:
            tmp_dir: (str) temporary directory to use

        Returns:
             None
        """
        train_py = self.backend_config.script_locations.train_py
        eval_py = self.backend_config.script_locations.eval_py
        export_py = self.backend_config.script_locations.export_py

        # Setup local input and output directories
        log.info('Setting up local input and output directories')
        train_logdir = self.backend_config.training_output_uri
        train_logdir_local = get_local_path(train_logdir, tmp_dir)
        dataset_dir = get_record_dir(self.backend_config.training_data_uri,
                                     TRAIN)
        dataset_dir_local = get_local_path(dataset_dir, tmp_dir)
        make_dir(tmp_dir)
        make_dir(train_logdir_local)
        make_dir(dataset_dir_local)

        # Download training data
        log.info('Downloading training data')
        for record_file in list_paths(dataset_dir):
            download_if_needed(record_file, tmp_dir)

        # Download and untar initial checkpoint.
        log.info('Downloading and untarring initial checkpoint')
        tf_initial_checkpoints_uri = self.backend_config.pretrained_model_uri
        download_if_needed(tf_initial_checkpoints_uri, tmp_dir)
        tfic_tarball = get_local_path(tf_initial_checkpoints_uri, tmp_dir)
        tfic_dir = os.path.dirname(tfic_tarball)
        with tarfile.open(tfic_tarball, 'r:gz') as tar:
            tar.extractall(tfic_dir)
        tfic_ckpt = glob.glob('{}/*/*.index'.format(tfic_dir))[0]
        tfic_ckpt = tfic_ckpt[0:-len('.index')]

        # Restart support: fall back to the training log dir.
        train_restart_dir = self.backend_config.train_options.train_restart_dir
        if not isinstance(train_restart_dir, str) or not train_restart_dir:
            train_restart_dir = train_logdir

        # Get output from a potential previous run so we can resume training,
        # unless the model is being replaced.
        if not self.backend_config.train_options.replace_model:
            sync_from_dir(train_restart_dir, train_logdir_local)
        else:
            if os.path.exists(train_logdir_local):
                shutil.rmtree(train_logdir_local)
            make_dir(train_logdir_local)

        # Periodically synchronize with remote
        sync = start_sync(
            train_logdir_local,
            train_logdir,
            sync_interval=self.backend_config.train_options.sync_interval)

        with sync:
            # Setup TFDL config
            tfdl_config = json_format.ParseDict(
                self.backend_config.tfdl_config, TrainingParametersMsg())
            log.info('tfdl_config={}'.format(tfdl_config))
            log.info('Training steps={}'.format(
                tfdl_config.training_number_of_steps))

            # Additional training options
            class_ids = [item.id for item in self.class_map.get_items()]
            num_classes = max(max(class_ids), len(class_ids)) + 1
            (train_args, train_env) = get_training_args(
                train_py, train_logdir_local, tfic_ckpt, dataset_dir_local,
                num_classes, tfdl_config)

            # Start training
            log.info('Starting training process')
            log.info(' '.join(train_args))
            train_process = Popen(train_args, env=train_env)
            terminate_at_exit(train_process)

            if self.backend_config.train_options.do_monitoring:
                # Start tensorboard
                log.info('Starting tensorboard process')
                tensorboard_process = Popen(
                    ['tensorboard', '--logdir={}'.format(train_logdir_local)])
                terminate_at_exit(tensorboard_process)

            if self.backend_config.train_options.do_eval:
                # Start eval script
                log.info('Starting eval script')
                eval_logdir = train_logdir_local
                eval_args = get_evaluation_args(eval_py, train_logdir_local,
                                                dataset_dir_local, eval_logdir,
                                                tfdl_config)
                eval_process = Popen(eval_args, env=train_env)
                terminate_at_exit(eval_process)

            # Wait for training and tensorboard
            log.info('Waiting for training and tensorboard processes')
            train_process.wait()
            if self.backend_config.train_options.do_monitoring:
                tensorboard_process.terminate()

            # Export frozen graph
            log.info(
                'Exporting frozen graph ({}/model)'.format(train_logdir_local))
            export_args = get_export_args(export_py, train_logdir_local,
                                          num_classes, tfdl_config)
            export_process = Popen(export_args)
            terminate_at_exit(export_process)
            export_process.wait()

            # Package up the model files for usage as fine tuning checkpoints
            fine_tune_checkpoint_name = self.backend_config.fine_tune_checkpoint_name
            latest_checkpoints = get_latest_checkpoint(train_logdir_local)
            model_checkpoint_files = glob.glob(
                '{}*'.format(latest_checkpoints))
            inference_graph_path = os.path.join(train_logdir_local, 'model')

            with RVConfig.get_tmp_dir() as tar_tmp_dir:
                model_dir = os.path.join(tar_tmp_dir, fine_tune_checkpoint_name)
                make_dir(model_dir)
                model_tar = os.path.join(
                    train_logdir_local,
                    '{}.tar.gz'.format(fine_tune_checkpoint_name))
                shutil.copy(inference_graph_path,
                            '{}/frozen_inference_graph.pb'.format(model_dir))
                for path in model_checkpoint_files:
                    shutil.copy(path, model_dir)
                with tarfile.open(model_tar, 'w:gz') as tar:
                    tar.add(model_dir, arcname=os.path.basename(model_dir))

        # Perform final sync
        sync_to_dir(train_logdir_local, train_logdir, delete=False)
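
The packaging step near the end gathers the newest checkpoint files plus the frozen graph into a single tarball that can later serve as a fine-tuning checkpoint (e.g. as pretrained_model_uri). A condensed sketch of that step; the function name and paths are hypothetical:

import glob
import os
import shutil
import tarfile

def package_checkpoint(ckpt_prefix, graph_path, out_dir, name):
    # Collect 'model.ckpt-NNNN.*' files and the frozen graph into
    # <out_dir>/<name>/, then tar that directory as <name>.tar.gz.
    model_dir = os.path.join(out_dir, name)
    os.makedirs(model_dir, exist_ok=True)
    shutil.copy(graph_path,
                os.path.join(model_dir, 'frozen_inference_graph.pb'))
    for path in glob.glob('{}*'.format(ckpt_prefix)):
        shutil.copy(path, model_dir)
    tar_path = os.path.join(out_dir, '{}.tar.gz'.format(name))
    with tarfile.open(tar_path, 'w:gz') as tar:
        tar.add(model_dir, arcname=name)
    return tar_path
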
Example #6
    def train(self, tmp_dir):
        """Train a model.

        This downloads any previous output saved to the train_uri,
        starts training (or resumes from a checkpoint), periodically
        syncs the contents of train_dir to train_uri, and syncs again
        after training finishes.

        Args:
            tmp_dir: (str) path to temp directory
        """
        self.log_options()

        # Sync output of previous training run from cloud.
        train_uri = self.backend_opts.train_uri
        train_dir = get_local_path(train_uri, tmp_dir)
        make_dir(train_dir)
        sync_from_dir(train_uri, train_dir)

        # Get zip file for each group, and unzip them into chip_dir.
        chip_dir = join(tmp_dir, 'chips')
        make_dir(chip_dir)
        for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
            zip_path = download_if_needed(zip_uri, tmp_dir)
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                zipf.extractall(chip_dir)

        # Setup data loader.
        def get_label_path(im_path):
            return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name

        size = self.task_config.chip_size
        class_map = self.task_config.class_map
        classes = class_map.get_class_names()
        if 0 not in class_map.get_keys():
            classes = ['nodata'] + classes
        num_workers = 0 if self.train_opts.debug else 4

        data = (SegmentationItemList.from_folder(chip_dir)
                .split_by_folder(train='train-img', valid='val-img'))
        train_count = None
        if self.train_opts.train_count is not None:
            train_count = min(len(data.train), self.train_opts.train_count)
        elif self.train_opts.train_prop != 1.0:
            train_count = int(round(self.train_opts.train_prop * len(data.train)))
        train_items = data.train.items
        if train_count is not None:
            train_inds = np.random.permutation(np.arange(len(data.train)))[0:train_count]
            train_items = train_items[train_inds]
        items = np.concatenate([train_items, data.valid.items])

        data = (SegmentationItemList(items, chip_dir)
                .split_by_folder(train='train-img', valid='val-img')
                .label_from_func(get_label_path, classes=classes)
                .transform(get_transforms(flip_vert=self.train_opts.flip_vert),
                           size=size, tfm_y=True)
                .databunch(bs=self.train_opts.batch_sz,
                           num_workers=num_workers))
        print(data)

        # Setup learner.
        ignore_idx = 0
        metrics = [
            Precision(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
            Recall(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
            FBeta(average='weighted', clas_idx=1, beta=1, ignore_idx=ignore_idx)]
        model_arch = getattr(models, self.train_opts.model_arch)
        learn = unet_learner(
            data, model_arch, metrics=metrics, wd=self.train_opts.weight_decay,
            bottle=True, path=train_dir)
        learn.unfreeze()

        if self.train_opts.mixed_prec and torch.cuda.is_available():
            # This loss_scale works for Resnet 34 and 50. You might need to adjust this
            # for other models.
            learn = learn.to_fp16(loss_scale=256)

        # Setup callbacks and train model.
        model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

        pretrained_uri = self.backend_opts.pretrained_uri
        if pretrained_uri:
            print('Loading weights from pretrained_uri: {}'.format(
                pretrained_uri))
            pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
            learn.model.load_state_dict(
                torch.load(pretrained_path, map_location=learn.data.device),
                strict=False)

        # Save every epoch so that resume functionality provided by
        # TrackEpochCallback will work.
        callbacks = [
            TrackEpochCallback(learn),
            MySaveModelCallback(learn, every='epoch'),
            MyCSVLogger(learn, filename='log'),
            ExportCallback(learn, model_path, monitor='f_beta'),
            SyncCallback(train_dir, self.backend_opts.train_uri,
                         self.train_opts.sync_interval)
        ]

        oversample = self.train_opts.oversample
        if oversample:
            weights = get_oversampling_weights(
                data.train_ds, oversample['rare_class_ids'],
                oversample['rare_target_prop'])
            oversample_callback = OverSamplingCallback(learn, weights=weights)
            callbacks.append(oversample_callback)

        if self.train_opts.debug:
            if oversample:
                oversample_callback.on_train_begin()
            make_debug_chips(data, class_map, tmp_dir, train_uri)

        if self.train_opts.log_tensorboard:
            callbacks.append(TensorboardLogger(learn, 'run'))

        if self.train_opts.run_tensorboard:
            log.info('Starting tensorboard process')
            log_dir = join(train_dir, 'logs', 'run')
            tensorboard_process = Popen(
                ['tensorboard', '--logdir={}'.format(log_dir)])
            terminate_at_exit(tensorboard_process)

        lr = self.train_opts.lr
        num_epochs = self.train_opts.num_epochs
        if self.train_opts.one_cycle:
            if lr is None:
                learn.lr_find()
                learn.recorder.plot(suggestion=True, return_fig=True)
                lr = learn.recorder.min_grad_lr
                print('lr_find() found lr: {}'.format(lr))
            learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)
        else:
            learn.fit(num_epochs, lr, callbacks=callbacks)

        if self.train_opts.run_tensorboard:
            tensorboard_process.terminate()

        # Since model is exported every epoch, we need some other way to
        # show that training is finished.
        str_to_file('done!', self.backend_opts.train_done_uri)

        # Sync output to cloud.
        sync_to_dir(train_dir, self.backend_opts.train_uri)
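
The get_label_path helper encodes the chip layout this backend expects: image folders end in -img and sit next to sibling label folders ending in -labels, with matching file names. A quick standalone check of the mapping (the paths are illustrative):

from pathlib import Path

def get_label_path(im_path):
    # Strip the 4-character '-img' suffix from the parent folder,
    # append '-labels', and keep the file name unchanged.
    return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name

print(get_label_path(Path('chips/train-img/0.png')))
# chips/train-labels/0.png
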
Example #7
def run_command(cmd, shell=False):
    process = subprocess.Popen(cmd, shell=shell)
    terminate_at_exit(process)
    exitcode = process.wait()
    if exitcode != 0:
        sys.exit(exitcode)
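
Because run_command calls sys.exit on a non-zero exit code, it suits scripts where a failed subprocess should abort the whole run. A usage sketch (the command shown is arbitrary):

run_command(['make', '-j', '-f', 'Makefile'])  # exits the script if make fails
print('make succeeded')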