예제 #1
0
def main(out_dir=None,
         data_dir=None,
         feature_fn_format='feature-seq.pkl',
         label_fn_format='label_seq.pkl',
         cv_params=None):
    """Load a dataset, build cross-validation folds, and log each fold's size.

    Parameters
    ----------
    out_dir : str
        Output directory ('~' is expanded). 'figures' and 'data'
        subdirectories are created beneath it.
    data_dir : str
        Directory containing the per-trial feature/label files.
    feature_fn_format : str
        Filename suffix identifying feature-sequence files.
    label_fn_format : str
        Filename suffix identifying label-sequence files.
        NOTE(review): default uses an underscore ('label_seq.pkl') while the
        feature default uses a hyphen ('feature-seq.pkl') -- confirm this
        asymmetry is intentional.
    cv_params : dict, optional
        Keyword arguments forwarded to ``utils.makeDataSplits``.
    """
    # Avoid the shared mutable-default-argument pitfall: fresh dict per call.
    if cv_params is None:
        cv_params = {}

    out_dir = os.path.expanduser(out_dir)
    data_dir = os.path.expanduser(data_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    # exist_ok avoids the race between the existence check and creation.
    fig_dir = os.path.join(out_dir, 'figures')
    os.makedirs(fig_dir, exist_ok=True)

    out_data_dir = os.path.join(out_dir, 'data')
    os.makedirs(out_data_dir, exist_ok=True)

    # Load data
    trial_ids = utils.getUniqueIds(data_dir,
                                   prefix='trial=',
                                   suffix=feature_fn_format,
                                   to_array=True)
    dataset = utils.CvDataset(trial_ids,
                              data_dir,
                              feature_fn_format=feature_fn_format,
                              label_fn_format=label_fn_format,
                              vocab=[])
    utils.saveMetadata(dataset.metadata, out_data_dir)

    # Make folds
    dataset_size = len(trial_ids)
    cv_folds = utils.makeDataSplits(dataset_size,
                                    metadata=dataset.metadata,
                                    **cv_params)
    save_cv_folds(cv_folds, os.path.join(out_data_dir, 'cv-folds.json'))

    # Check folds: log each split's size so imbalances are easy to spot.
    for cv_index, cv_fold in enumerate(cv_folds):
        train_data, val_data, test_data = dataset.getFold(cv_fold)
        train_feats, train_labels, train_ids = train_data
        test_feats, test_labels, test_ids = test_data
        val_feats, val_labels, val_ids = val_data

        logger.info(
            f'CV fold {cv_index + 1} / {len(cv_folds)}: {len(trial_ids)} total '
            f'({len(train_ids)} train, {len(val_ids)} val, {len(test_ids)} test)'
        )
예제 #2
0
def main(out_dir=None,
         data_dir=None,
         event_scores_dir=None,
         action_scores_dir=None,
         part_scores_dir=None,
         as_atomic_events=False,
         only_fold=None,
         plot_io=None,
         prefix='seq=',
         results_file=None,
         sweep_param_name=None,
         model_params=None,
         cv_params=None):
    """Fuse per-frame action and part scores into event predictions; evaluate.

    For each cross-validation fold, fits an ``AttributeClassifier`` on
    action/part co-occurrence counts from the training split, then scores,
    evaluates, plots (optionally), and saves predictions for every test
    sequence.

    Parameters
    ----------
    out_dir, data_dir : str
        Output directory and dataset root ('~' is expanded).
    event_scores_dir, action_scores_dir, part_scores_dir : str
        Directories holding precomputed score sequences per label type.
    as_atomic_events : bool
        If True, read the event vocabulary from 'labels/event-vocab.csv';
        otherwise build it as the action x part cross product.
    only_fold : int, optional
        If given, process only the fold with this index.
    plot_io : bool, optional
        If truthy, save per-sequence input/output plots.
    prefix : str
        Filename prefix the DataLoader uses to locate sequences.
    results_file : str, optional
        CSV path for metrics; defaults to '<out_dir>/results.csv'.
    sweep_param_name : str, optional
        Parameter-sweep column name passed to ``utils.writeResults``.
    model_params, cv_params : dict, optional
        Extra keyword args for result writing / fold construction.
    """
    # Avoid shared mutable default arguments.
    if model_params is None:
        model_params = {}
    if cv_params is None:
        cv_params = {}

    data_dir = os.path.expanduser(data_dir)
    event_scores_dir = os.path.expanduser(event_scores_dir)
    action_scores_dir = os.path.expanduser(action_scores_dir)
    part_scores_dir = os.path.expanduser(part_scores_dir)
    out_dir = os.path.expanduser(out_dir)
    # exist_ok avoids the race between the existence check and creation.
    os.makedirs(out_dir, exist_ok=True)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    os.makedirs(fig_dir, exist_ok=True)

    out_data_dir = os.path.join(out_dir, 'data')
    os.makedirs(out_data_dir, exist_ok=True)

    loader = DataLoader(
        {
            'event': os.path.join(data_dir, 'event-dataset'),
            'action': os.path.join(data_dir, 'action-dataset'),
            'part': os.path.join(data_dir, 'part-dataset')
        }, {
            'event': event_scores_dir,
            'action': action_scores_dir,
            'part': part_scores_dir
        },
        prefix=prefix)
    logger.info(
        f"Loaded ids for {len(loader.seq_ids)} sequences from {data_dir}")

    all_metrics = collections.defaultdict(list)

    # Event vocabulary: either read the atomic-event table from disk, or
    # synthesize one row per (action, part) pair with one-hot part columns.
    if as_atomic_events:
        map_df = pd.read_csv(os.path.join(data_dir, 'labels',
                                          'event-vocab.csv'),
                             index_col=False,
                             keep_default_na=False)
    else:
        map_df = pd.DataFrame(
            [[f"{action}({part})", action] +
             [part == n for n in loader.vocabs['part'] if n]
             for action in loader.vocabs['action']
             for part in loader.vocabs['part']],
            columns=['event', 'action'] +
            [f"{n}_active" for n in loader.vocabs['part'] if n])
    event_vocab = map_df['event'].to_list()
    utils.saveVariable(event_vocab, 'vocab', out_data_dir)

    # Define cross-validation folds
    cv_folds = utils.makeDataSplits(len(loader.seq_ids), **cv_params)
    utils.saveVariable(cv_folds, 'cv-folds', out_data_dir)

    for cv_index, cv_fold in enumerate(cv_folds):
        if only_fold is not None and cv_index != only_fold:
            continue

        train_indices, val_indices, test_indices = cv_fold
        logger.info(
            f"CV FOLD {cv_index + 1} / {len(cv_folds)}: "
            f"{len(train_indices)} train, {len(val_indices)} val, {len(test_indices)} test"
        )

        # Accumulate action/part co-occurrence counts over the training split.
        action_part_counts = sum(
            count_action_part_bigrams(loader.vocabs['action'],
                                      loader.vocabs['part'], la, lp)
            for la, lp in loader._getLabels(train_indices,
                                            label_name=['action', 'part'],
                                            load_from_data=True))
        model = AttributeClassifier(action_part_counts)

        plot_action_part_bigrams(
            loader.vocabs['action'], loader.vocabs['part'],
            (model.action_part_counts > 0).astype(int),
            os.path.join(fig_dir,
                         f'cvfold={cv_index}_action-part-coocurrence.png'))

        for i in test_indices:
            seq_id = loader.seq_ids[i]
            logger.info(f"  Processing sequence {seq_id}...")

            action_score_seq, part_score_seq, true_action_seq, true_part_seq = loader[
                seq_id]
            event_score_seq = model.forward(action_score_seq, part_score_seq)
            pred_action_seq, pred_part_seq = model.predict(event_score_seq)

            true_event_seq = ap_to_event(true_action_seq, true_part_seq,
                                         map_df, loader.vocabs)
            pred_event_seq = ap_to_event(pred_action_seq, pred_part_seq,
                                         map_df, loader.vocabs)

            # Evaluate at event, action, and part granularity into one dict.
            metric_dict = {}
            metric_dict = eval_metrics(pred_event_seq,
                                       true_event_seq,
                                       name_prefix='Event ',
                                       append_to=metric_dict)
            metric_dict = eval_metrics(pred_action_seq,
                                       true_action_seq,
                                       name_prefix='Action ',
                                       append_to=metric_dict)
            metric_dict = eval_metrics(pred_part_seq,
                                       true_part_seq,
                                       name_prefix='Part ',
                                       append_to=metric_dict)
            for name, value in metric_dict.items():
                logger.info(f"    {name}: {value * 100:.2f}%")
                all_metrics[name].append(value)

            seq_id_str = f"seq={seq_id}"
            utils.saveVariable(event_score_seq, f'{seq_id_str}_score-seq',
                               out_data_dir)
            utils.saveVariable(true_event_seq, f'{seq_id_str}_true-label-seq',
                               out_data_dir)
            utils.saveVariable(pred_event_seq, f'{seq_id_str}_pred-label-seq',
                               out_data_dir)
            utils.writeResults(results_file, metric_dict, sweep_param_name,
                               model_params)

            if plot_io:
                # NOTE(review): f"{seq_id:03d}" assumes seq_id is an int --
                # confirm against loader.seq_ids.
                utils.plot_array(
                    event_score_seq.reshape(event_score_seq.shape[0], -1).T,
                    (true_event_seq, pred_event_seq), ('true', 'pred'),
                    fn=os.path.join(fig_dir, f"seq={seq_id:03d}_event.png"))
                utils.plot_array(action_score_seq.T,
                                 (true_action_seq, pred_action_seq),
                                 ('true', 'pred'),
                                 fn=os.path.join(
                                     fig_dir, f"seq={seq_id:03d}_action.png"))
                utils.plot_array(part_score_seq.T,
                                 (true_part_seq, pred_part_seq),
                                 ('true', 'pred'),
                                 fn=os.path.join(fig_dir,
                                                 f"seq={seq_id:03d}_part.png"))
예제 #3
0
def main(out_dir=None,
         data_dir=None,
         attr_dir=None,
         model_name=None,
         gpu_dev_id=None,
         batch_size=None,
         learning_rate=None,
         model_params=None,
         cv_params=None,
         train_params=None,
         viz_params=None,
         plot_predictions=None,
         results_file=None,
         sweep_param_name=None):
    """Train and evaluate an image classifier over cross-validation folds.

    Loads per-trial RGB frame sequences and attribute labels, trains a
    ResNet-based ``ImageClassifier`` per fold, evaluates it on the test
    split, and saves predictions, logs, models, and training-curve plots.

    Parameters
    ----------
    out_dir, data_dir, attr_dir : str
        Output directory, frame-data directory, label directory
        ('~' is expanded in each).
    model_name : str
        Currently only 'resnet' is supported.
    gpu_dev_id : optional
        Device selector forwarded to ``torchutils.selectDevice``.
    batch_size : int, optional
        Batch size used inside ``PickledVideoDataset``.
    learning_rate : float, optional
        Adam learning rate.
    model_params, cv_params, train_params, viz_params : dict, optional
        Keyword arguments for the model, fold construction, training, and
        prediction plotting respectively.
    plot_predictions : bool, optional
        If truthy, plot example predictions after testing.
    results_file : str, optional
        CSV path for metrics; defaults to '<out_dir>/results.csv'.
    sweep_param_name : str, optional
        Parameter-sweep column name passed to ``utils.writeResults``.
    """
    # Avoid shared mutable default arguments.
    if model_params is None:
        model_params = {}
    if cv_params is None:
        cv_params = {}
    if train_params is None:
        train_params = {}
    if viz_params is None:
        viz_params = {}

    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    attr_dir = os.path.expanduser(attr_dir)
    # exist_ok avoids the race between the existence check and creation.
    os.makedirs(out_dir, exist_ok=True)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    os.makedirs(fig_dir, exist_ok=True)

    out_data_dir = os.path.join(out_dir, 'data')
    os.makedirs(out_data_dir, exist_ok=True)

    def loadData(seq_id):
        # Frames are stored channels-last; swap to channels-first for torch.
        var_name = f"trial-{seq_id}_rgb-frame-seq"
        data = joblib.load(os.path.join(data_dir, f'{var_name}.pkl'))
        return data.swapaxes(1, 3)

    def loadLabels(seq_id):
        var_name = f"trial-{seq_id}_label-seq"
        return joblib.load(os.path.join(attr_dir, f'{var_name}.pkl'))

    def saveVariable(var, var_name):
        joblib.dump(var, os.path.join(out_data_dir, f'{var_name}.pkl'))

    # Load data
    trial_ids = utils.getUniqueIds(data_dir)
    label_seqs = tuple(map(loadLabels, trial_ids))

    device = torchutils.selectDevice(gpu_dev_id)

    # Define cross-validation folds
    dataset_size = len(trial_ids)
    cv_folds = utils.makeDataSplits(dataset_size, **cv_params)

    def getSplit(split_idxs):
        # Select (label_seqs, trial_ids) entries for the given indices.
        split_data = tuple(
            tuple(s[i] for i in split_idxs) for s in (label_seqs, trial_ids))
        return split_data

    for cv_index, cv_splits in enumerate(cv_folds):
        train_data, val_data, test_data = tuple(map(getSplit, cv_splits))

        # Multi-label attribute prediction: BCE over logits, float targets.
        criterion = torch.nn.BCEWithLogitsLoss()
        labels_dtype = torch.float

        train_labels, train_ids = train_data
        train_set = torchutils.PickledVideoDataset(loadData,
                                                   train_labels,
                                                   device=device,
                                                   labels_dtype=labels_dtype,
                                                   seq_ids=train_ids,
                                                   batch_size=batch_size)
        train_loader = torch.utils.data.DataLoader(train_set,
                                                   batch_size=1,
                                                   shuffle=True)

        test_labels, test_ids = test_data
        test_set = torchutils.PickledVideoDataset(loadData,
                                                  test_labels,
                                                  device=device,
                                                  labels_dtype=labels_dtype,
                                                  seq_ids=test_ids,
                                                  batch_size=batch_size)
        test_loader = torch.utils.data.DataLoader(test_set,
                                                  batch_size=1,
                                                  shuffle=False)

        val_labels, val_ids = val_data
        val_set = torchutils.PickledVideoDataset(loadData,
                                                 val_labels,
                                                 device=device,
                                                 labels_dtype=labels_dtype,
                                                 seq_ids=val_ids,
                                                 batch_size=batch_size)
        val_loader = torch.utils.data.DataLoader(val_set,
                                                 batch_size=1,
                                                 shuffle=True)

        logger.info(
            f'CV fold {cv_index + 1} / {len(cv_folds)}: {len(trial_ids)} total '
            f'({len(train_ids)} train, {len(val_ids)} val, {len(test_ids)} test)'
        )

        if model_name == 'resnet':
            # input_dim = train_set.num_obsv_dims
            output_dim = train_set.num_label_types
            model = ImageClassifier(output_dim,
                                    **model_params).to(device=device)
        else:
            raise AssertionError(f"Unrecognized model name: {model_name}")

        train_epoch_log = collections.defaultdict(list)
        val_epoch_log = collections.defaultdict(list)
        metric_dict = {
            'Avg Loss': metrics.AverageLoss(),
            'Accuracy': metrics.Accuracy(),
            'Precision': metrics.Precision(),
            'Recall': metrics.Recall(),
            'F1': metrics.Fmeasure()
        }

        optimizer_ft = torch.optim.Adam(model.parameters(),
                                        lr=learning_rate,
                                        betas=(0.9, 0.999),
                                        eps=1e-08,
                                        weight_decay=0,
                                        amsgrad=False)
        # gamma=1.00 makes this scheduler a no-op; kept for easy tuning.
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft,
                                                       step_size=1,
                                                       gamma=1.00)

        model, last_model_wts = torchutils.trainModel(
            model,
            criterion,
            optimizer_ft,
            lr_scheduler,
            train_loader,
            val_loader,
            device=device,
            metrics=metric_dict,
            train_epoch_log=train_epoch_log,
            val_epoch_log=val_epoch_log,
            **train_params)

        # Test model with a fresh metric dict so train stats don't leak in.
        metric_dict = {
            'Avg Loss': metrics.AverageLoss(),
            'Accuracy': metrics.Accuracy(),
            'Precision': metrics.Precision(),
            'Recall': metrics.Recall(),
            'F1': metrics.Fmeasure()
        }
        test_io_history = torchutils.predictSamples(
            model.to(device=device),
            test_loader,
            criterion=criterion,
            device=device,
            metrics=metric_dict,
            data_labeled=True,
            update_model=False,
            seq_as_batch=train_params['seq_as_batch'],
            return_io_history=True)
        metric_str = '  '.join(str(m) for m in metric_dict.values())
        logger.info('[TST]  ' + metric_str)

        utils.writeResults(results_file, metric_dict, sweep_param_name,
                           model_params)

        if plot_predictions:
            # imu.plot_prediction_eg(test_io_history, fig_dir, fig_type=fig_type, **viz_params)
            imu.plot_prediction_eg(test_io_history, fig_dir, **viz_params)

        def saveTrialData(pred_seq, score_seq, feat_seq, label_seq, trial_id):
            # feat_seq is deliberately not saved (frames are large).
            saveVariable(pred_seq.cpu().numpy(),
                         f'trial={trial_id}_pred-label-seq')
            saveVariable(score_seq.cpu().numpy(),
                         f'trial={trial_id}_score-seq')
            saveVariable(label_seq.cpu().numpy(),
                         f'trial={trial_id}_true-label-seq')

        for io in test_io_history:
            saveTrialData(*io)

        saveVariable(train_ids, f'cvfold={cv_index}_train-ids')
        saveVariable(test_ids, f'cvfold={cv_index}_test-ids')
        saveVariable(val_ids, f'cvfold={cv_index}_val-ids')
        saveVariable(train_epoch_log,
                     f'cvfold={cv_index}_{model_name}-train-epoch-log')
        saveVariable(val_epoch_log,
                     f'cvfold={cv_index}_{model_name}-val-epoch-log')
        saveVariable(metric_dict,
                     f'cvfold={cv_index}_{model_name}-metric-dict')
        saveVariable(model, f'cvfold={cv_index}_{model_name}-best')

        # Also save the final (last-epoch) weights alongside the best ones.
        model.load_state_dict(last_model_wts)
        saveVariable(model, f'cvfold={cv_index}_{model_name}-last')

        torchutils.plotEpochLog(train_epoch_log,
                                subfig_size=(10, 2.5),
                                title='Training performance',
                                fn=os.path.join(
                                    fig_dir,
                                    f'cvfold={cv_index}_train-plot.png'))

        if val_epoch_log:
            torchutils.plotEpochLog(val_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Heldout performance',
                                    fn=os.path.join(
                                        fig_dir,
                                        f'cvfold={cv_index}_val-plot.png'))
예제 #4
0
def main(
        out_dir=None, data_dir=None, model_name=None,
        model_params=None, cv_params=None, train_params=None, viz_params=None,
        plot_predictions=None, results_file=None, sweep_param_name=None):
    """Evaluate a model over cross-validation folds and save its outputs.

    FIXME: this function is incomplete -- the model construction / training
    code is missing, so `train_set`, `metric_dict`, `test_io_history`,
    `train_epoch_log`, and `val_epoch_log` are referenced before being
    defined. The skeleton below is kept so the missing section can be
    restored in place.

    Parameters
    ----------
    out_dir, data_dir : str
        Output directory and dataset root ('~' is expanded).
    model_name : str
        Model selector; only the 'dummy' branch survives in this skeleton.
    model_params, cv_params, train_params, viz_params : dict, optional
        Keyword arguments for the model, folds, training, visualization.
    plot_predictions : bool, optional
        If truthy, plot model input/output per test sequence.
    results_file : str, optional
        CSV path for metrics; a fresh file is created when omitted,
        otherwise results are appended.
    sweep_param_name : str, optional
        Parameter-sweep column name passed to ``utils.writeResults``.
    """
    # Avoid shared mutable default arguments.
    if model_params is None:
        model_params = {}
    if cv_params is None:
        cv_params = {}
    if train_params is None:
        train_params = {}
    if viz_params is None:
        viz_params = {}

    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    # exist_ok avoids the race between the existence check and creation.
    os.makedirs(out_dir, exist_ok=True)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    # Default: start a fresh results file; explicit path: append to it.
    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
        write_mode = 'w'
    else:
        results_file = os.path.expanduser(results_file)
        write_mode = 'a'

    fig_dir = os.path.join(out_dir, 'figures')
    os.makedirs(fig_dir, exist_ok=True)

    out_data_dir = os.path.join(out_dir, 'data')
    os.makedirs(out_data_dir, exist_ok=True)

    def saveVariable(var, var_name):
        joblib.dump(var, os.path.join(out_data_dir, f'{var_name}.pkl'))

    def loadAll(seq_ids, var_name, data_dir):
        def loadOne(seq_id):
            fn = os.path.join(data_dir, f'trial={seq_id}_{var_name}')
            return joblib.load(fn)
        return tuple(map(loadOne, seq_ids))

    # Load data
    trial_ids = utils.getUniqueIds(data_dir, prefix='trial=')
    feature_seqs = loadAll(trial_ids, 'feature-seq.pkl', data_dir)
    label_seqs = loadAll(trial_ids, 'label-seq.pkl', data_dir)

    # Define cross-validation folds
    dataset_size = len(trial_ids)
    cv_folds = utils.makeDataSplits(dataset_size, **cv_params)

    def getSplit(split_idxs):
        # Select (feature_seqs, label_seqs, trial_ids) entries by index.
        split_data = tuple(
            tuple(s[i] for i in split_idxs)
            for s in (feature_seqs, label_seqs, trial_ids)
        )
        return split_data

    for cv_index, cv_splits in enumerate(cv_folds):
        train_data, val_data, test_data = tuple(map(getSplit, cv_splits))

        train_feats, train_labels, train_ids = train_data
        test_feats, test_labels, test_ids = test_data
        val_feats, val_labels, val_ids = val_data

        logger.info(
            f'CV fold {cv_index + 1} / {len(cv_folds)}: {len(trial_ids)} total '
            f'({len(train_ids)} train, {len(val_ids)} val, {len(test_ids)} test)'
        )

        # FIXME: train_set is undefined -- dataset construction is missing.
        input_dim = train_set.num_obsv_dims
        output_dim = train_set.num_label_types

        # Was a dangling `elif` (syntax error); the preceding model branches
        # are missing. FIXME: restore the real model-construction branches.
        if model_name == 'dummy':
            # FIXME
            model = None
        else:
            raise AssertionError()

        # FIXME: metric_dict / test_io_history are undefined -- the training
        # and evaluation code is missing.
        metric_str = '  '.join(str(m) for m in metric_dict.values())
        logger.info('[TST]  ' + metric_str)

        d = {k: v.value for k, v in metric_dict.items()}
        utils.writeResults(
            results_file, d, sweep_param_name, model_params,
            write_mode=write_mode
        )

        if plot_predictions:
            io_fig_dir = os.path.join(fig_dir, 'model-io')
            os.makedirs(io_fig_dir, exist_ok=True)

            label_names = ('gt', 'pred')
            for batch in test_io_history:
                batch = map(lambda x: x.cpu().numpy(), batch)
                for preds, _, inputs, gt_labels, seq_id in zip(*batch):
                    fn = os.path.join(io_fig_dir, f"trial={seq_id}_model-io.png")
                    utils.plot_array(inputs, (gt_labels, preds), label_names, fn=fn)

        def saveTrialData(pred_seq, score_seq, feat_seq, label_seq, trial_id):
            # feat_seq is deliberately not saved.
            saveVariable(pred_seq, f'trial={trial_id}_pred-label-seq')
            saveVariable(score_seq, f'trial={trial_id}_score-seq')
            saveVariable(label_seq, f'trial={trial_id}_true-label-seq')
        for batch in test_io_history:
            batch = map(lambda x: x.cpu().numpy(), batch)
            for io in zip(*batch):
                saveTrialData(*io)

        saveVariable(train_ids, f'cvfold={cv_index}_train-ids')
        saveVariable(test_ids, f'cvfold={cv_index}_test-ids')
        saveVariable(val_ids, f'cvfold={cv_index}_val-ids')
        # FIXME: train_epoch_log / val_epoch_log are undefined (see above).
        saveVariable(train_epoch_log, f'cvfold={cv_index}_{model_name}-train-epoch-log')
        saveVariable(val_epoch_log, f'cvfold={cv_index}_{model_name}-val-epoch-log')

        train_fig_dir = os.path.join(fig_dir, 'train-plots')
        os.makedirs(train_fig_dir, exist_ok=True)
예제 #5
0
def main(out_dir=None,
         data_dir=None,
         segs_dir=None,
         scores_dir=None,
         vocab_dir=None,
         label_type='edges',
         gpu_dev_id=None,
         start_from=None,
         stop_at=None,
         num_disp_imgs=None,
         results_file=None,
         sweep_param_name=None,
         model_params=None,
         cv_params=None):
    """Evaluate predicted assembly/edge label sequences across CV folds.

    Converts between assembly-state and edge labels (whichever direction
    ``label_type`` requires), plots train/test edge-frequency statistics per
    fold, computes edge- and state-level metrics per test sequence, and
    optionally renders predicted/true assembly images.

    Parameters
    ----------
    out_dir, data_dir, segs_dir, scores_dir, vocab_dir : str
        Output, data, segment, score, and vocabulary directories
        ('~' is expanded in each).
    label_type : str
        'assembly' if the saved label sequences are assembly states,
        'edge'/'edges' if they are edge labels.
    gpu_dev_id : optional
        Device selector forwarded to ``torchutils.selectDevice``.
    start_from, stop_at : optional
        Unused here; kept for interface compatibility with sibling scripts.
    num_disp_imgs : int, optional
        If given, render assembly images for each test sequence.
    results_file : str, optional
        CSV path for metrics; defaults to '<out_dir>/results.csv'.
    sweep_param_name : str, optional
        Parameter-sweep column name passed to ``utils.writeResults``.
    model_params, cv_params : dict, optional
        Extra keyword args for result writing / fold construction.

    Raises
    ------
    ValueError
        If ``label_type`` is not recognized. (Previously the default
        'edges' matched neither branch, leaving the label sequences
        undefined and crashing later with a NameError.)
    """
    # Avoid shared mutable default arguments.
    if model_params is None:
        model_params = {}
    if cv_params is None:
        cv_params = {}

    data_dir = os.path.expanduser(data_dir)
    segs_dir = os.path.expanduser(segs_dir)
    scores_dir = os.path.expanduser(scores_dir)
    vocab_dir = os.path.expanduser(vocab_dir)
    out_dir = os.path.expanduser(out_dir)
    # exist_ok avoids the race between the existence check and creation.
    os.makedirs(out_dir, exist_ok=True)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    os.makedirs(fig_dir, exist_ok=True)

    io_dir_images = os.path.join(fig_dir, 'model-io_images')
    os.makedirs(io_dir_images, exist_ok=True)

    io_dir_plots = os.path.join(fig_dir, 'model-io_plots')
    os.makedirs(io_dir_plots, exist_ok=True)

    out_data_dir = os.path.join(out_dir, 'data')
    os.makedirs(out_data_dir, exist_ok=True)

    seq_ids = utils.getUniqueIds(scores_dir,
                                 prefix='trial=',
                                 suffix='score-seq.*',
                                 to_array=True)

    logger.info(
        f"Loaded scores for {len(seq_ids)} sequences from {scores_dir}")

    link_vocab = {}
    joint_vocab = {}
    joint_type_vocab = {}
    vocab, parts_vocab, part_labels = load_vocab(link_vocab, joint_vocab,
                                                 joint_type_vocab, vocab_dir)
    pred_vocab = []  # FIXME

    if label_type == 'assembly':
        logger.info("Converting assemblies -> edges")
        state_pred_seqs = tuple(
            utils.loadVariable(f"trial={seq_id}_pred-label-seq", scores_dir)
            for seq_id in seq_ids)
        state_true_seqs = tuple(
            utils.loadVariable(f"trial={seq_id}_true-label-seq", scores_dir)
            for seq_id in seq_ids)
        edge_pred_seqs = tuple(part_labels[seq] for seq in state_pred_seqs)
        edge_true_seqs = tuple(part_labels[seq] for seq in state_true_seqs)
    elif label_type in ('edge', 'edges'):
        # Accept both spellings: the default argument is 'edges' but this
        # branch previously only matched 'edge', so the default call crashed.
        logger.info("Converting edges -> assemblies (will take a few minutes)")
        edge_pred_seqs = tuple(
            utils.loadVariable(f"trial={seq_id}_pred-label-seq", scores_dir)
            for seq_id in seq_ids)
        edge_true_seqs = tuple(
            utils.loadVariable(f"trial={seq_id}_true-label-seq", scores_dir)
            for seq_id in seq_ids)
        state_pred_seqs = tuple(
            edges_to_assemblies(seq, pred_vocab, parts_vocab, part_labels)
            for seq in edge_pred_seqs)
        state_true_seqs = tuple(
            edges_to_assemblies(seq, vocab, parts_vocab, part_labels)
            for seq in edge_true_seqs)
    else:
        raise ValueError(f"Unrecognized label_type: {label_type}")

    device = torchutils.selectDevice(gpu_dev_id)
    dataset = sim2real.LabeledConnectionDataset(
        utils.loadVariable('parts-vocab', vocab_dir),
        utils.loadVariable('part-labels', vocab_dir),
        utils.loadVariable('vocab', vocab_dir),
        device=device)

    all_metrics = collections.defaultdict(list)

    # Define cross-validation folds
    cv_folds = utils.makeDataSplits(len(seq_ids), **cv_params)
    utils.saveVariable(cv_folds, 'cv-folds', out_data_dir)

    for cv_index, cv_fold in enumerate(cv_folds):
        train_indices, val_indices, test_indices = cv_fold
        logger.info(
            f"CV FOLD {cv_index + 1} / {len(cv_folds)}: "
            f"{len(train_indices)} train, {len(val_indices)} val, {len(test_indices)} test"
        )

        # Edge-frequency statistics on the training split.
        train_states = np.hstack(
            tuple(state_true_seqs[i] for i in train_indices))
        train_edges = part_labels[train_states]
        # state_train_vocab = np.unique(train_states)
        # edge_train_vocab = part_labels[state_train_vocab]
        train_freq_bigram, train_freq_unigram = edge_joint_freqs(train_edges)
        # state_probs = utils.makeHistogram(len(vocab), train_states, normalize=True)

        # ... and on the test split, for comparison.
        test_states = np.hstack(
            tuple(state_true_seqs[i] for i in test_indices))
        test_edges = part_labels[test_states]
        # state_test_vocab = np.unique(test_states)
        # edge_test_vocab = part_labels[state_test_vocab]
        test_freq_bigram, test_freq_unigram = edge_joint_freqs(test_edges)

        # Side-by-side train/test bigram frequency heatmaps.
        f, axes = plt.subplots(1, 2)
        axes[0].matshow(train_freq_bigram)
        axes[0].set_title('Train')
        axes[1].matshow(test_freq_bigram)
        axes[1].set_title('Test')
        plt.tight_layout()
        plt.savefig(
            os.path.join(fig_dir, f"edge-freqs-bigram_cvfold={cv_index}.png"))

        # Overlaid train/test unigram frequency stem plots.
        f, axis = plt.subplots(1)
        axis.stem(train_freq_unigram,
                  label='Train',
                  linefmt='C0-',
                  markerfmt='C0o')
        axis.stem(test_freq_unigram,
                  label='Test',
                  linefmt='C1--',
                  markerfmt='C1o')
        plt.legend()
        plt.tight_layout()
        plt.savefig(
            os.path.join(fig_dir, f"edge-freqs-unigram_cvfold={cv_index}.png"))

        for i in test_indices:
            seq_id = seq_ids[i]
            logger.info(f"  Processing sequence {seq_id}...")

            trial_prefix = f"trial={seq_id}"
            # I include the '.' to differentiate between 'rgb-frame-seq' and
            # 'rgb-frame-seq-before-first-touch'
            # rgb_seq = utils.loadVariable(f"{trial_prefix}_rgb-frame-seq.", data_dir)
            # seg_seq = utils.loadVariable(f"{trial_prefix}_seg-labels-seq", segs_dir)
            score_seq = utils.loadVariable(f"{trial_prefix}_score-seq",
                                           scores_dir)
            # if score_seq.shape[0] != rgb_seq.shape[0]:
            #     err_str = f"scores shape {score_seq.shape} != data shape {rgb_seq.shape}"
            #     raise AssertionError(err_str)

            edge_pred_seq = edge_pred_seqs[i]
            edge_true_seq = edge_true_seqs[i]
            state_pred_seq = state_pred_seqs[i]
            state_true_seq = state_true_seqs[i]

            num_types = np.unique(state_pred_seq).shape[0]
            num_samples = state_pred_seq.shape[0]
            num_total = len(pred_vocab)
            logger.info(
                f"    {num_types} assemblies predicted ({num_total} total); "
                f"{num_samples} samples")

            # edge_freq_bigram, edge_freq_unigram = edge_joint_freqs(edge_true_seq)
            # dist_shift = np.linalg.norm(train_freq_unigram - edge_freq_unigram)
            metric_dict = {
                # 'State OOV rate': oov_rate_state(state_true_seq, state_train_vocab),
                # 'Edge OOV rate': oov_rate_edges(edge_true_seq, edge_train_vocab),
                # 'State avg prob, true': state_probs[state_true_seq].mean(),
                # 'State avg prob, pred': state_probs[state_pred_seq].mean(),
                # 'Edge distribution shift': dist_shift
            }
            metric_dict = eval_edge_metrics(edge_pred_seq,
                                            edge_true_seq,
                                            append_to=metric_dict)
            metric_dict = eval_state_metrics(state_pred_seq,
                                             state_true_seq,
                                             append_to=metric_dict)
            for name, value in metric_dict.items():
                logger.info(f"    {name}: {value * 100:.2f}%")
                all_metrics[name].append(value)

            utils.writeResults(results_file, metric_dict, sweep_param_name,
                               model_params)

            if num_disp_imgs is not None:
                # Render one image per contiguous predicted-state segment.
                pred_images = tuple(
                    render(dataset, vocab[seg_label])
                    for seg_label in utils.computeSegments(state_pred_seq)[0])
                imageprocessing.displayImages(
                    *pred_images,
                    file_path=os.path.join(
                        io_dir_images,
                        f"seq={seq_id:03d}_pred-assemblies.png"),
                    num_rows=None,
                    num_cols=5)
                true_images = tuple(
                    render(dataset, vocab[seg_label])
                    for seg_label in utils.computeSegments(state_true_seq)[0])
                imageprocessing.displayImages(
                    *true_images,
                    file_path=os.path.join(
                        io_dir_images,
                        f"seq={seq_id:03d}_true-assemblies.png"),
                    num_rows=None,
                    num_cols=5)

                utils.plot_array(score_seq.T,
                                 (edge_true_seq.T, edge_pred_seq.T),
                                 ('true', 'pred'),
                                 fn=os.path.join(io_dir_plots,
                                                 f"seq={seq_id:03d}.png"))
예제 #6
0
def main(out_dir=None,
         data_dir=None,
         scores_dir=None,
         start_from=None,
         stop_at=None,
         results_file=None,
         sweep_param_name=None,
         cv_params={}):
    """Re-group flattened per-batch model outputs into per-trial arrays.

    Loads batched score / predicted-label / true-label arrays from
    ``scores_dir``, maps each batch row back to the trial sequence it came
    from (via the saved ``test-set-unflatten`` index), stacks them per
    trial, and writes the per-trial arrays to ``<out_dir>/data``.

    Parameters
    ----------
    out_dir, data_dir, scores_dir : str
        Output, raw-data, and model-scores directories (``~`` expanded).
    start_from, stop_at : unused here; kept for CLI compatibility.
    results_file : str, optional
        Unused by this script beyond path normalization.
    sweep_param_name : unused here; kept for CLI compatibility.
    cv_params : dict
        Keyword arguments forwarded to ``utils.makeDataSplits``.
    """
    data_dir = os.path.expanduser(data_dir)
    scores_dir = os.path.expanduser(scores_dir)
    out_dir = os.path.expanduser(out_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    io_dir = os.path.join(fig_dir, 'model-io')
    if not os.path.exists(io_dir):
        os.makedirs(io_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def loadVariable(var_name, from_dir=scores_dir):
        var = utils.loadVariable(var_name, from_dir)
        return var

    def saveVariable(var, var_name, to_dir=out_data_dir):
        # BUGFIX: previously always wrote to ``out_data_dir``, silently
        # ignoring the ``to_dir`` argument.
        utils.saveVariable(var, var_name, to_dir)

    def makeSeqBatches(unflatten, seq_ids):
        # Invert the flattening: map each sequence id to the batch-row
        # indices that belong to it, preserving batch order.
        d = collections.defaultdict(list)
        for batch_index, (seq_index, win_index) in enumerate(unflatten):
            seq_id = seq_ids[seq_index]
            d[seq_id].append(batch_index)
        return d

    def loadBatchData(cv_index, batch_index):
        prefix = f"cvfold={cv_index}_batch={batch_index}"
        batch_score = loadVariable(f"{prefix}_score-seq")
        batch_pred = loadVariable(f"{prefix}_pred-label-seq").astype(int)
        batch_true = loadVariable(f"{prefix}_true-label-seq").astype(int)
        return batch_score, batch_pred, batch_true

    # Copy vocabulary metadata through to the output directory
    vocab = loadVariable('vocab')
    parts_vocab = loadVariable('parts-vocab')
    edge_labels = loadVariable('part-labels')
    saveVariable(vocab, 'vocab')
    saveVariable(parts_vocab, 'parts-vocab')
    saveVariable(edge_labels, 'part-labels')

    trial_ids = utils.getUniqueIds(data_dir,
                                   prefix='trial=',
                                   suffix='',
                                   to_array=True)
    cv_folds = utils.makeDataSplits(len(trial_ids), **cv_params)

    for cv_index, (__, __, test_indices) in enumerate(cv_folds):
        logger.info(f"CV FOLD {cv_index + 1} / {len(cv_folds)}")
        test_ids = trial_ids[test_indices]
        unflatten = loadVariable(f"cvfold={cv_index}_test-set-unflatten")
        flatten = makeSeqBatches(unflatten, test_ids)
        for seq_id in test_ids:
            logger.info(f"  Processing sequence {seq_id}...")
            batch_idxs = flatten[seq_id]
            # Stack this sequence's batches back into contiguous arrays
            score_seq, pred_seq, true_seq = map(
                np.vstack,
                zip(*tuple(loadBatchData(cv_index, i) for i in batch_idxs)))

            trial_prefix = f"trial={seq_id}"
            # NOTE(review): the trailing '.' in this variable name looks like
            # a typo (siblings use names like "_score-seq") — confirm against
            # the files actually present in data_dir before changing it.
            rgb_seq = loadVariable(f"{trial_prefix}_rgb-frame-seq.",
                                   from_dir=data_dir)
            if score_seq.shape[0] != rgb_seq.shape[0]:
                err_str = f"scores shape {score_seq.shape} != data shape {rgb_seq.shape}"
                raise AssertionError(err_str)

            saveVariable(score_seq, f"{trial_prefix}_score-seq")
            saveVariable(pred_seq, f"{trial_prefix}_pred-label-seq")
            saveVariable(true_seq, f"{trial_prefix}_true-label-seq")
예제 #7
0
def main(out_dir=None,
         data_dir=None,
         scores_dir=None,
         model_name=None,
         model_params={},
         results_file=None,
         sweep_param_name=None,
         cv_params={},
         viz_params={},
         plot_predictions=None):
    """Evaluate IMU label sequences over cross-validation folds (stub).

    Loads IMU feature/label sequences (optionally replacing features with
    precomputed scores from ``scores_dir``), iterates CV folds, and logs
    per-fold metrics.  The metric computation and model inference are
    still FIXME stubs; this version guards them so the script degrades
    gracefully instead of raising ``TypeError``.

    Parameters
    ----------
    out_dir, data_dir, scores_dir : str
        Output, input, and (optional) score directories (``~`` expanded).
    model_name, model_params, viz_params : model/visualization config.
    results_file : str, optional
        CSV results path; defaults to ``<out_dir>/results.csv``.
    sweep_param_name : str, optional
        Column name for parameter sweeps in the results CSV.
    cv_params : dict
        Keyword arguments forwarded to ``utils.makeDataSplits``.
    plot_predictions : bool, optional
        If truthy, plot example predictions per fold.
    """
    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    # BUGFIX: create out_dir before the logger writes log.txt into it
    # (sibling scripts in this file all do this).
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def loadVariable(var_name):
        return joblib.load(os.path.join(data_dir, f'{var_name}.pkl'))

    def saveVariable(var, var_name):
        joblib.dump(var, os.path.join(out_data_dir, f'{var_name}.pkl'))

    # Load data
    trial_ids = loadVariable('trial_ids')
    feature_seqs = loadVariable('imu_sample_seqs')
    label_seqs = loadVariable('imu_label_seqs')

    if scores_dir is not None:
        # Replace raw features with precomputed score sequences
        scores_dir = os.path.expanduser(scores_dir)
        feature_seqs = tuple(
            joblib.load(
                os.path.join(scores_dir, f'trial={trial_id}_score-seq.pkl')).
            swapaxes(0, 1) for trial_id in trial_ids)

    # Define cross-validation folds
    dataset_size = len(trial_ids)
    cv_folds = utils.makeDataSplits(dataset_size, **cv_params)

    metric_dict = {'accuracy': [], 'edit_score': [], 'overlap_score': []}

    def getSplit(split_idxs):
        split_data = tuple(
            tuple(s[i] for i in split_idxs)
            for s in (feature_seqs, label_seqs, trial_ids))
        return split_data

    for cv_index, cv_splits in enumerate(cv_folds):
        train_data, val_data, test_data = tuple(map(getSplit, cv_splits))

        train_ids = train_data[-1]
        test_ids = test_data[-1]
        val_ids = val_data[-1]

        logger.info(
            f'CV fold {cv_index + 1}: {len(trial_ids)} total '
            f'({len(train_ids)} train, {len(val_ids)} val, {len(test_ids)} test)'
        )

        for name in metric_dict.keys():
            value = None  # FIXME
            metric_dict[name] += [value]
        # BUGFIX: formatting None with ':.1f' raises TypeError; print a
        # placeholder until the metric computation is implemented.
        metric_str = '  '.join(
            f"{k}: {v[-1]:.1f}%" if v[-1] is not None else f"{k}: n/a"
            for k, v in metric_dict.items())
        logger.info('[TST]  ' + metric_str)

        d = {k: v[-1] for k, v in metric_dict.items()}
        utils.writeResults(results_file, d, sweep_param_name, model_params)

        test_io_history = None  # FIXME

        # BUGFIX: both consumers below crashed on the None stub; skip them
        # until test_io_history is actually produced.
        if plot_predictions and test_io_history is not None:
            imu.plot_prediction_eg(test_io_history, fig_dir, **viz_params)

        def saveTrialData(pred_seq, score_seq, feat_seq, label_seq, trial_id):
            saveVariable(pred_seq, f'trial={trial_id}_pred-label-seq')
            saveVariable(score_seq, f'trial={trial_id}_score-seq')
            saveVariable(label_seq, f'trial={trial_id}_true-label-seq')

        if test_io_history is not None:
            for io in test_io_history:
                saveTrialData(*io)
예제 #8
0
def main(out_dir=None,
         data_dir=None,
         cv_data_dir=None,
         score_dirs=[],
         fusion_method='sum',
         prune_imu=None,
         standardize=None,
         decode=None,
         plot_predictions=None,
         results_file=None,
         sweep_param_name=None,
         gpu_dev_id=None,
         model_params={},
         cv_params={},
         train_params={},
         viz_params={}):
    """Fuse per-modality assembly scores and evaluate fused predictions.

    Loads score sequences from several single-modality models
    (``score_dirs``), combines them according to ``fusion_method``
    ('sum', 'rgb_only', or 'imu_only'), decodes a predicted assembly
    sequence per trial, and logs accuracy plus an error analysis of how
    many RGB errors the IMU scores could have corrected.  Fused scores
    and predicted/ground-truth assembly sequences are written to
    ``<out_dir>/data``.

    Parameters
    ----------
    out_dir, data_dir : str
        Output and raw-data directories (``~`` expanded).
    cv_data_dir : str, optional
        If given, CV folds, train assemblies, and models are loaded from
        here instead of being built from ``cv_params``.
    score_dirs : sequence of str
        One directory of ``*_data-scores.pkl`` files per modality.
        Modality order assumed below: index 0 = IMU, index 1 = RGB.
    fusion_method : {'sum', 'rgb_only', 'imu_only'}
    prune_imu, standardize : unused here; kept for CLI compatibility.
    decode : bool
        If falsy, fall back to per-frame argmax instead of a model.
    plot_predictions : bool
        If truthy, render assembly images, score plots, and path images.
    results_file, sweep_param_name : results-CSV configuration.
    gpu_dev_id : torch device selector.
    model_params, cv_params, train_params, viz_params : dicts of config.
    """
    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    score_dirs = tuple(map(os.path.expanduser, score_dirs))
    if cv_data_dir is not None:
        cv_data_dir = os.path.expanduser(cv_data_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def saveVariable(var, var_name):
        joblib.dump(var, os.path.join(out_data_dir, f'{var_name}.pkl'))

    def loadAll(seq_ids, var_name, data_dir):
        def loadOne(seq_id):
            fn = os.path.join(data_dir, f'trial={seq_id}_{var_name}')
            return joblib.load(fn)

        return tuple(map(loadOne, seq_ids))

    device = torchutils.selectDevice(gpu_dev_id)

    # Load data: only keep trials present in every score dir AND data_dir
    dir_trial_ids = tuple(
        set(utils.getUniqueIds(d, prefix='trial=', to_array=True))
        for d in score_dirs)
    dir_trial_ids += (set(
        utils.getUniqueIds(data_dir, prefix='trial=', to_array=True)), )
    trial_ids = np.array(list(sorted(set.intersection(*dir_trial_ids))))

    for dir_name, t_ids in zip(score_dirs + (data_dir, ), dir_trial_ids):
        logger.info(f"{len(t_ids)} trial ids from {dir_name}:")
        logger.info(f"  {t_ids}")
    logger.info(f"{len(trial_ids)} trials in intersection: {trial_ids}")

    assembly_seqs = loadAll(trial_ids, 'assembly-seq.pkl', data_dir)
    feature_seqs = tuple(
        loadAll(trial_ids, 'data-scores.pkl', d) for d in score_dirs)
    # Regroup from (modality, trial) to (trial, modality)
    feature_seqs = tuple(zip(*feature_seqs))

    # Combine feature seqs: drop trials whose modalities disagree on shape
    include_indices = []
    for i, seq_feats in enumerate(feature_seqs):
        feat_shapes = tuple(f.shape for f in seq_feats)
        include_seq = all(f == feat_shapes[0] for f in feat_shapes)
        if include_seq:
            include_indices.append(i)
        else:
            warn_str = (
                f'Excluding trial {trial_ids[i]} with mismatched feature shapes: '
                f'{feat_shapes}')
            logger.warning(warn_str)

    trial_ids = trial_ids[include_indices]
    assembly_seqs = tuple(assembly_seqs[i] for i in include_indices)
    feature_seqs = tuple(feature_seqs[i] for i in include_indices)

    # Stack modalities: each trial becomes (num_modalities, ...) array
    feature_seqs = tuple(np.stack(f) for f in feature_seqs)

    # Define cross-validation folds
    if cv_data_dir is None:
        dataset_size = len(trial_ids)
        cv_folds = utils.makeDataSplits(dataset_size, **cv_params)
        cv_fold_trial_ids = tuple(
            tuple(map(lambda x: trial_ids[x], splits)) for splits in cv_folds)
    else:
        fn = os.path.join(cv_data_dir, 'cv-fold-trial-ids.pkl')
        cv_fold_trial_ids = joblib.load(fn)

    def getSplit(split_idxs):
        split_data = tuple(
            tuple(s[i] for i in split_idxs)
            for s in (feature_seqs, assembly_seqs, trial_ids))
        return split_data

    gt_scores = []
    all_scores = []
    num_keyframes_total = 0
    num_rgb_errors_total = 0
    num_correctable_errors_total = 0
    num_oov_total = 0
    num_changed_total = 0
    for cv_index, (train_ids, test_ids) in enumerate(cv_fold_trial_ids):
        try:
            test_idxs = np.array(
                [trial_ids.tolist().index(i) for i in test_ids])
        except ValueError:
            logger.info(
                f"  Skipping fold {cv_index}: missing test data {test_ids}")
            # BUGFIX: previously fell through and reused stale (or
            # undefined) test_idxs from the prior fold.  Also removed a
            # stray append of cv_index into include_indices (the trial
            # filter list), which was never read afterward.
            continue

        logger.info(f'CV fold {cv_index + 1}: {len(trial_ids)} total '
                    f'({len(train_ids)} train, {len(test_ids)} test)')

        # TRAIN PHASE
        if cv_data_dir is None:
            # BUGFIX: trial_ids is an ndarray, which has no .index method
            train_idxs = np.array(
                [trial_ids.tolist().index(i) for i in train_ids])
            train_assembly_seqs = tuple(assembly_seqs[i] for i in train_idxs)
            train_assemblies = []
            for seq in train_assembly_seqs:
                # gen_eq_classes appends new assemblies to train_assemblies
                # as a side effect; the generated labels are discarded here
                list(
                    labels.gen_eq_classes(seq,
                                          train_assemblies,
                                          equivalent=None))
            model = None
        else:
            fn = f'cvfold={cv_index}_train-assemblies.pkl'
            train_assemblies = joblib.load(os.path.join(cv_data_dir, fn))
            train_idxs = [
                i for i in range(len(trial_ids)) if i not in test_idxs
            ]

            fn = f'cvfold={cv_index}_model.pkl'
            model = joblib.load(os.path.join(cv_data_dir, fn))

        train_features, train_assembly_seqs, train_ids = getSplit(train_idxs)

        # NOTE(review): disabled training path, kept for reference.  It
        # references `model.parameters()` before FusionClassifier is
        # constructed and would need reordering before re-enabling.
        if False:
            train_labels = tuple(
                np.array(
                    list(
                        labels.gen_eq_classes(assembly_seq,
                                              train_assemblies,
                                              equivalent=None)), )
                for assembly_seq in train_assembly_seqs)

            train_set = torchutils.SequenceDataset(train_features,
                                                   train_labels,
                                                   seq_ids=train_ids,
                                                   device=device)
            train_loader = torch.utils.data.DataLoader(train_set,
                                                       batch_size=1,
                                                       shuffle=True)

            train_epoch_log = collections.defaultdict(list)
            # val_epoch_log = collections.defaultdict(list)
            metric_dict = {
                'Avg Loss': metrics.AverageLoss(),
                'Accuracy': metrics.Accuracy()
            }

            criterion = torch.nn.CrossEntropyLoss()
            optimizer_ft = torch.optim.Adam(model.parameters(),
                                            lr=1e-3,
                                            betas=(0.9, 0.999),
                                            eps=1e-08,
                                            weight_decay=0,
                                            amsgrad=False)
            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft,
                                                           step_size=1,
                                                           gamma=1.00)

            model = FusionClassifier(num_sources=train_features[0].shape[0])
            model, last_model_wts = torchutils.trainModel(
                model,
                criterion,
                optimizer_ft,
                lr_scheduler,
                train_loader,  # val_loader,
                device=device,
                metrics=metric_dict,
                train_epoch_log=train_epoch_log,
                # val_epoch_log=val_epoch_log,
                **train_params)

            torchutils.plotEpochLog(train_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Training performance',
                                    fn=os.path.join(
                                        fig_dir,
                                        f'cvfold={cv_index}_train-plot.png'))

        # Extend the assembly vocabulary with test-set assemblies (side
        # effect of gen_eq_classes); the per-trial labels are rebuilt in
        # the test loop below.
        test_assemblies = train_assemblies.copy()
        for feature_seq, gt_assembly_seq, trial_id in zip(
                *getSplit(test_idxs)):
            gt_seq = np.array(
                list(
                    labels.gen_eq_classes(gt_assembly_seq,
                                          test_assemblies,
                                          equivalent=None)))

        if plot_predictions:
            assembly_fig_dir = os.path.join(fig_dir, 'assembly-imgs')
            if not os.path.exists(assembly_fig_dir):
                os.makedirs(assembly_fig_dir)
            for i, assembly in enumerate(test_assemblies):
                assembly.draw(assembly_fig_dir, i)

        # TEST PHASE
        accuracies = []
        for feature_seq, gt_assembly_seq, trial_id in zip(
                *getSplit(test_idxs)):
            gt_seq = np.array(
                list(
                    labels.gen_eq_classes(gt_assembly_seq,
                                          test_assemblies,
                                          equivalent=None)))

            num_labels = gt_seq.shape[0]
            num_features = feature_seq.shape[-1]
            if num_labels != num_features:
                err_str = (f"Skipping trial {trial_id}: "
                           f"{num_labels} labels != {num_features} features")
                logger.info(err_str)
                continue

            # Ignore OOV states in ground-truth
            sample_idxs = np.arange(feature_seq.shape[-1])
            score_idxs = gt_seq[gt_seq < feature_seq.shape[1]]
            sample_idxs = sample_idxs[gt_seq < feature_seq.shape[1]]
            gt_scores.append(feature_seq[:, score_idxs, sample_idxs])
            all_scores.append(feature_seq.reshape(feature_seq.shape[0], -1))

            if fusion_method == 'sum':
                score_seq = feature_seq.sum(axis=0)
            elif fusion_method == 'rgb_only':
                score_seq = feature_seq[1]
            elif fusion_method == 'imu_only':
                score_seq = feature_seq[0]
            else:
                raise NotImplementedError()

            if not decode:
                model = None

            if model is None:
                # Per-frame maximum-likelihood decode
                pred_seq = score_seq.argmax(axis=0)
            elif isinstance(model, torch.nn.Module):
                inputs = torch.tensor(feature_seq[None, ...],
                                      dtype=torch.float,
                                      device=device)
                outputs = model.forward(inputs)
                pred_seq = model.predict(outputs)[0].cpu().numpy()
            else:
                dummy_samples = np.arange(score_seq.shape[1])
                pred_seq, _, _, _ = model.viterbi(dummy_samples,
                                                  log_likelihoods=score_seq,
                                                  ml_decode=(not decode))

            pred_assemblies = [train_assemblies[i] for i in pred_seq]
            gt_assemblies = [test_assemblies[i] for i in gt_seq]

            acc = metrics.accuracy_upto(pred_assemblies,
                                        gt_assemblies,
                                        equivalence=None)
            accuracies.append(acc)

            # Error analysis: modality 1 = RGB, modality 0 = IMU
            rgb_pred_seq = feature_seq[1].argmax(axis=0)
            num_changed = np.sum(rgb_pred_seq != pred_seq)
            rgb_is_wrong = rgb_pred_seq != gt_seq
            num_rgb_errors = np.sum(rgb_is_wrong)
            imu_scores = feature_seq[0]
            imu_scores_gt = np.array([
                imu_scores[s_idx,
                           t] if s_idx < imu_scores.shape[0] else -np.inf
                for t, s_idx in enumerate(gt_seq)
            ])
            imu_scores_rgb = np.array([
                imu_scores[s_idx,
                           t] if s_idx < imu_scores.shape[0] else -np.inf
                for t, s_idx in enumerate(rgb_pred_seq)
            ])
            # imu_scores_gt = imu_scores[gt_seq, range(len(rgb_pred_seq))]
            best_imu_scores = imu_scores.max(axis=0)
            imu_is_right = imu_scores_gt >= best_imu_scores
            rgb_pred_score_is_lower = imu_scores_gt > imu_scores_rgb
            is_correctable_error = rgb_is_wrong & imu_is_right & rgb_pred_score_is_lower
            num_correctable_errors = np.sum(is_correctable_error)
            # BUGFIX: guard the error-free case against division by zero
            prop_correctable = (num_correctable_errors /
                                num_rgb_errors if num_rgb_errors else 0.0)

            num_oov = np.sum(gt_seq >= len(train_assemblies))
            num_states = len(gt_seq)

            num_keyframes_total += num_states
            num_rgb_errors_total += num_rgb_errors
            num_correctable_errors_total += num_correctable_errors
            num_oov_total += num_oov
            num_changed_total += num_changed

            logger.info(f"  trial {trial_id}: {num_states} keyframes")
            logger.info(f"    accuracy (fused): {acc * 100:.1f}%")
            logger.info(
                f"    {num_oov} OOV states ({num_oov / num_states * 100:.1f}%)"
            )
            logger.info(
                f"    {num_rgb_errors} RGB errors; "
                f"{num_correctable_errors} correctable from IMU ({prop_correctable * 100:.1f}%)"
            )

            saveVariable(score_seq, f'trial={trial_id}_data-scores')
            saveVariable(pred_assemblies,
                         f'trial={trial_id}_pred-assembly-seq')
            saveVariable(gt_assemblies, f'trial={trial_id}_gt-assembly-seq')

            if plot_predictions:
                io_figs_dir = os.path.join(fig_dir, 'system-io')
                if not os.path.exists(io_figs_dir):
                    os.makedirs(io_figs_dir)
                fn = os.path.join(io_figs_dir, f'trial={trial_id:03}.png')
                utils.plot_array(feature_seq, (gt_seq, pred_seq, score_seq),
                                 ('gt', 'pred', 'scores'),
                                 fn=fn)

                score_figs_dir = os.path.join(fig_dir, 'modality-scores')
                if not os.path.exists(score_figs_dir):
                    os.makedirs(score_figs_dir)
                plot_scores(feature_seq,
                            k=25,
                            fn=os.path.join(score_figs_dir,
                                            f"trial={trial_id:03}.png"))

                paths_dir = os.path.join(fig_dir, 'path-imgs')
                if not os.path.exists(paths_dir):
                    os.makedirs(paths_dir)
                assemblystats.drawPath(pred_seq, trial_id,
                                       f"trial={trial_id}_pred-seq", paths_dir,
                                       assembly_fig_dir)
                assemblystats.drawPath(gt_seq, trial_id,
                                       f"trial={trial_id}_gt-seq", paths_dir,
                                       assembly_fig_dir)

                label_seqs = (gt_seq, ) + tuple(
                    scores.argmax(axis=0) for scores in feature_seq)
                label_seqs = np.row_stack(label_seqs)
                k = 10
                for i, scores in enumerate(feature_seq):
                    label_score_seqs = tuple(
                        np.array([
                            scores[s_idx,
                                   t] if s_idx < scores.shape[0] else -np.inf
                            for t, s_idx in enumerate(label_seq)
                        ]) for label_seq in label_seqs)
                    label_score_seqs = np.row_stack(label_score_seqs)
                    drawPaths(label_seqs,
                              f"trial={trial_id}_pred-scores_modality={i}",
                              paths_dir,
                              assembly_fig_dir,
                              path_scores=label_score_seqs)

                    topk_seq = (-scores).argsort(axis=0)[:k, :]
                    path_scores = np.column_stack(
                        tuple(scores[idxs, i]
                              for i, idxs in enumerate(topk_seq.T)))
                    drawPaths(topk_seq,
                              f"trial={trial_id}_topk_modality={i}",
                              paths_dir,
                              assembly_fig_dir,
                              path_scores=path_scores)
                label_score_seqs = tuple(
                    np.array([
                        score_seq[s_idx,
                                  t] if s_idx < score_seq.shape[0] else -np.inf
                        for t, s_idx in enumerate(label_seq)
                    ]) for label_seq in label_seqs)
                label_score_seqs = np.row_stack(label_score_seqs)
                drawPaths(label_seqs,
                          f"trial={trial_id}_pred-scores_fused",
                          paths_dir,
                          assembly_fig_dir,
                          path_scores=label_score_seqs)
                topk_seq = (-score_seq).argsort(axis=0)[:k, :]
                path_scores = np.column_stack(
                    tuple(score_seq[idxs, i]
                          for i, idxs in enumerate(topk_seq.T)))
                drawPaths(topk_seq,
                          f"trial={trial_id}_topk_fused",
                          paths_dir,
                          assembly_fig_dir,
                          path_scores=path_scores)

        if accuracies:
            fold_accuracy = float(np.array(accuracies).mean())
            # logger.info(f'  acc: {fold_accuracy * 100:.1f}%')
            metric_dict = {'Accuracy': fold_accuracy}
            utils.writeResults(results_file, metric_dict, sweep_param_name,
                               model_params)

    # Aggregate error analysis over all folds
    num_unexplained_errors = num_rgb_errors_total - (
        num_oov_total + num_correctable_errors_total)
    prop_correctable = num_correctable_errors_total / num_rgb_errors_total
    prop_oov = num_oov_total / num_rgb_errors_total
    prop_unexplained = num_unexplained_errors / num_rgb_errors_total
    prop_changed = num_changed_total / num_keyframes_total
    logger.info("PERFORMANCE ANALYSIS")
    logger.info(
        f"  {num_rgb_errors_total} / {num_keyframes_total} "
        f"RGB errors ({num_rgb_errors_total / num_keyframes_total * 100:.1f}%)"
    )
    logger.info(f"  {num_oov_total} / {num_rgb_errors_total} "
                f"RGB errors are OOV ({prop_oov * 100:.1f}%)")
    logger.info(f"  {num_correctable_errors_total} / {num_rgb_errors_total} "
                f"RGB errors are correctable ({prop_correctable * 100:.1f}%)")
    logger.info(f"  {num_unexplained_errors} / {num_rgb_errors_total} "
                f"RGB errors are unexplained ({prop_unexplained * 100:.1f}%)")
    logger.info(
        f"  {num_changed_total} / {num_keyframes_total} "
        f"Predictions changed after fusion ({prop_changed * 100:.1f}%)")

    gt_scores = np.hstack(tuple(gt_scores))
    plot_hists(np.exp(gt_scores),
               fn=os.path.join(fig_dir, "score-hists_gt.png"))
    all_scores = np.hstack(tuple(all_scores))
    plot_hists(np.exp(all_scores),
               fn=os.path.join(fig_dir, "score-hists_all.png"))
예제 #9
0
def main(out_dir=None,
         data_dir=None,
         model_name=None,
         predict_mode='classify',
         gpu_dev_id=None,
         batch_size=None,
         learning_rate=None,
         independent_signals=None,
         active_only=None,
         output_dim_from_vocab=False,
         prefix='trial=',
         feature_fn_format='feature-seq.pkl',
         label_fn_format='label_seq.pkl',
         dataset_params={},
         model_params={},
         cv_params={},
         train_params={},
         viz_params={},
         metric_names=['Loss', 'Accuracy', 'Precision', 'Recall', 'F1'],
         plot_predictions=None,
         results_file=None,
         sweep_param_name=None):
    """Train and evaluate a sequence classifier over cross-validation folds.

    Builds a ``utils.CvDataset`` from per-trial feature/label files,
    trains one of {'linear', 'conv', 'TCN', 'LSTM'} per fold, evaluates
    on the fold's test split, and writes metrics, per-trial predictions,
    model checkpoints, and training plots under ``out_dir``.

    Parameters
    ----------
    out_dir, data_dir : str
        Output and input directories (``~`` expanded).
    model_name : {'linear', 'conv', 'TCN', 'LSTM'}
    predict_mode : {'classify', 'multiclass', 'binary multiclass'}
        Selects the loss function and label dtype.
    gpu_dev_id, batch_size, learning_rate : training configuration.
    independent_signals : bool
        If truthy, split each sequence into per-signal subsequences.
    active_only : bool
        Forwarded to ``splitSeqs`` for the train/val splits only.
    output_dim_from_vocab : bool
        If True, size the output layer by the full vocabulary.
    prefix, feature_fn_format, label_fn_format : file-naming conventions.
    dataset_params, model_params, cv_params, train_params, viz_params :
        Config dicts forwarded to the respective components.
    metric_names : list of str
        Metrics instantiated via ``metrics.makeMetric``.
    plot_predictions : bool
        If truthy, plot model inputs/outputs per test sequence.
    results_file, sweep_param_name : results-CSV configuration.
    """
    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def saveVariable(var, var_name, to_dir=out_data_dir):
        return utils.saveVariable(var, var_name, to_dir)

    # Load data
    device = torchutils.selectDevice(gpu_dev_id)
    trial_ids = utils.getUniqueIds(data_dir,
                                   prefix=prefix,
                                   suffix=feature_fn_format,
                                   to_array=True)
    dataset = utils.CvDataset(
        trial_ids,
        data_dir,
        prefix=prefix,
        feature_fn_format=feature_fn_format,
        label_fn_format=label_fn_format,
    )
    utils.saveMetadata(dataset.metadata, out_data_dir)
    utils.saveVariable(dataset.vocab, 'vocab', out_data_dir)

    # Define cross-validation folds
    cv_folds = utils.makeDataSplits(len(trial_ids), **cv_params)
    utils.saveVariable(cv_folds, 'cv-folds', out_data_dir)

    if predict_mode == 'binary multiclass':
        # criterion = torchutils.BootstrappedCriterion(
        #     0.25, base_criterion=torch.nn.functional.binary_cross_entropy_with_logits,
        # )
        criterion = torch.nn.BCEWithLogitsLoss()
        labels_dtype = torch.float
    elif predict_mode == 'multiclass':
        criterion = torch.nn.CrossEntropyLoss()
        labels_dtype = torch.long
    elif predict_mode == 'classify':
        criterion = torch.nn.CrossEntropyLoss()
        labels_dtype = torch.long
    else:
        raise AssertionError()

    def make_dataset(feats, labels, ids, shuffle=True):
        dataset = torchutils.SequenceDataset(feats,
                                             labels,
                                             device=device,
                                             labels_dtype=labels_dtype,
                                             seq_ids=ids,
                                             **dataset_params)
        # BUGFIX: the loader previously hardcoded shuffle=True, silently
        # shuffling the test loader despite shuffle=False.
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=shuffle)
        return dataset, loader

    for cv_index, cv_fold in enumerate(cv_folds):
        train_data, val_data, test_data = dataset.getFold(cv_fold)
        if independent_signals:
            train_data = splitSeqs(*train_data, active_only=active_only)
            val_data = splitSeqs(*val_data, active_only=active_only)
            test_data = splitSeqs(*test_data, active_only=False)
        train_set, train_loader = make_dataset(*train_data, shuffle=True)
        test_set, test_loader = make_dataset(*test_data, shuffle=False)
        val_set, val_loader = make_dataset(*val_data, shuffle=True)

        logger.info(
            f'CV fold {cv_index + 1} / {len(cv_folds)}: {len(dataset.trial_ids)} total '
            f'({len(train_set)} train, {len(val_set)} val, {len(test_set)} test)'
        )

        logger.info(f'{train_set.num_label_types} unique labels in train set; '
                    f'vocab size is {len(dataset.vocab)}')

        input_dim = train_set.num_obsv_dims
        output_dim = train_set.num_label_types
        if output_dim_from_vocab:
            output_dim = len(dataset.vocab)

        if model_name == 'linear':
            model = torchutils.LinearClassifier(
                input_dim, output_dim, **model_params).to(device=device)
        elif model_name == 'conv':
            model = ConvClassifier(input_dim, output_dim,
                                   **model_params).to(device=device)
        elif model_name == 'TCN':
            if predict_mode == 'multiclass':
                num_multiclass = train_set[0][1].shape[-1]
                # Cover labels seen in any split, not just train
                output_dim = max([
                    train_set.num_label_types, test_set.num_label_types,
                    val_set.num_label_types
                ])
            else:
                num_multiclass = None
            model = TcnClassifier(input_dim,
                                  output_dim,
                                  num_multiclass=num_multiclass,
                                  **model_params).to(device=device)
        elif model_name == 'LSTM':
            if predict_mode == 'multiclass':
                num_multiclass = train_set[0][1].shape[-1]
                output_dim = max([
                    train_set.num_label_types, test_set.num_label_types,
                    val_set.num_label_types
                ])
            else:
                num_multiclass = None
            model = LstmClassifier(input_dim,
                                   output_dim,
                                   num_multiclass=num_multiclass,
                                   **model_params).to(device=device)
        else:
            raise AssertionError()

        optimizer_ft = torch.optim.Adam(model.parameters(),
                                        lr=learning_rate,
                                        betas=(0.9, 0.999),
                                        eps=1e-08,
                                        weight_decay=0,
                                        amsgrad=False)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft,
                                                       step_size=1,
                                                       gamma=1.00)

        train_epoch_log = collections.defaultdict(list)
        val_epoch_log = collections.defaultdict(list)
        metric_dict = {name: metrics.makeMetric(name) for name in metric_names}
        model, last_model_wts = torchutils.trainModel(
            model,
            criterion,
            optimizer_ft,
            lr_scheduler,
            train_loader,
            val_loader,
            device=device,
            metrics=metric_dict,
            train_epoch_log=train_epoch_log,
            val_epoch_log=val_epoch_log,
            **train_params)

        # Test model (fresh metric objects so train stats don't leak in)
        metric_dict = {name: metrics.makeMetric(name) for name in metric_names}
        test_io_history = torchutils.predictSamples(
            model.to(device=device),
            test_loader,
            criterion=criterion,
            device=device,
            metrics=metric_dict,
            data_labeled=True,
            update_model=False,
            seq_as_batch=train_params['seq_as_batch'],
            return_io_history=True)
        if independent_signals:
            test_io_history = tuple(joinSeqs(test_io_history))

        logger.info('[TST]  ' +
                    '  '.join(str(m) for m in metric_dict.values()))
        utils.writeResults(results_file,
                           {k: v.value
                            for k, v in metric_dict.items()}, sweep_param_name,
                           model_params)

        if plot_predictions:
            io_fig_dir = os.path.join(fig_dir, 'model-io')
            if not os.path.exists(io_fig_dir):
                os.makedirs(io_fig_dir)

            label_names = ('gt', 'pred')
            for batch in test_io_history:
                batch = tuple(
                    x.cpu().numpy() if isinstance(x, torch.Tensor) else x
                    for x in batch)
                for preds, _, inputs, gt_labels, seq_id in zip(*batch):
                    fn = os.path.join(io_fig_dir,
                                      f"{prefix}{seq_id}_model-io.png")
                    utils.plot_array(inputs, (gt_labels.T, preds.T),
                                     label_names,
                                     fn=fn)

        for batch in test_io_history:
            batch = tuple(x.cpu().numpy() if isinstance(x, torch.Tensor) else x
                          for x in batch)
            for pred_seq, score_seq, feat_seq, label_seq, trial_id in zip(
                    *batch):
                saveVariable(pred_seq, f'{prefix}{trial_id}_pred-label-seq')
                saveVariable(score_seq, f'{prefix}{trial_id}_score-seq')
                saveVariable(label_seq, f'{prefix}{trial_id}_true-label-seq')

        saveVariable(model, f'cvfold={cv_index}_{model_name}-best')

        train_fig_dir = os.path.join(fig_dir, 'train-plots')
        if not os.path.exists(train_fig_dir):
            os.makedirs(train_fig_dir)

        if train_epoch_log:
            torchutils.plotEpochLog(train_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Training performance',
                                    fn=os.path.join(
                                        train_fig_dir,
                                        f'cvfold={cv_index}_train-plot.png'))

        if val_epoch_log:
            torchutils.plotEpochLog(val_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Heldout performance',
                                    fn=os.path.join(
                                        train_fig_dir,
                                        f'cvfold={cv_index}_val-plot.png'))
예제 #10
0
def main(out_dir=None,
         data_dir=None,
         scores_dir=None,
         frames_dir=None,
         vocab_from_scores_dir=None,
         only_fold=None,
         plot_io=None,
         prefix='seq=',
         results_file=None,
         sweep_param_name=None,
         model_params={},
         cv_params={}):
    """Evaluate saved per-sequence label predictions against ground truth.

    Loads ``<prefix><id>_pred-label-seq`` / ``_true-label-seq`` /
    ``_score-seq`` variables from ``scores_dir``, computes metrics for each
    test sequence in every cross-validation fold, appends the metrics to a
    results CSV, and optionally plots scores against labels.

    Parameters
    ----------
    out_dir : str
        Output directory; 'figures' and 'data' subdirectories are created.
    data_dir : str
        Directory holding the dataset 'vocab' variable (used unless
        ``vocab_from_scores_dir`` is truthy).
    scores_dir : str
        Directory holding the per-sequence prediction/score variables.
    frames_dir : str
        Expanded below but otherwise unused in this function.
        # NOTE(review): confirm whether frames_dir is still needed.
    vocab_from_scores_dir : bool
        If truthy, load 'vocab' from ``scores_dir`` instead of ``data_dir``.
    only_fold : int, optional
        If given, evaluate only this CV fold index.
    plot_io : bool
        If truthy, save a per-sequence plot of scores vs. labels.
    prefix : str
        Filename prefix used to enumerate sequence ids (default 'seq=').
    results_file : str, optional
        CSV path for utils.writeResults; defaults to <out_dir>/results.csv.
    sweep_param_name, model_params, cv_params
        Forwarded to utils.writeResults / utils.makeDataSplits.
        # NOTE(review): mutable default args ({}) are shared across calls.
    """

    data_dir = os.path.expanduser(data_dir)
    scores_dir = os.path.expanduser(scores_dir)
    frames_dir = os.path.expanduser(frames_dir)
    out_dir = os.path.expanduser(out_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    io_dir_images = os.path.join(fig_dir, 'model-io_images')
    if not os.path.exists(io_dir_images):
        os.makedirs(io_dir_images)

    io_dir_plots = os.path.join(fig_dir, 'model-io_plots')
    if not os.path.exists(io_dir_plots):
        os.makedirs(io_dir_plots)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    # Sequence ids are recovered from the prediction filenames on disk.
    seq_ids = utils.getUniqueIds(scores_dir,
                                 prefix=prefix,
                                 suffix='pred-label-seq.*',
                                 to_array=True)

    logger.info(
        f"Loaded scores for {len(seq_ids)} sequences from {scores_dir}")

    if vocab_from_scores_dir:
        vocab = utils.loadVariable('vocab', scores_dir)
    else:
        vocab = utils.loadVariable('vocab', data_dir)

    # Accumulates each metric's per-sequence values across all folds.
    all_metrics = collections.defaultdict(list)

    # Define cross-validation folds
    cv_folds = utils.makeDataSplits(len(seq_ids), **cv_params)
    utils.saveVariable(cv_folds, 'cv-folds', out_data_dir)

    all_pred_seqs = []
    all_true_seqs = []

    for cv_index, cv_fold in enumerate(cv_folds):
        # Optionally restrict evaluation to a single fold.
        if only_fold is not None and cv_index != only_fold:
            continue

        train_indices, val_indices, test_indices = cv_fold
        logger.info(
            f"CV FOLD {cv_index + 1} / {len(cv_folds)}: "
            f"{len(train_indices)} train, {len(val_indices)} val, {len(test_indices)} test"
        )

        # Only the test split of each fold is evaluated.
        for i in test_indices:
            seq_id = seq_ids[i]
            logger.info(f"  Processing sequence {seq_id}...")

            trial_prefix = f"{prefix}{seq_id}"
            score_seq = utils.loadVariable(f"{trial_prefix}_score-seq",
                                           scores_dir)
            pred_seq = utils.loadVariable(f"{trial_prefix}_pred-label-seq",
                                          scores_dir)
            true_seq = utils.loadVariable(f"{trial_prefix}_true-label-seq",
                                          scores_dir)

            metric_dict = eval_metrics(pred_seq, true_seq)
            for name, value in metric_dict.items():
                logger.info(f"    {name}: {value * 100:.2f}%")
                all_metrics[name].append(value)

            # One results row per sequence, not per fold.
            utils.writeResults(results_file, metric_dict, sweep_param_name,
                               model_params)

            all_pred_seqs.append(pred_seq)
            all_true_seqs.append(true_seq)

            if plot_io:
                # NOTE(review): filename hard-codes 'seq=' instead of using
                # `prefix`; also assumes seq_id is an integer (:03d format).
                utils.plot_array(score_seq.T, (true_seq, pred_seq),
                                 ('true', 'pred'),
                                 fn=os.path.join(io_dir_plots,
                                                 f"seq={seq_id:03d}.png"))

    # NOTE(review): deliberately disabled dead code — the aggregate
    # confusion-matrix summary below never runs. Confirm whether it should
    # be re-enabled or removed.
    if False:
        confusions = metrics.confusionMatrix(all_pred_seqs, all_true_seqs,
                                             len(vocab))
        utils.saveVariable(confusions, "confusions", out_data_dir)

        per_class_acc, class_counts = metrics.perClassAcc(confusions,
                                                          return_counts=True)
        class_preds = confusions.sum(axis=1)
        logger.info(f"MACRO ACC: {np.nanmean(per_class_acc) * 100:.2f}%")

        metrics.plotConfusions(os.path.join(fig_dir, 'confusions.png'),
                               confusions, vocab)
        metrics.plotPerClassAcc(os.path.join(fig_dir,
                                             'per-class-results.png'), vocab,
                                per_class_acc, class_preds, class_counts)
예제 #11
0
def main(out_dir=None, data_dir=None, detections_dir=None, modality=None, normalization=None):
    """Re-score saved HMM assembly predictions and save normalized scores.

    For each cross-validation fold, loads a trained HMM and its per-trial
    data-score matrices, optionally renormalizes the scores, validates that
    argmax-decoding the scores reproduces the predictions saved alongside
    them, and writes the (possibly normalized) score arrays back out.

    Parameters
    ----------
    out_dir : str
        Output directory; 'figures' and 'data' subdirectories are created.
    data_dir : str
        Input directory with per-trial variables and 'cv-folds.pkl'.
    detections_dir : str, optional
        If given, per-frame pixel-label sequences are loaded from here;
        normalization is only applied when this directory is provided.
    modality : str
        Only 'RGB' is supported for 'per-pixel' normalization.
    normalization : str, optional
        One of None, 'per-pixel', or 'marginal'.
    """
    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    if detections_dir is not None:
        detections_dir = os.path.expanduser(detections_dir)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def saveVariable(var, var_name):
        # Persist a variable to the output data directory.
        joblib.dump(var, os.path.join(out_data_dir, f'{var_name}.pkl'))

    def loadVariable(var_name):
        # Load a variable from the input data directory.
        return joblib.load(os.path.join(data_dir, f'{var_name}.pkl'))

    def loadAll(var_name, trial_ids):
        # Load one variable for every trial id.
        def loadOne(trial_id):
            return loadVariable(f"trial-{trial_id}_{var_name}")
        return tuple(map(loadOne, trial_ids))

    # Load data
    trial_ids = utils.getUniqueIds(data_dir, prefix='trial-', to_array=True)

    # Define cross-validation folds (reuse the precomputed splits)
    dataset_size = len(trial_ids)
    cv_folds = utils.makeDataSplits(
        dataset_size,
        precomputed_fn=os.path.join(data_dir, 'cv-folds.pkl')
    )

    cv_folds = tuple(tuple(map(np.array, splits)) for splits in cv_folds)

    # Validate CV folds: each trial must fall in exactly one of the splits
    for train_idxs, test_idxs in cv_folds:
        num_train = len(train_idxs)
        num_test = len(test_idxs)
        num_total = len(trial_ids)
        if num_train + num_test != num_total:
            err_str = f"{num_train} train + {num_test} test != {num_total} total"
            raise AssertionError(err_str)

    cv_fold_trial_ids = tuple(
        tuple(map(lambda x: trial_ids[x], splits))
        for splits in cv_folds
    )
    saveVariable(cv_fold_trial_ids, "cv-fold-trial-ids")

    for cv_index, (train_idxs, test_idxs) in enumerate(cv_folds):
        logger.info(
            f'CV fold {cv_index + 1}: {len(trial_ids)} total '
            f'({len(train_idxs)} train, {len(test_idxs)} test)'
        )

        hmm = loadVariable(f"hmm-fold{cv_index}")
        train_assemblies = hmm.states

        saveVariable(train_assemblies, f"cvfold={cv_index}_train-assemblies")
        saveVariable(hmm, f"cvfold={cv_index}_model")

        for trial_id in trial_ids[test_idxs]:
            saved_predictions = loadVariable(f'trial-{trial_id}_pred-state-seq')
            # shape: (num_states, num_samples)
            data_scores = loadVariable(f'trial-{trial_id}_data-scores')

            # Skip trials whose score matrix doesn't match the state vocab.
            if data_scores.shape[0] != len(train_assemblies):
                warn_str = (
                    f"Trial {trial_id}: {data_scores.shape[0]} data scores "
                    f"!= {len(train_assemblies)} assemblies in vocab"
                )
                logger.warning(warn_str)
                continue

            if detections_dir is not None:
                fn = f'trial-{trial_id}_class-label-frame-seq.pkl'
                pixel_label_frame_seq = joblib.load(os.path.join(detections_dir, fn))
                for i, label_frame in enumerate(pixel_label_frame_seq):
                    if normalization == 'per-pixel':
                        # Normalize each frame's scores by the number of
                        # block pixels (label 3), times 3 channels for RGB.
                        px_is_blocks = label_frame == 3
                        denominator = px_is_blocks.sum()
                        if modality == 'RGB':
                            denominator *= 3
                        else:
                            raise NotImplementedError(f"Modality {modality} not recognized")
                        data_scores[:, i] /= denominator
                    elif normalization == 'marginal':
                        # This scalar value is broadcast to a uniform (log) prior
                        log_prior = -np.log(data_scores.shape[0])
                        joint_probs = data_scores + log_prior
                        data_marginals = scipy.special.logsumexp(joint_probs, axis=0)
                        data_scores = joint_probs - data_marginals
                        # NOTE(review): this renormalizes the whole matrix
                        # once per frame. The operation is idempotent after
                        # the first pass, so results are unchanged, but it
                        # could be hoisted out of the frame loop.
                    elif normalization is not None:
                        raise NotImplementedError()

            # Validate the loaded data: decoding the scores should reproduce
            # the predictions that were saved with them.
            pred_assembly_idxs = data_scores.argmax(axis=0)
            pred_assembly_seq = tuple(train_assemblies[i] for i in pred_assembly_idxs)

            # FIX: compare predicted against saved (was `p1 == p1`, which is
            # always True and made this check vacuous).
            preds_same = all(p1 == p2 for p1, p2 in zip(pred_assembly_seq, saved_predictions))
            if not preds_same:
                raise AssertionError('Computed predictions differ from saved predictions')

            saveVariable(data_scores, f"trial={trial_id}_data-scores")
예제 #12
0
def main(out_dir=None,
         data_dir=None,
         prefix='trial=',
         model_name=None,
         gpu_dev_id=None,
         batch_size=None,
         learning_rate=None,
         file_fn_format=None,
         label_fn_format=None,
         start_from=None,
         stop_at=None,
         model_params={},
         cv_params={},
         train_params={},
         viz_params={},
         num_disp_imgs=None,
         viz_templates=None,
         results_file=None,
         sweep_param_name=None):
    """Train and evaluate an image classifier over cross-validation folds.

    Builds a VideoDataset for each fold's train/val/test split, trains an
    ImageClassifier with Adam, evaluates on the test split, writes metrics
    to a results CSV, and saves per-batch predictions, the trained model,
    and training/validation performance plots.

    Parameters
    ----------
    out_dir, data_dir : str
        Output / input directories (user-expanded; subdirs are created).
    prefix : str
        Filename prefix used to enumerate trial ids (default 'trial=').
    model_name : str
        Used only to name the saved model file.
    gpu_dev_id : int, optional
        Device selector forwarded to torchutils.selectDevice.
    batch_size, learning_rate : numbers
        Dataset batch size and Adam learning rate.
    file_fn_format, label_fn_format : str
        Feature/label filename formats for utils.CvDataset.
    start_from, stop_at : int, optional
        Skip folds before `start_from`; stop after `stop_at`.
    model_params, cv_params, train_params : dict
        Forwarded to ImageClassifier / makeDataSplits / trainModel.
        NOTE: mutable default args ({}) are shared across calls.
    viz_params, num_disp_imgs, viz_templates
        Accepted for interface compatibility; unused in this function.
    results_file : str, optional
        CSV path for utils.writeResults; defaults to <out_dir>/results.csv.
    sweep_param_name : str, optional
        Forwarded to utils.writeResults.
    """

    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    io_dir = os.path.join(fig_dir, 'model-io')
    if not os.path.exists(io_dir):
        os.makedirs(io_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def saveVariable(var, var_name, to_dir=out_data_dir):
        # Persist a variable to the output data directory by default.
        utils.saveVariable(var, var_name, to_dir)

    # Load data
    trial_ids = utils.getUniqueIds(data_dir, prefix=prefix, to_array=True)
    vocab = utils.loadVariable('vocab', data_dir)
    saveVariable(vocab, 'vocab')

    # Define cross-validation folds
    data_loader = utils.CvDataset(trial_ids,
                                  data_dir,
                                  vocab=vocab,
                                  prefix=prefix,
                                  feature_fn_format=file_fn_format,
                                  label_fn_format=label_fn_format)
    cv_folds = utils.makeDataSplits(len(data_loader.trial_ids), **cv_params)

    device = torchutils.selectDevice(gpu_dev_id)
    labels_dtype = torch.long
    criterion = torch.nn.CrossEntropyLoss()
    metric_names = ('Loss', 'Accuracy')

    def make_dataset(fns, labels, ids, batch_mode='sample', shuffle=True):
        # Wrap one split in a VideoDataset + DataLoader pair.
        dataset = VideoDataset(fns,
                               labels,
                               device=device,
                               labels_dtype=labels_dtype,
                               seq_ids=ids,
                               batch_size=batch_size,
                               batch_mode=batch_mode)
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=1,
                                             shuffle=shuffle)
        return dataset, loader

    for cv_index, cv_fold in enumerate(cv_folds):
        # Allow resuming / truncating the fold sweep.
        if start_from is not None and cv_index < start_from:
            continue

        if stop_at is not None and cv_index > stop_at:
            break

        train_data, val_data, test_data = data_loader.getFold(cv_fold)
        train_set, train_loader = make_dataset(*train_data,
                                               batch_mode='flatten',
                                               shuffle=True)
        test_set, test_loader = make_dataset(*test_data,
                                             batch_mode='flatten',
                                             shuffle=False)
        val_set, val_loader = make_dataset(*val_data,
                                           batch_mode='flatten',
                                           shuffle=True)

        logger.info(
            f'CV fold {cv_index + 1} / {len(cv_folds)}: {len(data_loader.trial_ids)} total '
            f'({len(train_set)} train, {len(val_set)} val, {len(test_set)} test)'
        )

        model = ImageClassifier(len(vocab), **model_params)

        optimizer_ft = torch.optim.Adam(model.parameters(),
                                        lr=learning_rate,
                                        betas=(0.9, 0.999),
                                        eps=1e-08,
                                        weight_decay=0,
                                        amsgrad=False)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft,
                                                       step_size=1,
                                                       gamma=1.00)

        train_epoch_log = collections.defaultdict(list)
        val_epoch_log = collections.defaultdict(list)
        metric_dict = {name: metrics.makeMetric(name) for name in metric_names}
        model, last_model_wts = torchutils.trainModel(
            model,
            criterion,
            optimizer_ft,
            lr_scheduler,
            train_loader,
            val_loader,
            device=device,
            metrics=metric_dict,
            train_epoch_log=train_epoch_log,
            val_epoch_log=val_epoch_log,
            **train_params)

        # Test model with a fresh set of metric accumulators
        metric_dict = {name: metrics.makeMetric(name) for name in metric_names}
        test_io_history = torchutils.predictSamples(
            model.to(device=device),
            test_loader,
            criterion=criterion,
            device=device,
            metrics=metric_dict,
            data_labeled=True,
            update_model=False,
            seq_as_batch=train_params['seq_as_batch'],
            return_io_history=True)
        metric_str = '  '.join(str(m) for m in metric_dict.values())
        logger.info('[TST]  ' + metric_str)

        utils.writeResults(results_file,
                           {name: m.value
                            for name, m in metric_dict.items()},
                           sweep_param_name, model_params)

        # FIX: use a distinct name for the per-batch save prefix; the
        # original rebound `prefix`, shadowing the function parameter.
        for pred_seq, score_seq, feat_seq, label_seq, batch_id in test_io_history:
            batch_prefix = f'cvfold={cv_index}_batch={batch_id}'
            saveVariable(pred_seq.cpu().numpy(), f'{batch_prefix}_pred-label-seq')
            saveVariable(score_seq.cpu().numpy(), f'{batch_prefix}_score-seq')
            saveVariable(label_seq.cpu().numpy(), f'{batch_prefix}_true-label-seq')
        saveVariable(test_set.unflatten,
                     f'cvfold={cv_index}_test-set-unflatten')
        saveVariable(model, f'cvfold={cv_index}_{model_name}-best')

        if train_epoch_log:
            torchutils.plotEpochLog(train_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Training performance',
                                    fn=os.path.join(
                                        fig_dir,
                                        f'cvfold={cv_index}_train-plot.png'))

        if val_epoch_log:
            torchutils.plotEpochLog(val_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Heldout performance',
                                    fn=os.path.join(
                                        fig_dir,
                                        f'cvfold={cv_index}_val-plot.png'))
예제 #13
0
def main(out_dir=None,
         data_dir=None,
         scores_dirs={},
         vocab_from_scores_dir=None,
         only_fold=None,
         plot_io=None,
         prefix='seq=',
         results_file=None,
         sweep_param_name=None,
         model_params={},
         cv_params={}):
    """Compare confusion matrices and per-class accuracy of two experiments.

    Loads the saved 'confusions' variable from exactly two score
    directories, computes per-class accuracies for each, and plots the
    element-wise difference (first minus second) of both the confusion
    matrices and the accuracies.

    Parameters
    ----------
    out_dir, data_dir : str
        Output / input directories (user-expanded; subdirs are created).
    scores_dirs : dict
        Mapping of experiment name -> scores directory. Must contain
        exactly two entries. NOTE: mutable default arg is shared.
    vocab_from_scores_dir : bool
        If truthy, load each experiment's vocab from its scores dir
        instead of from ``data_dir``.
    only_fold, plot_io, results_file, sweep_param_name, model_params
        Accepted for interface compatibility; not used after setup.
        # NOTE(review): results_file is resolved but never written to —
        # confirm whether a writeResults call was dropped.
    prefix : str
        Filename prefix used to enumerate sequence ids.
    cv_params : dict
        Forwarded to utils.makeDataSplits.
    """

    data_dir = os.path.expanduser(data_dir)
    scores_dirs = {
        name: os.path.expanduser(dir_)
        for name, dir_ in scores_dirs.items()
    }
    out_dir = os.path.expanduser(out_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    # This script computes pairwise differences, so it needs exactly two
    # experiments to compare.
    if len(scores_dirs) != 2:
        err_str = (
            f"scores_dirs has {len(scores_dirs)} entries, but this script "
            "compares exactly 2")
        raise NotImplementedError(err_str)

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    seq_ids = utils.getUniqueIds(data_dir,
                                 prefix=prefix,
                                 suffix='labels.*',
                                 to_array=True)

    logger.info(f"Loaded scores for {len(seq_ids)} sequences from {data_dir}")

    # Define cross-validation folds (saved for provenance)
    cv_folds = utils.makeDataSplits(len(seq_ids), **cv_params)
    utils.saveVariable(cv_folds, 'cv-folds', out_data_dir)

    vocabs = {}
    confusions = {}
    accs = {}
    counts = {}
    for expt_name, scores_dir in scores_dirs.items():
        if vocab_from_scores_dir:
            vocabs[expt_name] = utils.loadVariable('vocab', scores_dir)
        else:
            vocabs[expt_name] = utils.loadVariable('vocab', data_dir)

        confusions[expt_name] = utils.loadVariable("confusions", scores_dir)
        per_class_accs, class_counts = metrics.perClassAcc(
            confusions[expt_name], return_counts=True)

        accs[expt_name] = per_class_accs
        counts[expt_name] = class_counts

    # Both experiments must agree on the vocab and class counts; reduce
    # raises if they differ.
    vocab = utils.reduce_all_equal(tuple(vocabs.values()))
    class_counts = utils.reduce_all_equal(tuple(counts.values()))

    first_name, second_name = scores_dirs.keys()
    confusions_diff = confusions[first_name] - confusions[second_name]
    acc_diff = accs[first_name] - accs[second_name]

    metrics.plotConfusions(
        os.path.join(fig_dir,
                     f'confusions_{first_name}-minus-{second_name}.png'),
        confusions_diff, vocab)
    metrics.plotPerClassAcc(
        os.path.join(fig_dir, f'accs_{first_name}-minus-{second_name}.png'),
        vocab, acc_diff, confusions_diff.sum(axis=1), class_counts)
예제 #14
0
def main(out_dir=None,
         data_dir=None,
         model_name=None,
         part_symmetries=None,
         gpu_dev_id=None,
         batch_size=None,
         learning_rate=None,
         model_params={},
         cv_params={},
         train_params={},
         viz_params={},
         plot_predictions=None,
         results_file=None,
         sweep_param_name=None):
    """Train and evaluate an assembly-sequence classifier with CV.

    Loads per-trial feature/label sequences, optionally collapses assembly
    labels into equivalence classes defined by part symmetries, trains one
    of three model types ('linear', 'dummy', 'AssemblyClassifier') per CV
    fold, evaluates on the test split, and saves predictions, logs, the
    model, and training plots.

    Parameters
    ----------
    out_dir, data_dir : str
        Output / input directories (user-expanded; subdirs are created).
    model_name : str
        One of 'linear', 'dummy', 'AssemblyClassifier'.
    part_symmetries : dict, optional
        Mapping of symmetry-group name -> tuple of part names; defaults to
        the beam/backrest groups below.
    gpu_dev_id, batch_size, learning_rate
        Device selector, DataLoader batch size, and Adam learning rate.
    model_params, cv_params, train_params, viz_params : dict
        Forwarded to the model / makeDataSplits / trainModel / plot_array.
        NOTE: mutable default args ({}) are shared across calls.
    plot_predictions : bool
        If truthy, plot model inputs/outputs for each test batch.
    results_file : str, optional
        CSV path for utils.writeResults; defaults to <out_dir>/results.csv.
    sweep_param_name : str, optional
        Forwarded to utils.writeResults.
    """

    if part_symmetries is None:
        part_symmetries = {
            'beam_side': ('backbeam_hole_1', 'backbeam_hole_2',
                          'frontbeam_hole_1', 'frontbeam_hole_2'),
            'beam_top': ('backbeam_hole_3', 'frontbeam_hole_3'),
            'backrest': ('backrest_hole_1', 'backrest_hole_2')
        }

    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def saveVariable(var, var_name):
        # Persist a variable to the output data directory.
        joblib.dump(var, os.path.join(out_data_dir, f'{var_name}.pkl'))

    def loadAll(seq_ids, var_name, data_dir):
        # Load one variable for every sequence id.
        def loadOne(seq_id):
            fn = os.path.join(data_dir, f'trial={seq_id}_{var_name}')
            return joblib.load(fn)

        return tuple(map(loadOne, seq_ids))

    # Load vocab
    with open(os.path.join(data_dir, "part-vocab.yaml"), 'rt') as f:
        link_vocab = yaml.safe_load(f)
    assembly_vocab = joblib.load(os.path.join(data_dir, 'assembly-vocab.pkl'))

    # Load data
    trial_ids = utils.getUniqueIds(data_dir, prefix='trial=')
    feature_seqs = loadAll(trial_ids, 'feature-seq.pkl', data_dir)
    label_seqs = loadAll(trial_ids, 'label-seq.pkl', data_dir)

    if part_symmetries:
        # Construct equivalence classes from vocab and remap labels onto
        # their equivalence-class indices.
        eq_classes, assembly_eq_classes, eq_class_vocab = makeEqClasses(
            assembly_vocab, part_symmetries)
        lib_assembly.writeAssemblies(
            os.path.join(fig_dir, 'eq-class-vocab.txt'), eq_class_vocab)
        label_seqs = tuple(assembly_eq_classes[label_seq]
                           for label_seq in label_seqs)
        saveVariable(eq_class_vocab, 'assembly-vocab')
    else:
        eq_classes = None

    def impute_nan(input_seq):
        # Replace NaN features with zero (currently unused — see note).
        input_is_nan = np.isnan(input_seq)
        logger.info(f"{input_is_nan.sum()} NaN elements")
        input_seq[input_is_nan] = 0  # np.nanmean(input_seq)
        return input_seq

    # NOTE(review): NaN imputation is currently disabled; confirm whether
    # feature_seqs = tuple(map(impute_nan, feature_seqs)) should run.

    for trial_id, label_seq, feat_seq in zip(trial_ids, label_seqs,
                                             feature_seqs):
        saveVariable(feat_seq, f"trial={trial_id}_feature-seq")
        saveVariable(label_seq, f"trial={trial_id}_label-seq")

    device = torchutils.selectDevice(gpu_dev_id)

    # Define cross-validation folds
    dataset_size = len(trial_ids)
    cv_folds = utils.makeDataSplits(dataset_size, **cv_params)

    def getSplit(split_idxs):
        # Gather (features, labels, ids) for the given indices.
        split_data = tuple(
            tuple(s[i] for i in split_idxs)
            for s in (feature_seqs, label_seqs, trial_ids))
        return split_data

    for cv_index, cv_splits in enumerate(cv_folds):
        train_data, val_data, test_data = tuple(map(getSplit, cv_splits))

        train_feats, train_labels, train_ids = train_data
        train_set = torchutils.SequenceDataset(train_feats,
                                               train_labels,
                                               device=device,
                                               labels_dtype=torch.long,
                                               seq_ids=train_ids,
                                               transpose_data=True)
        train_loader = torch.utils.data.DataLoader(train_set,
                                                   batch_size=batch_size,
                                                   shuffle=True)

        test_feats, test_labels, test_ids = test_data
        test_set = torchutils.SequenceDataset(test_feats,
                                              test_labels,
                                              device=device,
                                              labels_dtype=torch.long,
                                              seq_ids=test_ids,
                                              transpose_data=True)
        test_loader = torch.utils.data.DataLoader(test_set,
                                                  batch_size=batch_size,
                                                  shuffle=False)

        val_feats, val_labels, val_ids = val_data
        val_set = torchutils.SequenceDataset(val_feats,
                                             val_labels,
                                             device=device,
                                             labels_dtype=torch.long,
                                             seq_ids=val_ids,
                                             transpose_data=True)
        val_loader = torch.utils.data.DataLoader(val_set,
                                                 batch_size=batch_size,
                                                 shuffle=True)

        logger.info(
            f'CV fold {cv_index + 1} / {len(cv_folds)}: {len(trial_ids)} total '
            f'({len(train_ids)} train, {len(val_ids)} val, {len(test_ids)} test)'
        )

        input_dim = train_set.num_obsv_dims
        output_dim = train_set.num_label_types
        if model_name == 'linear':
            model = torchutils.LinearClassifier(
                input_dim, output_dim, **model_params).to(device=device)
        elif model_name == 'dummy':
            model = DummyClassifier(input_dim, output_dim, **model_params)
        elif model_name == 'AssemblyClassifier':
            model = AssemblyClassifier(assembly_vocab,
                                       link_vocab,
                                       eq_classes=eq_classes,
                                       **model_params)
        else:
            raise AssertionError()

        criterion = torch.nn.CrossEntropyLoss()
        # FIX: epoch logs are created unconditionally so the saveVariable /
        # plotting code below does not raise NameError when
        # model_name == 'dummy' (which skips training entirely).
        train_epoch_log = collections.defaultdict(list)
        val_epoch_log = collections.defaultdict(list)
        if model_name != 'dummy':
            metric_dict = {
                'Avg Loss': metrics.AverageLoss(),
                'Accuracy': metrics.Accuracy(),
                'Precision': metrics.Precision(),
                'Recall': metrics.Recall(),
                'F1': metrics.Fmeasure()
            }

            optimizer_ft = torch.optim.Adam(model.parameters(),
                                            lr=learning_rate,
                                            betas=(0.9, 0.999),
                                            eps=1e-08,
                                            weight_decay=0,
                                            amsgrad=False)
            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft,
                                                           step_size=1,
                                                           gamma=1.00)

            model, last_model_wts = torchutils.trainModel(
                model,
                criterion,
                optimizer_ft,
                lr_scheduler,
                train_loader,
                val_loader,
                device=device,
                metrics=metric_dict,
                train_epoch_log=train_epoch_log,
                val_epoch_log=val_epoch_log,
                **train_params)

        # FIX: only AssemblyClassifier defines _scale/_alpha; the guard
        # avoids AttributeError for the 'linear' and 'dummy' models.
        if hasattr(model, '_scale'):
            logger.info(f'scale={float(model._scale)}')
        if hasattr(model, '_alpha'):
            logger.info(f'alpha={float(model._alpha)}')

        # Test model with a fresh set of metric accumulators
        metric_dict = {
            'Avg Loss': metrics.AverageLoss(),
            'Accuracy': metrics.Accuracy(),
            'Precision': metrics.Precision(),
            'Recall': metrics.Recall(),
            'F1': metrics.Fmeasure()
        }
        test_io_history = torchutils.predictSamples(
            model.to(device=device),
            test_loader,
            criterion=criterion,
            device=device,
            metrics=metric_dict,
            data_labeled=True,
            update_model=False,
            seq_as_batch=train_params['seq_as_batch'],
            return_io_history=True)

        metric_str = '  '.join(str(m) for m in metric_dict.values())
        logger.info('[TST]  ' + metric_str)

        d = {k: v.value for k, v in metric_dict.items()}
        utils.writeResults(results_file, d, sweep_param_name, model_params)

        if plot_predictions:
            io_fig_dir = os.path.join(fig_dir, 'model-io')
            if not os.path.exists(io_fig_dir):
                os.makedirs(io_fig_dir)

            label_names = ('gt', 'pred')
            # FIX: removed a dead `zip(*test_io_history)` unpacking whose
            # names were immediately shadowed by the loop below.
            for batch in test_io_history:
                batch = tuple(
                    x.cpu().numpy() if isinstance(x, torch.Tensor) else x
                    for x in batch)
                for preds, _, inputs, gt_labels, seq_id in zip(*batch):
                    fn = os.path.join(io_fig_dir,
                                      f"trial={seq_id}_model-io.png")
                    utils.plot_array(inputs.sum(axis=-1), (gt_labels, preds),
                                     label_names,
                                     fn=fn,
                                     **viz_params)

        def saveTrialData(pred_seq, score_seq, feat_seq, label_seq, trial_id):
            # Save one test trial's predictions, scores, and true labels.
            saveVariable(pred_seq, f'trial={trial_id}_pred-label-seq')
            saveVariable(score_seq, f'trial={trial_id}_score-seq')
            saveVariable(label_seq, f'trial={trial_id}_true-label-seq')

        for batch in test_io_history:
            batch = tuple(x.cpu().numpy() if isinstance(x, torch.Tensor) else x
                          for x in batch)
            for io in zip(*batch):
                saveTrialData(*io)

        saveVariable(train_ids, f'cvfold={cv_index}_train-ids')
        saveVariable(test_ids, f'cvfold={cv_index}_test-ids')
        saveVariable(val_ids, f'cvfold={cv_index}_val-ids')
        saveVariable(train_epoch_log,
                     f'cvfold={cv_index}_{model_name}-train-epoch-log')
        saveVariable(val_epoch_log,
                     f'cvfold={cv_index}_{model_name}-val-epoch-log')
        saveVariable(metric_dict,
                     f'cvfold={cv_index}_{model_name}-metric-dict')
        saveVariable(model, f'cvfold={cv_index}_{model_name}-best')

        train_fig_dir = os.path.join(fig_dir, 'train-plots')
        if not os.path.exists(train_fig_dir):
            os.makedirs(train_fig_dir)

        if train_epoch_log:
            torchutils.plotEpochLog(train_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Training performance',
                                    fn=os.path.join(
                                        train_fig_dir,
                                        f'cvfold={cv_index}_train-plot.png'))

        if val_epoch_log:
            torchutils.plotEpochLog(val_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Heldout performance',
                                    fn=os.path.join(
                                        train_fig_dir,
                                        f'cvfold={cv_index}_val-plot.png'))
예제 #15
0
def main(out_dir=None,
         data_dir=None,
         segs_dir=None,
         pretrained_model_dir=None,
         model_name=None,
         gpu_dev_id=None,
         batch_size=None,
         learning_rate=None,
         start_from=None,
         stop_at=None,
         model_params={},
         cv_params={},
         train_params={},
         viz_params={},
         num_disp_imgs=None,
         viz_templates=None,
         results_file=None,
         sweep_param_name=None):
    """Train and evaluate a blocks-connection classifier with cross-validation.

    For each CV fold this builds train/val/test datasets from video sequences,
    trains a model (either a template-matching 'template' model or a
    'pretrained' scene classifier), evaluates it on the flattened test split,
    and saves predictions, epoch logs, metrics, and diagnostic figures under
    ``out_dir``.

    `start_from` / `stop_at` restrict which fold indices are processed.
    Results rows are appended to ``results_file`` (defaults to
    ``out_dir/results.csv``).

    Raises:
        AssertionError: if `model_name` is not 'template' or 'pretrained'.
    """
    data_dir = os.path.expanduser(data_dir)
    segs_dir = os.path.expanduser(segs_dir)
    pretrained_model_dir = os.path.expanduser(pretrained_model_dir)
    out_dir = os.path.expanduser(out_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    io_dir = os.path.join(fig_dir, 'model-io')
    if not os.path.exists(io_dir):
        os.makedirs(io_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def saveVariable(var, var_name, to_dir=out_data_dir):
        # Thin wrapper so call sites don't repeat the output directory.
        utils.saveVariable(var, var_name, to_dir)

    # Load data and the vocabularies produced by the pretraining stage.
    trial_ids = utils.getUniqueIds(data_dir, prefix='trial=', to_array=True)
    vocab = utils.loadVariable('vocab', pretrained_model_dir)
    parts_vocab = utils.loadVariable('parts-vocab', pretrained_model_dir)
    edge_labels = utils.loadVariable('part-labels', pretrained_model_dir)
    # Re-save alongside this run's outputs so downstream stages are
    # self-contained.
    saveVariable(vocab, 'vocab')
    saveVariable(parts_vocab, 'parts-vocab')
    saveVariable(edge_labels, 'part-labels')

    # Define cross-validation folds
    data_loader = VideoLoader(trial_ids,
                              data_dir,
                              segs_dir,
                              vocab=vocab,
                              label_fn_format='assembly-seq')
    cv_folds = utils.makeDataSplits(len(data_loader.trial_ids), **cv_params)

    Dataset = sim2real.BlocksConnectionDataset
    device = torchutils.selectDevice(gpu_dev_id)
    label_dtype = torch.long
    labels_dtype = torch.long  # FIXME
    criterion = torch.nn.CrossEntropyLoss()
    # BUG FIX: `metric_names` was previously assigned only inside the
    # 'pretrained' branch below, so running with model_name == 'template'
    # crashed with a NameError when `metric_dict` was built. Define it once
    # up front; the 'pretrained' branch used exactly this tuple.
    metric_names = ('Loss', 'Accuracy', 'Precision', 'Recall', 'F1')

    def make_dataset(labels, ids, batch_mode='sample', shuffle=True):
        # Wrap a (labels, ids) split in a Dataset + DataLoader pair. The
        # Dataset performs its own batching (batch_size/batch_mode), so the
        # torch loader uses batch_size=1.
        dataset = Dataset(vocab,
                          edge_labels,
                          label_dtype,
                          data_loader.loadData,
                          labels,
                          device=device,
                          labels_dtype=labels_dtype,
                          seq_ids=ids,
                          batch_size=batch_size,
                          batch_mode=batch_mode)
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=1,
                                             shuffle=shuffle)
        return dataset, loader

    for cv_index, cv_fold in enumerate(cv_folds):
        if start_from is not None and cv_index < start_from:
            continue

        if stop_at is not None and cv_index > stop_at:
            break

        train_data, val_data, test_data = data_loader.getFold(cv_fold)
        train_set, train_loader = make_dataset(*train_data,
                                               batch_mode='sample',
                                               shuffle=True)
        # Test data is flattened (not sampled) so every item is evaluated
        # exactly once, in order.
        test_set, test_loader = make_dataset(*test_data,
                                             batch_mode='flatten',
                                             shuffle=False)
        val_set, val_loader = make_dataset(*val_data,
                                           batch_mode='sample',
                                           shuffle=True)

        logger.info(
            f'CV fold {cv_index + 1} / {len(cv_folds)}: {len(data_loader.trial_ids)} total '
            f'({len(train_set)} train, {len(val_set)} val, {len(test_set)} test)'
        )

        logger.info(
            f"Class freqs (train): {np.squeeze(train_set.class_freqs)}")
        logger.info(f"Class freqs   (val): {np.squeeze(val_set.class_freqs)}")
        logger.info(f"Class freqs  (test): {np.squeeze(test_set.class_freqs)}")

        if model_name == 'template':
            model = sim2real.AssemblyClassifier(vocab, **model_params)
        elif model_name == 'pretrained':
            pretrained_model = utils.loadVariable("cvfold=0_model-best",
                                                  pretrained_model_dir)
            model = sim2real.SceneClassifier(pretrained_model, **model_params)
            criterion = torch.nn.CrossEntropyLoss()
            # criterion = torchutils.BootstrappedCriterion(
            #     0.25, base_criterion=torch.nn.functional.cross_entropy,
            # )
        else:
            raise AssertionError()

        optimizer_ft = torch.optim.Adam(model.parameters(),
                                        lr=learning_rate,
                                        betas=(0.9, 0.999),
                                        eps=1e-08,
                                        weight_decay=0,
                                        amsgrad=False)
        # gamma=1.00 keeps the learning rate constant; the scheduler is a
        # placeholder required by the training loop's interface.
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft,
                                                       step_size=1,
                                                       gamma=1.00)

        train_epoch_log = collections.defaultdict(list)
        val_epoch_log = collections.defaultdict(list)
        metric_dict = {name: metrics.makeMetric(name) for name in metric_names}
        model, last_model_wts = torchutils.trainModel(
            model,
            criterion,
            optimizer_ft,
            lr_scheduler,
            train_loader,
            val_loader,
            device=device,
            metrics=metric_dict,
            train_epoch_log=train_epoch_log,
            val_epoch_log=val_epoch_log,
            **train_params)

        # Test model with a fresh set of metric accumulators.
        metric_dict = {name: metrics.makeMetric(name) for name in metric_names}
        test_io_history = torchutils.predictSamples(
            model.to(device=device),
            test_loader,
            criterion=criterion,
            device=device,
            metrics=metric_dict,
            data_labeled=True,
            update_model=False,
            seq_as_batch=train_params['seq_as_batch'],
            return_io_history=True)
        metric_str = '  '.join(str(m) for m in metric_dict.values())
        logger.info('[TST]  ' + metric_str)

        utils.writeResults(results_file,
                           {name: m.value
                            for name, m in metric_dict.items()},
                           sweep_param_name, model_params)

        # Persist per-batch predictions, scores, and ground truth.
        for pred_seq, score_seq, feat_seq, label_seq, batch_id in test_io_history:
            prefix = f'cvfold={cv_index}_batch={batch_id}'
            saveVariable(pred_seq.cpu().numpy(), f'{prefix}_pred-label-seq')
            saveVariable(score_seq.cpu().numpy(), f'{prefix}_score-seq')
            saveVariable(label_seq.cpu().numpy(), f'{prefix}_true-label-seq')
        saveVariable(test_set.unflatten,
                     f'cvfold={cv_index}_test-set-unflatten')
        saveVariable(model, f'cvfold={cv_index}_{model_name}-best')

        if train_epoch_log:
            torchutils.plotEpochLog(train_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Training performance',
                                    fn=os.path.join(
                                        fig_dir,
                                        f'cvfold={cv_index}_train-plot.png'))

        if val_epoch_log:
            torchutils.plotEpochLog(val_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Heldout performance',
                                    fn=os.path.join(
                                        fig_dir,
                                        f'cvfold={cv_index}_val-plot.png'))

        if model_name == 'pretrained' and num_disp_imgs is not None:
            cvfold_dir = os.path.join(io_dir, f'cvfold={cv_index}')
            if not os.path.exists(cvfold_dir):
                os.makedirs(cvfold_dir)
            model.plotBatches(test_io_history,
                              cvfold_dir,
                              images_per_fig=num_disp_imgs,
                              dataset=test_set)

        if model_name == 'template' and num_disp_imgs is not None:
            # NOTE: this rebinds `io_dir` to the same path created above; the
            # existence check below is therefore always a no-op after fold 0.
            io_dir = os.path.join(fig_dir, 'model-io')
            if not os.path.exists(io_dir):
                os.makedirs(io_dir)
            plot_topk(model, test_io_history, num_disp_imgs,
                      os.path.join(io_dir, f"cvfold={cv_index}.png"))

        if viz_templates:
            sim2real.viz_model_params(model, templates_dir=None)
예제 #16
0
def main(
        out_dir=None, data_dir=None, prefix='trial=',
        feature_fn_format='feature-seq.pkl', label_fn_format='label_seq.pkl',
        slowfast_labels_path=None, cv_params={}, slowfast_csv_params={}):
    """Make cross-validation folds and export per-split slowfast label CSVs.

    Builds a ``utils.CvDataset`` over the sequences found in ``data_dir``,
    saves its metadata and the fold definitions under ``out_dir/data``, then,
    for every ``slowfast-labels*.csv`` file in ``data_dir``, writes one CSV
    per (fold, split) restricted to the videos belonging to that split.
    """
    out_dir = os.path.expanduser(out_dir)
    data_dir = os.path.expanduser(data_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    # Create output directories up front.
    fig_dir = os.path.join(out_dir, 'figures')
    out_data_dir = os.path.join(out_dir, 'data')
    for directory in (fig_dir, out_data_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Load data
    seq_ids = utils.getUniqueIds(
        data_dir, prefix=prefix, suffix=feature_fn_format, to_array=True)
    dataset = utils.CvDataset(
        seq_ids, data_dir, feature_fn_format=feature_fn_format,
        label_fn_format=label_fn_format, vocab=[], prefix=prefix)
    utils.saveMetadata(dataset.metadata, out_data_dir)

    # Make folds
    cv_folds = utils.makeDataSplits(
        len(seq_ids), metadata=dataset.metadata, **cv_params)
    save_cv_folds(cv_folds, os.path.join(out_data_dir, 'cv-folds.json'))

    split_names = ('train', 'val', 'test')

    # Check folds
    for fold_index, fold in enumerate(cv_folds):
        (_, _, train_ids), (_, _, val_ids), (_, _, test_ids) = \
            dataset.getFold(fold)

        logger.info(
            f'CV fold {fold_index + 1} / {len(cv_folds)}: {len(seq_ids)} total '
            f'({len(train_ids)} train, {len(val_ids)} val, {len(test_ids)} test)'
        )

        labels_pattern = os.path.join(data_dir, 'slowfast-labels*.csv')
        for labels_path in glob.glob(labels_pattern):
            cv_str = f"cvfold={fold_index}"
            fn = os.path.basename(labels_path)
            slowfast_labels = pd.read_csv(
                labels_path, index_col=0, keep_default_na=False,
                **slowfast_csv_params
            )
            for split_indices, split_name in zip(fold, split_names):
                # Select the label rows whose video belongs to this split.
                vid_names = (
                    dataset.metadata.iloc[split_indices]['dir_name'].to_list())
                matches = tuple(
                    slowfast_labels.loc[slowfast_labels['video_name'] == name]
                    for name in vid_names
                )
                if not matches:
                    logger.info(f'  Skipping empty slowfast split: {split_name}')
                    continue
                split = pd.concat(matches, axis=0)
                split.to_csv(
                    os.path.join(out_data_dir, f"{cv_str}_{split_name}_{fn}"),
                    **slowfast_csv_params
                )
예제 #17
0
def main(out_dir=None,
         data_dir=None,
         model_name=None,
         pretrained_model_dir=None,
         gpu_dev_id=None,
         batch_size=None,
         learning_rate=None,
         independent_signals=None,
         active_only=None,
         model_params={},
         cv_params={},
         train_params={},
         viz_params={},
         plot_predictions=None,
         results_file=None,
         sweep_param_name=None,
         label_mapping=None,
         eval_label_mapping=None):
    """Train/evaluate sequence classifiers ('linear', 'conv', 'TCN') with CV.

    Two modes per fold:
      * If ``pretrained_model_dir`` is given, a previously-trained model and
        its train/val ID lists are loaded; only the held-out test split is
        evaluated and saved, then the fold is skipped (``continue``).
      * Otherwise a new model is trained on the fold's train/val splits and
        evaluated on the test split; predictions, logs, metrics, and the
        best/last model weights are saved.

    ``label_mapping`` remaps label values before training;
    ``eval_label_mapping`` triggers an extra remapped evaluation pass.

    Raises:
        AssertionError: if `model_name` is not 'linear', 'conv', or 'TCN'.
    """
    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def saveVariable(var, var_name):
        # Persist any picklable object under out_data_dir as <var_name>.pkl.
        joblib.dump(var, os.path.join(out_data_dir, f'{var_name}.pkl'))

    def loadAll(seq_ids, var_name, data_dir):
        # Load one pickled variable per sequence ID, in order.
        def loadOne(seq_id):
            fn = os.path.join(data_dir, f'trial={seq_id}_{var_name}')
            return joblib.load(fn)

        return tuple(map(loadOne, seq_ids))

    # Load data
    trial_ids = utils.getUniqueIds(data_dir, prefix='trial=', to_array=True)
    feature_seqs = loadAll(trial_ids, 'feature-seq.pkl', data_dir)
    label_seqs = loadAll(trial_ids, 'label-seq.pkl', data_dir)

    device = torchutils.selectDevice(gpu_dev_id)

    if label_mapping is not None:

        def map_labels(labels):
            # NOTE: remaps label values in place (labels[labels == i] = j).
            for i, j in label_mapping.items():
                labels[labels == i] = j
            return labels

        label_seqs = tuple(map(map_labels, label_seqs))

    # Define cross-validation folds
    dataset_size = len(trial_ids)
    cv_folds = utils.makeDataSplits(dataset_size, **cv_params)

    def getSplit(split_idxs):
        # Gather (features, labels, ids) for the given index set.
        split_data = tuple(
            tuple(s[i] for i in split_idxs)
            for s in (feature_seqs, label_seqs, trial_ids))
        return split_data

    for cv_index, cv_splits in enumerate(cv_folds):
        if pretrained_model_dir is not None:
            # Evaluation-only path: reuse a model trained in a previous run.

            def loadFromPretrain(fn):
                return joblib.load(
                    os.path.join(pretrained_model_dir, f"{fn}.pkl"))

            model = loadFromPretrain(f'cvfold={cv_index}_{model_name}-best')
            train_ids = loadFromPretrain(f'cvfold={cv_index}_train-ids')
            val_ids = loadFromPretrain(f'cvfold={cv_index}_val-ids')
            # The test split is everything not used for training/validation.
            test_ids = tuple(i for i in trial_ids
                             if i not in (train_ids + val_ids))
            test_idxs = tuple(trial_ids.tolist().index(i) for i in test_ids)
            test_data = getSplit(test_idxs)

            if independent_signals:
                criterion = torch.nn.CrossEntropyLoss()
                labels_dtype = torch.long
                # Keep all test samples (active_only=False) so evaluation
                # covers the full sequences.
                test_data = splitSeqs(*test_data, active_only=False)
            else:
                # FIXME
                # criterion = torch.nn.BCEWithLogitsLoss()
                # labels_dtype = torch.float
                criterion = torch.nn.CrossEntropyLoss()
                labels_dtype = torch.long

            test_feats, test_labels, test_ids = test_data
            test_set = torchutils.SequenceDataset(test_feats,
                                                  test_labels,
                                                  device=device,
                                                  labels_dtype=labels_dtype,
                                                  seq_ids=test_ids,
                                                  transpose_data=True)
            test_loader = torch.utils.data.DataLoader(test_set,
                                                      batch_size=batch_size,
                                                      shuffle=False)

            # Test model
            metric_dict = {
                'Avg Loss': metrics.AverageLoss(),
                'Accuracy': metrics.Accuracy(),
                'Precision': metrics.Precision(),
                'Recall': metrics.Recall(),
                'F1': metrics.Fmeasure()
            }
            test_io_history = torchutils.predictSamples(
                model.to(device=device),
                test_loader,
                criterion=criterion,
                device=device,
                metrics=metric_dict,
                data_labeled=True,
                update_model=False,
                seq_as_batch=train_params['seq_as_batch'],
                return_io_history=True)
            if independent_signals:
                # Re-join per-signal subsequences into whole trials.
                test_io_history = tuple(joinSeqs(test_io_history))

            metric_str = '  '.join(str(m) for m in metric_dict.values())
            logger.info('[TST]  ' + metric_str)

            d = {k: v.value for k, v in metric_dict.items()}
            utils.writeResults(results_file, d, sweep_param_name, model_params)

            if plot_predictions:
                imu.plot_prediction_eg(test_io_history, fig_dir, **viz_params)

            def saveTrialData(pred_seq, score_seq, feat_seq, label_seq,
                              trial_id):
                # Save one trial's predictions/scores/labels; if labels were
                # remapped, duplicate score columns so indices line up with
                # the original label space.
                if label_mapping is not None:

                    def dup_score_cols(scores):
                        num_cols = scores.shape[-1] + len(label_mapping)
                        col_idxs = torch.arange(num_cols)
                        for i, j in label_mapping.items():
                            col_idxs[i] = j
                        return scores[..., col_idxs]

                    score_seq = dup_score_cols(score_seq)
                saveVariable(pred_seq.cpu().numpy(),
                             f'trial={trial_id}_pred-label-seq')
                saveVariable(score_seq.cpu().numpy(),
                             f'trial={trial_id}_score-seq')
                saveVariable(label_seq.cpu().numpy(),
                             f'trial={trial_id}_true-label-seq')

            for io in test_io_history:
                saveTrialData(*io)
            # Evaluation-only mode: skip the training path entirely.
            continue

        # Training path: build fresh splits for this fold.
        train_data, val_data, test_data = tuple(map(getSplit, cv_splits))

        if independent_signals:
            criterion = torch.nn.CrossEntropyLoss()
            labels_dtype = torch.long
            # Train/val may drop inactive samples (active_only); test keeps
            # everything so evaluation covers full sequences.
            split_ = functools.partial(splitSeqs, active_only=active_only)
            train_data = split_(*train_data)
            val_data = split_(*val_data)
            test_data = splitSeqs(*test_data, active_only=False)
        else:
            # FIXME
            # criterion = torch.nn.BCEWithLogitsLoss()
            # labels_dtype = torch.float
            criterion = torch.nn.CrossEntropyLoss()
            labels_dtype = torch.long

        train_feats, train_labels, train_ids = train_data
        train_set = torchutils.SequenceDataset(train_feats,
                                               train_labels,
                                               device=device,
                                               labels_dtype=labels_dtype,
                                               seq_ids=train_ids,
                                               transpose_data=True)
        train_loader = torch.utils.data.DataLoader(train_set,
                                                   batch_size=batch_size,
                                                   shuffle=True)

        test_feats, test_labels, test_ids = test_data
        test_set = torchutils.SequenceDataset(test_feats,
                                              test_labels,
                                              device=device,
                                              labels_dtype=labels_dtype,
                                              seq_ids=test_ids,
                                              transpose_data=True)
        test_loader = torch.utils.data.DataLoader(test_set,
                                                  batch_size=batch_size,
                                                  shuffle=False)

        val_feats, val_labels, val_ids = val_data
        val_set = torchutils.SequenceDataset(val_feats,
                                             val_labels,
                                             device=device,
                                             labels_dtype=labels_dtype,
                                             seq_ids=val_ids,
                                             transpose_data=True)
        val_loader = torch.utils.data.DataLoader(val_set,
                                                 batch_size=batch_size,
                                                 shuffle=True)

        logger.info(
            f'CV fold {cv_index + 1} / {len(cv_folds)}: {len(trial_ids)} total '
            f'({len(train_ids)} train, {len(val_ids)} val, {len(test_ids)} test)'
        )

        input_dim = train_set.num_obsv_dims
        output_dim = train_set.num_label_types
        if model_name == 'linear':
            model = torchutils.LinearClassifier(
                input_dim, output_dim, **model_params).to(device=device)
        elif model_name == 'conv':
            model = ConvClassifier(input_dim, output_dim,
                                   **model_params).to(device=device)
        elif model_name == 'TCN':
            model = TcnClassifier(input_dim, output_dim, **model_params)
        else:
            raise AssertionError()

        train_epoch_log = collections.defaultdict(list)
        val_epoch_log = collections.defaultdict(list)
        metric_dict = {
            'Avg Loss': metrics.AverageLoss(),
            'Accuracy': metrics.Accuracy(),
            'Precision': metrics.Precision(),
            'Recall': metrics.Recall(),
            'F1': metrics.Fmeasure()
        }

        optimizer_ft = torch.optim.Adam(model.parameters(),
                                        lr=learning_rate,
                                        betas=(0.9, 0.999),
                                        eps=1e-08,
                                        weight_decay=0,
                                        amsgrad=False)
        # gamma=1.00 keeps the learning rate constant; scheduler is a
        # placeholder required by the training-loop interface.
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft,
                                                       step_size=1,
                                                       gamma=1.00)

        model, last_model_wts = torchutils.trainModel(
            model,
            criterion,
            optimizer_ft,
            lr_scheduler,
            train_loader,
            val_loader,
            device=device,
            metrics=metric_dict,
            train_epoch_log=train_epoch_log,
            val_epoch_log=val_epoch_log,
            **train_params)

        # Test model (fresh metric accumulators, training metrics saved later)
        metric_dict = {
            'Avg Loss': metrics.AverageLoss(),
            'Accuracy': metrics.Accuracy(),
            'Precision': metrics.Precision(),
            'Recall': metrics.Recall(),
            'F1': metrics.Fmeasure()
        }
        test_io_history = torchutils.predictSamples(
            model.to(device=device),
            test_loader,
            criterion=criterion,
            device=device,
            metrics=metric_dict,
            data_labeled=True,
            update_model=False,
            seq_as_batch=train_params['seq_as_batch'],
            return_io_history=True)
        if independent_signals:
            test_io_history = tuple(joinSeqs(test_io_history))

        metric_str = '  '.join(str(m) for m in metric_dict.values())
        logger.info('[TST]  ' + metric_str)

        d = {k: v.value for k, v in metric_dict.items()}
        utils.writeResults(results_file, d, sweep_param_name, model_params)

        if plot_predictions:
            # imu.plot_prediction_eg(test_io_history, fig_dir, fig_type=fig_type, **viz_params)
            imu.plot_prediction_eg(test_io_history, fig_dir, **viz_params)

        def saveTrialData(pred_seq, score_seq, feat_seq, label_seq, trial_id):
            # Same saving logic as the pretrained path above (duplicated).
            if label_mapping is not None:

                def dup_score_cols(scores):
                    num_cols = scores.shape[-1] + len(label_mapping)
                    col_idxs = torch.arange(num_cols)
                    for i, j in label_mapping.items():
                        col_idxs[i] = j
                    return scores[..., col_idxs]

                score_seq = dup_score_cols(score_seq)
            saveVariable(pred_seq.cpu().numpy(),
                         f'trial={trial_id}_pred-label-seq')
            saveVariable(score_seq.cpu().numpy(),
                         f'trial={trial_id}_score-seq')
            saveVariable(label_seq.cpu().numpy(),
                         f'trial={trial_id}_true-label-seq')

        for io in test_io_history:
            saveTrialData(*io)

        # Persist fold-level artifacts.
        saveVariable(train_ids, f'cvfold={cv_index}_train-ids')
        saveVariable(test_ids, f'cvfold={cv_index}_test-ids')
        saveVariable(val_ids, f'cvfold={cv_index}_val-ids')
        saveVariable(train_epoch_log,
                     f'cvfold={cv_index}_{model_name}-train-epoch-log')
        saveVariable(val_epoch_log,
                     f'cvfold={cv_index}_{model_name}-val-epoch-log')
        saveVariable(metric_dict,
                     f'cvfold={cv_index}_{model_name}-metric-dict')
        saveVariable(model, f'cvfold={cv_index}_{model_name}-best')

        # Also save the final-epoch weights alongside the best ones.
        model.load_state_dict(last_model_wts)
        saveVariable(model, f'cvfold={cv_index}_{model_name}-last')

        torchutils.plotEpochLog(train_epoch_log,
                                subfig_size=(10, 2.5),
                                title='Training performance',
                                fn=os.path.join(
                                    fig_dir,
                                    f'cvfold={cv_index}_train-plot.png'))

        if val_epoch_log:
            torchutils.plotEpochLog(val_epoch_log,
                                    subfig_size=(10, 2.5),
                                    title='Heldout performance',
                                    fn=os.path.join(
                                        fig_dir,
                                        f'cvfold={cv_index}_val-plot.png'))

        if eval_label_mapping is not None:
            # Extra evaluation pass with labels remapped at prediction time.
            metric_dict = {
                'Avg Loss': metrics.AverageLoss(),
                'Accuracy': metrics.Accuracy(),
                'Precision': metrics.Precision(),
                'Recall': metrics.Recall(),
                'F1': metrics.Fmeasure()
            }
            test_io_history = torchutils.predictSamples(
                model.to(device=device),
                test_loader,
                criterion=criterion,
                device=device,
                metrics=metric_dict,
                data_labeled=True,
                update_model=False,
                seq_as_batch=train_params['seq_as_batch'],
                return_io_history=True,
                label_mapping=eval_label_mapping)
            if independent_signals:
                test_io_history = joinSeqs(test_io_history)
            metric_str = '  '.join(str(m) for m in metric_dict.values())
            logger.info('[TST]  ' + metric_str)
예제 #18
0
def main(out_dir=None,
         data_dir=None,
         assembly_data_dir=None,
         scores_dir=None,
         event_attr_fn=None,
         connection_attr_fn=None,
         assembly_attr_fn=None,
         only_fold=None,
         plot_io=None,
         prefix='seq=',
         stop_after=None,
         background_action='',
         model_params={},
         cv_params={},
         stride=None,
         results_file=None,
         sweep_param_name=None):
    """Decode assembly-action sequences from per-frame event scores.

    Loads event/connection/assembly attribute files, builds an
    ``AssemblyActionRecognizer`` per CV fold, then for each test sequence
    decodes the stored event score sequence, evaluates the predictions
    against the true labels, and saves scores/predictions/labels plus
    optional diagnostic plots.

    ``only_fold`` restricts processing to a single fold index;
    ``stop_after`` caps the number of test sequences decoded per fold.
    """
    data_dir = os.path.expanduser(data_dir)
    assembly_data_dir = os.path.expanduser(assembly_data_dir)
    scores_dir = os.path.expanduser(scores_dir)
    event_attr_fn = os.path.expanduser(event_attr_fn)
    connection_attr_fn = os.path.expanduser(connection_attr_fn)
    assembly_attr_fn = os.path.expanduser(assembly_attr_fn)
    out_dir = os.path.expanduser(out_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    logger = utils.setupRootLogger(filename=os.path.join(out_dir, 'log.txt'))

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    misc_dir = os.path.join(out_dir, 'misc')
    if not os.path.exists(misc_dir):
        os.makedirs(misc_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    seq_ids = utils.getUniqueIds(data_dir,
                                 prefix=prefix,
                                 suffix='labels.*',
                                 to_array=True)

    # Labels only — features are read per-sequence from scores_dir below.
    dataset = utils.FeaturelessCvDataset(seq_ids,
                                         data_dir,
                                         prefix=prefix,
                                         label_fn_format='labels')

    logger.info(
        f"Loaded scores for {len(seq_ids)} sequences from {scores_dir}")

    # Define cross-validation folds
    cv_folds = utils.makeDataSplits(len(seq_ids), **cv_params)
    utils.saveVariable(cv_folds, 'cv-folds', out_data_dir)

    # Load event, connection attributes. Assemblies are canonicalized by
    # sorting each joint's parts and then the joints themselves, so
    # equivalent assemblies compare equal regardless of stored order.
    assembly_vocab = tuple(
        tuple(sorted(tuple(sorted(joint)) for joint in a))
        for a in utils.loadVariable('vocab', assembly_data_dir))
    (*probs, assembly_transition_probs), vocabs = loadPartInfo(
        event_attr_fn,
        connection_attr_fn,
        assembly_attr_fn,
        background_action=background_action,
        assembly_vocab=assembly_vocab)
    event_assembly_scores = event_to_assembly_scores(*probs, vocabs)
    assembly_transition_scores = np.log(assembly_transition_probs)
    # Visualize transition probabilities (exp undoes the log-space scores).
    viz_transition_probs(os.path.join(fig_dir, 'action-transitions'),
                         np.exp(event_assembly_scores), vocabs['event_vocab'])
    write_transition_probs(os.path.join(misc_dir, 'action-transitions'),
                           np.exp(event_assembly_scores),
                           vocabs['event_vocab'], vocabs['assembly_vocab'])

    for cv_index, cv_fold in enumerate(cv_folds):
        if only_fold is not None and cv_index != only_fold:
            continue

        train_indices, val_indices, test_indices = cv_fold
        logger.info(
            f"CV FOLD {cv_index + 1} / {len(cv_folds)}: "
            f"{len(train_indices)} train, {len(val_indices)} val, {len(test_indices)} test"
        )

        train_data, val_data, test_data = dataset.getFold(cv_fold)

        cv_str = f'cvfold={cv_index}'

        class_priors, event_dur_probs = count_priors(train_data[0],
                                                     len(dataset.vocab),
                                                     stride=stride,
                                                     approx_upto=0.95,
                                                     support_only=True)
        event_dur_scores = np.log(event_dur_probs)
        # Duration scores are zeroed out: durations are counted but
        # deliberately not used by the decoder.
        event_dur_scores = np.zeros_like(event_dur_scores)
        scores = (event_dur_scores, event_assembly_scores,
                  assembly_transition_scores)

        model = decode.AssemblyActionRecognizer(scores, vocabs, model_params)

        viz_priors(os.path.join(fig_dir, f'{cv_str}_priors'), class_priors,
                   event_dur_probs)
        model.write_fsts(os.path.join(misc_dir, f'{cv_str}_fsts'))
        model.save_vocabs(os.path.join(out_data_dir, f'{cv_str}_model-vocabs'))

        for i, (_, seq_id) in enumerate(zip(*test_data)):
            if stop_after is not None and i >= stop_after:
                break

            trial_prefix = f"{prefix}{seq_id}"

            # NOTE(review): if model_params['return_label'] is neither
            # 'input' nor 'output', `true_seq` is never assigned and the
            # shape check below raises NameError — confirm config always
            # sets one of the two.
            if model_params['return_label'] == 'input':
                true_seq = utils.loadVariable(f"{trial_prefix}_true-label-seq",
                                              scores_dir)
            elif model_params['return_label'] == 'output':
                try:
                    true_seq = utils.loadVariable(f"{trial_prefix}_label-seq",
                                                  assembly_data_dir)
                    true_seq = true_seq[::stride]
                except AssertionError:
                    # logger.info(f'  Skipping sequence {seq_id}: {e}')
                    continue

            logger.info(f"  Processing sequence {seq_id}...")

            event_score_seq = utils.loadVariable(f"{trial_prefix}_score-seq",
                                                 scores_dir)

            if event_score_seq.shape[0] != true_seq.shape[0]:
                err_str = (f'Event scores shape {event_score_seq.shape} '
                           f'!= labels shape {true_seq.shape}')
                raise AssertionError(err_str)

            # FIXME: the serialized variables are probs, not log-probs
            # event_score_seq = suppress_nonmax(event_score_seq)
            # event_score_seq = np.log(event_score_seq)

            decode_score_seq = model.forward(event_score_seq)
            pred_seq = model.predict(decode_score_seq)

            metric_dict = eval_metrics(pred_seq, true_seq)
            for name, value in metric_dict.items():
                logger.info(f"    {name}: {value * 100:.2f}%")
            utils.writeResults(results_file, metric_dict, sweep_param_name,
                               model_params)

            utils.saveVariable(decode_score_seq, f'{trial_prefix}_score-seq',
                               out_data_dir)
            utils.saveVariable(pred_seq, f'{trial_prefix}_pred-label-seq',
                               out_data_dir)
            utils.saveVariable(true_seq, f'{trial_prefix}_true-label-seq',
                               out_data_dir)

            if plot_io:
                utils.plot_array(event_score_seq.T, (pred_seq.T, true_seq.T),
                                 ('pred', 'true'),
                                 fn=os.path.join(fig_dir,
                                                 f"seq={seq_id:03d}.png"))
                write_labels(
                    os.path.join(misc_dir, f"seq={seq_id:03d}_pred-seq.txt"),
                    pred_seq, model.output_vocab.as_raw())
                write_labels(
                    os.path.join(misc_dir, f"seq={seq_id:03d}_true-seq.txt"),
                    true_seq, model.output_vocab.as_raw())
예제 #19
0
def main(out_dir=None,
         data_dir=None,
         attributes_dir=None,
         use_gt_segments=None,
         segments_dir=None,
         cv_data_dir=None,
         ignore_trial_ids=None,
         gpu_dev_id=None,
         plot_predictions=None,
         results_file=None,
         sweep_param_name=None,
         model_params=None,
         cv_params=None,
         train_params=None,
         viz_params=None):
    """Score assembly attribute sequences against signatures from training data.

    For each cross-validation fold, build equivalence-class signatures from
    the training assemblies, score each test trial's feature sequence against
    them, and log/record per-fold accuracy (exact and up-to-component
    equivalence).

    Parameters
    ----------
    out_dir : str
        Output directory; ``figures/`` and ``data/`` subdirectories are created.
    data_dir : str
        Directory holding ``trial=<id>_assembly-seq.pkl`` files.
    attributes_dir : str
        Directory holding ``trial=<id>_score-seq.pkl`` feature files.
    use_gt_segments : bool, optional
        If truthy, reduce features over segments derived from the GT labels.
    segments_dir : str, optional
        Directory with ``trial=<id>_segment-seq-imu.pkl`` files; used when
        ``use_gt_segments`` is falsy.
    cv_data_dir : str, optional
        If given, load precomputed CV folds and train assemblies from here
        instead of generating them.
    ignore_trial_ids : collection, optional
        Trial ids to skip during the test phase.
    gpu_dev_id : optional
        Unused here; accepted for configuration compatibility.
    plot_predictions : bool, optional
        If truthy, save signature and per-trial prediction figures.
    results_file : str, optional
        CSV file for metrics; defaults to ``<out_dir>/results.csv``.
    sweep_param_name : str, optional
        Passed through to ``utils.writeResults``.
    model_params, cv_params, train_params, viz_params : dict, optional
        Extra parameter dicts; ``cv_params`` feeds ``utils.makeDataSplits``.
    """
    # FIX: replaced mutable default arguments ({}) with None sentinels.
    model_params = {} if model_params is None else model_params
    cv_params = {} if cv_params is None else cv_params
    train_params = {} if train_params is None else train_params
    viz_params = {} if viz_params is None else viz_params

    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    attributes_dir = os.path.expanduser(attributes_dir)
    if segments_dir is not None:
        segments_dir = os.path.expanduser(segments_dir)
    if cv_data_dir is not None:
        cv_data_dir = os.path.expanduser(cv_data_dir)

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def saveVariable(var, var_name):
        # Serialize one variable into the output data directory.
        joblib.dump(var, os.path.join(out_data_dir, f'{var_name}.pkl'))

    def loadAll(seq_ids, var_name, data_dir, prefix='trial='):
        # Load one pickled variable per sequence id.
        def loadOne(seq_id):
            fn = os.path.join(data_dir, f'{prefix}{seq_id}_{var_name}')
            return joblib.load(fn)

        return tuple(map(loadOne, seq_ids))

    # Load data
    trial_ids = utils.getUniqueIds(data_dir, prefix='trial=', to_array=True)
    assembly_seqs = loadAll(trial_ids,
                            'assembly-seq.pkl',
                            data_dir,
                            prefix='trial=')

    # Define cross-validation folds
    if cv_data_dir is None:
        dataset_size = len(trial_ids)
        cv_folds = utils.makeDataSplits(dataset_size, **cv_params)
        cv_fold_trial_ids = tuple(
            tuple(map(lambda x: trial_ids[x], splits)) for splits in cv_folds)
    else:
        fn = os.path.join(cv_data_dir, 'cv-fold-trial-ids.pkl')
        cv_fold_trial_ids = joblib.load(fn)

    # FIX: renamed to avoid shadowing the per-fold `test_ids` loop variable.
    all_test_ids = set().union(
        *tuple(set(fold[-1]) for fold in cv_fold_trial_ids))
    logger.info(f"{len(all_test_ids)} final test ids: {all_test_ids}")

    def getSplit(split_idxs):
        # Select (assembly_seqs, trial_ids) entries at the given indices.
        split_data = tuple(
            tuple(s[i] for i in split_idxs)
            for s in (assembly_seqs, trial_ids))
        return split_data

    for cv_index, (train_ids, test_ids) in enumerate(cv_fold_trial_ids):
        logger.info(f'CV fold {cv_index + 1}: {len(trial_ids)} total '
                    f'({len(train_ids)} train, {len(test_ids)} test)')

        try:
            test_idxs = torch.tensor(
                [trial_ids.tolist().index(i) for i in test_ids])
        except ValueError:
            logger.info("  Skipping fold: missing test data")
            continue

        # TRAIN PHASE
        if cv_data_dir is None:
            # FIX: trial_ids is a numpy array (to_array=True) and has no
            # .index method; convert to a list first, mirroring the
            # test_idxs lookup above.
            train_idxs = torch.tensor(
                [trial_ids.tolist().index(i) for i in train_ids])
            train_assembly_seqs = tuple(assembly_seqs[i] for i in train_idxs)
            train_assemblies = []
            for seq in train_assembly_seqs:
                # gen_eq_classes extends train_assemblies in place; the
                # returned class ids are not needed here.
                list(
                    labels.gen_eq_classes(seq,
                                          train_assemblies,
                                          equivalent=None))
        else:
            fn = os.path.join(cv_data_dir,
                              f'cvfold={cv_index}_train-assemblies.pkl')
            train_assemblies = joblib.load(fn)

        signatures = make_signatures(train_assemblies)

        if plot_predictions:
            figsize = (12, 3)
            fig, axis = plt.subplots(1, figsize=figsize)
            axis.imshow(signatures.numpy().T,
                        interpolation='none',
                        aspect='auto')
            plt.savefig(
                os.path.join(fig_dir, f"cvfold={cv_index}_signatures.png"))
            plt.close()

        # TEST PHASE
        accuracies = {'assembly': [], 'assembly_upto_eq': []}
        for gt_assembly_seq, trial_id in zip(*getSplit(test_idxs)):

            # FIX: this guard was disabled with `if False:`; restore the
            # documented ignore_trial_ids behavior.
            if ignore_trial_ids is not None and trial_id in ignore_trial_ids:
                logger.info(f"  Ignoring trial {trial_id} in test fold")
                continue

            # FIXME: implement consistent data dimensions during serialization
            #   (ie samples along rows)
            # feature_seq shape is (pairs, samples, classes)
            # should be (samples, pairs, classes)
            try:
                fn = os.path.join(attributes_dir,
                                  f'trial={trial_id}_score-seq.pkl')
                feature_seq = joblib.load(fn)
            except FileNotFoundError:
                logger.info(f"  File not found: {fn}")
                continue

            test_assemblies = train_assemblies.copy()
            gt_assembly_id_seq = list(
                labels.gen_eq_classes(gt_assembly_seq,
                                      test_assemblies,
                                      equivalent=None))
            gt_seq = makeAssemblyLabels(gt_assembly_id_seq, gt_assembly_seq)

            if use_gt_segments:
                segments = utils.makeSegmentLabels(gt_seq)
            elif segments_dir is not None:
                var_name = 'segment-seq-imu.pkl'
                fn = os.path.join(segments_dir, f'trial={trial_id}_{var_name}')
                try:
                    segments = joblib.load(fn)
                except FileNotFoundError:
                    logger.info(f"  File not found: {fn}")
                    continue
            else:
                segments = None

            feature_seq = torch.tensor(feature_seq, dtype=torch.float)

            if segments is not None:
                # Average features within each segment, then restore the
                # original axis order expected downstream.
                feature_seq = feature_seq.transpose(0, 1)

                feature_seq, _ = utils.reduce_over_segments(
                    feature_seq.numpy(),
                    segments,
                    reduction=lambda x: x.mean(axis=0))
                feature_seq = torch.tensor(feature_seq.swapaxes(0, 1),
                                           dtype=torch.float)

                gt_seq, _ = utils.reduce_over_segments(gt_seq.numpy(),
                                                       segments)
                gt_seq = torch.tensor(gt_seq, dtype=torch.long)

            feature_seq = make_features(feature_seq)
            score_seq = score(feature_seq, signatures)
            pred_seq = predict(score_seq)

            pred_assemblies = [train_assemblies[i] for i in pred_seq]
            gt_assemblies = [test_assemblies[i] for i in gt_seq]
            acc = metrics.accuracy_upto(pred_assemblies,
                                        gt_assemblies,
                                        equivalence=components_equivalent)
            accuracies['assembly_upto_eq'].append(acc)

            acc = metrics.accuracy_upto(pred_assemblies, gt_assemblies)
            accuracies['assembly'].append(acc)

            # FIXME: Improve data naming convention in decode_keyframes.py
            saveVariable(score_seq, f'trial={trial_id}_data-scores')

            if plot_predictions:
                fn = os.path.join(fig_dir, f'trial={trial_id:03}.png')
                utils.plot_array(
                    feature_seq,
                    (gt_seq.numpy(), pred_seq.numpy(), score_seq.numpy()),
                    ('gt', 'pred', 'scores'),
                    fn=fn)

        # Skip the fold entirely when no test trial produced a result.
        if not any(v for v in accuracies.values()):
            continue

        metric_dict = {}
        for name, fold_accuracies in accuracies.items():
            fold_accuracy = float(torch.tensor(fold_accuracies).mean())
            logger.info(f'  {name}: {fold_accuracy * 100:.1f}%')
            metric_dict[name] = fold_accuracy
        utils.writeResults(results_file, metric_dict, sweep_param_name,
                           model_params)
# Example 20
def main(out_dir=None,
         data_dir=None,
         cv_data_dir=None,
         scores_dir=None,
         eq_class='state index',
         plot_predictions=None,
         results_file=None,
         sweep_param_name=None,
         model_params=None,
         cv_params=None,
         train_params=None,
         viz_params=None):
    """Decode assembly predictions from saved score sequences and report accuracy.

    First computes and plots out-of-vocabulary (OOV) statistics for the
    assembly sequences, then, for each cross-validation fold, predicts an
    assembly label sequence for every test trial (argmax over the saved
    scores) and records per-fold accuracy.

    Parameters
    ----------
    out_dir : str
        Output directory; ``figures/`` and ``data/`` subdirectories are created.
    data_dir : str
        Directory holding ``trial=<id>_assembly-seq.pkl`` files.
    cv_data_dir : str, optional
        If given, load precomputed CV folds and train assemblies from here
        instead of generating them.
    scores_dir : str
        Directory holding ``trial=<id>_data-scores.pkl`` files.
    eq_class : str, optional
        Equivalence-class definition forwarded to the OOV helpers.
    plot_predictions : bool, optional
        If truthy, render each test assembly image.
    results_file : str, optional
        CSV file for metrics; defaults to ``<out_dir>/results.csv``.
    sweep_param_name : str, optional
        Passed through to ``utils.writeResults``.
    model_params, cv_params, train_params, viz_params : dict, optional
        Extra parameter dicts; ``cv_params`` feeds ``utils.makeDataSplits``.
    """
    # FIX: replaced mutable default arguments ({}) with None sentinels.
    model_params = {} if model_params is None else model_params
    cv_params = {} if cv_params is None else cv_params
    train_params = {} if train_params is None else train_params
    viz_params = {} if viz_params is None else viz_params

    data_dir = os.path.expanduser(data_dir)
    out_dir = os.path.expanduser(out_dir)
    scores_dir = os.path.expanduser(scores_dir)
    if cv_data_dir is not None:
        cv_data_dir = os.path.expanduser(cv_data_dir)

    if results_file is None:
        results_file = os.path.join(out_dir, 'results.csv')
    else:
        results_file = os.path.expanduser(results_file)

    fig_dir = os.path.join(out_dir, 'figures')
    if not os.path.exists(fig_dir):
        os.makedirs(fig_dir)

    out_data_dir = os.path.join(out_dir, 'data')
    if not os.path.exists(out_data_dir):
        os.makedirs(out_data_dir)

    def saveVariable(var, var_name):
        # Serialize one variable into the output data directory.
        joblib.dump(var, os.path.join(out_data_dir, f'{var_name}.pkl'))

    def loadAll(seq_ids, var_name, data_dir):
        # Load one pickled variable per sequence id.
        def loadOne(seq_id):
            fn = os.path.join(data_dir, f'trial={seq_id}_{var_name}')
            return joblib.load(fn)

        return tuple(map(loadOne, seq_ids))

    # Load data
    trial_ids = utils.getUniqueIds(scores_dir, prefix='trial=')
    assembly_seqs = loadAll(trial_ids, 'assembly-seq.pkl', data_dir)
    score_seqs = loadAll(trial_ids, 'data-scores.pkl', scores_dir)

    oov_rate, contextual_oov_rate, state_counts = OOVrate(assembly_seqs,
                                                          eq_class=eq_class)
    logger.info(f"OOV RATE: {oov_rate * 100:.1f}%")
    plotOOV(contextual_oov_rate,
            state_counts,
            fn=os.path.join(fig_dir, "oovs.png"),
            eq_class=eq_class)
    scatterOOV(contextual_oov_rate,
               state_counts,
               fn=os.path.join(fig_dir, "oovs_scatter.png"),
               eq_class=eq_class)

    # FIX: removed a leftover debugging early-exit (`import sys; sys.exit()`)
    # that made everything below unreachable.

    # Define cross-validation folds
    if cv_data_dir is None:
        dataset_size = len(trial_ids)
        cv_folds = utils.makeDataSplits(dataset_size, **cv_params)
        cv_fold_trial_ids = tuple(
            tuple(map(lambda x: trial_ids[x], splits)) for splits in cv_folds)
    else:
        fn = os.path.join(cv_data_dir, 'cv-fold-trial-ids.pkl')
        cv_fold_trial_ids = joblib.load(fn)

    def getSplit(split_idxs):
        # Select (score_seqs, assembly_seqs, trial_ids) at the given indices.
        split_data = tuple(
            tuple(s[i] for i in split_idxs)
            for s in (score_seqs, assembly_seqs, trial_ids))
        return split_data

    for cv_index, (train_ids, test_ids) in enumerate(cv_fold_trial_ids):

        logger.info(f'CV fold {cv_index + 1}: {len(trial_ids)} total '
                    f'({len(train_ids)} train, {len(test_ids)} test)')

        try:
            test_idxs = np.array(
                [trial_ids.tolist().index(i) for i in test_ids])
        except ValueError:
            logger.info("  Skipping fold: missing test data")
            continue

        # TRAIN PHASE
        if cv_data_dir is None:
            # FIX: mirror the test_idxs lookup above -- trial_ids is
            # array-like here (it supports .tolist()) and has no .index.
            train_idxs = np.array(
                [trial_ids.tolist().index(i) for i in train_ids])
            train_assembly_seqs = tuple(assembly_seqs[i] for i in train_idxs)
            train_assemblies = []
            for seq in train_assembly_seqs:
                # gen_eq_classes extends train_assemblies in place.
                list(
                    labels.gen_eq_classes(seq,
                                          train_assemblies,
                                          equivalent=None))
            model = None
        else:
            fn = f'cvfold={cv_index}_train-assemblies.pkl'
            train_assemblies = joblib.load(os.path.join(cv_data_dir, fn))
            train_idxs = [
                i for i in range(len(trial_ids)) if i not in test_idxs
            ]

            fn = f'cvfold={cv_index}_model.pkl'
            # model = joblib.load(os.path.join(cv_data_dir, fn))
            model = None

        train_features, train_assembly_seqs, train_ids = getSplit(train_idxs)
        test_assemblies = train_assemblies.copy()
        # NOTE(review): this loop looks dead but is kept for its side effect --
        # gen_eq_classes extends test_assemblies with assemblies seen only in
        # the test split, so they exist before being drawn below.
        for score_seq, gt_assembly_seq, trial_id in zip(*getSplit(test_idxs)):
            list(
                labels.gen_eq_classes(gt_assembly_seq,
                                      test_assemblies,
                                      equivalent=None))

        # oov_rate = OOVrate(train_assembly_seqs)
        # logger.info(f"  OOV RATE: {oov_rate * 100:.1f}%")

        if plot_predictions:
            assembly_fig_dir = os.path.join(fig_dir, 'assembly-imgs')
            if not os.path.exists(assembly_fig_dir):
                os.makedirs(assembly_fig_dir)
            for i, assembly in enumerate(test_assemblies):
                assembly.draw(assembly_fig_dir, i)

        # TEST PHASE
        accuracies = []
        for score_seq, gt_assembly_seq, trial_id in zip(*getSplit(test_idxs)):
            gt_seq = np.array(
                list(
                    labels.gen_eq_classes(gt_assembly_seq,
                                          test_assemblies,
                                          equivalent=None)))

            num_labels = gt_seq.shape[0]
            num_scores = score_seq.shape[-1]
            if num_labels != num_scores:
                err_str = f"Skipping trial {trial_id}: {num_labels} labels != {num_scores} scores"
                logger.info(err_str)
                continue

            # FIX: removed a duplicate unconditional argmax that followed this
            # branch and clobbered any model-based prediction.
            if model is None:
                pred_seq = score_seq.argmax(axis=0)
            else:
                # Model-based decoding is not implemented in this script.
                raise AssertionError()

            pred_assemblies = [train_assemblies[i] for i in pred_seq]
            gt_assemblies = [test_assemblies[i] for i in gt_seq]

            acc = metrics.accuracy_upto(pred_assemblies,
                                        gt_assemblies,
                                        equivalence=None)
            accuracies.append(acc)

            # num_states = len(gt_seq)
            # logger.info(f"  trial {trial_id}: {num_states} keyframes")
            # logger.info(f"    accuracy (fused): {acc * 100:.1f}%")

            saveVariable(score_seq, f'trial={trial_id}_data-scores')
            saveVariable(pred_assemblies,
                         f'trial={trial_id}_pred-assembly-seq')
            saveVariable(gt_assemblies, f'trial={trial_id}_gt-assembly-seq')

        if accuracies:
            fold_accuracy = float(np.array(accuracies).mean())
            metric_dict = {'Accuracy': fold_accuracy}
            utils.writeResults(results_file, metric_dict, sweep_param_name,
                               model_params)