Example #1
def ranking_acc_f1(gold, outputs, probs):
    """A convenience custom function that returns accuracy, f1, and their mean for ranking task heads."""
    gold = (1 - gold) + 1
    outputs = 1 * (probs.reshape((-1,)) > 0.5)
    accuracy = metric_score(gold, outputs, metric="accuracy")
    f1 = metric_score(gold, outputs, metric="f1")
    return {"accuracy": accuracy, "f1": f1, "acc_f1": np.mean([accuracy, f1])}
Example #2
    def score_task(self, X, Y, t=0, metric="accuracy", verbose=True, **kwargs):
        """Scores the predictive performance of the Classifier on task t

        Args:
            X: The input for the predict_task method
            Y: A [n] or [n, 1] np.ndarray or torch.Tensor of gold labels in
                {1,...,K_t}
            t: The task index to score
            metric: The metric with which to score performance on this task
        Returns:
            The (float) score of the Classifier for the specified task and
            metric
        """
        Y = self._to_numpy(Y)
        Y_tp = self.predict_task(X, t=t, **kwargs)
        probs = self.predict_proba(X)[t]
        score = metric_score(Y[t],
                             Y_tp,
                             metric,
                             ignore_in_gold=[0],
                             probs=probs,
                             **kwargs)
        if verbose:
            print(f"[t={t}] {metric.capitalize()}: {score:.7f}")
        return score
Example #3
    def _calculate_standard_metrics(self, model, data_loader, target_metrics,
                                    metrics_dict, split):
        target_standard_metrics = []
        for split_metric in target_metrics:
            metric = self.remove_split_prefix(split_metric)
            if metric in standard_metric_names:
                target_standard_metrics.append(metric)

        # Only calculate predictions if at least one standard metric requires it
        if target_standard_metrics:
            if model.multitask:
                # For multitask models, use score method for aggregation
                # This may cause inefficiency if there are multiple desired metrics
                # and we re-predict for each one.
                for metric in target_standard_metrics:
                    score = model.score(data_loader, metric, verbose=False)
                    metrics_dict[self.add_split_prefix(metric, split)] = score
            else:
                # For singletask models, predict once and use Y_probs/Y_preds
                # for all metrics calculations
                Y_preds, Y, Y_probs = model._get_predictions(data_loader,
                                                             return_probs=True)
                for metric in target_standard_metrics:
                    score = metric_score(Y, Y_preds, metric, probs=Y_probs)
                    metrics_dict[self.add_split_prefix(metric, split)] = score
        return metrics_dict
Example #4
    def score(self,
              X,
              Y,
              metric=['accuracy'],
              break_ties='random',
              verbose=True,
              **kwargs):
        """Scores the predictive performance of the Classifier on all tasks

        Args:
            X: The input for the predict method
            Y: An [N] or [N, 1] torch.Tensor or np.ndarray of gold labels in 
                {1,...,K_t}
            metric: A metric (string) with which to score performance or a 
                list of such metrics
            break_ties: How to break ties when making predictions

        Returns:
            scores: A (float) score, or a list of such scores if a list of
                metrics was given
        """
        Y = self._to_numpy(Y)
        Y_p = self.predict(X, break_ties=break_ties, **kwargs)

        metric_list = metric if isinstance(metric, list) else [metric]
        scores = []
        for metric in metric_list:
            score = metric_score(Y, Y_p, metric, ignore_in_gold=[0])
            scores.append(score)
            if verbose:
                print(f"{metric.capitalize()}: {score:.3f}")

        if len(scores) == 1:
            return scores[0]
        else:
            return scores
Example #5
    def score(self, Y, Y_probs, Y_preds, target_metrics=None):
        """
        Calculates and returns a metrics_dict for a given set of predictions and labels

        Args:
            Y: an [n] list of gold labels
            Y_probs: an [n] list of probabilities
            Y_preds: an [n] list of predictions
            target_metrics: a list of simple metrics to calculate
        Returns:
            a metrics_dict object of the form:
                {metric1 : score1, ...., metricN: score N}

        Note that the returned metrics dict will be transformed to have full metric
        names (e.g., "accuracy" -> "foo_task/bar_payload/accuracy") in the trainer.
        """
        self.validate_target_metrics(target_metrics)

        # TODO: Tighten this up; it can be much more efficient
        # The main issue is that we currently require Y/Y_probs/Y_preds to be lists
        # so that they can support sequence-based tasks that have arbitrary length
        # labels. But there is certainly a way we can be more strict/certain about
        # what our data types will be and do some much more efficient slice operation
        # instead of list comprehension.

        # Identify all examples with at least one non-zero (i.e., non-abstain) label
        active = [bool(y != 0) for y in Y]
        if sum(active) != len(active):
            Y = [y for a, y in zip(active, Y) if a]
            if Y_probs:
                Y_probs = [y for a, y in zip(active, Y_probs) if a]
            if Y_preds:
                Y_preds = [y for a, y in zip(active, Y_preds) if a]

        simple_metrics_dict = {}
        for metric in self.standard_metrics:
            # If target metrics were specified and this is not one of them, skip it
            if target_metrics and metric not in target_metrics:
                continue
            score = metric_score(Y, Y_preds, metric, probs=Y_probs)
            simple_metrics_dict[metric] = score

        for metric, custom_metric_func in self.custom_metric_map.items():
            # If target metrics were specified and this is not one of them, skip it
            if target_metrics and metric not in target_metrics:
                continue
            # If the current metric is already in the simple_metrics_dict, skip it
            # This is possible because a custom_metric_func can return multiple metrics
            if metric in simple_metrics_dict:
                continue
            custom_metric_dict = custom_metric_func(Y, Y_preds, probs=Y_probs)
            for custom_metric, custom_score in custom_metric_dict.items():
                if not target_metrics or custom_metric in target_metrics:
                    simple_metrics_dict[custom_metric] = custom_score

        return simple_metrics_dict
Example #6
    def score(
        self,
        data,
        metric="accuracy",
        break_ties="random",
        verbose=True,
        print_confusion_matrix=True,
        **kwargs,
    ):
        """Scores the predictive performance of the Classifier on all tasks

        Args:
            data: a Pytorch DataLoader, Dataset, or tuple with Tensors (X,Y):
                X: The input for the predict method
                Y: An [n] or [n, 1] torch.Tensor or np.ndarray of target labels
                    in {1,...,k}
            metric: A metric (string) with which to score performance or a
                list of such metrics
            break_ties: A tie-breaking policy (see Classifier._break_ties())
            verbose: The verbosity for just this score method; it will not
                update the class config.
            print_confusion_matrix: Print confusion matrix (overwritten to False if
                verbose=False)

        Returns:
            scores: A (float) score or a list of such scores if kwarg metric
                is a list
        """
        Y_p, Y, Y_s = self._get_predictions(data,
                                            break_ties=break_ties,
                                            return_probs=True,
                                            **kwargs)

        # Evaluate on the specified metrics
        return_list = isinstance(metric, list)
        metric_list = metric if isinstance(metric, list) else [metric]
        scores = []
        for metric in metric_list:
            score = metric_score(Y, Y_p, metric, probs=Y_s, ignore_in_gold=[0])
            scores.append(score)
            if verbose:
                if not isinstance(score, list):
                    print(f"{metric.capitalize()}: {score:.7f}")
                else:
                    print(f"{metric.capitalize()}: {score}")

        # Optionally print confusion matrix
        if print_confusion_matrix and verbose:
            confusion_matrix(Y, Y_p, pretty_print=True)

        # If a single metric was given as a string (not list), return a float
        if len(scores) == 1 and not return_list:
            return scores[0]
        else:
            return scores
Example #7
    def score(
        self,
        X,
        Y,
        metric="accuracy",
        reduce="mean",
        break_ties="random",
        verbose=True,
        **kwargs,
    ):
        """Scores the predictive performance of the Classifier on all tasks
        Args:
            X: The input for the predict method
            Y: A t-length list of [n] or [n, 1] np.ndarrays or torch.Tensors of
                gold labels in {1,...,K_t}
            metric: The metric with which to score performance on each task
            reduce: How to reduce the scores of multiple tasks:
                 None : return a t-length list of scores
                'mean': return the mean score across tasks
            break_ties: How to break ties when making predictions
        Returns:
            scores: A (float) score or a t-length list of such scores if
                reduce=None
        """
        self._check(Y, typ=list)
        Y = [self._to_numpy(Y_t) for Y_t in Y]

        Y_p = self.predict(X, break_ties=break_ties, **kwargs)
        self._check(Y_p, typ=list)

        task_scores = []
        for t, Y_tp in enumerate(Y_p):
            score = metric_score(Y[t], Y_tp, metric, ignore_in_gold=[0])
            task_scores.append(score)

        # TODO: Other options for reduce, including scoring only certain
        # primary tasks, and converting to end labels using TaskGraph...
        if reduce is None:
            score = task_scores
        elif reduce == "mean":
            score = np.mean(task_scores)
        else:
            raise Exception(f"Keyword reduce='{reduce}' not recognized.")

        if verbose:
            if reduce is None:
                for t, score_t in enumerate(score):
                    print(f"{metric.capitalize()} (t={t}): {score_t:0.3f}")
            else:
                print(f"{metric.capitalize()}: {score:.3f}")

        return score
Example #8
    def score(self, X, Y, metric='f1', verbose=True):
        Y = convert_labels(Y, 'categorical', 'onezero')
        Y_p = self.predict(X)

        metric_list = metric if isinstance(metric, list) else [metric]
        scores = []
        for metric in metric_list:
            score = metric_score(Y, Y_p, metric)
            scores.append(score)
            if verbose:
                print(f"{metric.capitalize()}: {score:.3f}")

        if isinstance(scores, list) and len(scores) == 1:
            return scores[0]
        else:
            return scores
Example #9
    def score(self, probs, target_probs):
        """
        """
        metrics = defaultdict(dict)
        for task_idx, _ in enumerate(probs):
            probs_t = torch.tensor(probs[task_idx]).double()
            preds_t = soft_to_hard(probs_t, break_ties='random')

            target_probs_t = torch.tensor(target_probs[task_idx]).double()
            targets = soft_to_hard(target_probs_t, break_ties='random')

            print(pred_to_prob(targets, k=probs_t.shape[1]))
            for metric in METRICS_LIST:
                metrics[self.idx_to_task[task_idx]][metric] = metric_score(
                    targets + 1, preds_t + 1, metric, probs=probs_t)

        return metrics
Example #10
    def score(
        self,
        X,
        Y,
        metric=["accuracy"],
        break_ties="random",
        verbose=True,
        **kwargs,
    ):
        """Scores the predictive performance of the Classifier on all tasks

        Args:
            X: The input for the predict method
            Y: An [n] or [n, 1] torch.Tensor or np.ndarray of target labels in
                {1,...,k}
            metric: A metric (string) with which to score performance or a
                list of such metrics
            break_ties: How to break ties when making predictions
            verbose: The verbosity for just this score method; it will not
                update the class config.

        Returns:
            scores: A (float) score
        """
        Y = self._to_numpy(Y)
        Y_p = self.predict(X, break_ties=break_ties, **kwargs)

        metric_list = metric if isinstance(metric, list) else [metric]
        scores = []
        for metric in metric_list:
            score = metric_score(Y, Y_p, metric, ignore_in_gold=[0])
            scores.append(score)
            if verbose:
                print(f"{metric.capitalize()}: {score:.3f}")

        if isinstance(scores, list) and len(scores) == 1:
            return scores[0]
        else:
            return scores
Example #11
    def score(
        self,
        data,
        metric="accuracy",
        validation_task=None,
        reduce="mean",
        break_ties="random",
        verbose=True,
        print_confusion_matrix=False,
        **kwargs,
    ):
        """Scores the predictive performance of the Classifier on all tasks
        Args:
            data: either a Pytorch Dataset, DataLoader or tuple supplying (X,Y):
                X: The input for the predict method
                Y: A t-length list of [n] or [n, 1] np.ndarrays or
                   torch.Tensors of gold labels in {1,...,K_t}
            metric: The metric with which to score performance on each task
            validation_task:
                int: returns score for specific task number.
            reduce: How to reduce the scores of multiple tasks:
                 None : return a t-length list of scores
                'mean': return the mean score across tasks
            break_ties: How to break ties when making predictions
        Returns:
            scores: A (float) score or a t-length list of such scores if
                reduce=None
        """
        Y_p, Y, Y_s = self._get_predictions(data,
                                            break_ties=break_ties,
                                            return_probs=True,
                                            **kwargs)

        # TODO: Handle multiple metrics...
        metric_list = metric if isinstance(metric, list) else [metric]
        if len(metric_list) > 1:
            raise NotImplementedError(
                "Multiple metrics for multi-task score() not yet supported.")
        metric = metric_list[0]

        # Return score for task t only.
        if validation_task is not None:
            score = metric_score(
                Y[validation_task],
                Y_p[validation_task],
                metric,
                probs=Y_s[validation_task],
                ignore_in_gold=[0],
            )
            if verbose:
                print(f"{metric.capitalize()}: {score:.7f}")
            return score

        task_scores = []
        for t, Y_tp in enumerate(Y_p):
            score = metric_score(Y[t],
                                 Y_tp,
                                 metric,
                                 probs=Y_s[t],
                                 ignore_in_gold=[0])
            task_scores.append(score)

        # TODO: Other options for reduce, including scoring only certain
        # primary tasks, and converting to end labels using TaskGraph...
        if reduce is None:
            score = task_scores
        elif reduce == "mean":
            score = np.mean(task_scores)
        else:
            raise Exception(f"Keyword reduce='{reduce}' not recognized.")

        if verbose:
            if reduce is None:
                for t, score_t in enumerate(score):
                    print(f"{metric.capitalize()} (t={t}): {score_t:0.3f}")
            else:
                print(f"{metric.capitalize()}: {scor7:.7f}")

        return score
Example #12
def train_model(args):

    #global args
    #args = parser.parse_args()

    hidden_size = 128
    num_classes = 2
    encode_dim = 108  # using get_frm_output_size()

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
        #device = 'cuda'
    else:
        device = 'cpu'

    #print(device)
    L, Y = load_labels(args)

    # Label Model
    # labelling functions analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # majority vote of LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    # training label model - no temporal modelling
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"],
                            Y["dev"],
                            n_epochs=500,
                            log_train_every=50)

    # evaluating label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # training label model without temporal modelling
    # naive model
    #print(L["train"].todense().shape) # (18850,5)
    #print(L["dev"].todense().shape) # (1500,5)
    #print(Y["dev"].shape) # (1500,)
    m_per_task = L["train"].todense().shape[1]  # 5
    MRI_data_naive = {
        'Li_train':
        torch.FloatTensor(np.array(L["train"].todense().astype('int_'))),
        'Li_dev':
        torch.FloatTensor(np.array(L["dev"].todense())),
        'R_dev':
        Y["dev"]
    }

    MRI_data_naive['class_balance'] = torch.FloatTensor([0.5, 0.5]).to(device)

    # training naive model
    naive_model = DPLabelModel(
        m=m_per_task,
        T=1,
        edges=[],
        coverage_sets=[[
            0,
        ]] * m_per_task,
        mu_sharing=[[
            i,
        ] for i in range(m_per_task)],
        phi_sharing=[],
        device=device,
        #class_balance=MRI_data_naive['class_balance'],
        seed=0)

    optimize(naive_model,
             L_hat=MRI_data_naive['Li_train'],
             num_iter=300,
             lr=1e-3,
             momentum=0.8,
             clamp=True,
             seed=0)

    # evaluating naive model
    R_pred = naive_model.predict(MRI_data_naive['Li_dev']).data.numpy()
    R_pred = 2 - R_pred
    #print(R_pred)
    #print(MRI_data_naive['R_dev'])

    for metric in ['accuracy', 'f1', 'recall', 'precision']:
        score = metric_score(MRI_data_naive['R_dev'], R_pred, metric)
        print(f"{metric.capitalize()}: {score:.3f}")

    # training label model with temporal modelling
    # reshaping dataset
    num_frames = 50
    n_patients_train = round(L["train"].todense().shape[0] /
                             num_frames)  #(377)
    n_patients_dev = round(L["dev"].todense().shape[0] / num_frames)  #(30)
    Ltrain = np.reshape(np.array(L["train"].todense()),
                        (n_patients_train, num_frames, -1))
    Ldev = np.reshape(np.array(L["dev"].todense()),
                      (n_patients_dev, num_frames, -1))
    Ydev = np.reshape(Y["dev"], (n_patients_dev, num_frames))
    # print(Ltrain.shape) # (377,50,5)
    #print(Ldev.shape) # (30,50,5)
    #print(Ydev.shape) # (30,50)

    # subsampling
    # selecting frames 3,13,23,33,43
    indices = np.linspace(2, 42, 5).astype(int)
    m_per_task = 5
    T = 5

    Ltrain_small = Ltrain[:, indices, :]  # shape (377,5,5)
    Ldev_small = Ldev[:, indices, :]  # shape (30,5,5)
    Ydev_small = Ydev[:, indices]  # shape (30,5)

    Ltrain_small = np.reshape(
        Ltrain_small, ((n_patients_train * T), m_per_task))  # shape (1885,5)
    Ldev_small = np.reshape(
        Ldev_small, ((n_patients_dev * T), m_per_task))  # shape (150,5)
    Ydev_small = np.reshape(Ydev_small,
                            ((n_patients_dev * T), ))  # shape (150,)

    MRI_data_temporal = {
        'Li_train':
        torch.LongTensor(Ltrain_small).view(n_patients_train,
                                            (m_per_task * T)),
        'Li_dev':
        torch.LongTensor(Ldev_small).view(n_patients_dev, (m_per_task * T)),
        'R_dev':
        torch.LongTensor(Ydev_small)[::T] * (2**T - 1),
        'm':
        m_per_task * T,
        'T':
        T
    }

    MRI_data_temporal['class_balance'] = normalize(
        (MRI_data_temporal['R_dev'].unsqueeze(1) == torch.arange(
            2**T, device=device).unsqueeze(0)).sum(0).float(),
        dim=0,
        p=1)

    max_seed = 10
    temporal_models = [
        None,
    ] * max_seed
    for seed in range(max_seed):
        markov_model = DPLabelModel(
            m=m_per_task * T,
            T=T,
            edges=[(i, i + m_per_task) for i in range((T - 1) * m_per_task)],
            coverage_sets=[[
                t,
            ] for t in range(T) for _ in range(m_per_task)],
            mu_sharing=[[t * m_per_task + i for t in range(T)]
                        for i in range(m_per_task)],
            phi_sharing=[[(t * m_per_task + i, (t + 1) * m_per_task + i)
                          for t in range(T - 1)] for i in range(m_per_task)],
            device=device,
            class_balance=MRI_data_temporal['class_balance'],
            seed=seed)
        optimize(markov_model,
                 L_hat=MRI_data_temporal['Li_train'],
                 num_iter=1000,
                 lr=1e-5,
                 momentum=0.8,
                 clamp=True,
                 verbose=False,
                 seed=seed)
        temporal_models[seed] = markov_model

    for seed, model in enumerate(temporal_models):
        R_pred = model.predict(MRI_data_temporal['Li_dev'].cpu())
        F1 = metric_score(MRI_data_temporal['R_dev'].cpu() > 0,
                          R_pred.cpu() > 0, 'f1')
        accuracy = metric_score(MRI_data_temporal['R_dev'].cpu(), R_pred.cpu(),
                                'accuracy')
        print(f"seed={seed}  accuracy={accuracy:.3f}  F1={F1:.3f}")
Example #13
def acc_f1(gold, outputs, **kwargs):
    """A convenience custom function that returns accuracy, f1, and their mean"""
    accuracy = metric_score(gold, outputs, metric="accuracy")
    f1 = metric_score(gold, outputs, metric="f1")
    return {"accuracy": accuracy, "f1": f1, "acc_f1": np.mean([accuracy, f1])}
Example #14
    def test_metric_score(self):
        gold = [1, 1, 1, 2, 2]
        pred = [1, 1, 1, 2, 1]
        acc = accuracy_score(gold, pred)
        met = metric_score(gold, pred, metric="accuracy")
        self.assertAlmostEqual(acc, met)
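A companion sketch for the ignore_in_gold=[0] argument used by several of the score methods above. It assumes ignore_in_gold drops the positions whose gold label is 0 (abstains) before scoring, which matches the by-hand abstain filtering in Example #5; the function name and numbers below are illustrative only.

# metric_score assumed imported as in the test above (e.g. from metal.metrics)
def test_metric_score_ignore_in_gold():
    # Assumed behavior: ignore_in_gold=[0] removes abstained gold positions
    # (gold == 0) before computing the metric, as in Example #5's filtering.
    gold = [1, 1, 0, 2, 2]
    pred = [1, 1, 1, 2, 1]
    met = metric_score(gold, pred, metric="accuracy", ignore_in_gold=[0])
    # 3 of the 4 non-abstained positions match, so accuracy should be 0.75
    assert abs(met - 0.75) < 1e-8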