Example #1
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             output_file: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                id2label = model.vocab.get_index_to_token_vocabulary("labels")
                _persist_data(file_handle, batch.get("metadata"), model_output, id2label=id2label)
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics()
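
A minimal usage sketch for the variant above (a hedged illustration assuming an AllenNLP 0.x-style setup); the archive path, test file, and batch size are hypothetical placeholders, not names taken from the original project.

# All paths and the batch size are illustrative; evaluate() is the function from Example #1.
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.iterators import BasicIterator
from allennlp.models.archival import load_archive

archive = load_archive("model.tar.gz")                                 # hypothetical archive
model = archive.model
reader = DatasetReader.from_params(archive.config["dataset_reader"])
instances = reader.read("test.jsonl")                                  # hypothetical test file
iterator = BasicIterator(batch_size=32)
iterator.index_with(model.vocab)
print(evaluate(model, instances, iterator, cuda_device=-1))
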
Example #2
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            model(**batch)
            metrics = model.get_metrics()
            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
Example #3
def evaluate(model: Model, instances: Iterable[Instance],
             data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            model(**batch)
            metrics = model.get_metrics()
            if (not _warned_tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join([
                "%s: %.2f" % (name, value)
                for name, value in metrics.items() if not name.startswith("_")
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
Example #4
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             output_file: str = None,
             eval_type: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator,
                               total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            # Move the batch to the requested device (GPU if needed).
            batch = move_to_device(batch, cuda_device)

            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                _persist_data(file_handle, batch.get("metadata"), model_output,
                              eval_type)
            description = ', '.join([
                "%s: %.2f" % (name, value) for name, value in metrics.items()
            ]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics(reset=True)
Example #5
def evaluate(model: Model, instances: Iterable[Instance], task_name: str,
             data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]:
    """
    Evaluate a model for a particular task (usually after training).
    
    Parameters
    ----------
    model : ``allennlp.models.model.Model``, required
        The model to evaluate
    instances : ``Iterable[Instance]``, required
        The (usually test) dataset on which to evaluate the model.
    task_name : ``str``, required
        The name of the task on which to evaluate the model.
    data_iterator : ``DataIterator``
        Iterator that goes through the dataset.
    cuda_device : ``int``
        Cuda device to use.
        
    Returns
    -------
    metrics :  ``Dict[str, Any]``
        A dictionary containing the metrics on the evaluated dataset.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        eval_loss = 0
        nb_batches = 0
        for tensor_batch in generator_tqdm:
            nb_batches += 1

            train_stages = ["stm", "sd", "valid"]
            task_index = TASKS_NAME.index(task_name)
            tensor_batch['task_index'] = torch.tensor(task_index)
            tensor_batch["reverse"] = torch.tensor(False)
            tensor_batch['for_training'] = torch.tensor(False)
            train_stage = train_stages.index("stm")
            tensor_batch['train_stage'] = torch.tensor(train_stage)
            tensor_batch = move_to_device(tensor_batch, cuda_device)

            eval_output_dict = model.forward(**tensor_batch)
            loss = eval_output_dict["loss"]
            eval_loss += loss.item()
            metrics = model.get_metrics(task_name=task_name)
            metrics["stm_loss"] = float(eval_loss / nb_batches)

            description = training_util.description_from_metrics(metrics)
            generator_tqdm.set_description(description, refresh=False)

        metrics = model.get_metrics(task_name=task_name, reset=True)
        metrics["stm_loss"] = float(eval_loss / nb_batches)
        return metrics
Example #6
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             label_fname: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        label_file = open(label_fname, 'w')
        label_file.write('real_label,guessed_label\n')

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        total_num_inst = 0
        for batch in generator_tqdm:
            num_inst = batch['tokens']['tokens'].size(0)
            total_num_inst += num_inst
            batch = util.move_to_device(batch, cuda_device)

            output_dict = model(**batch)
            if cuda_device == -1:
                output_matrix = output_dict['label_logits'].data.numpy()
            else:
                output_matrix = output_dict['label_logits'].data.cpu().numpy()
            output_labels = np.argmax(output_matrix, axis=1)
            if cuda_device == -1:
                true_labels = batch['label'].data.numpy()
            else:
                true_labels = batch['label'].data.cpu().numpy()
            assert true_labels.shape[0] == output_labels.shape[0]
            for i in range(true_labels.shape[0]):
                label_file.write(str(int(true_labels[i])) + ',')
                label_file.write(str(int(output_labels[i])) + '\n')

            metrics = model.get_metrics()
            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)


        print("NUM INSTANCES ITERATED OVER: " + str(total_num_inst))
        label_file.close()

        return model.get_metrics(reset=True)
Example #7
def evaluate(model: Model, instances: Iterable[Instance], task_name: str,
             data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]:
    """
    Evaluate a model for a particular task (usually after training).
    
    Parameters
    ----------
    model : ``allennlp.models.model.Model``, required
        The model to evaluate
    instances : ``Iterable[Instance]``, required
        The (usually test) dataset on which to evaluate the model.
    task_name : ``str``, required
        The name of the task on which to evaluate the model.
    data_iterator : ``DataIterator``
        Iterator that goes through the dataset.
    cuda_device : ``int``
        Cuda device to use.
        
    Returns
    -------
    metrics :  ``Dict[str, Any]``
        A dictionary containing the metrics on the evaluated dataset.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        eval_loss = 0
        nb_batches = 0
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            nb_batches += 1

            eval_output_dict = model.forward(task_name=task_name,
                                             tensor_batch=batch)
            loss = eval_output_dict["loss"]
            eval_loss += loss.item()
            metrics = model.get_metrics(task_name=task_name)
            metrics["loss"] = float(eval_loss / nb_batches)

            description = ", ".join([
                "%s: %.2f" % (name, value) for name, value in metrics.items()
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        metrics = model.get_metrics(task_name=task_name, reset=True, full=True)
        metrics["loss"] = float(eval_loss / nb_batches)
        return metrics
Example #8
def evaluate(model: Model, dataset: Dataset, iterator: DataIterator,
             cuda_device: int) -> Tuple[Dict[str, Any], pd.DataFrame, pd.DataFrame]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator,
                               total=iterator.get_num_batches(dataset))
    output = pd.DataFrame()
    for raw_batch, batch in generator_tqdm:
        raw_fields = [x.fields for x in raw_batch.instances]
        parsed_fields = []

        for item in raw_fields:
            premise = " ".join([x.text for x in item['premise'].tokens])
            hypothesis = " ".join([x.text for x in item['hypothesis'].tokens])
            label = item['label'].label
            parsed_fields.append({
                "sentence1": premise,
                "sentence2": hypothesis,
                "gold_label": label
            })
        parsed_fields = pd.DataFrame(parsed_fields)
        tensor_batch = arrays_to_variables(batch,
                                           cuda_device,
                                           for_training=False)
        bo = model.forward(**tensor_batch)
        metrics = model.get_metrics()
        description = ', '.join(
            ["%s: %.2f" % (name, value)
             for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)
        batch_output = pd.DataFrame()
        INVERSE_LABEL_MAP = {
            0: "entailment",
            1: "neutral",
            2: "contradiction",
            3: "hidden"
        }
        batch_output['prediction_label'] = bo['label_logits'].data.numpy(
        ).argmax(axis=1)
        batch_output['prediction_score'] = bo['label_probs'].data.numpy().max(
            axis=1)
        batch_output['prediction_label'] = batch_output.prediction_label.apply(
            lambda x: INVERSE_LABEL_MAP[x])
        parsed_output = pd.concat([parsed_fields, batch_output], axis=1)
        output = pd.concat([output, parsed_output], axis=0)
    hard_subset = output.loc[output.gold_label != output.prediction_label]
    easy_subset = output.loc[output.gold_label == output.prediction_label]
    return model.get_metrics(), hard_subset, easy_subset
Example #9
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        batch_count = 0
        loss_count = 0
        total_loss = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            loss = model(**batch).get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                metrics["loss"] = loss.item()
                total_loss += loss.item()

            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss/batch_count

        return final_metrics
Example #10
def get_metrics(
    model: Model,
    total_loss: float,
    total_reg_loss: Optional[float],
    batch_loss: Optional[float],
    batch_reg_loss: Optional[float],
    num_batches: int,
    reset: bool = False,
    world_size: int = 1,
    cuda_device: Union[int, torch.device] = torch.device("cpu"),
) -> Dict[str, float]:
    """
    Gets the metrics but sets `"loss"` to
    the total loss divided by the `num_batches` so that
    the `"loss"` metric is "average loss per batch".
    Returns the `"batch_loss"` separately.
    """
    metrics = model.get_metrics(reset=reset)
    if batch_loss is not None:
        metrics["batch_loss"] = batch_loss
    metrics["loss"] = float(total_loss /
                            num_batches) if num_batches > 0 else 0.0
    if total_reg_loss is not None:
        if batch_reg_loss is not None:
            metrics["batch_reg_loss"] = batch_reg_loss
        metrics["reg_loss"] = float(total_reg_loss /
                                    num_batches) if num_batches > 0 else 0.0

    return metrics
Example #11
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics(reset=True)
Example #12
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics()
Example #13
def get_metrics(
    model: Model,
    total_loss: float,
    num_batches: int,
    reset: bool = False,
    world_size: int = 1,
    cuda_device: Union[int, List] = 0,
) -> Dict[str, float]:
    """
    Gets the metrics but sets `"loss"` to
    the total loss divided by the `num_batches` so that
    the `"loss"` metric is "average loss per batch".
    """
    metrics = model.get_metrics(reset=reset)
    metrics["loss"] = float(total_loss / num_batches) if num_batches > 0 else 0.0

    if world_size > 1:
        # In distributed mode, average out all metrics across GPUs
        aggregated_metrics = {}
        for metric_name, metric_val in metrics.items():
            if isinstance(cuda_device, list):
                metric_tensor = torch.tensor(metric_val).to(torch.device(cuda_device[0]))
            else:
                metric_tensor = torch.tensor(metric_val).to(torch.device(cuda_device))
            dist.all_reduce(metric_tensor, op=dist.ReduceOp.SUM)
            reduced_metric = metric_tensor.item() / world_size
            aggregated_metrics[metric_name] = reduced_metric
        return aggregated_metrics
    else:
        return metrics
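
The distributed branch above sums each metric across workers with dist.all_reduce and then divides by world_size. A plain-Python sketch of that arithmetic (no torch.distributed involved; the per-worker losses are invented):

per_worker_loss = [1.0, 3.0]        # hypothetical "loss" metric on each of two workers
reduced = sum(per_worker_loss)      # what all_reduce(..., op=ReduceOp.SUM) leaves on every worker
world_size = len(per_worker_loss)
print(reduced / world_size)         # 2.0, the cross-worker average that gets reported
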
Example #14
def evaluate(model: Model,
             dataset: Dataset,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False)
        model.forward(**tensor_batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)

    return model.get_metrics()
Example #15
def get_metrics(model: Model, total_loss: float, num_batches: int, reset: bool = False) -> Dict[str, float]:
    """
    Gets the metrics but sets ``"loss"`` to
    the total loss divided by the ``num_batches`` so that
    the ``"loss"`` metric is "average loss per batch".
    """
    metrics = model.get_metrics(reset=reset)
    metrics["loss"] = float(total_loss / num_batches) if num_batches > 0 else 0.0
    return metrics
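
A self-contained sanity check of the averaging logic above, using a stub in place of a real AllenNLP Model (the stub and its metric value are purely illustrative); it assumes the get_metrics from Example #15 is in scope.

class _StubModel:
    def get_metrics(self, reset: bool = False):
        return {"accuracy": 0.9}

metrics = get_metrics(_StubModel(), total_loss=6.0, num_batches=3)
assert metrics == {"accuracy": 0.9, "loss": 2.0}  # 6.0 over 3 batches = 2.0 average loss per batch
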
Example #16
def evaluate(model: Model, dataset: InstanceCollection, iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset,
                         num_epochs=1,
                         cuda_device=cuda_device,
                         for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator,
                               total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(
            ["%s: %.2f" % (name, value)
             for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)

    return model.get_metrics()
Example #17
def evaluate(model: Model,
             dataset: Dataset,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    for batch in tqdm.tqdm(generator):
        tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False)
        model.forward(**tensor_batch)

    return model.get_metrics()
Example #18
def get_model_predictions(model: Model, instances: Iterable[Instance],
                          data_iterator: DataIterator,
                          cuda_device: int) -> Tuple[Dict[str, Any], List]:

    model.eval()
    model_predictions = []

    iterator = data_iterator(instances,
                             num_epochs=1,
                             cuda_device=cuda_device,
                             for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator,
                               total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    return model.get_metrics(), model_predictions
Example #19
def get_metrics(
    model: Model,
    total_loss: float,
    total_reg_loss: Optional[float],
    batch_loss: Optional[float],
    batch_reg_loss: Optional[float],
    num_batches: int,
    reset: bool = False,
    world_size: int = 1,
    cuda_device: Union[int, torch.device] = torch.device("cpu"),
) -> Dict[str, float]:
    """
    Gets the metrics but sets `"loss"` to
    the total loss divided by the `num_batches` so that
    the `"loss"` metric is "average loss per batch".
    Returns the `"batch_loss"` separately.
    """
    metrics = model.get_metrics(reset=reset)
    if batch_loss is not None:
        metrics["batch_loss"] = batch_loss
    metrics["loss"] = float(total_loss /
                            num_batches) if num_batches > 0 else 0.0
    if total_reg_loss is not None:
        if batch_reg_loss is not None:
            metrics["batch_reg_loss"] = batch_reg_loss
        metrics["reg_loss"] = float(total_reg_loss /
                                    num_batches) if num_batches > 0 else 0.0

    if world_size > 1:
        # In distributed mode, average out all metrics across GPUs
        aggregated_metrics = {}
        for metric_name, metric_val in metrics.items():
            metric_tensor = torch.tensor(metric_val).to(cuda_device)
            dist.all_reduce(metric_tensor, op=dist.ReduceOp.SUM)
            reduced_metric = metric_tensor.item() / world_size
            aggregated_metrics[metric_name] = reduced_metric
        return aggregated_metrics
    else:
        return metrics
Example #20
def evaluate(
    model: Model, data_loader: DataLoader, cuda_device: int, batch_weight_key: str,
) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=len(data_loader))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                metric_name.startswith("_") for metric_name in metrics
            ):
                logger.warning(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar."
                )
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (
                ", ".join(
                    [
                        "%s: %.2f" % (name, value)
                        for name, value in metrics.items()
                        if not name.startswith("_")
                    ]
                )
                + " ||"
            )
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " + "produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
Example #21
def evaluate(model: Model, instances: Iterable[Instance],
             data_iterator: DataIterator, cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            # Uncomment the block below to dump class_probabilities, logits, and losses for each batch.
            # print(output_dict['class_probabilities'].shape)
            # import copy
            #
            # newoutput_dict = copy.deepcopy(output_dict)
            # newoutput_dict['class_probabilities'] = newoutput_dict['class_probabilities'].cpu().data.numpy()
            # newoutput_dict['logits'] = newoutput_dict['logits'].cpu().data.numpy()
            # newoutput_dict['loss'] = newoutput_dict['loss'].cpu().data.numpy()
            #
            # output_file = os.path.join(os.path.dirname(__file__), '..', "data", "test",
            #                            str(batch_count) + "_output.pkl")
            # import json
            # import pickle
            # if output_file:
            #     with open(output_file, "wb") as file:
            #         pickle.dump(newoutput_dict, file)
            #     file.close()
            # ###########################################################################################################

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not HasBeenWarned.tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join([
                "%s: %.4f" % (name, value)
                for name, value in metrics.items() if not name.startswith("_")
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " +
                    "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
Example #22
def evaluate(model: Model, dataset: Dataset, iterator: BasicIterator,
             cuda_device: int, serialization_directory: str) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset,
                         num_epochs=1,
                         cuda_device=cuda_device,
                         shuffle=False,
                         for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator,
                               total=iterator.get_num_batches(dataset))

    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join([
            "%s: %.5f" % (name, value)
            for name, value in metrics.items() if "overall" in name
        ]) + " ||"
        generator_tqdm.set_description(description)

    metrics = model.get_metrics()
    golds = metrics["gold_spans"]
    predictions = metrics["predicted_spans"]
    assert len(dataset.instances) == len(golds) == len(predictions)

    # gold_file_path = os.path.join(serialization_directory, "gold.txt")
    prediction_file_path = os.path.join(serialization_directory,
                                        "predictions.txt")
    prediction_file = open(prediction_file_path, "w+")
    # gold_file = open(gold_file_path, "w+")
    logger.info("Writing predictions in CoNLL-like format to %s",
                prediction_file_path)

    for instance, gold, prediction in tqdm.tqdm(
            zip(dataset.instances, golds, predictions)):
        fields = instance.fields
        if "targets" in fields:
            verb_index = fields["targets"].labels.index(1)
        elif "verb_indicator" in fields:
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None
        else:
            verb_index = None

        frame = None
        if "frame" in fields:
            frame = fields["frame"].tokens[0].text
        gf = None
        if "gf" in fields:
            gf = [g.text for g in fields["gf"].tokens]
        pt = None
        if "pt" in fields:
            pt = [p.text for p in fields["pt"].tokens]

        sentence = [token.text for token in fields["tokens"].tokens]

        gold_tags = convert_spans_to_seq(gold, len(sentence))
        predicted_tags = convert_spans_to_seq(prediction, len(sentence))
        assert len(sentence) == len(gold_tags) == len(predicted_tags)

        write_to_conll_eval_file(
            prediction_file,
            #  gold_file,
            verb_index,
            sentence,
            predicted_tags,
            gold_tags,
            frame,
            gf,
            pt)

    return model.get_metrics()
Example #23
def evaluate(model: Model, instances: Iterable[Instance],
             data_iterator: DataIterator, cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0
        # ksk
        total_probs, all_example_ids = [], []

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight
                # ksk
                if 'probs' in output_dict:
                    total_probs.extend(output_dict['probs'])
                    all_example_ids.extend([
                        batch['metadata'][batch_index]['example_ids']
                        for batch_index in range(len(batch['metadata']))
                    ])

            if (not HasBeenWarned.tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join([
                "%s: %.2f" % (name, value)
                for name, value in metrics.items() if not name.startswith("_")
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " +
                    "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight
            # ksk
            if 'probs' in output_dict:
                total_probs.extend(output_dict['probs'])
                all_example_ids.extend([
                    batch['metadata'][batch_index]['example_ids']
                    for batch_index in range(len(batch['metadata']))
                ])
                final_metrics["probs"] = total_probs
                final_metrics["example_ids"] = all_example_ids

        return final_metrics
Example #24
def evaluate(
    model: Model, data_loader: DataLoader, cuda_device: int = -1, batch_weight_key: str = None,
) -> Dict[str, Any]:
    """
    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already contain
        their data).
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using this
        device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to weight
        the loss for that batch.  If this is not given, we use a weight of 1 for every batch.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                metric_name.startswith("_") for metric_name in metrics
            ):
                logger.warning(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar."
                )
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (
                ", ".join(
                    [
                        "%s: %.2f" % (name, value)
                        for name, value in metrics.items()
                        if not name.startswith("_")
                    ]
                )
                + " ||"
            )
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " + "produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
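
A standalone illustration of the weighted-loss bookkeeping that batch_weight_key enables above; the per-batch losses and weights are invented for the example.

batches = [(0.50, 2.0), (0.30, 1.0), (0.20, 1.0)]            # hypothetical (loss, weight) pairs, one per batch
total_loss = sum(loss * weight for loss, weight in batches)
total_weight = sum(weight for _, weight in batches)
print(total_loss / total_weight)                              # 0.375, the weighted average reported as "loss"
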
Example #25
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
Example #26
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        runtime = []
        prev_time = time.time()
        for batch in generator_tqdm:
            batch_count += 1
            #if batch_count == 1000:
            #   runtime = np.asarray(runtime)
            #   print("Mean:", np.mean(runtime))
            #   print("Std:", np.std(runtime))
            #   exit(-1)
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not HasBeenWarned.tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)
            runtime.append(time.time() - prev_time)
            prev_time = time.time()

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics