Example #1
0
 def test_get_question_id(self, example_id):
     """Round-trips an example id through the sequence/question id helpers."""
     seq_id = text_utils.get_sequence_id(example_id, "0")
     q_id = text_utils.get_question_id(seq_id, 0)
     parsed_example_id, parsed_annotator, parsed_position = (
         text_utils.parse_question_id(q_id))
     self.assertEqual(parsed_example_id, example_id)
     self.assertEqual(parsed_annotator, "0")
     self.assertEqual(parsed_position, 0)
Example #2
0
def _write_prediction(
    prediction,
    cell_classification_threshold,
    do_model_aggregation,
    do_model_classification,
    writer,
):
  """Writes a single prediction to TSV.

  Args:
    prediction: Mapping holding the model outputs for one example.
    cell_classification_threshold: Minimum cell probability for selection.
    do_model_aggregation: Whether to emit the aggregation columns.
    do_model_classification: Whether to emit the classification columns.
    writer: csv.DictWriter receiving the row.
  """
  question_id = _get_question_id(prediction)
  width = prediction["column_ids"].max()
  height = prediction["row_ids"].max()

  # Fully padded examples carry no table content; drop them.
  if (width == 0 and height == 0 and
      question_id == text_utils.get_padded_question_id()):
    logging.info("Removing padded example: %s", question_id)
    return

  cell_probs = get_mean_cell_probs(prediction)

  answers = _get_token_answers(
      prediction,
      cell_classification_threshold,
  )

  # Select the answers above a classification threshold.
  answer_coordinates = []
  for column in range(width):
    for row in range(height):
      prob = cell_probs.get((column, row))
      if prob is not None and prob > cell_classification_threshold:
        answer_coordinates.append(str((row, column)))

  try:
    example_id, annotator, position = text_utils.parse_question_id(question_id)
    position = str(position)
  except ValueError:
    # Ids that do not follow the canonical format get placeholder fields.
    example_id = annotator = position = "_"
  row_to_write = {
      "question_id": question_id,
      "id": example_id,
      "annotator": annotator,
      "position": position,
      "answer_coordinates": str(answer_coordinates),
      "answers": token_answers_to_text(answers),
  }
  if do_model_aggregation:
    row_to_write["gold_aggr"] = str(prediction["gold_aggr"][0])
    row_to_write["pred_aggr"] = str(prediction["pred_aggr"])
  if do_model_classification:
    row_to_write["gold_cls"] = str(prediction["gold_cls"][0])
    row_to_write["pred_cls"] = str(prediction["pred_cls"])
    row_to_write["logits_cls"] = str(prediction["logits_cls"])
  writer.writerow(row_to_write)
Example #3
0
def compute_prediction_sequence(estimator, examples_by_position):
    """Computes predictions using model's answers to the previous questions.

    Questions at position N are answered only after the model's cell
    selections at position N-1 have been written back into each example's
    "prev_label_ids" feature.

    Args:
        estimator: Estimator-like object exposing `predict(input_fn=...)`.
        examples_by_position: Mapping from question position to a dict of
            example_id -> feature dict (see `read_classifier_dataset`).

    Returns:
        A list with one prediction dict per example, across all positions.
    """
    all_results = []
    prev_answers = None

    for position in range(len(examples_by_position)):
        # Deep copy so mutating "prev_label_ids" below does not alter the
        # caller's feature dicts.
        examples = copy.deepcopy(examples_by_position[position])
        if prev_answers is not None:
            for example_id in examples:
                coords_to_answer = prev_answers[example_id]
                example = examples[example_id]
                prev_label_ids = example["prev_label_ids"]
                model_label_ids = np.zeros_like(prev_label_ids)
                for i in range(model_label_ids.shape[1]):
                    # row/column ids appear to be 1-based with 0 marking
                    # non-table tokens, hence the -1 shift and >= 0 checks.
                    # segment_id == 1 presumably selects table tokens — TODO
                    # confirm against the feature builder.
                    row_id = example["row_ids"][0, i] - 1
                    col_id = example["column_ids"][0, i] - 1
                    if row_id >= 0 and col_id >= 0 and example["segment_ids"][
                            0, i] == 1:
                        model_label_ids[0, i] = int(coords_to_answer[(col_id,
                                                                      row_id)])
                examples[example_id]["prev_label_ids"] = model_label_ids

        results = list(
            estimator.predict(
                input_fn=_get_in_memory_input_fn(examples.values())))
        all_results.extend(results)

        # Turn this position's token probabilities into per-cell boolean
        # answers to feed into the next position.
        prev_answers = {}
        for prediction in results:
            question_id = prediction["question_id"][0].decode("utf-8")
            table_id, annotator, _ = text_utils.parse_question_id(question_id)
            example_id = (table_id, annotator)
            example = examples[example_id]
            probabilities = prediction["probabilities"]

            # Compute average probability per cell, aggregating over tokens.
            coords_to_probs = collections.defaultdict(list)
            for i, p in enumerate(probabilities):
                segment_id = prediction["segment_ids"][i]
                col = prediction["column_ids"][i] - 1
                row = prediction["row_ids"][i] - 1
                if col >= 0 and row >= 0 and segment_id == 1:
                    coords_to_probs[(col, row)].append(p)

            # A cell counts as an answer when its mean probability > 0.5.
            coords_to_answer = {}
            for key in coords_to_probs:
                coords_to_answer[key] = np.array(
                    coords_to_probs[key]).mean() > 0.5
            prev_answers[example_id] = coords_to_answer

    return all_results
Example #4
0
def read_classifier_dataset(
    predict_data,
    data_format,
    compression_type,
    max_seq_length,
    max_predictions_per_seq,
    add_aggregation_function_id,
    add_classification_labels,
    add_answer,
):
    """Reads the classification dataset into memory as numpy arrays.

    Args:
        predict_data: File pattern(s) for the prediction input records.
        data_format: Record format, forwarded to `input_fn`.
        compression_type: Compression of the input files.
        max_seq_length: Maximum token sequence length per example.
        max_predictions_per_seq: Maximum masked predictions per sequence.
        add_aggregation_function_id: Whether records carry aggregation ids.
        add_classification_labels: Whether records carry class labels.
        add_answer: Whether records carry answers.

    Returns:
        Dict mapping question position -> {(table_id, annotator): example},
        where each example is the dict of numpy arrays produced by one
        session run.
    """
    dataset = tapas_classifier_model.input_fn(
        name="predict",
        file_patterns=predict_data,
        data_format=data_format,
        compression_type=compression_type,
        is_training=False,
        max_seq_length=max_seq_length,
        max_predictions_per_seq=max_predictions_per_seq,
        add_aggregation_function_id=add_aggregation_function_id,
        add_classification_labels=add_classification_labels,
        add_answer=add_answer,
        include_id=True,
        params={"batch_size": 1})
    # TF1-style one-shot iterator; batch_size is 1, so index [0, 0] below
    # addresses the single example in the batch.
    get_next = dataset.make_one_shot_iterator().get_next()

    examples_by_position = collections.defaultdict(dict)
    with tf.Session() as sess:
        try:
            # Drain the iterator; OutOfRangeError signals end of dataset.
            while True:
                example = sess.run(get_next)
                question_id = example["question_id"][0, 0].decode("utf-8")
                table_id, annotator, position = text_utils.parse_question_id(
                    question_id)
                example_id = (table_id, annotator)
                examples_by_position[position][example_id] = example
        except tf.errors.OutOfRangeError:
            pass
    return examples_by_position
Example #5
0
def _get_example_id(question_id):
  """Returns the example-id component of a parsed question id."""
  return text_utils.parse_question_id(question_id)[0]
Example #6
0
def write_predictions(
    predictions,
    output_predict_file,
    do_model_aggregation,
    do_model_classification,
    cell_classification_threshold,
):
    """Writes predictions to an output TSV file.

    Output columns: [question_id, id, annotator, position,
    answer_coordinates, answer], plus [gold_aggr, pred_aggr] when
    aggregation is enabled and [gold_cls, pred_cls, logits_cls] when
    classification is enabled.

    Args:
      predictions: model predictions.
      output_predict_file: Path for writing the predictions.
      do_model_aggregation: Indicates whether to write predicted aggregations.
      do_model_classification: Indicates whether to write predicted classes.
      cell_classification_threshold: Threshold for selecting a cell.
    """
    with tf.io.gfile.GFile(output_predict_file, "w") as write_file:
        header = [
            "question_id",
            "id",
            "annotator",
            "position",
            "answer_coordinates",
            "answer",
        ]
        if do_model_aggregation:
            header.extend(["gold_aggr", "pred_aggr"])
        if do_model_classification:
            header.extend(["gold_cls", "pred_cls", "logits_cls"])
        writer = csv.DictWriter(write_file, fieldnames=header, delimiter="\t")
        writer.writeheader()

        for prediction in predictions:
            question_id = _get_question_id(prediction)
            # column/row ids appear to be 1-based with 0 for padding, so a
            # 0/0 max indicates an all-padding example — TODO confirm.
            max_width = prediction["column_ids"].max()
            max_height = prediction["row_ids"].max()

            if (max_width == 0 and max_height == 0
                    and question_id == text_utils.get_padded_question_id()):
                logging.info("Removing padded example: %s", question_id)
                continue

            cell_coords_to_prob = get_mean_cell_probs(prediction)

            answer_indexes = get_answer_indexes(
                prediction,
                cell_classification_threshold,
            )

            # Select the answers above a classification threshold.
            answer_coordinates = []
            for col in range(max_width):
                for row in range(max_height):
                    cell_prob = cell_coords_to_prob.get((col, row), None)
                    if cell_prob is not None:
                        if cell_prob > cell_classification_threshold:
                            answer_coordinates.append(str((row, col)))

            try:
                example_id, annotator, position = text_utils.parse_question_id(
                    question_id)
                position = str(position)
            except ValueError:
                # Non-canonical question ids get placeholder columns.
                example_id = "_"
                annotator = "_"
                position = "_"
            prediction_to_write = {
                "question_id": question_id,
                "id": example_id,
                "annotator": annotator,
                "position": position,
                "answer_coordinates": str(answer_coordinates),
                "answer": str(answer_indexes),
            }
            if do_model_aggregation:
                prediction_to_write["gold_aggr"] = str(
                    prediction["gold_aggr"][0])
                prediction_to_write["pred_aggr"] = str(prediction["pred_aggr"])
            if do_model_classification:
                prediction_to_write["gold_cls"] = str(
                    prediction["gold_cls"][0])
                prediction_to_write["pred_cls"] = str(prediction["pred_cls"])
                prediction_to_write["logits_cls"] = str(
                    prediction["logits_cls"])
            writer.writerow(prediction_to_write)
Example #7
0
def compute_prediction_sequence(model: TAPAS, features: List[dict]):
    """Computes predictions using model's answers to the previous questions.

    Eager-mode variant: features are grouped by question position, and each
    position is answered only after the model's cell selections for the
    previous position have been written into "prev_label_ids".

    Args:
        model: The TAPAS model used by `get_outputs`.
        features: List of feature dicts, one per question; tensors are
            indexed as [batch, token] with batch size 1.

    Returns:
        A list with one prediction dict per feature, across all positions.
    """
    examples_by_position = collections.defaultdict(dict)
    for feature in features:
        question_id = feature["question_id"][0, 0].numpy().decode("utf-8")
        table_id, annotator, position = text_utils.parse_question_id(
            question_id)
        example_id = (table_id, annotator)
        examples_by_position[position][example_id] = feature

    all_results = []
    prev_answers = None

    for position in range(len(examples_by_position)):
        results = []
        # Deep copy so mutating "prev_label_ids" below does not alter the
        # caller's feature dicts.
        examples = copy.deepcopy(examples_by_position[position])
        if prev_answers is not None:
            for example_id in examples:
                coords_to_answer = prev_answers[example_id]
                example = examples[example_id]
                prev_label_ids = example["prev_label_ids"]
                model_label_ids = np.zeros_like(prev_label_ids)
                for i in range(model_label_ids.shape[1]):
                    # row/column ids appear to be 1-based with 0 marking
                    # non-table tokens, hence the -1 shift and >= 0 checks.
                    row_id = example["row_ids"][0, i].numpy() - 1
                    col_id = example["column_ids"][0, i].numpy() - 1
                    if row_id >= 0 and col_id >= 0 and example["segment_ids"][
                            0, i].numpy() == 1:
                        model_label_ids[0, i] = int(coords_to_answer[(col_id,
                                                                      row_id)])
                examples[example_id]["prev_label_ids"] = model_label_ids

        for example_id in examples:
            example = examples[example_id]
            result = get_outputs(model, example)
            results.append(result)
        all_results.extend(results)
        # Turn this position's token probabilities into per-cell boolean
        # answers to feed into the next position.
        prev_answers = {}
        for prediction in results:
            question_id = prediction["question_id"][0,
                                                    0].numpy().decode("utf-8")
            table_id, annotator, _ = text_utils.parse_question_id(question_id)
            example_id = (table_id, annotator)
            example = examples[example_id]
            probabilities = prediction["probabilities"][0].numpy()

            # Compute average probability per cell, aggregating over tokens.
            coords_to_probs = collections.defaultdict(list)
            for i, p in enumerate(probabilities):
                segment_id = prediction["segment_ids"][0][i].numpy()
                col = prediction["column_ids"][0][i].numpy() - 1
                row = prediction["row_ids"][0][i].numpy() - 1
                if col >= 0 and row >= 0 and segment_id == 1:
                    coords_to_probs[(col, row)].append(p)

            # A cell counts as an answer when its mean probability > 0.5.
            coords_to_answer = {}
            for key in coords_to_probs:
                coords_to_answer[key] = np.array(
                    coords_to_probs[key]).mean() > 0.5
            prev_answers[example_id] = coords_to_answer

    return all_results
Example #8
0
def get_predictions(
    predictions,
    do_model_aggregation,
    do_model_classification,
    cell_classification_threshold,
):
    """Converts model predictions into a list of result dicts.

    Unlike `write_predictions`, nothing is written to disk. Each returned
    dict has keys [question_id, id, annotator, position, answer_coordinates,
    answer, answer_probablities], plus pred_aggr when aggregation is enabled
    and pred_cls / logits_cls when classification is enabled.

    Args:
      predictions: model predictions (eager tensors, batch size 1).
      do_model_aggregation: Indicates whether to include predicted
        aggregations.
      do_model_classification: Indicates whether to include predicted
        classes.
      cell_classification_threshold: Threshold for selecting a cell.

    Returns:
      A list of dicts, one per non-padded prediction.
    """
    results = []
    for prediction in predictions:
        question_id = _get_question_id(prediction)
        max_width = prediction["column_ids"][0].numpy().max()
        max_height = prediction["row_ids"][0].numpy().max()

        # A 0/0 extent together with the padded question id marks an
        # all-padding example; skip it.
        if (max_width == 0 and max_height == 0
                and question_id == text_utils.get_padded_question_id()):
            logging.info("Removing padded example: %s", question_id)
            continue

        cell_coords_to_prob = get_mean_cell_probs(prediction)

        answer_indexes = get_answer_indexes(
            prediction,
            cell_classification_threshold,
        )

        # Select the answers above a classification threshold.
        answer_coordinates = []
        answer_probablities = []
        for col in range(max_width):
            for row in range(max_height):
                cell_prob = cell_coords_to_prob.get((col, row), None)
                if (cell_prob is not None
                        and cell_prob > cell_classification_threshold):
                    answer_coordinates.append(str((row, col)))
                    answer_probablities.append(cell_prob)

        try:
            example_id, annotator, position = text_utils.parse_question_id(
                question_id)
            position = str(position)
        except ValueError:
            # Non-canonical question ids get placeholder columns.
            example_id = "_"
            annotator = "_"
            position = "_"
        prediction_to_write = {
            "question_id": question_id,
            "id": example_id,
            "annotator": annotator,
            "position": position,
            "answer_coordinates": str(answer_coordinates),
            "answer": str(answer_indexes),
            # Key spelling kept for backward compatibility with consumers.
            "answer_probablities": answer_probablities or [0.0],
        }
        if do_model_aggregation:
            prediction_to_write["pred_aggr"] = str(
                prediction["pred_aggr"][0].numpy())
        if do_model_classification:
            prediction_to_write["pred_cls"] = str(prediction["pred_cls"])
            prediction_to_write["logits_cls"] = str(prediction["logits_cls"])
        results.append(prediction_to_write)
    return results