Exemplo n.º 1
0
 def start_bundle(self):
   """Initializes the TF-example converter for this bundle.

   Raises:
     ValueError: If the configured implementation is not the Python one.
   """
   impl = ConverterImplType(self._convert_impl_value)
   # Guard clause: only the Python converter implementation is supported.
   if impl != ConverterImplType.PYTHON:
     raise ValueError(f'Unsupported implementation: {impl.name}')
   self._converter = tf_example_utils.ToClassifierTensorflowExample(
       self._config)
Exemplo n.º 2
0
 def test_convert(self):
     """Converts a small numeric table and checks every emitted feature."""
     seq_len = 12
     with tempfile.TemporaryDirectory() as input_dir:
         vocab_file = os.path.join(input_dir, 'vocab.txt')
         _create_vocab(vocab_file, range(10))
         config = tf_example_utils.ClassifierConversionConfig(
             vocab_file=vocab_file,
             max_seq_length=seq_len,
             max_column_id=seq_len,
             max_row_id=seq_len,
             strip_column_names=False,
             add_aggregation_candidates=False,
         )
         converter = tf_example_utils.ToClassifierTensorflowExample(
             config=config)

         def _row(texts):
             # One table row built from its cell texts.
             return interaction_pb2.Cells(
                 cells=[interaction_pb2.Cell(text=t) for t in texts])

         interaction = interaction_pb2.Interaction(
             table=interaction_pb2.Table(
                 columns=[interaction_pb2.Cell(text=t) for t in 'ABC'],
                 rows=[_row(['0', '4', '5']), _row(['1', '3', '5'])],
             ),
             questions=[
                 interaction_pb2.Question(id='id', original_text='2')
             ],
         )
         number_annotation_utils.add_numeric_values(interaction)
         example = converter.convert(interaction, 0)
         logging.info(example)
         expected_int_features = {
             'input_ids': [2, 8, 3, 1, 1, 1, 6, 10, 11, 7, 9, 11],
             'row_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2],
             'column_ids': [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3],
             'column_ranks': [0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 1],
             'numeric_relations': [0, 0, 0, 0, 0, 0, 4, 2, 2, 4, 2, 2],
         }
         for name, values in expected_int_features.items():
             self.assertEqual(_get_int_feature(example, name), values)
         self.assertEqual(
             _get_float_feature(example, 'question_numeric_values'),
             _clean_nans([2.0] + [_NAN] * (_MAX_NUMERIC_VALUES - 1)))
Exemplo n.º 3
0
 def test_convert_with_context_heading(self):
     """Checks input/label ids when title and context heading are enabled."""
     seq_len = 20
     with tempfile.TemporaryDirectory() as input_dir:
         vocab_file = os.path.join(input_dir, 'vocab.txt')
         _create_vocab(vocab_file, ['a', 'b', 'c', 'd', 'e'])
         config = tf_example_utils.ClassifierConversionConfig(
             vocab_file=vocab_file,
             max_seq_length=seq_len,
             max_column_id=seq_len,
             max_row_id=seq_len,
             strip_column_names=False,
             add_aggregation_candidates=False,
             use_document_title=True,
             use_context_title=True,
             update_answer_coordinates=True,
         )
         converter = tf_example_utils.ToClassifierTensorflowExample(
             config=config)
         cell = interaction_pb2.Cell
         interaction = interaction_pb2.Interaction(
             table=interaction_pb2.Table(
                 document_title='E E',
                 columns=[cell(text='A'), cell(text='A B C')],
                 rows=[
                     interaction_pb2.Cells(
                         cells=[cell(text='A B'), cell(text='A B C')]),
                 ],
                 context_heading='B',
             ),
             questions=[
                 interaction_pb2.Question(
                     id='id',
                     original_text='D',
                     answer=interaction_pb2.Answer(answer_texts=['B C']),
                 )
             ],
         )
         example = converter.convert(interaction, 0)
         logging.info(example)
         self.assertEqual(
             _get_int_feature(example, 'input_ids'),
             [2, 5, 3, 10, 10, 3, 7, 3, 6, 6, 7, 8, 6, 7, 6, 7, 8, 0, 0, 0])
         self.assertEqual(
             _get_int_feature(example, 'label_ids'),
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
Exemplo n.º 4
0
 def test_convert_with_trimmed_cell(self):
     """Cells are trimmed to cell_trim_length and overflow rows are dropped."""
     seq_len = 16
     with tempfile.TemporaryDirectory() as input_dir:
         vocab_file = os.path.join(input_dir, 'vocab.txt')
         _create_vocab(vocab_file, range(10))
         config = tf_example_utils.ClassifierConversionConfig(
             vocab_file=vocab_file,
             max_seq_length=seq_len,
             max_column_id=seq_len,
             max_row_id=seq_len,
             strip_column_names=False,
             add_aggregation_candidates=False,
             cell_trim_length=2,
             drop_rows_to_fit=True)
         converter = tf_example_utils.ToClassifierTensorflowExample(
             config=config)

         def _row():
             # A data row of three identical three-token cells.
             return interaction_pb2.Cells(cells=[
                 interaction_pb2.Cell(text='A A A') for _ in range(3)
             ])

         interaction = interaction_pb2.Interaction(
             table=interaction_pb2.Table(
                 columns=[
                     interaction_pb2.Cell(text='A'),
                     interaction_pb2.Cell(text='A A'),
                     interaction_pb2.Cell(text='A A A A'),
                 ],
                 rows=[_row(), _row()],
             ),
             questions=[
                 interaction_pb2.Question(id='id', original_text='A')
             ],
         )
         number_annotation_utils.add_numeric_values(interaction)
         example = converter.convert(interaction, 0)
         logging.info(example)
         # The second data row is dropped to fit, and every remaining cell
         # is trimmed to at most 2 tokens.
         self.assertEqual(_get_int_feature(example, 'column_ids'),
                          [0, 0, 0, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 0, 0])
Exemplo n.º 5
0
  def test_get_empty_example(self):
    """An empty padding example carries the reserved padded question id."""
    max_seq_length = 3

    with tempfile.TemporaryDirectory() as input_dir:
      vocab_file = os.path.join(input_dir, 'vocab.txt')
      _create_vocab(vocab_file, [])
      config = tf_example_utils.ClassifierConversionConfig(
          vocab_file=vocab_file,
          max_seq_length=max_seq_length,
          max_column_id=max_seq_length,
          max_row_id=max_seq_length,
          strip_column_names=False,
          add_aggregation_candidates=False,
      )
      converter = tf_example_utils.ToClassifierTensorflowExample(config=config)
      example = converter.get_empty_example()
      logging.info(example)
      raw_id = _get_byte_feature(example, 'question_id')[0]
      self.assertEqual(
          raw_id.decode('utf-8'), text_utils.get_padded_question_id())
Exemplo n.º 6
0
 def __init__(self,
              model_dir: Text,
              task: Text,
              tf_record_filename="test.tfrecord"):
     """Builds the TAPAS converter, config and model from a model directory.

     Args:
       model_dir: Directory containing `vocab.txt` and `bert_config.json`.
       task: Task name, forwarded to `get_config`.
       tf_record_filename: Name of the TF record file used by this instance.
     """
     self.tf_record_filename = tf_record_filename
     self.task = task
     vocab_file = os.path.join(model_dir, "vocab.txt")
     bert_config_file = os.path.join(model_dir, "bert_config.json")
     classifier_conversion_config = tf_example_utils.ClassifierConversionConfig(
         vocab_file=vocab_file,
         max_seq_length=_MAX_SEQ_LENGTH,
         max_column_id=_MAX_TABLE_ID,
         max_row_id=_MAX_TABLE_ID,
         strip_column_names=False,
         add_aggregation_candidates=False)
     self.converter = tf_example_utils.ToClassifierTensorflowExample(
         classifier_conversion_config)
     # Use a context manager so the config file handle is closed promptly;
     # the original `json.load(open(...))` leaked the file object.
     with open(bert_config_file) as bert_config_fp:
         self.bert_config = json.load(bert_config_fp)
     self.tapas_config = get_config(task, self.bert_config, model_dir)
     self.tapas = get_model(self.tapas_config, _MAX_SEQ_LENGTH)
Exemplo n.º 7
0
def _create_examples(
    interaction_dir,
    example_dir,
    vocab_file,
    filename,
    batch_size,
    test_mode,
):
  """Creates TF examples for a single dataset.

  Args:
    interaction_dir: Directory holding the input interaction TF record.
    example_dir: Directory the converted examples are written to.
    vocab_file: Path to the BERT vocabulary file.
    filename: Dataset name (without extension) inside both directories.
    batch_size: If set, pad the example list to a multiple of this size;
      if None, shuffle the examples instead.
    test_mode: If true, stop after roughly 100 examples for a quick run.
  """
  # NOTE(review): the scraped source had a placeholder-free f-string here
  # (`f'(unknown).tfrecord'`) that ignored the `filename` argument; the
  # obvious interpolation is restored — confirm against upstream.
  filename = f'{filename}.tfrecord'
  interaction_path = os.path.join(interaction_dir, filename)
  example_path = os.path.join(example_dir, filename)

  config = tf_example_utils.ClassifierConversionConfig(
      vocab_file=vocab_file,
      max_seq_length=FLAGS.max_seq_length,
      max_column_id=_MAX_TABLE_ID,
      max_row_id=_MAX_TABLE_ID,
      strip_column_names=False,
      add_aggregation_candidates=False,
  )
  converter = tf_example_utils.ToClassifierTensorflowExample(config)

  examples = []
  num_questions = 0
  num_conversion_errors = 0
  for interaction in prediction_utils.iterate_interactions(interaction_path):
    number_annotation_utils.add_numeric_values(interaction)
    for i in range(len(interaction.questions)):
      num_questions += 1

      # Conversion failures are counted but don't abort the whole dataset.
      try:
        examples.append(converter.convert(interaction, i))
      except ValueError as e:
        num_conversion_errors += 1
        logging.info("Can't convert interaction: %s error: %s", interaction.id,
                     e)
    if test_mode and len(examples) >= 100:
      break

  _print(f'Processed: {filename}')
  _print(f'Num questions processed: {num_questions}')
  _print(f'Num examples: {len(examples)}')
  _print(f'Num conversion errors: {num_conversion_errors}')

  if batch_size is None:
    random.shuffle(examples)
  else:
    # Make sure the eval sets are divisible by the test batch size since
    # otherwise examples will be dropped on TPU.
    # These examples will later be ignored when writing the predictions.
    original_num_examples = len(examples)
    while len(examples) % batch_size != 0:
      examples.append(converter.get_empty_example())
    if original_num_examples != len(examples):
      _print(f'Padded with {len(examples) - original_num_examples} examples.')

  with tf.io.TFRecordWriter(
      example_path,
      options=_to_tf_compression_type(FLAGS.compression_type),
  ) as writer:
    for example in examples:
      writer.write(example.SerializeToString())
Exemplo n.º 8
0
    def test_convert_with_token_selection(self):
        """Only the selected token coordinates survive the conversion."""
        seq_len = 12
        with tempfile.TemporaryDirectory() as input_dir:
            vocab_file = os.path.join(input_dir, 'vocab.txt')
            _create_vocab(vocab_file, range(10))
            config = tf_example_utils.ClassifierConversionConfig(
                vocab_file=vocab_file,
                max_seq_length=seq_len,
                max_column_id=seq_len,
                max_row_id=seq_len,
                strip_column_names=False,
                add_aggregation_candidates=False,
            )
            converter = tf_example_utils.ToClassifierTensorflowExample(
                config=config)

            def _row(texts):
                # One table row built from its cell texts.
                return interaction_pb2.Cells(
                    cells=[interaction_pb2.Cell(text=t) for t in texts])

            interaction = interaction_pb2.Interaction(
                table=interaction_pb2.Table(
                    columns=[interaction_pb2.Cell(text=t) for t in 'ABC'],
                    rows=[
                        _row(['0 6', '4 7', '5 6']),
                        _row(['1 7', '3 6', '5 5']),
                    ],
                ),
                questions=[
                    interaction_pb2.Question(id='id', original_text='2')
                ],
            )
            # Attach an explicit token selection to the first question.
            selection = table_selection_pb2.TableSelection(selected_tokens=[
                table_selection_pb2.TableSelection.TokenCoordinates(
                    row_index=r, column_index=c, token_index=t)
                for r, c, t in [(0, 0, 0), (1, 0, 0), (1, 2, 0), (2, 0, 0),
                                (2, 2, 0), (2, 2, 1)]
            ])
            interaction.questions[0].Extensions[
                table_selection_pb2.TableSelection
                .table_selection_ext].CopyFrom(selection)

            number_annotation_utils.add_numeric_values(interaction)
            example = converter.convert(interaction, 0)
            logging.info(example)
            expected_int_features = {
                'input_ids': [2, 8, 3, 1, 6, 11, 7, 11, 11, 0, 0, 0],
                'row_ids': [0, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0],
                'column_ids': [0, 0, 0, 1, 1, 3, 1, 3, 3, 0, 0, 0],
                'column_ranks': [0, 0, 0, 0, 1, 1, 2, 1, 1, 0, 0, 0],
                'numeric_relations': [0, 0, 0, 0, 4, 2, 4, 2, 2, 0, 0, 0],
            }
            for name, values in expected_int_features.items():
                self.assertEqual(_get_int_feature(example, name), values)
Exemplo n.º 9
0
    f.write('model_checkpoint_path: "model.ckpt-0"')
for suffix in ['.data-00000-of-00001', '.index', '.meta']:
    shutil.copyfile(f'tapas_sqa_base/model.ckpt{suffix}',
                    f'results/sqa/model/model.ckpt-0{suffix}')

    max_seq_length = 512
# Module-level conversion setup for the SQA base model: the column/row id
# limits reuse max_seq_length (presumably 512, set above — the surrounding
# scraped script is garbled, so verify against the original notebook).
vocab_file = "tapas_sqa_base/vocab.txt"
config = tf_example_utils.ClassifierConversionConfig(
    vocab_file=vocab_file,
    max_seq_length=max_seq_length,
    max_column_id=max_seq_length,
    max_row_id=max_seq_length,
    strip_column_names=False,
    add_aggregation_candidates=False,
)
# Shared converter instance reused by the helper functions below.
converter = tf_example_utils.ToClassifierTensorflowExample(config)


def convert_interactions_to_examples(tables_and_queries):
    """Calls Tapas converter to convert interaction to example."""
    for idx, (table, queries) in enumerate(tables_and_queries):
        interaction = interaction_pb2.Interaction()
        for position, query in enumerate(queries):
            question = interaction.questions.add()
            question.original_text = query
            question.id = f"{idx}-0_{position}"
        for header in table[0]:
            interaction.table.columns.add().text = header
        for line in table[1:]:
            row = interaction.table.rows.add()
            for cell in line:
 def start_bundle(self):
     """Creates the TF-example converter once per bundle."""
     config = self._config
     self._converter = tf_example_utils.ToClassifierTensorflowExample(config)