예제 #1
0
def convert_interactions_to_examples(converter,
                                     tables_and_queries,
                                     filename="test.tfrecord"):
    """Converts (table, queries) pairs to TF examples and writes a TFRecord.

    Args:
        converter: Tapas converter exposing ``convert(interaction, index)``
            which returns a serializable example proto.
        tables_and_queries: iterable of ``(table, queries)`` pairs where
            ``table`` is a list of rows (first row = column headers, each row
            a list of cell strings) and ``queries`` is a list of question
            strings.
        filename: output file name; the file is created under ``temp/``.

    Returns:
        The path of the written TFRecord file.
    """
    filename = os.path.join("temp", filename)
    with tf.io.TFRecordWriter(filename) as writer:
        for idx, (table, queries) in enumerate(tables_and_queries):
            interaction = interaction_pb2.Interaction()
            for position, query in enumerate(queries):
                question = interaction.questions.add()
                question.original_text = query
                question.id = f"{idx}-0_{position}"
            for header in table[0]:
                interaction.table.columns.add().text = header
            for line in table[1:]:
                row = interaction.table.rows.add()
                for cell in line:
                    row.cells.add().text = cell
            # BUG FIX: annotation and writing previously ran AFTER the loop,
            # so only the last interaction was ever converted/written; every
            # earlier interaction was built and silently discarded.
            number_annotation_utils.add_numeric_values(interaction)
            for i in range(len(interaction.questions)):
                try:
                    writer.write(
                        converter.convert(interaction, i).SerializeToString())
                except ValueError as e:
                    print(
                        f"Can't convert interaction: {interaction.id} error: {e}")
    return filename
예제 #2
0
 def test_convert(self):
     """End-to-end classifier conversion on a small numeric table."""
     seq_len = 12
     with tempfile.TemporaryDirectory() as input_dir:
         vocab_file = os.path.join(input_dir, 'vocab.txt')
         _create_vocab(vocab_file, range(10))
         config = tf_example_utils.ClassifierConversionConfig(
             vocab_file=vocab_file,
             max_seq_length=seq_len,
             max_column_id=seq_len,
             max_row_id=seq_len,
             strip_column_names=False,
             add_aggregation_candidates=False,
         )
         converter = tf_example_utils.ToClassifierTensorflowExample(
             config=config)
         # Two data rows under columns A/B/C; cell values are single digits.
         table_rows = [('0', '4', '5'), ('1', '3', '5')]
         interaction = interaction_pb2.Interaction(
             table=interaction_pb2.Table(
                 columns=[
                     interaction_pb2.Cell(text=header)
                     for header in ('A', 'B', 'C')
                 ],
                 rows=[
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text=value) for value in row
                     ]) for row in table_rows
                 ],
             ),
             questions=[
                 interaction_pb2.Question(id='id', original_text='2')
             ],
         )
         number_annotation_utils.add_numeric_values(interaction)
         example = converter.convert(interaction, 0)
         logging.info(example)
         self.assertEqual(_get_int_feature(example, 'input_ids'),
                          [2, 8, 3, 1, 1, 1, 6, 10, 11, 7, 9, 11])
         self.assertEqual(_get_int_feature(example, 'row_ids'),
                          [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2])
         self.assertEqual(_get_int_feature(example, 'column_ids'),
                          [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3])
         self.assertEqual(_get_int_feature(example, 'column_ranks'),
                          [0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 1])
         self.assertEqual(_get_int_feature(example, 'numeric_relations'),
                          [0, 0, 0, 0, 0, 0, 4, 2, 2, 4, 2, 2])
         self.assertEqual(
             _get_float_feature(example, 'question_numeric_values'),
             _clean_nans([2.0] + [_NAN] * (_MAX_NUMERIC_VALUES - 1)))
예제 #3
0
 def test_convert_with_trimmed_cell(self):
     """Cells are trimmed to cell_trim_length and overflow rows dropped."""
     seq_len = 16
     with tempfile.TemporaryDirectory() as input_dir:
         vocab_file = os.path.join(input_dir, 'vocab.txt')
         _create_vocab(vocab_file, range(10))
         config = tf_example_utils.ClassifierConversionConfig(
             vocab_file=vocab_file,
             max_seq_length=seq_len,
             max_column_id=seq_len,
             max_row_id=seq_len,
             strip_column_names=False,
             add_aggregation_candidates=False,
             cell_trim_length=2,
             drop_rows_to_fit=True)
         converter = tf_example_utils.ToClassifierTensorflowExample(
             config=config)
         # Headers grow from 1 to 4 tokens; every data cell is 3 tokens.
         interaction = interaction_pb2.Interaction(
             table=interaction_pb2.Table(
                 columns=[
                     interaction_pb2.Cell(text=header)
                     for header in ('A', 'A A', 'A A A A')
                 ],
                 rows=[
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text='A A A')
                         for _ in range(3)
                     ]) for _ in range(2)
                 ],
             ),
             questions=[
                 interaction_pb2.Question(id='id', original_text='A')
             ],
         )
         number_annotation_utils.add_numeric_values(interaction)
         example = converter.convert(interaction, 0)
         logging.info(example)
         # We expect the second row to be dropped all cells should be trimmed to
         # >= 2 tokens.
         self.assertEqual(_get_int_feature(example, 'column_ids'),
                          [0, 0, 0, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 0, 0])
예제 #4
0
    def test_get_empty_example(self):
        """Converts a retrieval interaction loaded from pbtxt test data."""
        max_seq_length = 15

        # Load the fixture interaction and annotate its numeric values.
        pbtxt_path = os.path.join(self.test_data_dir,
                                  'retrieval_interaction.pbtxt')
        with open(pbtxt_path) as pbtxt_file:
            interaction = text_format.ParseLines(pbtxt_file,
                                                 interaction_pb2.Interaction())
        number_annotation_utils.add_numeric_values(interaction)

        vocab_tokens = [
            'by', 'created', 'do', 'dragon', 'go', 'hannibal', 'harris',
            'in', 'lecter', 'movies', 'novels', 'order', 'original', 'red',
            'the', 'thomas', 'what', 'work'
        ]
        with tempfile.TemporaryDirectory() as temp_dir:
            vocab_path = os.path.join(temp_dir, 'vocab.txt')
            _create_vocab(vocab_path, vocab_tokens)
            retrieval_config = tf_example_utils.RetrievalConversionConfig(
                vocab_file=vocab_path,
                max_seq_length=max_seq_length,
                max_column_id=max_seq_length,
                max_row_id=max_seq_length,
                strip_column_names=False,
            )
            converter = tf_example_utils.ToRetrievalTensorflowExample(
                config=retrieval_config)
            example = converter.convert(interaction,
                                        index=0,
                                        negative_example=None)
            logging.info(example)
            # Check the question.
            self.assertEqual(
                _get_int_feature(example, 'question_input_ids'),
                [2, 22, 17, 8, 20, 11, 14, 15, 10, 13, 3, 0, 0, 0, 0])
            self.assertEqual(_get_int_feature(example, 'question_input_mask'),
                             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0])
            # Check document title + table.
            self.assertEqual(
                _get_int_feature(example, 'input_ids'),
                [2, 11, 14, 3, 7, 6, 18, 23, 16, 21, 12, 19, 9, 19, 9])
            self.assertEqual(_get_int_feature(example, 'input_mask'),
                             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
            self.assertEqual(_get_int_feature(example, 'segment_ids'),
                             [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
예제 #5
0
def convert_interactions_to_examples(tables_and_queries):
    """Yields one converted example per question of each (table, queries) pair.

    NOTE(review): ``converter`` is a free variable — it must exist at module
    level before this generator is consumed (notebook-style code); confirm
    against the surrounding file.
    """
    for idx, (table, queries) in enumerate(tables_and_queries):
        interaction = interaction_pb2.Interaction()
        # One question proto per query; ids encode table index and position.
        for position, query in enumerate(queries):
            question_proto = interaction.questions.add()
            question_proto.original_text = query
            question_proto.id = f"{idx}-0_{position}"
        # First table row holds the column headers.
        for header_text in table[0]:
            interaction.table.columns.add().text = header_text
        # Remaining rows are data cells.
        for data_row in table[1:]:
            row_proto = interaction.table.rows.add()
            for cell_text in data_row:
                row_proto.cells.add().text = cell_text
        number_annotation_utils.add_numeric_values(interaction)
        for question_index in range(len(interaction.questions)):
            try:
                yield converter.convert(interaction, question_index)
            except ValueError as e:
                print(f"Can't convert interaction: {interaction.id} error: {e}")
예제 #6
0
def _create_examples(
    interaction_dir,
    example_dir,
    vocab_file,
    filename,
    batch_size,
    test_mode,
):
  """Creates TF examples for a single dataset.

  Reads interactions from ``<interaction_dir>/<filename>.tfrecord``, converts
  every question to a classifier TF example, optionally pads the example list
  to a multiple of ``batch_size``, and writes the result to
  ``<example_dir>/<filename>.tfrecord``.

  Args:
    interaction_dir: Directory containing the input interaction TFRecord.
    example_dir: Directory the converted example TFRecord is written to.
    vocab_file: Vocabulary file used by the Tapas converter.
    filename: Base name (without extension) shared by input and output files.
    batch_size: If set, pad with empty examples until the count is divisible
      by it (otherwise trailing examples would be dropped on TPU); if None,
      shuffle the examples instead.
    test_mode: If True, stop after collecting ~100 examples (fast smoke run).
  """
  # BUG FIX: `filename` was previously overwritten with a constant
  # placeholder string, silently ignoring the `filename` argument.
  filename = f'{filename}.tfrecord'
  interaction_path = os.path.join(interaction_dir, filename)
  example_path = os.path.join(example_dir, filename)

  config = tf_example_utils.ClassifierConversionConfig(
      vocab_file=vocab_file,
      max_seq_length=FLAGS.max_seq_length,
      max_column_id=_MAX_TABLE_ID,
      max_row_id=_MAX_TABLE_ID,
      strip_column_names=False,
      add_aggregation_candidates=False,
  )
  converter = tf_example_utils.ToClassifierTensorflowExample(config)

  examples = []
  num_questions = 0
  num_conversion_errors = 0
  for interaction in prediction_utils.iterate_interactions(interaction_path):
    number_annotation_utils.add_numeric_values(interaction)
    for i in range(len(interaction.questions)):
      num_questions += 1
      try:
        examples.append(converter.convert(interaction, i))
      except ValueError as e:
        num_conversion_errors += 1
        logging.info("Can't convert interaction: %s error: %s", interaction.id,
                     e)
    if test_mode and len(examples) >= 100:
      break

  _print(f'Processed: {filename}')
  _print(f'Num questions processed: {num_questions}')
  _print(f'Num examples: {len(examples)}')
  _print(f'Num conversion errors: {num_conversion_errors}')

  if batch_size is None:
    random.shuffle(examples)
  else:
    # Make sure the eval sets are divisible by the test batch size since
    # otherwise examples will be dropped on TPU.
    # These examples will later be ignored when writing the predictions.
    original_num_examples = len(examples)
    while len(examples) % batch_size != 0:
      examples.append(converter.get_empty_example())
    if original_num_examples != len(examples):
      _print(f'Padded with {len(examples) - original_num_examples} examples.')

  with tf.io.TFRecordWriter(
      example_path,
      options=_to_tf_compression_type(FLAGS.compression_type),
  ) as writer:
    for example in examples:
      writer.write(example.SerializeToString())
예제 #7
0
def add_numeric_values_fn(element):
    """Returns a (key, interaction) pair with numeric-value annotations.

    The input interaction is never mutated; the annotation runs on a fresh
    copy, making this safe to use as a beam-style map function.
    """
    key, interaction = element
    annotated = interaction_pb2.Interaction()
    annotated.CopyFrom(interaction)
    number_annotation_utils.add_numeric_values(annotated)
    return key, annotated
예제 #8
0
    def test_convert_with_token_selection(self):
        """Conversion honors a TableSelection extension on the question.

        Only the table tokens listed in ``selected_tokens`` are kept in the
        example; the remaining positions are zero-padded to max_seq_length.
        """
        max_seq_length = 12
        with tempfile.TemporaryDirectory() as input_dir:
            vocab_file = os.path.join(input_dir, 'vocab.txt')
            _create_vocab(vocab_file, range(10))
            converter = tf_example_utils.ToClassifierTensorflowExample(
                config=tf_example_utils.ClassifierConversionConfig(
                    vocab_file=vocab_file,
                    max_seq_length=max_seq_length,
                    max_column_id=max_seq_length,
                    max_row_id=max_seq_length,
                    strip_column_names=False,
                    add_aggregation_candidates=False,
                ))
            # Two-row table with two tokens per cell; headers A/B/C.
            interaction = interaction_pb2.Interaction(
                table=interaction_pb2.Table(
                    columns=[
                        interaction_pb2.Cell(text='A'),
                        interaction_pb2.Cell(text='B'),
                        interaction_pb2.Cell(text='C'),
                    ],
                    rows=[
                        interaction_pb2.Cells(cells=[
                            interaction_pb2.Cell(text='0 6'),
                            interaction_pb2.Cell(text='4 7'),
                            interaction_pb2.Cell(text='5 6'),
                        ]),
                        interaction_pb2.Cells(cells=[
                            interaction_pb2.Cell(text='1 7'),
                            interaction_pb2.Cell(text='3 6'),
                            interaction_pb2.Cell(text='5 5'),
                        ]),
                    ],
                ),
                questions=[
                    interaction_pb2.Question(id='id', original_text='2')
                ],
            )
            # (row, column, token) coordinates of the table tokens to keep.
            # Row index 0 appears to address the header row, rows 1-2 the
            # data rows — consistent with the expected row_ids below.
            table_coordinates = []
            for r, c, t in [(0, 0, 0), (1, 0, 0), (1, 2, 0), (2, 0, 0),
                            (2, 2, 0), (2, 2, 1)]:
                table_coordinates.append(
                    table_selection_pb2.TableSelection.TokenCoordinates(
                        row_index=r, column_index=c, token_index=t))
            # Attach the selection to the question via its proto extension.
            interaction.questions[0].Extensions[
                table_selection_pb2.TableSelection.
                table_selection_ext].CopyFrom(
                    table_selection_pb2.TableSelection(
                        selected_tokens=table_coordinates))

            number_annotation_utils.add_numeric_values(interaction)
            example = converter.convert(interaction, 0)
            logging.info(example)
            # Only 6 selected table tokens survive; positions 9-11 are padding.
            self.assertEqual(_get_int_feature(example, 'input_ids'),
                             [2, 8, 3, 1, 6, 11, 7, 11, 11, 0, 0, 0])
            self.assertEqual(_get_int_feature(example, 'row_ids'),
                             [0, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0])
            self.assertEqual(_get_int_feature(example, 'column_ids'),
                             [0, 0, 0, 1, 1, 3, 1, 3, 3, 0, 0, 0])
            self.assertEqual(_get_int_feature(example, 'column_ranks'),
                             [0, 0, 0, 0, 1, 1, 2, 1, 1, 0, 0, 0])
            self.assertEqual(_get_int_feature(example, 'numeric_relations'),
                             [0, 0, 0, 0, 4, 2, 4, 2, 2, 0, 0, 0])
예제 #9
0
 def test_convert_with_negative_tables(self):
     """Retrieval conversion packs a positive and a negative table together.

     The example concatenates the (question, positive table) segment with a
     (question, negative table) segment; features for both tables share one
     sequence, as the expected vectors below show.
     """
     max_seq_length = 12
     with tempfile.TemporaryDirectory() as input_dir:
         vocab_file = os.path.join(input_dir, 'vocab.txt')
         _create_vocab(vocab_file, range(10))
         converter = tf_example_utils.ToRetrievalTensorflowExample(
             config=tf_example_utils.RetrievalConversionConfig(
                 vocab_file=vocab_file,
                 max_seq_length=max_seq_length,
                 max_column_id=max_seq_length,
                 max_row_id=max_seq_length,
                 strip_column_names=False,
             ))
         # Positive table: 3 columns x 2 rows, two tokens per cell.
         interaction = interaction_pb2.Interaction(
             table=interaction_pb2.Table(
                 columns=[
                     interaction_pb2.Cell(text='A'),
                     interaction_pb2.Cell(text='B'),
                     interaction_pb2.Cell(text='C'),
                 ],
                 rows=[
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text='0 6'),
                         interaction_pb2.Cell(text='4 7'),
                         interaction_pb2.Cell(text='5 6'),
                     ]),
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text='1 7'),
                         interaction_pb2.Cell(text='3 6'),
                         interaction_pb2.Cell(text='5 5'),
                     ]),
                 ],
                 table_id='table_0',
             ),
             questions=[
                 interaction_pb2.Question(
                     id='id',
                     original_text='2',
                 )
             ],
         )
         number_annotation_utils.add_numeric_values(interaction)
         # Negative (non-matching) table: smaller, 2 columns x 2 rows.
         n_table = interaction_pb2.Table(
             columns=[
                 interaction_pb2.Cell(text='A'),
                 interaction_pb2.Cell(text='B'),
             ],
             rows=[
                 interaction_pb2.Cells(cells=[
                     interaction_pb2.Cell(text='0 6'),
                     interaction_pb2.Cell(text='4 7'),
                 ]),
                 interaction_pb2.Cells(cells=[
                     interaction_pb2.Cell(text='1 7'),
                     interaction_pb2.Cell(text='3 6'),
                 ]),
             ],
             table_id='table_1',
         )
         number_annotation_utils.add_numeric_table_values(n_table)
         n_example = _NegativeRetrievalExample()
         n_example.table.CopyFrom(n_table)
         n_example.score = -82.0
         n_example.rank = 1
         example = converter.convert(interaction, 0, n_example)
         logging.info(example)
         # Positions 0-11 belong to the positive table's segment, 12-20 to
         # the negative table's, 21-23 are padding (see input_mask).
         self.assertEqual(_get_int_feature(example, 'input_ids'), [
             2, 5, 3, 1, 1, 1, 6, 10, 11, 7, 9, 11, 2, 5, 3, 1, 1, 6, 10, 7,
             9, 0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'row_ids'), [
             0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 2, 2,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'column_ids'), [
             0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 0, 0, 0, 1, 2, 1, 2, 1, 2,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'segment_ids'), [
             0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'input_mask'), [
             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'inv_column_ranks'), [
             0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 1, 1, 2,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'column_ranks'), [
             0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 1,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'numeric_relations'), [
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0
         ])
         # Both table ids are hashed into the example.
         self.assertEqual(_get_int_feature(example, 'table_id_hash'),
                          [911224864, 1294380046])
         # Header tokens carry NaN; data cells carry their numeric value.
         self.assertEqual(_get_float_feature(example, 'numeric_values'), [
             'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 0.0, 4.0, 5.0, 1.0,
             3.0, 5.0, 'nan', 'nan', 'nan', 'nan', 'nan', 0.0, 4.0, 1.0,
             3.0, 'nan', 'nan', 'nan'
         ])
         self.assertEqual(
             _get_float_feature(example, 'numeric_values_scale'), [
                 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
             ])
         self.assertEqual([
             i.decode('utf-8')
             for i in _get_byte_feature(example, 'table_id')
         ], ['table_0', 'table_1'])