def convert_interactions_to_examples(converter, tables_and_queries, filename="test.tfrecord"):
  """Converts (table, queries) pairs to TF examples and writes a tfrecord.

  Args:
    converter: A Tapas example converter exposing `convert(interaction, i)`.
    tables_and_queries: Iterable of (table, queries) pairs, where `table` is a
      list of rows (first row is the header) and `queries` a list of strings.
    filename: Basename of the output tfrecord, written under "temp/".

  Returns:
    Path of the written tfrecord file.
  """
  filename = os.path.join("temp", filename)
  # TFRecordWriter does not create missing directories; make sure "temp/"
  # exists so the open below cannot fail on a fresh checkout.
  os.makedirs(os.path.dirname(filename), exist_ok=True)
  # Open the writer exactly once. Re-opening it per table (as before) would
  # truncate the file on every iteration, keeping only the last table's
  # examples.
  with tf.io.TFRecordWriter(filename) as writer:
    for idx, (table, queries) in enumerate(tables_and_queries):
      interaction = interaction_pb2.Interaction()
      for position, query in enumerate(queries):
        question = interaction.questions.add()
        question.original_text = query
        question.id = f"{idx}-0_{position}"
      # First row is the header; remaining rows are data cells.
      for header in table[0]:
        interaction.table.columns.add().text = header
      for line in table[1:]:
        row = interaction.table.rows.add()
        for cell in line:
          row.cells.add().text = cell
      number_annotation_utils.add_numeric_values(interaction)
      for i in range(len(interaction.questions)):
        try:
          writer.write(
              converter.convert(interaction, i).SerializeToString())
        except ValueError as e:
          print(
              f"Can't convert interaction: {interaction.id} error: {e}")
  return filename
def test_convert(self):
  """End-to-end check of classifier conversion on a small numeric table."""
  max_seq_length = 12
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    # Vocabulary containing the digit tokens '0'..'9'.
    _create_vocab(vocab_file, range(10))
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=tf_example_utils.ClassifierConversionConfig(
            vocab_file=vocab_file,
            max_seq_length=max_seq_length,
            max_column_id=max_seq_length,
            max_row_id=max_seq_length,
            strip_column_names=False,
            add_aggregation_candidates=False,
        ))
    # 3-column table with two data rows; the question text '2' is numeric
    # so numeric relations against the cells can be computed.
    interaction = interaction_pb2.Interaction(
        table=interaction_pb2.Table(
            columns=[
                interaction_pb2.Cell(text='A'),
                interaction_pb2.Cell(text='B'),
                interaction_pb2.Cell(text='C'),
            ],
            rows=[
                interaction_pb2.Cells(cells=[
                    interaction_pb2.Cell(text='0'),
                    interaction_pb2.Cell(text='4'),
                    interaction_pb2.Cell(text='5'),
                ]),
                interaction_pb2.Cells(cells=[
                    interaction_pb2.Cell(text='1'),
                    interaction_pb2.Cell(text='3'),
                    interaction_pb2.Cell(text='5'),
                ]),
            ],
        ),
        questions=[
            interaction_pb2.Question(id='id', original_text='2')
        ],
    )
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    # Token/row/column layouts for the serialized question + table.
    self.assertEqual(_get_int_feature(example, 'input_ids'),
                     [2, 8, 3, 1, 1, 1, 6, 10, 11, 7, 9, 11])
    self.assertEqual(_get_int_feature(example, 'row_ids'),
                     [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2])
    self.assertEqual(_get_int_feature(example, 'column_ids'),
                     [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3])
    self.assertEqual(_get_int_feature(example, 'column_ranks'),
                     [0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 1])
    self.assertEqual(_get_int_feature(example, 'numeric_relations'),
                     [0, 0, 0, 0, 0, 0, 4, 2, 2, 4, 2, 2])
    # The question's numeric value (2.0) is stored first; unused slots are
    # filled with NaN up to _MAX_NUMERIC_VALUES.
    self.assertEqual(
        _get_float_feature(example, 'question_numeric_values'),
        _clean_nans([2.0] + [_NAN] * (_MAX_NUMERIC_VALUES - 1)))
def test_convert_with_trimmed_cell(self):
  """Checks cell_trim_length and drop_rows_to_fit when the table overflows."""
  max_seq_length = 16
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, range(10))
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=tf_example_utils.ClassifierConversionConfig(
            vocab_file=vocab_file,
            max_seq_length=max_seq_length,
            max_column_id=max_seq_length,
            max_row_id=max_seq_length,
            strip_column_names=False,
            add_aggregation_candidates=False,
            # Cells longer than 2 tokens get trimmed; whole rows are dropped
            # when the sequence still does not fit.
            cell_trim_length=2,
            drop_rows_to_fit=True))
    # Multi-token cells ('A A A', etc.) force trimming/dropping.
    interaction = interaction_pb2.Interaction(
        table=interaction_pb2.Table(
            columns=[
                interaction_pb2.Cell(text='A'),
                interaction_pb2.Cell(text='A A'),
                interaction_pb2.Cell(text='A A A A'),
            ],
            rows=[
                interaction_pb2.Cells(cells=[
                    interaction_pb2.Cell(text='A A A'),
                    interaction_pb2.Cell(text='A A A'),
                    interaction_pb2.Cell(text='A A A'),
                ]),
                interaction_pb2.Cells(cells=[
                    interaction_pb2.Cell(text='A A A'),
                    interaction_pb2.Cell(text='A A A'),
                    interaction_pb2.Cell(text='A A A'),
                ]),
            ],
        ),
        questions=[
            interaction_pb2.Question(id='id', original_text='A')
        ],
    )
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    # We expect the second row to be dropped; all cells should be trimmed to
    # >= 2 tokens.
    self.assertEqual(_get_int_feature(example, 'column_ids'),
                     [0, 0, 0, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 0, 0])
def test_get_empty_example(self):
  """Converts a retrieval interaction loaded from test data.

  NOTE(review): despite the name, this exercises regular retrieval
  conversion (question + document title + table features), not an "empty
  example" — confirm whether the method name is stale.
  """
  max_seq_length = 15
  input_path = os.path.join(self.test_data_dir,
                            'retrieval_interaction.pbtxt')
  with open(input_path) as input_file:
    interaction = text_format.ParseLines(input_file,
                                         interaction_pb2.Interaction())
  number_annotation_utils.add_numeric_values(interaction)
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    # Word-level vocabulary matching the pbtxt's question/table text.
    _create_vocab(vocab_file, [
        'by', 'created', 'do', 'dragon', 'go', 'hannibal', 'harris', 'in',
        'lecter', 'movies', 'novels', 'order', 'original', 'red', 'the',
        'thomas', 'what', 'work'
    ])
    converter = tf_example_utils.ToRetrievalTensorflowExample(
        config=tf_example_utils.RetrievalConversionConfig(
            vocab_file=vocab_file,
            max_seq_length=max_seq_length,
            max_column_id=max_seq_length,
            max_row_id=max_seq_length,
            strip_column_names=False,
        ))
    example = converter.convert(interaction, index=0, negative_example=None)
    logging.info(example)
    # Check the question.
    self.assertEqual(
        _get_int_feature(example, 'question_input_ids'),
        [2, 22, 17, 8, 20, 11, 14, 15, 10, 13, 3, 0, 0, 0, 0])
    self.assertEqual(_get_int_feature(example, 'question_input_mask'),
                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0])
    # Check document title + table.
    self.assertEqual(
        _get_int_feature(example, 'input_ids'),
        [2, 11, 14, 3, 7, 6, 18, 23, 16, 21, 12, 19, 9, 19, 9])
    self.assertEqual(_get_int_feature(example, 'input_mask'),
                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    self.assertEqual(_get_int_feature(example, 'segment_ids'),
                     [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
def convert_interactions_to_examples(tables_and_queries):
  """Yields a TF example per query for each (table, queries) pair.

  Builds one Interaction per table (first row taken as the header), annotates
  its numeric values, then converts each question in turn. Relies on the
  module-level `converter` being defined.
  """
  for table_index, (table, queries) in enumerate(tables_and_queries):
    interaction = interaction_pb2.Interaction()
    for query_index, query_text in enumerate(queries):
      new_question = interaction.questions.add()
      new_question.original_text = query_text
      new_question.id = f"{table_index}-0_{query_index}"
    for header_text in table[0]:
      interaction.table.columns.add().text = header_text
    for data_row in table[1:]:
      proto_row = interaction.table.rows.add()
      for cell_text in data_row:
        proto_row.cells.add().text = cell_text
    number_annotation_utils.add_numeric_values(interaction)
    for question_index in range(len(interaction.questions)):
      try:
        yield converter.convert(interaction, question_index)
      except ValueError as e:
        print(f"Can't convert interaction: {interaction.id} error: {e}")
def _create_examples(
    interaction_dir,
    example_dir,
    vocab_file,
    filename,
    batch_size,
    test_mode,
):
  """Creates TF examples for a single dataset.

  Args:
    interaction_dir: Directory holding the input interaction tfrecord.
    example_dir: Directory the converted examples are written to.
    vocab_file: Path to the vocabulary used by the converter.
    filename: Dataset basename (without extension), shared by input/output.
    batch_size: If set, pad the example count to a multiple of it (eval sets
      on TPU drop trailing partial batches); if None, shuffle instead.
    test_mode: If True, stop after ~100 examples for a quick smoke run.
  """
  # BUGFIX: restore the `{filename}` interpolation that had been replaced by
  # the literal text '(unknown)' (here and in the 'Processed:' message below).
  filename = f'{filename}.tfrecord'
  interaction_path = os.path.join(interaction_dir, filename)
  example_path = os.path.join(example_dir, filename)

  config = tf_example_utils.ClassifierConversionConfig(
      vocab_file=vocab_file,
      max_seq_length=FLAGS.max_seq_length,
      max_column_id=_MAX_TABLE_ID,
      max_row_id=_MAX_TABLE_ID,
      strip_column_names=False,
      add_aggregation_candidates=False,
  )
  converter = tf_example_utils.ToClassifierTensorflowExample(config)

  examples = []
  num_questions = 0
  num_conversion_errors = 0
  for interaction in prediction_utils.iterate_interactions(interaction_path):
    number_annotation_utils.add_numeric_values(interaction)
    for i in range(len(interaction.questions)):
      num_questions += 1
      try:
        examples.append(converter.convert(interaction, i))
      except ValueError as e:
        num_conversion_errors += 1
        logging.info("Can't convert interaction: %s error: %s",
                     interaction.id, e)
    if test_mode and len(examples) >= 100:
      break

  _print(f'Processed: {filename}')
  _print(f'Num questions processed: {num_questions}')
  _print(f'Num examples: {len(examples)}')
  _print(f'Num conversion errors: {num_conversion_errors}')

  if batch_size is None:
    random.shuffle(examples)
  else:
    # Make sure the eval sets are divisible by the test batch size since
    # otherwise examples will be dropped on TPU.
    # These examples will later be ignored when writing the predictions.
    original_num_examples = len(examples)
    while len(examples) % batch_size != 0:
      examples.append(converter.get_empty_example())
    if original_num_examples != len(examples):
      _print(f'Padded with {len(examples) - original_num_examples} examples.')

  with tf.io.TFRecordWriter(
      example_path,
      options=_to_tf_compression_type(FLAGS.compression_type),
  ) as writer:
    for example in examples:
      writer.write(example.SerializeToString())
def add_numeric_values_fn(element):
  """Returns a (key, interaction) pair with numeric annotations added.

  The incoming interaction is never mutated; annotations are applied to a
  deep copy, which keeps this safe to use inside a parallel pipeline.
  """
  key, source_interaction = element
  annotated = interaction_pb2.Interaction()
  annotated.CopyFrom(source_interaction)
  number_annotation_utils.add_numeric_values(annotated)
  return key, annotated
def test_convert_with_token_selection(self):
  """Checks that a TableSelection extension restricts the emitted tokens."""
  max_seq_length = 12
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, range(10))
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=tf_example_utils.ClassifierConversionConfig(
            vocab_file=vocab_file,
            max_seq_length=max_seq_length,
            max_column_id=max_seq_length,
            max_row_id=max_seq_length,
            strip_column_names=False,
            add_aggregation_candidates=False,
        ))
    # Two-token cells so token_index can select within a cell.
    interaction = interaction_pb2.Interaction(
        table=interaction_pb2.Table(
            columns=[
                interaction_pb2.Cell(text='A'),
                interaction_pb2.Cell(text='B'),
                interaction_pb2.Cell(text='C'),
            ],
            rows=[
                interaction_pb2.Cells(cells=[
                    interaction_pb2.Cell(text='0 6'),
                    interaction_pb2.Cell(text='4 7'),
                    interaction_pb2.Cell(text='5 6'),
                ]),
                interaction_pb2.Cells(cells=[
                    interaction_pb2.Cell(text='1 7'),
                    interaction_pb2.Cell(text='3 6'),
                    interaction_pb2.Cell(text='5 5'),
                ]),
            ],
        ),
        questions=[
            interaction_pb2.Question(id='id', original_text='2')
        ],
    )
    # Sparse token selection as (row_index, column_index, token_index)
    # triples; presumably row 0 addresses the header row — confirm against
    # the converter's coordinate convention.
    table_coordinates = []
    for r, c, t in [(0, 0, 0), (1, 0, 0), (1, 2, 0), (2, 0, 0), (2, 2, 0),
                    (2, 2, 1)]:
      table_coordinates.append(
          table_selection_pb2.TableSelection.TokenCoordinates(
              row_index=r, column_index=c, token_index=t))
    interaction.questions[0].Extensions[
        table_selection_pb2.TableSelection.table_selection_ext].CopyFrom(
            table_selection_pb2.TableSelection(
                selected_tokens=table_coordinates))
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    # Only the selected tokens are kept; the tail is zero-padded.
    self.assertEqual(_get_int_feature(example, 'input_ids'),
                     [2, 8, 3, 1, 6, 11, 7, 11, 11, 0, 0, 0])
    self.assertEqual(_get_int_feature(example, 'row_ids'),
                     [0, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0])
    self.assertEqual(_get_int_feature(example, 'column_ids'),
                     [0, 0, 0, 1, 1, 3, 1, 3, 3, 0, 0, 0])
    self.assertEqual(_get_int_feature(example, 'column_ranks'),
                     [0, 0, 0, 0, 1, 1, 2, 1, 1, 0, 0, 0])
    self.assertEqual(_get_int_feature(example, 'numeric_relations'),
                     [0, 0, 0, 0, 4, 2, 4, 2, 2, 0, 0, 0])
def test_convert_with_negative_tables(self):
  """Checks retrieval conversion with an appended negative table.

  The feature vectors below have 2 * max_seq_length entries: the positive
  table's sequence followed by the negative table's sequence.
  """
  max_seq_length = 12
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, range(10))
    converter = tf_example_utils.ToRetrievalTensorflowExample(
        config=tf_example_utils.RetrievalConversionConfig(
            vocab_file=vocab_file,
            max_seq_length=max_seq_length,
            max_column_id=max_seq_length,
            max_row_id=max_seq_length,
            strip_column_names=False,
        ))
    # Positive table: 3 columns, 2 rows.
    interaction = interaction_pb2.Interaction(
        table=interaction_pb2.Table(
            columns=[
                interaction_pb2.Cell(text='A'),
                interaction_pb2.Cell(text='B'),
                interaction_pb2.Cell(text='C'),
            ],
            rows=[
                interaction_pb2.Cells(cells=[
                    interaction_pb2.Cell(text='0 6'),
                    interaction_pb2.Cell(text='4 7'),
                    interaction_pb2.Cell(text='5 6'),
                ]),
                interaction_pb2.Cells(cells=[
                    interaction_pb2.Cell(text='1 7'),
                    interaction_pb2.Cell(text='3 6'),
                    interaction_pb2.Cell(text='5 5'),
                ]),
            ],
            table_id='table_0',
        ),
        questions=[
            interaction_pb2.Question(
                id='id',
                original_text='2',
            )
        ],
    )
    number_annotation_utils.add_numeric_values(interaction)
    # Negative table: smaller (2 columns) so its sequence is shorter and the
    # combined features end in padding.
    n_table = interaction_pb2.Table(
        columns=[
            interaction_pb2.Cell(text='A'),
            interaction_pb2.Cell(text='B'),
        ],
        rows=[
            interaction_pb2.Cells(cells=[
                interaction_pb2.Cell(text='0 6'),
                interaction_pb2.Cell(text='4 7'),
            ]),
            interaction_pb2.Cells(cells=[
                interaction_pb2.Cell(text='1 7'),
                interaction_pb2.Cell(text='3 6'),
            ]),
        ],
        table_id='table_1',
    )
    number_annotation_utils.add_numeric_table_values(n_table)
    n_example = _NegativeRetrievalExample()
    n_example.table.CopyFrom(n_table)
    n_example.score = -82.0
    n_example.rank = 1
    example = converter.convert(interaction, 0, n_example)
    logging.info(example)
    self.assertEqual(_get_int_feature(example, 'input_ids'), [
        2, 5, 3, 1, 1, 1, 6, 10, 11, 7, 9, 11, 2, 5, 3, 1, 1, 6, 10, 7, 9,
        0, 0, 0
    ])
    self.assertEqual(_get_int_feature(example, 'row_ids'), [
        0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0,
        0, 0
    ])
    self.assertEqual(_get_int_feature(example, 'column_ids'), [
        0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 0, 0, 0, 1, 2, 1, 2, 1, 2, 0,
        0, 0
    ])
    self.assertEqual(_get_int_feature(example, 'segment_ids'), [
        0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 0
    ])
    self.assertEqual(_get_int_feature(example, 'input_mask'), [
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        0, 0
    ])
    self.assertEqual(_get_int_feature(example, 'inv_column_ranks'), [
        0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 1, 1, 2, 0,
        0, 0
    ])
    self.assertEqual(_get_int_feature(example, 'column_ranks'), [
        0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 1, 0,
        0, 0
    ])
    self.assertEqual(_get_int_feature(example, 'numeric_relations'), [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0
    ])
    # One hash per table.
    self.assertEqual(_get_int_feature(example, 'table_id_hash'),
                     [911224864, 1294380046])
    self.assertEqual(_get_float_feature(example, 'numeric_values'), [
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 0.0, 4.0, 5.0, 1.0, 3.0,
        5.0, 'nan', 'nan', 'nan', 'nan', 'nan', 0.0, 4.0, 1.0, 3.0, 'nan',
        'nan', 'nan'
    ])
    self.assertEqual(
        _get_float_feature(example, 'numeric_values_scale'), [
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
        ])
    self.assertEqual([
        i.decode('utf-8') for i in _get_byte_feature(example, 'table_id')
    ], ['table_0', 'table_1'])