def convert(self, interaction, index):
  """Converts question at 'index' to example."""
  table = interaction.table

  num_rows = self._get_num_rows(table, self._drop_rows_to_fit)
  num_columns = self._get_num_columns(table)

  question = interaction.questions[index]
  if not interaction.questions[index].answer.is_valid:
    raise ValueError('Invalid answer')

  text_tokens = self._tokenize_extended_question(question, table)
  tokenized_table = self._tokenize_table(table)

  serialized_example, features = self._to_trimmed_features(
      question=question,
      table=table,
      question_tokens=text_tokens,
      tokenized_table=tokenized_table,
      num_columns=num_columns,
      num_rows=num_rows,
      drop_rows_to_fit=self._drop_rows_to_fit)

  column_ids = serialized_example.column_ids
  row_ids = serialized_example.row_ids

  def get_answer_ids(question):
    if self._update_answer_coordinates:
      return _find_answer_ids_from_answer_texts(
          column_ids,
          row_ids,
          tokenized_table,
          answer_texts=[
              self._tokenizer.tokenize(at)
              for at in question.answer.answer_texts
          ],
      )
    return _get_answer_ids(column_ids, row_ids, question)

  answer_ids = get_answer_ids(question)
  self._pad_to_seq_length(answer_ids)
  features['label_ids'] = create_int_feature(answer_ids)

  if index == 0:
    prev_answer_ids = [0] * len(column_ids)
  else:
    prev_answer_ids = get_answer_ids(interaction.questions[index - 1])
  self._pad_to_seq_length(prev_answer_ids)
  features['prev_label_ids'] = create_int_feature(prev_answer_ids)

  features['question_id'] = create_string_feature(
      [question.id.encode('utf8')])
  features['question_id_ints'] = create_int_feature(
      text_utils.str_to_ints(
          question.id, length=text_utils.DEFAULT_INTS_LENGTH))
  features['aggregation_function_id'] = create_int_feature(
      [question.answer.aggregation_function])
  features['classification_class_index'] = create_int_feature(
      [question.answer.class_index])

  answer = question.answer.float_value if question.answer.HasField(
      'float_value') else _NAN
  features['answer'] = create_float_feature([answer])

  self._add_question_numeric_values(question, features)

  if self._add_aggregation_candidates:
    rng = random.Random(fingerprint(question.id))
    candidates = interpretation_utils.find_candidates(rng, table, question)
    num_initial_candidates = len(candidates)
    candidates = [c for c in candidates if len(c.rows) < _MAX_NUM_ROWS]
    candidates = candidates[:_MAX_NUM_CANDIDATES]

    funs = [0] * _MAX_NUM_CANDIDATES
    sizes = [0] * _MAX_NUM_CANDIDATES
    indexes = []

    num_final_candidates = 0
    # 'cand_index' avoids shadowing the 'index' argument above.
    for cand_index, candidate in enumerate(candidates):
      token_indexes = []
      for row in candidate.rows:
        token_indexes += _get_cell_token_indexes(column_ids, row_ids,
                                                 candidate.column, row)
      if len(indexes) + len(serialized_example.tokens) > _MAX_INDEX_LENGTH:
        break
      num_final_candidates += 1
      sizes[cand_index] = len(token_indexes)
      funs[cand_index] = candidate.agg_function
      indexes += token_indexes

    # <int>[1]
    features['cand_num'] = create_int_feature([num_final_candidates])
    # <int>[_MAX_NUM_CANDIDATES]
    features['can_aggregation_function_ids'] = create_int_feature(funs)
    # <int>[_MAX_NUM_CANDIDATES]
    features['can_sizes'] = create_int_feature(sizes)
    # <int>[_MAX_INDEX_LENGTH]
    # Actual length is sum(sizes).
    features['can_indexes'] = create_int_feature(indexes)

  return tf.train.Example(features=tf.train.Features(feature=features))
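
# --- Usage sketch (illustrative, not part of the library) ---
# A minimal sketch of how a converter exposing the convert() method above
# might be driven over all questions of one interaction. The `converter`
# instance and the helper name are assumptions; questions with an invalid
# answer are skipped via the ValueError raised above.
def convert_interaction(converter, interaction):
  """Returns one tf.train.Example per convertible question."""
  examples = []
  for index in range(len(interaction.questions)):
    try:
      examples.append(converter.convert(interaction, index))
    except ValueError:
      # Raised above when the question's answer is not valid.
      continue
  return examples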
def test_str_to_ints(self, text, length):
  ints = text_utils.str_to_ints(text, length)
  self.assertLen(ints, length)
  logging.info("ints: %s %s", ints, text)
  self.assertEqual(text_utils.ints_to_str(ints), text)
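
# --- Illustration (assumed behavior, not the library source) ---
# The round trip asserted above holds if str_to_ints shifts each code point
# by one and pads with zeros, and ints_to_str drops the padding and shifts
# back. A minimal sketch of such a pair:
def str_to_ints_sketch(text, length):
  ints = [ord(char) + 1 for char in text]  # shift so 0 is free for padding
  if len(ints) > length:
    raise ValueError(f'{text!r} does not fit in length {length}')
  return ints + [0] * (length - len(ints))

def ints_to_str_sketch(ints):
  return ''.join(chr(i - 1) for i in ints if i > 0)  # skip zero padding

assert ints_to_str_sketch(str_to_ints_sketch('abc', 8)) == 'abc'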
def convert(
    self,
    interaction,
    index,
    negative_example,
):
  """Converts question at 'index' to example."""
  table = interaction.table

  num_rows = len(table.rows)
  if num_rows >= self._max_row_id:
    num_rows = self._max_row_id - 1

  num_columns = len(table.columns)
  if num_columns >= self._max_column_id:
    num_columns = self._max_column_id - 1

  title = table.document_title
  if not self._use_document_title:
    title = ''
  title_tokens = self._tokenizer.tokenize(title)
  tokenized_table = self._tokenize_table(table)

  while True:
    try:
      _, features = self._to_trimmed_features(
          question=None,
          table=table,
          question_tokens=title_tokens,
          tokenized_table=tokenized_table,
          num_columns=num_columns,
          num_rows=num_rows)
      break
    except ValueError:
      pass
    # Since this is retrieval we might get away with removing some cells of
    # the table.
    # TODO(thomasmueller) Consider taking the token length into account.
    if num_columns >= num_rows:
      num_columns -= 1
    else:
      num_rows -= 1
    if num_columns == 0 or num_rows == 0:
      raise ValueError('Cannot fit table into sequence.')

  question = interaction.questions[index]
  features['question_id'] = base.create_string_feature(
      [question.id.encode('utf8')])
  features['question_id_ints'] = base.create_int_feature(
      text_utils.str_to_ints(
          question.id, length=text_utils.DEFAULT_INTS_LENGTH))

  q_tokens = self._tokenizer.tokenize(question.text)
  q_tokens = self._serialize_text(q_tokens)[0]
  q_tokens.append(base.Token(_SEP, _SEP))
  q_input_ids = self._to_token_ids(q_tokens)
  self._pad_to_seq_length(q_input_ids)
  q_input_mask = [1] * len(q_tokens)
  self._pad_to_seq_length(q_input_mask)
  features['question_input_ids'] = base.create_int_feature(q_input_ids)
  features['question_input_mask'] = base.create_int_feature(q_input_mask)
  if question:
    features['question_hash'] = base.create_int_feature(
        [base.fingerprint(question.text) % _MAX_INT])

  if negative_example is not None:
    n_table = negative_example.table
    n_title_tokens = self._tokenizer.tokenize(n_table.document_title)
    n_tokenized_table = self._tokenize_table(n_table)
    n_num_rows = self._get_num_rows(n_table, drop_rows_to_fit=True)
    n_num_columns = self._get_num_columns(n_table)
    _, n_example_features = self._to_trimmed_features(
        question=None,
        table=n_table,
        question_tokens=n_title_tokens,
        tokenized_table=n_tokenized_table,
        num_columns=n_num_columns,
        num_rows=n_num_rows,
        drop_rows_to_fit=True)
    _join_features(features, n_example_features)

  return tf.train.Example(features=tf.train.Features(feature=features))
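
# --- Usage sketch (illustrative; names and sampling scheme are assumptions) ---
# The retrieval convert() above accepts an optional negative example whose
# table features get joined into the positive example's features. A minimal
# driver pairing each interaction with one randomly chosen other interaction
# as the negative might look like this:
import random

def convert_with_negatives(converter, interactions, rng=None):
  """Yields examples, each paired with a random negative interaction."""
  rng = rng or random.Random(42)
  for interaction in interactions:
    others = [i for i in interactions if i.id != interaction.id]
    negative = rng.choice(others) if others else None
    for index in range(len(interaction.questions)):
      yield converter.convert(interaction, index, negative)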
def convert(self, interaction, index):
  """Converts question at 'index' to example."""
  table = interaction.table

  num_rows = self._get_num_rows(table, self._drop_rows_to_fit)
  num_columns = self._get_num_columns(table)

  question = interaction.questions[index]
  if not interaction.questions[index].answer.is_valid:
    beam_metrics.Metrics.counter(
        _NS, 'Conversion skipped (answer not valid)').inc()
    raise ValueError('Invalid answer')

  text_tokens = self._tokenize_extended_question(question, table)
  tokenized_table = self._tokenize_table(table)

  table_selection_ext = table_selection_pb2.TableSelection.table_selection_ext
  if table_selection_ext in question.Extensions:
    table_selection = question.Extensions[table_selection_ext]
    if not tokenized_table.selected_tokens:
      raise ValueError('No tokens selected')
    if table_selection.selected_tokens:
      selected_tokens = {(t.row_index, t.column_index, t.token_index)
                         for t in table_selection.selected_tokens}
      tokenized_table.selected_tokens = [
          t for t in tokenized_table.selected_tokens
          if (t.row_index, t.column_index, t.token_index) in selected_tokens
      ]

  serialized_example, features = self._to_trimmed_features(
      question=question,
      table=table,
      question_tokens=text_tokens,
      tokenized_table=tokenized_table,
      num_columns=num_columns,
      num_rows=num_rows,
      drop_rows_to_fit=self._drop_rows_to_fit)

  column_ids = serialized_example.column_ids
  row_ids = serialized_example.row_ids

  def get_answer_ids(question):
    if self._update_answer_coordinates:
      return _find_answer_ids_from_answer_texts(
          column_ids,
          row_ids,
          tokenized_table,
          answer_texts=[
              self._tokenizer.tokenize(at)
              for at in question.answer.answer_texts
          ],
      )
    return _get_answer_ids(column_ids, row_ids, question)

  answer_ids = get_answer_ids(question)
  self._pad_to_seq_length(answer_ids)
  features['label_ids'] = create_int_feature(answer_ids)

  if index > 0:
    prev_answer_ids = get_answer_ids(interaction.questions[index - 1])
  else:
    prev_answer_ids = [0] * len(column_ids)
  self._pad_to_seq_length(prev_answer_ids)
  features['prev_label_ids'] = create_int_feature(prev_answer_ids)

  features['question_id'] = create_string_feature(
      [question.id.encode('utf8')])
  if self._trim_question_ids:
    question_id = question.id[-text_utils.DEFAULT_INTS_LENGTH:]
  else:
    question_id = question.id
  features['question_id_ints'] = create_int_feature(
      text_utils.str_to_ints(
          question_id, length=text_utils.DEFAULT_INTS_LENGTH))
  features['aggregation_function_id'] = create_int_feature(
      [question.answer.aggregation_function])
  features['classification_class_index'] = create_int_feature(
      [question.answer.class_index])

  answer = question.answer.float_value if question.answer.HasField(
      'float_value') else _NAN
  features['answer'] = create_float_feature([answer])

  if self._add_aggregation_candidates:
    rng = random.Random(fingerprint(question.id))
    candidates = interpretation_utils.find_candidates(rng, table, question)
    num_initial_candidates = len(candidates)
    candidates = [c for c in candidates if len(c.rows) < _MAX_NUM_ROWS]
    candidates = candidates[:_MAX_NUM_CANDIDATES]

    funs = [0] * _MAX_NUM_CANDIDATES
    sizes = [0] * _MAX_NUM_CANDIDATES
    indexes = []

    num_final_candidates = 0
    # 'cand_index' avoids shadowing the 'index' argument above.
    for cand_index, candidate in enumerate(candidates):
      token_indexes = []
      for row in candidate.rows:
        token_indexes += _get_cell_token_indexes(column_ids, row_ids,
                                                 candidate.column, row)
      if len(indexes) + len(serialized_example.tokens) > _MAX_INDEX_LENGTH:
        break
      num_final_candidates += 1
      sizes[cand_index] = len(token_indexes)
      funs[cand_index] = candidate.agg_function
      indexes += token_indexes

    # <int>[1]
    features['cand_num'] = create_int_feature([num_final_candidates])
    # <int>[_MAX_NUM_CANDIDATES]
    features['can_aggregation_function_ids'] = create_int_feature(funs)
    # <int>[_MAX_NUM_CANDIDATES]
    features['can_sizes'] = create_int_feature(sizes)
    # <int>[_MAX_INDEX_LENGTH]
    # Actual length is sum(sizes).
    features['can_indexes'] = create_int_feature(indexes)

    if num_initial_candidates > 0:
      beam_metrics.Metrics.counter(
          _NS,
          _get_buckets(num_initial_candidates,
                       [10, 20, 50, 100, 200, 500, 1000, 1200, 1500],
                       'Candidates Size:')).inc()

      beam_metrics.Metrics.counter(_NS, 'Candidates: Input').inc()
      if num_final_candidates != num_initial_candidates:
        beam_metrics.Metrics.counter(
            _NS, 'Candidates: Dropped candidates').inc()

  return tf.train.Example(features=tf.train.Features(feature=features))
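
# --- Illustration (assumed consumer, mirrors the shape comments above) ---
# The flat candidate features written above can be unpacked back into
# per-candidate token index lists: the first can_sizes[i] unconsumed entries
# of can_indexes belong to candidate i, for the first cand_num candidates.
def unpack_candidates(cand_num, can_sizes, can_indexes):
  """Splits the flat can_indexes list into one index list per candidate."""
  per_candidate = []
  offset = 0
  for size in can_sizes[:cand_num]:
    per_candidate.append(can_indexes[offset:offset + size])
    offset += size
  return per_candidate

# Example: 2 candidates covering 3 and 2 table tokens respectively.
assert unpack_candidates(2, [3, 2, 0], [5, 6, 7, 11, 12]) == [
    [5, 6, 7], [11, 12]]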