Example #1
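A convert() method that appears to come from a TAPAS-style table-parsing codebase. It serializes the question at 'index' of an interaction proto into a tf.train.Example: the question and table are tokenized and trimmed to the sequence length, token-level answer label ids are computed for the current and the previous question, and aggregation candidates are optionally attached. Helpers such as create_int_feature, text_utils, and interpretation_utils come from the surrounding codebase.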
    def convert(self, interaction, index):
        """Converts question at 'index' to example."""
        table = interaction.table

        num_rows = self._get_num_rows(table, self._drop_rows_to_fit)
        num_columns = self._get_num_columns(table)

        question = interaction.questions[index]
        if not question.answer.is_valid:
            raise ValueError('Invalid answer')

        text_tokens = self._tokenize_extended_question(question, table)
        tokenized_table = self._tokenize_table(table)

        serialized_example, features = self._to_trimmed_features(
            question=question,
            table=table,
            question_tokens=text_tokens,
            tokenized_table=tokenized_table,
            num_columns=num_columns,
            num_rows=num_rows,
            drop_rows_to_fit=self._drop_rows_to_fit)

        column_ids = serialized_example.column_ids
        row_ids = serialized_example.row_ids

        def get_answer_ids(question):
            if self._update_answer_coordinates:
                return _find_answer_ids_from_answer_texts(
                    column_ids,
                    row_ids,
                    tokenized_table,
                    answer_texts=[
                        self._tokenizer.tokenize(at)
                        for at in question.answer.answer_texts
                    ],
                )
            return _get_answer_ids(column_ids, row_ids, question)

        answer_ids = get_answer_ids(question)
        self._pad_to_seq_length(answer_ids)
        features['label_ids'] = create_int_feature(answer_ids)

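        # The previous question's answer ids are emitted alongside the current
        # ones; for the first question in the interaction they are all zeros.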
        if index == 0:
            prev_answer_ids = [0] * len(column_ids)
        else:
            prev_answer_ids = get_answer_ids(interaction.questions[index - 1])
        self._pad_to_seq_length(prev_answer_ids)
        features['prev_label_ids'] = create_int_feature(prev_answer_ids)
        features['question_id'] = create_string_feature(
            [question.id.encode('utf8')])
        features['question_id_ints'] = create_int_feature(
            text_utils.str_to_ints(
                question.id, length=text_utils.DEFAULT_INTS_LENGTH))
        features['aggregation_function_id'] = create_int_feature(
            [question.answer.aggregation_function])
        features['classification_class_index'] = create_int_feature(
            [question.answer.class_index])

        answer = question.answer.float_value if question.answer.HasField(
            'float_value') else _NAN
        features['answer'] = create_float_feature([answer])

        self._add_question_numeric_values(question, features)

        if self._add_aggregation_candidates:
            rng = random.Random(fingerprint(question.id))

            candidates = interpretation_utils.find_candidates(
                rng, table, question)
            num_initial_candidates = len(candidates)

            candidates = [c for c in candidates if len(c.rows) < _MAX_NUM_ROWS]
            candidates = candidates[:_MAX_NUM_CANDIDATES]

            funs = [0] * _MAX_NUM_CANDIDATES
            sizes = [0] * _MAX_NUM_CANDIDATES
            indexes = []

            num_final_candidates = 0
            for index, candidate in enumerate(candidates):
                token_indexes = []
                for row in candidate.rows:
                    token_indexes += _get_cell_token_indexes(column_ids, row_ids,
                                                             candidate.column, row)
                if len(indexes) + len(serialized_example.tokens) > _MAX_INDEX_LENGTH:
                    break
                num_final_candidates += 1
                sizes[index] = len(token_indexes)
                funs[index] = candidate.agg_function
                indexes += token_indexes

            # <int>[1]
            features['cand_num'] = create_int_feature([num_final_candidates])
            # <int>[_MAX_NUM_CANDIDATES]
            features['can_aggregation_function_ids'] = create_int_feature(funs)
            # <int>[_MAX_NUM_CANDIDATES]
            features['can_sizes'] = create_int_feature(sizes)
            # <int>[_MAX_INDEX_LENGTH]
            # Actual length is sum(sizes).
            features['can_indexes'] = create_int_feature(indexes)

        return tf.train.Example(features=tf.train.Features(feature=features))
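
A minimal driver loop over an interaction's questions might look like the sketch below. Here converter and interaction are assumed to exist, and convert() rejects questions with invalid answers by raising ValueError, so those are skipped:

    # Sketch only: converter is an instance of the class defining convert()
    # above, and interaction is an Interaction proto with repeated questions.
    examples = []
    for i in range(len(interaction.questions)):
        try:
            examples.append(converter.convert(interaction, i))
        except ValueError:
            continue  # skip questions with invalid answers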
Example #2
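A parameterized unit test for the str_to_ints / ints_to_str pair in text_utils: encoding a string into a fixed-length list of integers and decoding it back must round-trip exactly. A standalone sketch follows the test.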
 def test_str_to_ints(self, text, length):
     ints = text_utils.str_to_ints(text, length)
     self.assertLen(ints, length)
     logging.info("ints: %s %s", ints, text)
     self.assertEqual(text_utils.ints_to_str(ints), text)
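
The same round trip outside a test harness, assuming the text_utils module used throughout these examples is importable (the import path below is an assumption):

    from tapas.utils import text_utils  # assumed import path

    ints = text_utils.str_to_ints('abc-123_0',
                                  length=text_utils.DEFAULT_INTS_LENGTH)
    assert len(ints) == text_utils.DEFAULT_INTS_LENGTH
    assert text_utils.ints_to_str(ints) == 'abc-123_0'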
Example #3
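A retrieval-oriented convert() variant. It encodes the table, optionally prefixed by its document title, and the question as separate feature sets, shrinks the table column by column or row by row until it fits the sequence length, and can join in the features of a negative example table.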
    def convert(
        self,
        interaction,
        index,
        negative_example,
    ):
        """Converts question at 'index' to example."""
        table = interaction.table

        num_rows = len(table.rows)
        if num_rows >= self._max_row_id:
            num_rows = self._max_row_id - 1

        num_columns = len(table.columns)
        if num_columns >= self._max_column_id:
            num_columns = self._max_column_id - 1

        title = table.document_title
        if not self._use_document_title:
            title = ''
        title_tokens = self._tokenizer.tokenize(title)
        tokenized_table = self._tokenize_table(table)

        while True:
            try:
                _, features = self._to_trimmed_features(
                    question=None,
                    table=table,
                    question_tokens=title_tokens,
                    tokenized_table=tokenized_table,
                    num_columns=num_columns,
                    num_rows=num_rows)
                break
            except ValueError:
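                # The table does not fit yet: fall through to the shrinking
                # logic below and retry.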
                pass
            # Since this is retrieval we might get away with removing some cells of
            # the table.
            # TODO(thomasmueller) Consider taking the token length into account.
            if num_columns >= num_rows:
                num_columns -= 1
            else:
                num_rows -= 1
            if num_columns == 0 or num_rows == 0:
                raise ValueError('Cannot fit table into sequence.')

        question = interaction.questions[index]
        features['question_id'] = base.create_string_feature(
            [question.id.encode('utf8')])
        features['question_id_ints'] = base.create_int_feature(
            text_utils.str_to_ints(question.id,
                                   length=text_utils.DEFAULT_INTS_LENGTH))

        q_tokens = self._tokenizer.tokenize(question.text)
        q_tokens = self._serialize_text(q_tokens)[0]
        q_tokens.append(base.Token(_SEP, _SEP))
        q_input_ids = self._to_token_ids(q_tokens)
        self._pad_to_seq_length(q_input_ids)
        q_input_mask = [1] * len(q_tokens)
        self._pad_to_seq_length(q_input_mask)
        features['question_input_ids'] = base.create_int_feature(q_input_ids)
        features['question_input_mask'] = base.create_int_feature(q_input_mask)
        if question:
            features['question_hash'] = base.create_int_feature(
                [base.fingerprint(question.text) % _MAX_INT])

        if negative_example is not None:
            n_table = negative_example.table
            n_title_tokens = self._tokenizer.tokenize(n_table.document_title)
            n_tokenized_table = self._tokenize_table(n_table)
            n_num_rows = self._get_num_rows(n_table, drop_rows_to_fit=True)
            n_num_columns = self._get_num_columns(n_table)
            _, n_example_features = self._to_trimmed_features(
                question=None,
                table=n_table,
                question_tokens=n_title_tokens,
                tokenized_table=n_tokenized_table,
                num_columns=n_num_columns,
                num_rows=n_num_rows,
                drop_rows_to_fit=True)
            _join_features(features, n_example_features)
        return tf.train.Example(features=tf.train.Features(feature=features))
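
Calling the retrieval converter might look like the following sketch; converter and interaction are assumed, and negative_example may be None or any object exposing a table field:

    # Sketch only; names are assumptions.
    pos_example = converter.convert(interaction, index=0, negative_example=None)
    neg_example = converter.convert(interaction, index=0,
                                    negative_example=other_interaction)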
Example #4
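The same round-trip test as Example #2, without the debug logging.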
 def test_str_to_ints(self, text, length):
   ints = text_utils.str_to_ints(text, length)
   self.assertLen(ints, length)
   self.assertEqual(text_utils.ints_to_str(ints), text)
Example #5
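A variant of the converter in Example #1 with three additions: Beam counters for skipped conversions and candidate statistics, support for a TableSelection extension that restricts the tokenized table to pre-selected tokens, and optional trimming of long question ids before they are encoded as integers.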
  def convert(self, interaction, index):
    """Converts question at 'index' to example."""
    table = interaction.table

    num_rows = self._get_num_rows(table, self._drop_rows_to_fit)
    num_columns = self._get_num_columns(table)

    question = interaction.questions[index]
    if not question.answer.is_valid:
      beam_metrics.Metrics.counter(
          _NS, 'Conversion skipped (answer not valid)').inc()
      raise ValueError('Invalid answer')

    text_tokens = self._tokenize_extended_question(question, table)
    tokenized_table = self._tokenize_table(table)
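    # If the question carries a TableSelection extension, restrict the
    # tokenized table to the pre-selected tokens.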
    table_selection_ext = table_selection_pb2.TableSelection.table_selection_ext
    if table_selection_ext in question.Extensions:
      table_selection = question.Extensions[table_selection_ext]
      if not tokenized_table.selected_tokens:
        raise ValueError('No tokens selected')
      if table_selection.selected_tokens:
        selected_tokens = {(t.row_index, t.column_index, t.token_index)
                           for t in table_selection.selected_tokens}
        tokenized_table.selected_tokens = [
            t for t in tokenized_table.selected_tokens
            if (t.row_index, t.column_index, t.token_index) in selected_tokens
        ]

    serialized_example, features = self._to_trimmed_features(
        question=question,
        table=table,
        question_tokens=text_tokens,
        tokenized_table=tokenized_table,
        num_columns=num_columns,
        num_rows=num_rows,
        drop_rows_to_fit=self._drop_rows_to_fit)

    column_ids = serialized_example.column_ids
    row_ids = serialized_example.row_ids

    def get_answer_ids(question):
      if self._update_answer_coordinates:
        return _find_answer_ids_from_answer_texts(
            column_ids,
            row_ids,
            tokenized_table,
            answer_texts=[
                self._tokenizer.tokenize(at)
                for at in question.answer.answer_texts
            ],
        )
      return _get_answer_ids(column_ids, row_ids, question)

    answer_ids = get_answer_ids(question)
    self._pad_to_seq_length(answer_ids)
    features['label_ids'] = create_int_feature(answer_ids)

    if index > 0:
      prev_answer_ids = get_answer_ids(interaction.questions[index - 1])
    else:
      prev_answer_ids = [0] * len(column_ids)
    self._pad_to_seq_length(prev_answer_ids)
    features['prev_label_ids'] = create_int_feature(prev_answer_ids)
    features['question_id'] = create_string_feature(
        [question.id.encode('utf8')])
    if self._trim_question_ids:
      question_id = question.id[-text_utils.DEFAULT_INTS_LENGTH:]
    else:
      question_id = question.id
    features['question_id_ints'] = create_int_feature(
        text_utils.str_to_ints(
            question_id, length=text_utils.DEFAULT_INTS_LENGTH))
    features['aggregation_function_id'] = create_int_feature(
        [question.answer.aggregation_function])
    features['classification_class_index'] = create_int_feature(
        [question.answer.class_index])

    answer = question.answer.float_value if question.answer.HasField(
        'float_value') else _NAN
    features['answer'] = create_float_feature([answer])

    if self._add_aggregation_candidates:
      rng = random.Random(fingerprint(question.id))

      candidates = interpretation_utils.find_candidates(rng, table, question)
      num_initial_candidates = len(candidates)

      candidates = [c for c in candidates if len(c.rows) < _MAX_NUM_ROWS]
      candidates = candidates[:_MAX_NUM_CANDIDATES]

      funs = [0] * _MAX_NUM_CANDIDATES
      sizes = [0] * _MAX_NUM_CANDIDATES
      indexes = []

      num_final_candidates = 0
      for index, candidate in enumerate(candidates):
        token_indexes = []
        for row in candidate.rows:
          token_indexes += _get_cell_token_indexes(column_ids, row_ids,
                                                   candidate.column, row)
        if len(indexes) + len(serialized_example.tokens) > _MAX_INDEX_LENGTH:
          break
        num_final_candidates += 1
        sizes[index] = len(token_indexes)
        funs[index] = candidate.agg_function
        indexes += token_indexes

      # <int>[1]
      features['cand_num'] = create_int_feature([num_final_candidates])
      # <int>[_MAX_NUM_CANDIDATES]
      features['can_aggregation_function_ids'] = create_int_feature(funs)
      # <int>[_MAX_NUM_CANDIDATES]
      features['can_sizes'] = create_int_feature(sizes)
      # <int>[_MAX_INDEX_LENGTH]
      # Actual length is sum(sizes).
      features['can_indexes'] = create_int_feature(indexes)

      if num_initial_candidates > 0:
        beam_metrics.Metrics.counter(
            _NS,
            _get_buckets(num_initial_candidates,
                         [10, 20, 50, 100, 200, 500, 1000, 1200, 1500],
                         'Candidates Size:')).inc()

        beam_metrics.Metrics.counter(_NS, 'Candidates: Input').inc()
        if num_final_candidates != num_initial_candidates:
          beam_metrics.Metrics.counter(_NS,
                                       'Candidates: Dropped candidates').inc()

    return tf.train.Example(features=tf.train.Features(feature=features))