def _key_with_random_key(
    example,
    seed,
    num_random_table_bins,
    num_corpus_bins,
):
    """Re-keys an example with a partition-aware random bucket (for random pairing).

    Args:
      example: A two-element `(key, value)` pair. Only the key is used for
        seeding; the whole pair is passed to `pretrain_utils.partition_fn`.
      seed: Base seed; the fingerprint of the key is added to it.
      num_random_table_bins: Upper bound of the drawn bucket index. The bound
        is inclusive (`random.Random.randint` semantics), so bucket indices
        range over `0..num_random_table_bins`.
      num_corpus_bins: Forwarded unchanged to `pretrain_utils.partition_fn`.

    Returns:
      A tuple `(new_key, example)` where `new_key` has the form
      `'<partition>_<bucket>'` and `example` is the unmodified input.
    """
    original_key, _ = example
    rng = random.Random(seed + tf_example_utils.fingerprint(original_key))
    # Prefix with the train/test partition so that random buckets don't cross
    # over between the splits.
    partition = pretrain_utils.partition_fn(example, 2, num_corpus_bins)
    bucket = rng.randint(0, num_random_table_bins)
    return f'{partition}_{bucket}', example
def process(self, element):
    """Converts one keyed `(interaction, random_table)` pair into an example.

    Increments the "Interactions" counter for every element and the
    "Examples" counter for every element that produced an example.

    Args:
      element: A pair `(key, (interaction, random_table))`.

    Yields:
      `(prepand_fingerprint(key), example)` if the converter returned a truthy
      example; nothing otherwise.
    """
    beam.metrics.Metrics.counter(_NS, "Interactions").inc()
    key, (interaction, random_table) = element

    # The RNG seed depends only on the key and two config values.
    config = self._config
    seed_text = "%s_%d_%d" % (key, config.random_seed, config.max_seq_length)
    rng = random.Random(tf_example_utils.fingerprint(seed_text))

    example = self._converter.convert(rng, interaction, random_table)
    if not example:
        return
    beam.metrics.Metrics.counter(_NS, "Examples").inc()
    yield prepand_fingerprint(key), example
def fingerprint(key):
    """Returns the fingerprint of `key` as an upper-case hex string.

    The absolute value of `tf_example_utils.fingerprint(key)` is formatted
    with zero padding to a minimum width of 8 hex digits.
    """
    magnitude = abs(tf_example_utils.fingerprint(key))
    return "%08X" % magnitude
def to_numpy_seed(obj):
    """Derives an integer seed from `obj`.

    The seed is the fingerprint of `repr(obj)` reduced modulo `_MAX_INT`.
    """
    obj_fingerprint = tf_example_utils.fingerprint(repr(obj))
    return obj_fingerprint % _MAX_INT
def convert(
    self,
    interaction,
    index,
    negative_example,
):
    """Builds a retrieval example for the question at position `index`.

    The table of `interaction` is featurized with its document title (or an
    empty title when `self._use_document_title` is false) in the question
    slot. Whenever `_to_trimmed_features` raises `ValueError`, one column or
    one row is dropped and the call is retried. Question features are then
    added and, if given, the features of the negative example's table are
    joined in.

    Args:
      interaction: Object providing `table` and `questions`.
      index: Position of the question in `interaction.questions`.
      negative_example: Object with a `table` attribute, or `None` to skip the
        negative features.

    Returns:
      A `tf.train.Example` wrapping all collected features.

    Raises:
      ValueError: If the row or column count reaches zero while shrinking.
    """
    table = interaction.table

    # Cap both dimensions strictly below the configured maximum ids.
    num_rows = min(len(table.rows), self._max_row_id - 1)
    num_columns = min(len(table.columns), self._max_column_id - 1)

    title = table.document_title if self._use_document_title else ''
    title_tokens = self._tokenizer.tokenize(title)
    tokenized_table = self._tokenize_table(table)

    while True:
        try:
            _, features = self._to_trimmed_features(
                question=None,
                table=table,
                question_tokens=title_tokens,
                tokenized_table=tokenized_table,
                num_columns=num_columns,
                num_rows=num_rows,
            )
        except ValueError:
            # Fall through to the shrinking step below and retry.
            pass
        else:
            break
        # Since this is retrieval we might get away with removing some cells
        # of the table: shrink the larger dimension (columns on ties).
        # TODO(thomasmueller) Consider taking the token length into account.
        if num_columns < num_rows:
            num_rows -= 1
        else:
            num_columns -= 1
        if 0 in (num_columns, num_rows):
            raise ValueError('Cannot fit table into sequence.')

    question = interaction.questions[index]
    question_id = question.id
    features['question_id'] = base.create_string_feature(
        [question_id.encode('utf8')])
    features['question_id_ints'] = base.create_int_feature(
        text_utils.str_to_ints(
            question_id, length=text_utils.DEFAULT_INTS_LENGTH))

    question_tokens = self._serialize_text(
        self._tokenizer.tokenize(question.text))[0]
    question_tokens.append(base.Token(_SEP, _SEP))

    # `_pad_to_seq_length` is called for its effect on the list argument; its
    # return value is not used here.
    input_ids = self._to_token_ids(question_tokens)
    self._pad_to_seq_length(input_ids)
    input_mask = [1] * len(question_tokens)
    self._pad_to_seq_length(input_mask)
    features['question_input_ids'] = base.create_int_feature(input_ids)
    features['question_input_mask'] = base.create_int_feature(input_mask)

    if question:
        question_hash = base.fingerprint(question.text) % _MAX_INT
        features['question_hash'] = base.create_int_feature([question_hash])

    if negative_example is not None:
        negative_table = negative_example.table
        # NOTE(review): the negative table's title is always tokenized,
        # regardless of `self._use_document_title` - confirm this is intended.
        negative_title_tokens = self._tokenizer.tokenize(
            negative_table.document_title)
        negative_tokenized_table = self._tokenize_table(negative_table)
        negative_num_rows = self._get_num_rows(
            negative_table, drop_rows_to_fit=True)
        negative_num_columns = self._get_num_columns(negative_table)
        _, negative_features = self._to_trimmed_features(
            question=None,
            table=negative_table,
            question_tokens=negative_title_tokens,
            tokenized_table=negative_tokenized_table,
            num_columns=negative_num_columns,
            num_rows=negative_num_rows,
            drop_rows_to_fit=True,
        )
        _join_features(features, negative_features)

    return tf.train.Example(features=tf.train.Features(feature=features))