def convert_examples_to_features(self, examples, tfrecord_file, label_names):
  """Converts `examples` to features and writes them into a TFRecord file.

  Lazily builds this processor (tokenizer/vocab state) on first use, then
  delegates serialization to `classifier_data_lib`.

  Args:
    examples: Input examples to be converted to features.
    tfrecord_file: Path of the TFRecord file the features are written to.
    label_names: Label names used to map example labels to ids.
  """
  # Build on demand so callers need not call `build()` explicitly.
  if not self.is_built:
    self.build()
  write_features = classifier_data_lib.file_based_convert_examples_to_features
  write_features(examples, label_names, self.seq_len, self.tokenizer,
                 tfrecord_file)
def generate_sentence_retrevial_tf_record(processor,
                                          data_dir,
                                          tokenizer,
                                          eval_data_output_path=None,
                                          test_data_output_path=None,
                                          max_seq_length=128):
  """Generates the tf records for retrieval tasks.

  Args:
    processor: Input processor object to be used for generating data. Subclass
      of `DataProcessor`.
    data_dir: Directory that contains train/eval data to process. Data files
      should be in from.
    tokenizer: The tokenizer to be applied on the data.
    eval_data_output_path: Output to which processed tf record for evaluation
      will be saved.
    test_data_output_path: Output to which processed tf record for testing
      will be saved. Must be a pattern template with {} if processor has
      language specific test data.
    max_seq_length: Maximum sequence length of the to be generated
      training/eval data.

  Returns:
    A dictionary containing input meta data.

  Raises:
    ValueError: If neither output path is provided, or if the processor is not
      one of the supported retrieval tasks ("BUCC", "TATOEBA").
  """
  # Explicit validation instead of `assert`: asserts are stripped under -O.
  if not (eval_data_output_path or test_data_output_path):
    raise ValueError(
        "At least one of `eval_data_output_path` and `test_data_output_path` "
        "must be provided.")

  processor_name = processor.get_processor_name()
  if processor_name == "BUCC":
    path_pattern = "{}-en.{{}}.{}"
  elif processor_name == "TATOEBA":
    path_pattern = "{}-en.{}"
  else:
    # Previously an unknown processor left `path_pattern` unbound and crashed
    # later with an UnboundLocalError inside the loop; fail fast instead.
    raise ValueError(f"Unsupported processor: {processor_name}")

  meta_data = {
      "processor_type": processor_name,
      "max_seq_length": max_seq_length,
      "number_eval_data": {},
      "number_test_data": {},
  }
  logging.info("Start to process %s task data", processor_name)

  for lang_a in processor.languages:
    # Each language is paired both with itself and with English.
    for lang_b in [lang_a, "en"]:
      # NOTE(review): single-argument os.path.join() around the formatted
      # pattern was a no-op and has been dropped; behavior is unchanged.
      data_file = path_pattern.format(lang_a, lang_b)
      if eval_data_output_path:
        eval_input_data_examples = processor.get_dev_examples(
            data_dir, data_file)
        num_eval_data = len(eval_input_data_examples)
        logging.info("Processing %d dev examples of %s-en.%s", num_eval_data,
                     lang_a, lang_b)
        output_file = os.path.join(
            eval_data_output_path,
            "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "dev"))
        classifier_data_lib.file_based_convert_examples_to_features(
            eval_input_data_examples, None, max_seq_length, tokenizer,
            output_file, None)
        meta_data["number_eval_data"][f"{lang_a}-en.{lang_b}"] = num_eval_data
      if test_data_output_path:
        test_input_data_examples = processor.get_test_examples(
            data_dir, data_file)
        num_test_data = len(test_input_data_examples)
        logging.info("Processing %d test examples of %s-en.%s", num_test_data,
                     lang_a, lang_b)
        output_file = os.path.join(
            test_data_output_path,
            "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "test"))
        classifier_data_lib.file_based_convert_examples_to_features(
            test_input_data_examples, None, max_seq_length, tokenizer,
            output_file, None)
        meta_data["number_test_data"][f"{lang_a}-en.{lang_b}"] = num_test_data
  return meta_data