def get_dialog_examples(self, dataset):
    """Build the `InputExample`s for every dialogue of one data split.

    Args:
      dataset: str. Name of the split; can be "train", "dev", or "test". It
        selects both the sub-directory under `self.dstc8_data_dir` and the
        entry of `self._file_ranges` listing the dialogue file indices.

    Returns:
      examples: a list of `InputExample`s, in the order the dialogues were
        loaded.
    """
    file_ids = self._file_ranges[dataset]
    split_dir = os.path.join(self.dstc8_data_dir, dataset)

    # Dialogue files are named dialogues_001.json, dialogues_002.json, ...
    dialog_paths = []
    for file_id in file_ids:
        file_name = "dialogues_{:03d}.json".format(file_id)
        dialog_paths.append(os.path.join(split_dir, file_name))
    dialogs = load_dialogues(dialog_paths)

    # The schema file sits next to the dialogue files of the split.
    schemas = schema.Schema(os.path.join(split_dir, "schema.json"))

    examples = []
    for num_done, dialog in enumerate(dialogs):
        # Progress message, emitted once every 1000 dialogues.
        tf.compat.v1.logging.log_every_n(
            tf.compat.v1.logging.INFO, "Processed %d dialogs.", 1000, num_done)
        examples += self._create_examples_from_dialog(dialog, schemas, dataset)
    return examples
def _create_schema_embeddings(bert_config, schema_embedding_file):
    """Compute embeddings for the split's schema and write them to a file.

    Args:
      bert_config: BERT configuration handed to the embedding `model_fn`
        builder.
      schema_embedding_file: output location handed to
        `SchemaEmbeddingGenerator.save_embeddings`.
    """
    # Make sure the directory for schema embeddings is present.
    embedding_dir = FLAGS.schema_embedding_dir
    if not tf.io.gfile.exists(embedding_dir):
        tf.io.gfile.makedirs(embedding_dir)

    # Run configuration for the estimator that produces the embeddings.
    tpu_config = tf.contrib.tpu.TPUConfig(
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=(
            tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master, tpu_config=tpu_config)

    # Schema of the split selected through the flags.
    schemas = schema.Schema(
        os.path.join(FLAGS.dstc8_data_dir, FLAGS.dataset_split, "schema.json"))

    # BERT model used to embed the natural language descriptions.
    model_fn = extract_schema_embedding.model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=os.path.join(FLAGS.bert_ckpt_dir, "bert_model.ckpt"),
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.predict_batch_size)

    # Tokenizer built from the vocabulary shipped with the BERT checkpoint.
    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(FLAGS.bert_ckpt_dir, "vocab.txt"),
        do_lower_case=FLAGS.do_lower_case)

    generator = extract_schema_embedding.SchemaEmbeddingGenerator(
        tokenizer, estimator, FLAGS.max_seq_length)
    generator.save_embeddings(schemas, schema_embedding_file)
def main(_):
    """Convert `data.json` into per-split dialogue files plus a schema copy.

    Reads the schema from `_DIR_PATH/FLAGS.schema_file_name` and the dialogues
    from `FLAGS.input_data_dir/data.json`. For every
    `(output_dir_name, file_name)` pair in `_PATH_MAPPING` it:

      * creates the output sub-directory (under `FLAGS.output_dir`, or
        `_DIR_PATH` when that flag is empty) and saves `schema.json` there;
      * selects the dialogue ids of the split -- read from the id list file
        when `file_name` is set, otherwise every id in the data that has not
        been claimed by a previously processed id list;
      * converts those dialogues with `Processor.convert_to_dstc` and writes
        them in chunks of `_NUM_DIALS_PER_FILE` to `dialogues_001.json`,
        `dialogues_002.json`, ...

    Args:
      _: unused positional argument.
    """
    schema_path = os.path.join(_DIR_PATH, FLAGS.schema_file_name)
    schemas = schema.Schema(schema_path)
    processor = Processor(schemas)

    data_path = os.path.join(FLAGS.input_data_dir, 'data.json')
    with tf.io.gfile.GFile(data_path, 'r') as f:
        data = json.load(f)

    dev_test_ids = []
    output_dir = FLAGS.output_dir or _DIR_PATH

    # Generate dev and test set according to the ids listed in the files. Ids
    # not included in the dev and test id list files belong to the training
    # set.
    # NOTE(review): the training ids are "everything not seen so far", so the
    # entry without a file name presumably has to come after the dev/test
    # entries in _PATH_MAPPING -- confirm against its definition.
    for output_dir_name, file_name in _PATH_MAPPING:
        output_sub_dir = os.path.join(output_dir, output_dir_name)
        if not tf.io.gfile.exists(output_sub_dir):
            tf.io.gfile.makedirs(output_sub_dir)
        schemas.save_to_file(os.path.join(output_sub_dir, 'schema.json'))

        if file_name:
            id_list_path = os.path.join(FLAGS.input_data_dir, file_name)
            with tf.io.gfile.GFile(id_list_path) as f:
                dial_ids = [id_name.strip() for id_name in f.readlines()]
            dev_test_ids.extend(dial_ids)
        else:
            # Generate the ids for the training set. Sorted so that the
            # contents of the written files do not depend on set iteration
            # order, which varies between runs for string keys.
            dial_ids = sorted(set(data) - set(dev_test_ids))

        converted_dials = processor.convert_to_dstc(dial_ids, data)
        logging.info('Unfound slot span ratio %s',
                     processor.unfound_slot_span_ratio)
        logging.info('Writing %d dialogs to %s', len(converted_dials),
                     output_sub_dir)

        # One json file per chunk of _NUM_DIALS_PER_FILE dialogues; file
        # indices start at 1.
        chunk_starts = range(0, len(converted_dials), _NUM_DIALS_PER_FILE)
        for file_index, start in enumerate(chunk_starts, start=1):
            json_file_path = os.path.join(
                output_sub_dir, 'dialogues_{:03d}.json'.format(file_index))
            dialogs_list = converted_dials[start:start + _NUM_DIALS_PER_FILE]
            with tf.io.gfile.GFile(json_file_path, 'w') as f:
                json.dump(
                    dialogs_list,
                    f,
                    indent=2,
                    separators=(',', ': '),
                    sort_keys=True)
            logging.info('Created %s with %d dialogues.', json_file_path,
                         len(dialogs_list))
def write_predictions_to_file(predictions, input_json_files, schema_json_file,
                              output_dir):
    """Write the predicted dialogues as json files.

    Every input file yields one output file with the same base name inside
    `output_dir`; the directory is created when it does not exist yet.

    Args:
      predictions: An iterator containing model predictions. This is the
        output of the predict method in the estimator. Each prediction is
        indexed with "is_real_example" and "example_id".
      input_json_files: A list of json paths containing the dialogues to run
        inference on.
      schema_json_file: Path for the json file containing the schemas.
      output_dir: The directory where output json files will be created.
    """
    tf.compat.v1.logging.info("Writing predictions to %s.", output_dir)
    schemas = schema.Schema(schema_json_file)

    # Index all predictions by (dialog_id, turn_id, service_name).
    all_predictions = {}
    for idx, prediction in enumerate(predictions):
        # Entries not flagged as real examples are ignored.
        if not prediction["is_real_example"]:
            continue
        tf.compat.v1.logging.log_every_n(
            tf.compat.v1.logging.INFO, "Processed %d examples.", 500, idx)
        # The example id is a bytes value made of exactly four "-"-separated
        # fields; the first one is not needed here.
        _, dialog_id, turn_id, service_name = (
            prediction["example_id"].decode("utf-8").split("-"))
        all_predictions[(dialog_id, turn_id, service_name)] = prediction

    # Writing below would fail on a missing directory, so create it first.
    if not tf.io.gfile.exists(output_dir):
        tf.io.gfile.makedirs(output_dir)

    # Read each input file and write its predictions.
    for input_file_path in input_json_files:
        with tf.io.gfile.GFile(input_file_path) as f:
            dialogs = json.load(f)
        pred_dialogs = [
            get_predicted_dialog(d, all_predictions, schemas) for d in dialogs
        ]
        output_file_path = os.path.join(
            output_dir, os.path.basename(input_file_path))
        with tf.io.gfile.GFile(output_file_path, "w") as f:
            json.dump(
                pred_dialogs,
                f,
                indent=2,
                separators=(",", ": "),
                sort_keys=True)