def generate_squad_dataset():
    """Generate the SQuAD training TFRecord dataset and return its meta data.

    Dispatches to the WordPiece or SentencePiece pipeline according to
    FLAGS.tokenization. FLAGS.squad_data_file must be set.

    Returns:
        The input meta data produced by the selected
        generate_tf_record_from_json_file implementation.
    """
    assert FLAGS.squad_data_file
    # Keyword arguments shared by both tokenization pipelines; only the
    # vocab/model file argument differs between them.
    shared_kwargs = dict(
        input_file_path=FLAGS.squad_data_file,
        output_path=FLAGS.train_data_output_path,
        translated_input_folder=FLAGS.translated_squad_data_folder,
        max_seq_length=FLAGS.max_seq_length,
        do_lower_case=FLAGS.do_lower_case,
        max_query_length=FLAGS.max_query_length,
        doc_stride=FLAGS.doc_stride,
        version_2_with_negative=FLAGS.version_2_with_negative,
        xlnet_format=FLAGS.xlnet_format)
    if FLAGS.tokenization == "WordPiece":
        return squad_lib_wp.generate_tf_record_from_json_file(
            vocab_file_path=FLAGS.vocab_file, **shared_kwargs)
    assert FLAGS.tokenization == "SentencePiece"
    return squad_lib_sp.generate_tf_record_from_json_file(
        sp_model_file=FLAGS.sp_model_file, **shared_kwargs)
def get_input_meta_data():
    """Build the SQuAD v1.1 training TFRecord and persist its meta data.

    Converts the local training JSON (using the local vocab file) into a
    TFRecord file, writes the resulting meta data as JSON under
    ./data/squad/, and returns it.

    Returns:
        The input meta data dict produced by
        generate_tf_record_from_json_file.
    """
    # Training file, vocab file, and destination path for the TFRecord output.
    input_meta_data = generate_tf_record_from_json_file(
        "./data/train-v1.1.json",
        "./data/vocab.txt",
        "./data/train-v1.1.tf_record")
    # Save the meta data in JSON form; the trailing newline just makes sure
    # there is no issue with the file's last line.
    with tf.io.gfile.GFile("./data/squad/train_meta_data", "w") as writer:
        writer.write(json.dumps(input_meta_data, indent=4) + "\n")
    return input_meta_data
def generate_squad_dataset():
    """Generate the SQuAD training dataset and return its input meta data.

    Dispatches on FLAGS.tokenizer_impl ("word_piece" or "sentence_piece").
    NOTE(review): this re-defines generate_squad_dataset if the earlier
    FLAGS.tokenization variant is also present in this module; at import
    time the later definition wins — confirm which one is intended.
    """
    assert FLAGS.squad_data_file
    # Select the tokenizer-specific conversion function and its model file;
    # every other positional argument is identical between the two.
    if FLAGS.tokenizer_impl == "word_piece":
        convert = squad_lib_wp.generate_tf_record_from_json_file
        model_file = FLAGS.vocab_file
    else:
        assert FLAGS.tokenizer_impl == "sentence_piece"
        convert = squad_lib_sp.generate_tf_record_from_json_file
        model_file = FLAGS.sp_model_file
    return convert(
        FLAGS.squad_data_file,
        model_file,
        FLAGS.train_data_output_path,
        FLAGS.max_seq_length,
        FLAGS.do_lower_case,
        FLAGS.max_query_length,
        FLAGS.doc_stride,
        FLAGS.version_2_with_negative)
from joblib import dump, load

import app.config as cf
from app.model.BERTSquad import BERTSquad
from app.model.squad_loss_fn import squad_loss_fn

if __name__ == "__main__":
    # Convert the raw SQuAD JSON into a TFRecord file and capture the
    # meta data describing the generated examples.
    input_meta_data = generate_tf_record_from_json_file(
        cf.INPUTS_FILE_TRAIN,
        cf.INPUTS_FILE_VOCAB,
        cf.INPUTS_FILE_DEV)
    # Persist the meta data as JSON for later reuse.
    with tf.io.gfile.GFile(cf.TRAIN_META_DATA, "w") as writer:
        writer.write(json.dumps(input_meta_data, indent=4) + "\n")
    # Build the training input pipeline from the generated records.
    train_dataset = create_squad_dataset(
        cf.INPUTS_FILE_DEV,
        input_meta_data['max_seq_length'],  # 384
        cf.BATCH_SIZE,
        is_training=True)
    # Keep only a fixed number of batches for the (lighter) training run.
    train_dataset_light = train_dataset.take(cf.NB_BATCHES_TRAIN)
import numpy as np import math import random import time import json import collections import os from google.colab import drive drive.mount("/content/drive") input_meta_data = generate_tf_record_from_json_file( "/content/drive/MyDrive/BERT/ChatBot/train-v1.1.json", "/content/drive/MyDrive/BERT/ChatBot/vocab.txt", "/content/drive/MyDrive/BERT/ChatBot/train-v1.1.tf_record" ) with tf.io.gfile.GFile("/content/drive/MyDrive/BERT/ChatBot/train_meta_data","w") as writer: writer.write(json.dumps(input_meta_data, indent=4)+"\n") BATCH_SIZE = 4 train_dataset = create_squad_dataset("/content/drive/MyDrive/BERT/ChatBot/train-v1.1.tf_record", input_meta_data["max_seq_length"], BATCH_SIZE, is_training=True) class BertSquadLayer(tf.keras.layers.Layer): def __init__(self):