def __init__(self, args): print("Loading BERT configs...") with open("bert_config.json") as f: config_json = json.load(f) config = BertConfig( attention_probs_dropout_prob=config_json[ "attention_probs_dropout_prob"], hidden_act=config_json["hidden_act"], hidden_dropout_prob=config_json["hidden_dropout_prob"], hidden_size=config_json["hidden_size"], initializer_range=config_json["initializer_range"], intermediate_size=config_json["intermediate_size"], max_position_embeddings=config_json["max_position_embeddings"], num_attention_heads=config_json["num_attention_heads"], num_hidden_layers=config_json["num_hidden_layers"], type_vocab_size=config_json["type_vocab_size"], vocab_size=config_json["vocab_size"]) print("Loading PyTorch model...") self.model = BertForQuestionAnswering(config) self.model.eval() self.model.cuda() self.model.load_state_dict( torch.load( "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch")) print("Constructing SUT...") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies) print("Finished constructing SUT.") self.qsl = get_squad_QSL(args.max_examples)
def __init__(self): print("Loading TF model...") self.sess = tf.Session() with gfile.FastGFile('build/data/bert_tf_v1_1_large_fp32_384_v2/model.pb', 'rb') as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) self.sess.graph.as_default() tf.import_graph_def(graph_def, name='') print("Constructing SUT...") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies) print("Finished constructing SUT.") self.qsl = get_squad_QSL()
def __init__(self, quantized): print("Loading ONNX model...") self.quantized = quantized if not quantized: model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx" else: model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/bert_large_v1_1_fake_quant.onnx" self.sess = onnxruntime.InferenceSession(model_path) print("Constructing SUT...") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies) self.qsl = get_squad_QSL() print("Finished constructing SUT.")
def __init__(self, batch_size=8): print("Loading TF model...") bert_config = modeling.BertConfig.from_json_file("bert_config.json") model_fn = self.model_fn_builder( bert_config=bert_config, init_checkpoint="build/data/bert_tf_v1_1_large_fp32_384_v2/model.ckpt-5474") self.estimator = tf.estimator.Estimator(model_fn=model_fn) self.batch_size = batch_size print("Constructing SUT...") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies) self.qsl = get_squad_QSL() print("Finished constructing SUT.")
def __init__(self, args):
    # Optionally enable onnxruntime's built-in profiler for this session.
    self.profile = args.profile
    self.options = onnxruntime.SessionOptions()
    self.options.enable_profiling = args.profile

    print("Loading ONNX model...")
    self.quantized = args.quantized
    if self.quantized:
        model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/bert_large_v1_1_fake_quant.onnx"
    else:
        model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx"
    self.sess = onnxruntime.InferenceSession(model_path, self.options)

    print("Constructing SUT...")
    self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries,
                               self.process_latencies)
    print("Finished constructing SUT.")

    self.qsl = get_squad_QSL(args.max_examples)
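# A minimal sketch of feeding one sample through the session above, assuming
# the exported graph takes input_ids / input_mask / segment_ids tensors of
# shape [batch, 384]. The input names and dtype are assumptions; the real ones
# can be listed with self.sess.get_inputs(). Assumes numpy is imported as np.
import numpy as np

def run_one(self, feats):  # hypothetical helper; feats would come from the QSL
    fd = {
        "input_ids": np.array(feats.input_ids).astype(np.int64)[np.newaxis, :],
        "input_mask": np.array(feats.input_mask).astype(np.int64)[np.newaxis, :],
        "segment_ids": np.array(feats.segment_ids).astype(np.int64)[np.newaxis, :],
    }
    # None fetches every graph output (start/end logits for SQuAD-style QA).
    return self.sess.run(None, fd)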
def __init__(self, args): print("Loading TF model...") infer_config = tf.compat.v1.ConfigProto() infer_config.intra_op_parallelism_threads = int(os.environ['TF_INTRA_OP_PARALLELISM_THREADS']) \ if 'TF_INTRA_OP_PARALLELISM_THREADS' in os.environ else os.cpu_count() infer_config.inter_op_parallelism_threads = int(os.environ['TF_INTER_OP_PARALLELISM_THREADS']) \ if 'TF_INTER_OP_PARALLELISM_THREADS' in os.environ else os.cpu_count() infer_config.use_per_session_threads = 1 self.sess = tf.compat.v1.Session(config=infer_config) with gfile.FastGFile( 'build/data/bert_tf_v1_1_large_fp32_384_v2/model.pb', 'rb') as f: graph_def = tf.compat.v1.GraphDef() graph_def.ParseFromString(f.read()) self.sess.graph.as_default() tf.import_graph_def(graph_def, name='') print("Constructing SUT...") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies) print("Finished constructing SUT.") self.qsl = get_squad_QSL(args.max_examples)