# Variant using the legacy Inference Engine API (IECore); the API 2.0 variant follows below.
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info('Reading model {}'.format(args.model))
    ie_net = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names
    if len(ie_net.input_info) != 1:
        raise RuntimeError('The demo expects a model with a single input, but {} inputs were provided'.format(
            len(ie_net.input_info)))
    if len(ie_net.outputs) != 1:
        raise RuntimeError('The demo expects a model with a single output, but {} outputs were provided'.format(
            len(ie_net.outputs)))
    input_names = next(iter(ie_net.input_info))
    output_names = next(iter(ie_net.outputs))

    # load model to the device
    ie_net_exec = ie.load_network(network=ie_net, device_name=args.device)
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:
        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:
        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = ie_net.input_info[input_names].input_data.shape[1]

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            # pad the rest of the request
            pad_len = max_length - cur_input_len
            model_input = np.concatenate((input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for IE
            inputs = {
                input_names: model_input,
            }

            # infer by IE
            t_start = time.perf_counter()
            res = ie_net_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info("Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)".format(
                max_length, 1 / (t_end - t_start), t_end - t_start))

            outputs = res[output_names]
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits, eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores, args.top_k)
            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores, args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1], 1, p=probs[0], replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num), eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info("{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)".format(
            t_count, max_length, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
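# The sampling helpers called above (softmax, get_top_k_logits, get_top_p_logits) are not shown
# in this excerpt. Below is a minimal NumPy sketch of the standard top-k / nucleus (top-p)
# filtering technique they implement, written for logits of shape (batch, vocab). These are
# illustrative assumptions, not the demo's actual helper implementations, which may differ.

import numpy as np


def softmax(logits):
    # numerically stable softmax over the last axis
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)


def get_top_k_logits(scores, top_k):
    # keep only the top_k highest logits; push the rest to -inf so they get zero probability
    filter_value = -float('inf')
    top_k = min(top_k, scores.shape[-1])
    kth_threshold = np.sort(scores, axis=-1)[:, -top_k][:, None]
    return np.where(scores < kth_threshold, filter_value, scores)


def get_top_p_logits(scores, top_p):
    # nucleus filtering: keep the smallest set of tokens whose cumulative probability >= top_p
    filter_value = -float('inf')
    sorted_indices = np.argsort(-scores, axis=-1)
    sorted_scores = np.take_along_axis(scores, sorted_indices, axis=-1)
    cumulative_probs = np.cumsum(softmax(sorted_scores), axis=-1)
    # mask tokens outside the nucleus, shifted by one so the token crossing top_p is kept,
    # and always keep the most probable token
    sorted_mask = cumulative_probs > top_p
    sorted_mask[:, 1:] = sorted_mask[:, :-1].copy()
    sorted_mask[:, 0] = False
    mask = np.zeros_like(sorted_mask)
    np.put_along_axis(mask, sorted_indices, sorted_mask, axis=-1)
    return np.where(mask, filter_value, scores)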
# Variant using the OpenVINO Runtime API 2.0 (Core), with optional dynamic input shapes.
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE.from_file(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    # check number of inputs and outputs
    if len(model.inputs) != 1:
        raise RuntimeError('The demo expects a model with a single input, but {} inputs were provided'.format(
            len(model.inputs)))
    if len(model.outputs) != 1:
        raise RuntimeError('The demo expects a model with a single output, but {} outputs were provided'.format(
            len(model.outputs)))
    input_tensor = model.inputs[0].any_name

    if not args.dynamic_shape and (
            model.inputs[0].partial_shape.is_dynamic or model.inputs[0].shape[1] != args.max_seq_len):
        model.reshape({input_tensor: PartialShape([Dimension(1), Dimension(args.max_seq_len)])})

    if args.dynamic_shape:
        model.reshape({input_tensor: PartialShape([Dimension(1), Dimension(0, args.max_seq_len)])})

    # load model to the device
    compiled_model = core.compile_model(model, args.device)
    output_tensor = compiled_model.outputs[0]
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:
        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:
        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = args.max_seq_len

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            model_input = input_ids
            if not args.dynamic_shape:
                # pad the rest of the request
                pad_len = max_length - cur_input_len
                model_input = np.concatenate((input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for OpenVINO runtime
            inputs = {
                input_tensor: model_input,
            }

            # infer by OpenVINO runtime
            t_start = time.perf_counter()
            outputs = infer_request.infer(inputs)[output_tensor]
            t_end = time.perf_counter()
            t_count += 1
            log.info("Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)".format(
                model_input.shape[1], 1 / (t_end - t_start), t_end - t_start))

            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits, eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores, args.top_k)
            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores, args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1], 1, p=probs[0], replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num), eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info("{} requests were processed in {:0.2f}sec ({:0.2}sec per request)".format(
            t_count, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
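# Sketches of the remaining helpers used by both variants, process_logits and stop_criteria.
# These are plausible reference implementations under the assumption that generation stops when
# the EOS token is produced or the sequence reaches the allowed length; the shipped demo's
# helpers may be implemented differently.


def process_logits(input_ids, scores, eos_token_id, min_length=0):
    # forbid the EOS token while the generated sequence is shorter than min_length
    if input_ids.shape[-1] < min_length:
        scores[:, eos_token_id] = -float('inf')
    return scores


def stop_criteria(input_ids, max_length, eos_token_id):
    # stop when EOS was just generated or the sequence hit the length limit
    if input_ids[0][-1] == eos_token_id:
        return True
    return input_ids.shape[-1] >= max_length


# Assumed entry point for the script (not shown in the excerpt above):
# if __name__ == '__main__':
#     sys.exit(main() or 0)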