def create_data(file, file_tf):
    # Load the raw data and save it as a TFRecord file.

    # Load data
    DATA_train = pd.read_csv(file, sep='\t')
    print('Data size: ', len(DATA_train))

    if os.path.exists(file_tf):
        print('FILE ALREADY EXISTS {}'.format(file_tf))
        return

    # Drop rows with missing values
    DATA_train.dropna(axis=0, inplace=True)

    # Extract sentences and labels
    X = DATA_train['document'].values
    Y = DATA_train['label'].values

    # Sentence preprocessing and tokenization
    from src.data.preprocessor import PreProcessor
    prep = PreProcessor()

    ## Preprocessing 1: cleaning
    X = list(map(lambda x: prep.clean(x)[0], X))

    ## Preprocessing 2: tokenization into InputFeatures objects
    X = list(
        map(lambda x: prep.create_InputFeature(x),
            tqdm(X, desc='create_InputFeature')))

    # Write the TFRecord dataset
    with tf.python_io.TFRecordWriter(file_tf) as writer:

        def _int64List_feature(value):
            """Returns an int64_list feature from a list of bools / enums / ints / uints."""
            return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

        def _int64_feature(value):
            """Returns an int64_list feature from a single bool / enum / int / uint."""
            return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

        for i in tqdm(range(len(X)), desc='Writing to {}'.format(file_tf)):
            feature = {
                'input_ids': _int64List_feature(X[i].input_ids),
                'segment_ids': _int64List_feature(X[i].segment_ids),
                'input_masks': _int64List_feature(X[i].input_masks),
                'label': _int64_feature(Y[i])
            }
            features = tf.train.Features(feature=feature)
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())
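# Usage sketch (not part of the original source): create_data expects an
# NSMC-style tab-separated file with 'document' and 'label' columns. The file
# paths below are hypothetical placeholders.
if __name__ == '__main__':
    create_data('data/ratings_train.txt', 'data/ratings_train.tfrecord')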
class Model(object):
    def __init__(self, mode):
        '''
        :param mode: 0: search, 1: similarity
        '''
        self.mode = mode
        self.CONFIG = config.BERT
        self.preprocessor = PreProcessor()

        # placeholders
        self.input_ids = None
        self.input_masks = None
        self.segment_ids = None

        # pred indexes
        self.start_logits = None
        self.end_logits = None
        self.start_pred = None
        self.end_pred = None

        # tf.Session()
        self.sess = None

        # feature vectors
        self.all_encoder_layers = None
        self.pooled_output = None
        self.feature_vector = None
        self.similarity_output = None

        self.build_model()

    def build_model(self):
        if self.mode == 0:
            bert_json = self.CONFIG['bert_json']
            max_seq_length = self.CONFIG['max_seq_length-search']
        elif self.mode == 1:
            bert_json = self.CONFIG['bert_json']
            model_path = self.CONFIG['model_path-similarity']
            max_seq_length = self.CONFIG['max_seq_length-similarity']

        bert_config = BertConfig()
        bert_config.read_from_json_file(bert_json)

        self.input_ids = tf.placeholder(dtype=tf.int32,
                                        shape=[None, max_seq_length])
        self.input_masks = tf.placeholder(dtype=tf.int32,
                                          shape=[None, max_seq_length])
        self.segment_ids = tf.placeholder(dtype=tf.int32,
                                          shape=[None, max_seq_length])

        embedding_output = None  # sum of token, segment and position embeddings
        embedding_table = None  # token id embedding table
        self.all_encoder_layers = None  # transformer model
        self.similarity_output = None  # output layer
        self.elmo_output = None  # layer used to extract ELMo-style features

        with tf.variable_scope(name_or_scope=None, default_name='bert'):
            with tf.variable_scope(name_or_scope='embeddings'):
                embedding_output, embedding_table = embedding_lookup(
                    self.input_ids,
                    bert_config.vocab_size,
                    bert_config.hidden_size,
                    bert_config.initializer_range,
                    word_embedding_name='word_embeddings')

                embedding_output = embedding_postprocessor(
                    embedding_output,
                    use_token_type=True,
                    token_type_ids=self.segment_ids,
                    token_type_vocab_size=bert_config.type_vocab_size,
                    use_position_embeddings=True,
                    token_type_embedding_name='token_type_embeddings',
                    position_embedding_name='position_embeddings',
                    initializer_range=bert_config.initializer_range,
                    max_position_embeddings=bert_config.max_position_embeddings,
                    dropout_prob=bert_config.hidden_dropout_prob)

            with tf.variable_scope(name_or_scope='encoder'):
                attention_mask = create_attention_mask_from_input_mask(
                    self.input_ids, self.input_masks)
                self.all_encoder_layers = tranformer_model(
                    input_tensor=embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=bert_config.hidden_size,
                    num_hidden_layers=bert_config.num_hidden_layers,
                    num_attention_heads=bert_config.num_attention_heads,
                    intermediate_size=bert_config.intermediate_size,
                    intermediate_act_fn=gelu,  # TODO gelu -> .
                    hidden_dropout_prob=bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
                    initializer_range=bert_config.initializer_range,
                    do_return_all_layers=True)

            self.similarity_output = self.all_encoder_layers[
                self.CONFIG['similarity_layer']]
            self.elmo_output = self.all_encoder_layers[-1]

            with tf.variable_scope('pooler'):
                first_token_tensor = tf.squeeze(
                    self.similarity_output[:, 0:1, :], axis=1)
                self.pooled_output = tf.layers.dense(
                    inputs=first_token_tensor,
                    units=bert_config.hidden_size,
                    activation=tf.nn.tanh,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=bert_config.initializer_range))

        final_layer = self.similarity_output

        output_weights = tf.get_variable(
            'cls/squad/output_weights',
            shape=[2, bert_config.hidden_size],
            initializer=tf.truncated_normal_initializer(
                stddev=bert_config.initializer_range))
        output_bias = tf.get_variable(
            'cls/squad/output_bias',
            shape=[2],
            initializer=tf.truncated_normal_initializer(
                stddev=bert_config.initializer_range))

        final_layer = tf.reshape(final_layer,
                                 shape=[-1, bert_config.hidden_size])
        logits = tf.matmul(final_layer, output_weights,
                           transpose_b=True) + output_bias
        logits = tf.reshape(logits, shape=[1, -1, 2])  # assumes questions arrive one at a time
        logits = tf.transpose(logits, perm=[2, 0, 1])
        unstacked_logits = tf.unstack(logits, axis=0)

        self.start_logits = unstacked_logits[0]
        self.end_logits = unstacked_logits[1]
        self.start_pred = tf.argmax(self.start_logits, axis=-1)
        self.end_pred = tf.argmax(self.end_logits, axis=-1)

    def load_checkpoint(self):
        if self.mode == 0:
            model_path = self.CONFIG['model_path-search']
        elif self.mode == 1:
            model_path = self.CONFIG['model_path-similarity']

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        tvars = tf.trainable_variables()
        assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
            tvars, model_path)  # 201
        tf.train.init_from_checkpoint(model_path, assignment_map)

        self.sess = tf.Session()  # TODO fix the bug that requires loading the checkpoint twice before it works
        self.sess.run(tf.global_variables_initializer())
        tvars = tf.trainable_variables()
        assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
            tvars, model_path)  # 201
        tf.train.init_from_checkpoint(model_path, assignment_map)

        for var in tvars:
            if var.name in initialized_variable_names:
                print(var.name, ' - INIT FROM CKPT')

    def _convert_to_feature(self, chat, context):
        return self.preprocessor.create_InputFeature(chat, context=context)

    def predict(self, chat, text):
        input_feature = self._convert_to_feature(chat, text)
        feed_dict = {
            self.input_ids: np.array(input_feature.input_ids).reshape((1, -1)),
            self.input_masks: np.array(input_feature.input_masks).reshape(1, -1),
            self.segment_ids: np.array(input_feature.segment_ids).reshape(1, -1)
        }
        start, end = self.sess.run([self.start_pred, self.end_pred], feed_dict)
        # start_n, end_n = sess.run([start_n_best, end_n_best], feed_dict)  # TODO n best answers
        return self.preprocessor.idx_to_orig(start, end, input_feature)

    def extract_feature_vector(self, input_feature):
        tic = time.time()
        length = np.sum(input_feature.input_masks)
        feed_dict = {
            self.input_ids: np.array(input_feature.input_ids).reshape((1, -1)),
            self.input_masks: np.array(input_feature.input_masks).reshape(1, -1),
            self.segment_ids: np.array(input_feature.segment_ids).reshape(1, -1)
        }
        sequence_output = self.sess.run(self.similarity_output, feed_dict)
        # average the token vectors, excluding [CLS] and [SEP]
        feature_vector = np.mean(sequence_output[:, 1:length - 1], axis=1)
        toc = time.time()
        print('*** Vectorizing Done: %5.3f ***' % (toc - tic))
        return np.reshape(feature_vector, newshape=(-1))

    # def extract_elmo_feature_vector(self, input_feature):
    #     tic = time.time()
    #     feed_dict = {self.input_ids: np.array(input_feature.input_ids).reshape((1, -1)),
    #                  self.input_masks: np.array(input_feature.input_masks).reshape(1, -1),
    #                  self.segment_ids: np.array(input_feature.segment_ids).reshape(1, -1)}
    #     elmo_output = self.sess.run(self.elmo_output, feed_dict)

    def search_to_saved_model(self):
        MODEL_DIR = self.CONFIG['MODEL_DIR']
        version = self.CONFIG['version-search']
        export_path = os.path.join(MODEL_DIR, 'search', str(version))
        print('export_path = {}\n'.format(export_path))

        if os.path.isdir(export_path):
            print('\nSavedModel already exists at this path, skipping export\n')
            return

        builder = tf.saved_model.builder.SavedModelBuilder(export_path)

        input_ids = tf.saved_model.utils.build_tensor_info(self.input_ids)
        input_masks = tf.saved_model.utils.build_tensor_info(self.input_masks)
        segment_ids = tf.saved_model.utils.build_tensor_info(self.segment_ids)
        start_pred = tf.saved_model.utils.build_tensor_info(self.start_logits)
        end_pred = tf.saved_model.utils.build_tensor_info(self.end_logits)

        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    'input_ids': input_ids,
                    'input_masks': input_masks,
                    'segment_ids': segment_ids
                },
                outputs={
                    'start_pred': start_pred,
                    'end_pred': end_pred
                },
                method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

        signature_def_map = {
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                prediction_signature
        }

        builder.add_meta_graph_and_variables(
            self.sess,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map=signature_def_map)
        builder.save()
        print('GENERATED SAVED MODEL')

    def ef_to_saved_model(self):
        MODEL_DIR = self.CONFIG['MODEL_DIR']
        version = self.CONFIG['version-similarity']
        export_path = os.path.join(MODEL_DIR, 'similarity', str(version))
        print('export_path = {}\n'.format(export_path))

        if os.path.isdir(export_path):
            print('\nSavedModel already exists at this path, skipping export\n')
            return

        builder = tf.saved_model.builder.SavedModelBuilder(export_path)

        input_ids = tf.saved_model.utils.build_tensor_info(self.input_ids)
        input_masks = tf.saved_model.utils.build_tensor_info(self.input_masks)
        segment_ids = tf.saved_model.utils.build_tensor_info(self.segment_ids)
        similarity_output = tf.saved_model.utils.build_tensor_info(
            self.similarity_output)

        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    'input_ids': input_ids,
                    'input_masks': input_masks,
                    'segment_ids': segment_ids
                },
                outputs={'similarity_output': similarity_output},
                method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

        signature_def_map = {
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                prediction_signature
        }

        builder.add_meta_graph_and_variables(
            self.sess,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map=signature_def_map)
        builder.save()
        print('GENERATED SAVED MODEL')
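# Usage sketch (assumption, not part of the original source): build the QA
# graph, restore weights from the checkpoint configured in config.BERT, answer
# one question against a context passage, then export a SavedModel for
# TensorFlow Serving. The Korean question/context strings are placeholders.
if __name__ == '__main__':
    qa_model = Model(mode=0)  # 0: search (QA), 1: similarity
    qa_model.load_checkpoint()
    context = '도서관은 중앙캠퍼스 3층에 있습니다.'  # placeholder context
    answer = qa_model.predict('도서관은 어디에 있나요?', context)
    print(answer)
    qa_model.search_to_saved_model()  # export for TensorFlow Serving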
class TensorServer(metaclass=Singleton):
    def __init__(self):
        self.preprocessor = PreProcessor()
        self.CONFIG = config.TENSOR_SERVING

        search_v = json.loads(requests.get(self.CONFIG['url-search-v']).text)
        sentiment_v = json.loads(
            requests.get(self.CONFIG['url-sentiment-v']).text)
        similarity_v = json.loads(
            requests.get(self.CONFIG['url-similarity-v']).text)

        print('TensorServer Running')
        print('QA - {}'.format(search_v))
        print('Sentiment - {}'.format(sentiment_v))
        print('Similarity - {}'.format(similarity_v))

    @staticmethod
    def create_request(features):
        request_json = {
            'instances': [{
                'input_ids': features.input_ids,
                'input_masks': features.input_masks,
                'segment_ids': features.segment_ids
            }]
        }
        return request_json

    def sentiment(self, chat):
        chat, _ = self.preprocessor.clean(chat=chat)
        features = self.preprocessor.create_InputFeature(query_text=chat)
        response = requests.post(self.CONFIG['url-sentiment'],
                                 json=self.create_request(features))
        predict = json.loads(response.text)['predictions'][0]
        return predict

    def similarity(self, chat):
        chat, _ = self.preprocessor.clean(chat=chat)
        features = self.preprocessor.create_InputFeature(query_text=chat)
        _length = np.sum(features.input_masks)
        response = requests.post(self.CONFIG['url-similarity'],
                                 json=self.create_request(features))
        response = json.loads(response.text)
        similarity_vector = response['predictions'][0]
        # similarity_vector = np.mean(np.array(similarity_vector), axis=0)
        # similarity_vector = np.mean(np.array(similarity_vector)[:_length, :], axis=0)
        # similarity_vector = np.mean(np.array(similarity_vector)[1: _length - 1, :], axis=0)
        similarity_vector = np.array(similarity_vector)[1:_length - 1]
        # similarity_vector = np.array(similarity_vector)[0]
        return similarity_vector

    def search(self, chat, context):
        chat, _ = self.preprocessor.clean(chat=chat)
        features = self.preprocessor.create_InputFeature(chat, context)
        response = requests.post(self.CONFIG['url-search'],
                                 json=self.create_request(features))
        response = json.loads(response.text)
        start = response['predictions'][0]['start_pred']
        end = response['predictions'][0]['end_pred']
        start = np.argmax(start, axis=-1)
        end = np.argmax(end, axis=-1)
        return self.preprocessor.idx_to_orig(start, end, features)
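# Usage sketch (assumption, not part of the original source): TensorServer is a
# singleton REST client for the TensorFlow Serving endpoints configured in
# config.TENSOR_SERVING; the chat/context strings below are placeholder inputs.
if __name__ == '__main__':
    server = TensorServer()
    print(server.sentiment('오늘 날씨 정말 좋다'))  # sentiment prediction
    vec = server.similarity('셔틀버스 언제 와요?')  # per-token feature vectors, [CLS]/[SEP] removed
    print(vec.shape)
    print(server.search('도서관 어디야?', '도서관은 중앙캠퍼스 3층에 있습니다.'))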