Example #1
import os

import pandas as pd
import tensorflow as tf
from tqdm import tqdm


def create_data(file, file_tf):
    # Load the data and write it out as a TFRecord file
    # Load data
    DATA_train = pd.read_csv(file, sep='\t')
    print('Dataset size: ', len(DATA_train))
    if os.path.exists(file_tf):
        print('FILE ALREADY EXISTS {}'.format(file_tf))
        return

    # Drop rows with missing values
    DATA_train.dropna(axis=0, inplace=True)
    # Extract sentences and labels
    X = DATA_train['document'].values
    Y = DATA_train['label'].values
    # Preprocess and tokenize the sentences
    from src.data.preprocessor import PreProcessor
    prep = PreProcessor()

    ## Preprocessing step 1: cleaning
    X = list(map(lambda x: prep.clean(x)[0], X))

    ## Preprocessing step 2: tokenization into InputFeatures objects
    X = list(
        map(lambda x: prep.create_InputFeature(x),
            tqdm(X, desc='create_InputFeature')))

    #  write TFRecord dataset
    with tf.python_io.TFRecordWriter(file_tf) as writer:

        def _int64List_feature(value):
            """Returns an int64_list from a bool / enum / int / uint."""
            return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

        def _int64_feature(value):
            """Returns an int64_list from a bool / enum / int / uint."""
            return tf.train.Feature(int64_list=tf.train.Int64List(
                value=[value]))

        for i in tqdm(range(len(X)), desc='Writing to {}'.format(file_tf)):
            feature = {
                'input_ids': _int64List_feature(X[i].input_ids),
                'segment_ids': _int64List_feature(X[i].segment_ids),
                'input_masks': _int64List_feature(X[i].input_masks),
                'label': _int64_feature(Y[i])
            }
            features = tf.train.Features(feature=feature)
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())
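
The TFRecord file produced above can be read back with the tf.data API. The following is a minimal sketch, not part of the original example, assuming TensorFlow 1.x (to match tf.python_io.TFRecordWriter) and that max_seq_length equals the sequence length used by PreProcessor.create_InputFeature:

# Sketch: parse the TFRecord written by create_data() back into batched tensors.
# `max_seq_length` is an assumed hyperparameter; it must match the preprocessor.
def load_tfrecord_dataset(file_tf, max_seq_length, batch_size=32):
    feature_spec = {
        'input_ids': tf.FixedLenFeature([max_seq_length], tf.int64),
        'segment_ids': tf.FixedLenFeature([max_seq_length], tf.int64),
        'input_masks': tf.FixedLenFeature([max_seq_length], tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
    }

    def _parse(serialized):
        # Decode one serialized tf.train.Example using the keys written above
        return tf.parse_single_example(serialized, feature_spec)

    dataset = tf.data.TFRecordDataset(file_tf)
    dataset = dataset.map(_parse).shuffle(10000).batch(batch_size)
    return dataset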
Example #2
class Model(object):
    def __init__(self, mode):
        '''
        :param mode: 0: search, 1: similarity
        '''
        self.mode = mode
        self.CONFIG = config.BERT
        self.preprocessor = PreProcessor()

        # placeholders
        self.input_ids = None
        self.input_masks = None
        self.segment_ids = None

        # pred indexes
        self.start_logits = None
        self.end_logtis = None
        self.start_pred = None
        self.end_pred = None

        # tf.Session()
        self.sess = None

        # feature vectors
        self.all_encoder_layers = None
        self.pooled_output = None
        self.feature_vector = None
        self.similarity_output = None

        self.build_model()

    def build_model(self):
        if self.mode == 0:
            bert_json = self.CONFIG['bert_json']
            max_seq_length = self.CONFIG['max_seq_length-search']
        elif self.mode == 1:
            bert_json = self.CONFIG['bert_json']
            model_path = self.CONFIG['model_path-similarity']
            max_seq_length = self.CONFIG['max_seq_length-similarity']

        bert_config = BertConfig()
        bert_config.read_from_json_file(bert_json)

        self.input_ids = tf.placeholder(dtype=tf.int32,
                                        shape=[None, max_seq_length])
        self.input_masks = tf.placeholder(dtype=tf.int32,
                                          shape=[None, max_seq_length])
        self.segment_ids = tf.placeholder(dtype=tf.int32,
                                          shape=[None, max_seq_length])

        embedding_output = None  # sum of Token, segment, position
        embedding_table = None  # id embedding table
        self.all_encoder_layers = None  # transformer model
        self.similarity_output = None  # output layer
        self.elmo_output = None  # layer used to extract ELMo-style feature vectors

        with tf.variable_scope(name_or_scope=None, default_name='bert'):
            with tf.variable_scope(name_or_scope='embeddings'):
                embedding_output, embedding_table = embedding_lookup(
                    self.input_ids,
                    bert_config.vocab_size,
                    bert_config.hidden_size,
                    bert_config.initializer_range,
                    word_embedding_name='word_embeddings')
                embedding_output = embedding_postprocessor(
                    embedding_output,
                    use_token_type=True,
                    token_type_ids=self.segment_ids,
                    token_type_vocab_size=bert_config.type_vocab_size,
                    use_position_embeddings=True,
                    token_type_embedding_name='token_type_embeddings',
                    position_embedding_name='position_embeddings',
                    initializer_range=bert_config.initializer_range,
                    max_position_embeddings=bert_config.
                    max_position_embeddings,
                    dropout_prob=bert_config.hidden_dropout_prob)

            with tf.variable_scope(name_or_scope='encoder'):
                attention_mask = create_attention_mask_from_input_mask(
                    self.input_ids, self.input_masks)
                self.all_encoder_layers = tranformer_model(
                    input_tensor=embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=bert_config.hidden_size,
                    num_hidden_layers=bert_config.num_hidden_layers,
                    num_attention_heads=bert_config.num_attention_heads,
                    intermediate_size=bert_config.intermediate_size,
                    intermediate_act_fn=gelu,  # TODO: use the activation from bert_config.hidden_act instead of hardcoded gelu
                    hidden_dropout_prob=bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=bert_config.
                    attention_probs_dropout_prob,
                    initializer_range=bert_config.initializer_range,
                    do_return_all_layers=True)

                self.similarity_output = self.all_encoder_layers[
                    self.CONFIG['similarity_layer']]
                self.elmo_output = self.all_encoder_layers[-1]

            with tf.variable_scope('pooler'):
                first_token_tensor = tf.squeeze(self.similarity_output[:,
                                                                       0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    inputs=first_token_tensor,
                    units=bert_config.hidden_size,
                    activation=tf.nn.tanh,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=bert_config.initializer_range))

        final_layer = self.similarity_output

        output_weights = tf.get_variable(
            'cls/squad/output_weights',
            shape=[2, bert_config.hidden_size],
            initializer=tf.truncated_normal_initializer(
                stddev=bert_config.initializer_range))
        output_bias = tf.get_variable(
            'cls/squad/output_bias',
            shape=[2],
            initializer=tf.zeros_initializer())

        final_layer = tf.reshape(final_layer,
                                 shape=[-1, bert_config.hidden_size])
        logits = tf.matmul(final_layer, output_weights,
                           transpose_b=True) + output_bias

        logits = tf.reshape(logits, shape=[1, -1, 2])  # assumes questions arrive one at a time (batch size 1)
        logits = tf.transpose(logits, perm=[2, 0, 1])

        unstacked_logits = tf.unstack(logits, axis=0)

        self.start_logits = unstacked_logits[0]
        self.end_logtis = unstacked_logits[1]

        self.start_pred = tf.argmax(self.start_logits, axis=-1)
        self.end_pred = tf.argmax(self.end_logtis, axis=-1)

    def load_checkpoint(self):
        if self.mode == 0:
            model_path = self.CONFIG['model_path-search']
        elif self.mode == 1:
            model_path = self.CONFIG['model_path-similarity']

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        tvars = tf.trainable_variables()
        assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
            tvars, model_path)
        tf.train.init_from_checkpoint(model_path, assignment_map)
        self.sess = tf.Session()  # TODO: fix the bug that requires running this initialization twice
        self.sess.run(tf.global_variables_initializer())
        tvars = tf.trainable_variables()
        assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
            tvars, model_path)
        tf.train.init_from_checkpoint(model_path, assignment_map)

        for var in tvars:
            if var.name in initialized_variable_names:
                print(var.name, ' - INIT FROM CKPT')

    def _convert_to_feature(self, chat, context):
        return self.preprocessor.create_InputFeature(chat, context=context)

    def predict(self, chat, text):

        input_feature = self._convert_to_feature(chat, text)

        feed_dict = {
            self.input_ids: np.array(input_feature.input_ids).reshape((1, -1)),
            self.input_masks:
            np.array(input_feature.input_masks).reshape(1, -1),
            self.segment_ids:
            np.array(input_feature.segment_ids).reshape(1, -1)
        }

        start, end = self.sess.run([self.start_pred, self.end_pred], feed_dict)
        # start_n, end_n = sess.run([start_n_best, end_n_best], feed_dict) # TODO n best answers

        return self.preprocessor.idx_to_orig(start, end, input_feature)

    def extract_feature_vector(self, input_feature):
        tic = time.time()
        length = np.sum(input_feature.input_masks)
        feed_dict = {
            self.input_ids: np.array(input_feature.input_ids).reshape((1, -1)),
            self.input_masks:
            np.array(input_feature.input_masks).reshape(1, -1),
            self.segment_ids:
            np.array(input_feature.segment_ids).reshape(1, -1)
        }
        sequence_output = self.sess.run(self.similarity_output, feed_dict)
        feature_vector = np.mean(sequence_output[:, 1:length - 1],
                                 axis=1)  # average the token vectors, excluding [CLS] and [SEP]
        toc = time.time()
        print('*** Vectorizing Done: %5.3f ***' % (toc - tic))
        return np.reshape(feature_vector, newshape=(-1))

    # def extract_elmo_feature_vector(self, input_feature):
    #     tic = time.time()
    #     feed_dict = {self.input_ids: np.array(input_feature.input_ids).reshape((1, -1)),
    #                  self.input_masks: np.array(input_feature.input_masks).reshape(1, -1),
    #                  self.segment_ids: np.array(input_feature.segment_ids).reshape(1, -1)}
    #     elmo_output = self.sess.run(self.elmo_output, feed_dict)

    def search_to_saved_model(self):
        MODEL_DIR = self.CONFIG['MODEL_DIR']
        version = self.CONFIG['version-search']
        export_path = os.path.join(MODEL_DIR, 'search', str(version))
        print('export_path = {}\n'.format(export_path))
        if os.path.isdir(export_path):
            print('\nA model is already saved at this path, skipping export\n')
            return
        builder = tf.saved_model.builder.SavedModelBuilder(export_path)

        input_ids = tf.saved_model.utils.build_tensor_info(self.input_ids)
        input_masks = tf.saved_model.utils.build_tensor_info(self.input_masks)
        segment_ids = tf.saved_model.utils.build_tensor_info(self.segment_ids)

        start_pred = tf.saved_model.utils.build_tensor_info(self.start_logits)
        end_pred = tf.saved_model.utils.build_tensor_info(self.end_logtis)

        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    'input_ids': input_ids,
                    'input_masks': input_masks,
                    'segment_ids': segment_ids
                },
                outputs={
                    'start_pred': start_pred,
                    'end_pred': end_pred
                },
                method_name=tf.saved_model.signature_constants.
                PREDICT_METHOD_NAME))

        signature_def_map = {
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            prediction_signature
        }

        builder.add_meta_graph_and_variables(
            self.sess,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map=signature_def_map)

        builder.save()
        print('GENERATED SAVED MODEL')

    def ef_to_saved_model(self):
        MODEL_DIR = self.CONFIG['MODEL_DIR']
        version = self.CONFIG['version-similarity']
        export_path = os.path.join(MODEL_DIR, 'similarity', str(version))
        print('export_path = {}\n'.format(export_path))
        if os.path.isdir(export_path):
            print('\nA model is already saved at this path, skipping export\n')
            return
        builder = tf.saved_model.builder.SavedModelBuilder(export_path)
        input_ids = tf.saved_model.utils.build_tensor_info(self.input_ids)
        input_masks = tf.saved_model.utils.build_tensor_info(self.input_masks)
        segment_ids = tf.saved_model.utils.build_tensor_info(self.segment_ids)

        similarity_output = tf.saved_model.utils.build_tensor_info(
            self.similarity_output)

        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    'input_ids': input_ids,
                    'input_masks': input_masks,
                    'segment_ids': segment_ids
                },
                outputs={'similarity_output': similarity_output},
                method_name=tf.saved_model.signature_constants.
                PREDICT_METHOD_NAME))

        signature_def_map = {
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            prediction_signature
        }

        builder.add_meta_graph_and_variables(
            self.sess,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map=signature_def_map)

        builder.save()
        print('GENERATED SAVED MODEL')
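
For reference, a minimal usage sketch of the class above in search (question answering) mode; the config keys are the ones referenced in build_model() and load_checkpoint(), and the question/context strings are placeholders, not from the source:

# Sketch: build the graph, restore weights, answer a question, export for serving.
model = Model(mode=0)           # 0 = search (QA) mode, builds the start/end span heads
model.load_checkpoint()         # restores weights from CONFIG['model_path-search']

# chat = question, text = context passage (placeholder strings for illustration)
answer = model.predict(chat='Who maintains the cafeteria menu?',
                       text='The cafeteria menu is maintained by the student council.')
print(answer)

model.search_to_saved_model()   # export a SavedModel for TensorFlow Serving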
Example #3
class TensorServer(metaclass=Singleton):
    def __init__(self):
        self.preprocessor = PreProcessor()
        self.CONFIG = config.TENSOR_SERVING
        search_v = json.loads(requests.get(self.CONFIG['url-search-v']).text)
        sentiment_v = json.loads(requests.get(self.CONFIG['url-sentiment-v']).text)
        similarity_v = json.loads(requests.get(self.CONFIG['url-similarity-v']).text)
        print('TensorServer Running')
        print('QA - {}'.format(search_v))
        print('Sentiment - {}'.format(sentiment_v))
        print('Similarity - {}'.format(similarity_v))

    @staticmethod
    def create_request(features):
        request_json = {
            'instances': [
                {
                    'input_ids': features.input_ids,
                    'input_masks': features.input_masks,
                    'segment_ids': features.segment_ids
                }
            ]
        }
        return request_json

    def sentiment(self, chat):
        chat, _ = self.preprocessor.clean(chat=chat)
        features = self.preprocessor.create_InputFeature(query_text=chat)
        response = requests.post(self.CONFIG['url-sentiment'], json=self.create_request(features))
        predict = json.loads(response.text)['predictions'][0]
        return predict

    def similarity(self, chat):
        chat, _ = self.preprocessor.clean(chat=chat)
        features = self.preprocessor.create_InputFeature(query_text=chat)
        _length = np.sum(features.input_masks)

        response = requests.post(self.CONFIG['url-similarity'], json=self.create_request(features))
        response = json.loads(response.text)
        similarity_vector = response['predictions'][0]
        # similarity_vector = np.mean(np.array(similarity_vector), axis=0)
        # similarity_vector = np.mean(np.array(similarity_vector)[:_length, :], axis=0)
        # similarity_vector = np.mean(np.array(similarity_vector)[1: _length - 1, :], axis=0)
        similarity_vector = np.array(similarity_vector)[1:_length - 1]
        # similarity_vector = np.array(similarity_vector)[0]

        return similarity_vector

    def search(self, chat, context):
        chat, _ = self.preprocessor.clean(chat=chat)
        features = self.preprocessor.create_InputFeature(chat, context)

        response = requests.post(self.CONFIG['url-search'], json=self.create_request(features))
        response = json.loads(response.text)

        start = response['predictions'][0]['start_pred']
        end = response['predictions'][0]['end_pred']

        start = np.argmax(start, axis=-1)
        end = np.argmax(end, axis=-1)
        return self.preprocessor.idx_to_orig(start, end, features)
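
The per-token vectors returned by similarity() can be pooled into a single sentence vector and compared with cosine similarity. A minimal sketch, not part of the original class; mean pooling is an assumption here, mirroring the commented-out pooling variants above:

import numpy as np

def sentence_similarity(server, chat_a, chat_b):
    # Mean-pool the token vectors from TensorServer.similarity() into one
    # sentence vector per utterance, then return their cosine similarity.
    vec_a = np.mean(server.similarity(chat_a), axis=0)
    vec_b = np.mean(server.similarity(chat_b), axis=0)
    return float(np.dot(vec_a, vec_b) /
                 (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))

# Example (placeholder sentences):
# server = TensorServer()
# print(sentence_similarity(server, 'When does the library open?',
#                           'What are the library opening hours?'))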