Example #1
    def __init__(self, conf):
        self.task_type = 'classify'
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.pre = Preprocess()
        self.model_loaded = False
        self.zdy = {}
        csv = pd.read_csv(self.ori_path,
                          header=0,
                          sep=",",
                          error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        self.num_class = len(set(self.label_list))
        self.num_output = self.num_class
        logging.info(
            f">>>>>>>>>>>> class num:{self.num_class} <<<<<<<<<<<<<<<")
        for idx, text in enumerate(self.text_list):
            self.text_list[idx] = self.pre.get_dl_input_by_text(text)
            if len(self.text_list[idx]) == 0:
                logging.error(f"find blank lines in {idx}")

        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)
Example #2
 def __init__(self, conf):
     self.task_type = 'match'
     self.conf = conf
     for attr in conf:
         setattr(self, attr, conf[attr])
     self.graph = tf.get_default_graph()
     self.pre = Preprocess()
     self.model_loaded = False
     self.zdy = {}
     csv = pd.read_csv(self.ori_path, header = 0, sep=",", error_bad_lines=False)
     self.text_list = list(csv['text'])
     self.label_list = list(csv['target'])
     self.num_class = len(set(self.label_list))
     logging.info(f">>>>>>>>>>>>>>class num:{self.num_class}")
     self.text_list = [self.pre.get_dl_input_by_text(text) for text in \
                       self.text_list]
     self.conf.update({
         "maxlen": self.maxlen,
         "maxlen1": self.maxlen,
         "maxlen2": self.maxlen,
         "num_class": self.num_class,
         "embedding_size": self.embedding_size,
         "batch_size": self.batch_size,
         "num_output": self.num_output,
         "keep_prob": 1,
         "is_training": False,
     })
     self.encoder = encoder[self.encoder_type](**self.conf)
Example #3
 def read_data(self):
     self.pre = Preprocess()
     self.util = NERUtil()
     self.text_list, self.label_list = self.util.load_ner_data(self.ori_path)
     self.text_list = [self.pre.get_dl_input_by_text(text, self.use_generalization) for text in self.text_list]
     self.num_class = self.num_output = len(set(list(chain.from_iterable(self.label_list))))
     self.data_type = 'column_2'
Example #4
 def read_data(self):
     self.pre = Preprocess()
     csv = pd.read_csv(self.ori_path,
                       header=0,
                       sep="\t",
                       error_bad_lines=False)
     if 'text' in csv.keys() and 'target' in csv.keys():
         #format: text \t target
         #for this format, the size for each class should be larger than 2
         self.text_list = list(csv['text'])
         self.label_list = list(csv['target'])
         self.data_type = 'column_2'
     elif 'text_a' in csv.keys() and 'text_b' in csv.keys(
     ) and 'target' in csv.keys():
         #format: text_a \t text_b \t target
         #for this format, the target value can only be chosen from 0 or 1
         self.text_a_list = list(csv['text_a'])
         self.text_b_list = list(csv['text_b'])
         self.text_list = self.text_a_list + self.text_b_list
         self.label_list = list(csv['target'])
         self.data_type = 'column_3'
     else:
         raise ValueError('error format for train file')
     self.text_list = [self.pre.get_dl_input_by_text(text) for text in \
                       self.text_list]
Example #5
    def __init__(self, conf):
        self.conf = conf
        self.task_type = 'classify'
        for attr in conf:
            setattr(self, attr, conf[attr])

        self.is_training = tf.placeholder(tf.bool, [], name="is_training")
        self.global_step = tf.Variable(0, trainable=False)
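        # dropout keep probability follows the is_training placeholder: 0.5 while training, 1.0 otherwise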
        self.keep_prob = tf.where(self.is_training, 0.5, 1.0)

        self.pre = Preprocess()
        self.text_list, self.label_list = load_classify_data(self.train_path)
        self.text_list = [self.pre.get_dl_input_by_text(text) for text in self.text_list]

        if not self.use_language_model:
            #build vocabulary map using training data
            self.vocab_dict = embedding[self.embedding_type].build_dict(dict_path = self.dict_path, 
                                                                  text_list = self.text_list)

            #define embedding object by embedding_type
            self.embedding = embedding[self.embedding_type](text_list = self.text_list,
                                                            vocab_dict = self.vocab_dict,
                                                            dict_path = self.dict_path,
                                                            random=self.rand_embedding,
                                                            batch_size = self.batch_size,
                                                            maxlen = self.maxlen,
                                                            embedding_size = self.embedding_size,
                                                            conf = self.conf)
            self.embed = self.embedding(name = 'x')
        self.y = tf.placeholder(tf.int32, [None], name="y")

        #model params
        params = conf
        params.update({
            "maxlen":self.maxlen,
            "embedding_size":self.embedding_size,
            "keep_prob":self.keep_prob,
            "batch_size": self.batch_size,
            "num_output": self.num_class,
            "is_training": self.is_training
        })
        self.encoder = encoder[self.encoder_type](**params)

        if not self.use_language_model:
            self.out = self.encoder(self.embed)
        else:
            self.out = self.encoder()
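        # record the op name of the output tensor (its tensor name without the ':0' suffix)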
        self.output_nodes = self.out.name.split(':')[0]
        self.loss(self.out)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        if self.use_language_model:
            tvars = tf.trainable_variables()
            init_checkpoint = conf['init_checkpoint_path']
            (assignment_map, initialized_variable_names) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint,assignment_map)
Example #6
 def read_data(self):
     self.pre = Preprocess()
     csv = pd.read_csv(self.ori_path,
                       header=0,
                       sep="\t",
                       error_bad_lines=False)
     self.text_list = list(csv['text'])
     self.label_list = list(csv['target'])
     for idx, text in enumerate(self.text_list):
         self.text_list[idx] = self.pre.get_dl_input_by_text(text)
         if len(self.text_list[idx]) == 0:
             logging.error("find blank lines in %s" % idx)
     self.data_type = 'column_2'
Example #7
def main(_):
    for map_name in env_names:
        if rl_algo == 'ddpg':
            from agent.ddpg import DDPGAgent
            from networks.acnetwork_q_seperated import ActorNet, CriticNet
            from utils.memory import SequentialMemory

            actor = ActorNet()
            critic = CriticNet()
            memory = SequentialMemory(limit=arglist.DDPG.memory_limit)
            learner = DDPGAgent(actor, critic, memory)

        elif rl_algo == 'ppo':
            from agent.ppo import PPOAgent
            from networks.acnetwork_v_seperated import ActorNet, CriticNet
            from utils.memory import EpisodeMemory

            actor = ActorNet()
            critic = CriticNet()
            memory = EpisodeMemory(limit=arglist.PPO.memory_limit,
                                   action_shape=arglist.action_shape,
                                   observation_shape=arglist.observation_shape)
            learner = PPOAgent(actor, critic, memory)

        else:
            raise NotImplementedError()

        preprocess = Preprocess()
        game = MiniGame(map_name, learner, preprocess, nb_episodes=10000)
        game.run_ddpg()
    return 0
Example #8
 def read_data(self):
     self.pre = Preprocess()
     encode_list, decode_list, target_list =\
         load_chat_data(self.ori_path)
     self.text_list = encode_list + decode_list
     self.label_list = target_list
     self.data_type = 'translation'
Example #9
class Classify(TaskBase):
    def __init__(self, conf):
        super(Classify, self).__init__(conf)
        self.task_type = 'classify'
        self.conf = conf
        self.read_data()
        self.num_class = len(set(self.label_list))
        self.num_output = self.num_class
        logging.info(">>>>>>>>>>>> class num:%s <<<<<<<<<<<<<<<" %
                     self.num_class)
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def read_data(self):
        self.pre = Preprocess()
        csv = pd.read_csv(self.ori_path,
                          header=0,
                          sep="\t",
                          error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        for idx, text in enumerate(self.text_list):
            self.text_list[idx] = self.pre.get_dl_input_by_text(text)
            if len(self.text_list[idx]) == 0:
                logging.error("find blank lines in %s" % idx)
        self.data_type = 'column_2'

    def create_model_fn(self):
        def cal_loss(pred, labels, batch_size, conf):
            loss = get_loss(type=self.loss_type,
                            logits=pred,
                            labels=labels,
                            labels_sparse=True,
                            **conf)
            return loss

        def model_fn(features, labels, mode, params):
            #model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()

            #############  encoder  #################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                self.embed_query = self.embedding(features=features,
                                                  name='x_query')
                out = self.encoder(self.embed_query,
                                   name='x_query',
                                   features=features)
            else:
                out = self.encoder(features=features)
            #pred = tf.nn.softmax(tf.layers.dense(out, self.num_class))
            pred = tf.nn.softmax(out)
            pred_labels = tf.argmax(pred, axis=-1)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'encode': out,
                    'logit': pred,
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode,
                                                  predictions=predictions)

            ############### loss ##################
            loss = cal_loss(pred, labels, self.batch_size, self.conf)

            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                return self.train_estimator_spec(mode, loss, global_step,
                                                 params)

            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {
                    "accuracy":
                    tf.metrics.accuracy(labels=labels, predictions=pred_labels)
                }
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            size = self.num_class
            num_classes_per_batch = self.num_class_per_batch
            assert num_classes_per_batch <= self.num_class, \
                "num_classes_per_batch is %s > %s"%(num_classes_per_batch, self.num_class)
            num_sentences_per_class = self.batch_size // num_classes_per_batch

            filenames = [
                os.path.join(self.tfrecords_path, item)
                for item in os.listdir(self.tfrecords_path)
                if item.startswith('train')
            ]
            if len(filenames) == 0:
                logging.warn(
                    "Can't find any tfrecords file for train, prepare now!")
                self.prepare()
                filenames = [
                    os.path.join(self.tfrecords_path, item)
                    for item in os.listdir(self.tfrecords_path)
                    if item.startswith('train')
                ]
            assert size == len(filenames), "each file represent one class"
            logging.info("tfrecords train class num: {}".format(
                len(filenames)))
            logging.info("tfrecords num_sentences_per_class:{}".format(
                num_sentences_per_class))
            logging.info("tfrecords num_classes_per_batch:{}".format(
                num_classes_per_batch))
            datasets = [
                tf.data.TFRecordDataset(filename) for filename in filenames
            ]
            datasets = [dataset.repeat() for dataset in datasets]

            #assert self.batch_size == num_sentences_per_class* num_classes_per_batch
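            # the generator picks num_classes_per_batch random class ids per cycle and repeats each
            # num_sentences_per_class times; choose_from_datasets then reads that many records from the
            # matching per-class TFRecord file, so every batch stays class-balanced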
            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(
                generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(
                datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            #test
            #sess = tf.Session()
            #features,label = sess.run([features,label])
            #features['x_query_pred'] = [item.decode('utf-8') for item in
            #                           features['x_query_pred'][1]]
            return features, label

        def test_input_fn(mode):
            filenames = [
                os.path.join(self.tfrecords_path, item)
                for item in os.listdir(self.tfrecords_path)
                if item.startswith(mode)
            ]
            assert self.num_class == len(
                filenames), "the num of tfrecords file error!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {'is_training': True, 'keep_prob': 0.7}
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def test(self, mode='test'):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        if mode == 'dev':
            estimator.evaluate(input_fn=self.create_input_fn('dev'))
        elif mode == 'test':
            estimator.evaluate(input_fn=self.create_input_fn('test'))
        else:
            raise ValueError("unknown mode:[%s]" % mode)

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}

        def get_features():
            features = {
                'x_query':
                tf.placeholder(dtype=tf.int64,
                               shape=[None, self.maxlen],
                               name='x_query'),
                'x_query_length':
                tf.placeholder(dtype=tf.int64,
                               shape=[None],
                               name='x_query_length'),
                'label':
                tf.placeholder(dtype=tf.int64, shape=[None], name='label')
            }
            features.update(self.encoder.features)
            return features

        self.save_model(self.create_model_fn(), params, get_features)
Example #10
import sys

sys.path.append(".")
import pickle
from utils.preprocess import Preprocess

f = open("./train/chatbot_bin.bin", "rb")
word_index = pickle.load(f)
f.close()

sent = "갑자기 짜장면 먹고 싶네 ㅋㅋ"

p = Preprocess("./train/chatbot_bin.bin")
pos = p.pos(sent)
keywords = p.get_keywords(pos, without_tag=True)

print(p.word_index)
print(p.get_wordidx_sequence(keywords))
for word in keywords:
    try:
        print(word, word_index[word])
    except KeyError:
        print(word, word_index["OOV"])
Example #11
class NER(TaskBase):
    def __init__(self, conf):
        super(NER, self).__init__(conf)
        self.task_type = 'ner'
        self.conf = conf
        self.read_data()
        if self.maxlen == -1:
            self.maxlen = max([len(text.split()) for text in self.text_list])
        #model params
        params = conf
        params.update({
            "maxlen":self.maxlen,
            "embedding_size":self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_class,
            "keep_prob": 1,
            "is_training": False,
        })

        #params['num_output'] = 128
        #self.encoder_base = encoder['transformer'](**params)
        #params['num_output'] = self.num_class
        self.encoder = encoder[self.encoder_type](**params)


    def read_data(self):
        self.pre = Preprocess()
        self.util = NERUtil()
        self.text_list, self.label_list = self.util.load_ner_data(self.ori_path)
        self.text_list = [self.pre.get_dl_input_by_text(text, self.use_generalization) for text in self.text_list]
        self.num_class = self.num_output = len(set(list(chain.from_iterable(self.label_list))))
        self.data_type = 'column_2'

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            if mode == tf.estimator.ModeKeys.TRAIN:
                self.encoder.keep_prob = 0.5
                self.encoder.is_training = True
            else:
                self.encoder.keep_prob = 1
                self.encoder.is_training = False

            seq_len = features['x_query_length']
            global_step = tf.train.get_or_create_global_step()

            ################ encode ##################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                embed = self.embedding(features = features, name = 'x_query')
                out = self.encoder(embed, 'x_query', features = features, middle_flag = True)
                #out = self.encoder_base(embed, 'x_query', features = features, middle_flag = True)
                #out = self.encoder(out, 'x_query', features = features, middle_flag = True)
            else:
                out = self.encoder(features = features)

            logits = tf.reshape(out, [-1, int(out.shape[1]), self.num_class])

            batch_size = get_placeholder_batch_size(logits)
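            # augment the logits with an extra 'start' tag (index num_class): the prepended start step
            # scores only that tag highly, while real steps score it very low, so CRF decoding always
            # begins from a fixed start state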
            small = -1000
            start_logits = tf.concat([
                small*tf.ones(shape=[batch_size, 1, self.num_class]), 
                tf.zeros(shape=[batch_size, 1, 1])],
                                     axis=-1)
            pad_logits = tf.cast(small * tf.ones(shape=[batch_size, self.maxlen,
                                                        1]), tf.float32)
            logits = tf.concat([logits, pad_logits], axis = -1)
            logits = tf.concat([start_logits, logits], axis = 1)
            seq_len += 1
            transition_params = tf.get_variable('crf', 
                                         [self.num_class + 1,self.num_class + 1], 
                                         dtype=tf.float32)
            pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params, seq_len)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'logit': logits,
                    'pred_ids': pred_ids,
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            else:
                ############### loss ####################
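                # prefix the labels with the start tag (index num_class) to line up with the extra start step added to the logits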
                labels = tf.concat([
                    tf.cast(self.num_class * tf.ones(shape=[batch_size, 1]), tf.int64), 
                    labels
                ], axis = -1)
                log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(logits, 
                                                                      labels,
                                                                      seq_len,
                                                                      transition_params)
                loss = -tf.reduce_mean(log_likelihood)
                if mode == tf.estimator.ModeKeys.TRAIN:
                    return self.train_estimator_spec(mode, loss, global_step, params)
                if mode == tf.estimator.ModeKeys.EVAL:
                    weights = tf.sequence_mask(seq_len, self.maxlen+1)
                    metrics = {'acc': tf.metrics.accuracy(labels, pred_ids, weights)}
                    return tf.estimator.EstimatorSpec(mode, 
                                                      loss=loss, 
                                                      eval_metric_ops=metrics)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()
        def train_input_fn():
            filenames = [os.path.join(self.tfrecords_path,item) for item in 
                         os.listdir(self.tfrecords_path) if item.startswith('train')]
            if len(filenames) == 0:
                logging.warn("Can't find any tfrecords file for train, prepare now!")
                self.prepare()
                filenames = [os.path.join(self.tfrecords_path,item) for item in 
                             os.listdir(self.tfrecords_path) if item.startswith('train')]
            dataset = tf.data.TFRecordDataset(filenames)
            dataset = dataset.repeat()

            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.shuffle(buffer_size=100*self.batch_size)
            dataset = dataset.prefetch(4*self.batch_size)
            dataset = dataset.batch(self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        def test_input_fn(mode):
            filenames = [os.path.join(self.tfrecords_path,item) for item in 
                         os.listdir(self.tfrecords_path) if item.startswith(mode)]
            assert len(filenames) > 0, "Can't find any tfrecords file for %s!"%mode
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda : test_input_fn("test")
        elif mode == 'dev':
            return lambda : test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def save(self):
        def get_features():
            features = {'x_query': tf.placeholder(dtype=tf.int64, 
                                                  shape=[None, self.maxlen],
                                                  name='x_query'),
                        'x_query_length': tf.placeholder(dtype=tf.int64,
                                                         shape=[None],
                                                         name='x_query_length'),
                        }
            features.update(self.encoder.get_features())
            return features
        self.save_model(self.create_model_fn(), None, get_features)

    def train(self):
        estimator = self.get_train_estimator(self.create_model_fn(), None)
        estimator.train(input_fn = self.create_input_fn("train"), max_steps =
                        self.max_steps)
        self.save()

    def test(self, mode = 'test'):
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(),
                                           config = config)
        if mode == 'dev':
            estimator.evaluate(input_fn=self.create_input_fn('dev'))
        elif mode == 'test':
            estimator.evaluate(input_fn=self.create_input_fn('test'))
        else:
            raise ValueError("unknown mode:[%s]"%mode)

    def train_and_evaluate(self):
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path,
                                        save_checkpoints_steps=self.save_interval,
                                        keep_checkpoint_max=5)

        estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(),
                                           config = config)

        early_stop = tf.estimator.experimental.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name="loss",
            max_steps_without_decrease=estimator.config.save_checkpoints_steps * 2,
            run_every_secs=None,
            run_every_steps=estimator.config.save_checkpoints_steps,
        )

        train_spec=tf.estimator.TrainSpec(
                             input_fn = self.create_input_fn("train"), 
                             max_steps = self.max_steps,
                             hooks=[early_stop])

        eval_spec=tf.estimator.EvalSpec(
                             input_fn = self.create_input_fn("dev"),
                             steps = None,
                             start_delay_secs = 1, # start evaluating after N seconds
                             throttle_secs = 10,  # evaluate every N seconds
                             )
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        self.save()
Example #12
class Match(object):
    def __init__(self, conf):
        self.task_type = 'match'
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.graph = tf.get_default_graph()
        self.pre = Preprocess()
        self.model_loaded = False
        self.zdy = {}
        csv = pd.read_csv(self.ori_path, header = 0, sep=",", error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        self.num_class = len(set(self.label_list))
        logging.info(f">>>>>>>>>>>>>>class num:{self.num_class}")
        self.text_list = [self.pre.get_dl_input_by_text(text) for text in \
                          self.text_list]
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def init_embedding(self):
        self.vocab_dict = embedding[self.embedding_type].build_dict(\
                                            dict_path = self.dict_path,
                                            text_list = self.text_list,
                                            mode = self.mode)
        self.embedding = embedding[self.embedding_type](text_list = self.text_list,
                                                        vocab_dict = self.vocab_dict,
                                                        dict_path = self.dict_path,
                                                        random=self.rand_embedding,
                                                        maxlen = self.maxlen,
                                                        batch_size = self.batch_size,
                                                        embedding_size =
                                                        self.embedding_size,
                                                        conf = self.conf)

    def prepare(self):
        self.init_embedding()
        self.gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
        self.gt.process(self.text_list, self.label_list, self.embedding.text2id,
                        self.encoder.encoder_fun, self.vocab_dict,
                        self.tfrecords_path, self.label_path)

    def cal_loss(self, pred, labels, pos_target, neg_target, batch_size, conf):
        if self.loss_type == 'hinge_loss':
            if self.sub_loss_type == 'all':
                loss = batch_all_triplet_loss(labels, pred, conf['margin'])
            else:
                loss = batch_hard_triplet_loss(labels, pred, conf['margin'])
        else:
            loss = get_loss(type = self.loss_type, logits = pred, labels =
                                labels, **conf)
        return loss

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            if not self.use_language_model:
                self.init_embedding()
                if self.tfrecords_mode == 'class':
                    self.embed_query = self.embedding(features = features, name = 'x_query')
                else:
                    self.embed_query = self.embedding(features = features, name = 'x_query')
                    self.embed_sample = self.embedding(features = features, name = 'x_sample')
            else:
                self.embedding = None
            #model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()
            if self.sim_mode == 'cross':
                if not self.use_language_model:
                    pred = self.encoder(x_query = self.embed_query, 
                                        x_sample = self.embed_sample,
                                        features = features)
                else:
                    pred = self.encoder(features = features)
            elif self.sim_mode == 'represent':
                if not self.use_language_model:
                    #features['x_query_length'] = features['length']
                    pred = self.encoder(self.embed_query, 
                                                     name = 'x_query', 
                                                     features = features)
                else:
                    pred = self.encoder(features = features)
            else:
                raise ValueError('unknown sim mode')

            pos_target = tf.ones(shape = [int(self.batch_size/2)], dtype = tf.float32)
            neg_target = tf.zeros(shape = [int(self.batch_size/2)], dtype = tf.float32)
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'pred': pred,
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            loss = self.cal_loss(pred,
                             labels,
                             pos_target,
                             neg_target,
                             self.batch_size,
                             self.conf)
            if mode == tf.estimator.ModeKeys.TRAIN:
                if self.use_clr:
                    self.learning_rate = cyclic_learning_rate(global_step=global_step,
                                                          learning_rate = self.learning_rate, 
                                                          mode = self.clr_mode)
                optimizer = get_train_op(global_step, 
                                         self.optimizer_type, 
                                         loss,
                                         self.learning_rate, 
                                         clip_grad = 5)
                return tf.estimator.EstimatorSpec(mode, loss = loss,
                                                      train_op=optimizer)
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                #{"accuracy": tf.metrics.accuracy(
                #    labels=labels, predictions=predictions["classes"])}
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()
        def train_input_fn():
            if self.tfrecords_mode == 'pair':
                size = self.num_pair
                num_classes_per_batch = 2
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            else:
                size = self.num_class
                num_classes_per_batch = 16
                num_sentences_per_class = self.batch_size // num_classes_per_batch

            filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path,i) \
                             for i in range(size)]
            logging.info("tfrecords train class num: {}".format(len(filenames)))
            datasets = [tf.data.TFRecordDataset(filename) for filename in filenames]
            datasets = [dataset.repeat() for dataset in datasets]
            #assert self.batch_size == num_sentences_per_class* num_classes_per_batch
            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4*self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            #test
            #sess = tf.Session()
            #features,label = sess.run([features,label])
            #features['x_query_pred'] = [item.decode('utf-8') for item in
            #                           features['x_query_pred'][1]]
            return features, label

        def test_input_fn(mode):
            filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \
                             for i in range(self.num_class)]
            assert self.num_class == len(filenames), "the num of tfrecords file error!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda : test_input_fn("test")
        elif mode == 'label':
            return lambda : test_input_fn("train")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {
            'is_training': True,
            'keep_prob': 0.5
        }
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(),
                                           config = config,
                                           params = params)
        estimator.train(input_fn = self.create_input_fn("train"), max_steps =
                        self.max_steps)
        self.save()

    def save(self):
        params = {
            'is_training': False,
            'keep_prob': 1
        }
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(),
                                           config = config,
                                           params = params)
        def serving_input_receiver_fn():
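            # placeholders exposed by the exported SavedModel for online serving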
            x_query = tf.placeholder(dtype=tf.int64, shape=[None, self.maxlen],
                                   name='x_query')
            length = tf.placeholder(dtype=tf.int64, shape=[None], name='x_query_length')
            label = tf.placeholder(dtype=tf.int64, shape=[None], name='label')

            receiver_tensors = {'x_query': x_query, 'x_query_length': length, 'label': label}
            features = {'x_query': x_query, 'x_query_length': length, 'label': label}
            return tf.estimator.export.ServingInputReceiver(receiver_tensors,
                                                            features)
        estimator.export_savedmodel(
            self.export_dir_path,  # export directory
            serving_input_receiver_fn,  # function that returns a ServingInputReceiver
            assets_extra=None,
            as_text=False,
            checkpoint_path=None)

    def test(self):
        params = {
            'is_training': False,
            'keep_prob': 1
        }
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(),
                                           config = config,
                                           params = params)
        predictions = estimator.predict(input_fn=self.create_input_fn("test"))
        predictions = list(predictions)
        predictions_vec = [item['pred'] for item in predictions]
        predictions_label = [item['label'] for item in predictions]
        if self.tfrecords_mode == 'class':
            refers = estimator.predict(input_fn=self.create_input_fn("label"))
            refers = list(refers) 

            refers_vec = [item['pred'] for item in refers]
            refers_label = [item['label'] for item in refers]

            right = 0
            thre_right = 0
            sum = 0
            scores = cosine_similarity(predictions_vec, refers_vec)
            max_id = np.argmax(scores, axis=-1)
            #max_id = self.knn(scores, predictions_label, refers_label)
            for idx, item in enumerate(max_id):
                if refers_label[item] == predictions_label[idx]:
                    if scores[idx][item] > self.score_thre:
                        thre_right += 1
                    right += 1
                sum += 1
            print("Acc:{}".format(float(right)/sum))
            print("ThreAcc:{}".format(float(thre_right)/sum))
        else:
            #TODO: evaluation for pair mode
            pdb.set_trace()

    def knn(self, scores, predictions_label, refers_label, k = 4):
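        # majority vote over the k highest-scoring reference labels for each prediction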
        sorted_id = np.argsort(-scores, axis = -1)
        shape = np.shape(sorted_id)
        max_id = []
        for idx in range(shape[0]):
            mp = defaultdict(int)
            for idy in range(k):
                mp[refers_label[int(sorted_id[idx][idy])]] += 1
            max_id.append(max(mp,key=mp.get))
        return max_id

    def test_unit(self, text):
        #######################init#########################
        if self.model_loaded == False:
            #add samples that are excluded from training
            if os.path.exists(self.no_train_path):
                csv = pd.read_csv(self.no_train_path, header = 0, sep=",", error_bad_lines=False)
                self.text_list += list(csv['text'])
                self.label_list += list(csv['target'])
            subdirs = [x for x in Path(self.export_dir_path).iterdir()
                    if x.is_dir() and 'temp' not in str(x)]
            latest = str(sorted(subdirs)[-1])
            self.predict_fn = predictor.from_saved_model(latest)
            self.init_embedding()
            self.model_loaded = True
            self.vec_list = self._get_vecs(self.predict_fn, self.text_list)
            #self.set_zdy_labels(['睡觉','我回家了','晚安','娃娃了','周杰伦','自然语言处理'],
            #                    ['打开情景模式','打开情景模式','打开情景模式',
            #                     '打开情景模式','打开情景模式','打开情景模式'])
        text_list = self.text_list
        vec_list = self.vec_list
        label_list = self.label_list

        #add user-defined queries here (custom entries take priority)
        if self.zdy != {}:
            text_list = self.zdy['text_list'] + text_list
            vec_list = np.concatenate([self.zdy['vec_list'], self.vec_list], axis = 0)
            label_list = self.zdy['label_list'] + label_list
        vec = self._get_vecs(self.predict_fn, [text], need_preprocess = True)
        scores = cosine_similarity(vec, vec_list)[0]
        max_id = np.argmax(scores)
        max_score = scores[max_id]
        max_similar = text_list[max_id]
        logging.info("test result: {}, {}, {}".format(label_list[max_id], max_score, max_similar))
        return label_list[max_id], max_score, max_id

    def set_zdy_labels(self, text_list, label_list):
        if len(text_list) == 0 or len(label_list) == 0: 
            self.zdy = {}
            return
        self.zdy['text_list'] = text_list
        self.zdy['vec_list'] = self._get_vecs(self.predict_fn, 
                                              text_list,
                                              need_preprocess = True)
        self.zdy['label_list'] = label_list

    def _get_vecs(self, predict_fn, text_list, need_preprocess = False):
        #generate vectors from batched data
        text_list_pred, x_query, x_query_length = self.embedding.text2id(text_list,
                                                     self.vocab_dict,
                                                     need_preprocess)
        label = [0 for _ in range(len(text_list))]

        predictions = predict_fn({'x_query': x_query, 
                                  'x_query_length': x_query_length, 
                                  'label': label})
        return predictions['pred']
Example #13
import sys
sys.path.append('..')

import pickle
from utils.preprocess import Preprocess

f = open('../train_tools/dict/chatbot_dict.bin', 'rb')
word_index = pickle.load(f)
f.close()

sentence = "오늘 오후 5시 30분에 닭고기를 먹고 싶어 ㅎㅎㅎ"

# create the preprocessing object
p = Preprocess(userdic='../utils/user_dic.tsv')

# run the morphological analyzer
pos = p.pos(sentence)

# extract keywords from the POS result and print their indices
keywords = p.get_keywords(pos, without_tag=True)
for word in keywords:
    try:
        print(word, word_index[word])
    except KeyError:
        # if the word is not in the dictionary, fall back to the OOV index
        print(word, word_index['OOV'])
Example #14
import pickle

# Tokenizer comes from tensorflow.keras; the Preprocess import path is assumed
# to match the other examples in this listing
from tensorflow.keras import preprocessing
from utils.preprocess import Preprocess


# read the corpus data
def read_corpus_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]  # drop the header row
    return data


# load the corpus data
corpus_data = read_corpus_data('./corpus.txt')

# extract keywords from the corpus data and build the dictionary word list
p = Preprocess()
dict = []
for c in corpus_data:
    pos = p.pos(c[1])
    for k in pos:
        dict.append(k[0])
# build the word2index mapping used as the dictionary
# the first index of the dictionary is reserved for OOV (Out of Vocabulary)
tokenizer = preprocessing.text.Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts(dict)
word_index = tokenizer.word_index

# write the dictionary file
f = open("chatbot_dict.bin", 'wb')
try:
    pickle.dump(word_index, f)
finally:
    f.close()
Example #15
class Match(TaskBase):
    def __init__(self, conf):
        super(Match, self).__init__(conf)
        self.task_type = 'match'
        self.conf = conf
        self.read_data()
        self.num_class = len(set(self.label_list))
        logging.info(">>>>>>>>>>>> class num:%s <<<<<<<<<<<<<<<" %
                     self.num_class)
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def read_data(self):
        self.pre = Preprocess()
        csv = pd.read_csv(self.ori_path,
                          header=0,
                          sep="\t",
                          error_bad_lines=False)
        if 'text' in csv.keys() and 'target' in csv.keys():
            #format: text \t target
            #for this format, the size for each class should be larger than 2
            self.text_list = list(csv['text'])
            self.label_list = list(csv['target'])
            self.data_type = 'column_2'
        elif 'text_a' in csv.keys() and 'text_b' in csv.keys(
        ) and 'target' in csv.keys():
            #format: text_a \t text_b \t target
            #for this format, the target value can only be chosen from 0 or 1
            self.text_a_list = list(csv['text_a'])
            self.text_b_list = list(csv['text_b'])
            self.text_list = self.text_a_list + self.text_b_list
            self.label_list = list(csv['target'])
            self.data_type = 'column_3'
        else:
            raise ValueError('error format for train file')
        self.text_list = [self.pre.get_dl_input_by_text(text) for text in \
                          self.text_list]

    def create_model_fn(self):
        def cal_loss(pred, labels, batch_size, conf):
            if self.tfrecords_mode == 'class':
                pos_scores, neg_scores = batch_hard_triplet_scores(
                    labels, pred,
                    is_distance=self.is_distance)  # pos/neg scores
                pos_scores = tf.squeeze(pos_scores, -1)
                neg_scores = tf.squeeze(neg_scores, -1)
                #for represent,
                #     pred is a batch of tensors which size >1
                #     we can use triplet loss(hinge loss) or contrastive loss
                #if use hinge loss, we don't need labels
                #if use other loss(contrastive loss), we need define pos/neg target before
                if self.loss_type in ['hinge_loss', 'improved_triplet_loss']:
                    #pairwise
                    loss = get_loss(type=self.loss_type,
                                    pos_logits=pos_scores,
                                    neg_logits=neg_scores,
                                    **conf)
                else:
                    #pointwise
                    pos_target = tf.ones(shape=[int(self.batch_size)],
                                         dtype=tf.float32)
                    neg_target = tf.zeros(shape=[int(self.batch_size)],
                                          dtype=tf.float32)

                    pos_loss = get_loss(type=self.loss_type,
                                        logits=pos_scores,
                                        labels=pos_target,
                                        **conf)
                    neg_loss = get_loss(type=self.loss_type,
                                        logits=neg_scores,
                                        labels=neg_target,
                                        **conf)
                    loss = pos_loss + neg_loss

            elif self.tfrecords_mode in ['pair', 'point']:
                if self.loss_type in ['hinge_loss', 'improved_triplet_loss']:
                    assert self.tfrecords_mode == 'pair', "only pair mode can provide <query, pos, neg> format data"
                    #pairwise
                    if self.num_output == 1:
                        pred = tf.nn.sigmoid(pred)
                    elif self.num_output == 2:
                        pred = tf.nn.softmax(pred)[:, 0]
                        pred = tf.expand_dims(pred, -1)
                    else:
                        raise ValueError(
                            'unsupported num_output, 1(sigmoid) or 2(softmax)?'
                        )
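                    # the batch is assumed to alternate positive and negative pairs, so even rows
                    # give positive scores and odd rows negative scores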
                    pos_scores = tf.strided_slice(pred, [0], [batch_size], [2])
                    neg_scores = tf.strided_slice(pred, [1], [batch_size], [2])
                    loss = get_loss(type=self.loss_type,
                                    pos_logits=pos_scores,
                                    neg_logits=neg_scores,
                                    **conf)
                elif self.loss_type in ['sigmoid_loss']:
                    #pointwise
                    labels = tf.expand_dims(labels, axis=-1)
                    loss = get_loss(type=self.loss_type,
                                    logits=pred,
                                    labels=labels,
                                    **conf)
                else:
                    raise ValueError('unsupported loss for pair/point match')
            else:
                raise ValueError('unknown tfrecords_mode?')
            return loss

        def model_fn(features, labels, mode, params):
            #model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()

            ############# encode #################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                if self.tfrecords_mode == 'class':
                    self.embed_query = self.embedding(features=features,
                                                      name='x_query')
                    output = self.encoder(self.embed_query,
                                          name='x_query',
                                          features=features)
                    output = tf.nn.l2_normalize(output, -1)

                elif self.tfrecords_mode in ['pair', 'point']:
                    if self.sim_mode == 'cross':
                        self.embed_query = self.embedding(features=features,
                                                          name='x_query')
                        self.embed_sample = self.embedding(features=features,
                                                           name='x_sample')
                        output = self.encoder(x_query=self.embed_query,
                                              x_sample=self.embed_sample,
                                              features=features)
                    elif self.sim_mode == 'represent':
                        self.embed_query = self.embedding(features=features,
                                                          name='x_query')
                        self.embed_sample = self.embedding(features=features,
                                                           name='x_sample')
                        query_encode = self.encoder(self.embed_query,
                                                    name='x_query',
                                                    features=features)
                        sample_encode = self.encoder(self.embed_sample,
                                                     name='x_sample',
                                                     features=features)
                        output = self.concat(query_encode, sample_encode)
                        output = tf.layers.dense(output,
                                                 1,
                                                 kernel_regularizer=tf.contrib.
                                                 layers.l2_regularizer(0.001),
                                                 name='fc')
                    else:
                        raise ValueError(
                            'unknown sim_mode, represent or cross')
            else:
                output = self.encoder(features=features)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                #pdb.set_trace()
                predictions = {
                    'encode':
                    output,
                    'pred':
                    tf.cast(tf.greater(tf.nn.softmax(output)[:, 0], 0.5),
                            tf.int32) if self.num_output == 2 else
                    tf.cast(tf.greater(tf.nn.sigmoid(output), 0.5), tf.int32),
                    'score':
                    tf.nn.softmax(output)[:, 0]
                    if self.num_output == 2 else tf.nn.sigmoid(output),
                    'label':
                    features['label']
                }
                return tf.estimator.EstimatorSpec(mode,
                                                  predictions=predictions)

            ############### loss ##################
            loss = cal_loss(output, labels, self.batch_size, self.conf)

            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                return self.train_estimator_spec(mode, loss, global_step,
                                                 params)
            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                #{"accuracy": tf.metrics.accuracy(
                #    labels=labels, predictions=predictions["classes"])}
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            if self.tfrecords_mode == 'class':
                #size = self.num_class
                num_classes_per_batch = 32
                assert num_classes_per_batch < self.num_class
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            elif self.tfrecords_mode == 'pair':
                #data order: query,pos,query,neg
                num_sentences_per_class = 4
                num_classes_per_batch = self.batch_size // num_sentences_per_class
            elif self.tfrecords_mode == 'point':
                #data order: query, sample(pos or neg)
                num_classes_per_batch = 2
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            else:
                raise ValueError('unknown tfrecords_mode')

            #filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path,i) \
            #                 for i in range(size)]
            filenames = [
                os.path.join(self.tfrecords_path, item)
                for item in os.listdir(self.tfrecords_path)
                if item.startswith('train')
            ]
            if len(filenames) == 0:
                logging.warning(
                    "Can't find any tfrecords file for train, preparing now!")
                self.prepare()
                filenames = [
                    os.path.join(self.tfrecords_path, item)
                    for item in os.listdir(self.tfrecords_path)
                    if item.startswith('train')
                ]
            size = len(filenames)
            logging.info("tfrecords train class num: {}".format(size))
            datasets = [
                tf.data.TFRecordDataset(filename) for filename in filenames
            ]
            datasets = [dataset.repeat() for dataset in datasets]

            #datasets = [dataset.shuffle(buffer_size=1000) for dataset in datasets]
            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(
                generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(
                datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            ##test
            #pdb.set_trace()
            #sess = tf.Session()
            #features1,label1 = sess.run([features,label])
            #features1['x_query_pred'] = [item.decode('utf-8') for item in features1['x_query_pred'][1]]
            #features1['x_sample_pred'] = [item.decode('utf-8') for item in features1['x_sample_pred'][1]]
            return features, label

        def test_input_fn(mode):
            #filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \
            #                 for i in range(self.num_class * self.dev_size)]
            filenames = [
                os.path.join(self.tfrecords_path, item)
                for item in os.listdir(self.tfrecords_path)
                if item.startswith(mode)
            ]
            assert self.num_class == len(
                filenames), "wrong number of tfrecords files!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        elif mode == 'label':
            return lambda: test_input_fn("train")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {
            'is_training': True,
            'keep_prob': 0.7,
        }
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}

        def get_features():
            features = {
                'x_query':
                tf.placeholder(dtype=tf.int64,
                               shape=[None, self.maxlen],
                               name='x_query'),
                'x_query_length':
                tf.placeholder(dtype=tf.int64,
                               shape=[None],
                               name='x_query_length'),
                'label':
                tf.placeholder(dtype=tf.int64, shape=[None], name='label')
            }
            if self.tfrecords_mode in ['pair', 'point']:
                features.update({
                    'x_sample':
                    tf.placeholder(dtype=tf.int64,
                                   shape=[None, self.maxlen],
                                   name='x_sample'),
                    'x_sample_length':
                    tf.placeholder(dtype=tf.int64,
                                   shape=[None],
                                   name='x_sample_length')
                })
            features.update(self.encoder.get_features())
            return features

        self.save_model(self.create_model_fn(), params, get_features)

    def test(self, mode='test'):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        predictions = estimator.predict(input_fn=self.create_input_fn(mode))
        predictions = list(predictions)

        if self.tfrecords_mode == 'class':
            predictions_vec = [item['encode'] for item in predictions]
            predictions_label = [item['label'] for item in predictions]
            refers = estimator.predict(input_fn=self.create_input_fn("label"))
            refers = list(refers)

            refers_vec = [item['encode'] for item in refers]
            refers_label = [item['label'] for item in refers]

            right = 0
            thre_right = 0
            sum = 0

            if self.is_distance:
                scores = euclidean_distances(predictions_vec, refers_vec)
                selected_ids = np.argmin(scores, axis=-1)
            else:
                scores = cosine_similarity(predictions_vec, refers_vec)
                selected_ids = np.argmax(scores, axis=-1)
            for idx, item in enumerate(selected_ids):
                if refers_label[item] == predictions_label[idx]:
                    if self.is_distance:
                        if 1 - scores[idx][item] > self.score_thre:
                            thre_right += 1
                    else:
                        if scores[idx][item] > self.score_thre:
                            thre_right += 1
                    right += 1
                sum += 1
            print("Acc:{}".format(float(right) / sum))
            print("ThreAcc:{}".format(float(thre_right) / sum))
        elif self.tfrecords_mode == 'pair':
            # evaluation for the pair mode
            scores = [item['score'] for item in predictions]
            labels = [item['label'] for item in predictions]
            #pdb.set_trace()

            #predictions
            scores = np.reshape(scores, [self.num_class * self.dev_size, -1])
            pred_max_ids = np.argmax(scores, axis=-1)
            #label
            labels = np.reshape(labels, [self.num_class, -1])

            right = 0
            for idx, max_id in enumerate(pred_max_ids):
                if labels[idx][max_id] == 1:
                    right += 1
            sum = len(pred_max_ids)
            print("Acc:{}".format(float(right) / sum))

        elif self.tfrecords_mode == 'point':
            scores = [item['score'] for item in predictions]
            scores = np.reshape(scores, -1)
            scores = [0 if item < self.score_thre else 1 for item in scores]
            #pred = [item['pred'] for item in predictions]
            labels = [item['label'] for item in predictions]
            res = metrics(labels=labels, logits=np.array(scores))
            print("precision:{} recall:{} f1:{}".format(
                res[3], res[4], res[5]))

    def concat(self, a, b):
        tmp = tf.concat([a, b], axis=-1)
        #return tmp
        res1 = a * b
        res2 = a + b
        res3 = a - b
        return tf.concat([tmp, res1, res2, res3], axis=-1)

    def knn(self, scores, predictions_label, refers_label, k=4):
        sorted_id = np.argsort(-scores, axis=-1)
        shape = np.shape(sorted_id)
        max_id = []
        for idx in range(shape[0]):
            mp = defaultdict(int)
            for idy in range(k):
                mp[refers_label[int(sorted_id[idx][idy])]] += 1
            max_id.append(max(mp, key=mp.get))
        return max_id
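
The knn() helper above is not called in the test() path shown here, which only keeps the single nearest reference. Below is a minimal, hypothetical sketch (not from the repo) of the same majority-vote logic written as a free function and run on synthetic similarity scores and labels.

import numpy as np
from collections import defaultdict

def knn_vote(scores, refers_label, k=4):
    # same majority vote as the knn() method above, as a standalone function
    sorted_id = np.argsort(-scores, axis=-1)
    max_id = []
    for row in sorted_id:
        votes = defaultdict(int)
        for idy in range(k):
            votes[refers_label[int(row[idy])]] += 1
        max_id.append(max(votes, key=votes.get))
    return max_id

# synthetic data: 3 query vectors scored against 10 reference vectors
scores = np.random.rand(3, 10)
refers_label = list(np.random.randint(0, 4, size=10))
predictions_label = [0, 1, 2]
selected = knn_vote(scores, refers_label, k=4)
acc = np.mean([selected[i] == predictions_label[i] for i in range(3)])
print("knn Acc:{}".format(acc))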
Пример #16
0
import sys

sys.path.append('..')
from utils.preprocess import Preprocess

sentence = "내일 저녁 8시에 닭튀김을 주문하고 싶어"

# Create the preprocessing object
p = Preprocess(userdic='../utils/user_dic.tsv')  # tsv: tab-separated values; a user dictionary of food names and time expressions

# Run the morphological analyzer
pos = p.pos(sentence)

# Print keywords together with their part-of-speech tags
ret = p.get_keywords(pos, without_tag=False)
print(ret)
Пример #17
0
def read_file(file_name):
    sents = []
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for idx, l in enumerate(lines):
            if l[0] == ';' and lines[idx + 1][0] == '$':
                this_sent = []
            elif l[0] == '$' and lines[idx - 1][0] == ';':
                continue
            elif l[0] == '\n':
                sents.append(this_sent)
            else:
                this_sent.append(tuple(l.split()))
    return sents


p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin',
               userdic='../../utils/user_dic.tsv')

corpus = read_file('ner_train.txt')

sentences, tags = [], []
for t in corpus:
    tagged_sentence = []
    sentence, bio_tag = [], []
    for w in t:
        tagged_sentence.append((w[1], w[3]))
        sentence.append(w[1])
        bio_tag.append(w[3])

    sentences.append(sentence)
    tags.append(bio_tag)
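
read_file() above implies a column-style corpus: a line starting with ';' carries the raw sentence, the following '$' line is skipped, a blank line closes the sentence, and every other line is a whitespace-separated token row whose second field is the word and fourth field the BIO tag. A small invented sample in that assumed layout, parsed with the function above:

# invented sample; only the column positions follow the parser above
sample = ("; 내일 저녁 주문\n"
          "$내일 저녁 주문\n"
          "1\t내일\tNNG\tB_DT\n"
          "2\t저녁\tNNG\tB_DT\n"
          "3\t주문\tNNG\tO\n"
          "\n")
with open('ner_sample.txt', 'w', encoding='utf-8') as f:
    f.write(sample)

for sent in read_file('ner_sample.txt'):
    print([(w[1], w[3]) for w in sent])  # [(word, BIO tag), ...]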
Пример #18
0
class XGB(TaskBase):
    def __init__(self, conf):
        super(XGB, self).__init__(conf)
        self.preprocess = Preprocess()
        self.vectorizer = TfidfVectorizer()
        self.thre = 0.5
        self.read_data()

    def output_label(self):
        with open(self.dict_path, 'w') as f:
            for item in self.labels:
                f.write('{}\t{}\n'.format(item, self.labels[item]))

    def read_data(self):
        #load train_data
        csv = pd.read_csv(self.ori_path,
                          header=0,
                          sep="\t",
                          error_bad_lines=False)
        train_list = self.preprocess.process(csv['text'])
        self.labels = {
            item: idx
            for idx, item in enumerate(set(csv['target']))
        }
        self.labels_rev = {self.labels[item]: item for item in self.labels}
        self.labels_rev[-1] = '未知'
        self.output_label()
        print("class_num:", len(self.labels))
        #train data weight
        X = self.vectorizer.fit_transform(
            [' '.join(item) for item in train_list])
        y = [self.labels[item] for item in csv['target']]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=0)

        self.data = {}
        self.data['x_train'] = X_train
        self.data['y_train'] = y_train
        self.data['x_test'] = X_test
        self.data['y_test'] = y_test

    def train(self):
        ### fit model for train data
        self.model = XGBClassifier(
            learning_rate=0.3,
            n_estimators=100,  # number of boosting trees
            max_depth=6,  # maximum tree depth
            min_child_weight=1,  # minimum leaf (child) weight
            gamma=0.,  # coefficient on the number of leaves in the penalty term
            subsample=0.8,  # randomly sample 80% of rows for each tree
            colsample_bytree=0.8,  # randomly sample 80% of features for each tree
            objective='multi:softmax',  # objective (loss) function
            scale_pos_weight=1,  # compensate for class imbalance
            random_state=27  # random seed
        )
        print("Start training!")
        self.model.fit(self.data['x_train'],
                       self.data['y_train'],
                       eval_set=[(self.data['x_test'], self.data['y_test'])],
                       eval_metric="mlogloss",
                       early_stopping_rounds=10,
                       verbose=True)

        ### model evaluate

        predictions = self.model.predict_proba(self.data['x_test'])
        y_pred = np.argmax(predictions, 1)
        scores = [predictions[idx][y_pred[idx]] for idx in range(len(y_pred))]
        #for idx in range(len(y_pred)):
        #    if scores[idx] < self.thre:
        #        y_pred[idx] = -1
        accuracy = accuracy_score(self.data['y_test'], y_pred)
        print("accuracy: %.2f%%" % (accuracy * 100.0))

        #dt = pd.DataFrame({'text':self.data['raw_test_list'],
        #                   'feature':self.data['test_list'],
        #                   'target':[self.labels_rev[item] for item in
        #                             self.data['y_test']] ,
        #                   'pred': [self.labels_rev[item] for item in
        #                            y_pred],
        #                   'score': scores })
        #dt.to_csv(self.result_path,index=False,sep=',')

    def test(self, file):
        assert os.path.exists(file), "file [%s] does not exist!" % file
        lines = [line.strip() for line in open(file).readlines()]
        test_list = self.preprocess.process(lines)
        test_weight = self.vectorizer.transform(
            [' '.join(item) for item in test_list])
        predictions = self.model.predict_proba(test_weight)
        pred = np.argmax(predictions, 1)
        #pdb.set_trace()
        with open(file + '.res', 'w') as f_w:
            for idx, line in enumerate(lines):
                f_w.write("{}\t{}\t{}\n".format(line,
                                                self.labels_rev[pred[idx]],
                                                predictions[idx][pred[idx]]))
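
For reference, a standalone, hedged sketch of the same TF-IDF-into-XGBoost pipeline that read_data()/train() implement above; the texts, labels and hyperparameters here are invented and far smaller than anything realistic.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# tiny invented corpus: three intents, three texts each
texts = ["play some music", "play a song", "turn up the volume",
         "what is the weather", "weather for tomorrow", "will it rain today",
         "set an alarm", "wake me at seven", "remind me at noon"]
labels = [0, 0, 0, 1, 1, 1, 2, 2, 2]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=3, random_state=0, stratify=labels)

model = XGBClassifier(n_estimators=20, max_depth=3, learning_rate=0.3, random_state=27)
model.fit(X_train, y_train)
predictions = model.predict_proba(X_test)
print("pred:", np.argmax(predictions, 1), "true:", y_test)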
Пример #19
0
class ClassifyM():
    def __init__(self):
        self.preprocess = Preprocess()
        self.thre = 0.5

    def load(self, train_path, test_path):
        #load train_data
        csv = pd.read_csv(train_path)
        train_list = self.preprocess.process(csv['text'])
        self.labels = {
            item: idx
            for idx, item in enumerate(set(csv['intent']))
        }
        self.output_label()
        self.labels_rev = {self.labels[item]: item for item in self.labels}
        self.labels_rev[-1] = '未知'

        print("class_num:", len(self.labels))
        self.labels_num = len(self.labels)
        y_train = [self.labels[item] for item in csv['intent']]
        #train data weight
        self.vectorizer = TfidfVectorizer()
        train_weight = self.vectorizer.fit_transform(
            [' '.join(item) for item in train_list])
        #load test_data
        self.result_path = test_path + ".result.csv"
        csv = pd.read_csv(test_path)
        test_list = self.preprocess.process(csv['text'])
        y_test = [self.labels[item] for item in csv['intent']]  #int label
        #test data weight
        test_weight = self.vectorizer.transform(
            [' '.join(item) for item in test_list])

        self.data = {}
        self.data['x_train'] = train_weight
        self.data['y_train'] = y_train
        self.data['x_test'] = test_weight
        self.data['y_test'] = y_test
        self.data['raw_test_list'] = csv['text']
        self.data['test_list'] = test_list

    def output_label(self):
        with open('data/label.txt', 'w') as f:
            for item in self.labels:
                f.write('{}\t{}\n'.format(item, self.labels[item]))

    def train(self):
        ### fit model for train data
        self.model = XGBClassifier(
            learning_rate=0.1,
            n_estimators=1000,  # number of boosting trees (build xgboost with 1000 trees)
            max_depth=6,  # maximum tree depth
            min_child_weight=1,  # minimum leaf (child) weight
            gamma=0.,  # coefficient on the number of leaves in the penalty term
            subsample=0.8,  # randomly sample 80% of rows for each tree
            colsample_bytree=0.8,  # randomly sample 80% of features for each tree
            objective='multi:softmax',  # objective (loss) function
            scale_pos_weight=1,  # compensate for class imbalance
            random_state=27  # random seed
        )
        print("Start training!")
        self.model.fit(self.data['x_train'],
                       self.data['y_train'],
                       eval_set=[(self.data['x_test'], self.data['y_test'])],
                       eval_metric="mlogloss",
                       early_stopping_rounds=10,
                       verbose=True)

        ### model evaluate

        predictions = self.model.predict_proba(self.data['x_test'])
        y_pred = np.argmax(predictions, 1)
        scores = [predictions[idx][y_pred[idx]] for idx in range(len(y_pred))]
        for idx in range(len(y_pred)):
            if scores[idx] < self.thre:
                y_pred[idx] = -1
        accuracy = accuracy_score(self.data['y_test'], y_pred)
        print("accuracy: %.2f%%" % (accuracy * 100.0))

        dt = pd.DataFrame({
            'text':
            self.data['raw_test_list'],
            'feature':
            self.data['test_list'],
            'target': [self.labels_rev[item] for item in self.data['y_test']],
            'pred': [self.labels_rev[item] for item in y_pred],
            'score':
            scores
        })
        dt.to_csv(self.result_path, index=False, sep=',')

    def test_text(self, text):
        test_list = self.preprocess.process([text])
        test_weight = self.vectorizer.transform(
            [' '.join(item) for item in test_list])
        predictions = self.model.predict_proba(test_weight)
        pred = np.argmax(predictions, 1)
        print(self.labels_rev[pred[0]], predictions[0][pred[0]])

    def test(self, file):
        lines = [line.strip() for line in open(file).readlines()]
        test_list = self.preprocess.process(lines)
        test_weight = self.vectorizer.transform(
            [' '.join(item) for item in test_list])
        predictions = self.model.predict_proba(test_weight)
        pred = np.argmax(predictions, 1)
        #pdb.set_trace()
        with open(file + '.res', 'w') as f_w:
            for idx, line in enumerate(lines):
                f_w.write("{}\t{}\t{}\n".format(line,
                                                self.labels_rev[pred[idx]],
                                                predictions[idx][pred[idx]]))

    def process(self, train_path, test_path):
        self.load(train_path, test_path)
        self.train()
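
A minimal usage sketch for the class above; the paths below are hypothetical, and the CSVs are expected to carry 'text' and 'intent' columns, as read by load().

m = ClassifyM()
m.process('data/intent_train.csv', 'data/intent_test.csv')  # trains and writes data/intent_test.csv.result.csv
m.test('data/raw_queries.txt')  # one query per line; writes data/raw_queries.txt.res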
Пример #20
0
    def __init__(self,
                 config,
                 root_dir='/data/music/chord_recognition',
                 dataset_names=('ce200', ),
                 featuretype=FeatureTypes.cqt,
                 num_workers=20,
                 train=False,
                 preprocessing=False,
                 resize=None,
                 kfold=4):
        super(AudioDataset, self).__init__()

        self.config = config
        self.root_dir = root_dir
        self.dataset_names = dataset_names
        self.preprocessor = Preprocess(config, featuretype, dataset_names,
                                       self.root_dir)
        self.resize = resize
        self.train = train
        self.ratio = config.experiment['data_ratio']

        # preprocessing hyperparameters
        # song_hz, n_bins, bins_per_octave, hop_length
        mp3_config = config.mp3
        feature_config = config.feature
        self.mp3_string = "%d_%.1f_%.1f" % \
                          (mp3_config['song_hz'], mp3_config['inst_len'],
                           mp3_config['skip_interval'])
        self.feature_string = "%s_%d_%d_%d" % \
                              (featuretype.value, feature_config['n_bins'], feature_config['bins_per_octave'], feature_config['hop_length'])

        if feature_config['large_voca'] == True:
            # store paths if exists
            is_preprocessed = True if os.path.exists(
                os.path.join(root_dir, 'result', dataset_names[0] + '_voca',
                             self.mp3_string, self.feature_string)) else False
            if (not is_preprocessed) | preprocessing:
                midi_paths = self.preprocessor.get_all_files()

                print(' --------- preprocessing needed -----------')

                if num_workers > 1:
                    num_path_per_process = math.ceil(
                        len(midi_paths) / num_workers)
                    args = [
                        midi_paths[i * num_path_per_process:(i + 1) *
                                   num_path_per_process]
                        for i in range(num_workers)
                    ]

                    # start process
                    p = Pool(processes=num_workers)
                    p.map(self.preprocessor.generate_labels_features_voca,
                          args)

                    p.close()
                else:
                    self.preprocessor.generate_labels_features_voca(midi_paths)

            # kfold is 5 fold index ( 0, 1, 2, 3, 4 )
            self.song_names, self.paths = self.get_paths_voca(kfold=kfold)
        else:
            # store paths if exists
            is_preprocessed = True if os.path.exists(
                os.path.join(root_dir, 'result', dataset_names[0],
                             self.mp3_string, self.feature_string)) else False
            if (not is_preprocessed) | preprocessing:
                midi_paths = self.preprocessor.get_all_files()

                if num_workers > 1:
                    num_path_per_process = math.ceil(
                        len(midi_paths) / num_workers)
                    args = [
                        midi_paths[i * num_path_per_process:(i + 1) *
                                   num_path_per_process]
                        for i in range(num_workers)
                    ]

                    # start process
                    p = Pool(processes=num_workers)
                    p.map(self.preprocessor.generate_labels_features_new, args)

                    p.close()
                else:
                    self.preprocessor.generate_labels_features_new(midi_paths)

            # kfold is 5 fold index ( 0, 1, 2, 3, 4 )
            self.song_names, self.paths = self.get_paths(kfold=kfold)
Пример #21
0
    def __init__(self):
        self.preprocess = Preprocess()
        self.thre = 0.5
Пример #22
0
sys.path.append(".")

import pandas as pd
import tensorflow as tf
from keras import preprocessing
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate

train_file_path = "./models/intent/total_train_data.csv"
data = pd.read_csv(train_file_path)
queries = data["query"].tolist()
intents = data["intent"].tolist()

from utils.preprocess import Preprocess

p = Preprocess(word2idx_dic="./train/chatbot_bin.bin")

sequences = []
for sent in queries:
    pos = p.pos(sent)
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

from config.GlobalParams import MAX_SEQ_LEN
from sklearn.model_selection import train_test_split

padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding="post")

data_size = len(padded_seqs)
ds = tf.data.Dataset.from_tensor_slices((padded_seqs, intents)).shuffle(data_size)
Пример #23
0
# Engine: design and train the model before building the intent classification module
# Read the data file, then build and train the intent classification model

import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate

train_file = 'total_train_data.csv'
data = pd.read_csv(train_file, delimiter=',')
queries = data['query'].tolist()
intents = data['intent'].tolist()

from utils.preprocess import Preprocess
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin')

sequences = []
for sentence in queries:
    pos = p.pos(sentence)
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

from config.globalparams import MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')

ds = tf.data.Dataset.from_tensor_slices((padded_seqs, intents))
ds = ds.shuffle(len(queries))

train_size = int(len(padded_seqs) * 0.7)
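
The snippet above stops right after computing train_size; a plausible continuation (a hedged sketch, not taken from the original file) is the usual take/skip split of the shuffled dataset into train, validation and test portions.

# hedged continuation sketch; the 7:2:1 split and batch size of 20 are assumptions
val_size = int(len(padded_seqs) * 0.2)
test_size = int(len(padded_seqs) * 0.1)

train_ds = ds.take(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(20)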
Пример #24
0
class NER(TaskBase):
    def __init__(self, conf):
        super(NER, self).__init__(conf)
        self.task_type = 'ner'
        self.conf = conf
        self.read_data()
        #if self.maxlen == -1:
        #    self.maxlen = max([len(text.split()) for text in self.text_list])
        #model params
        params = conf
        params.update({
            "maxlen":self.maxlen,
            "embedding_size":self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_class,
            "keep_prob": 1,
            "is_training": False,
        })

        self.encoder = encoder[self.encoder_type](**params)

    def read_data(self):
        self.pre = Preprocess()
        self.util = NERUtil()
        self.text_list, self.label_list = self.util.load_ner_data(self.ori_path)
        self.text_list = [self.pre.get_dl_input_by_text(text) for text in self.text_list]
        self.num_class = self.num_output = len(set(list(chain.from_iterable(self.label_list))))
        self.data_type = 'column_2'

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            seq_len = features['x_query_length']
            global_step = tf.train.get_or_create_global_step()

            ################ encode ##################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                embed = self.embedding(features = features, name = 'x_query')
                out = self.encoder(embed, 'x_query', features = features, middle_flag = True)
            else:
                out = self.encoder(features = features)
            logits = tf.reshape(out, [-1, int(out.shape[1]), self.num_class])

            transition_params = tf.get_variable('crf', 
                                         [self.num_class,self.num_class], 
                                         dtype=tf.float32)
            pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params, seq_len)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'logit': logits,
                    'pred_ids': pred_ids,
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            else:
                ############### loss ####################
                log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(logits, labels,
                                                                      seq_len,
                                                                      transition_params)
                loss = -tf.reduce_mean(log_likelihood)
                if mode == tf.estimator.ModeKeys.TRAIN:
                    return self.train_estimator_spec(mode, loss, global_step, params)
                if mode == tf.estimator.ModeKeys.EVAL:
                    #pdb.set_trace()
                    weights = tf.sequence_mask(seq_len, self.maxlen)
                    metrics = {'acc': tf.metrics.accuracy(labels, pred_ids, weights)}
                    #metrics = {'acc': tf.metrics.accuracy(labels, pred_ids)}
                    return tf.estimator.EstimatorSpec(mode, 
                                                      loss=loss, 
                                                      eval_metric_ops=metrics)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()
        def train_input_fn():
            filenames = [os.path.join(self.tfrecords_path,item) for item in 
                         os.listdir(self.tfrecords_path) if item.startswith('train')]
            if len(filenames) == 0:
                logging.warning("Can't find any tfrecords file for train, preparing now!")
                self.prepare()
                filenames = [os.path.join(self.tfrecords_path,item) for item in 
                             os.listdir(self.tfrecords_path) if item.startswith('train')]
            dataset = tf.data.TFRecordDataset(filenames)
            dataset = dataset.repeat()
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4*self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        def test_input_fn(mode):
            filenames = [os.path.join(self.tfrecords_path,item) for item in 
                         os.listdir(self.tfrecords_path) if item.startswith(mode)]
            assert len(filenames) > 0, "Can't find any tfrecords file for %s!"%mode
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda : test_input_fn("test")
        elif mode == 'dev':
            return lambda : test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def save(self):
        params = {
            'is_training': False,
            'keep_prob': 1
        }
        def get_features():
            features = {'x_query': tf.placeholder(dtype=tf.int64, 
                                                  shape=[None, self.maxlen],
                                                  name='x_query'),
                        'x_query_length': tf.placeholder(dtype=tf.int64,
                                                         shape=[None],
                                                         name='x_query_length'),
                        }
                        #'label': tf.placeholder(dtype=tf.int64, 
                        #                        shape=[None],
                        #                        name='label')}
            features.update(self.encoder.get_features())
            return features
        self.save_model(self.create_model_fn(), params, get_features)

    def train(self):
        params = {
            'is_training': True,
            'keep_prob': 0.7
        }
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn = self.create_input_fn("train"), max_steps =
                        self.max_steps)
        self.save()

    def test(self, mode = 'test'):
        params = {
            'is_training': False,
            'keep_prob': 1
        }
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(),
                                           config = config,
                                           params = params)
        if mode == 'dev':
            estimator.evaluate(input_fn=self.create_input_fn('dev'))
        elif mode == 'test':
            estimator.evaluate(input_fn=self.create_input_fn('test'))
        else:
            raise ValueError("unknown mode:[%s]"%mode)
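
A small, standalone sketch (assuming TensorFlow 1.x with tf.contrib, as in the snippets above) of the CRF pieces used inside model_fn: crf_log_likelihood for the loss and crf_decode for predictions, on random logits whose shapes mirror [batch, maxlen, num_class].

import numpy as np
import tensorflow as tf

batch, maxlen, num_class = 2, 5, 3
logits = tf.constant(np.random.randn(batch, maxlen, num_class), tf.float32)
labels = tf.constant([[0, 1, 2, 1, 0], [2, 1, 0, 0, 0]], tf.int32)
seq_len = tf.constant([5, 3], tf.int32)
transition_params = tf.get_variable('crf_demo', [num_class, num_class], dtype=tf.float32)

log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(logits, labels, seq_len, transition_params)
loss = -tf.reduce_mean(log_likelihood)
pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params, seq_len)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([loss, pred_ids]))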
Пример #25
0
class AudioDataset(Dataset):
    def __init__(self,
                 config,
                 root_dir='/data/music/chord_recognition',
                 dataset_names=('ce200', ),
                 featuretype=FeatureTypes.cqt,
                 num_workers=20,
                 train=False,
                 preprocessing=False,
                 resize=None,
                 kfold=4):
        super(AudioDataset, self).__init__()

        self.config = config
        self.root_dir = root_dir
        self.dataset_names = dataset_names
        self.preprocessor = Preprocess(config, featuretype, dataset_names,
                                       self.root_dir)
        self.resize = resize
        self.train = train
        self.ratio = config.experiment['data_ratio']

        # preprocessing hyperparameters
        # song_hz, n_bins, bins_per_octave, hop_length
        mp3_config = config.mp3
        feature_config = config.feature
        self.mp3_string = "%d_%.1f_%.1f" % \
                          (mp3_config['song_hz'], mp3_config['inst_len'],
                           mp3_config['skip_interval'])
        self.feature_string = "%s_%d_%d_%d" % \
                              (featuretype.value, feature_config['n_bins'], feature_config['bins_per_octave'], feature_config['hop_length'])

        if feature_config['large_voca'] == True:
            # store paths if exists
            is_preprocessed = True if os.path.exists(
                os.path.join(root_dir, 'result', dataset_names[0] + '_voca',
                             self.mp3_string, self.feature_string)) else False
            if (not is_preprocessed) | preprocessing:
                midi_paths = self.preprocessor.get_all_files()

                print(' --------- preprocessing needed -----------')

                if num_workers > 1:
                    num_path_per_process = math.ceil(
                        len(midi_paths) / num_workers)
                    args = [
                        midi_paths[i * num_path_per_process:(i + 1) *
                                   num_path_per_process]
                        for i in range(num_workers)
                    ]

                    # start process
                    p = Pool(processes=num_workers)
                    p.map(self.preprocessor.generate_labels_features_voca,
                          args)

                    p.close()
                else:
                    self.preprocessor.generate_labels_features_voca(midi_paths)

            # kfold is 5 fold index ( 0, 1, 2, 3, 4 )
            self.song_names, self.paths = self.get_paths_voca(kfold=kfold)
        else:
            # store paths if exists
            is_preprocessed = True if os.path.exists(
                os.path.join(root_dir, 'result', dataset_names[0],
                             self.mp3_string, self.feature_string)) else False
            if (not is_preprocessed) | preprocessing:
                midi_paths = self.preprocessor.get_all_files()

                if num_workers > 1:
                    num_path_per_process = math.ceil(
                        len(midi_paths) / num_workers)
                    args = [
                        midi_paths[i * num_path_per_process:(i + 1) *
                                   num_path_per_process]
                        for i in range(num_workers)
                    ]

                    # start process
                    p = Pool(processes=num_workers)
                    p.map(self.preprocessor.generate_labels_features_new, args)

                    p.close()
                else:
                    self.preprocessor.generate_labels_features_new(midi_paths)

            # kfold is 5 fold index ( 0, 1, 2, 3, 4 )
            self.song_names, self.paths = self.get_paths(kfold=kfold)

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        instance_path = self.paths[idx]

        res = dict()
        data = torch.load(instance_path)
        res['feature'] = np.log(np.abs(data['feature']) + 1e-6)
        res['chord'] = data['chord']
        return res

    def get_paths(self, kfold=4):
        temp = {}
        used_song_names = list()
        for name in self.dataset_names:
            dataset_path = os.path.join(self.root_dir, "result", name,
                                        self.mp3_string, self.feature_string)
            song_names = os.listdir(dataset_path)
            for song_name in song_names:
                paths = []
                instance_names = os.listdir(
                    os.path.join(dataset_path, song_name))
                if len(instance_names) > 0:
                    used_song_names.append(song_name)
                for instance_name in instance_names:
                    paths.append(
                        os.path.join(dataset_path, song_name, instance_name))
                temp[song_name] = paths
        # throw away unused song names
        song_names = used_song_names
        song_names = SortedList(song_names)

        print('Total used song length : %d' % len(song_names))
        tmp = []
        for i in range(len(song_names)):
            tmp += temp[song_names[i]]
        print('Total instances (train and valid) : %d' % len(tmp))

        # divide train/valid dataset using k fold
        result = []
        total_fold = 5
        quotient = len(song_names) // total_fold
        remainder = len(song_names) % total_fold
        fold_num = [0]
        for i in range(total_fold):
            fold_num.append(quotient)
        for i in range(remainder):
            fold_num[i + 1] += 1
        for i in range(total_fold):
            fold_num[i + 1] += fold_num[i]

        if self.train:
            tmp = []
            # get not augmented data
            for k in range(total_fold):
                if k != kfold:
                    for i in range(fold_num[k], fold_num[k + 1]):
                        result += temp[song_names[i]]
                    tmp += song_names[fold_num[k]:fold_num[k + 1]]
            song_names = tmp
        else:
            for i in range(fold_num[kfold], fold_num[kfold + 1]):
                instances = temp[song_names[i]]
                instances = [inst for inst in instances if "1.00_0" in inst]
                result += instances
            song_names = song_names[fold_num[kfold]:fold_num[kfold + 1]]
        return song_names, result

    def get_paths_voca(self, kfold=4):
        temp = {}
        used_song_names = list()
        for name in self.dataset_names:
            dataset_path = os.path.join(self.root_dir, "result",
                                        name + '_voca', self.mp3_string,
                                        self.feature_string)
            song_names = os.listdir(dataset_path)
            for song_name in song_names:
                paths = []
                instance_names = os.listdir(
                    os.path.join(dataset_path, song_name))
                if len(instance_names) > 0:
                    used_song_names.append(song_name)
                for instance_name in instance_names:
                    paths.append(
                        os.path.join(dataset_path, song_name, instance_name))
                temp[song_name] = paths
        # throw away unused song names
        song_names = used_song_names
        song_names = SortedList(song_names)

        print('Total used song length : %d' % len(song_names))
        tmp = []
        for i in range(len(song_names)):
            tmp += temp[song_names[i]]
        print('Total instances (train and valid) : %d' % len(tmp))

        # divide train/valid dataset using k fold
        result = []
        total_fold = 5
        quotient = len(song_names) // total_fold
        remainder = len(song_names) % total_fold
        fold_num = [0]
        for i in range(total_fold):
            fold_num.append(quotient)
        for i in range(remainder):
            fold_num[i + 1] += 1
        for i in range(total_fold):
            fold_num[i + 1] += fold_num[i]

        if self.train:
            tmp = []
            # get not augmented data
            for k in range(total_fold):
                if k != kfold:
                    for i in range(fold_num[k], fold_num[k + 1]):
                        result += temp[song_names[i]]
                    tmp += song_names[fold_num[k]:fold_num[k + 1]]
            song_names = tmp
        else:
            for i in range(fold_num[kfold], fold_num[kfold + 1]):
                instances = temp[song_names[i]]
                instances = [inst for inst in instances if "1.00_0" in inst]
                result += instances
            song_names = song_names[fold_num[kfold]:fold_num[kfold + 1]]
        return song_names, result
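
The fold boundaries in get_paths()/get_paths_voca() above are a cumulative split of the song list into five nearly equal parts; a standalone illustration with an invented count of 23 songs:

# invented example: 23 songs split into 5 folds
num_songs, total_fold = 23, 5
quotient, remainder = num_songs // total_fold, num_songs % total_fold
fold_num = [0] + [quotient] * total_fold
for i in range(remainder):
    fold_num[i + 1] += 1
for i in range(total_fold):
    fold_num[i + 1] += fold_num[i]
print(fold_num)  # [0, 5, 10, 15, 19, 23]; fold k covers songs fold_num[k]:fold_num[k+1]

With the default kfold=4 in this example, songs 19 through 22 would form the validation split and the remaining folds are used for training.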
Пример #26
0
    def __init__(self, conf):
        super(XGB, self).__init__(conf)
        self.preprocess = Preprocess()
        self.vectorizer = TfidfVectorizer()
        self.thre = 0.5
        self.read_data()
Пример #27
0
class Classify(object):
    def __init__(self, conf):
        self.task_type = 'classify'
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.pre = Preprocess()
        self.model_loaded = False
        self.zdy = {}
        csv = pd.read_csv(self.ori_path,
                          header=0,
                          sep=",",
                          error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        self.num_class = len(set(self.label_list))
        self.num_output = self.num_class
        logging.info(
            f">>>>>>>>>>>> class num:{self.num_class} <<<<<<<<<<<<<<<")
        for idx, text in enumerate(self.text_list):
            self.text_list[idx] = self.pre.get_dl_input_by_text(text)
            if len(self.text_list[idx]) == 0:
                logging.error(f"find blank lines in {idx}")

        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def init_embedding(self):
        self.vocab_dict = embedding[self.embedding_type].build_dict(\
                                            dict_path = self.dict_path,
                                            text_list = self.text_list,
                                            mode = self.mode)
        self.embedding = embedding[self.embedding_type](
            text_list=self.text_list,
            vocab_dict=self.vocab_dict,
            dict_path=self.dict_path,
            random=self.rand_embedding,
            maxlen=self.maxlen,
            batch_size=self.batch_size,
            embedding_size=self.embedding_size,
            conf=self.conf)

    def prepare(self):
        self.init_embedding()
        self.gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
        self.gt.process(self.text_list, self.label_list,
                        self.embedding.text2id, self.encoder.encoder_fun,
                        self.vocab_dict, self.tfrecords_path, self.label_path,
                        self.test_size)
        logging.info("tfrecords generated!")

    def cal_loss(self, pred, labels, batch_size, conf):
        loss = get_loss(type=self.loss_type,
                        logits=pred,
                        labels=labels,
                        labels_sparse=True,
                        **conf)
        return loss

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            ########### embedding #################
            if not self.use_language_model:
                self.init_embedding()
                self.embed_query = self.embedding(features=features,
                                                  name='x_query')
            else:
                self.embedding = None
            #############  encoder  #################
            #model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()
            if not self.use_language_model:
                out = self.encoder(self.embed_query,
                                   name='x_query',
                                   features=features)
            else:
                out = self.encoder(features=features)
            #pred = tf.nn.softmax(tf.layers.dense(out, self.num_class))
            pred = tf.nn.softmax(out)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'encode': out,
                    'logit': pred,
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode,
                                                  predictions=predictions)

            ############### loss ##################
            loss = self.cal_loss(pred, labels, self.batch_size, self.conf)

            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                if self.use_clr:
                    self.learning_rate = cyclic_learning_rate(
                        global_step=global_step,
                        learning_rate=self.learning_rate,
                        mode=self.clr_mode)
                optimizer = get_train_op(global_step,
                                         self.optimizer_type,
                                         loss,
                                         self.learning_rate,
                                         clip_grad=5)
                return tf.estimator.EstimatorSpec(mode,
                                                  loss=loss,
                                                  train_op=optimizer)

            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                #{"accuracy": tf.metrics.accuracy(
                #    labels=labels, predictions=predictions["classes"])}
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            size = self.num_class
            num_classes_per_batch = self.num_class_per_batch
            assert num_classes_per_batch <= self.num_class, \
                f"num_classes_per_batch is {num_classes_per_batch} > {self.num_class}"
            num_sentences_per_class = self.batch_size // num_classes_per_batch

            filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path,i) \
                             for i in range(size)]
            logging.info("tfrecords train class num: {}".format(
                len(filenames)))
            datasets = [
                tf.data.TFRecordDataset(filename) for filename in filenames
            ]
            datasets = [dataset.repeat() for dataset in datasets]

            #assert self.batch_size == num_sentences_per_class* num_classes_per_batch
            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(
                generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(
                datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            #test
            #sess = tf.Session()
            #features,label = sess.run([features,label])
            #features['x_query_pred'] = [item.decode('utf-8') for item in
            #                           features['x_query_pred'][1]]
            return features, label

        def test_input_fn(mode):
            filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \
                             for i in range(self.num_class)]
            assert self.num_class == len(
                filenames), "wrong number of tfrecords files!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {'is_training': True, 'keep_prob': 0.5}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)

        def serving_input_receiver_fn():
            features = {
                'x_query':
                tf.placeholder(dtype=tf.int64,
                               shape=[None, self.maxlen],
                               name='x_query'),
                'x_query_length':
                tf.placeholder(dtype=tf.int64,
                               shape=[None],
                               name='x_query_length'),
                'label':
                tf.placeholder(dtype=tf.int64, shape=[None], name='label')
            }
            features.update(self.encoder.features)
            return tf.estimator.export.ServingInputReceiver(features, features)

        estimator.export_savedmodel(
            self.export_dir_path,  # export directory
            serving_input_receiver_fn,  # function that returns a ServingInputReceiver
            assets_extra=None,
            as_text=False,
            checkpoint_path=None)

    def test(self):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        predictions = estimator.predict(input_fn=self.create_input_fn("test"))
        predictions = list(predictions)
        scores = [item['logit'] for item in predictions]
        labels = [item['label'] for item in predictions]
        max_scores = np.max(scores, axis=-1)
        max_ids = np.argmax(scores, axis=-1)
        res = np.equal(labels, max_ids)
        right = len(list(filter(lambda x: x == True, res)))
        sum = len(res)
        print("Acc:{}".format(float(right) / sum))
Пример #28
0
class NER(object):
    def __init__(self, conf):
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.task_type = 'ner'
        self.clip_grad = 5.0
        self.optimizer_type = self.optimizer_type
        self.label2tag = {
            self.tag2label[item]: item
            for item in self.tag2label
        }
        self.shuffle = True

        self.is_training = tf.placeholder(tf.bool, [], name="is_training")
        self.global_step = tf.Variable(0, trainable=False)
        self.keep_prob = tf.where(self.is_training, 0.5, 1.0)

        self.pre = Preprocess()
        self.text_list, self.label_list = load_ner_data(self.train_path)
        if self.maxlen == -1:
            self.maxlen = max([len(text.split()) for text in self.text_list])
        self.trans_label_list(self.label_list, self.tag2label)

        self.text_list = [
            self.pre.get_dl_input_by_text(text) for text in self.text_list
        ]

        if not self.use_language_model:
            #build vocabulary map using training data
            self.vocab_dict = embedding[self.embedding_type].build_dict(
                dict_path=self.dict_path, text_list=self.text_list)

            #define embedding object by embedding_type
            self.embedding = embedding[self.embedding_type](
                text_list=self.text_list,
                vocab_dict=self.vocab_dict,
                dict_path=self.dict_path,
                random=self.rand_embedding,
                batch_size=self.batch_size,
                maxlen=self.maxlen,
                embedding_size=self.embedding_size,
                conf=self.conf)
            self.embed = self.embedding(name='x')
        else:
            self.embedding = None
        self.labels = tf.placeholder(tf.int32,
                                     shape=[None, None],
                                     name="labels")
        self.sequence_lengths = tf.placeholder(tf.int32,
                                               shape=[None],
                                               name="sequence_lengths")

        #model params
        params = conf
        params.update({
            "maxlen": self.maxlen,
            "embedding_size": self.embedding_size,
            "keep_prob": self.keep_prob,
            "is_training": self.is_training,
            "batch_size": self.batch_size,
            "num_output": self.num_class
        })

        self.encoder = encoder[self.encoder_type](**params)
        if not self.use_language_model:
            self.out = self.encoder(self.embed, 'query', middle_flag=True)
        else:
            self.out = self.encoder()
        self.output_nodes = self.out.name.split(':')[0]
        self.loss(self.out)
        self.optimizer = get_train_op(self.global_step, self.optimizer_type,
                                      self.loss, self.clip_grad,
                                      self.learning_rate)
        #self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        if self.use_language_model:
            tvars = tf.trainable_variables()
            init_checkpoint = conf['init_checkpoint_path']
            (assignment_map,
             initialized_variable_names) = get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    def loss(self, out):
        out_shape = tf.shape(out)
        self.logits = tf.reshape(out, [-1, out_shape[1], self.num_class])
        if not self.use_crf:
            self.labels_softmax_ = tf.argmax(self.logits, axis=-1)
            self.labels_softmax_ = tf.cast(self.labels_softmax_, tf.int32)
        if self.use_crf:
            log_likelihood, self.transition_params = crf_log_likelihood(
                inputs=self.logits,
                tag_indices=self.labels,
                sequence_lengths=self.sequence_lengths)
            self.loss = -tf.reduce_mean(log_likelihood)

        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.labels)
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        tf.summary.scalar("loss", self.loss)
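
        # A small worked example of the masked (non-CRF) branch, with hypothetical values:
        #   sequence_lengths = [3, 2]  ->  tf.sequence_mask([3, 2]) = [[T, T, T], [T, T, F]]
        #   boolean_mask keeps only the 5 real token losses, so padded positions
        #   do not contribute to the mean loss.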

    def trans_label_list(self, label_list, tag2label):
        for idx, labels in enumerate(label_list):
            for idy, label in enumerate(labels):
                label_list[idx][idy] = tag2label[label_list[idx][idy]]
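        # e.g. with tag2label = {"O": 0, "B-PER": 1, "I-PER": 2} (hypothetical tags),
        # [["O", "B-PER", "I-PER"]] is rewritten in place to [[0, 1, 2]].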

    def demo_one(self, sess, sent):
        label_list = []
        batches = batch_iter(sent,
                             self.batch_size,
                             self.epoch_num,
                             shuffle=False)
        for batch in batches:
            seqs, labels = zip(*batch)
            label_list_, _ = self.predict_one_batch(sess, seqs)
            label_list.extend(label_list_)
        label2tag = {}
        for tag, label in self.tag2label.items():
            label2tag[label] = tag if label != 0 else label
        tag = [label2tag[label] for label in label_list[0]]
        return tag

    def train(self):
        train_data = zip(self.text_list, self.label_list)
        batches = batch_iter(train_data,
                             self.batch_size,
                             self.epoch_num,
                             shuffle=True)

        max_acc = -1
        for step, batch in enumerate(batches):
            x_batch, labels = zip(*batch)
            sys.stdout.write(' processing: {}.'.format(step + 1) + '\r')
            step_num = step + 1

            if not self.use_language_model:
                _, x_batch, len_batch = self.embedding.text2id(
                    x_batch,
                    self.vocab_dict,
                    self.maxlen,
                    need_preprocess=False)
                feed_dict = {self.sequence_lengths: len_batch}
                feed_dict[self.labels], _ = self.embedding.pad_sequences(
                    labels)
                feed_dict.update(self.embedding.feed_dict(x_batch, 'x'))
                feed_dict.update(self.encoder.feed_dict(query=len_batch))
            else:
                feed_dict = {}
                feed_dict.update(self.encoder.feed_dict(x_batch))

            _, loss_train, step_num_ = self.sess.run(
                [self.optimizer, self.loss, self.global_step],
                feed_dict=feed_dict)
            if step_num % (self.valid_step / 10) == 0:
                logging.info('step {}, loss: {:.4}'.format(\
                    step_num,
                    loss_train))
            if step_num % (self.valid_step) == 0:
                logging.info('===========validation / test===========')
                result = self.test()
                logging.info("result:", result)
                if result['acc'] > max_acc:
                    max_acc = result['acc']
                    self.saver.save(self.sess,
                                    "{0}/{1}.ckpt".format(
                                        self.checkpoint_path, self.task_type),
                                    global_step=step)
                    write_pb(self.checkpoint_path, self.model_path,
                             ["is_training", self.output_nodes])
                else:
                    self.save_pb()
                    logging.info(f'train finished! accuracy: {max_acc}')
                    sys.exit(0)

    def test(self):
        #saver = tf.train.Saver()
        #with tf.Session() as sess:
        #    logging.info('=========== testing ===========')
        #    saver.restore(sess, self.model_path)
        #    label_list, seq_len_list = self.dev_one_epoch(sess, test)
        #    self.evaluate(label_list, seq_len_list, test)

        self.raw_dev_text_list, self.dev_label_list = load_ner_data(
            self.test_path)
        #self.raw_dev_text_list, self.dev_label_list = \
        #    self.raw_dev_text_list[:50], self.dev_label_list[:50]
        self.dev_text_list = [self.pre.get_dl_input_by_text(text) for \
                              text in self.raw_dev_text_list]
        self.trans_label_list(self.dev_label_list, self.tag2label)
        dev_data = zip(self.dev_text_list, self.dev_label_list)
        out_label_list, seq_len_list = self.dev_one_epoch(self.sess, dev_data)
        result = self.evaluate(self.dev_label_list, out_label_list, \
                               self.raw_dev_text_list, seq_len_list)
        return result

    def dev_one_epoch(self, sess, dev):
        """Run prediction over the whole dev/test set, one batch at a time.

        :param sess: active tf.Session holding the trained graph
        :param dev: iterable of (text, label) pairs
        :return: predicted label-id sequences and the true sequence lengths
        """
        label_list, seq_len_list = [], []
        batches = batch_iter(dev,
                             self.batch_size,
                             self.epoch_num,
                             shuffle=False)
        for batch in batches:
            seqs, labels = zip(*batch)
            label_list_, seq_len_list_ = self.predict_one_batch(sess, seqs)
            label_list.extend(label_list_)
            seq_len_list.extend(seq_len_list_)
        return label_list, seq_len_list

    def predict_one_batch(self, sess, seqs):
        """Predict label ids for a single batch of texts.

        :param sess: active tf.Session
        :param seqs: batch of preprocessed input texts
        :return: label_list -- per-token label ids (Viterbi-decoded when use_crf)
                 seq_len_list -- true sequence length of each input
        """
        # Mirror the feeding logic in train(): the vocabulary/embedding path is
        # only available when no pretrained language model is used.
        if not self.use_language_model:
            _, x_batch, len_batch = self.embedding.text2id(
                seqs, self.vocab_dict, self.maxlen, need_preprocess=False)
            feed_dict = {self.sequence_lengths: len_batch}
            feed_dict.update(self.embedding.feed_dict(x_batch, 'x'))
            feed_dict.update(self.encoder.feed_dict(query=len_batch))
        else:
            feed_dict = {}
            feed_dict.update(self.encoder.feed_dict(seqs))
            # len_batch is still needed for decoding and the return value below;
            # assume whitespace-tokenized inputs, as in the maxlen computation.
            len_batch = [len(text.split()) for text in seqs]

        if self.use_crf:
            logits, transition_params = sess.run(
                [self.logits, self.transition_params], feed_dict=feed_dict)
            label_list = []
            for logit, seq_len in zip(logits, len_batch):
                viterbi_seq, _ = viterbi_decode(logit[:seq_len],
                                                transition_params)
                label_list.append(viterbi_seq)
            return label_list, len_batch

        else:
            label_list = sess.run(self.labels_softmax_, feed_dict=feed_dict)
            return label_list, len_batch

    #def evaluate(self, label_list, seq_len_list, data, epoch=None):
    def evaluate(self, dev_label_list, out_label_list, raw_dev_text_list, \
                 seq_len_list):
        model_predict = []
        for label, label_pred, sent, seq_len in zip(dev_label_list,
                                                    out_label_list,
                                                    raw_dev_text_list,
                                                    seq_len_list):
            sent = sent.split()
            sent_res = []
            for idx in range(seq_len):
                sent_res.append([sent[idx], label[idx], label_pred[idx]])
            model_predict.append(sent_res)

        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for item in model_predict:
            sent = [i[0] for i in item]
            lab = [i[1] for i in item]
            lab_pred = [i[2] for i in item]
            accs += [a == b for (a, b) in zip(lab, lab_pred)]
            lab_chunks = set(get_chunks(lab, self.tag2label))
            lab_pred_chunks = set(get_chunks(lab_pred, self.tag2label))

            correct_preds += len(lab_chunks & lab_pred_chunks)
            total_preds += len(lab_pred_chunks)
            total_correct += len(lab_chunks)
        p = correct_preds / total_preds if correct_preds > 0 else 0
        r = correct_preds / total_correct if correct_preds > 0 else 0
        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)
        return {"acc": 100 * acc, "f1": 100 * f1}
Example #29
0
from utils.preprocess import Preprocess
from models.ner.nermodel import NerModel

p = Preprocess(word2index_dic='../train_tools/dict/chatbot_dict.bin',
               userdic='../utils/user_dic.tsv')

ner = NerModel(model_name='../models/ner/ner_model.h5', proprocess=p)
query = '오늘 오전 13시 2분에 탕수육 주문하고 싶어요'  # "I want to order sweet-and-sour pork at 13:02 this morning"
predicts = ner.predict(query)
print(predicts)