class Classify(TaskBase):
    def __init__(self, conf):
        super(Classify, self).__init__(conf)
        self.task_type = 'classify'
        self.conf = conf
        self.read_data()
        self.num_class = len(set(self.label_list))
        self.num_output = self.num_class
        logging.info(">>>>>>>>>>>> class num:%s <<<<<<<<<<<<<<<" % self.num_class)
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def read_data(self):
        self.pre = Preprocess()
        csv = pd.read_csv(self.ori_path, header=0, sep="\t", error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        for idx, text in enumerate(self.text_list):
            self.text_list[idx] = self.pre.get_dl_input_by_text(text)
            if len(self.text_list[idx]) == 0:
                logging.error("find blank lines in %s" % idx)
        self.data_type = 'column_2'

    def create_model_fn(self):
        def cal_loss(pred, labels, batch_size, conf):
            loss = get_loss(type=self.loss_type, logits=pred, labels=labels,
                            labels_sparse=True, **conf)
            return loss

        def model_fn(features, labels, mode, params):
            # model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()

            ############# encoder #################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                self.embed_query = self.embedding(features=features, name='x_query')
                out = self.encoder(self.embed_query, name='x_query', features=features)
            else:
                out = self.encoder(features=features)
            #pred = tf.nn.softmax(tf.layers.dense(out, self.num_class))
            pred = tf.nn.softmax(out)
            pred_labels = tf.argmax(pred, axis=-1)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'encode': out,
                    'logit': pred,
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)

            ############### loss ##################
            loss = cal_loss(pred, labels, self.batch_size, self.conf)

            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                return self.train_estimator_spec(mode, loss, global_step, params)

            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {
                    "accuracy": tf.metrics.accuracy(labels=labels,
                                                    predictions=pred_labels)
                }
                return tf.estimator.EstimatorSpec(mode=mode,
                                                  loss=loss,
                                                  eval_metric_ops=eval_metric_ops)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            size = self.num_class
            num_classes_per_batch = self.num_class_per_batch
            assert num_classes_per_batch <= self.num_class, \
                "num_classes_per_batch is %s > %s" % (num_classes_per_batch, self.num_class)
            num_sentences_per_class = self.batch_size // num_classes_per_batch

            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith('train')]
            if len(filenames) == 0:
                logging.warn("Can't find any tfrecords file for train, prepare now!")
                self.prepare()
                filenames = [os.path.join(self.tfrecords_path, item)
                             for item in os.listdir(self.tfrecords_path)
                             if item.startswith('train')]
            assert size == len(filenames), "each file represent one class"
            logging.info("tfrecords train class num: {}".format(len(filenames)))
            logging.info("tfrecords num_sentences_per_class:{}".format(num_sentences_per_class))
            logging.info("tfrecords num_classes_per_batch:{}".format(num_classes_per_batch))

            datasets = [tf.data.TFRecordDataset(filename) for filename in filenames]
            datasets = [dataset.repeat() for dataset in datasets]
            #assert self.batch_size == num_sentences_per_class * num_classes_per_batch

            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            #test
            #sess = tf.Session()
            #features, label = sess.run([features, label])
            #features['x_query_pred'] = [item.decode('utf-8') for item in
            #                            features['x_query_pred'][1]]
            return features, label

        def test_input_fn(mode):
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith(mode)]
            assert self.num_class == len(filenames), "the num of tfrecords file error!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {'is_training': True, 'keep_prob': 0.7}
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def test(self, mode='test'):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        if mode == 'dev':
            estimator.evaluate(input_fn=self.create_input_fn('dev'))
        elif mode == 'test':
            estimator.evaluate(input_fn=self.create_input_fn('test'))
        else:
            raise ValueError("unknown mode:[%s]" % mode)

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}

        def get_features():
            features = {
                'x_query': tf.placeholder(dtype=tf.int64,
                                          shape=[None, self.maxlen],
                                          name='x_query'),
                'x_query_length': tf.placeholder(dtype=tf.int64,
                                                 shape=[None],
                                                 name='x_query_length'),
                'label': tf.placeholder(dtype=tf.int64,
                                        shape=[None],
                                        name='label')
            }
            features.update(self.encoder.features)
            return features

        self.save_model(self.create_model_fn(), params, get_features)
class Classify(object): def __init__(self, conf): self.task_type = 'classify' self.conf = conf for attr in conf: setattr(self, attr, conf[attr]) self.pre = Preprocess() self.model_loaded = False self.zdy = {} csv = pd.read_csv(self.ori_path, header=0, sep=",", error_bad_lines=False) self.text_list = list(csv['text']) self.label_list = list(csv['target']) self.num_class = len(set(self.label_list)) self.num_output = self.num_class logging.info( f">>>>>>>>>>>> class num:{self.num_class} <<<<<<<<<<<<<<<") for idx, text in enumerate(self.text_list): self.text_list[idx] = self.pre.get_dl_input_by_text(text) if len(self.text_list[idx]) == 0: logging.error(f"find blank lines in {idx}") self.conf.update({ "maxlen": self.maxlen, "maxlen1": self.maxlen, "maxlen2": self.maxlen, "num_class": self.num_class, "embedding_size": self.embedding_size, "batch_size": self.batch_size, "num_output": self.num_output, "keep_prob": 1, "is_training": False, }) self.encoder = encoder[self.encoder_type](**self.conf) def init_embedding(self): self.vocab_dict = embedding[self.embedding_type].build_dict(\ dict_path = self.dict_path, text_list = self.text_list, mode = self.mode) self.embedding = embedding[self.embedding_type]( text_list=self.text_list, vocab_dict=self.vocab_dict, dict_path=self.dict_path, random=self.rand_embedding, maxlen=self.maxlen, batch_size=self.batch_size, embedding_size=self.embedding_size, conf=self.conf) def prepare(self): self.init_embedding() self.gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) self.gt.process(self.text_list, self.label_list, self.embedding.text2id, self.encoder.encoder_fun, self.vocab_dict, self.tfrecords_path, self.label_path, self.test_size) logging.info("tfrecords generated!") def cal_loss(self, pred, labels, batch_size, conf): loss = get_loss(type=self.loss_type, logits=pred, labels=labels, labels_sparse=True, **conf) return loss def create_model_fn(self): def model_fn(features, labels, mode, params): ########### embedding ################# if not self.use_language_model: self.init_embedding() self.embed_query = self.embedding(features=features, name='x_query') else: self.embedding = None ############# encoder ################# #model params self.encoder.keep_prob = params['keep_prob'] self.encoder.is_training = params['is_training'] global_step = tf.train.get_or_create_global_step() if not self.use_language_model: out = self.encoder(self.embed_query, name='x_query', features=features) else: out = self.encoder(features=features) #pred = tf.nn.softmax(tf.layers.dense(out, self.num_class)) pred = tf.nn.softmax(out) ############### predict ################## if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'encode': out, 'logit': pred, 'label': features['label'] } return tf.estimator.EstimatorSpec(mode, predictions=predictions) ############### loss ################## loss = self.cal_loss(pred, labels, self.batch_size, self.conf) ############### train ################## if mode == tf.estimator.ModeKeys.TRAIN: if self.use_clr: self.learning_rate = cyclic_learning_rate( global_step=global_step, learning_rate=self.learning_rate, mode=self.clr_mode) optimizer = get_train_op(global_step, self.optimizer_type, loss, self.learning_rate, clip_grad=5) return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=optimizer) ############### eval ################## if mode == tf.estimator.ModeKeys.EVAL: eval_metric_ops = {} #{"accuracy": tf.metrics.accuracy( # labels=labels, predictions=predictions["classes"])} return tf.estimator.EstimatorSpec( mode=mode, loss=loss, 
eval_metric_ops=eval_metric_ops) return model_fn def create_input_fn(self, mode): n_cpu = multiprocessing.cpu_count() def train_input_fn(): size = self.num_class num_classes_per_batch = self.num_class_per_batch assert num_classes_per_batch <= self.num_class, \ f"num_classes_per_batch is {num_classes_per_batch} > {self.num_class}" num_sentences_per_class = self.batch_size // num_classes_per_batch filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path,i) \ for i in range(size)] logging.info("tfrecords train class num: {}".format( len(filenames))) datasets = [ tf.data.TFRecordDataset(filename) for filename in filenames ] datasets = [dataset.repeat() for dataset in datasets] #assert self.batch_size == num_sentences_per_class* num_classes_per_batch def generator(): while True: labels = np.random.choice(range(size), num_classes_per_batch, replace=False) for label in labels: for _ in range(num_sentences_per_class): yield label choice_dataset = tf.data.Dataset.from_generator( generator, tf.int64) dataset = tf.contrib.data.choose_from_datasets( datasets, choice_dataset) gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) dataset = dataset.map( lambda record: gt.parse_record(record, self.encoder), num_parallel_calls=n_cpu) dataset = dataset.batch(self.batch_size) dataset = dataset.prefetch(4 * self.batch_size) iterator = dataset.make_one_shot_iterator() features, label = iterator.get_next() #test #sess = tf.Session() #features,label = sess.run([features,label]) #features['x_query_pred'] = [item.decode('utf-8') for item in # features['x_query_pred'][1]] return features, label def test_input_fn(mode): filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \ for i in range(self.num_class)] assert self.num_class == len( filenames), "the num of tfrecords file error!" 
logging.info("tfrecords test class num: {}".format(len(filenames))) dataset = tf.data.TFRecordDataset(filenames) gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) dataset = dataset.map( lambda record: gt.parse_record(record, self.encoder), num_parallel_calls=n_cpu) dataset = dataset.batch(self.batch_size) dataset = dataset.prefetch(1) iterator = dataset.make_one_shot_iterator() features, label = iterator.get_next() return features, label if mode == 'train': return train_input_fn elif mode == 'test': return lambda: test_input_fn("test") else: raise ValueError("unknown input_fn type!") def train(self): params = {'is_training': True, 'keep_prob': 0.5} config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(), config=config, params=params) estimator.train(input_fn=self.create_input_fn("train"), max_steps=self.max_steps) self.save() def save(self): params = {'is_training': False, 'keep_prob': 1} config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(), config=config, params=params) def serving_input_receiver_fn(): features = { 'x_query': tf.placeholder(dtype=tf.int64, shape=[None, self.maxlen], name='x_query'), 'x_query_length': tf.placeholder(dtype=tf.int64, shape=[None], name='x_query_length'), 'label': tf.placeholder(dtype=tf.int64, shape=[None], name='label') } features.update(self.encoder.features) return tf.estimator.export.ServingInputReceiver(features, features) estimator.export_savedmodel( self.export_dir_path, # 目录 serving_input_receiver_fn, # 返回ServingInputReceiver的函数 assets_extra=None, as_text=False, checkpoint_path=None) def test(self): params = {'is_training': False, 'keep_prob': 1} config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(), config=config, params=params) predictions = estimator.predict(input_fn=self.create_input_fn("test")) predictions = list(predictions) scores = [item['logit'] for item in predictions] labels = [item['label'] for item in predictions] max_scores = np.max(scores, axis=-1) max_ids = np.argmax(scores, axis=-1) res = np.equal(labels, max_ids) right = len(list(filter(lambda x: x == True, res))) sum = len(res) print("Acc:{}".format(float(right) / sum))
class NER(object): def __init__(self, conf): self.conf = conf for attr in conf: setattr(self, attr, conf[attr]) self.task_type = 'ner' self.clip_grad = 5.0 self.optimizer_type = self.optimizer_type self.label2tag = { self.tag2label[item]: item for item in self.tag2label } self.shuffle = True self.is_training = tf.placeholder(tf.bool, [], name="is_training") self.global_step = tf.Variable(0, trainable=False) self.keep_prob = tf.where(self.is_training, 0.5, 1.0) self.pre = Preprocess() self.text_list, self.label_list = load_ner_data(self.train_path) if self.maxlen == -1: self.maxlen = max([len(text.split()) for text in self.text_list]) self.trans_label_list(self.label_list, self.tag2label) self.text_list = [ self.pre.get_dl_input_by_text(text) for text in self.text_list ] if not self.use_language_model: #build vocabulary map using training data self.vocab_dict = embedding[self.embedding_type].build_dict( dict_path=self.dict_path, text_list=self.text_list) #define embedding object by embedding_type self.embedding = embedding[self.embedding_type]( text_list=self.text_list, vocab_dict=self.vocab_dict, dict_path=self.dict_path, random=self.rand_embedding, batch_size=self.batch_size, maxlen=self.maxlen, embedding_size=self.embedding_size, conf=self.conf) self.embed = self.embedding(name='x') else: self.embedding = None self.labels = tf.placeholder(tf.int32, shape=[None, None], name="labels") self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], name="sequence_lengths") #model params params = conf params.update({ "maxlen": self.maxlen, "embedding_size": self.embedding_size, "keep_prob": self.keep_prob, "is_training": self.is_training, "batch_size": self.batch_size, "num_output": self.num_class }) self.encoder = encoder[self.encoder_type](**params) if not self.use_language_model: self.out = self.encoder(self.embed, 'query', middle_flag=True) else: self.out = self.encoder() self.output_nodes = self.out.name.split(':')[0] self.loss(self.out) self.optimizer = get_train_op(self.global_step, self.optimizer_type, self.loss, self.clip_grad, self.learning_rate) #self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step) self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver(tf.global_variables()) if self.use_language_model: tvars = tf.trainable_variables() init_checkpoint = conf['init_checkpoint_path'] (assignment_map, initialized_variable_names) = get_assignment_map_from_checkpoint( tvars, init_checkpoint) tf.train.init_from_checkpoint(init_checkpoint, assignment_map) def loss(self, out): out_shape = tf.shape(out) self.logits = tf.reshape(out, [-1, out_shape[1], self.num_class]) if not self.use_crf: self.labels_softmax_ = tf.argmax(self.logits, axis=-1) self.labels_softmax_ = tf.cast(self.labels_softmax_, tf.int32) if self.use_crf: log_likelihood, self.transition_params = crf_log_likelihood( inputs=self.logits, tag_indices=self.labels, sequence_lengths=self.sequence_lengths) self.loss = -tf.reduce_mean(log_likelihood) else: losses = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.logits, labels=self.labels) mask = tf.sequence_mask(self.sequence_lengths) losses = tf.boolean_mask(losses, mask) self.loss = tf.reduce_mean(losses) tf.summary.scalar("loss", self.loss) def trans_label_list(self, label_list, tag2label): for idx, labels in enumerate(label_list): for idy, label in enumerate(labels): label_list[idx][idy] = tag2label[label_list[idx][idy]] def demo_one(self, sess, sent): label_list = 
[] batches = batch_iter(sent, self.batch_size, self.epoch_num, shuffle=False) for batch in batches: seqs, labels = zip(*batch) label_list_, _ = self.predict_one_batch(sess, seqs) label_list.extend(label_list_) label2tag = {} for tag, label in self.tag2label.items(): label2tag[label] = tag if label != 0 else label tag = [label2tag[label] for label in label_list[0]] return tag def train(self): train_data = zip(self.text_list, self.label_list) batches = batch_iter(train_data, self.batch_size, self.epoch_num, shuffle=True) max_acc = -1 for step, batch in enumerate(batches): x_batch, labels = zip(*batch) sys.stdout.write(' processing: {}.'.format(step + 1) + '\r') step_num = step + 1 if not self.use_language_model: _, x_batch, len_batch = self.embedding.text2id( x_batch, self.vocab_dict, self.maxlen, need_preprocess=False) feed_dict = {self.sequence_lengths: len_batch} feed_dict[self.labels], _ = self.embedding.pad_sequences( labels) feed_dict.update(self.embedding.feed_dict(x_batch, 'x')) feed_dict.update(self.encoder.feed_dict(query=len_batch)) else: feed_dict = {} feed_dict.update(self.encoder.feed_dict(x_batch)) _, loss_train, step_num_ = self.sess.run( [self.optimizer, self.loss, self.global_step], feed_dict=feed_dict) if step_num % (self.valid_step / 10) == 0: logging.info('step {}, loss: {:.4}'.format(\ step_num, loss_train)) if step_num % (self.valid_step) == 0: logging.info('===========validation / test===========') result = self.test() logging.info("result:", result) if result['acc'] > max_acc: max_acc = result['acc'] self.saver.save(self.sess, "{0}/{1}.ckpt".format( self.checkpoint_path, self.task_type), global_step=step) write_pb(self.checkpoint_path, self.model_path, ["is_training", self.output_nodes]) else: self.save_pb() logging.info(f'train finished! 
accuracy: {max_acc}') sys.exit(0) def test(self): #saver = tf.train.Saver() #with tf.Session() as sess: # logging.info('=========== testing ===========') # saver.restore(sess, self.model_path) # label_list, seq_len_list = self.dev_one_epoch(sess, test) # self.evaluate(label_list, seq_len_list, test) self.raw_dev_text_list, self.dev_label_list = load_ner_data( self.test_path) #self.raw_dev_text_list, self.dev_label_list = \ # self.raw_dev_text_list[:50], self.dev_label_list[:50] self.dev_text_list = [self.pre.get_dl_input_by_text(text) for \ text in self.raw_dev_text_list] self.trans_label_list(self.dev_label_list, self.tag2label) dev_data = zip(self.dev_text_list, self.dev_label_list) out_label_list, seq_len_list = self.dev_one_epoch(self.sess, dev_data) result = self.evaluate(self.dev_label_list, out_label_list, \ self.raw_dev_text_list, seq_len_list) return result def dev_one_epoch(self, sess, dev): """ :param sess: :param dev: :return: """ label_list, seq_len_list = [], [] batches = batch_iter(dev, self.batch_size, self.epoch_num, shuffle=False) for batch in batches: seqs, labels = zip(*batch) label_list_, seq_len_list_ = self.predict_one_batch(sess, seqs) label_list.extend(label_list_) seq_len_list.extend(seq_len_list_) return label_list, seq_len_list def predict_one_batch(self, sess, seqs): """ :param sess: :param seqs: :return: label_list seq_len_list """ if self.use_language_model: _, x_batch, len_batch = self.embedding.text2id( seqs, self.vocab_dict, self.maxlen, need_preprocess=False) feed_dict = {self.sequence_lengths: len_batch} feed_dict.update(self.embedding.feed_dict(x_batch, 'x')) feed_dict.update(self.encoder.feed_dict(query=len_batch)) else: feed_dict.update(self.encoder.feed_dict(x_batch)) if self.use_crf: logits, transition_params = sess.run( [self.logits, self.transition_params], feed_dict=feed_dict) label_list = [] for logit, seq_len in zip(logits, len_batch): viterbi_seq, _ = viterbi_decode(logit[:seq_len], transition_params) label_list.append(viterbi_seq) return label_list, len_batch else: label_list = sess.run(self.labels_softmax_, feed_dict=feed_dict) return label_list, len_batch #def evaluate(self, label_list, seq_len_list, data, epoch=None): def evaluate(self, dev_label_list, out_label_list, raw_dev_text_list, \ seq_len_list): model_predict = [] for label, label_pred, sent, seq_len in zip(dev_label_list, out_label_list, raw_dev_text_list, seq_len_list): sent = sent.split() sent_res = [] for idx in range(seq_len): sent_res.append([sent[idx], label[idx], label_pred[idx]]) model_predict.append(sent_res) accs = [] correct_preds, total_correct, total_preds = 0., 0., 0. for item in model_predict: sent = [i[0] for i in item] lab = [i[1] for i in item] lab_pred = [i[2] for i in item] accs += [a == b for (a, b) in zip(lab, lab_pred)] lab_chunks = set(get_chunks(lab, self.tag2label)) lab_pred_chunks = set(get_chunks(lab_pred, self.tag2label)) correct_preds += len(lab_chunks & lab_pred_chunks) total_preds += len(lab_pred_chunks) total_correct += len(lab_chunks) p = correct_preds / total_preds if correct_preds > 0 else 0 r = correct_preds / total_correct if correct_preds > 0 else 0 f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 acc = np.mean(accs) return {"acc": 100 * acc, "f1": 100 * f1}
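# --- Illustrative sketch of the CRF pattern used by NER.loss / predict_one_batch
# above: training maximizes crf_log_likelihood (and learns the transition matrix),
# inference runs viterbi_decode per sentence with that matrix. Tensor shapes are
# assumptions; crf_sketch is a hypothetical helper, not a method of the class.
def crf_sketch(logits, labels, seq_lens):
    """logits: [batch, max_len, num_tags]; labels, seq_lens: int32 tensors."""
    import tensorflow as tf
    from tensorflow.contrib.crf import crf_log_likelihood

    log_likelihood, transition_params = crf_log_likelihood(
        inputs=logits, tag_indices=labels, sequence_lengths=seq_lens)
    loss = -tf.reduce_mean(log_likelihood)    # minimized by the train op
    return loss, transition_params

# At inference time, with numpy logits and the fetched transition matrix:
# from tensorflow.contrib.crf import viterbi_decode
# best_path, _ = viterbi_decode(logit[:seq_len], transition_params)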
class NER(TaskBase): def __init__(self, conf): super(NER, self).__init__(conf) self.task_type = 'ner' self.conf = conf self.read_data() if self.maxlen == -1: self.maxlen = max([len(text.split()) for text in self.text_list]) #model params params = conf params.update({ "maxlen":self.maxlen, "embedding_size":self.embedding_size, "batch_size": self.batch_size, "num_output": self.num_class, "keep_prob": 1, "is_training": False, }) #params['num_output'] = 128 #self.encoder_base = encoder['transformer'](**params) #params['num_output'] = self.num_class self.encoder = encoder[self.encoder_type](**params) def read_data(self): self.pre = Preprocess() self.util = NERUtil() self.text_list, self.label_list = self.util.load_ner_data(self.ori_path) self.text_list = [self.pre.get_dl_input_by_text(text, self.use_generalization) for text in self.text_list] self.num_class = self.num_output = len(set(list(chain.from_iterable(self.label_list)))) self.data_type = 'column_2' def create_model_fn(self): def model_fn(features, labels, mode, params): if mode == tf.estimator.ModeKeys.TRAIN: self.encoder.keep_prob = 0.5 self.encoder.is_training = True else: self.encoder.keep_prob = 1 self.encoder.is_training = False seq_len = features['x_query_length'] global_step = tf.train.get_or_create_global_step() ################ encode ################## if not self.use_language_model: self.embedding, _ = self.init_embedding() embed = self.embedding(features = features, name = 'x_query') out = self.encoder(embed, 'x_query', features = features, middle_flag = True) #out = self.encoder_base(embed, 'x_query', features = features, middle_flag = True) #out = self.encoder(out, 'x_query', features = features, middle_flag = True) else: out = self.encoder(features = features) logits = tf.reshape(out, [-1, int(out.shape[1]), self.num_class]) batch_size = get_placeholder_batch_size(logits) small = -1000 start_logits = tf.concat([ small*tf.ones(shape=[batch_size, 1, self.num_class]), tf.zeros(shape=[batch_size, 1, 1])], axis=-1) pad_logits = tf.cast(small * tf.ones(shape=[batch_size, self.maxlen, 1]), tf.float32) logits = tf.concat([logits, pad_logits], axis = -1) logits = tf.concat([start_logits, logits], axis = 1) seq_len += 1 transition_params = tf.get_variable('crf', [self.num_class + 1,self.num_class + 1], dtype=tf.float32) pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params, seq_len) ############### predict ################## if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'logit': logits, 'pred_ids': pred_ids, } return tf.estimator.EstimatorSpec(mode, predictions=predictions) else: ############### loss #################### labels = tf.concat([ tf.cast(self.num_class * tf.ones(shape=[batch_size, 1]), tf.int64), labels ], axis = -1) log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(logits, labels, seq_len, transition_params) loss = -tf.reduce_mean(log_likelihood) if mode == tf.estimator.ModeKeys.TRAIN: return self.train_estimator_spec(mode, loss, global_step, params) if mode == tf.estimator.ModeKeys.EVAL: weights = tf.sequence_mask(seq_len, self.maxlen+1) metrics = {'acc': tf.metrics.accuracy(labels, pred_ids, weights)} return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics) return model_fn def create_input_fn(self, mode): n_cpu = multiprocessing.cpu_count() def train_input_fn(): filenames = [os.path.join(self.tfrecords_path,item) for item in os.listdir(self.tfrecords_path) if item.startswith('train')] if len(filenames) == 0: logging.warn("Can't find any tfrecords file for train, prepare 
now!") self.prepare() filenames = [os.path.join(self.tfrecords_path,item) for item in os.listdir(self.tfrecords_path) if item.startswith('train')] dataset = tf.data.TFRecordDataset(filenames) dataset = dataset.repeat() gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder), num_parallel_calls=n_cpu) dataset = dataset.shuffle(buffer_size=100*self.batch_size) dataset = dataset.prefetch(4*self.batch_size) dataset = dataset.batch(self.batch_size) iterator = dataset.make_one_shot_iterator() features, label = iterator.get_next() return features, label def test_input_fn(mode): filenames = [os.path.join(self.tfrecords_path,item) for item in os.listdir(self.tfrecords_path) if item.startswith(mode)] assert len(filenames) > 0, "Can't find any tfrecords file for %s!"%mode dataset = tf.data.TFRecordDataset(filenames) gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder), num_parallel_calls=n_cpu) dataset = dataset.batch(self.batch_size) dataset = dataset.prefetch(1) iterator = dataset.make_one_shot_iterator() features, label = iterator.get_next() return features, label if mode == 'train': return train_input_fn elif mode == 'test': return lambda : test_input_fn("test") elif mode == 'dev': return lambda : test_input_fn("dev") else: raise ValueError("unknown input_fn type!") def save(self): def get_features(): features = {'x_query': tf.placeholder(dtype=tf.int64, shape=[None, self.maxlen], name='x_query'), 'x_query_length': tf.placeholder(dtype=tf.int64, shape=[None], name='x_query_length'), } features.update(self.encoder.get_features()) return features self.save_model(self.create_model_fn(), None, get_features) def train(self): estimator = self.get_train_estimator(self.create_model_fn(), None) estimator.train(input_fn = self.create_input_fn("train"), max_steps = self.max_steps) self.save() def test(self, mode = 'test'): config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(), config = config) if mode == 'dev': estimator.evaluate(input_fn=self.create_input_fn('dev')) elif mode == 'test': estimator.evaluate(input_fn=self.create_input_fn('test')) else: raise ValueError("unknown mode:[%s]"%mode) def train_and_evaluate(self): config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path, save_checkpoints_steps=self.save_interval, keep_checkpoint_max=5) estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(), config = config) early_stop = tf.estimator.experimental.stop_if_no_decrease_hook( estimator=estimator, metric_name="loss", max_steps_without_decrease=estimator.config.save_checkpoints_steps * 2, run_every_secs=None, run_every_steps=estimator.config.save_checkpoints_steps, ) train_spec=tf.estimator.TrainSpec( input_fn = self.create_input_fn("train"), max_steps = self.max_steps, hooks=[early_stop]) eval_spec=tf.estimator.EvalSpec( input_fn = self.create_input_fn("dev"), steps = None, start_delay_secs = 1, # start evaluating after N seconds throttle_secs = 10, # evaluate every N seconds ) tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) self.save()
class Match(TaskBase): def __init__(self, conf): super(Match, self).__init__(conf) self.task_type = 'match' self.conf = conf self.read_data() self.num_class = len(set(self.label_list)) logging.info(">>>>>>>>>>>> class num:%s <<<<<<<<<<<<<<<" % self.num_class) self.conf.update({ "maxlen": self.maxlen, "maxlen1": self.maxlen, "maxlen2": self.maxlen, "num_class": self.num_class, "embedding_size": self.embedding_size, "batch_size": self.batch_size, "num_output": self.num_output, "keep_prob": 1, "is_training": False, }) self.encoder = encoder[self.encoder_type](**self.conf) def read_data(self): self.pre = Preprocess() csv = pd.read_csv(self.ori_path, header=0, sep="\t", error_bad_lines=False) if 'text' in csv.keys() and 'target' in csv.keys(): #format: text \t target #for this format, the size for each class should be larger than 2 self.text_list = list(csv['text']) self.label_list = list(csv['target']) self.data_type = 'column_2' elif 'text_a' in csv.keys() and 'text_b' in csv.keys( ) and 'target' in csv.keys(): #format: text_a \t text_b \t target #for this format, target value can only be choosen from 0 or 1 self.text_a_list = list(csv['text_a']) self.text_b_list = list(csv['text_b']) self.text_list = self.text_a_list + self.text_b_list self.label_list = list(csv['target']) self.data_type = 'column_3' else: raise ValueError('error format for train file') self.text_list = [self.pre.get_dl_input_by_text(text) for text in \ self.text_list] def create_model_fn(self): def cal_loss(pred, labels, batch_size, conf): if self.tfrecords_mode == 'class': pos_scores, neg_scores = batch_hard_triplet_scores( labels, pred, is_distance=self.is_distance) # pos/neg scores pos_scores = tf.squeeze(pos_scores, -1) neg_scores = tf.squeeze(neg_scores, -1) #for represent, # pred is a batch of tensors which size >1 # we can use triplet loss(hinge loss) or contrastive loss #if use hinge loss, we don't need labels #if use other loss(contrastive loss), we need define pos/neg target before if self.loss_type in ['hinge_loss', 'improved_triplet_loss']: #pairwise loss = get_loss(type=self.loss_type, pos_logits=pos_scores, neg_logits=neg_scores, **conf) else: #pointwise pos_target = tf.ones(shape=[int(self.batch_size)], dtype=tf.float32) neg_target = tf.zeros(shape=[int(self.batch_size)], dtype=tf.float32) pos_loss = get_loss(type=self.loss_type, logits=pos_scores, labels=pos_target, **conf) neg_loss = get_loss(type=self.loss_type, logits=neg_scores, labels=neg_target, **conf) loss = pos_loss + neg_loss elif self.tfrecords_mode in ['pair', 'point']: if self.loss_type in ['hinge_loss', 'improved_triplet_loss']: assert self.tfrecords_mode == 'pair', "only pair mode can provide <query, pos, neg> format data" #pairwise if self.num_output == 1: pred = tf.nn.sigmoid(pred) elif self.num_output == 2: pred = tf.nn.softmax(pred)[:, 0] pred = tf.expand_dims(pred, -1) else: raise ValueError( 'unsupported num_output, 1(sigmoid) or 2(softmax)?' 
) pos_scores = tf.strided_slice(pred, [0], [batch_size], [2]) neg_scores = tf.strided_slice(pred, [1], [batch_size], [2]) loss = get_loss(type=self.loss_type, pos_logits=pos_scores, neg_logits=neg_scores, **conf) elif self.loss_type in ['sigmoid_loss']: #pointwise labels = tf.expand_dims(labels, axis=-1) loss = get_loss(type=self.loss_type, logits=pred, labels=labels, **conf) else: raise ValueError('unsupported loss for pair/point match') else: raise ValueError('unknown tfrecords_mode?') return loss def model_fn(features, labels, mode, params): #model params self.encoder.keep_prob = params['keep_prob'] self.encoder.is_training = params['is_training'] global_step = tf.train.get_or_create_global_step() ############# encode ################# if not self.use_language_model: self.embedding, _ = self.init_embedding() if self.tfrecords_mode == 'class': self.embed_query = self.embedding(features=features, name='x_query') output = self.encoder(self.embed_query, name='x_query', features=features) output = tf.nn.l2_normalize(output, -1) elif self.tfrecords_mode in ['pair', 'point']: if self.sim_mode == 'cross': self.embed_query = self.embedding(features=features, name='x_query') self.embed_sample = self.embedding(features=features, name='x_sample') output = self.encoder(x_query=self.embed_query, x_sample=self.embed_sample, features=features) elif self.sim_mode == 'represent': self.embed_query = self.embedding(features=features, name='x_query') self.embed_sample = self.embedding(features=features, name='x_sample') query_encode = self.encoder(self.embed_query, name='x_query', features=features) sample_encode = self.encoder(self.embed_sample, name='x_sample', features=features) output = self.concat(query_encode, sample_encode) output = tf.layers.dense(output, 1, kernel_regularizer=tf.contrib. 
layers.l2_regularizer(0.001), name='fc') else: raise ValueError( 'unknown sim_mode, represent or cross') else: output = self.encoder(features=features) ############### predict ################## if mode == tf.estimator.ModeKeys.PREDICT: #pdb.set_trace() predictions = { 'encode': output, 'pred': tf.cast(tf.greater(tf.nn.softmax(output)[:, 0], 0.5), tf.int32) if self.num_output == 2 else tf.cast(tf.greater(tf.nn.sigmoid(output), 0.5), tf.int32), 'score': tf.nn.softmax(output)[:, 0] if self.num_output == 2 else tf.nn.sigmoid(output), 'label': features['label'] } return tf.estimator.EstimatorSpec(mode, predictions=predictions) ############### loss ################## loss = cal_loss(output, labels, self.batch_size, self.conf) ############### train ################## if mode == tf.estimator.ModeKeys.TRAIN: return self.train_estimator_spec(mode, loss, global_step, params) ############### eval ################## if mode == tf.estimator.ModeKeys.EVAL: eval_metric_ops = {} #{"accuracy": tf.metrics.accuracy( # labels=labels, predictions=predictions["classes"])} return tf.estimator.EstimatorSpec( mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) return model_fn def create_input_fn(self, mode): n_cpu = multiprocessing.cpu_count() def train_input_fn(): if self.tfrecords_mode == 'class': #size = self.num_class num_classes_per_batch = 32 assert num_classes_per_batch < self.num_class num_sentences_per_class = self.batch_size // num_classes_per_batch elif self.tfrecords_mode == 'pair': #data order: query,pos,query,neg num_sentences_per_class = 4 num_classes_per_batch = self.batch_size // num_sentences_per_class elif self.tfrecords_mode == 'point': #data order: query, sample(pos or neg) num_classes_per_batch = 2 num_sentences_per_class = self.batch_size // num_classes_per_batch else: raise ValueError('unknown tfrecords_mode') #filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path,i) \ # for i in range(size)] filenames = [ os.path.join(self.tfrecords_path, item) for item in os.listdir(self.tfrecords_path) if item.startswith('train') ] if len(filenames) == 0: logging.warn( "Can't find any tfrecords file for train, prepare now!") self.prepare() filenames = [ os.path.join(self.tfrecords_path, item) for item in os.listdir(self.tfrecords_path) if item.startswith('train') ] size = len(filenames) logging.info("tfrecords train class num: {}".format(size)) datasets = [ tf.data.TFRecordDataset(filename) for filename in filenames ] datasets = [dataset.repeat() for dataset in datasets] #datasets = [dataset.shuffle(buffer_size=1000) for dataset in datasets] def generator(): while True: labels = np.random.choice(range(size), num_classes_per_batch, replace=False) for label in labels: for _ in range(num_sentences_per_class): yield label choice_dataset = tf.data.Dataset.from_generator( generator, tf.int64) dataset = tf.contrib.data.choose_from_datasets( datasets, choice_dataset) gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) dataset = dataset.map( lambda record: gt.parse_record(record, self.encoder), num_parallel_calls=n_cpu) dataset = dataset.batch(self.batch_size) dataset = dataset.prefetch(4 * self.batch_size) iterator = dataset.make_one_shot_iterator() features, label = iterator.get_next() ##test #pdb.set_trace() #sess = tf.Session() #features1,label1 = sess.run([features,label]) #features1['x_query_pred'] = [item.decode('utf-8') for item in features1['x_query_pred'][1]] #features1['x_sample_pred'] = [item.decode('utf-8') for item in features1['x_sample_pred'][1]] return features, label def 
test_input_fn(mode): #filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \ # for i in range(self.num_class * self.dev_size)] filenames = [ os.path.join(self.tfrecords_path, item) for item in os.listdir(self.tfrecords_path) if item.startswith(mode) ] assert self.num_class == len( filenames), "the num of tfrecords file error!" logging.info("tfrecords test class num: {}".format(len(filenames))) dataset = tf.data.TFRecordDataset(filenames) gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) dataset = dataset.map( lambda record: gt.parse_record(record, self.encoder), num_parallel_calls=n_cpu) dataset = dataset.batch(self.batch_size) dataset = dataset.prefetch(1) iterator = dataset.make_one_shot_iterator() features, label = iterator.get_next() return features, label if mode == 'train': return train_input_fn elif mode == 'test': return lambda: test_input_fn("test") elif mode == 'dev': return lambda: test_input_fn("dev") elif mode == 'label': return lambda: test_input_fn("train") else: raise ValueError("unknown input_fn type!") def train(self): params = { 'is_training': True, 'keep_prob': 0.7, } estimator = self.get_train_estimator(self.create_model_fn(), params) estimator.train(input_fn=self.create_input_fn("train"), max_steps=self.max_steps) def save(self): params = {'is_training': False, 'keep_prob': 1} def get_features(): features = { 'x_query': tf.placeholder(dtype=tf.int64, shape=[None, self.maxlen], name='x_query'), 'x_query_length': tf.placeholder(dtype=tf.int64, shape=[None], name='x_query_length'), 'label': tf.placeholder(dtype=tf.int64, shape=[None], name='label') } if self.tfrecords_mode in ['pair', 'point']: features.update({ 'x_sample': tf.placeholder(dtype=tf.int64, shape=[None, self.maxlen], name='x_sample'), 'x_sample_length': tf.placeholder(dtype=tf.int64, shape=[None], name='x_sample_length') }) features.update(self.encoder.get_features()) return features self.save_model(self.create_model_fn(), params, get_features) def test(self, mode='test'): params = {'is_training': False, 'keep_prob': 1} config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(), config=config, params=params) predictions = estimator.predict(input_fn=self.create_input_fn(mode)) predictions = list(predictions) if self.tfrecords_mode == 'class': predictions_vec = [item['encode'] for item in predictions] predictions_label = [item['label'] for item in predictions] refers = estimator.predict(input_fn=self.create_input_fn("label")) refers = list(refers) refers_vec = [item['encode'] for item in refers] refers_label = [item['label'] for item in refers] right = 0 thre_right = 0 sum = 0 if self.is_distance: scores = euclidean_distances(predictions_vec, refers_vec) selected_ids = np.argmin(scores, axis=-1) else: scores = cosine_similarity(predictions_vec, refers_vec) selected_ids = np.argmax(scores, axis=-1) for idx, item in enumerate(selected_ids): if refers_label[item] == predictions_label[idx]: if self.is_distance: if 1 - scores[idx][item] > self.score_thre: thre_right += 1 else: if scores[idx][item] > self.score_thre: thre_right += 1 right += 1 sum += 1 print("Acc:{}".format(float(right) / sum)) print("ThreAcc:{}".format(float(thre_right) / sum)) elif self.tfrecords_mode == 'pair': #对于pair方式的评估 scores = [item['score'] for item in predictions] labels = [item['label'] for item in predictions] #pdb.set_trace() #predictions scores = np.reshape(scores, [self.num_class * self.dev_size, -1]) pred_max_ids = 
np.argmax(scores, axis=-1) #label labels = np.reshape(labels, [self.num_class, -1]) right = 0 for idx, max_id in enumerate(pred_max_ids): if labels[idx][max_id] == 1: right += 1 sum = len(pred_max_ids) print("Acc:{}".format(float(right) / sum)) elif self.tfrecords_mode == 'point': scores = [item['score'] for item in predictions] scores = np.reshape(scores, -1) scores = [0 if item < self.score_thre else 1 for item in scores] #pred = [item['pred'] for item in predictions] labels = [item['label'] for item in predictions] res = metrics(labels=labels, logits=np.array(scores)) print("precision:{} recall:{} f1:{}".format( res[3], res[4], res[5])) def concat(self, a, b): tmp = tf.concat([a, b], axis=-1) #return tmp res1 = a * b res2 = a + b res3 = a - b return tf.concat([tmp, res1, res2, res3], axis=-1) def knn(self, scores, predictions_label, refers_label, k=4): sorted_id = np.argsort(-scores, axis=-1) shape = np.shape(sorted_id) max_id = [] for idx in range(shape[0]): mp = defaultdict(int) for idy in range(k): mp[refers_label[int(sorted_id[idx][idy])]] += 1 max_id.append(max(mp, key=mp.get)) return max_id
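# --- Illustrative sketch (assumption): in 'pair' mode the records are written in
# (query, pos, query, neg) order, so after scoring, even rows are positive pairs and
# odd rows are negative pairs. cal_loss above separates them with tf.strided_slice;
# split_pair_scores is a hypothetical standalone form of the same slicing.
def split_pair_scores(pred, batch_size):
    import tensorflow as tf
    pos_scores = tf.strided_slice(pred, [0], [batch_size], [2])  # rows 0, 2, 4, ...
    neg_scores = tf.strided_slice(pred, [1], [batch_size], [2])  # rows 1, 3, 5, ...
    return pos_scores, neg_scores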
class Match(object): def __init__(self, conf): self.task_type = 'match' self.conf = conf for attr in conf: setattr(self, attr, conf[attr]) self.graph = tf.get_default_graph() self.pre = Preprocess() self.model_loaded = False self.zdy = {} csv = pd.read_csv(self.ori_path, header = 0, sep=",", error_bad_lines=False) self.text_list = list(csv['text']) self.label_list = list(csv['target']) self.num_class = len(set(self.label_list)) logging.info(f">>>>>>>>>>>>>>class num:{self.num_class}") self.text_list = [self.pre.get_dl_input_by_text(text) for text in \ self.text_list] self.conf.update({ "maxlen": self.maxlen, "maxlen1": self.maxlen, "maxlen2": self.maxlen, "num_class": self.num_class, "embedding_size": self.embedding_size, "batch_size": self.batch_size, "num_output": self.num_output, "keep_prob": 1, "is_training": False, }) self.encoder = encoder[self.encoder_type](**self.conf) def init_embedding(self): self.vocab_dict = embedding[self.embedding_type].build_dict(\ dict_path = self.dict_path, text_list = self.text_list, mode = self.mode) self.embedding = embedding[self.embedding_type](text_list = self.text_list, vocab_dict = self.vocab_dict, dict_path = self.dict_path, random=self.rand_embedding, maxlen = self.maxlen, batch_size = self.batch_size, embedding_size = self.embedding_size, conf = self.conf) def prepare(self): self.init_embedding() self.gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) self.gt.process(self.text_list, self.label_list, self.embedding.text2id, self.encoder.encoder_fun, self.vocab_dict, self.tfrecords_path, self.label_path) def cal_loss(self, pred, labels, pos_target, neg_target, batch_size, conf): if self.loss_type == 'hinge_loss': if self.sub_loss_type == 'all': loss = batch_all_triplet_loss(labels, pred, conf['margin']) else: loss = batch_hard_triplet_loss(labels, pred, conf['margin']) else: loss = get_loss(type = self.loss_type, logits = pred, labels = labels, **conf) return loss def create_model_fn(self): def model_fn(features, labels, mode, params): if not self.use_language_model: self.init_embedding() if self.tfrecords_mode == 'class': self.embed_query = self.embedding(features = features, name = 'x_query') else: self.embed_query = self.embedding(features = features, name = 'x_query') self.embed_sample = self.embedding(features = features, name = 'x_sample') else: self.embedding = None #model params self.encoder.keep_prob = params['keep_prob'] self.encoder.is_training = params['is_training'] global_step = tf.train.get_or_create_global_step() if self.sim_mode == 'cross': if not self.use_language_model: pred = self.encoder(x_query = self.embed_query, x_sample = self.embed_sample, features = features) else: pred = self.encoder(features = features) elif self.sim_mode == 'represent': if not self.use_language_model: #features['x_query_length'] = features['length'] pred = self.encoder(self.embed_query, name = 'x_query', features = features) else: pred = self.encoder(features = features) else: raise ValueError('unknown sim mode') pos_target = tf.ones(shape = [int(self.batch_size/2)], dtype = tf.float32) neg_target = tf.zeros(shape = [int(self.batch_size/2)], dtype = tf.float32) if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'pred': pred, 'label': features['label'] } return tf.estimator.EstimatorSpec(mode, predictions=predictions) loss = self.cal_loss(pred, labels, pos_target, neg_target, self.batch_size, self.conf) if mode == tf.estimator.ModeKeys.TRAIN: if self.use_clr: self.learning_rate = cyclic_learning_rate(global_step=global_step, learning_rate 
= self.learning_rate, mode = self.clr_mode) optimizer = get_train_op(global_step, self.optimizer_type, loss, self.learning_rate, clip_grad = 5) return tf.estimator.EstimatorSpec(mode, loss = loss, train_op=optimizer) if mode == tf.estimator.ModeKeys.EVAL: eval_metric_ops = {} #{"accuracy": tf.metrics.accuracy( # labels=labels, predictions=predictions["classes"])} return tf.estimator.EstimatorSpec( mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) return model_fn def create_input_fn(self, mode): n_cpu = multiprocessing.cpu_count() def train_input_fn(): if self.tfrecords_mode == 'pair': size = self.num_pair num_classes_per_batch = 2 num_sentences_per_class = self.batch_size // num_classes_per_batch else: size = self.num_class num_classes_per_batch = 16 num_sentences_per_class = self.batch_size // num_classes_per_batch filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path,i) \ for i in range(size)] logging.info("tfrecords train class num: {}".format(len(filenames))) datasets = [tf.data.TFRecordDataset(filename) for filename in filenames] datasets = [dataset.repeat() for dataset in datasets] #assert self.batch_size == num_sentences_per_class* num_classes_per_batch def generator(): while True: labels = np.random.choice(range(size), num_classes_per_batch, replace=False) for label in labels: for _ in range(num_sentences_per_class): yield label choice_dataset = tf.data.Dataset.from_generator(generator, tf.int64) dataset = tf.contrib.data.choose_from_datasets(datasets, choice_dataset) gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder), num_parallel_calls=n_cpu) dataset = dataset.batch(self.batch_size) dataset = dataset.prefetch(4*self.batch_size) iterator = dataset.make_one_shot_iterator() features, label = iterator.get_next() #test #sess = tf.Session() #features,label = sess.run([features,label]) #features['x_query_pred'] = [item.decode('utf-8') for item in # features['x_query_pred'][1]] return features, label def test_input_fn(mode): filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \ for i in range(self.num_class)] assert self.num_class == len(filenames), "the num of tfrecords file error!" 
logging.info("tfrecords test class num: {}".format(len(filenames))) dataset = tf.data.TFRecordDataset(filenames) gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder), num_parallel_calls=n_cpu) dataset = dataset.batch(self.batch_size) dataset = dataset.prefetch(1) iterator = dataset.make_one_shot_iterator() features, label = iterator.get_next() return features, label if mode == 'train': return train_input_fn elif mode == 'test': return lambda : test_input_fn("test") elif mode == 'label': return lambda : test_input_fn("train") else: raise ValueError("unknown input_fn type!") def train(self): params = { 'is_training': True, 'keep_prob': 0.5 } config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(), config = config, params = params) estimator.train(input_fn = self.create_input_fn("train"), max_steps = self.max_steps) self.save() def save(self): params = { 'is_training': False, 'keep_prob': 1 } config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(), config = config, params = params) def serving_input_receiver_fn(): x_query = tf.placeholder(dtype=tf.int64, shape=[None, self.maxlen], name='x_query') length = tf.placeholder(dtype=tf.int64, shape=[None], name='x_query_length') label = tf.placeholder(dtype=tf.int64, shape=[None], name='label') receiver_tensors = {'x_query': x_query, 'x_query_length': length, 'label': label} features = {'x_query': x_query, 'x_query_length': length, 'label': label} return tf.estimator.export.ServingInputReceiver(receiver_tensors, features) estimator.export_savedmodel( self.export_dir_path, # 目录 serving_input_receiver_fn, # 返回ServingInputReceiver的函数 assets_extra=None, as_text=False, checkpoint_path=None) def test(self): params = { 'is_training': False, 'keep_prob': 1 } config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(), config = config, params = params) predictions = estimator.predict(input_fn=self.create_input_fn("test")) predictions = list(predictions) predictions_vec = [item['pred'] for item in predictions] predictions_label = [item['label'] for item in predictions] if self.tfrecords_mode == 'class': refers = estimator.predict(input_fn=self.create_input_fn("label")) refers = list(refers) refers_vec = [item['pred'] for item in refers] refers_label = [item['label'] for item in refers] right = 0 thre_right = 0 sum = 0 scores = cosine_similarity(predictions_vec, refers_vec) max_id = np.argmax(scores, axis=-1) #max_id = self.knn(scores, predictions_label, refers_label) for idx, item in enumerate(max_id): if refers_label[item] == predictions_label[idx]: if scores[idx][item] > self.score_thre: thre_right += 1 right += 1 sum += 1 print("Acc:{}".format(float(right)/sum)) print("ThreAcc:{}".format(float(thre_right)/sum)) else: #TODO: 对于pair方式的评估 pdb.set_trace() def knn(self, scores, predictions_label, refers_label, k = 4): sorted_id = np.argsort(-scores, axis = -1) shape = np.shape(sorted_id) max_id = [] for idx in range(shape[0]): mp = defaultdict(int) for idy in range(k): mp[refers_label[int(sorted_id[idx][idy])]] += 1 max_id.append(max(mp,key=mp.get)) return max_id def test_unit(self, text): #######################init######################### if self.model_loaded == False: #添加不参与训练样本 if 
os.path.exists(self.no_train_path): csv = pd.read_csv(self.no_train_path, header = 0, sep=",", error_bad_lines=False) self.text_list += list(csv['text']) self.label_list += list(csv['target']) subdirs = [x for x in Path(self.export_dir_path).iterdir() if x.is_dir() and 'temp' not in str(x)] latest = str(sorted(subdirs)[-1]) self.predict_fn = predictor.from_saved_model(latest) self.init_embedding() self.model_loaded = True self.vec_list = self._get_vecs(self.predict_fn, self.text_list) #self.set_zdy_labels(['睡觉','我回家了','晚安','娃娃了','周杰伦','自然语言处理'], # ['打开情景模式','打开情景模式','打开情景模式', # '打开情景模式','打开情景模式','打开情景模式']) text_list = self.text_list vec_list = self.vec_list label_list = self.label_list #用于添加自定义问句(自定义优先) if self.zdy != {}: text_list = self.zdy['text_list'] + text_list vec_list = np.concatenate([self.zdy['vec_list'], self.vec_list], axis = 0) label_list = self.zdy['label_list'] + label_list vec = self._get_vecs(self.predict_fn, [text], need_preprocess = True) scores = cosine_similarity(vec, vec_list)[0] max_id = np.argmax(scores) max_score = scores[max_id] max_similar = text_list[max_id] logging.info("test result: {}, {}, {}".format(label_list[max_id], max_score, max_similar)) return label_list[max_id], max_score, max_id def set_zdy_labels(self, text_list, label_list): if len(text_list) == 0 or len(label_list) == 0: self.zdy = {} return self.zdy['text_list'] = text_list self.zdy['vec_list'] = self._get_vecs(self.predict_fn, text_list, need_preprocess = True) self.zdy['label_list'] = label_list def _get_vecs(self, predict_fn, text_list, need_preprocess = False): #根据batches数据生成向量 text_list_pred, x_query, x_query_length = self.embedding.text2id(text_list, self.vocab_dict, need_preprocess) label = [0 for _ in range(len(text_list))] predictions = predict_fn({'x_query': x_query, 'x_query_length': x_query_length, 'label': label}) return predictions['pred']
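# --- Illustrative sketch of the retrieval-style evaluation in Match.test / test_unit
# above: every test sentence is encoded, compared against the encoded reference set
# with cosine similarity, and counted as correct when its nearest neighbour carries
# the same label (optionally only above a score threshold). retrieval_accuracy is a
# hypothetical helper, not part of the class.
def retrieval_accuracy(pred_vecs, pred_labels, refer_vecs, refer_labels,
                       score_thre=0.8):
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    scores = cosine_similarity(pred_vecs, refer_vecs)   # [num_test, num_refer]
    nearest = np.argmax(scores, axis=-1)
    right = sum(1 for i, j in enumerate(nearest)
                if refer_labels[j] == pred_labels[i])
    thre_right = sum(1 for i, j in enumerate(nearest)
                     if refer_labels[j] == pred_labels[i]
                     and scores[i][j] > score_thre)
    total = len(nearest)
    return right / total, thre_right / total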
class NER(TaskBase):
    def __init__(self, conf):
        super(NER, self).__init__(conf)
        self.task_type = 'ner'
        self.conf = conf
        self.read_data()
        #if self.maxlen == -1:
        #    self.maxlen = max([len(text.split()) for text in self.text_list])
        #model params
        params = conf
        params.update({
            "maxlen": self.maxlen,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_class,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**params)

    def read_data(self):
        self.pre = Preprocess()
        self.util = NERUtil()
        self.text_list, self.label_list = self.util.load_ner_data(self.ori_path)
        self.text_list = [self.pre.get_dl_input_by_text(text)
                          for text in self.text_list]
        self.num_class = self.num_output = \
            len(set(list(chain.from_iterable(self.label_list))))
        self.data_type = 'column_2'

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            seq_len = features['x_query_length']
            global_step = tf.train.get_or_create_global_step()
            ################ encode ##################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                embed = self.embedding(features=features, name='x_query')
                out = self.encoder(embed, 'x_query', features=features,
                                   middle_flag=True)
            else:
                out = self.encoder(features=features)
            logits = tf.reshape(out, [-1, int(out.shape[1]), self.num_class])
            transition_params = tf.get_variable(
                'crf', [self.num_class, self.num_class], dtype=tf.float32)
            pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params,
                                                    seq_len)
            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'logit': logits,
                    'pred_ids': pred_ids,
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            else:
                ############### loss ####################
                log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
                    logits, labels, seq_len, transition_params)
                loss = -tf.reduce_mean(log_likelihood)
                if mode == tf.estimator.ModeKeys.TRAIN:
                    return self.train_estimator_spec(mode, loss, global_step,
                                                     params)
                if mode == tf.estimator.ModeKeys.EVAL:
                    #pdb.set_trace()
                    weights = tf.sequence_mask(seq_len, self.maxlen)
                    metrics = {'acc': tf.metrics.accuracy(labels, pred_ids,
                                                          weights)}
                    #metrics = {'acc': tf.metrics.accuracy(labels, pred_ids)}
                    return tf.estimator.EstimatorSpec(mode, loss=loss,
                                                      eval_metric_ops=metrics)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith('train')]
            if len(filenames) == 0:
                logging.warn("Can't find any tfrecords file for train, prepare now!")
                self.prepare()
                filenames = [os.path.join(self.tfrecords_path, item)
                             for item in os.listdir(self.tfrecords_path)
                             if item.startswith('train')]
            dataset = tf.data.TFRecordDataset(filenames)
            dataset = dataset.repeat()
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        def test_input_fn(mode):
            filenames = [os.path.join(self.tfrecords_path, item)
                         for item in os.listdir(self.tfrecords_path)
                         if item.startswith(mode)]
            assert len(filenames) > 0, "Can't find any tfrecords file for %s!" % mode
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def save(self):
        params = {
            'is_training': False,
            'keep_prob': 1
        }

        def get_features():
            features = {'x_query': tf.placeholder(dtype=tf.int64,
                                                  shape=[None, self.maxlen],
                                                  name='x_query'),
                        'x_query_length': tf.placeholder(dtype=tf.int64,
                                                         shape=[None],
                                                         name='x_query_length'),
                        }
                        #'label': tf.placeholder(dtype=tf.int64,
                        #                        shape=[None],
                        #                        name='label')}
            features.update(self.encoder.get_features())
            return features

        self.save_model(self.create_model_fn(), params, get_features)

    def train(self):
        params = {
            'is_training': True,
            'keep_prob': 0.7
        }
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def test(self, mode='test'):
        params = {
            'is_training': False,
            'keep_prob': 1
        }
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        if mode == 'dev':
            estimator.evaluate(input_fn=self.create_input_fn('dev'))
        elif mode == 'test':
            estimator.evaluate(input_fn=self.create_input_fn('test'))
        else:
            raise ValueError("unknown mode:[%s]" % mode)
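# Minimal usage sketch for the NER task (assumptions: `conf` is the parsed task
# config and provides at least ori_path, tfrecords_path, checkpoint_path,
# maxlen, batch_size and the embedding/encoder settings referenced above; the
# exact key set comes from the project's config files and is not shown here).
def _ner_usage_sketch(conf):
    task = NER(conf)        # builds the encoder and reads the labelled data
    task.train()            # trains via the estimator, then exports via save()
    task.test(mode='test')  # evaluates masked token accuracy with the CRF decoder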
class Match(object):
    def __init__(self, conf):
        self.task_type = 'match'
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.pre = Preprocess()
        self.model_loaded = False
        self.zdy = {}
        csv = pd.read_csv(self.ori_path, header=0, sep=",",
                          error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        self.num_class = len(set(self.label_list))
        logging.info(
            f">>>>>>>>>>>> class num:{self.num_class} <<<<<<<<<<<<<<<")
        self.text_list = [self.pre.get_dl_input_by_text(text)
                          for text in self.text_list]
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def prepare(self):
        vocab_dict = embedding[self.embedding_type].build_dict(
            dict_path=self.dict_path,
            text_list=self.text_list,
            mode=self.mode)
        text2id = embedding[self.embedding_type].text2id
        self.gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
        self.gt.process(self.text_list, self.label_list, text2id,
                        self.encoder.encoder_fun, vocab_dict,
                        self.tfrecords_path, self.label_path, self.test_size)
        logging.info("tfrecords generated!")

    def create_model_fn(self):
        def init_embedding():
            vocab_dict = embedding[self.embedding_type].build_dict(
                dict_path=self.dict_path,
                text_list=self.text_list,
                mode=self.mode)
            return embedding[self.embedding_type](
                text_list=self.text_list,
                vocab_dict=vocab_dict,
                dict_path=self.dict_path,
                random=self.rand_embedding,
                maxlen=self.maxlen,
                batch_size=self.batch_size,
                embedding_size=self.embedding_size,
                conf=self.conf)

        def cal_loss(pred, labels, batch_size, conf):
            if self.sim_mode == 'represent':
                pos_scores, neg_scores = batch_hard_triplet_scores(
                    labels, pred)  # pos/neg scores
                pos_scores = tf.squeeze(pos_scores, -1)
                neg_scores = tf.squeeze(neg_scores, -1)
                #for represent:
                #  pred is a batch of tensors whose size > 1
                #  we can use triplet loss (hinge loss) or contrastive loss
                #if we use hinge loss, we don't need labels
                #if we use another loss (contrastive loss), we need to define
                #pos/neg targets first
                if self.loss_type == 'hinge_loss':
                    #pairwise
                    loss = get_loss(type=self.loss_type,
                                    pos_logits=pos_scores,
                                    neg_logits=neg_scores,
                                    is_distance=True,
                                    **conf)
                else:
                    #pointwise
                    pos_target = tf.ones(shape=[self.batch_size // 2],
                                         dtype=tf.float32)
                    neg_target = tf.zeros(shape=[self.batch_size // 2],
                                          dtype=tf.float32)
                    pos_loss = get_loss(type=self.loss_type,
                                        logits=pos_scores,
                                        labels=pos_target,
                                        **conf)
                    neg_loss = get_loss(type=self.loss_type,
                                        logits=neg_scores,
                                        labels=neg_target,
                                        **conf)
                    loss = pos_loss + neg_loss
            elif self.sim_mode == 'cross':
                #for cross:
                #  pred is a batch of tensors whose size == 1
                #pdb.set_trace()
                if self.loss_type == 'hinge_loss':
                    #pairwise
                    if self.num_output == 1:
                        pred = tf.nn.sigmoid(pred)
                    elif self.num_output == 2:
                        pred = tf.nn.softmax(pred)[:, 0]
                        pred = tf.expand_dims(pred, -1)
                    else:
                        raise ValueError(
                            'unsupported num_output, 1(sigmoid) or 2(softmax)?')
                    pos_scores = tf.strided_slice(pred, [0], [batch_size], [2])
                    neg_scores = tf.strided_slice(pred, [1], [batch_size], [2])
                    loss = get_loss(type=self.loss_type,
                                    pos_logits=pos_scores,
                                    neg_logits=neg_scores,
                                    is_distance=False,
                                    **conf)
                elif self.loss_type in ['sigmoid_loss']:
                    #pointwise
                    #labels = tf.stack([labels, 1-labels], axis = -1)
                    loss = get_loss(type=self.loss_type,
                                    logits=pred,
                                    labels=labels,
                                    **conf)
                else:
                    raise ValueError('unsupported loss for cross match')
            else:
                raise ValueError('unknown sim mode, cross or represent?')
            return loss

        def model_fn(features, labels, mode, params):
            ############# embedding #################
            if not self.use_language_model:
                self.embedding = init_embedding()
                if self.tfrecords_mode == 'class':
                    self.embed_query = self.embedding(features=features,
                                                      name='x_query')
                else:
                    self.embed_query = self.embedding(features=features,
                                                      name='x_query')
                    self.embed_sample = self.embedding(features=features,
                                                       name='x_sample')
            else:
                self.embedding = None
            ############# encoder #################
            #model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()
            if self.sim_mode == 'cross':
                if not self.use_language_model:
                    output = self.encoder(x_query=self.embed_query,
                                          x_sample=self.embed_sample,
                                          features=features)
                else:
                    output = self.encoder(features=features)
            elif self.sim_mode == 'represent':
                if not self.use_language_model:
                    #features['x_query_length'] = features['length']
                    output = self.encoder(self.embed_query,
                                          name='x_query',
                                          features=features)
                else:
                    output = self.encoder(features=features)
            else:
                raise ValueError('unknown sim mode')
            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                #pdb.set_trace()
                predictions = {
                    'encode': output,
                    'pred': tf.cast(tf.greater(tf.nn.softmax(output)[:, 0], 0.5),
                                    tf.int32) if self.num_output == 2 else
                            tf.cast(tf.greater(tf.nn.sigmoid(output), 0.5),
                                    tf.int32),
                    'score': tf.nn.softmax(output)[:, 0]
                             if self.num_output == 2 else tf.nn.sigmoid(output),
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            ############### loss ##################
            loss = cal_loss(output, labels, self.batch_size, self.conf)
            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                if self.use_clr:
                    self.learning_rate = cyclic_learning_rate(
                        global_step=global_step,
                        learning_rate=self.learning_rate,
                        mode=self.clr_mode)
                optimizer = get_train_op(global_step, self.optimizer_type, loss,
                                         self.learning_rate, clip_grad=5)
                return tf.estimator.EstimatorSpec(mode, loss=loss,
                                                  train_op=optimizer)
            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                #{"accuracy": tf.metrics.accuracy(
                #    labels=labels, predictions=predictions["classes"])}
                return tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=loss,
                    eval_metric_ops=eval_metric_ops)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            if self.tfrecords_mode == 'pair':
                num_sentences_per_class = 4
                num_classes_per_batch = self.batch_size // num_sentences_per_class
            else:
                #size = self.num_class
                num_classes_per_batch = 16
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            #filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path, i)
            #             for i in range(size)]
            filenames = [
                os.path.join(self.tfrecords_path, item)
                for item in os.listdir(self.tfrecords_path)
                if item.startswith('train')
            ]
            size = len(filenames)
            logging.info("tfrecords train class num: {}".format(size))
{}".format(size)) datasets = [ tf.data.TFRecordDataset(filename) for filename in filenames ] datasets = [dataset.repeat() for dataset in datasets] #datasets = [dataset.shuffle(buffer_size=1000) for dataset in datasets] def generator(): while True: labels = np.random.choice(range(size), num_classes_per_batch, replace=False) for label in labels: for _ in range(num_sentences_per_class): yield label choice_dataset = tf.data.Dataset.from_generator( generator, tf.int64) dataset = tf.contrib.data.choose_from_datasets( datasets, choice_dataset) gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) dataset = dataset.map( lambda record: gt.parse_record(record, self.encoder), num_parallel_calls=n_cpu) dataset = dataset.batch(self.batch_size) dataset = dataset.prefetch(4 * self.batch_size) iterator = dataset.make_one_shot_iterator() features, label = iterator.get_next() ##test #pdb.set_trace() #sess = tf.Session() #features1,label1 = sess.run([features,label]) #features1['x_query_pred'] = [item.decode('utf-8') for item in features1['x_query_pred'][1]] #features1['x_sample_pred'] = [item.decode('utf-8') for item in features1['x_sample_pred'][1]] return features, label def test_input_fn(mode): filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \ for i in range(self.num_class * self.test_size)] assert self.num_class == len( filenames), "the num of tfrecords file error!" logging.info("tfrecords test class num: {}".format(len(filenames))) dataset = tf.data.TFRecordDataset(filenames) gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen) dataset = dataset.map( lambda record: gt.parse_record(record, self.encoder), num_parallel_calls=n_cpu) dataset = dataset.batch(self.batch_size) dataset = dataset.prefetch(1) iterator = dataset.make_one_shot_iterator() features, label = iterator.get_next() return features, label if mode == 'train': return train_input_fn elif mode == 'test': return lambda: test_input_fn("test") elif mode == 'label': return lambda: test_input_fn("train") else: raise ValueError("unknown input_fn type!") def train(self): params = {'is_training': True, 'keep_prob': 0.5} config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(), config=config, params=params) estimator.train(input_fn=self.create_input_fn("train"), max_steps=self.max_steps) self.save() def save(self): params = {'is_training': False, 'keep_prob': 1} config = tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(), config=config, params=params) def serving_input_receiver_fn(): features = { 'x_query': tf.placeholder(dtype=tf.int64, shape=[None, self.maxlen], name='x_query'), 'x_query_length': tf.placeholder(dtype=tf.int64, shape=[None], name='x_query_length'), 'label': tf.placeholder(dtype=tf.int64, shape=[None], name='label') } if self.tfrecords_mode == 'pair': features.update({ 'x_sample': tf.placeholder(dtype=tf.int64, shape=[None, self.maxlen], name='x_sample'), 'x_sample_length': tf.placeholder(dtype=tf.int64, shape=[None], name='x_sample_length') }) features.update(self.encoder.get_features()) return tf.estimator.export.ServingInputReceiver(features, features) estimator.export_savedmodel( self.export_dir_path, # 目录 serving_input_receiver_fn, # 返回ServingInputReceiver的函数 assets_extra=None, as_text=False, checkpoint_path=None) def test(self): params = {'is_training': False, 'keep_prob': 1} config = 
tf.estimator.RunConfig(tf_random_seed=230, model_dir=self.checkpoint_path) estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(), config=config, params=params) predictions = estimator.predict(input_fn=self.create_input_fn("test")) predictions = list(predictions) if self.tfrecords_mode == 'class': predictions_vec = [item['encode'] for item in predictions] predictions_label = [item['label'] for item in predictions] refers = estimator.predict(input_fn=self.create_input_fn("label")) refers = list(refers) refers_vec = [item['encode'] for item in refers] refers_label = [item['label'] for item in refers] right = 0 thre_right = 0 sum = 0 scores = euclidean_distances(predictions_vec, refers_vec) selected_ids = np.argmin(scores, axis=-1) for idx, item in enumerate(selected_ids): if refers_label[item] == predictions_label[idx]: if scores[idx][item] > self.score_thre: thre_right += 1 right += 1 sum += 1 print("Acc:{}".format(float(right) / sum)) print("ThreAcc:{}".format(float(thre_right) / sum)) else: #对于pair方式的评估 scores = [item['score'] for item in predictions] labels = [item['label'] for item in predictions] #pdb.set_trace() #predictions scores = np.reshape(scores, [self.num_class * self.test_size, -1]) pred_max_ids = np.argmax(scores, axis=-1) #label labels = np.reshape(labels, [self.num_class, -1]) right = 0 for idx, max_id in enumerate(pred_max_ids): if labels[idx][max_id] == 1: right += 1 sum = len(pred_max_ids) print("Acc:{}".format(float(right) / sum)) def knn(self, scores, predictions_label, refers_label, k=4): sorted_id = np.argsort(-scores, axis=-1) shape = np.shape(sorted_id) max_id = [] for idx in range(shape[0]): mp = defaultdict(int) for idy in range(k): mp[refers_label[int(sorted_id[idx][idy])]] += 1 max_id.append(max(mp, key=mp.get)) return max_id
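# The Match training batches above are class-balanced: each batch draws
# `num_classes_per_batch` classes and `num_sentences_per_class` sentences per
# class, which is what batch_hard_triplet_scores needs to find positives and
# negatives inside a single batch. A minimal sketch of that sampling pattern in
# plain NumPy (illustrative only: the helper name and default sizes are not
# part of this project; the real pipeline feeds this label stream into
# tf.contrib.data.choose_from_datasets to pick records from per-class files).
def _balanced_label_sketch(num_class=10, num_classes_per_batch=4,
                           num_sentences_per_class=2):
    import numpy as np
    labels = np.random.choice(range(num_class), num_classes_per_batch,
                              replace=False)
    batch = [int(label) for label in labels
             for _ in range(num_sentences_per_class)]
    return batch  # e.g. [3, 3, 7, 7, 0, 0, 9, 9]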
class Classify(object):
    def __init__(self, conf):
        self.conf = conf
        self.task_type = 'classify'
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.is_training = tf.placeholder(tf.bool, [], name="is_training")
        self.global_step = tf.Variable(0, trainable=False)
        self.keep_prob = tf.where(self.is_training, 0.5, 1.0)
        self.pre = Preprocess()
        self.text_list, self.label_list = load_classify_data(self.train_path)
        self.text_list = [self.pre.get_dl_input_by_text(text)
                          for text in self.text_list]
        if not self.use_language_model:
            #build vocabulary map using training data
            self.vocab_dict = embedding[self.embedding_type].build_dict(
                dict_path=self.dict_path,
                text_list=self.text_list)
            #define embedding object by embedding_type
            self.embedding = embedding[self.embedding_type](
                text_list=self.text_list,
                vocab_dict=self.vocab_dict,
                dict_path=self.dict_path,
                random=self.rand_embedding,
                batch_size=self.batch_size,
                maxlen=self.maxlen,
                embedding_size=self.embedding_size,
                conf=self.conf)
            self.embed = self.embedding(name='x')
        self.y = tf.placeholder(tf.int32, [None], name="y")
        #model params
        params = conf
        params.update({
            "maxlen": self.maxlen,
            "embedding_size": self.embedding_size,
            "keep_prob": self.keep_prob,
            "batch_size": self.batch_size,
            "num_output": self.num_class,
            "is_training": self.is_training
        })
        self.encoder = encoder[self.encoder_type](**params)
        if not self.use_language_model:
            self.out = self.encoder(self.embed)
        else:
            self.out = self.encoder()
        self.output_nodes = self.out.name.split(':')[0]
        self.loss(self.out)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        if self.use_language_model:
            tvars = tf.trainable_variables()
            init_checkpoint = conf['init_checkpoint_path']
            (assignment_map, initialized_variable_names) = \
                get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    def load_data(self, mode='train'):
        logging.info("Building dataset...")
        if mode == 'train':
            class_mp, class_mp_rev = generate_class_mp(self.label_list,
                                                       self.classes_path)
            y = [class_mp[item] for item in self.label_list]
            train_x, valid_x, train_y, valid_y = \
                train_test_split(self.text_list, y, test_size=0.05)
            return zip(train_x, train_y), zip(valid_x, valid_y)
        else:
            class_mp, class_mp_rev = load_class_mp(self.classes_path)
            text_list, label_list = load_classify_data(self.test_path)
            y = [class_mp[item] for item in label_list]
            return text_list, y

    def loss(self, out):
        with tf.name_scope("output"):
            self.scores = tf.nn.softmax(out, axis=1, name="scores")
            self.predictions = tf.argmax(self.scores, -1,
                                         output_type=tf.int32,
                                         name='predictions')
        with tf.name_scope("loss"):
            #self.loss = tf.reduce_mean(
            #    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=out, labels=self.y))
            self.loss = get_loss(type=self.loss_type, logits=out,
                                 labels=self.y, labels_sparse=True, **self.conf)
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(
                self.loss, global_step=self.global_step)
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, self.y)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"),
                                           name="accuracy")

    def train(self):
        logging.info("---------start train---------")
        self.train_data, self.valid_data = self.load_data(mode='train')
        self.train_data = list(self.train_data)
        self.valid_data = list(self.valid_data)
        train_batches = batch_iter(self.train_data, self.batch_size,
                                   self.num_epochs)
        num_batches_per_epoch = (len(self.train_data) - 1) // self.batch_size + 1
        max_accuracy = -1
        for batch in train_batches:
            x_batch, y_batch = zip(*batch)
            train_feed_dict = {
                self.y: y_batch,
                self.is_training: True
            }
            if not self.use_language_model:
                _, x_batch, len_batch = self.embedding.text2id(
                    x_batch, self.vocab_dict, need_preprocess=False)
                train_feed_dict.update(self.embedding.feed_dict(x_batch, 'x'))
                train_feed_dict.update(self.encoder.feed_dict(len=len_batch))
            else:
                train_feed_dict.update(self.encoder.feed_dict(x_batch))
            _, step, loss = self.sess.run(
                [self.optimizer, self.global_step, self.loss],
                feed_dict=train_feed_dict)
            if step % (self.valid_step / 10) == 0:
                logging.info("step {0}: loss = {1}".format(step, loss))
            if step % self.valid_step == 0:
                # Test accuracy with validation data for each epoch.
                valid_batches = batch_iter(self.valid_data, self.batch_size, 1,
                                           shuffle=False)
                sum_accuracy, cnt = 0, 0
                for valid_batch in valid_batches:
                    valid_x_batch, valid_y_batch = zip(*valid_batch)
                    valid_feed_dict = {
                        self.y: valid_y_batch,
                        self.is_training: False
                    }
                    if not self.use_language_model:
                        _, valid_x_batch, len_batch = self.embedding.text2id(
                            valid_x_batch, self.vocab_dict,
                            need_preprocess=False)
                        valid_feed_dict.update(
                            self.embedding.feed_dict(valid_x_batch, 'x'))
                        valid_feed_dict.update(
                            self.encoder.feed_dict(len=len_batch))
                    else:
                        valid_feed_dict.update(
                            self.encoder.feed_dict(valid_x_batch))
                    accuracy = self.sess.run(self.accuracy,
                                             feed_dict=valid_feed_dict)
                    sum_accuracy += accuracy
                    cnt += 1
                valid_accuracy = sum_accuracy / cnt
                logging.info("\nEpoch {0}: Validation Accuracy = {1}\n".format(
                    step // num_batches_per_epoch, valid_accuracy))
                # Save model
                if valid_accuracy > max_accuracy:
                    max_accuracy = valid_accuracy
                    self.saver.save(self.sess,
                                    "{0}/{1}.ckpt".format(self.checkpoint_path,
                                                          self.task_type),
                                    global_step=step)
                    logging.info("Model is saved.\n")
                else:
                    self.save_pb()
                    logging.info(f"train finished! accuracy: {max_accuracy}")
                    sys.exit(0)
accuracy: {max_accuracy}") sys.exit(0) def save_pb(self): write_pb(self.checkpoint_path, self.model_path, ['is_training','output/predictions','accuracy/accuracy',self.output_nodes]) def test(self): if not os.path.exists(self.model_path): self.save_pb() graph = load_pb(self.model_path) sess = tf.Session(graph=graph) self.y = graph.get_operation_by_name("y").outputs[0] self.is_training = graph.get_operation_by_name("is_training").outputs[0] self.accuracy = graph.get_operation_by_name("accuracy/accuracy").outputs[0] self.scores = graph.get_tensor_by_name("output/scores:0") #self.scores = graph.get_tensor_by_name(self.output_nodes+":0") self.predictions = graph.get_tensor_by_name("output/predictions:0") mp, mp_rev = load_class_mp(self.classes_path) test_x, test_y = self.load_data("test") pred_y = [] scores = [] batches = batch_iter(zip(test_x, test_y), self.batch_size, 1, shuffle=False) sum_accuracy, cnt = 0, 0 right, all = 0, 0 vocab_dict = embedding[self.embedding_type].build_dict(self.dict_path, mode = 'test') all_test_x = [] all_test_y = [] for batch in batches: batch_x, batch_y = zip(*batch) feed_dict = { self.y: batch_y, self.is_training: False } if not self.use_language_model: preprocess_x, batch_x_id, len_batch = self.embedding.text2id(batch_x, vocab_dict, need_preprocess = True) feed_dict.update(self.embedding.pb_feed_dict(graph, batch_x_id, 'x')) feed_dict.update(self.encoder.pb_feed_dict(graph, len = len_batch)) else: feed_dict.update(self.encoder.pb_feed_dict(graph, batch_x)) accuracy_out, predictions_out, scores_out = sess.run([self.accuracy, self.predictions, self.scores], feed_dict=feed_dict) max_scores = [scores_out[idx][predictions_out[idx]] \ for idx in range(len(predictions_out))] sum_accuracy += accuracy_out cnt += 1 pred_y += list(predictions_out) scores += list(max_scores) all_test_x += list(batch_x) all_test_y += list(batch_y) for idx in range(len(predictions_out)): if predictions_out[idx] == int(batch_y[idx]) and max_scores[idx]> self.thre_score: right += 1 all += 1 dt = pd.DataFrame({'text': all_test_x, 'target': [mp_rev[int(item)] for item in all_test_y] , 'pred': [mp_rev[item] for item in pred_y], 'score': scores }) dt.to_csv(self.test_path+'.result.csv',index=False,sep=',') logging.info("Test Accuracy : {0}".format(sum_accuracy / cnt)) logging.info("Test Thre Accuracy : {0}".format(right / all)) def predict(self): predict_file = self.predict_path if not os.path.exists(self.model_path): self.save_pb() graph = load_pb(self.model_path) sess = tf.Session(graph=graph) self.y = graph.get_operation_by_name("y").outputs[0] self.is_training = graph.get_operation_by_name("is_training").outputs[0] #self.scores = graph.get_tensor_by_name(self.output_nodes+":0") self.scores = graph.get_tensor_by_name("output/scores:0") self.predictions = graph.get_tensor_by_name("output/predictions:0") vocab_dict = embedding[self.embedding_type].build_dict(self.dict_path,mode = 'test') mp, mp_rev = load_class_mp(self.classes_path) with open(predict_file) as f: lines = [line.strip() for line in f.readlines()] batches = batch_iter(lines, self.batch_size, 1, shuffle=False) scores = [] predicts = [] for batch_x in batches: feed_dict = { self.is_training: False } if not self.use_language_model: preprocess_x, batch_x, len_batch = self.embedding.text2id(batch_x, vocab_dict) feed_dict.update(self.embedding.pb_feed_dict(graph, batch_x, 'x')) feed_dict.update(self.encoder.pb_feed_dict(graph, len = len_batch)) else: feed_dict.update(self.encoder.pb_feed_dict(graph, batch_x)) predictions_out, scores_out = 
sess.run([self.predictions, self.scores], feed_dict=feed_dict) max_scores = [scores_out[idx][predictions_out[idx]] \ for idx in range(len(predictions_out))] predicts += list(predictions_out) scores += list(max_scores) predicts = [mp_rev[item] for item in predicts] dt = pd.DataFrame({'text': lines, 'pred': predicts, 'score': scores }) dt.to_csv(self.predict_path+'.result.csv',index=False,sep=',') def test_unit(self, text): if not os.path.exists(self.model_path): self.save_pb() graph = load_pb(self.model_path) sess = tf.Session(graph=graph) self.y = graph.get_operation_by_name("y").outputs[0] self.is_training = graph.get_operation_by_name("is_training").outputs[0] self.scores = graph.get_tensor_by_name("output/scores:0") #self.scores = graph.get_tensor_by_name(self.output_nodes+":0") self.predictions = graph.get_tensor_by_name("output/predictions:0") vocab_dict = embedding[self.embedding_type].build_dict(self.dict_path,mode = 'test') mp, mp_rev = load_class_mp(self.classes_path) batches = batch_iter([text], self.batch_size, 1, shuffle=False) for batch_x in batches: feed_dict = { self.is_training: False } if not self.use_language_model: preprocess_x, batch_x, len_batch = self.embedding.text2id(batch_x, vocab_dict) feed_dict.update(self.embedding.pb_feed_dict(graph, batch_x, 'x')) feed_dict.update(self.encoder.pb_feed_dict(graph, len = len_batch)) else: feed_dict.update(self.encoder.pb_feed_dict(graph, batch_x)) predictions_out, scores_out = sess.run([self.predictions, self.scores], feed_dict=feed_dict) max_scores = [scores_out[idx][predictions_out[idx]] \ for idx in range(len(predictions_out))] logging.info("preprocess: {}".format(preprocess_x)) logging.info("class:{}, score:{}, class_id:{}".format( mp_rev[predictions_out[0]], max_scores[0], predictions_out[0])) return mp_rev[predictions_out[0]], max_scores[0]
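# Minimal usage sketch for this session-based Classify task (assumptions:
# `conf` supplies the attributes referenced above, e.g. train_path, test_path,
# dict_path, classes_path, checkpoint_path, model_path, batch_size, maxlen and
# the embedding/encoder settings; the exact keys live in the project's config
# files and are not shown here).
def _classify_usage_sketch(conf):
    clf = Classify(conf)  # builds the graph and the tf.Session
    # clf.train()         # train + validate; note train() exits the process when done
    clf.test()            # batch evaluation on test_path, writes *.result.csv
    return clf.test_unit("sample input text")  # (label, score) for one query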