def check_intent():
    if request.method == "POST":
        global model
        data_reader = DatasetReader(use_pos_feature=False,
                                    use_bert=False,
                                    use_name_feature=False)
        query = request.form.get('input')
        if query.strip() == '':
            return jsonify({'message': '无效查询', 'status': 0})  # '无效查询' = invalid query
        data_reader.load_single_input(query)
        # print(vocab.get_char_vocab_size())
        data_reader.convert_to_ids(vocab)
        predict_label, sample = model.inference(data_reader, 1)
        predict_label = predict_label[0]
        print("predict label " + str(predict_label))
        print({
            'query': query,
            'status': str(1),
            # '涉政' = politics-related, '非涉政查询' = non-political query
            'message': '涉政' if predict_label == 1 else '非涉政查询'
        })
        return jsonify({
            'query': query,
            'is_politic': str(predict_label),
            'status': 1,
            'message': '涉政' if predict_label == 1 else '非涉政查询'
        })
    if request.method == 'GET':
        data_reader = DatasetReader(use_pos_feature=False,
                                    use_bert=False,
                                    use_name_feature=False)
        query = request.args.get('query')
        data_reader.load_single_input(query)
        print(vocab.get_char_vocab_size())
        data_reader.convert_to_ids(vocab)
        model = TextCNN(vocab,
                        num_class=2,
                        pretrained_word_embedding=vocab.embeddings,
                        word_embedding_size=300)
        model.load(
            "/search/odin/jdwu/classification/cls_checkpoints/politic/best_weights"
        )
        predict_label, sample = model.inference(data_reader, 1)
        predict_label = predict_label[0]
        return jsonify({
            'query': request.args.get('query'),
            'is_politic': predict_label,
            'status': 1,
            'message': '涉政' if predict_label == 1 else '非涉政查询'
        })
def experiment_with_imdb():
    train_texts, train_label, test_texts, test_label = utils.load_imdb()
    config = {
        'MAX_NUM_WORDS': 15000,
        'MAX_TEXT_LEN': 500,
        'NUM_CLASSES': 2,
        'FILTER_SIZES': [2, 3, 4, 5],
        'FILTER_NUM': 200,
        'EMBED_DROPOUT': 0.3,
        'DENSE_DROPOUT': 0.5,
        'BATCH_SIZE': 64,
        'EPOCHS': 10,
    }
    tokenizer = Tokenizer(num_words=config['MAX_NUM_WORDS'])
    tokenizer.fit_on_texts(train_texts)
    train_texts = tokenizer.texts_to_sequences(train_texts)
    test_texts = tokenizer.texts_to_sequences(test_texts)
    x_train = pad_sequences(train_texts, maxlen=config['MAX_TEXT_LEN'])
    x_test = pad_sequences(test_texts, maxlen=config['MAX_TEXT_LEN'])
    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      train_label,
                                                      train_size=0.8,
                                                      random_state=2018)
    matrix = create_glove_embeddings(
        embed_file='../datasets/glove.840B.300d.txt',
        word_index=tokenizer.word_index,
        max_num_words=config['MAX_NUM_WORDS'])
    model = TextCNN(matrix,
                    maxlen=config['MAX_TEXT_LEN'],
                    num_classes=config['NUM_CLASSES'],
                    filter_sizes=config['FILTER_SIZES'],
                    filter_num=config['FILTER_NUM'],
                    embed_dropout=config['EMBED_DROPOUT'],
                    dense_dropout=config['DENSE_DROPOUT'])
    model.fit(x=x_train,
              y=y_train,
              epochs=config['EPOCHS'],
              batch_size=config['BATCH_SIZE'],
              validation_data=(x_val, y_val),
              save_model=True)
    model.load_weight('../tmp/text_cnn')
    test_pred = model.predict(x_test)
    from sklearn.metrics import accuracy_score
    print('acc on test data: {}'.format(accuracy_score(test_label, test_pred)))
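# create_glove_embeddings() is defined elsewhere in this project; the sketch
# below is only a hypothetical illustration of what such a helper typically
# does. The signature is taken from the call above; the body and the
# embed_dim default are assumptions, not the project's implementation:
# parse the GloVe text file and fill a (max_num_words, embed_dim) matrix,
# leaving out-of-vocabulary rows as zero vectors.
import numpy as np


def create_glove_embeddings(embed_file, word_index, max_num_words, embed_dim=300):
    vectors = {}
    with open(embed_file, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            # some GloVe tokens contain spaces, so take the last embed_dim
            # fields as the vector and everything before them as the token
            word = ' '.join(parts[:-embed_dim])
            vectors[word] = np.asarray(parts[-embed_dim:], dtype='float32')
    matrix = np.zeros((max_num_words, embed_dim))
    for word, idx in word_index.items():
        if idx < max_num_words and word in vectors:
            matrix[idx] = vectors[word]
    return matrix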
def train(batches, test_data, sequence_length, num_classes, vocab_size,
          embedding_size, filter_sizes, num_filters, l2_reg_lambda):
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default(), tf.device('gpu:0'):
            cnn = TextCNN(sequence_length=sequence_length,
                          num_classes=num_classes,
                          vocab_size=vocab_size,
                          filter_sizes=filter_sizes,
                          num_filters=num_filters,
                          embedding_size=embedding_size,
                          l2_reg_lambda=l2_reg_lambda)

            # Checkpoint directory.
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, 'runs', timestamp))
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, 'checkpoints'))
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

            # Generate batches
            cnn.train(batches, test_data, sess)
            path = saver.save(sess, checkpoint_prefix)
            print("Saved model checkpoint to {}\n".format(path))
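# Hypothetical restore sketch (not part of the original script): a checkpoint
# written by train() above can be reloaded for inference by importing the
# saved meta-graph and restoring the weights. restore_for_inference and its
# checkpoint_prefix argument (the runs/<timestamp>/checkpoints/model prefix)
# are illustrative names, not project APIs.
def restore_for_inference(checkpoint_prefix):
    graph = tf.Graph()
    with graph.as_default():
        # rebuild the graph structure from the .meta file saved by tf.train.Saver
        restorer = tf.train.import_meta_graph(checkpoint_prefix + '.meta')
    sess = tf.Session(graph=graph)
    restorer.restore(sess, checkpoint_prefix)
    # tensors can then be looked up via graph.get_tensor_by_name(...)
    return sess, graph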
def build_model(params):
    if params.model == 'cnn':
        model = TextCNN(max_sequence_length=params.padding_size,
                        max_token_num=params.vocab_size,
                        embedding_dim=params.embed_size,
                        output_dim=params.num_classes)
        model.compile(tf.optimizers.Adam(learning_rate=params.learning_rate),
                      loss='binary_crossentropy',
                      metrics=[micro_f1, macro_f1])
    else:
        # only the 'cnn' model type is implemented; fail loudly instead of
        # falling through with `model` undefined
        raise ValueError('Unsupported model type: {}'.format(params.model))
    model.summary()
    return model
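# Hypothetical usage sketch: the attribute names mirror what build_model reads
# (model, padding_size, vocab_size, embed_size, num_classes, learning_rate);
# the values are placeholders, and TextCNN, micro_f1 and macro_f1 are assumed
# to be defined elsewhere in the project as above.
from types import SimpleNamespace

params = SimpleNamespace(model='cnn',
                         padding_size=128,
                         vocab_size=30000,
                         embed_size=300,
                         num_classes=2,
                         learning_rate=1e-3)
model = build_model(params)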
def main(args):
    logger.info('Checking...')
    SEED = args.seed
    check_manual_seed(SEED)
    check_args(args)
    logger.info('seed: {}'.format(args.seed))
    gross_result['seed'] = args.seed

    logger.info('Loading config...')
    bert_config = BertConfig('config/bert.ini')
    bert_config = bert_config(args.bert_type)

    # for oos-eval dataset
    data_config = Config('config/data.ini')
    data_config = data_config(args.dataset)

    # Prepare data processor
    data_path = os.path.join(data_config['DataDir'],
                             data_config[args.data_file])  # join data dir and file name into a path
    label_path = data_path.replace('.json', '.label')

    if args.dataset == 'oos-eval':
        processor = OOSProcessor(bert_config, maxlen=32)
    elif args.dataset == 'smp':
        processor = SMPProcessor(bert_config, maxlen=32)
    else:
        raise ValueError('The dataset {} is not supported.'.format(args.dataset))

    processor.load_label(label_path)  # Adds label_to_id and id_to_label to the processor.
    n_class = len(processor.id_to_label)

    config = vars(args)  # convert the argparse namespace into a dict
    config['model_save_path'] = os.path.join(args.output_dir, 'save', 'bert.pt')
    config['n_class'] = n_class
    logger.info('config:')
    logger.info(config)

    model = TextCNN(bert_config, n_class)  # BERT encoder + TextCNN classifier

    if args.fine_tune:
        model.unfreeze_bert_encoder()
    else:
        model.freeze_bert_encoder()
    model.to(device)

    global_step = 0

    def train(train_dataset, dev_dataset):
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=args.train_batch_size // args.gradient_accumulation_steps,
            shuffle=True,
            num_workers=2)
        nonlocal global_step
        n_sample = len(train_dataloader)
        early_stopping = EarlyStopping(args.patience, logger=logger)
        # Loss function
        classified_loss = torch.nn.CrossEntropyLoss().to(device)
        # Optimizer
        optimizer = AdamW(model.parameters(), args.lr)

        train_loss = []
        if dev_dataset:
            valid_loss = []
            valid_ind_class_acc = []
        iteration = 0

        for i in range(args.n_epoch):
            model.train()
            total_loss = 0
            for sample in tqdm.tqdm(train_dataloader):
                sample = (i.to(device) for i in sample)
                token, mask, type_ids, y = sample
                batch = len(token)
                logits = model(token, mask, type_ids)
                loss = classified_loss(logits, y.long())
                total_loss += loss.item()
                loss = loss / args.gradient_accumulation_steps
                loss.backward()
                # backprop and update parameters
                if (global_step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            logger.info('[Epoch {}] Train: train_loss: {}'.format(
                i, total_loss / n_sample))
            logger.info('-' * 30)
            train_loss.append(total_loss / n_sample)
            iteration += 1

            if dev_dataset:
                logger.info(
                    '#################### eval result at step {} ####################'
                    .format(global_step))
                eval_result = eval(dev_dataset)
                valid_loss.append(eval_result['loss'])
                valid_ind_class_acc.append(eval_result['ind_class_acc'])

                # signal meaning:
                #  1 -> save the model
                #  0 -> keep going without saving
                # -1 -> patience exceeded, trigger early stopping
                signal = early_stopping(eval_result['accuracy'])
                if signal == -1:
                    break
                elif signal == 0:
                    pass
                elif signal == 1:
                    save_model(model, path=config['model_save_path'],
                               model_name='bert')
                # logger.info(eval_result)

        from utils.visualization import draw_curve
        draw_curve(train_loss, iteration, 'train_loss', args.output_dir)
        if dev_dataset:
            draw_curve(valid_loss, iteration, 'valid_loss', args.output_dir)
            draw_curve(valid_ind_class_acc, iteration,
                       'valid_ind_class_accuracy', args.output_dir)

        if args.patience >= args.n_epoch:
            save_model(model, path=config['model_save_path'], model_name='bert')

        freeze_data['train_loss'] = train_loss
        freeze_data['valid_loss'] = valid_loss

    def eval(dataset):
        dev_dataloader = DataLoader(dataset,
                                    batch_size=args.predict_batch_size,
                                    shuffle=False,
                                    num_workers=2)
        n_sample = len(dev_dataloader)
        result = dict()

        model.eval()

        # Loss function
        classified_loss = torch.nn.CrossEntropyLoss().to(device)

        all_pred = []
        all_logit = []
        total_loss = 0
        for sample in tqdm.tqdm(dev_dataloader):
            sample = (i.to(device) for i in sample)
            token, mask, type_ids, y = sample
            batch = len(token)
            with torch.no_grad():
                logit = model(token, mask, type_ids)
                all_logit.append(logit)
                all_pred.append(torch.argmax(logit, 1))
                total_loss += classified_loss(logit, y.long())

        all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()  # [length, n_class]
        all_binary_y = (all_y != 0).long()  # [length, 1]  label 0 is oos
        all_pred = torch.cat(all_pred, 0).cpu()
        all_logit = torch.cat(all_logit, 0).cpu()

        ind_class_acc = metrics.ind_class_accuracy(all_pred, all_y)
        report = metrics.classification_report(all_y, all_pred, output_dict=True)
        result.update(report)

        y_score = all_logit.softmax(1)[:, 1].tolist()
        eer = metrics.cal_eer(all_binary_y, y_score)
        oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
            all_pred, all_binary_y)

        result['eer'] = eer
        result['ind_class_acc'] = ind_class_acc
        result['loss'] = total_loss / n_sample
        result['oos_ind_precision'] = oos_ind_precision
        result['oos_ind_recall'] = oos_ind_recall
        result['oos_ind_f_score'] = oos_ind_fscore
        result['auc'] = roc_auc_score(all_binary_y, y_score)
        result['y_score'] = y_score
        result['all_binary_y'] = all_binary_y

        freeze_data['valid_all_y'] = all_y
        freeze_data['vaild_all_pred'] = all_pred
        freeze_data['valid_score'] = y_score

        return result

    def test(dataset):
        load_model(model, path=config['model_save_path'], model_name='bert')
        test_dataloader = DataLoader(dataset,
                                     batch_size=args.predict_batch_size,
                                     shuffle=False,
                                     num_workers=2)
        n_sample = len(test_dataloader)
        result = dict()

        model.eval()

        # Loss function
        classified_loss = torch.nn.CrossEntropyLoss().to(device)

        all_pred = []
        total_loss = 0
        all_logit = []
        for sample in tqdm.tqdm(test_dataloader):
            sample = (i.to(device) for i in sample)
            token, mask, type_ids, y = sample
            batch = len(token)
            with torch.no_grad():
                logit = model(token, mask, type_ids)
                all_logit.append(logit)
                all_pred.append(torch.argmax(logit, 1))
                total_loss += classified_loss(logit, y.long())

        all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()  # [length, n_class]
        all_binary_y = (all_y != 0).long()  # [length, 1]  label 0 is oos
        all_pred = torch.cat(all_pred, 0).cpu()
        all_logit = torch.cat(all_logit, 0).cpu()

        # classification report
        ind_class_acc = metrics.ind_class_accuracy(all_pred, all_y)
        report = metrics.classification_report(all_y, all_pred, output_dict=True)
        oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
            all_pred, all_binary_y)
        result.update(report)

        # EER is only meaningful for binary classification
        y_score = all_logit.softmax(1)[:, 1].tolist()
        eer = metrics.cal_eer(all_binary_y, y_score)

        result['eer'] = eer
        result['ind_class_acc'] = ind_class_acc
        result['loss'] = total_loss / n_sample
        result['all_y'] = all_y.tolist()
        result['all_pred'] = all_pred.tolist()
        result['all_binary_y'] = all_binary_y
        freeze_data['test_all_y'] = all_y.tolist()
        freeze_data['test_all_pred'] = all_pred.tolist()
        freeze_data['test_score'] = y_score
        result['oos_ind_precision'] = oos_ind_precision
        result['oos_ind_recall'] = oos_ind_recall
        result['oos_ind_f_score'] = oos_ind_fscore
        result['auc'] = roc_auc_score(all_binary_y, y_score)
        result['y_score'] = y_score

        return result

    if args.do_train:
        if config['data_file'].startswith('binary'):
            text_train_set = processor.read_dataset(data_path, ['train'])
            text_dev_set = processor.read_dataset(data_path, ['val'])
        elif config['dataset'] == 'oos-eval':
            text_train_set = processor.read_dataset(data_path,
                                                    ['train', 'oos_train'])
            text_dev_set = processor.read_dataset(data_path,
                                                  ['val', 'oos_val'])
        elif config['dataset'] == 'smp':
            text_train_set = processor.read_dataset(data_path, ['train'])
            text_dev_set = processor.read_dataset(data_path, ['val'])

        train_features = processor.convert_to_ids(text_train_set)
        train_dataset = OOSDataset(train_features)
        dev_features = processor.convert_to_ids(text_dev_set)
        dev_dataset = OOSDataset(dev_features)

        train(train_dataset, dev_dataset)

    if args.do_eval:
        logger.info(
            '#################### eval result at step {} ####################'.
            format(global_step))
        if config['data_file'].startswith('binary'):
            text_dev_set = processor.read_dataset(data_path, ['val'])
        elif config['dataset'] == 'oos-eval':
            text_dev_set = processor.read_dataset(data_path,
                                                  ['val', 'oos_val'])
        elif config['dataset'] == 'smp':
            text_dev_set = processor.read_dataset(data_path, ['val'])

        dev_features = processor.convert_to_ids(text_dev_set)
        dev_dataset = OOSDataset(dev_features)
        eval_result = eval(dev_dataset)
        # logger.info(eval_result)
        logger.info('eval_eer: {}'.format(eval_result['eer']))
        logger.info('eval_oos_ind_precision: {}'.format(
            eval_result['oos_ind_precision']))
        logger.info('eval_oos_ind_recall: {}'.format(
            eval_result['oos_ind_recall']))
        logger.info('eval_oos_ind_f_score: {}'.format(
            eval_result['oos_ind_f_score']))
        logger.info('eval_auc: {}'.format(eval_result['auc']))
        logger.info('eval_fpr95: {}'.format(
            ErrorRateAt95Recall(eval_result['all_binary_y'],
                                eval_result['y_score'])))
        gross_result['eval_eer'] = eval_result['eer']
        gross_result['eval_auc'] = eval_result['auc']
        gross_result['eval_fpr95'] = ErrorRateAt95Recall(
            eval_result['all_binary_y'], eval_result['y_score'])
        gross_result['eval_oos_ind_precision'] = eval_result['oos_ind_precision']
        gross_result['eval_oos_ind_recall'] = eval_result['oos_ind_recall']
        gross_result['eval_oos_ind_f_score'] = eval_result['oos_ind_f_score']

    if args.do_test:
        logger.info(
            '#################### test result at step {} ####################'.
            format(global_step))
        if config['data_file'].startswith('binary'):
            text_test_set = processor.read_dataset(data_path, ['test'])
        elif config['dataset'] == 'oos-eval':
            text_test_set = processor.read_dataset(data_path,
                                                   ['test', 'oos_test'])
        elif config['dataset'] == 'smp':
            text_test_set = processor.read_dataset(data_path, ['test'])

        test_features = processor.convert_to_ids(text_test_set)
        test_dataset = OOSDataset(test_features)
        test_result = test(test_dataset)
        save_result(test_result, os.path.join(args.output_dir, 'test_result'))

        # logger.info(test_result)
        logger.info('test_eer: {}'.format(test_result['eer']))
        logger.info('test_ood_ind_precision: {}'.format(
            test_result['oos_ind_precision']))
        logger.info('test_ood_ind_recall: {}'.format(
            test_result['oos_ind_recall']))
        logger.info('test_ood_ind_f_score: {}'.format(
            test_result['oos_ind_f_score']))
        logger.info('test_auc: {}'.format(test_result['auc']))
        logger.info('test_fpr95: {}'.format(
            ErrorRateAt95Recall(test_result['all_binary_y'],
                                test_result['y_score'])))
        my_plot_roc(test_result['all_binary_y'], test_result['y_score'],
                    os.path.join(args.output_dir, 'roc_curve.png'))
        save_result(test_result, os.path.join(args.output_dir, 'test_result'))
        gross_result['test_eer'] = test_result['eer']
        gross_result['test_auc'] = test_result['auc']
        gross_result['test_fpr95'] = ErrorRateAt95Recall(
            test_result['all_binary_y'], test_result['y_score'])
        gross_result['test_oos_ind_precision'] = test_result['oos_ind_precision']
        gross_result['test_oos_ind_recall'] = test_result['oos_ind_recall']
        gross_result['test_oos_ind_f_score'] = test_result['oos_ind_f_score']

        # dump misclassified cases
        if config['dataset'] == 'oos-eval':
            texts = [line[0] for line in text_test_set]
        elif config['dataset'] == 'smp':
            texts = [line['text'] for line in text_test_set]
        else:
            raise ValueError('The dataset {} is not supported.'.format(
                args.dataset))

        output_cases(texts, test_result['all_y'], test_result['all_pred'],
                     os.path.join(args.output_dir, 'test_cases.csv'), processor)

        # confusion matrix
        plot_confusion_matrix(test_result['all_y'], test_result['all_pred'],
                              args.output_dir)

    with open(os.path.join(config['output_dir'], 'freeze_data.pkl'), 'wb') as f:
        pickle.dump(freeze_data, f)

    df = pd.DataFrame(
        data={
            'valid_y': freeze_data['valid_all_y'],
            'valid_score': freeze_data['valid_score'],
        })
    df.to_csv(os.path.join(config['output_dir'], 'valid_score.csv'))

    df = pd.DataFrame(
        data={
            'test_y': freeze_data['test_all_y'],
            'test_score': freeze_data['test_score']
        })
    df.to_csv(os.path.join(config['output_dir'], 'test_score.csv'))

    if args.result != 'no':
        pd_result = pd.DataFrame(gross_result)
        if args.seed == 16:
            pd_result.to_csv(args.result + '_gross_result.csv', index=False)
        else:
            pd_result.to_csv(args.result + '_gross_result.csv',
                             index=False,
                             mode='a',
                             header=False)
        if args.seed == 8192:
            print(args.result)
            std_mean(args.result + '_gross_result.csv')
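# Hypothetical entry-point sketch (not in the original file): the flag names
# mirror the attributes main() reads from `args`; the defaults and choices are
# placeholders, and the real script may define more options or different ones.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=16)
    parser.add_argument('--bert_type', default='bert')
    parser.add_argument('--dataset', choices=['oos-eval', 'smp'], default='smp')
    parser.add_argument('--data_file', default='DataFile')
    parser.add_argument('--output_dir', default='output')
    parser.add_argument('--result', default='no')
    parser.add_argument('--lr', type=float, default=2e-5)
    parser.add_argument('--n_epoch', type=int, default=10)
    parser.add_argument('--patience', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=32)
    parser.add_argument('--predict_batch_size', type=int, default=32)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument('--fine_tune', action='store_true')
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_eval', action='store_true')
    parser.add_argument('--do_test', action='store_true')
    main(parser.parse_args())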
# Enable eager execution (TF 1.x dynamic-graph mode)
tf.enable_eager_execution()

# Convert the text corpus into TFRecord format
vocab, word2id, train_size = get_text_tfrecord(
    tfrecord_filename=config.tfrecord_filename, classes=config.classes)

# Initialize the progress bar
pbar = ProgressBar(train_size, config.batch_size)

# Build the training and validation datasets
train_dataset, valid_dataset = text_get_dataset(
    tfrecord_filename=config.tfrecord_filename,
    epochs=1,
    batch_size=config.batch_size)

# Initialize the model
model = TextCNN(num_classes=len(config.classes),
                checkpoint_dir=config.checkpoint_dir,
                vocab_size=len(vocab),
                embedding_dim=config.embedding_dim,
                word2id=word2id,
                model_type=config.model_type,
                keep_dropout=config.keep_dropout,
                k_max_pooling=config.k_max_pooling)

# Train
model.fit(training_data=train_dataset,
          eval_data=valid_dataset,
          pbar=pbar,
          num_epochs=config.epochs,
          early_stopping_rounds=10,
          verbose=1)

# Save the model
model.save_model(model=model)
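# Hypothetical config sketch: the attribute names mirror what the script above
# reads from `config` (in the project this is presumably a config module or
# object); all values here are placeholders, not the project's settings.
class config:
    tfrecord_filename = 'data/text.tfrecord'
    classes = ['class_0', 'class_1']
    batch_size = 64
    embedding_dim = 128
    model_type = 'textcnn'
    keep_dropout = 0.5
    k_max_pooling = 3
    checkpoint_dir = 'checkpoints/'
    epochs = 10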
from model.text_cnn import TextCNN
from model.abilstm import ABLSTM
from model.bcnn import BCNN
# from model.char_cnn import CharCNN
from model.char_cnn2 import CharCNN
from model.bilstm import BLSTM
from model.multi_text_cnn import MultiTextCNN
from model.char_word_cnn import CharTextCNN

# Alternative models tried during experiments:
# model = CharCNN(vocab, num_class=2)
# model = BCNN(vocab, num_class=2)
# model = CharTextCNN(vocab, num_class=2)
# model = ABLSTM(vocab, num_class=2)
# model = BLSTM(vocab, num_class=2)

tf.reset_default_graph()
save_dir = '/Users/apple/Downloads/news_qa/checkpoint'
model = TextCNN(vocab, num_class=3, task_balance=0.12, soft_temperature=10)
model.compile(tf.train.AdamOptimizer, 0.001)
model.load('/Users/apple/Downloads/news_qa/pretrained_checkpoint/best_weights/')
model.train_and_evaluate(brc_data, evaluator=None, epochs=5, save_dir=save_dir)
sys.exit(1)

# The code below is unreachable because of the sys.exit() above.
if task_balance == 1:
    model.train_and_evaluate(brc_data, evaluator=None, epochs=5,
                             save_dir=save_dir)
else:
    model.load('/Users/apple/Downloads/news_qa/checkpoint/best_weights/')
    model.train_and_evaluate(brc_data, evaluator=None, epochs=5,
                             save_dir=save_dir)
print('..........finish training with {} ............'.format(task_balance))

# from model.bilstm import BLSTM
# model = BLSTM(vocab)
x_train = x[shuffle_indices]
y_train = y[shuffle_indices]

logging.info("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        nn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=y_train.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=params['embedding_dim'],
            filter_sizes=list(map(int, params['filter_sizes'].split(","))),
            num_filters=params['num_filters'],
            l2_reg_lambda=params['l2_reg_lambda'])

        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(nn.learning_rate)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(nn.loss, tvars),
                                          params['grad_clip'])
        grads_and_vars = tuple(zip(grads, tvars))
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        # Keep track of gradient values and sparsity
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram(
                    "{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar(
                    "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
def train_cnn():
    # Data Preparation
    # ==================================================
    if FLAGS.init_embedding_path is not None:
        embedding = np.load(FLAGS.init_embedding_path)
        print("Using pre-trained word embedding which shape is {}\n".format(
            embedding.shape))
        FLAGS.vocab_size = embedding.shape[0]
        FLAGS.embedding_size = embedding.shape[1]

    if FLAGS.init_model_path is not None:
        assert os.path.isdir(
            FLAGS.init_model_path), "init_model_path must be a directory\n"
        ckpt = tf.train.get_checkpoint_state(FLAGS.init_model_path)
        assert ckpt, "No checkpoint found in {}\n".format(FLAGS.init_model_path)
        assert ckpt.model_checkpoint_path, "No model_checkpoint_path found in checkpoint\n"

    # Create root directory
    timestamp = str(int(time.time()))
    root_dir = os.path.join(os.path.curdir, 'runs', 'textcnn',
                            'trained_result_' + timestamp)
    os.makedirs(root_dir)

    # Load data
    print("Loading data...\n")
    x_data = np.loadtxt(FLAGS.x_data_file)
    x_data = x_data.reshape(20480, 20, 30)
    x_data = x_data.reshape(20480, 600)
    y_data = np.loadtxt(FLAGS.y_data_file)
    print("data load finished")

    # Split dataset
    # x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=FLAGS.test_size, stratify=y_data, random_state=0)
    # x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=0)

    # Training
    # ==================================================
    with tf.Graph().as_default():
        tf_config = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        tf_config.gpu_options.allow_growth = FLAGS.gpu_allow_growth

        with tf.Session(config=tf_config).as_default() as sess:
            cnn = TextCNN(vocab_size=FLAGS.vocab_size,
                          embedding_size=FLAGS.embedding_size,
                          sequence_length=FLAGS.sequence_length,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          num_classes=FLAGS.num_classes,
                          learning_rate=FLAGS.learning_rate,
                          grad_clip=FLAGS.grad_clip,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Output directory for models and summaries
            out_dir = os.path.abspath(root_dir)
            print("Writing to {}...\n".format(out_dir))

            # Summaries for loss and accuracy
            tf.summary.scalar("loss", cnn.loss)
            tf.summary.scalar("accuracy", cnn.accuracy)
            merged_summary = tf.summary.merge_all()

            # Summary writers
            train_summary_dir = os.path.join(out_dir, 'summaries', 'train')
            val_summary_dir = os.path.join(out_dir, 'summaries', 'val')
            train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                         sess.graph)
            val_summary_writer = tf.summary.FileWriter(val_summary_dir,
                                                       sess.graph)

            # Checkpoint directory (will not create itself)
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, 'checkpoints'))
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model.ckpt')
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Using pre-trained word embedding
            # if FLAGS.init_embedding_path is not None:
            #     sess.run(cnn.embedding.assign(embedding))
            #     del embedding

            # Continue training from saved model
            if FLAGS.init_model_path is not None:
                saver.restore(sess, ckpt.model_checkpoint_path)

            # Training start
            print("Start training...\n")
            best_at_step = 0
            best_val_accuracy = 0

            # ****************************************
            # Generate train batches
            train_batches = data_utils.batch_iter(list(zip(x_data, y_data)),
                                                  FLAGS.batch_size)
            start = time.time()
            cnn_feature_temp = []
            for batch in train_batches:
                # Training model on x_batch and y_batch
                x_batch, y_batch = zip(*batch)
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.keep_prob: FLAGS.dropout_keep_prob,
                    cnn.is_training: True
                }
                pooled_concat_flat, _, global_step, train_summaries, train_loss, train_accuracy = sess.run(
                    [
                        cnn.pooled_concat_flat, cnn.train_op, cnn.global_step,
                        merged_summary, cnn.loss, cnn.accuracy
                    ],
                    feed_dict=feed_dict)
                cnn_feature_temp.append(pooled_concat_flat.tolist())

            np.savetxt(
                "../data/char_data/char_dim/char_cnn_embeddings_20_30_dim256.txt",
                np.array(cnn_feature_temp).reshape(20480, 192))
            # cnn_feature.append(cnn_feature_temp)
            # with open('./embeddings.txt', 'w', encoding='utf-8') as f:
            #     for line in cnn_feature_temp:
            #         for content in line:
            #             f.write(str(content).lstrip('[').rstrip(']') + '\n')
            print('finished training')
def train():
    # Training
    # ==================================================
    # x_train, x_dev, y_train, y_dev, vocab_size = load_data()
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(embeddings,
                          sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=vocab_size,
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                         sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already
            # exists, so we need to create it.
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            # vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """A single training step."""
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                # _, step, summaries, loss, accuracy, (w, idx) = sess.run(
                #     [train_op, global_step, train_summary_op, cnn.loss,
                #      cnn.accuracy, cnn.get_w2v_W()], feed_dict)
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                # print(w[:2], idx[:2])
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """Evaluates the model on a dev set."""
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size,
                                              FLAGS.num_epochs)

            def dev_test():
                batches_dev = data_helpers.batch_iter(list(zip(x_dev, y_dev)),
                                                      FLAGS.batch_size, 1)
                for batch_dev in batches_dev:
                    x_batch_dev, y_batch_dev = zip(*batch_dev)
                    dev_step(x_batch_dev, y_batch_dev,
                             writer=dev_summary_writer)

            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_test()
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
os.environ["CUDA_VISIBLE_DEVICES"] = " "

vocab_file = '../examples/politic_vocab5.txt'
# vocab.load_from_file('vocab_bool.txt')
vocab = Vocab(lower=True)

from data.data_reader_new import DatasetReader
from model.text_cnn import TextCNN

if os.path.exists(vocab_file):
    vocab.load_from_file(vocab_file)
    print(vocab.get_word_vocab())


@app.route('/')
def search_index():
    return render_template('index.html')


model = TextCNN(vocab,
                num_class=2,
                pretrained_word_embedding=vocab.embeddings,
                word_embedding_size=300)
model.load(
    "/search/odin/jdwu/classification/cls_checkpoints/politic/best_weights")


@app.route('/get_politic_intent', methods=['POST', 'GET'])
def check_intent():
    if request.method == "POST":
        global model
        data_reader = DatasetReader(use_pos_feature=False,
                                    use_bert=False,
                                    use_name_feature=False)
        query = request.form.get('input')
        if query.strip() == '':
            return jsonify({'message': '无效查询', 'status': 0})  # '无效查询' = invalid query