Example #1
def check_intent():
    if request.method == "POST":
        global model
        data_reader = DatasetReader(use_pos_feature=False,
                                    use_bert=False,
                                    use_name_feature=False)
        query = request.form.get('input')
        if query.strip() == '':
            return jsonify({'message': '无效查询', 'status': 0})

        data_reader.load_single_input(query)
        # print(vocab.get_char_vocab_size())
        data_reader.convert_to_ids(vocab)

        predict_label, sample = model.inference(data_reader, 1)
        predict_label = predict_label[0]
        print("predict label " + str(predict_label) + 'hhhh')
        print({
            'query': query,
            'status': str(1),
            'message': '涉政' if predict_label == 1 else '非涉政查询'
        })
        return jsonify({
            'query': query,
            'is_politic': str(predict_label),
            'status': 1,
            'message': '涉政' if predict_label == 1 else '非涉政查询'
        })

    if request.method == 'GET':
        data_reader = DatasetReader(use_pos_feature=False,
                                    use_bert=False,
                                    use_name_feature=False)
        query = request.args.get('query')
        data_reader.load_single_input(query)
        print(vocab.get_char_vocab_size())
        data_reader.convert_to_ids(vocab)

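        # Note: unlike the POST branch, this GET branch builds a fresh TextCNN
        # and reloads its weights on every request, which is expensive.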
        model = TextCNN(vocab,
                        num_class=2,
                        pretrained_word_embedding=vocab.embeddings,
                        word_embedding_size=300)
        model.load(
            "/search/odin/jdwu/classification/cls_checkpoints/politic/best_weights"
        )
        predict_label, sample = model.inference(data_reader, 1)
        predict_label = predict_label[0]
        return jsonify({
            'query': request.args.get('query'),
            'is_politic': predict_label,
            'status': 1,
            'message': '涉政' if predict_label == 1 else '非涉政查询'
        })
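
Once this view is registered on the Flask app (a later example in this listing mounts it at /get_politic_intent), it can be exercised with a small client script; the host and port below are assumptions, not values from the original code:

import requests

# POST the 'input' form field, as the view above expects.
resp = requests.post('http://127.0.0.1:5000/get_politic_intent',
                     data={'input': 'some query text'})
print(resp.json())
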
Example #2
def experiment_with_imdb():

    train_texts, train_label, test_texts, test_label = utils.load_imdb()

    config = {
        'MAX_NUM_WORDS': 15000,
        'MAX_TEXT_LEN': 500,
        'NUM_CLASSES': 2,
        'FILTER_SIZES': [2, 3, 4, 5],
        'FILTER_NUM': 200,
        'EMBED_DROPOUT': 0.3,
        'DENSE_DROPOUT': 0.5,
        'BATCH_SIZE': 64,
        'EPOCHS': 10,
    }

    tokenizer = Tokenizer(num_words=config['MAX_NUM_WORDS'])
    tokenizer.fit_on_texts(train_texts)

    train_texts = tokenizer.texts_to_sequences(train_texts)
    test_texts = tokenizer.texts_to_sequences(test_texts)

    x_train = pad_sequences(train_texts, maxlen=config['MAX_TEXT_LEN'])
    x_test = pad_sequences(test_texts, maxlen=config['MAX_TEXT_LEN'])

    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      train_label,
                                                      train_size=0.8,
                                                      random_state=2018)

    matrix = create_glove_embeddings(
        embed_file='../datasets/glove.840B.300d.txt',
        word_index=tokenizer.word_index,
        max_num_words=config['MAX_NUM_WORDS'])

    model = TextCNN(matrix,
                    maxlen=config['MAX_TEXT_LEN'],
                    num_classes=config['NUM_CLASSES'],
                    filter_sizes=config['FILTER_SIZES'],
                    filter_num=config['FILTER_NUM'],
                    embed_dropout=config['EMBED_DROPOUT'],
                    dense_dropout=config['DENSE_DROPOUT'])

    model.fit(x=x_train,
              y=y_train,
              epochs=config['EPOCHS'],
              batch_size=config['BATCH_SIZE'],
              validation_data=(x_val, y_val),
              save_model=True)

    model.load_weight('../tmp/text_cnn')

    test_pred = model.predict(x_test)
    from sklearn.metrics import accuracy_score
    print('acc on test data: {}'.format(accuracy_score(test_label, test_pred)))
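
create_glove_embeddings is not defined in this example; a minimal sketch of such a helper, assuming the usual GloVe text format (a token followed by its 300 floats per line) and zero rows for words without a pretrained vector:

import numpy as np

def create_glove_embeddings(embed_file, word_index, max_num_words, embed_dim=300):
    # Read "token v1 v2 ... v300" lines; glove.840B.300d contains a few tokens
    # with embedded spaces, so the vector is taken from the last 300 fields.
    embeddings_index = {}
    with open(embed_file, encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = ' '.join(values[:-embed_dim])
            embeddings_index[word] = np.asarray(values[-embed_dim:], dtype='float32')

    # Keras Tokenizer indices start at 1; row 0 stays all-zero (padding).
    matrix = np.zeros((max_num_words, embed_dim))
    for word, idx in word_index.items():
        if idx < max_num_words and word in embeddings_index:
            matrix[idx] = embeddings_index[word]
    return matrix
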
Example #3
File: train.py  Project: zbn123/tf_cnn_mf
def train(batches,
          test_data,
          sequence_length,
          num_classes,
          vocab_size,
          embedding_size,
          filter_sizes,
          num_filters,
          l2_reg_lambda):

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default(), tf.device('gpu:0'):
            cnn = TextCNN(
                sequence_length=sequence_length,
                num_classes=num_classes,
                vocab_size=vocab_size,
                filter_sizes=filter_sizes,
                num_filters=num_filters,
                embedding_size=embedding_size,
                l2_reg_lambda=l2_reg_lambda)

            # Checkpoint directory.
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, 'runs', timestamp))
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, 'checkpoints'))
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

            # Generate batches
            cnn.train(batches, test_data, sess)
            path = saver.save(sess, checkpoint_prefix)
            print("Saved model checkpoint to {}\n".format(path))
Example #4
def build_model(params):
    if params.model == 'cnn':
        model = TextCNN(max_sequence_length=params.padding_size,
                        max_token_num=params.vocab_size,
                        embedding_dim=params.embed_size,
                        output_dim=params.num_classes)
        model.compile(tf.optimizers.Adam(learning_rate=params.learning_rate),
                      loss='binary_crossentropy',
                      metrics=[micro_f1, macro_f1])

    else:
        # Only the CNN model is wired up in this snippet; fail fast instead of
        # hitting a NameError on model.summary() below.
        raise ValueError('Unsupported model type: {}'.format(params.model))

    model.summary()
    return model
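
build_model only reads a handful of attributes from params; a quick way to exercise it in isolation (the values are placeholders, and micro_f1/macro_f1/TextCNN are assumed to be defined in the same module):

from types import SimpleNamespace

params = SimpleNamespace(model='cnn',
                         padding_size=200,
                         vocab_size=30000,
                         embed_size=300,
                         num_classes=10,
                         learning_rate=1e-3)
model = build_model(params)
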
Example #5
def main(args):
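    # logger, device, gross_result and freeze_data are module-level globals
    # defined outside this excerpt.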
    logger.info('Checking...')
    SEED = args.seed
    check_manual_seed(SEED)
    check_args(args)
    logger.info('seed: {}'.format(args.seed))
    gross_result['seed'] = args.seed

    logger.info('Loading config...')
    bert_config = BertConfig('config/bert.ini')
    bert_config = bert_config(args.bert_type)

    # for oos-eval dataset
    data_config = Config('config/data.ini')
    data_config = data_config(args.dataset)

    # Prepare data processor
    data_path = os.path.join(data_config['DataDir'],
                             data_config[args.data_file])  # join the directory and the data file name into one path
    label_path = data_path.replace('.json', '.label')

    if args.dataset == 'oos-eval':
        processor = OOSProcessor(bert_config, maxlen=32)
    elif args.dataset == 'smp':
        processor = SMPProcessor(bert_config, maxlen=32)
    else:
        raise ValueError('The dataset {} is not supported.'.format(
            args.dataset))

    processor.load_label(
        label_path)  # Add label_to_id and id_to_label to the processor.

    n_class = len(processor.id_to_label)
    config = vars(args)  # vars() turns the parsed args into a plain dict
    config['model_save_path'] = os.path.join(args.output_dir, 'save',
                                             'bert.pt')
    config['n_class'] = n_class

    logger.info('config:')
    logger.info(config)

    model = TextCNN(bert_config, n_class)  # Bert encoder
    if args.fine_tune:
        model.unfreeze_bert_encoder()
    else:
        model.freeze_bert_encoder()
    model.to(device)

    global_step = 0

    def train(train_dataset, dev_dataset):
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps,
                                      shuffle=True,
                                      num_workers=2)

        nonlocal global_step
        n_sample = len(train_dataloader)
        early_stopping = EarlyStopping(args.patience, logger=logger)
        # Loss function
        classified_loss = torch.nn.CrossEntropyLoss().to(device)

        # Optimizers
        optimizer = AdamW(model.parameters(), args.lr)

        train_loss = []
        if dev_dataset:
            valid_loss = []
            valid_ind_class_acc = []
        iteration = 0
        for i in range(args.n_epoch):

            model.train()

            total_loss = 0
            for sample in tqdm.tqdm(train_dataloader):
                sample = (i.to(device) for i in sample)
                token, mask, type_ids, y = sample
                batch = len(token)

                logits = model(token, mask, type_ids)
                loss = classified_loss(logits, y.long())
                total_loss += loss.item()
                loss = loss / args.gradient_accumulation_steps
                loss.backward()
                # bp and update parameters
                if (global_step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            logger.info('[Epoch {}] Train: train_loss: {}'.format(
                i, total_loss / n_sample))
            logger.info('-' * 30)

            train_loss.append(total_loss / n_sample)
            iteration += 1

            if dev_dataset:
                logger.info(
                    '#################### eval result at step {} ####################'
                    .format(global_step))
                eval_result = eval(dev_dataset)

                valid_loss.append(eval_result['loss'])
                valid_ind_class_acc.append(eval_result['ind_class_acc'])

                # early_stopping returns: 1 -> save the model,
                # 0 -> no need to save, -1 -> patience exceeded, early stop.
                signal = early_stopping(eval_result['accuracy'])
                if signal == -1:
                    break
                elif signal == 0:
                    pass
                elif signal == 1:
                    save_model(model,
                               path=config['model_save_path'],
                               model_name='bert')

                # logger.info(eval_result)

        from utils.visualization import draw_curve
        draw_curve(train_loss, iteration, 'train_loss', args.output_dir)
        if dev_dataset:
            draw_curve(valid_loss, iteration, 'valid_loss', args.output_dir)
            draw_curve(valid_ind_class_acc, iteration,
                       'valid_ind_class_accuracy', args.output_dir)

        if args.patience >= args.n_epoch:
            save_model(model,
                       path=config['model_save_path'],
                       model_name='bert')

        freeze_data['train_loss'] = train_loss
        if dev_dataset:
            # valid_loss only exists when a dev set was provided
            freeze_data['valid_loss'] = valid_loss

    def eval(dataset):
        dev_dataloader = DataLoader(dataset,
                                    batch_size=args.predict_batch_size,
                                    shuffle=False,
                                    num_workers=2)
        n_sample = len(dev_dataloader)
        result = dict()
        model.eval()

        # Loss function
        classified_loss = torch.nn.CrossEntropyLoss().to(device)
        all_pred = []
        all_logit = []
        total_loss = 0
        for sample in tqdm.tqdm(dev_dataloader):
            sample = (i.to(device) for i in sample)
            token, mask, type_ids, y = sample
            batch = len(token)

            with torch.no_grad():
                logit = model(token, mask, type_ids)
                all_logit.append(logit)
                all_pred.append(torch.argmax(logit, 1))
                total_loss += classified_loss(logit, y.long())

        all_y = LongTensor(
            dataset.dataset[:, -1].astype(int)).cpu()  # shape: [length]
        all_binary_y = (all_y != 0).long()  # shape: [length]; label 0 is oos
        all_pred = torch.cat(all_pred, 0).cpu()
        all_logit = torch.cat(all_logit, 0).cpu()
        ind_class_acc = metrics.ind_class_accuracy(all_pred, all_y)
        report = metrics.classification_report(all_y,
                                               all_pred,
                                               output_dict=True)
        result.update(report)
        y_score = all_logit.softmax(1)[:, 1].tolist()
        eer = metrics.cal_eer(all_binary_y, y_score)

        oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
            all_pred, all_binary_y)

        result['eer'] = eer
        result['ind_class_acc'] = ind_class_acc
        result['loss'] = total_loss / n_sample

        result['oos_ind_precision'] = oos_ind_precision
        result['oos_ind_recall'] = oos_ind_recall
        result['oos_ind_f_score'] = oos_ind_fscore
        result['auc'] = roc_auc_score(all_binary_y, y_score)
        result['y_score'] = y_score
        result['all_binary_y'] = all_binary_y

        freeze_data['valid_all_y'] = all_y
        freeze_data['valid_all_pred'] = all_pred
        freeze_data['valid_score'] = y_score

        return result

    def test(dataset):
        load_model(model, path=config['model_save_path'], model_name='bert')
        test_dataloader = DataLoader(dataset,
                                     batch_size=args.predict_batch_size,
                                     shuffle=False,
                                     num_workers=2)
        n_sample = len(test_dataloader)
        result = dict()
        model.eval()

        # Loss function
        classified_loss = torch.nn.CrossEntropyLoss().to(device)
        all_pred = []
        total_loss = 0
        all_logit = []
        for sample in tqdm.tqdm(test_dataloader):
            sample = (i.to(device) for i in sample)
            token, mask, type_ids, y = sample
            batch = len(token)

            with torch.no_grad():
                logit = model(token, mask, type_ids)
                all_logit.append(logit)
                all_pred.append(torch.argmax(logit, 1))
                total_loss += classified_loss(logit, y.long())

        all_y = LongTensor(
            dataset.dataset[:, -1].astype(int)).cpu()  # shape: [length]
        all_binary_y = (all_y != 0).long()  # shape: [length]; label 0 is oos
        all_pred = torch.cat(all_pred, 0).cpu()
        all_logit = torch.cat(all_logit, 0).cpu()

        # classification report
        ind_class_acc = metrics.ind_class_accuracy(all_pred, all_y)
        report = metrics.classification_report(all_y,
                                               all_pred,
                                               output_dict=True)
        oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
            all_pred, all_binary_y)
        result.update(report)
        # EER is only meaningful in the binary (in-domain vs. OOS) setting.
        y_score = all_logit.softmax(1)[:, 1].tolist()
        eer = metrics.cal_eer(all_binary_y, y_score)

        result['eer'] = eer
        result['ind_class_acc'] = ind_class_acc
        result['loss'] = total_loss / n_sample
        result['all_y'] = all_y.tolist()
        result['all_pred'] = all_pred.tolist()
        result['all_binary_y'] = all_binary_y

        freeze_data['test_all_y'] = all_y.tolist()
        freeze_data['test_all_pred'] = all_pred.tolist()
        freeze_data['test_score'] = y_score

        result['oos_ind_precision'] = oos_ind_precision
        result['oos_ind_recall'] = oos_ind_recall
        result['oos_ind_f_score'] = oos_ind_fscore
        result['auc'] = roc_auc_score(all_binary_y, y_score)
        result['y_score'] = y_score
        return result

    if args.do_train:
        if config['data_file'].startswith('binary'):
            text_train_set = processor.read_dataset(data_path, ['train'])
            text_dev_set = processor.read_dataset(data_path, ['val'])
        elif config['dataset'] == 'oos-eval':
            text_train_set = processor.read_dataset(data_path,
                                                    ['train', 'oos_train'])
            text_dev_set = processor.read_dataset(data_path,
                                                  ['val', 'oos_val'])
        elif config['dataset'] == 'smp':
            text_train_set = processor.read_dataset(data_path, ['train'])
            text_dev_set = processor.read_dataset(data_path, ['val'])

        train_features = processor.convert_to_ids(text_train_set)
        train_dataset = OOSDataset(train_features)
        dev_features = processor.convert_to_ids(text_dev_set)
        dev_dataset = OOSDataset(dev_features)

        train(train_dataset, dev_dataset)

    if args.do_eval:
        logger.info(
            '#################### eval result at step {} ####################'.
            format(global_step))
        if config['data_file'].startswith('binary'):
            text_dev_set = processor.read_dataset(data_path, ['val'])
        elif config['dataset'] == 'oos-eval':
            text_dev_set = processor.read_dataset(data_path,
                                                  ['val', 'oos_val'])
        elif config['dataset'] == 'smp':
            text_dev_set = processor.read_dataset(data_path, ['val'])

        dev_features = processor.convert_to_ids(text_dev_set)
        dev_dataset = OOSDataset(dev_features)
        eval_result = eval(dev_dataset)
        # logger.info(eval_result)
        logger.info('eval_eer: {}'.format(eval_result['eer']))
        logger.info('eval_oos_ind_precision: {}'.format(
            eval_result['oos_ind_precision']))
        logger.info('eval_oos_ind_recall: {}'.format(
            eval_result['oos_ind_recall']))
        logger.info('eval_oos_ind_f_score: {}'.format(
            eval_result['oos_ind_f_score']))
        logger.info('eval_auc: {}'.format(eval_result['auc']))
        logger.info('eval_fpr95: {}'.format(
            ErrorRateAt95Recall(eval_result['all_binary_y'],
                                eval_result['y_score'])))
        gross_result['eval_eer'] = eval_result['eer']
        gross_result['eval_auc'] = eval_result['auc']
        gross_result['eval_fpr95'] = ErrorRateAt95Recall(
            eval_result['all_binary_y'], eval_result['y_score'])
        gross_result['eval_oos_ind_precision'] = eval_result[
            'oos_ind_precision']
        gross_result['eval_oos_ind_recall'] = eval_result['oos_ind_recall']
        gross_result['eval_oos_ind_f_score'] = eval_result['oos_ind_f_score']

    if args.do_test:
        logger.info(
            '#################### test result at step {} ####################'.
            format(global_step))
        if config['data_file'].startswith('binary'):
            text_test_set = processor.read_dataset(data_path, ['test'])
        elif config['dataset'] == 'oos-eval':
            text_test_set = processor.read_dataset(data_path,
                                                   ['test', 'oos_test'])
        elif config['dataset'] == 'smp':
            text_test_set = processor.read_dataset(data_path, ['test'])

        test_features = processor.convert_to_ids(text_test_set)
        test_dataset = OOSDataset(test_features)
        test_result = test(test_dataset)
        save_result(test_result, os.path.join(args.output_dir, 'test_result'))
        # logger.info(test_result)
        logger.info('test_eer: {}'.format(test_result['eer']))
        logger.info('test_ood_ind_precision: {}'.format(
            test_result['oos_ind_precision']))
        logger.info('test_ood_ind_recall: {}'.format(
            test_result['oos_ind_recall']))
        logger.info('test_ood_ind_f_score: {}'.format(
            test_result['oos_ind_f_score']))
        logger.info('test_auc: {}'.format(test_result['auc']))
        logger.info('test_fpr95: {}'.format(
            ErrorRateAt95Recall(test_result['all_binary_y'],
                                test_result['y_score'])))

        my_plot_roc(test_result['all_binary_y'], test_result['y_score'],
                    os.path.join(args.output_dir, 'roc_curve.png'))
        save_result(test_result, os.path.join(args.output_dir, 'test_result'))

        gross_result['test_eer'] = test_result['eer']
        gross_result['test_auc'] = test_result['auc']
        gross_result['test_fpr95'] = ErrorRateAt95Recall(
            test_result['all_binary_y'], test_result['y_score'])
        gross_result['test_oos_ind_precision'] = test_result[
            'oos_ind_precision']
        gross_result['test_oos_ind_recall'] = test_result['oos_ind_recall']
        gross_result['test_oos_ind_f_score'] = test_result['oos_ind_f_score']

        # Dump the misclassified cases
        if config['dataset'] == 'oos-eval':
            texts = [line[0] for line in text_test_set]
        elif config['dataset'] == 'smp':
            texts = [line['text'] for line in text_test_set]
        else:
            raise ValueError('The dataset {} is not supported.'.format(
                args.dataset))

        output_cases(texts, test_result['all_y'], test_result['all_pred'],
                     os.path.join(args.output_dir, 'test_cases.csv'),
                     processor)

        # confusion matrix
        plot_confusion_matrix(test_result['all_y'], test_result['all_pred'],
                              args.output_dir)

    with open(os.path.join(config['output_dir'], 'freeze_data.pkl'),
              'wb') as f:
        pickle.dump(freeze_data, f)
    df = pd.DataFrame(
        data={
            'valid_y': freeze_data['valid_all_y'],
            'valid_score': freeze_data['valid_score'],
        })
    df.to_csv(os.path.join(config['output_dir'], 'valid_score.csv'))

    df = pd.DataFrame(
        data={
            'test_y': freeze_data['test_all_y'],
            'test_score': freeze_data['test_score']
        })
    df.to_csv(os.path.join(config['output_dir'], 'test_score.csv'))

    if args.result != 'no':
        pd_result = pd.DataFrame(gross_result)
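        # Assumed convention: seed 16 is the first run (writes a fresh CSV with
        # a header), later seeds append, and seed 8192 is the last run, after
        # which std_mean aggregates the per-seed results.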
        if args.seed == 16:
            pd_result.to_csv(args.result + '_gross_result.csv', index=False)
        else:
            pd_result.to_csv(args.result + '_gross_result.csv',
                             index=False,
                             mode='a',
                             header=False)
        if args.seed == 8192:
            print(args.result)
            std_mean(args.result + '_gross_result.csv')
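
main() reads a long list of attributes from args. A hedged sketch of a matching argparse setup, listing only the flags actually used above (the defaults here are guesses, not the authors' values):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=16)
parser.add_argument('--bert_type', type=str, default='bert-base')
parser.add_argument('--dataset', type=str, choices=['oos-eval', 'smp'], default='smp')
parser.add_argument('--data_file', type=str, required=True)
parser.add_argument('--output_dir', type=str, required=True)
parser.add_argument('--result', type=str, default='no')
parser.add_argument('--fine_tune', action='store_true')
parser.add_argument('--do_train', action='store_true')
parser.add_argument('--do_eval', action='store_true')
parser.add_argument('--do_test', action='store_true')
parser.add_argument('--lr', type=float, default=2e-5)
parser.add_argument('--n_epoch', type=int, default=10)
parser.add_argument('--patience', type=int, default=3)
parser.add_argument('--train_batch_size', type=int, default=32)
parser.add_argument('--predict_batch_size', type=int, default=32)
parser.add_argument('--gradient_accumulation_steps', type=int, default=1)

main(parser.parse_args())
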
# 动态图
tf.enable_eager_execution()
# Convert the text corpus to TFRecord format
vocab, word2id, train_size = get_text_tfrecord(
    tfrecord_filename=config.tfrecord_filename, classes=config.classes)
# Initialize the progress bar
pbar = ProgressBar(train_size, config.batch_size)
# Build the training and validation datasets
train_dataset, valid_dataset = text_get_dataset(
    tfrecord_filename=config.tfrecord_filename,
    epochs=1,
    batch_size=config.batch_size)
# Initialize the model
model = TextCNN(num_classes=len(config.classes),
                checkpoint_dir=config.checkpoint_dir,
                vocab_size=len(vocab),
                embedding_dim=config.embedding_dim,
                word2id=word2id,
                model_type=config.model_type,
                keep_dropout=config.keep_dropout,
                k_max_pooling=config.k_max_pooling)
# Train
model.fit(training_data=train_dataset,
          eval_data=valid_dataset,
          pbar=pbar,
          num_epochs=config.epochs,
          early_stopping_rounds=10,
          verbose=1)
# Save the model
model.save_model(model=model)
        from model.text_cnn import TextCNN
        from model.abilstm import ABLSTM
        from model.bcnn import BCNN
        # from model.char_cnn import CharCNN
        from model.char_cnn2 import CharCNN
        from model.bilstm import BLSTM
        from model.multi_text_cnn import MultiTextCNN
        from model.char_word_cnn import CharTextCNN
        #model = CharCNN(vocab,num_class=2)
        #model = BCNN(vocab,num_class=2)
        #model = CharTextCNN(vocab,num_class=2)
        #model = ABLSTM(vocab,num_class=2)
        #model = BLSTM(vocab,num_class=2)
        tf.reset_default_graph()
        save_dir = '/Users/apple/Downloads/news_qa/checkpoint'
        model = TextCNN(vocab,num_class=3,task_balance=0.12,soft_temperature=10)
        model.compile(tf.train.AdamOptimizer, 0.001)
        model.load('/Users/apple/Downloads/news_qa/pretrained_checkpoint/best_weights/')
        model.train_and_evaluate(brc_data, evaluator=None, epochs=5, save_dir=save_dir)
        sys.exit(1)
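        # Everything below this sys.exit(1) is unreachable as written.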

        if task_balance==1:
            model.train_and_evaluate(brc_data,evaluator=None,epochs=5,save_dir=save_dir)
        else:
            model.load('/Users/apple/Downloads/news_qa/checkpoint/best_weights/')
            model.train_and_evaluate(brc_data,evaluator=None,epochs=5,save_dir=save_dir)

        print('..........finish training with  {} ............'.format(task_balance))

    # from model.bilstm import  BLSTM
    # model = BLSTM(vocab)
Example #8
x_train = x[shuffle_indices]
y_train = y[shuffle_indices]

logging.info("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))

with tf.Graph().as_default():
	session_conf = tf.ConfigProto(
		allow_soft_placement = True,
		log_device_placement = False
		)
	sess = tf.Session(config = session_conf)
	with sess.as_default():
		nn = TextCNN(
				sequence_length=x_train.shape[1],
				num_classes=y_train.shape[1],
				vocab_size=len(vocab_processor.vocabulary_),
				embedding_size=params['embedding_dim'],
				filter_sizes=list(map(int, params['filter_sizes'].split(","))),
				num_filters=params['num_filters'],
				l2_reg_lambda=params['l2_reg_lambda'])

		global_step = tf.Variable(0, name = "global_step", trainable = False)
		optimizer = tf.train.AdamOptimizer(nn.learning_rate)
		tvars = tf.trainable_variables()
		grads,_ = tf.clip_by_global_norm(tf.gradients(nn.loss, tvars), params['grad_clip'])
		grads_and_vars = tuple(zip(grads,tvars))
		train_op = optimizer.apply_gradients(grads_and_vars, global_step = global_step)


		grad_summaries = []
		for g,v in grads_and_vars:
			if g is not None:
				grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
				sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
				grad_summaries.append(grad_hist_summary)
				grad_summaries.append(sparsity_summary)
Example #9
def train_cnn():
    # Data Preparation
    # ==================================================
    if FLAGS.init_embedding_path is not None:
        embedding = np.load(FLAGS.init_embedding_path)
        print("Using pre-trained word embedding which shape is {}\n".format(
            embedding.shape))
        FLAGS.vocab_size = embedding.shape[0]
        FLAGS.embedding_size = embedding.shape[1]
    if FLAGS.init_model_path is not None:
        assert os.path.isdir(
            FLAGS.init_model_path), "init_model_path must be a directory\n"
        ckpt = tf.train.get_checkpoint_state(FLAGS.init_model_path)
        assert ckpt, "No checkpoint found in {}\n".format(
            FLAGS.init_model_path)
        assert ckpt.model_checkpoint_path, "No model_checkpoint_path found in checkpoint\n"

    # Create root directory
    timestamp = str(int(time.time()))
    root_dir = os.path.join(os.path.curdir, 'runs', 'textcnn',
                            'trained_result_' + timestamp)
    os.makedirs(root_dir)

    # Load data
    print("Loading data...\n")
    x_data = np.loadtxt(FLAGS.x_data_file)
    x_data = x_data.reshape(20480, 20, 30)
    x_data = x_data.reshape(20480, 600)
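    # The intermediate (20480, 20, 30) reshape above is immediately flattened
    # back to (20480, 600), so it has no lasting effect.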
    y_data = np.loadtxt(FLAGS.y_data_file)
    print("data load finished")

    # Split dataset
    # x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=FLAGS.test_size, stratify=y_data, random_state=0)
    # x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=0)

    # Training
    # ==================================================
    with tf.Graph().as_default():
        tf_config = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        tf_config.gpu_options.allow_growth = FLAGS.gpu_allow_growth

        with tf.Session(config=tf_config).as_default() as sess:
            cnn = TextCNN(vocab_size=FLAGS.vocab_size,
                          embedding_size=FLAGS.embedding_size,
                          sequence_length=FLAGS.sequence_length,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          num_classes=FLAGS.num_classes,
                          learning_rate=FLAGS.learning_rate,
                          grad_clip=FLAGS.grad_clip,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Output directory for models and summaries
            out_dir = os.path.abspath(root_dir)
            print("Writing to {}...\n".format(out_dir))

            # Summaries for loss and accuracy
            tf.summary.scalar("loss", cnn.loss)
            tf.summary.scalar("accuracy", cnn.accuracy)
            merged_summary = tf.summary.merge_all()

            # Summaries dictionary
            train_summary_dir = os.path.join(out_dir, 'summaries', 'train')
            val_summary_dir = os.path.join(out_dir, 'summaries', 'val')
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)
            val_summary_writer = tf.summary.FileWriter(val_summary_dir,
                                                       sess.graph)

            # Checkpoint directory, will not create itself
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, 'checkpoints'))
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model.ckpt')
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Using pre-trained word embedding
            # if FLAGS.init_embedding_path is not None:
            #     sess.run(cnn.embedding.assign(embedding))
            #     del embedding

            # Continue training from saved model
            if FLAGS.init_model_path is not None:
                saver.restore(sess, ckpt.model_checkpoint_path)

            # Training start
            print("Start training...\n")
            best_at_step = 0
            best_val_accuracy = 0

            #****************************************
            # Generate train batches
            train_batches = data_utils.batch_iter(list(zip(x_data, y_data)),
                                                  FLAGS.batch_size)
            start = time.time()

            cnn_feature_temp = []
            for batch in train_batches:
                # Training model on x_batch and y_batch
                x_batch, y_batch = zip(*batch)
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.keep_prob: FLAGS.dropout_keep_prob,
                    cnn.is_training: True
                }
                pooled_concat_flat, _, global_step, train_summaries, train_loss, train_accuracy = sess.run(
                    [
                        cnn.pooled_concat_flat, cnn.train_op, cnn.global_step,
                        merged_summary, cnn.loss, cnn.accuracy
                    ],
                    feed_dict=feed_dict)
                cnn_feature_temp.append(pooled_concat_flat.tolist())

            np.savetxt(
                "../data/char_data/char_dim/char_cnn_embeddings_20_30_dim256.txt",
                np.array(cnn_feature_temp).reshape(20480, 192))
            # cnn_feature.append(cnn_feature_temp)
            # with open('./embeddings.txt','w', encoding='utf-8')as f:
            #     for line in cnn_feature_temp:
            #         for content in line :
            #                 f.write(str(content).lstrip('[').rstrip(']') + '\n')

            print('finished training')
def train():
    # Training
    # ==================================================
    #x_train, x_dev, y_train, y_dev ,vocab_size= load_data()
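    # x_train/x_dev/y_train/y_dev, vocab_size and the pre-trained `embeddings`
    # matrix are expected to come from elsewhere (e.g. the commented-out
    # load_data() call above); they are not defined in this excerpt.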
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(embeddings,
                          sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=vocab_size,
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            # vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                # _, step, summaries, loss, accuracy,(w,idx) = sess.run(
                #     [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy,cnn.get_w2v_W()],
                #     feed_dict)
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)

                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                # print w[:2],idx[:2]
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size,
                                              FLAGS.num_epochs)

            def dev_test():
                batches_dev = data_helpers.batch_iter(list(zip(x_dev, y_dev)),
                                                      FLAGS.batch_size, 1)
                for batch_dev in batches_dev:
                    x_batch_dev, y_batch_dev = zip(*batch_dev)
                    dev_step(x_batch_dev,
                             y_batch_dev,
                             writer=dev_summary_writer)

            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                # Evaluate on the dev set every FLAGS.evaluate_every steps
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_test()

                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
Example #11
# Assumed setup for this excerpt: the os/Flask imports and the Flask `app` object.
import os
from flask import Flask, request, jsonify, render_template

app = Flask(__name__)

os.environ["CUDA_VISIBLE_DEVICES"] = " "
vocab_file = '../examples/politic_vocab5.txt'  # vocab.load_from_file('vocab_bool.txt')
vocab = Vocab(lower=True)
from data.data_reader_new import DatasetReader
from model.text_cnn import TextCNN
if os.path.exists(vocab_file): vocab.load_from_file(vocab_file)
print(vocab.get_word_vocab())


@app.route('/')
def search_index():
    return render_template('index.html')


model = TextCNN(vocab,
                num_class=2,
                pretrained_word_embedding=vocab.embeddings,
                word_embedding_size=300)
model.load(
    "/search/odin/jdwu/classification/cls_checkpoints/politic/best_weights")


@app.route('/get_politic_intent', methods=['POST', 'GET'])
def check_intent():
    if request.method == "POST":
        global model
        data_reader = DatasetReader(use_pos_feature=False,
                                    use_bert=False,
                                    use_name_feature=False)
        query = request.form.get('input')
        if query.strip() == '':
            return jsonify({'message': '无效查询', 'status': 0})