Example #1
    def train(self, config, word2vec, tokenizer):
        import os
        import numpy as np
        from datasets.dataset import Dataset, DatasetParam
        dataset_args = DatasetParam()
        dataset_args.output_dir = config['data_params']['output_dir']
        dataset_args.embed_dim = config['data_params']['embed_dim']
        dataset_args.max_sentence_len = config['data_params'][
            'max_sentence_len']
        dataset_args.min_word_freq = config['data_params']['min_word_freq']
        dataset_args.max_vocab_size = config['data_params']['max_vocab_size']
        dataset_args.test_rate = config['data_params']['test_rate']
        dataset_args.tokenizer = tokenizer
        dataset_args.data_dir = config['data_params']['data_dir']
        dataset_args.cate_list = config['model_params']['cate_list']
        dataset_args.word2vec_iterator = word2vec
        dataset_args.data_vocab_dir = config['data_params']['data_vocab_dir']
        dataset_args.data_vocab_tag = str(
            config['data_params']['data_vocab_tag'])
        dataset_args.data_file = config['data_params']['data_file']
        dataset = Dataset(dataset_args)
        # Build the train/test split; each set is a list of (features, label) pairs
        train_set, test_set = dataset.buildWithAllData(False)
        x_train, y_train = zip(*train_set)
        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test, y_test = zip(*test_set)
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        # Load the Bernoulli naive Bayes classifier
        from sklearn.naive_bayes import BernoulliNB
        import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
        classifier = BernoulliNB()

        # Train the model and save it to disk
        classifier.fit(x_train, y_train)
        joblib.dump(classifier,
                    os.path.join(dataset_args.output_dir, 'bayes_model.m'))

        # Evaluate on the test set and compute accuracy
        y_ = classifier.predict(x_test)
        acc = np.mean(y_ == y_test)  # fraction of correct predictions
        print("eval acc: %f" % acc)
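For completeness, a minimal reload-and-predict sketch for the model saved above. The path and feature dimension here are placeholders (the real values come from config['data_params']), and plain joblib is used since sklearn.externals.joblib has been removed from scikit-learn:

    import os
    import joblib
    import numpy as np

    output_dir = './output'  # hypothetical; the example uses config['data_params']['output_dir']
    classifier = joblib.load(os.path.join(output_dir, 'bayes_model.m'))

    # x_new must match the feature layout produced by Dataset at training time;
    # the dimension below is a placeholder, not the project's actual embed_dim.
    x_new = np.zeros((1, 300), dtype=np.float32)
    print(classifier.predict(x_new))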
Example #2
    def train(self, config, word2vec, tokenizer):
        import os
        import numpy as np
        from datasets.dataset import Dataset, DatasetParam
        dataset_args = DatasetParam()
        dataset_args.output_dir = config['data_params']['output_dir']
        dataset_args.embed_dim = config['data_params']['embed_dim']
        dataset_args.max_sentence_len = config['data_params'][
            'max_sentence_len']
        dataset_args.min_word_freq = config['data_params']['min_word_freq']
        dataset_args.max_vocab_size = config['data_params']['max_vocab_size']
        dataset_args.test_rate = config['data_params']['test_rate']
        dataset_args.tokenizer = tokenizer
        dataset_args.data_dir = config['data_params']['data_dir']
        dataset_args.cate_list = config['model_params']['cate_list']
        dataset_args.word2vec_iterator = word2vec
        dataset_args.data_vocab_dir = config['data_params']['data_vocab_dir']
        dataset_args.data_vocab_tag = str(
            config['data_params']['data_vocab_tag'])
        dataset_args.data_file = config['data_params']['data_file']
        dataset = Dataset(dataset_args)

        # Load XGBoost hyperparameters from the config
        xgboost_args = dict()
        xgboost_args['learning_rate'] = config['xgboost_params'][
            'learning_rate']
        xgboost_args['n_estimators'] = config['xgboost_params'][
            'n_estimators']  # number of trees, i.e. total boosting rounds
        xgboost_args['max_depth'] = config['xgboost_params'][
            'max_depth']  # maximum tree depth
        xgboost_args['min_child_weight'] = config['xgboost_params'][
            'min_child_weight']  # minimum sum of instance weight in a leaf
        xgboost_args['gamma'] = config['xgboost_params'][
            'gamma']  # penalty coefficient on the number of leaves
        xgboost_args['subsample'] = config['xgboost_params'][
            'subsample']  # fraction of samples drawn to build each tree
        xgboost_args['colsample_btree'] = config['xgboost_params'][
            'colsample_btree']  # fraction of features drawn to build each tree
        xgboost_args['objective'] = config['xgboost_params'][
            'objective']  # loss function to optimize
        xgboost_args['scale_pos_weight'] = config['xgboost_params'][
            'scale_pos_weight']  # compensates for class imbalance
        xgboost_args['nthread'] = config['xgboost_params'][
            'nthread']  # number of CPU threads for parallel training
        xgboost_args['random_state'] = config['xgboost_params'][
            'random_state']  # random seed
        xgboost_args['num_class'] = config['xgboost_params'][
            'num_class']  # number of target classes
        xgboost_args['eval_metric'] = config['xgboost_params']['eval_metric']
        xgboost_args['early_stopping_rounds'] = config['xgboost_params'][
            'early_stopping_rounds']
        xgboost_args['verbose'] = config['xgboost_params']['verbose']

        # Build the train/test split
        train_set, test_set = dataset.buildWithAllData(False)
        x_train, y_train = zip(*train_set)
        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test, y_test = zip(*test_set)
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        # Construct the XGBoost classifier
        from xgboost import XGBClassifier
        import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
        classifier = XGBClassifier(
            learning_rate=xgboost_args['learning_rate'],
            n_estimators=xgboost_args['n_estimators'],
            max_depth=xgboost_args['max_depth'],
            min_child_weight=xgboost_args['min_child_weight'],
            gamma=xgboost_args['gamma'],
            subsample=xgboost_args['subsample'],
            colsample_bytree=xgboost_args['colsample_btree'],  # XGBoost's parameter is colsample_bytree
            objective=xgboost_args['objective'],
            scale_pos_weight=xgboost_args['scale_pos_weight'],
            nthread=xgboost_args['nthread'],
            random_state=xgboost_args['random_state'],
            num_class=xgboost_args['num_class'])

        # Train with early stopping on the held-out set, then save the model
        classifier.fit(
            x_train,
            y_train,
            eval_set=[(x_test, y_test)],
            eval_metric=xgboost_args['eval_metric'],
            early_stopping_rounds=xgboost_args['early_stopping_rounds'],
            verbose=xgboost_args['verbose'])
        joblib.dump(classifier,
                    os.path.join(dataset_args.output_dir, 'xgboost_model.m'))

        # Evaluate on the test set and compute accuracy
        y_ = classifier.predict(x_test)
        acc = np.mean(y_ == y_test)  # fraction of correct predictions
        print("eval acc: %f" % acc)
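For context, here is a hypothetical xgboost_params block covering every key the example reads. The keys mirror the code above; the values are illustrative XGBoost defaults, not the original project's settings:

    config = {
        'xgboost_params': {
            'learning_rate': 0.1,
            'n_estimators': 100,        # total boosting rounds
            'max_depth': 6,
            'min_child_weight': 1,
            'gamma': 0.0,
            'subsample': 0.8,           # sample 80% of rows per tree
            'colsample_btree': 0.8,     # forwarded as XGBoost's colsample_bytree
            'objective': 'multi:softmax',
            'scale_pos_weight': 1,
            'nthread': 4,               # CPU threads for parallel training
            'random_state': 42,
            'num_class': 2,
            'eval_metric': 'mlogloss',
            'early_stopping_rounds': 10,
            'verbose': True,
        },
    }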