def train(self, config, word2vec, tokenizer):
    """Train a Bernoulli naive-Bayes text classifier and report test accuracy.

    Builds the dataset described by ``config['data_params']``, fits a
    ``BernoulliNB`` model on the training split, persists it to
    ``<output_dir>/bayes_model.m``, and prints accuracy on the test split.

    Args:
        config: Nested dict with 'data_params' and 'model_params' sections
            (exact keys read below; schema is project-defined — TODO confirm).
        word2vec: Word-vector iterator handed through to the dataset builder.
        tokenizer: Tokenizer handed through to the dataset builder.
    """
    from datasets.dataset import Dataset, DatasetParam

    data_params = config['data_params']
    dataset_args = DatasetParam()
    dataset_args.output_dir = data_params['output_dir']
    dataset_args.embed_dim = data_params['embed_dim']
    dataset_args.max_sentence_len = data_params['max_sentence_len']
    dataset_args.min_word_freq = data_params['min_word_freq']
    dataset_args.max_vocab_size = data_params['max_vocab_size']
    dataset_args.test_rate = data_params['test_rate']
    dataset_args.tokenizer = tokenizer
    dataset_args.data_dir = data_params['data_dir']
    dataset_args.cate_list = config['model_params']['cate_list']
    dataset_args.word2vec_iterator = word2vec
    dataset_args.data_vocab_dir = data_params['data_vocab_dir']
    dataset_args.data_vocab_tag = str(data_params['data_vocab_tag'])
    dataset_args.data_file = data_params['data_file']
    dataset = Dataset(dataset_args)

    # Build train/test splits; each split is an iterable of (x, y) pairs.
    train_set, test_set = dataset.buildWithAllData(False)
    x_train, y_train = zip(*train_set)
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test, y_test = zip(*test_set)
    x_test = np.array(x_test)
    y_test = np.array(y_test)

    # Load the Bayes model.
    # FIX: ``sklearn.externals.joblib`` was removed in scikit-learn 0.23;
    # import the standalone joblib package directly.
    from sklearn.naive_bayes import BernoulliNB
    import joblib
    classifier = BernoulliNB()

    # Train the model and save it.
    classifier.fit(x_train, y_train)
    joblib.dump(classifier,
                os.path.join(dataset_args.output_dir, 'bayes_model.m'))

    # Evaluate: vectorized accuracy instead of a Python-level loop.
    y_ = classifier.predict(x_test)
    acc = np.mean(y_ == y_test)
    print("eval acc: %f" % acc)
def train(self, config, word2vec, tokenizer):
    """Train an XGBoost text classifier and report test accuracy.

    Builds the dataset described by ``config['data_params']``, fits an
    ``XGBClassifier`` configured from ``config['xgboost_params']`` with
    early stopping on the test split, persists the model to
    ``<output_dir>/xgboost_model.m``, and prints test accuracy.

    Args:
        config: Nested dict with 'data_params', 'model_params' and
            'xgboost_params' sections (exact keys read below; schema is
            project-defined — TODO confirm).
        word2vec: Word-vector iterator handed through to the dataset builder.
        tokenizer: Tokenizer handed through to the dataset builder.
    """
    from datasets.dataset import Dataset, DatasetParam

    data_params = config['data_params']
    dataset_args = DatasetParam()
    dataset_args.output_dir = data_params['output_dir']
    dataset_args.embed_dim = data_params['embed_dim']
    dataset_args.max_sentence_len = data_params['max_sentence_len']
    dataset_args.min_word_freq = data_params['min_word_freq']
    dataset_args.max_vocab_size = data_params['max_vocab_size']
    dataset_args.test_rate = data_params['test_rate']
    dataset_args.tokenizer = tokenizer
    dataset_args.data_dir = data_params['data_dir']
    dataset_args.cate_list = config['model_params']['cate_list']
    dataset_args.word2vec_iterator = word2vec
    dataset_args.data_vocab_dir = data_params['data_vocab_dir']
    dataset_args.data_vocab_tag = str(data_params['data_vocab_tag'])
    dataset_args.data_file = data_params['data_file']
    dataset = Dataset(dataset_args)

    # Load the xgboost hyper-parameters from config.
    xgb_params = config['xgboost_params']
    xgboost_args = dict()
    xgboost_args['learning_rate'] = xgb_params['learning_rate']
    # Number of trees, i.e. total boosting rounds.
    xgboost_args['n_estimators'] = xgb_params['n_estimators']
    # Maximum tree depth.
    xgboost_args['max_depth'] = xgb_params['max_depth']
    # Minimum leaf-node weight.
    xgboost_args['min_child_weight'] = xgb_params['min_child_weight']
    # Penalty coefficient on the number of leaves.
    xgboost_args['gamma'] = xgb_params['gamma']
    # Row subsampling ratio per tree.
    xgboost_args['subsample'] = xgb_params['subsample']
    # Column subsampling ratio per tree (config key keeps the legacy
    # 'colsample_btree' spelling for backward compatibility).
    xgboost_args['colsample_btree'] = xgb_params['colsample_btree']
    # Loss function / learning objective.
    xgboost_args['objective'] = xgb_params['objective']
    # Compensates for class imbalance.
    xgboost_args['scale_pos_weight'] = xgb_params['scale_pos_weight']
    # Number of CPU threads for parallel training.
    xgboost_args['nthread'] = xgb_params['nthread']
    # Random seed.
    xgboost_args['random_state'] = xgb_params['random_state']
    # Number of target classes.
    xgboost_args['num_class'] = xgb_params['num_class']
    xgboost_args['eval_metric'] = xgb_params['eval_metric']
    xgboost_args['early_stopping_rounds'] = xgb_params[
        'early_stopping_rounds']
    xgboost_args['verbose'] = xgb_params['verbose']

    # Build train/test splits; each split is an iterable of (x, y) pairs.
    train_set, test_set = dataset.buildWithAllData(False)
    x_train, y_train = zip(*train_set)
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test, y_test = zip(*test_set)
    x_test = np.array(x_test)
    y_test = np.array(y_test)

    # Load the xgboost model.
    # FIX: ``sklearn.externals.joblib`` was removed in scikit-learn 0.23;
    # import the standalone joblib package directly.
    from xgboost import XGBClassifier
    import joblib
    classifier = XGBClassifier(
        learning_rate=xgboost_args['learning_rate'],
        n_estimators=xgboost_args['n_estimators'],
        max_depth=xgboost_args['max_depth'],
        min_child_weight=xgboost_args['min_child_weight'],
        gamma=xgboost_args['gamma'],
        subsample=xgboost_args['subsample'],
        # FIX: the real xgboost keyword is 'colsample_bytree'; the old
        # misspelling 'colsample_btree' was swallowed by **kwargs and the
        # setting never took effect.
        colsample_bytree=xgboost_args['colsample_btree'],
        objective=xgboost_args['objective'],
        scale_pos_weight=xgboost_args['scale_pos_weight'],
        nthread=xgboost_args['nthread'],
        random_state=xgboost_args['random_state'],
        num_class=xgboost_args['num_class'])

    # Train with early stopping on the held-out split, then save the model.
    classifier.fit(
        x_train,
        y_train,
        eval_set=[(x_test, y_test)],
        eval_metric=xgboost_args['eval_metric'],
        early_stopping_rounds=xgboost_args['early_stopping_rounds'],
        verbose=xgboost_args['verbose'])
    joblib.dump(classifier,
                os.path.join(dataset_args.output_dir, 'xgboost_model.m'))

    # Evaluate: vectorized accuracy instead of a Python-level loop.
    y_ = classifier.predict(x_test)
    acc = np.mean(y_ == y_test)
    print("eval acc: %f" % acc)