def test_warm_start(random_state=42):
    """Fitting incrementally with warm_start must grow the ensemble to the
    requested size and reproduce the estimator seeds of a single cold fit."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            # First round: create the warm-startable ensemble.
            clf_ws = BalancedBaggingClassifier(n_estimators=n_estimators,
                                               random_state=random_state,
                                               warm_start=True)
        else:
            # Later rounds: grow the already-fitted ensemble in place.
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BalancedBaggingClassifier(n_estimators=10,
                                          random_state=random_state,
                                          warm_start=False)
    clf_no_ws.fit(X, y)

    # The per-estimator seeds must match between the two fitting strategies.
    seeds_warm = {pipe.steps[-1][1].random_state for pipe in clf_ws}
    seeds_cold = {pipe.steps[-1][1].random_state for pipe in clf_no_ws}
    assert seeds_warm == seeds_cold
def test_warm_start_smaller_n_estimators():
    """A warm-started refit with fewer estimators than already fitted
    must raise a ValueError."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    # Shrinking the ensemble is not a valid warm-start transition.
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
def test_oob_score_removed_on_warm_start():
    """Refitting with oob_score=False must drop the stale `oob_score_`
    attribute left over from the previous fit."""
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)

    clf = BalancedBaggingClassifier(n_estimators=50, oob_score=True)
    clf.fit(X, y)

    # Second, warm-started fit without OOB scoring.
    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)

    assert_raises(AttributeError, getattr, clf, "oob_score_")
def test_warm_start_equivalence():
    """A classifier warm-started as 5 + 5 estimators should predict the
    same as one trained with 10 estimators in a single fit."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    # Incremental: fit 5 estimators, then grow to 10.
    clf_incremental = BalancedBaggingClassifier(n_estimators=5,
                                                warm_start=True,
                                                random_state=3141)
    clf_incremental.fit(X_train, y_train)
    clf_incremental.set_params(n_estimators=10)
    clf_incremental.fit(X_train, y_train)
    pred_incremental = clf_incremental.predict(X_test)

    # Single-shot: 10 estimators at once with the same seed.
    clf_single = BalancedBaggingClassifier(n_estimators=10,
                                           warm_start=False,
                                           random_state=3141)
    clf_single.fit(X_train, y_train)
    pred_single = clf_single.predict(X_test)

    assert_array_almost_equal(pred_incremental, pred_single)
def test_warm_start(random_state=42):
    """Fitting incrementally with warm_start gives a forest of the right
    size and the same per-estimator seeds as a normal fit.

    NOTE(review): this redefines ``test_warm_start`` from earlier in the
    file and shadows that first definition — one of the two copies should
    be renamed or removed so both actually run.
    """
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BalancedBaggingClassifier(n_estimators=n_estimators,
                                               random_state=random_state,
                                               warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BalancedBaggingClassifier(n_estimators=10,
                                          random_state=random_state,
                                          warm_start=False)
    clf_no_ws.fit(X, y)

    # Idiom fix: set comprehensions instead of set([...]) around a list
    # comprehension (avoids building the intermediate list).
    assert ({pipe.steps[-1][1].random_state for pipe in clf_ws} ==
            {pipe.steps[-1][1].random_state for pipe in clf_no_ws})
class Models(object):
    """Classical-ML text classifiers built on multimodal engineered features
    (tfidf / word2vec / LDA / basic NLP stats / book-cover CNN embeddings /
    Bert sentence embeddings)."""

    def __init__(self, model_path=None, feature_engineer=False,
                 train_mode=True):
        """Load the pretrained encoders and select the estimator(s).

        model_path: path of a previously saved model; used when
            ``train_mode`` is False.
        feature_engineer: True -> train a single LightGBM on the engineered
            feature matrix; False -> keep a list of classic models to compare.
        train_mode: False -> load a saved model for prediction only.
        """
        # Image encoders (resnet, resnext, wide resnet) moved to the
        # configured device (GPU when available).
        self.res_model = torchvision.models.resnet152(pretrained=True).to(
            config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True).to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(
            pretrained=True).to(config.device)
        # Bert tokenizer + encoder.
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path +
                                              '/model/bert').to(config.device)
        # MLData: debug_mode=True uses only a subset of the data;
        # train_mode toggles training-time preprocessing.
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)
        if not train_mode:
            # Prediction mode: load the trained model and the id -> label map.
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}
        else:
            if feature_engineer:
                # Single LightGBM trained on the engineered features.
                self.model = lgb.LGBMClassifier(objective='multiclass',
                                                n_jobs=10,
                                                num_class=33,
                                                num_leaves=30,
                                                reg_alpha=10,
                                                reg_lambda=200,
                                                max_depth=3,
                                                learning_rate=0.05,
                                                n_estimators=2000,
                                                bagging_freq=1,
                                                bagging_fraction=0.9,
                                                feature_fraction=0.8,
                                                seed=1440)
            else:
                # Classic models to benchmark against each other.
                self.models = [
                    RandomForestClassifier(n_estimators=500,
                                           max_depth=5,
                                           random_state=0),
                    LogisticRegression(solver='liblinear', random_state=0),
                    MultinomialNB(),
                    SVC(),
                    lgb.LGBMClassifier(objective='multiclass',
                                       n_jobs=10,
                                       num_class=33,
                                       num_leaves=30,
                                       reg_alpha=10,
                                       reg_lambda=200,
                                       max_depth=3,
                                       learning_rate=0.05,
                                       n_estimators=2000,
                                       bagging_freq=1,
                                       bagging_fraction=0.8,
                                       feature_fraction=0.8),
                ]

    def feature_engineer(self):
        """Build all features for the train and dev splits.

        Returns (X_train, X_test, y_train, y_test).
        """
        print(" generate embedding feature ")
        # tfidf features plus unaggregated word2vec features.
        # get_embedding_feature adds, per row:
        #   w2v:             sentence tokens encoded by w2v, [seq, 300]
        #   w2v_label_mean / w2v_label_max: sentence-vs-label features, [300]
        #   w2v_mean / w2v_max:             [seq, 300] -> [300]
        #   w2v_win_{2,3,4}_{mean,max}:     sliding-window features, [300]
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.tfidf,
                                                   self.ml_data.w2v)
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.tfidf,
                                                 self.ml_data.w2v)
        print("generate basic feature ")
        # Basic NLP statistics (lengths, counts, ...).
        train = get_basic_feature(train)
        test = get_basic_feature(test)
        print("generate lda feature ")
        # Bag-of-words, e.g. one row: [(10, 1), (78, 1), (162, 3), ...]
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        # LDA embedding on top of bow: a 30-topic probability distribution.
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                test['bow']))
        print("generate modal feature ")
        # Match book covers by title; empty string when no cover exists.
        cover = os.listdir(config.book_cover_path)
        train['cover'] = train['title'].progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')
        # Cover embeddings from the three CV encoders.
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        print("generate bert feature ")
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        # Autoencoder features (encoder-side embedding) are disabled for now.
        print("formate data")
        # Concatenate every feature column into the final matrix.
        train = formate_data(train, train_tfidf)
        test = formate_data(test, test_tfidf)
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        print(X_test)
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        """Search hyper-parameters by grid search or bayesian optimization.

        Returns the best params only for the bayesian branch; the grid
        branch updates ``self.model`` in place.
        """
        if search_method == 'grid':
            print("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            print("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            print("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        """Resample unbalanced data, tune, train, evaluate and save.

        imbalance_method: 'over_sampling' (SMOTE), 'under_sampling'
            (ClusterCentroids) or 'ensemble' (BalancedBaggingClassifier).
        search_method: 'grid' or 'bayesian' (see param_search).
        """
        print("get all feature")
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None
        if imbalance_method == 'over_sampling':
            print("Use SMOTE deal with unbalance data ")
            # https://www.zhihu.com/question/269698662
            # https://www.cnblogs.com/kamekin/p/9824294.html
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            # BUG FIX: the test split was previously resampled from the TRAIN
            # data, silently replacing the held-out set with training samples.
            # NOTE(review): resampling the test set at all inflates the
            # reported metrics; consider evaluating on the raw test split.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            print("Use ClusterCentroids deal with unbalance data")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        print('search best param')
        # Apply the searched best params to the model via set_params.
        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])
        print('fit model ')
        self.model.fit(self.X_train, self.y_train)
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # per: train accuracy; acc/recall/f1: test-set metrics.
        print('Train accuracy %s' % per)
        print('test accuracy %s' % acc)
        print('test recall %s' % recall)
        print('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        """Train each candidate model on the given features and report
        train/test metrics for comparison."""
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            print(model_name + '_' + 'Train accuracy %s' % per)
            print(model_name + '_' + ' test accuracy %s' % acc)
            print(model_name + '_' + 'test recall %s' % recall)
            print(model_name + '_' + 'test F1_score %s' % f1)

    def process(self, title, desc):
        """Build the model-input feature row for a single (title, desc)."""
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        # Hoisted out of the lambda: the original called
        # get_stop_word_list() once per word.
        stop_words = get_stop_word_list()
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x: [word for word in x if word not in stop_words])
        df_tfidf, df = get_embedding_feature(df, self.ml_data.tfidf,
                                             self.ml_data.w2v)
        print("generate basic feature ")
        df = get_basic_feature(df)
        print("generate modal feature ")
        # No cover at prediction time -> empty path, encoders get ''.
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc), df.bow))
        print("generate autoencoder feature ")
        # Autoencoder features are disabled (see feature_engineer).
        print("formate data")
        df['labelIndex'] = 1  # dummy label, dropped from the feature columns
        df = formate_data(df, df_tfidf)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        """Predict the book category for the given title and description.

        Returns (label, proba) — the decoded label name and the maximum
        class probability.
        """
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        """Persist the trained model under model/ml_model/<model_name>."""
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        """Load a previously saved model from ``path`` into self.model."""
        self.model = joblib.load(path)
class Models(object):
    """LightGBM text classifier built on multimodal engineered features
    (tfidf / w2v / autoencoder / LDA / basic stats / cover CNN / Bert)."""

    def __init__(self, model_path=None, feature_engineer=False,
                 train_mode=True):
        """Load the pretrained encoders and build or load the model.

        model_path: saved model path; used when ``train_mode`` is False.
        feature_engineer: kept for interface compatibility.
            NOTE(review): unlike the sibling Models classes, this flag is
            ignored here — confirm whether that is intentional.
        train_mode: True builds a fresh LGBMClassifier; False loads a
            saved model and the id -> label mapping.
        """
        # Image encoders moved to the configured device (GPU when available).
        ###########################################
        #          TODO: module 2 task 2.1       #
        ###########################################
        self.res_model = torchvision.models.resnet152(
            pretrained=True)  # res model for modal feature [1* 1000]
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)
        # Bert tokenizer + encoder.
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)
        # MLData: debug_mode=True uses only a subset of the data.
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)
        if train_mode:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)
        else:
            # Prediction mode: load the trained model and the id -> label map.
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}

    def feature_engineer(self):
        """Build all features for the train and dev splits.

        Returns (X_train, X_test, y_train, y_test).
        """
        logger.info("generate embedding feature ")
        # tfidf features plus unaggregated word2vec features.
        ###########################################
        #          TODO: module 3 task 1.1        #
        ###########################################
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.em.tfidf,
                                                   self.ml_data.em.w2v)
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.em.tfidf,
                                                 self.ml_data.em.w2v)
        logger.info("generate autoencoder feature ")
        # Autoencoder embedding, taken from the encoder (not the decoder).
        train_ae = get_autoencoder_feature(
            train,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)
        test_ae = get_autoencoder_feature(
            test,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)
        logger.info("generate basic feature ")
        # Basic NLP statistics.
        train = get_basic_feature(train)
        test = get_basic_feature(test)
        logger.info("generate modal feature ")
        # Match book covers by title; empty string when no cover exists.
        cover = os.listdir(config.root_path + '/data/book_cover/')
        train['cover'] = train['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        # Cover embeddings from the three CV encoders.
        ###########################################
        #          TODO: module 3 task 1.2        #
        ###########################################
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        logger.info("generate bert feature ")
        ###########################################
        #          TODO: module 3 task 1.3        #
        ###########################################
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        logger.info("generate lda feature ")
        ###########################################
        #          TODO: module 3 task 1.4        #
        ###########################################
        # Bag-of-words, then LDA embedding on top of it.
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))
        logger.info("formate data")
        # Concatenate every feature column into the final matrix.
        train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(test, test_tfidf, test_ae)
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        """Search hyper-parameters by grid search or bayesian optimization.

        Returns the best params only for the bayesian branch; the grid
        branch updates ``self.model`` in place.
        """
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            logger.info("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        """Resample unbalanced data, tune, train, evaluate and save.

        imbalance_method: 'under_sampling' (ClusterCentroids),
            'over_sampling' (SMOTE) or 'ensemble'
            (BalancedBaggingClassifier).
        search_method: 'grid' or 'bayesian' (see param_search).
        """
        logger.info("get all freature")
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None
        ###########################################
        #          TODO: module 4 task 1.1        #
        ###########################################
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            # BUG FIX: the test split was previously resampled from the TRAIN
            # data, silently replacing the held-out set with training samples.
            # NOTE(review): resampling the test set at all inflates the
            # reported metrics; consider evaluating on the raw test split.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')
        # Apply the best params found by the search via set_params.
        if imbalance_method != 'ensemble':
            ###########################################
            #          TODO: module 4 task 1.2        #
            ###########################################
            # param = self.param_search(search_method=search_method)
            # param['params']['num_leaves'] = int(param['params']['num_leaves'])
            # param['params']['max_depth'] = int(param['params']['max_depth'])
            # Placeholder until the search above is re-enabled.
            param = {}
            param['params'] = {}
            param['params']['num_leaves'] = 3
            param['params']['max_depth'] = 5
            self.model = self.model.set_params(**param['params'])
        logger.info('fit model ')
        self.model.fit(self.X_train, self.y_train)
        ###########################################
        #          TODO: module 4 task 1.3        #
        ###########################################
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # per: train accuracy; acc/recall/f1: test-set metrics.
        logger.info('Train accuracy %s' % per)
        logger.info('test accuracy %s' % acc)
        logger.info('test recall %s' % recall)
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def process(self, title, desc):
        """Build the model-input feature row for a single (title, desc)."""
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x:
            [word for word in x if word not in self.ml_data.em.stopWords])
        df_tfidf, df = get_embedding_feature(df, self.ml_data.em.tfidf,
                                             self.ml_data.em.w2v)
        print("generate basic feature ")
        df = get_basic_feature(df)
        print("generate modal feature ")
        # No cover at prediction time -> empty path, encoders get ''.
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                df.bow))
        print("generate autoencoder feature ")
        df_ae = get_autoencoder_feature(df,
                                        self.ml_data.em.ae.max_features,
                                        self.ml_data.em.ae.max_len,
                                        self.ml_data.em.ae.encoder,
                                        tokenizer=self.ml_data.em.ae.tokenizer)
        print("formate data")
        df['labelIndex'] = 1  # dummy label, dropped from the feature columns
        df = formate_data(df, df_tfidf, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        """Predict the book category for the given title and description.

        Returns (label, proba) — the decoded label name and the maximum
        class probability.
        """
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        """Persist the trained model under model/ml_model/<model_name>."""
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        """Load a previously saved model from ``path`` into self.model."""
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        self.model = joblib.load(path)
class Models(object):
    """Experimental variant of the multimodal text classifier: local
    resnet152 weights, optional GPU LightGBM, verbose debug printing."""

    def __init__(self, feature_engineer=False):
        """Load the pretrained encoders and select the estimator(s).

        feature_engineer: True -> single (GPU) LightGBM on engineered
            features; False -> a list of classic models to compare.
        """
        # 1. resnet152 from local weights (pretrained=False + state dict)
        # 2. resnext101_32x8d   3. wide_resnet101_2   4. Bert
        print("load")
        self.res_model = torchvision.models.resnet152(pretrained=False)
        self.res_model.load_state_dict(
            torch.load(config.root_path +
                       '/model/resnet150/resnet152-b121ed2d.pth'))
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)
        # debug_mode=True uses only a subset of the data.
        self.ml_data = MLData(debug_mode=True)
        if feature_engineer:
            # Single LightGBM (GPU build) on the engineered features.
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            device='gpu',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)
        else:
            # Classic models to benchmark against each other.
            self.models = [
                RandomForestClassifier(n_estimators=500,
                                       max_depth=5,
                                       random_state=0),
                LogisticRegression(solver='liblinear', random_state=0),
                MultinomialNB(),
                SVC(),
                lgb.LGBMClassifier(objective='multiclass',
                                   n_jobs=10,
                                   num_class=33,
                                   num_leaves=30,
                                   reg_alpha=10,
                                   reg_lambda=200,
                                   max_depth=3,
                                   learning_rate=0.05,
                                   n_estimators=2000,
                                   bagging_freq=1,
                                   bagging_fraction=0.8,
                                   feature_fraction=0.8),
            ]

    def feature_engineer(self):
        """Build all features for the train and dev splits.

        Returns (X_train, X_test, y_train, y_test).
        """
        logger.info("generate embedding feature ")
        train_tfidf, test_tfidf, train, test = get_embedding_feature(
            self.ml_data)
        logger.info("generate basic feature ")
        # 1. basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)
        print(test.loc[0])
        logger.info("generate modal feature ")
        # Match book covers by title; empty string when no cover exists.
        cover = os.listdir(config.root_path + '/data/book_cover/')
        train['cover'] = train.title.progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        # 1. modal embedding; only resnet is active, the other two encoders
        # are intentionally left disabled below.
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        print(len(test.loc[0, 'res_embedding']))
        #train['resnext_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.resnext_model))
        #test['resnext_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.resnext_model))
        #train['wide_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.wide_model))
        #test['wide_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.wide_model))
        logger.info("generate bert feature ")
        # 1. bert embedding
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        print(test.loc[0])
        logger.info("generate lda feature ")
        # 1. lda feature: bag-of-words, then LDA embedding on top of it.
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        print(test['queryCutRMStopWord'])
        print(test['bow'])
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))
        print(test['lda'])
        print(test.loc[0])
        logger.info("formate data")
        print(test)
        print(test_tfidf)
        train, test = formate_data(train, test, train_tfidf, test_tfidf)
        print(test)
        print(test.loc[0])
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        print(cols)
        X_train = train[cols]
        X_test = test[cols]
        print(X_test)
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        print(y_test)
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        """Search hyper-parameters by grid search or bayesian optimization.

        Returns the best params only for the bayesian branch; the grid
        branch updates ``self.model`` in place.
        """
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            logger.info("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        """Resample unbalanced data, tune, train, evaluate and save.

        imbalance_method: 'under_sampling' (ClusterCentroids),
            'over_sampling' (SMOTE) or 'ensemble'
            (BalancedBaggingClassifier).
        search_method: 'grid' or 'bayesian' (see param_search).
        """
        logger.info("get all freature")
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")
            # 1. over-sample with SMOTE
            print(self.y_train)
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            print(self.y_train)
            # BUG FIX: the test split was previously resampled from the TRAIN
            # data, silently replacing the held-out set with training samples.
            # NOTE(review): resampling the test set at all inflates the
            # reported metrics; consider evaluating on the raw test split.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")
            # 1. under-sample with ClusterCentroids
            print(self.X_train)
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            print(self.X_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')
        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])
        logger.info('fit model ')
        self.model.fit(self.X_train, self.y_train)
        # 1. predict test labels  2. predict train labels
        # 3. compute precision, accuracy, recall, f1
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # per: train accuracy; acc/recall/f1: test-set metrics.
        logger.info('Train accuracy %s' % per)
        logger.info('test accuracy %s' % acc)
        logger.info('test recall %s' % recall)
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        """Train each candidate model on the given embedding features
        (tfidf / word2vec / fasttext) and report train/test metrics."""
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            logger.info(model_name + '_' + 'Train accuracy %s' % per)
            logger.info(model_name + '_' + ' test accuracy %s' % acc)
            logger.info(model_name + '_' + 'test recall %s' % recall)
            logger.info(model_name + '_' + 'test F1_score %s' % f1)

    def predict(self, title, desc):
        """Predict the book category for the given title and description.

        Returns (label, proba).
        NOTE(review): this class defines no ``process`` method, so this
        call raises AttributeError as written — port ``process`` from one
        of the sibling Models implementations or inherit it.
        """
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        """Persist the trained model under model/ml_model/<model_name>."""
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        """Load a previously saved model from ``path`` into self.model."""
        self.model = joblib.load(path)