def run(data):
    """Fit the project ``CV`` helper on labelled rows and label the rest.

    Rows whose ``ret`` equals -1 form the test set; every other row is used
    for training on the fixed feature subset below.

    Args:
        data: DataFrame with a ``ret`` column, a ``file_name`` column and the
            feature columns listed in ``feat_arr``.

    Returns:
        DataFrame with columns ``id`` (the test rows' ``file_name``) and
        ``ret``, where predictions above 0.01 are set to 1 and predictions
        at or below 0.01 are set to 0.
    """
    train = data.loc[data.ret != -1].reset_index(drop=True)
    test = data.loc[data.ret == -1].reset_index(drop=True)

    feat_arr = [
        '162', '110', '86', '168', '8', '84', '113', '96', '60', '108',
        '194', '170', '66', '89', '165', '192', '24', '18', '366', '258',
        '354', '360', '11', '276', '120', '158', '270', '246', '372', '6',
        '12', '164', '342', '81', '57', '254', '252', '63', '176', '374',
        '77'
    ]
    lgb_params = {
        'boosting_type': 'gbdt',
        'num_leaves': 150,
        'reg_alpha': 0.,
        'reg_lambda': 1,
        'n_estimators': 60,
        'objective': 'binary',
        'subsample': 0.9,
        'colsample_bytree': 0.9,
        'learning_rate': 0.1,
        'min_child_weight': 5
    }

    # NOTE(review): CV is a project helper; presumably it runs repeated
    # k-fold LightGBM training -- confirm against its definition.
    s = CV(_df=train[['ret'] + feat_arr],
           label_name='ret',
           random_state=3,
           is_val=False)
    s.CV(is_print=False, lgb_params=lgb_params, round_cv=3, n_splits=10)
    pred = s.get_result(test[feat_arr])

    result = test[['file_name']].reset_index(drop=True).copy()
    result['ret'] = pred
    # Assign through a single .loc call: the chained form
    # ``result['ret'].loc[mask] = v`` may write to a temporary and is a
    # no-op under pandas copy-on-write.
    result.loc[result['ret'] > 0.01, 'ret'] = 1
    result.loc[result['ret'] <= 0.01, 'ret'] = 0
    result = result.rename(columns={'file_name': 'id'})
    return result
def _print_filter_categories():
    """Print each key of the module-level ``CV`` mapping with its terms."""
    for k, v in CV.items():
        print('{key}: {values}'.format(
            key=k,
            values=', '.join('{}'.format(', '.join(x.split())) for x in v)))


def filterChoice(filterFile):
    """Interactively ask which term(s) ``filterFile`` should be filtered by.

    The user first picks between filtering by one term (1) or several
    terms (2); the available categories are then listed and the term(s)
    are read from standard input.

    Args:
        filterFile: Name shown in the first prompt.

    Returns:
        None. The chosen terms are only echoed or parsed, not returned.
    """
    prompt = ('Do you want to filter ' + filterFile +
              ' by 1 or multiple terms? Choose \n 1. 1 term'
              '\n 2. Multiple terms \n')
    try:
        noOfFilters = int(input(prompt))
    except ValueError:
        # A non-numeric answer is just an invalid menu choice; fall through
        # to the "choose 1 or 2" message instead of crashing.
        noOfFilters = None

    if noOfFilters == 1:
        print("What term to you want to filter by? Here are the categories:")
        _print_filter_categories()
        filterTerm = input("What term do you want to filter by? \n")
        if any(filterTerm in value for value in CV.values()):
            print('Filter by ' + filterTerm)
        else:
            print("Please choose a category to filter by")
    elif noOfFilters == 2:
        print("What term to you want to filter by? Here are the categories:")
        _print_filter_categories()
        filterTerms = input(
            "What terms do you want to filter by? (Split up the terms with a ', ' (comma)) \n"
        )
        # NOTE(review): the parsed terms are neither used nor returned yet.
        terms_list = re.split(r"[, ] ", filterTerms)
    else:
        print('Choose either options 1 or 2')
def test_get_seed_exists(self):
    '''
    Tests that get_seed() returns a seed when CV is built with seed=None
    '''
    X = pd.DataFrame([1, 2, 3, 4, 5, 6])
    Y = pd.DataFrame([1, 2, 3, 4, 4, 5])
    kfold = CV(X, Y, seed=None)
    # Check for presence rather than truthiness: a seed of 0 is falsy and
    # would fail assertTrue.
    self.assertIsNotNone(kfold.get_seed())
def test_shuffle_method_x(self):
    '''
    The first frame returned by shuffle() must not equal the original
    feature dataframe
    '''
    features = pd.DataFrame([1, 2, 3, 4, 5, 6])
    labels = pd.DataFrame([1, 2, 3, 4, 4, 5])
    splitter = CV(features, labels)
    shuffled_features, _unused = splitter.shuffle()
    is_unchanged = shuffled_features.equals(features)
    self.assertFalse(is_unchanged)
def test_shuffle_method_y(self):
    '''
    The second frame returned by shuffle() must not equal the original
    label dataframe
    '''
    features = pd.DataFrame([1, 2, 3, 4, 5, 6])
    labels = pd.DataFrame([1, 2, 3, 4, 4, 5])
    splitter = CV(features, labels)
    _unused, shuffled_labels = splitter.shuffle()
    is_unchanged = shuffled_labels.equals(labels)
    self.assertFalse(is_unchanged)
def test_get_seed_returns_correct(self):
    '''
    get_seed() must hand back exactly the seed passed to the constructor
    '''
    expected_seed = 42
    features = pd.DataFrame([1, 2, 3, 4, 5, 6])
    labels = pd.DataFrame([1, 2, 3, 4, 4, 5])
    splitter = CV(features, labels, seed=expected_seed)
    actual_seed = splitter.get_seed()
    self.assertEqual(expected_seed, actual_seed)
def Loop(self):
    """Run ``self.loop`` rounds of CV and average the results per ``ID``.

    Each round draws data with ``self.getdata()``, builds a ``CV`` object,
    calls its ``nFold()`` and collects its ``out`` frame.

    Returns:
        The stacked ``out`` frames grouped by the ``ID`` column, with the
        mean applied to every group.
    """
    frames = []
    for _ in range(self.loop):
        data = self.getdata()
        cv = CV(data, self.label_col, self.n_splits)
        cv.nFold()
        frames.append(cv.out)
    # Concatenate once: calling pd.concat inside the loop re-copies the
    # accumulated frame on every round. With zero rounds keep the empty
    # frame the original started from.
    out = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    out = out.groupby('ID').apply(lambda x: x.mean())
    return out
def test_CV_split(self):
    '''
    split(n_splits=3) must return a result whose first two elements each
    have length 3
    '''
    features = pd.DataFrame([1, 2, 3, 4, 5, 6])
    labels = pd.DataFrame([1, 2, 3, 4, 4, 5])
    fold_count = 3
    splitter = CV(features, labels)

    # split() is invoked once per assertion, as in the original test.
    first_part = splitter.split(n_splits=fold_count)[0]
    self.assertEqual(len(first_part), 3)
    second_part = splitter.split(n_splits=fold_count)[1]
    self.assertEqual(len(second_part), 3)
def test_seed_shuffle(self):
    '''
    Tests to see whether inputting a seed creates reproducible X and Y
    dataframes.
    '''
    X = pd.DataFrame([1, 2, 3, 4, 5, 6])
    Y = pd.DataFrame([1, 2, 3, 4, 4, 5])
    seed = 42
    kfold = CV(X, Y, seed=seed)

    # Expected result: the same seeded sample with a fresh 0..n-1 index.
    x = X.sample(frac=1, random_state=seed)
    x.reset_index(drop=True, inplace=True)
    y = Y.sample(frac=1, random_state=seed)
    y.reset_index(drop=True, inplace=True)

    x_shuff, y_shuff = kfold.shuffle()
    self.assertTrue(x.equals(x_shuff))
    # The expected Y frame was built but never checked before.
    self.assertTrue(y.equals(y_shuff))
def test_init_shuffle_x(self):
    '''
    After construction the stored ``x`` frame must differ from the
    feature dataframe that was passed in
    '''
    features = pd.DataFrame([1, 2, 3, 4, 5, 6])
    labels = pd.DataFrame([1, 2, 3, 4, 4, 5])
    splitter = CV(features, labels)
    is_unchanged = splitter.x.equals(features)
    self.assertFalse(is_unchanged)
def test_init_shuffle_y(self):
    '''
    After construction the stored ``y`` frame must differ from the
    label dataframe that was passed in
    '''
    features = pd.DataFrame([1, 2, 3, 4, 5, 6])
    labels = pd.DataFrame([1, 2, 3, 4, 4, 5])
    splitter = CV(features, labels)
    is_unchanged = splitter.y.equals(labels)
    self.assertFalse(is_unchanged)
def test_CV_init(self):
    '''
    The constructor must yield a CV instance whose ``y`` and ``x``
    attributes are pandas DataFrames
    '''
    features = pd.DataFrame([1, 2, 3, 4, 5, 6])
    labels = pd.DataFrame([1, 2, 3, 4, 4, 5])
    splitter = CV(features, labels)

    self.assertIsInstance(splitter, CV)
    stored_labels = splitter.y
    self.assertIsInstance(stored_labels, pd.DataFrame)
    stored_features = splitter.x
    self.assertIsInstance(stored_features, pd.DataFrame)
def collectCV(self):
    """Build a ``CV`` for every (path, name, post) triple and store it.

    Walks ``self.fileNamesWithPath``, ``self.CVFileName`` and
    ``self.cvPostList`` in lockstep and appends each constructed object to
    ``self.CVList``. Any exception for one entry is printed together with
    the file name, and processing continues with the next entry.
    """
    triples = zip(self.fileNamesWithPath, self.CVFileName, self.cvPostList)
    for path, name, post in triples:
        try:
            entry = CV(name, path, post)
            self.CVList.append(entry)
        except Exception as err:
            print(name)
            print("in collection of CV \t" + str(err))
def cv_register():
    """Register a new ``CV`` session and return its session id.

    The new id is ``len(cv_list) + 1``. The session object is appended to
    the module-level ``cv_list`` and ``cv_camera_connector()`` is called
    afterwards.

    Returns:
        The new session id on success. On any exception the error is
        printed with its traceback and a ``jsonify`` payload with
        ``status`` False and the exception text is returned instead.
        NOTE(review): the two paths return different types -- confirm
        callers handle both.
    """
    try:
        new_session_id = len(cv_list) + 1
        new_cv = CV(session_id=new_session_id, status=True)
        cv_list.append(new_cv)
        cv_camera_connector()
        return new_session_id
    except Exception as e:
        print("Exception: " + str(e))
        traceback.print_exc()
        return jsonify({'status': False, 'exception': str(e)})
"\n" "\n\t You are expected to enter these options in the correct order followed" "\n\t by the URLs or Headlines that you wish to test !" "\n" "\t Input arguments needed:\n" "\n\t\t [--url/--headline] = Input a URL to extract a headline from or input headline directly\n" "\t\t [URL or TITLE) = Input the URLs and/or headlines enclosed in single '' or double \"\" quotes. \n" "\t\t [--filter] = Specifies that you would like to filter your search.\n" "\t\t [Filter(s)] = Input the terms you want the headline or URL to be filtered by. \n" "\t\t [-output] = Specifies that you would like your output saved to a specific path.\n" "\t\t [Path] = Enter the file path where you would like your output to be stored.\n" "\n\t Example Input:\n" "\n\t --headline 'David Jimson is a good bloke, apparently !' --filter Europe Person Sport\n" "\t [cont.] --output C:\Program Files\example.json" "\n\t The following terms are valid filters:") for k, v in CV.items(): print('\t\t {key}: {values}'.format(key=k, values=', '.join('{}'.format(', '.join(x.split())) for x in v))) break if was_option: continue elif last_option == "--url" or last_option == "-u": url = args[i] import urllib.request with urllib.request.urlopen('http://python.org/') as response: html = response.read() elif last_option == "--headline" or last_option == "-i": headlines.append(args[i]) elif last_option == "--filter" or last_option == "-f": CV_vals = CV.values() CV_single = [] for sublist in CV_vals:
from CV import CV
import os
import sys
import datetime

if __name__ == '__main__':
    # Create a scratch directory, construct CV inside it, run combineTs()
    # from the parent directory, then remove the scratch directory.
    cwd = r'D:\temp\新建文件夹'
    # ':' is not allowed in Windows path components; the previous
    # replace(':', ':') was a no-op and left the colons in the name.
    stamp = str(datetime.datetime.now()).replace(':', '-')
    cwd2 = cwd + '\\temp_coco56_' + stamp
    os.mkdir(cwd2)
    os.chdir(cwd2)
    try:
        ins = CV()
        os.chdir('..')
        ins.combineTs()
    finally:
        # Leave the scratch directory before deleting it so cleanup also
        # happens when CV() or combineTs() raises.
        os.chdir(cwd)
        os.rmdir(cwd2)
from preprocess import Prep
from CV import CV
from performance import Portfolio, MarketIntradayPortfolio
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from pylab import *
from datetime import datetime

# Load both index series and add the engineered features.
HS300 = getStock_C('000300')
SP500 = getStock_A('^GSPC')
HS300 = addFeatures(HS300)
SP500 = addFeatures(SP500)
# DataFrame.drop returns a new frame; the result was previously discarded,
# so the 'ADOSC' column was never actually removed.
HS300 = HS300.drop('ADOSC', axis=1)

X_train, y_train, X_test, y_test = Prep(HS300)
Classify(X_train, y_train, X_test, y_test, 'RF')
CV(X_train, y_train, 9, 'RF')

# Fit LDA on the training split and predict the test split.
clf = LDA()
y_pred = clf.fit(X_train, y_train).predict(X_test)

symbol = 'CSI300'
start_test = datetime(2014, 1, 1)
end_period = datetime(2015, 9, 29)
bars = HS300[['Open', 'AdjClose']]
bars = bars[start_test:end_period]

signals = pd.DataFrame(index=bars.index)
signals['signal'] = 0.0
signals['signal'] = y_pred
# Short the stock: turn every 0 prediction into -1. A single .loc
# assignment is used because the chained form
# ``signals.signal[mask] = -1`` may write to a temporary copy.
signals.loc[signals['signal'] == 0, 'signal'] = -1
from CV import CV
import os
import sys
import datetime

if __name__ == '__main__':
    # Create a scratch directory, run the video split inside it, then
    # remove the scratch directory.
    # In a raw string '\\' is two literal backslashes; a single one is
    # what the path needs.
    cwd = r'D:\temp\新建文件夹'
    # ':' is not allowed in Windows path components; the previous
    # replace(':', ':') was a no-op and left the colons in the name.
    stamp = str(datetime.datetime.now()).replace(':', '-')
    cwd2 = cwd + '\\temp_coco56_' + stamp
    os.mkdir(cwd2)
    os.chdir(cwd2)
    try:
        ins = CV()
        ins.split_OneVedio_Into_MultipleVedios_WithNumber(2)
    finally:
        # Leave the scratch directory before deleting it so cleanup also
        # happens when the split raises.
        os.chdir('..')
        os.rmdir(cwd2)
from CV import CV
import os
import sys
import datetime

if __name__ == '__main__':
    # Create a scratch directory, run the transcode inside it, then remove
    # the scratch directory.
    cwd = r'D:\temp\新建文件夹'
    # ':' is not allowed in Windows path components; the previous
    # replace(':', ':') was a no-op and left the colons in the name.
    stamp = str(datetime.datetime.now()).replace(':', '-')
    cwd2 = cwd + '\\temp_coco56_' + stamp
    os.mkdir(cwd2)
    os.chdir(cwd2)
    try:
        ins = CV()
        # Pass aimedFormat='.ts' instead to produce .ts output.
        ins.Transcode(aimedFormat='.mp4', dealOldFilesMode=1)
    finally:
        # Leave the scratch directory before deleting it so cleanup also
        # happens when the transcode raises.
        os.chdir('..')
        os.rmdir(cwd2)
if __name__ == '__main__':
    # Load the samples, keep only two users, and run cross-validation.
    # An earlier inline fixture showed test_data as a dict mapping a user
    # key to a list of defaultdict(list) samples.
    test_data = pp.split_samples(pp.load_data())

    # Iterate over a snapshot of the keys: deleting entries while walking a
    # live keys() view raises RuntimeError on Python 3.
    for u in list(test_data.keys()):
        if u not in {'9999999', 'SERLHOU'}:
            del test_data[u]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(test_data.keys())

    test_cv = CV(DensityAuth, test_data)
    # Exhaust the validate() iterator; its yielded values are not used here.
    for i in test_cv.validate():
        pass
    print("DONESKI")
import cv2 as cv
import time
from EmotionRecognition import EmotionRecognition
from FaceDetection import FaceDetection
from CV import CV

# True when running on a computer, False when running on a Raspberry Pi.
compV = CV(True)
image, found = compV.findFace()
if found:
    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # bare print statement is a syntax error on Python 3.
    print(compV.processEmotion(image))
import sys
import csv
import os

# Run cross-validation over all users and write per-user results to CSV.
start_time = time.time()
# %-formatting keeps the output identical on Python 2 and 3.
print('%s initializing algorithm' % (start_time,))
test_data, pkd = filter_users_val(split_samples(load_data()))
# A debug filter that restricted test_data/pkd to two users used to be
# kept here as a disabled string block.
print(test_data.keys())
test_cv = CV(DensityAuth, test_data, pkd)

# 'rw+' is not a valid open() mode; the file is only written, so use 'w'.
# NOTE(review): on Python 2 under Windows the csv module expects 'wb'.
with open('./kde_result.csv', 'w') as outfile:
    result_writer = csv.writer(outfile)
    for n, i in enumerate(test_cv.validate()):
        train_res, cv_res = i
        # One header row, the per-user rows, and a blank separator row
        # per validation round.
        result_writer.writerow(['user', 'train_IPR', 'train_FRR', 'train_GT',
                                'train_IT', 'CV_IPR', 'CV_FRR', 'CV_GT',
                                'CV_IT'])
        for u in train_res.keys():
            result_writer.writerow([u] + list(train_res[u]) + list(cv_res[u]))
        result_writer.writerow([])
        # Elapsed time is now - start; the previous start - now was negative.
        # NOTE(review): placed inside the loop (one line per round) --
        # confirm against the original indentation.
        print('%s - finished validation %s' % (time.time() - start_time, n))
def get_plot(vid_name):
    """Track the ball in a video and plot fitted trajectories.

    Runs ``CV(vid_name).run_cv()``, fits degree-2 ``PolyRegression`` models
    with OLS, TLS and RANSAC to the ``bottom_ball`` and ``top_ball`` points,
    and shows two figures (top first, then bottom) drawn over the returned
    image.

    Args:
        vid_name: Video identifier; ``vid_name[3:]`` is printed and used in
            the plot titles.
    """
    print("--------------------------------------")
    print(vid_name[3:])

    # Run CV for the given video and convert the frame from BGR to RGB.
    tracker = CV(vid_name)
    frame = cv2.cvtColor(tracker.run_cv(), cv2.COLOR_BGR2RGB)

    def fit_all(xs, ys):
        # Fit OLS, TLS and RANSAC (in that order) and return the models.
        models = []
        for method in (OLS, TLS, RANSAC):
            model = PolyRegression(xs, ys, 2, method)
            model.get_weight_vector()
            models.append(model)
        return models

    def draw(xs, models, heading):
        # Overlay the three fitted curves on the frame.
        plt.imshow(frame)
        for tag, model in zip(('OLS', 'TLS', 'RANSAC'), models):
            plt.plot(xs, model.predict(xs), label=f'{tag}, {model.eqn_str}')
        plt.legend()
        plt.title(f"{heading} Trajectory\n{vid_name[3:]}")
        plt.grid(True)

    # Estimate the trajectory for the bottom of the ball.
    bottom_points = np.asarray(tracker.bottom_ball)
    x_bottom = bottom_points[:, 0]
    y_bottom = bottom_points[:, 1]
    bottom_models = fit_all(x_bottom, y_bottom)

    # Estimate the trajectory for the top of the ball.
    top_points = np.asarray(tracker.top_ball)
    x_top = top_points[:, 0]
    y_top = top_points[:, 1]
    top_models = fit_all(x_top, y_top)

    # Plot results: top on the current figure, bottom on a new one.
    draw(x_top, top_models, "Top")
    plt.figure()
    draw(x_bottom, bottom_models, "Bottom")
    plt.show()
    print("")
def run(data, result_temp):
    """Zero out predictions that a per-class model marks as confident negatives.

    Steps, as performed below:
      1. Rows with ``ret == -1`` are the test set; the rest is training data.
      2. Each training file gets a ``multi_label`` equal to the integer name
         of the folder under ``config.TRAIN_PATH`` that contains it; class
         14 is dropped.
      3. A multiclass LightGBM model predicts ``multi_label`` and the top
         class probability (``prob``) for every test row.
      4. For every class, the project ``CV`` helper is fitted on that
         class's training rows and scores the test rows assigned to it.
      5. Test ids with ``prob > 0.999`` and a score ``< 0.1`` are forced to
         0 in ``result_temp``; all other rows keep their ``ret``.

    Args:
        data: DataFrame with ``file_name``, ``ret`` and feature columns.
        result_temp: DataFrame with ``id`` and ``ret`` columns to refine.

    Returns:
        DataFrame with columns ``id`` and integer ``ret``.
    """
    test = data.loc[data.ret == -1].reset_index(drop=True)
    data = data.loc[data.ret != -1].reset_index(drop=True)

    # Map each training file name to the integer name of its class folder.
    file_name_dict = {}
    for f1 in os.listdir(config.TRAIN_PATH):
        for f2 in os.listdir(config.TRAIN_PATH + f1):
            file_name_dict[f2] = int(f1)
    data['multi_label'] = data.file_name.apply(lambda x: file_name_dict[x])
    data = data.loc[data['multi_label'] != 14].reset_index(drop=True)

    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             num_leaves=10,
                             learning_rate=0.1,
                             n_estimators=100,
                             subsample_for_bin=200000,
                             objective='multiclass',
                             min_child_weight=1,
                             min_child_samples=20,
                             subsample=0.7,
                             subsample_freq=0,
                             colsample_bytree=0.7,
                             reg_alpha=0.0,
                             reg_lambda=0.0,
                             random_state=3)
    train_x, val_x, train_y, val_y = train_test_split(
        data.drop(['file_name', 'ret', 'multi_label'], axis=1),
        data['multi_label'],
        random_state=3,
        test_size=0.3)
    # NOTE(review): verbose / early_stopping_rounds as fit() keywords look
    # version-dependent in LightGBM -- confirm against the installed release.
    clf.fit(train_x,
            train_y,
            verbose=False,
            early_stopping_rounds=100,
            eval_metric='logloss',
            eval_set=[(val_x, val_y)])

    # Predicted class and its probability for every test row. The unused
    # validation/test result frames of the earlier version are dropped.
    feature_cols = train_x.columns.tolist()
    test['multi_label'] = clf.predict(test[feature_cols])
    test['prob'] = np.max(clf.predict_proba(test[feature_cols]), axis=1)

    # Train one binary model per class and score that class's test rows.
    print('training...')
    result_dict = {}
    result_prob_dict = {}
    c = Counter(data.multi_label)
    for class_ in tqdm(list(c.keys())):
        lgb_params = {
            'boosting_type': 'gbdt',
            'num_leaves': 8,
            'reg_alpha': 0.,
            'reg_lambda': 1,
            'n_estimators': 30,
            'objective': 'binary',
            'subsample': 0.7,
            'colsample_bytree': 0.6,
            'learning_rate': 0.1,
            'min_child_weight': 1
        }
        class_train = data.loc[data.multi_label == class_].drop(
            ['file_name', 'multi_label'], axis=1).reset_index(drop=True)
        s = CV(_df=class_train, label_name='ret')
        s.CV(is_print=False, lgb_params=lgb_params, round_cv=3, n_splits=8)

        test_temp = test.loc[test.multi_label == class_].reset_index(drop=True)
        pred_temp = s.get_result(
            test_temp.drop(['file_name', 'multi_label', 'prob', 'ret'], axis=1))
        for i in range(len(test_temp)):
            name = test_temp['file_name'][i]
            result_dict[name] = pred_temp[i]
            result_prob_dict[name] = test_temp['prob'][i]

    # Materialise the dict views as lists before assigning them as columns.
    df = pd.DataFrame(index=range(len(result_dict)))
    df['id'] = list(result_dict.keys())
    df['ret'] = list(result_dict.values())
    df['prob'] = list(result_prob_dict.values())
    df['multi_score'] = 2 * (1 - df.ret)**2 * df.prob / ((1 - df.ret)**2 + df.prob)

    # Ids that are both confidently classified and scored as negatives.
    confident = np.logical_and(df.prob > 0.999, df.ret < 0.1)
    zero_ids = set(df.loc[confident, 'id'])
    print(len(zero_ids))

    result = result_temp.copy()
    result['pred_2'] = result['id'].apply(lambda x: 0 if x in zero_ids else 1)
    result['pred_2'] = result['pred_2'] * result['ret']
    r = result[['id', 'pred_2']].copy()
    r.columns = ['id', 'ret']
    r['ret'] = r['ret'].astype(int)
    return r
def run(data, result_best): feat_arr = [ '185_new', '237_new', '176_new', '243_new', '544_new', '85_new', '245_new', '103_new', '249_new', '83_new', '545_new', '555_new', '183_new', '187_new', '135_new', '161_new', '89_new', '171_new', '242_new', '529_new', '91_new', '146_new', '547_new', '123_new', '576_new', '97_new', '447_new', '475_new', '141_new', '143_new', '159_new', '452_new', '540_new', '543_new', '239_new', '573_new', '145_new', '163_new', '181_new', '355_new' ] # 名字转换 temp_1 = os.listdir(config.TRAIN_PATH)[0] d = pd.read_csv(config.TRAIN_PATH + '/' + temp_1 + '/' + os.listdir(config.TRAIN_PATH + '/' + temp_1)[0]) name_lst = [] for col in d.columns: name_lst.append(col + '_var') for col in d.columns.tolist() + ['_功角', '_视在功率', '_变频器出入口温差', '_变频器出入口压力']: name_lst.append(col + '_mean') name_lst.append(col + '_min') name_lst.append(col + '_max') name_lst.append(col + '_ptp') name_lst.append(col + '_median') name_lst.append(col + '_sum') for col in [['叶片1角度', '叶片2角度', '叶片3角度'], ['变桨电机1电流', '变桨电机2电流', '变桨电机3电流'], ['x方向振动值', 'y方向振动值'], [ '发电机定子温度1', '发电机定子温度2', '发电机定子温度3', '发电机定子温度4', '发电机定子温度5', '发电机定子温度6' ], ['发电机空气温度1', '发电机空气温度2'], ['主轴承温度1', '主轴承温度2'], ['变桨电机1功率估算', '变桨电机2功率估算', '变桨电机3功率估算'], ['叶片1电池箱温度', '叶片2电池箱温度', '叶片3电池箱温度'], ['叶片1变桨电机温度', '叶片2变桨电机温度', '叶片3变桨电机温度'], ['叶片1变频器箱温度', '叶片2变频器箱温度', '叶片3变频器箱温度'], ['叶片1超级电容电压', '叶片2超级电容电压', '叶片3超级电容电压'], ['驱动1晶闸管温度', '驱动2晶闸管温度', '驱动3晶闸管温度'], ['驱动1输出扭矩', '驱动2输出扭矩', '驱动3输出扭矩']]: name_lst.append('_'.join(col) + '_mean') name_lst.append('_'.join(col) + '_sum') name_lst.append('_'.join(col) + '_var') dict_name = {} col_lst = data.columns.tolist()[:-1] for i in range(len(name_lst)): dict_name[col_lst[i]] = name_lst[i] data = data[ feat_arr + [str(name_lst.index('液压制动压力_max')) + '_new', 'ret', 'file_name']] data.columns = [ dict_name[i] for i in feat_arr + [str(name_lst.index('液压制动压力_max')) + '_new'] ] + ['ret', 'file_name'] test = data.loc[data.ret == -1].reset_index(drop=True) data = data.loc[data.ret != 
-1].reset_index(drop=True) file_name_dict = {} for f1 in os.listdir(config.TRAIN_PATH): for f2 in os.listdir(config.TRAIN_PATH + f1): file_name_dict[f2] = int(f1) data['multi_label'] = data.file_name.apply(lambda x: file_name_dict[x]) data_14 = data.loc[data['multi_label'] == 14].reset_index(drop=True) data = data.loc[data['multi_label'] != 14].reset_index(drop=True) lgb_params = { 'boosting_type': 'gbdt', 'num_leaves': 8, 'reg_alpha': 0., 'reg_lambda': 1, 'n_estimators': 50, 'objective': 'binary', 'subsample': 0.7, 'colsample_bytree': 0.6, 'learning_rate': 0.1, 'min_child_weight': 1 } feat_arr = [dict_name[i] for i in feat_arr] # ============================================================================= # '''6751 - 6755 test_02.csv 0816''' # ============================================================================= temp_test = test.loc[np.logical_and( np.logical_and(test['液压制动压力_max'] < 1.32, test['液压制动压力_max'] > 1), test['x方向振动值_mean'] < -1.5)] temp_val = temp_test # test temp_train = data s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.2: temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.2: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_best = result_sub.copy() '''6755 - 6772 submission_3.csv 0816''' temp_test = test.loc[np.logical_and( np.logical_and(test['x方向振动值_mean'] < 3.4, 
test['x方向振动值_mean'] > 1.2), np.logical_and(test['y方向振动值_mean'] < 3, test['y方向振动值_mean'] > 2))] temp_val = temp_test # test temp_train = data s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.4: temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.35: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_best = result_sub.copy() '''6772 - 6773 test.csv 0816''' temp_test = test.loc[np.logical_and( np.logical_and(test['液压制动压力_max'] > 1.32, test['液压制动压力_max'] > 1), np.logical_and( np.logical_and(test['x方向振动值_mean'] < -0.3, test['x方向振动值_mean'] < 22.05), np.logical_and(test['y方向振动值_mean'] < .8, test['y方向振动值_mean'] > 0)))] temp_val = temp_test # test temp_train = data s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.4: temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): 
dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.4: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.4: temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.4: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_best = result_sub.copy() '''6773 - 6816''' test['temp'] = test['x方向振动值_mean'] + 1.2 - test['y方向振动值_mean'] temp_test = test.loc[np.logical_and( np.logical_and( np.logical_and( test['x方向振动值_mean'] < 0.34, # 0.34 test['x方向振动值_mean'] > -0.25), np.logical_and(test['y方向振动值_mean'] > 0, test['y方向振动值_mean'] < 1.8)), test['temp'] < 0)] temp_val = temp_test # test temp_train = data s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/678.csv') dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.18: temp_lst.append(dict_result_best[temp_result.file_name[i]]) temp_val = temp_test # test temp_train = data s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, 
lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] > 0.18: temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] > 0.18: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_best = result_sub.copy() '''6822 - 6834''' temp_test = test.loc[np.logical_and( np.logical_and(test['x方向振动值_mean'] < 2, test['x方向振动值_mean'] > 1.25), np.logical_and(test['y方向振动值_mean'] < 3.2, test['y方向振动值_mean'] > 2))] temp_val = temp_test # test temp_train = data # .loc[:len(data)-1454-2496-1] # -2496, 1454 s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.34: # 0.18 0.27 temp_lst.append(dict_result_best[temp_result.file_name[i]]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/6822.csv') dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.35: # 0.18 0.27 
temp_lst.append(dict_result_best[temp_result.file_name[i]]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/6816.csv') dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.34: # 0.18 0.27 temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_best_2 = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.35 and temp_result.pred[i] > 0.35: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_best_2.ret[ix_lst] = 1 temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/6816.csv') dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.35: # 0.18 0.27 temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_best_2 = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.35 and temp_result.pred[i] > 0.35: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_best_2.ret[ix_lst] = 1 result_best_2 = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.35 and temp_result.pred[i] > 0.34: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_best_2.ret[ix_lst] = 1 result_best = result_best_2.copy() test['temp'] = test['x方向振动值_mean'] + 0.75 - test['y方向振动值_mean'] temp_test = 
test.loc[np.logical_and( np.logical_and( np.logical_and( test['y方向振动值_mean'] < 1.25, # 1.25 , 1 test['y方向振动值_mean'] > 0.75), # 0.75, 1 np.logical_and(test['x方向振动值_mean'] > 0.25, test['x方向振动值_mean'] < 0.6)), test['temp'] > 0)] temp_val = temp_test # test temp_train = data # .loc[:len(data)-1454-2496-1] # -2496, 1454 s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/6816_new.csv') dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.45: # 0.18 0.27 temp_lst.append(dict_result_best[temp_result.file_name[i]]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/6822.csv') dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.45: # 0.18 0.27 temp_lst.append(dict_result_best[temp_result.file_name[i]]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/6822.csv') dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.4: # 0.18 0.27 temp_lst.append(dict_result_best[temp_result.file_name[i]]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/6816_new.csv') dict_result_best = {} for i in range(len(result_best)): 
dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.4: # 0.18 0.27 temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.4: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_best = result_sub.copy() temp_test = test.loc[np.logical_and( np.logical_and(test['x方向振动值_mean'] < 1.7, test['x方向振动值_mean'] > 1.36), np.logical_and(test['y方向振动值_mean'] > 1.4, test['y方向振动值_mean'] < 1.8))] temp_val = temp_test # test temp_train = data # .loc[:len(data)-1454-2496-1] # -2496, 1454 s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/6822_new.csv') dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.27: # 0.18 0.27 0.4 temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_best.ret.sum() result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.27: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 # ============================================================================= # result_sub.to_csv('../V9_final/result/0820_1.csv', index=False) # result_best = pd.read_csv('../V9_final/result/0820_1.csv') # 
============================================================================= result_best = result_sub.copy() temp_test = test.loc[np.logical_and( np.logical_and(test['x方向振动值_mean'] < 1, test['x方向振动值_mean'] > 0.8), np.logical_and(test['y方向振动值_mean'] > 0.9, test['y方向振动值_mean'] < 1.1))] temp_val = temp_test # test temp_train = data # .loc[:len(data)-1454-2496-1] # -2496, 1454 s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/0820_1.csv') dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.39: # 0.18 0.27 0.4 0.27 temp_lst.append(dict_result_best[temp_result.file_name[i]]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.39: # 0.18 0.27 0.4 0.27 temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.39: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_best = result_sub.copy() test['temp'] = test['x方向振动值_mean'] + 0.2 - test['y方向振动值_mean'] temp_test = test.loc[np.logical_and( np.logical_and( test['液压制动压力_max'] > 1, np.logical_and( np.logical_and(test['x方向振动值_mean'] < 0.4, test['x方向振动值_mean'] > -0.3), np.logical_and(test['y方向振动值_mean'] > -0.3, test['y方向振动值_mean'] < 0.14))), 
test['temp'] > 0)] temp_val = temp_test # test temp_train = data # .loc[:len(data)-1454-2496-1] # -2496, 1454 s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred # result_best = pd.read_csv('../V9_final/result/0820_2.csv') dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.3: # 0.18 0.27 0.4 0.27 0.39 temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_best.ret.sum() result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.3: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_best = result_sub.copy() temp_test = test.loc[np.logical_and( np.logical_and(test['x方向振动值_mean'] < 3, test['x方向振动值_mean'] > 1.8), np.logical_and(test['y方向振动值_mean'] > 1, test['y方向振动值_mean'] < 2.1))] temp_val = temp_test # test temp_train = data # .loc[:len(data)-1454-2496-1] # -2496, 1454 s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.38: # 0.18 0.27 0.4 0.27 0.39 0.3 temp_lst.append(dict_result_best[temp_result.file_name[i]]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() 
temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.37: # 0.18 0.27 0.4 0.27 0.39 0.3 temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.37: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_best = result_sub.copy() temp_test = test.loc[np.logical_and( np.logical_and( test['x方向振动值_mean'] < -0.8, # -0.55 test['x方向振动值_mean'] > -2), np.logical_and(test['y方向振动值_mean'] > -0.9, test['y方向振动值_mean'] < -0.2))] temp_val = temp_test # test temp_train = data # .loc[:len(data)-1454-2496-1] # -2496, 1454 s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.25: # 0.18 0.27 0.4 0.27 0.39 0.3 0.37 temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.25: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_best = result_sub.copy() temp_test = test.loc[np.logical_and( np.logical_and(test['x方向振动值_mean'] < 1, test['x方向振动值_mean'] > 0), np.logical_and(test['y方向振动值_mean'] > 1.9, test['y方向振动值_mean'] < 2.5))] temp_val = 
temp_test # test temp_train = data # .loc[:len(data)-1454-2496-1] # -2496, 1454 s = CV(_df=temp_train[['ret'] + feat_arr], label_name='ret', random_state=3, is_val=False) s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1) pred = s.get_result(temp_val[feat_arr]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.2: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_sub.ret.sum() temp_result = temp_val[['file_name']].reset_index(drop=True).copy() temp_result['pred'] = pred dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = result_best.ret[i] temp_lst = [] for i in range(len(temp_result)): if temp_result.pred[ i] < 0.2: # 0.18 0.27 0.4 0.27 0.39 0.3 0.37 0.25 temp_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub = result_best.copy() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] < 0.2: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 0 result_sub.ret.sum() dict_result_best = {} for i in range(len(result_best)): dict_result_best[result_best.id[i]] = i ix_lst = [] for i in range(len(temp_result)): if temp_result.pred[i] > 0.8: ix_lst.append(dict_result_best[temp_result.file_name[i]]) result_sub.ret[ix_lst] = 1 result_best = result_sub.copy() return result_best
if __name__=='__main__': from CV import CV from preprocessor import split_samples, load_data, filter_users_val P.np.seterr(all='ignore') all_data, pkd = filter_users_val(split_samples(load_data())) for u in all_data.keys(): if all_data[u] == []: del all_data[u] del pkd[u] gbfa = CV(lambda: GammaBFAuth(all_data), all_data, pkd) with open('./bf_result.csv', 'rw+') as res_file: result_writer = csv.writer(res_file) result_writer.writerow(['user', 'CV_IPR', 'CV_FRR', 'CV_GT', 'CV_IT']) for n,i in enumerate(gbfa.validate_user('1227981')): cv_res = i ''' result_writer.writerow(['user', 'train_IPR', 'train_FRR', 'train_GT', 'train_IT', 'CV_IPR', 'CV_FRR', 'CV_GT', 'CV_IT']) for u in train_res.keys():
from CV import CV

if __name__ == '__main__':
    # Directory handed to CV as its workDir.
    work_dir = r'H:\度盘\siki学院公开课第009期-忍者跑酷 Ninja'
    converter = CV(workDir=work_dir, sleepTime=1)

    # Speed value passed to dealV. Alternatives that were tried and left
    # disabled: 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.8, 2.
    speed = 1.6

    # Remaining positional arguments of dealV, in call order.
    old_files_mode = 0
    use_gpu = False
    thread_count = 2

    converter.dealV(speed, old_files_mode, use_gpu, thread_count)
def populate():
    """Fill a ``CV`` from the module-level input objects and populate it.

    A ``CV`` is created from ``input.get()``, every setter receives the
    current ``.get()`` value of the matching module-level object, and
    finally ``CV.populate`` is called with ``output.get()``.

    NOTE(review): ``input`` must be a module-level object that shadows the
    builtin of the same name; presumably these are GUI entry widgets —
    confirm against the rest of the file.
    """
    letter = CV(input.get())

    # Job and company details.
    letter.set_job(job_title.get())
    letter.set_company_name(company_name.get())
    letter.set_company_name_short(company_name_short.get())
    letter.set_company_addr(company_addr.get())
    letter.set_company_province(company_province.get())

    # Receiver details.
    letter.set_receiver(receiver.get())
    letter.set_receiver_title(receiver_title.get())
    letter.set_receiver_last_name(receiver_last_name.get())

    # The paragraph is read with an explicit "1.0" .. "end" range.
    letter.set_paragraph(paragraph.get("1.0", "end"))

    letter.populate(output.get())
from CV import CV

if __name__ == '__main__':
    # Directory handed to CV as its workDir.
    work_dir = r'D:\s\De\度盘\C#项目开发实战入门(光盘资源)\Video'
    converter = CV(workDir=work_dir, sleepTime=0)

    # Speed value passed to dealV. Alternatives that were tried and left
    # disabled: 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.8, 2.
    speed = 1.6

    # Remaining positional arguments of dealV, in call order.
    old_files_mode = 0
    use_gpu = False
    thread_count = 8

    converter.dealV(speed, old_files_mode, use_gpu, thread_count)
def feed(self, arr, d=0.001):
    """Stack the columns in ``arr`` into sparse matrices and run one CV round.

    Rows of ``self.data`` are divided into three splits: label != -1 with
    ``val_tags == 0`` (training), label != -1 with ``val_tags == 1``
    (validation) and label == -1 (test).  Each requested column is handled
    according to ``config.type_dict``: 'onehot' columns are label-encoded in
    place and one-hot encoded, 'cv' columns are cast to ``str`` in place and
    passed through a ``CountVectorizer(min_df=20)``, and columns missing
    from ``config.type_dict`` are used as they are.  Score columns stored by
    earlier calls are prepended, then ``CV`` is run with ``n_splits=5`` and
    ``round_cv=1``.

    Side effects: appends to ``self.train_score_lst``,
    ``self.test_score_lst`` and ``self.train_val_score_lst``, sets
    ``self.train_pred`` and ``self.c``, and appends to ``self.c_lst``.

    Args:
        arr: Column names of ``self.data`` to use as features.
        d: Not used by this method.

    Returns:
        None.  Returns early, after printing 'name error', when a column's
        entry in ``config.type_dict`` is neither 'cv' nor 'onehot'.
    """
    available = set(self.data.columns.tolist())
    for name in arr:
        assert name in available

    label = self.label_name

    def _split_masks():
        # Row selectors for the training, validation and test splits.
        labelled = self.data[label] != -1
        return (
            np.logical_and(labelled, self.data['val_tags'] == 0),
            np.logical_and(labelled, self.data['val_tags'] == 1),
            self.data[label] == -1,
        )

    # Stacking: start from zero-column matrices with the right row counts.
    label_frame = self.data[[label]]
    stacks = [
        sparse.csr_matrix((len(label_frame.loc[rows]), 0))
        for rows in _split_masks()
    ]

    # Sort the requested columns by how they have to be encoded.
    _onehot_feature = []
    _cv_feature = []
    _row_feature = []
    for name in arr:
        if name not in config.type_dict:
            _row_feature.append(name)
            continue
        kind = config.type_dict[name]
        if kind == 'cv':
            _cv_feature.append(name)
        elif kind == 'onehot':
            _onehot_feature.append(name)
        else:
            print('name error')
            return

    for name in _onehot_feature:
        self.data[name] = LabelEncoder().fit_transform(
            self.data[name].astype(str))

    # The split frames are taken only after label encoding, so they carry
    # the encoded values.
    train_rows, val_rows, test_rows = _split_masks()
    _train = self.data.loc[train_rows]
    _train_val = self.data.loc[val_rows]
    _test = self.data.loc[test_rows]
    frames = (_train, _train_val, _test)

    enc = OneHotEncoder()
    for name in _onehot_feature:
        enc.fit(self.data[name].values.reshape(-1, 1))
        stacks = [
            sparse.hstack(
                (stack, enc.transform(frame[name].values.reshape(-1, 1))),
                'csr', 'bool')
            for stack, frame in zip(stacks, frames)
        ]

    vectorizer = CountVectorizer(min_df=20)
    for name in _cv_feature:
        self.data[name] = self.data[name].astype(str)
        vectorizer.fit(self.data[name])
        stacks = [
            sparse.hstack(
                (stack, vectorizer.transform(frame[name].astype(str))),
                'csr', 'bool')
            for stack, frame in zip(stacks, frames)
        ]

    # Untyped columns go in front of the encoded ones.
    stacks = [
        sparse.hstack(
            (sparse.csr_matrix(frame[_row_feature]), stack),
            'csr').astype('float32')
        for stack, frame in zip(stacks, frames)
    ]

    # Scores kept from earlier calls are prepended, one column per call.
    score_lists = (
        self.train_score_lst,
        self.train_val_score_lst,
        self.test_score_lst,
    )
    for ix in range(len(self.train_score_lst)):
        stacks = [
            sparse.hstack(
                (sparse.csr_matrix(np.array(scores[ix]).reshape(-1, 1)),
                 stack),
                'csr').astype('float32')
            for stack, scores in zip(stacks, score_lists)
        ]

    train_csr, train_val_csr, test_csr = stacks

    # CV: plugs straight into the previously written helper.
    lgb_params = {
        'boosting_type': 'gbdt',
        'num_leaves': 200,
        'reg_alpha': 1,
        'reg_lambda': 1,
        'n_estimators': 100000,
        'objective': 'binary',
        'subsample': 0.7,
        'colsample_bytree': 0.6,
        'learning_rate': 0.02,
        'min_child_weight': 1,
    }
    c = CV(
        _df=train_csr,
        y=_train[self.label_name].values,
        random_state=self.random_state,
        is_val=False,
    )
    c.CV(is_print=True, lgb_params=lgb_params, n_splits=5, round_cv=1)

    # Average the 'pred_train' entries over everything in c.MS_arr.
    self.train_pred = 0
    for fold in c.MS_arr:
        self.train_pred += np.array(fold['pred_train'])
    self.train_pred /= len(c.MS_arr)

    self.train_score_lst.append(self.train_pred)
    self.test_score_lst.append(c.get_result(test_csr))
    self.train_val_score_lst.append(c.get_result(train_val_csr))
    self.c = c
    self.c_lst.append(c)