# ### 不同尺度的降维 # In[8]: svd100 = TruncatedSVD(n_components=100, n_iter=15, random_state=666) brand_100 = pd.DataFrame(svd100.fit_transform(data.iloc[:, 1:])) brand_100['device_id'] = data.device_id.values # In[9]: svd550 = TruncatedSVD(n_components=550, n_iter=15, random_state=666) brand_550 = pd.DataFrame(svd550.fit_transform(data.iloc[:, 1:])) brand_550['device_id'] = data.device_id.values train = train_data.merge(brand_550, on='device_id', how='left') test = test_data.merge(brand_550, on='device_id', how='left') # In[14]: def train_code(train_data, test_data, label, num_class, n_folds=5): labels = train_data[[label]] train_data = train_data.drop(['device_id', 'label'], axis=1) test_data = test_data.drop(['device_id', 'label'], axis=1) train_predvec = np.zeros((train_data.shape[0], num_class)) test_predvec = np.zeros((test_data.shape[0], num_class)) SKF = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2018) for train_indices, valid_indices in SKF.split(train_data, labels): # Training data for the fold x_train = train_data.loc[train_indices, :]
# Persist the first feature group so later notebooks can reload it.
features.to_csv('features/h1.csv', index=False)


# ## Load train_data, test_data

# In[17]:

# train test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)


# ### First group of features

# In[18]:

# Left joins keep every row of train_data / test_data.
h1_train = train_data.merge(features, on='device_id', how='left')
h1_test = test_data.merge(features, on='device_id', how='left')


# ## Xgboost

# In[19]:

import gc
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

# `get_ipython` only exists inside an IPython/Jupyter session. Guard the magic
# so this exported script also runs under a plain Python interpreter instead
# of stopping with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython: there is no inline backend to enable.
    pass
ch_vector['device_id'] = groupfeature.device_id.values # In[10]: # s_hour + c_hour sc_vector = sh_vector.merge(ch_vector, on='device_id', how='left') sc_vector.to_csv('features/h3.csv',index=False) # In[11]: train_set = train_data.merge(sc_vector, on='device_id', how='left') test_set = test_data.merge(sc_vector, on='device_id', how='left') # train code # In[12]: def xgbc_code(train_data, test_data,label, num_class, n_folds=5, obj='multi:softprob', metric='mlogloss'): labels = train_data[[label]] train_data = train_data.drop(['device_id','sex','age','label'],axis=1) test_data = test_data.drop(['device_id'],axis=1) train_predvec = np.zeros((train_data.shape[0], num_class)) test_predvec = np.zeros((test_data.shape[0], num_class))
# In[11]:

# Load the train and test data.
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)


# ------------------------
# ## Merge data

# In[12]:

# Left-join the label_tfidf columns onto both splits by device_id
# (train first, then test).
train_data, test_data = (
    split.merge(label_tfidf, on='device_id', how='left')
    for split in (train_data, test_data)
)


# ------------------------------------
# # Train code

# In[17]:

import gc
import warnings

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.neural_network import MLPClassifier
# ---------------
# ## Train/test

# In[6]:

# Load the train and test data.
train_path = '../Demo/deviceid_train.tsv'
test_path = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_path, test_path)


# ### Merge(applist)

# In[7]:

# Left-join the device_applist columns onto both splits by device_id
# (train first, then test).
train_data, test_data = (
    split.merge(device_applist, on='device_id', how='left')
    for split in (train_data, test_data)
)


# # MLPC

# In[8]:

import gc
import warnings

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier

warnings.filterwarnings('ignore')


# In[9]:
# Reload the saved feature tables.
h1 = pd.read_csv('features/h1.csv')
h2 = pd.read_csv('features/h2_cnt300.csv')


# In[ ]:

# train data, test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)


# Merge data

# In[ ]:

# Left-join every feature table onto each split by device_id. The merge order
# (applist, labelcnt, brand, h1, h2) is the same for both splits.
train_data = (
    train_data
    .merge(applist, on='device_id', how='left')
    .merge(labelcnt, on='device_id', how='left')
    .merge(brand, on='device_id', how='left')
    .merge(h1, on='device_id', how='left')
    .merge(h2, on='device_id', how='left')
)
test_data = (
    test_data
    .merge(applist, on='device_id', how='left')
    .merge(labelcnt, on='device_id', how='left')
    .merge(brand, on='device_id', how='left')
    .merge(h1, on='device_id', how='left')
    .merge(h2, on='device_id', how='left')
)


# Feature select

# In[ ]: