get_ipython().run_line_magic('matplotlib', 'inline')

# Device model data: brand and model of each device [deviceid_brand.tsv]

# In[2]:

device_brand = pd.read_csv('new_feature/device_brand.csv')

# label = sex + age, one-hot encoded

# In[3]:

# Train data, test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)

# In[4]:

# Drop the raw sex/age columns from the training frame; test rows get the
# placeholder label 'unknow' before the two frames are stacked.
train_data.drop(['sex', 'age'], axis=1, inplace=True)
test_data['label'] = 'unknow'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat stacks the rows the same way (original index values are kept).
data = pd.concat([train_data, test_data])

# Merge device_brand

# In[5]:

# Left join keeps every train/test row even when no brand record exists.
data = data.merge(device_brand, on='device_id', how='left')
# In[3]:

# Pre-computed feature tables
applist = pd.read_csv('features/applist_tfidf.csv')
labelcnt = pd.read_csv('features/label_tfidf.csv')
brand = pd.read_csv('features/brand100.csv')
h1 = pd.read_csv('features/h1.csv')
h2 = pd.read_csv('features/h2_tfidf300.csv')
h3 = pd.read_csv('features/h3.csv')

# In[4]:

# Train / test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)

# # Merge data

# In[4]:

# Start from applist and left-join each remaining table on device_id, in order.
data = (
    applist
    .merge(labelcnt, on='device_id', how='left')
    .merge(brand, on='device_id', how='left')
    .merge(h1, on='device_id', how='left')
    .merge(h2, on='device_id', how='left')
    .merge(h3, on='device_id', how='left')
)

# ---------------------------------------------

# # Feature select

# In[5]:
# Replace each categorical column with its integer category codes.
for _cat_col in ('btype', 'mfua', 'label_1', 'label_2'):
    features[_cat_col] = pd.Categorical(features[_cat_col]).codes

features.to_csv('features/h1.csv', index=False)

# ## Load train_data, test_data

# In[17]:

# Train / test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)

# ### First group of features

# In[18]:

# Attach the encoded features to both splits; the left join keeps every device row.
h1_train = train_data.merge(features, on='device_id', how='left')
h1_test = test_data.merge(features, on='device_id', how='left')

# ## Xgboost

# In[19]:

import gc
import numpy as np
import xgboost as xgb
# In[2]:

# Behavior data: the file is tab-separated and column names are supplied here.
behavior = pd.read_csv(
    '../Demo/deviceid_package_start_close.tsv',
    sep='\t',
    names=['device_id', 'app_id', 'start', 'close'],
)

# In[3]:

# Train / test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)

# ## start , close

# In[4]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vectorizer = CountVectorizer()

# In[5]:
# In[10]:

# Combine the label1 and label2 tables (count and tf-idf variants) per device.
label_cnt = label1_cnt.merge(label2_cnt, on='device_id', how='left')
label_tfidf = label1_tfidf.merge(label2_tfidf, on='device_id', how='left')

# In[11]:

# Load train / test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)

# ------------------------

# ## Merge data

# In[12]:

# Left-join the tf-idf label features onto both splits.
train_data, test_data = [
    split.merge(label_tfidf, on='device_id', how='left')
    for split in (train_data, test_data)
]

# ------------------------------------

# # Train code
# Persist both applist feature tables.
app_svd.to_csv('features/applist_cnt.csv', index=False)
app_tfidf.to_csv('features/applist_tfidf.csv', index=False)

# In[5]:

device_applist = device_applist.merge(app_svd, on='device_id', how='left')

# ---------------

# ## Train/test

# In[6]:

train_path = '../Demo/deviceid_train.tsv'
test_path = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_path, test_path)

# ### Merge(applist)

# In[7]:

# Left-join the applist features onto both splits.
train_data, test_data = [
    split.merge(device_applist, on='device_id', how='left')
    for split in (train_data, test_data)
]

# # MLPC

# In[8]:

from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
# In[ ]:

# Pre-computed count-based feature tables
applist = pd.read_csv('features/applist_cnt.csv')
labelcnt = pd.read_csv('features/label_cnt.csv')
brand = pd.read_csv('features/brand100.csv')
h1 = pd.read_csv('features/h1.csv')
h2 = pd.read_csv('features/h2_cnt300.csv')

# In[ ]:

# Train / test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)

# Merge data

# In[ ]:

# Left-join every feature table onto the training split, keyed by device_id.
train_data = (
    train_data
    .merge(applist, on='device_id', how='left')
    .merge(labelcnt, on='device_id', how='left')
    .merge(brand, on='device_id', how='left')
    .merge(h1, on='device_id', how='left')
    .merge(h2, on='device_id', how='left')
)

# Same joins for the test split.
# NOTE(review): h2 is not merged into test_data within this span — confirm
# the merge happens in the code that follows.
test_data = (
    test_data
    .merge(applist, on='device_id', how='left')
    .merge(labelcnt, on='device_id', how='left')
    .merge(brand, on='device_id', how='left')
    .merge(h1, on='device_id', how='left')
)