示例#1
0
get_ipython().run_line_magic('matplotlib', 'inline')

# Device model data: brand and model of each device [deviceid_brand.tsv]
# NOTE(review): the comment names deviceid_brand.tsv but the code reads a
# CSV under new_feature/ -- presumably a preprocessed copy; confirm.

# In[2]:

device_brand = pd.read_csv('new_feature/device_brand.csv')

# label = sex + age, one-hot encode

# In[3]:

# Train data, test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)

# In[4]:

# Remove the 'sex' and 'age' columns from the training frame in place.
train_data.drop(['sex', 'age'], axis=1, inplace=True)

# Fill the test frame's 'label' column with a placeholder value.
test_data['label'] = 'unknow'

# Stack train rows on top of test rows. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent and,
# like append, keeps each frame's original index and does not sort columns.
data = pd.concat([train_data, test_data])

# Merge device_brand

# In[5]:

# Left join on device_id: every train/test row is kept even when
# device_brand has no matching record for it.
data = data.merge(device_brand, on='device_id', how='left')
示例#2
0
# In[3]:

# Feature tables read from the features/ directory, one frame per CSV.
applist = pd.read_csv('features/applist_tfidf.csv')
labelcnt = pd.read_csv('features/label_tfidf.csv')
brand = pd.read_csv('features/brand100.csv')
h1 = pd.read_csv('features/h1.csv')
h2 = pd.read_csv('features/h2_tfidf300.csv')
h3 = pd.read_csv('features/h3.csv')

# In[4]:

# Train / test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)

# # Merge data

# In[4]:

# Start from applist and left-join each remaining table on device_id,
# in the order labelcnt -> brand -> h1 -> h2 -> h3.
data = (
    applist
    .merge(labelcnt, on='device_id', how='left')
    .merge(brand, on='device_id', how='left')
    .merge(h1, on='device_id', how='left')
    .merge(h2, on='device_id', how='left')
    .merge(h3, on='device_id', how='left')
)

# ---------------------------------------------
# # Feature select

# In[5]:
示例#3
0
# Replace each of these columns with the integer codes of its categories.
features['btype'] = pd.Categorical(features['btype']).codes

features['mfua'] = pd.Categorical(features['mfua']).codes
features['label_1'] = pd.Categorical(features['label_1']).codes
features['label_2'] = pd.Categorical(features['label_2']).codes

# Save the encoded table without writing the row index.
features.to_csv('features/h1.csv', index=False)

# ## Load train_data, test_data

# In[17]:

# Train / test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)

# ### First group of features

# In[18]:

# Left-join the encoded features onto each split by device_id.
h1_train = train_data.merge(features, how='left', on='device_id')
h1_test = test_data.merge(features, how='left', on='device_id')

# ## Xgboost

# In[19]:

import gc
import numpy as np
import xgboost as xgb
示例#4
0
# In[2]:


# Behavior data: tab-separated file read with explicit column names.
behavior = pd.read_csv(
    '../Demo/deviceid_package_start_close.tsv',
    sep='\t',
    names=['device_id', 'app_id', 'start', 'close'],
)


# In[3]:


# Train / test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)


# ## start , close

# In[4]:


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Vectorizer created with default settings.
vectorizer = CountVectorizer()


# In[5]:

示例#5
0
# In[10]:


# Pair up the label1/label2 tables by device_id (left join), once for the
# count features and once for the tf-idf features.
label_cnt = label1_cnt.merge(
    label2_cnt,
    how='left',
    on='device_id',
)

label_tfidf = label1_tfidf.merge(
    label2_tfidf,
    how='left',
    on='device_id',
)


# In[11]:


# Load train / test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)


# ------------------------
# ## Merge data

# In[12]:


# Attach the tf-idf label features to both splits; rows without a match
# in label_tfidf are kept (left join).
train_data = train_data.merge(label_tfidf, how='left', on='device_id')

test_data = test_data.merge(label_tfidf, how='left', on='device_id')


# ------------------------------------
# # Train code
示例#6
0
# Write both applist feature tables to disk without the row index.
app_svd.to_csv('features/applist_cnt.csv', index=False)
app_tfidf.to_csv('features/applist_tfidf.csv', index=False)

# In[5]:

# Left-join the app_svd columns onto device_applist by device_id.
device_applist = device_applist.merge(
    app_svd,
    how='left',
    on='device_id',
)

# ---------------

# ## Train/test

# In[6]:

train_path = '../Demo/deviceid_train.tsv'
test_path = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_path, test_path)

# ### Merge(applist)

# In[7]:

# Give both splits the same applist columns via a left join on device_id.
train_data = train_data.merge(device_applist, how='left', on='device_id')
test_data = test_data.merge(device_applist, how='left', on='device_id')

# # MLPC

# In[8]:

from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
示例#7
0
文件: lgbcnt.py 项目: AlexanLee/ta
# In[ ]:

# Feature tables read from the features/ directory, one frame per CSV.
applist = pd.read_csv('features/applist_cnt.csv')
labelcnt = pd.read_csv('features/label_cnt.csv')
brand = pd.read_csv('features/brand100.csv')
h1 = pd.read_csv('features/h1.csv')
h2 = pd.read_csv('features/h2_cnt300.csv')

# In[ ]:

# Train / test data
train_datapath = '../Demo/deviceid_train.tsv'
test_datapath = '../Demo/deviceid_test.tsv'
train_data, test_data = LoadData(train_datapath, test_datapath)

# Merge data

# In[ ]:

# Left-join each feature table onto the training rows by device_id,
# in the order applist -> labelcnt -> brand -> h1 -> h2.
train_data = (
    train_data
    .merge(applist, on='device_id', how='left')
    .merge(labelcnt, on='device_id', how='left')
    .merge(brand, on='device_id', how='left')
    .merge(h1, on='device_id', how='left')
    .merge(h2, on='device_id', how='left')
)

# Same joins for the test rows, through h1.
test_data = (
    test_data
    .merge(applist, on='device_id', how='left')
    .merge(labelcnt, on='device_id', how='left')
    .merge(brand, on='device_id', how='left')
    .merge(h1, on='device_id', how='left')
)