Exemplo n.º 1
0
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 09:52:48 2016

@author: onodera
"""

import homedepot as hd  #comment out syn_tbl
import pandas as pd
import numpy as np

# Load the training rows and the product table through the project helper.
train, prd = hd.load_all(onlytrain=True, mk_csv=True)

# Query-side item vocabulary: column '0' of the CSV as a plain Python list.
items = pd.read_csv(hd.path_sub + '06sort_items_query.csv',
                    index_col=0)['0'].tolist()

# Two candidate values per query string (train.s_): one produced by
# hd.get_sitem, one produced by hd.get_item_list_in_list against `items`.
train['s_item_myrule'] = train.s_.map(lambda x: hd.get_sitem(x))
train['s_item_list'] = train.s_.map(
    lambda x: hd.get_item_list_in_list(items, x))

# Use the list-based candidate unless it is the empty string, in which case
# fall back to the hd.get_sitem candidate.  Series.where is label-aligned, so
# this works for any index and does not depend on writing through `.values`
# (which uses positions, and is not guaranteed to be a writable view of the
# column).
train['s_item'] = train['s_item_list'].where(
    train['s_item_list'] != '', train['s_item_myrule'])

# Title-side item vocabulary: column '0' of the CSV as a plain Python list.
# This rebinds `items`, which held the query-side vocabulary before.
items = pd.read_csv(
    hd.path_sub + '06sort_items_title.csv', index_col=0
)['0'].tolist()

# First candidate: the result of hd.hdp.pred_item_in_title over the product
# table, then each value passed through hd.str_stem.
prd['t_item_myrule'] = hd.hdp.pred_item_in_title(prd)
prd['t_item_myrule'] = prd['t_item_myrule'].map(hd.str_stem)

# Second candidate: hd.get_item_list_in_list applied to each prd.t_ value
# against the title vocabulary.
prd['t_item_list'] = prd['t_'].map(
    lambda title: hd.get_item_list_in_list(items, title))

# Column that will hold the chosen candidate; starts out as empty strings.
prd['t_item'] = ''
for i in prd.index:
Exemplo n.º 2
0
Created on Mon Feb 29 13:13:41 2016

@author: onodera
"""

import pandas as pd
import numpy as np
import time
import homedepot as hd
reload(hd)

# Seed NumPy's global RNG from the wall clock, so each run uses a new seed.
# To replay an earlier run, assign a fixed value instead
# (e.g. seed = 1457665531).
seed = int(time.time())
np.random.seed(seed)

tt, prd = hd.load_all(mod=4)

# The script only continues when the loaded table has exactly 240760 rows;
# any other size stops here.
if len(tt) != 240760:
    raise Exception("SUCCESS!!!")

# Ids of the rows whose r is NaN and whose s_len is above 4, as a plain list.
# Both conditions go into one mask over the full table: filtering in two
# chained steps applies a full-length mask to an already-filtered frame,
# which forces pandas to reindex the mask and emit a warning.
test_id = tt.loc[np.isnan(tt.r) & (tt.s_len > 4), 'id'].tolist()

#==============================================================================
# train test
#==============================================================================
# Values of r for the rows where it is present (not NaN), taken before the
# merge below.
has_r = ~np.isnan(tt.r)
labels = np.array(tt[has_r].r)

# Attach the product columns to every row, matching on pid.
tt = pd.merge(tt, prd, on='pid', how='left')

# Keep every column that is not object-typed, leaving out the id and pid keys.
col = []
for name in tt.columns:
    if tt[name].dtype == 'O' or name in ['id', 'pid']:
        continue
    col.append(name)
tt = tt[col]

# tt keeps its r column; the column list itself no longer names it.
col.remove('r')
Exemplo n.º 3
0
import time
import homedepot as hd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
reload(hd)

# Clock-based seed for NumPy's global RNG; a fixed value such as 1457665531
# can be assigned instead to repeat an earlier run.
seed = int(time.time())
np.random.seed(seed)

# Load the submission frame via the project helper and remove its
# 'relevance' column in place.
sub = hd.load_sub()
del sub['relevance']

# Repeat the load/prepare steps once per value in `words`; each value is
# passed to hd.load_all as its `word` argument.
words = [1, 2, 3, 4]
for w in words:
    tt, prd = hd.load_all(word=w)
    # Ids of the rows whose r is NaN, kept as a one-column DataFrame.
    test_id = tt[np.isnan(tt.r)]['id']
    test_id = test_id.to_frame()
    #raise Exception("SUCCESS!!!")

    #==============================================================================
    # train test
    #==============================================================================
    # Values of r for the rows where it is not NaN, taken before the merge.
    labels = np.array(tt[~np.isnan(tt.r)].r)
    # Attach the product columns to every row, matching on pid.
    tt = pd.merge(tt, prd, on='pid', how='left')
    # Keep the non-object columns, leaving out id and pid.
    col = [
        x for x in tt.columns if tt[x].dtype != 'O' and x not in ['id', 'pid']
    ]
    tt = tt[col]
    # tt keeps its r column; the column list no longer names it.
    col.remove('r')
    # NOTE(review): the bare string below is an expression statement with no
    # effect; it presumably labels a section that follows -- confirm.
    'train'