예제 #1
0
# Build a combined key: the s_item and t_item columns joined by one space.
df['st'] = df.s_item + ' ' + df.t_item

# Count how often each distinct combined key occurs; the result is indexed by
# the key string.
freq = df.st.value_counts()
# Rebind df to a one-column frame holding those counts; the original frame is
# no longer reachable through this name.
df = freq.to_frame()


def ret0(s):
    """Return the first whitespace-separated token of ``s``.

    Raises IndexError when ``s`` holds no tokens at all.
    """
    tokens = s.split()
    return tokens[0]


def ret1(s):
    """Return the second whitespace-separated token of ``s``.

    Raises IndexError when ``s`` holds fewer than two tokens.
    """
    tokens = s.split()
    return tokens[1]


# Split every index label back into its first and second token.
# NOTE(review): assumes each label has at least two whitespace-separated
# tokens; ret1 raises IndexError otherwise.
df['s'] = df.index.map(ret0)
df['t'] = df.index.map(ret1)

# Keep rows whose `st` value is above 2, whose two tokens differ, and whose
# first token is longer than two characters.  The conditions are combined into
# a single mask rather than chained as df[...][...][...], which would index
# each already-filtered frame with a mask built from the unfiltered one.
keep = (df.st > 2) & (df.s != df.t) & (df.s.map(len) > 2)
df = df[keep]

hd.out(df[['s', 't']], '07syn')

# A bare `raise` outside an except block has nothing to re-raise, so it fails
# with an error of its own; in a top-to-bottom run the script stops here.
raise

# Print the s_ value and index label of every train row whose `s` contains
# the substring/element 'screww'.
for i in train.index:
    if 'screww' in train.s[i]:
        print train.s_[i], i

# For labels 0..29, print s_item1 next to hd.get_syn() of that same value.
for i in range(30):
    print train.s_item1[i], hd.get_syn(train.s_item1[i])
예제 #2
0
                      nround,
                      watchlist,
                      early_stopping_rounds=50,
                      verbose_eval=True)
    if i == 0:
        pred = model.predict(xgb.DMatrix(test))
    else:
        pred += model.predict(xgb.DMatrix(test))

# Deliberate stop: in a top-to-bottom run nothing below this line executes.
raise Exception("SUCCESS!!!")

# Divide the summed predictions by cnt and clamp the result to [1, 3].
sub = hd.load_sub()
sub.relevance = pred / cnt
# .loc replaces the .ix indexer (deprecated, then removed in pandas 1.0); with
# a boolean row mask and a column label both address the same cells.
sub.loc[sub.relevance > 3, 'relevance'] = 3
sub.loc[sub.relevance < 1, 'relevance'] = 1
hd.out(sub, 'xgb10_327_2')
#==============================================================================
#
#==============================================================================
from sklearn.ensemble import RandomForestRegressor
import stacking as st
from sklearn.metrics import mean_squared_error
# Random forest regressor whose tree count and depth limit are drawn at random
# on every run.  np.random.randint excludes its upper bound, so this gives
# 5000-5299 trees and a maximum depth of 10 or 11.
clf = RandomForestRegressor(
    n_estimators=np.random.randint(5000, 5300),
    # NOTE(review): newer scikit-learn releases name this criterion
    # "squared_error" - confirm against the installed version.
    criterion="mse",
    max_features=0.8,
    max_depth=np.random.randint(10, 12),
    n_jobs=-1,
)

# Hand the model to the project's stacking helper; what st.predict returns is
# not visible from here.
pred = st.predict(train, labels, clf)
예제 #3
0
                p1, p2 = np.random.randint(GLENGTH, size=2)
                self.gtype[p1], self.gtype[p2] = self.gtype[p2], self.gtype[p1]
        self.f = 0.0


#==============================================================================
#
#==============================================================================
# initialize
pop = Pop()

# One pass per generation.  This loop sits at module level, outside the
# __main__ guard below, so it also runs when the module is imported.
for i in range(GENERATION):
    # The five population steps are called in this fixed order each pass.
    pop.kill_genes()
    pop.calc_f()
    pop.print_f()
    pop.copy_top_gene()
    pop.generate_population()

    # Wall-clock minutes since `start` (set outside this view).
    elapsed_time = time.time() - start
    print 'elapsed_time:', (elapsed_time / 60), "min"

    # Snapshot the current first gene: store its gtype as the 'order' column,
    # sort items by it, hand the frame to hd.out, then restore index order so
    # the next positional assignment to 'order' lines up again.
    items['order'] = pop.genes[0].gtype
    items.sort_values(by='order', inplace=True)
    hd.out(items, 'items_opted_' + str(ospid), True)
    items.sort_index(inplace=True)

# main
# Only when run as a script: print the f value of the first gene.
if __name__ == "__main__":

    print pop.genes[0].f
예제 #4
0
    weight.value = weight.value.map(lambda x: rep_alpha(x))
    weight = weight[weight.value.str.replace(".", "").str.isdigit() == True]
    del weight['name']

    merged = pd.concat([depth, height, width])
    del merged['name']
    merged.sort_values(by='pid', inplace=True)
    merged.reset_index(drop=True, inplace=True)

    li = []
    for pid in merged.pid.unique().tolist():
        li.append(
            [int(pid), ' '.join(merged[merged.pid == pid].value.tolist())])
    size = pd.DataFrame(li, columns=['pid', 'att_size'])
    prd = pd.merge(prd, size, on='pid', how='left')

    li = []
    for pid in weight.pid.unique().tolist():
        li.append(
            [int(pid), ' '.join(weight[weight.pid == pid].value.tolist())])
    weight = pd.DataFrame(li, columns=['pid', 'att_weight'])
    prd = pd.merge(prd, weight, on='pid', how='left')

    return prd


# get_size left-merges the att_size and att_weight columns onto prd, so rows
# without a match come back as NaN.
prd = get_size(prd)
# Replace every NaN in the frame with an empty string.
prd.fillna('', inplace=True)

# Pass only the id and the two attribute columns to hd.out.
hd.out(prd[['pid', 'att_size', 'att_weight']], '03att_size_weight')
예제 #5
0
        cross = pd.crosstab(df.s, df.t)
        if w == words[0]:
            cross_tbl = cross
        else:
            cross_tbl = pd.concat([cross_tbl, cross])

# Cells left missing by the concatenation become 0.
cross_tbl.fillna(0, inplace=True)
# Divide every row by its own row sum (row-wise normalisation).
cross_tbl = cross_tbl.div(cross_tbl.sum(1), axis=0)

# One value per row from get_entropy (defined outside this view), stored in a
# new 'w' column.
cross_tbl['w'] = get_entropy(cross_tbl)

# Reduce to a two-column frame: the row label as 's' and its 'w' value.
subset = cross_tbl['w']
subset = subset.to_frame()
subset['s'] = subset.index

hd.out(subset[['s', 'w']], '06entropy_weight1')
#cross_tbl['sum'] = cross_tbl.sum(axis=1)

# Deliberate stop: in a top-to-bottom run nothing below this line executes.
raise Exception("!!!!!!!!")

# Join tt with prd on pid (default inner join) and keep rows with r below 1.5.
merged = tt.merge(prd, on='pid')
merged = merged[merged.r < 1.5]

# Index labels of the frequency table that hd.mk_freq_table builds from s_.
words = hd.mk_freq_table(merged.s_).index.tolist()

for w in words:
    tmp = merged[merged.t_.map(lambda x: is_contain_w(x, w))].t_.tolist()
    if len(tmp) > 0:
        tmp = flaten_list(tmp, w)
        df = pd.DataFrame(tmp, columns=['s', 't'])
        cross = pd.crosstab(df.s, df.t)
예제 #6
0
    ============================"""
print '========== PRODUCT =========='
# Order products by pid and renumber the index from 0.
prd = prd.sort_values(by='pid')
# The bare string below is an expression statement with no effect; it only
# serves as a section label.
"TITLE"
prd.reset_index(drop=True,inplace=True)
# tlen: character length of the raw title t.
prd['tlen'] = prd.t.str.len()
# t_: title passed through hd.str_stem and split on whitespace into tokens.
prd['t_'] = prd.t.map(lambda x:hd.str_stem(x)).str.split()
# t_len: number of tokens in t_.
prd.loc[:,'t_len'] = prd.t_.str.len()

# Attach each product's token list to the tt rows; tt rows without a matching
# pid are kept (left join).
merged = pd.merge(tt,prd[['pid','t_']],on='pid',how='left')

# hd.rep_comb is a project helper; its effect is not visible here.
tt = hd.rep_comb(merged)

# Rebuild the s string from the s_ token list.
tt.s = tt.s_.map(lambda x:' '.join(x))

# comb is 0 where s_len still equals the current token count of s_, else 1.
tt['comb'] = 1 - (tt['s_len'] == tt.s_.str.len())*1

# Output name carries the `mod` suffix (set outside this view).
hd.out(tt[['id','s','typo','comb']],'04query_comb'+str(mod))

# Deliberate stop: in a top-to-bottom run nothing below this line executes.
raise Exception('SUCCESS')
#==============================================================================
# CONCAT
#==============================================================================
# Stitch the four partition files '04query_comb0.csv' .. '04query_comb3.csv'
# into one table.  The parts are read first, concatenated once and sorted
# once, instead of re-concatenating and re-sorting the growing frame on every
# iteration.  Rows with equal ids have no guaranteed relative order under the
# default sort either way.
parts = []
for i in range(4):
    parts.append(pd.read_csv(path_sub + '04query_comb' + str(i) + '.csv'))
tt = pd.concat(parts)
tt.sort_values(by='id', inplace=True)

hd.out(tt[['id','s','typo','comb']],'04query_comb')
예제 #7
0
        xgval = xgb.DMatrix(x_valid, label=y_valid)

        #train using early stopping and predict
        watchlist = [(xgtrain, 'train'), (xgval, 'val')]
        model = xgb.train(param,
                          xgtrain,
                          nround,
                          watchlist,
                          early_stopping_rounds=50,
                          verbose_eval=True)
        if i == 0:
            pred = model.predict(xgb.DMatrix(test))
        else:
            pred += model.predict(xgb.DMatrix(test))

    test_id['relevance'] = pred / cnt

    if w == words[0]:
        stack = test_id
    else:
        stack = pd.concat([stack, test_id])

# Attach the stacked per-word predictions to the submission frame; ids with no
# prediction get NaN from the left join and are dropped by the > 0 filter.
sub = pd.merge(sub, stack, on='id', how='left')
sub = sub[sub.relevance > 0]

# Clamp to [1, 3].  .loc replaces the .ix indexer (deprecated, then removed in
# pandas 1.0); with a boolean row mask and a column label both address the
# same cells.
sub.loc[sub.relevance > 3, 'relevance'] = 3
sub.loc[sub.relevance < 1, 'relevance'] = 1
hd.out(sub, 'xgb10_words1-4_328_1')

# Deliberate stop once the file has been handed to hd.out.
raise Exception("SUCCESS!!!")
예제 #8
0
#==============================================================================

# One result from get_dict(base) per query term in qq.
li = []

for q in qq:
    # Recompute the 'item' column for this single term q, then the 'match'
    # flag from it; both helpers are defined outside this view.
    base['item'] = base.s_.map(lambda x: get_item_list_in_list([q], x))

    base['match'] = set_flg(base)
    print q
    li.append(get_dict(base))

# One row per query term, one 'col_<value>' column per entry of rr.
df = pd.DataFrame(li)
col = ['col_' + str(x) for x in rr]
df.columns = col
df.index = qq
hd.out(df, '05item_tbl_query', True)
# Read the table back from disk (presumably the file hd.out just wrote -
# TODO confirm), using the first column as the index.
df = pd.read_csv(hd.path_sub + '05item_tbl_query.csv', index_col=0)

# Row-wise sum and mean over the col_* columns.
df['sum'] = df[col].sum(1)
df['ave'] = df[col].mean(1)

df.fillna(0, inplace=True)
# All four conditions in one mask.  The original chained four [...] filters,
# each applying a mask built on the full frame to an already-filtered frame.
selected = ((df['col_2.33'] > 0.4) & (df['col_2.67'] > 0.5) &
            (df['col_3.0'] > 0.5) & (df['sum'] != 1))
df = df[selected]

hd.out(df, '05item_tbl_query_selected', True)
#==============================================================================
# EXTRACT ITEMS 2
#==============================================================================

li = []
예제 #9
0
# Keep only the attribute rows named `name` and drop the now-constant column.
att = att[att.name == name]
del att['name']
# Flat list of labels.  The original nested list [['pid', name]] yields a
# MultiIndex on current pandas, after which att.pid no longer resolves.
att.columns = ['pid', name]
att = att[att.pid.isin(pid)]
att[name] = att[name].map(hd.str_stem)

# Left join keeps every product; products without the attribute get NaN.
prd = pd.merge(prd, att, on='pid', how='left')

prd = prd.drop_duplicates(subset='pid')
prd.reset_index(drop=True, inplace=True)
prd.rename(columns={'MFG Brand Name': 'p_brand'}, inplace=True)
# 'null' is the placeholder for a missing brand.
prd['p_brand'] = prd.p_brand.fillna('null')
# Distinct lower-cased brands without the placeholder.  Filtering instead of
# list.remove('null') avoids a ValueError when no brand is missing.
brand = [b for b in prd.p_brand.str.lower().unique().tolist() if b != 'null']


def revise_brand(prd, brand):
    """Fill in missing brands from the leading tokens of each product title.

    For every row whose ``p_brand`` is the placeholder ``'null'``, the first
    one, two and three tokens of ``t_`` are joined with single spaces and
    looked up in ``brand``; the shortest prefix that matches is written to
    ``p_brand``. Rows with no match keep ``'null'``.

    Args:
        prd: DataFrame with a ``p_brand`` column and a ``t_`` column holding
            a token list per row. It is modified in place.
        brand: Iterable of known brand strings.

    Returns:
        The same ``prd`` object that was passed in.
    """
    known = set(brand)  # built once; avoids rescanning a list per lookup
    for i in prd.index:
        if prd.at[i, 'p_brand'] != 'null':
            continue
        tokens = prd.at[i, 't_']
        # Start at 1: a zero-length prefix joins to '' and cannot name a brand.
        for j in range(1, 4):
            candidate = ' '.join(tokens[:j])
            if candidate in known:
                # .at writes into prd itself; `prd.p_brand[i] = ...` is chained
                # assignment and may update a temporary copy instead.
                prd.at[i, 'p_brand'] = candidate
                break
    return prd


# Fill placeholder brands from title prefixes, then output pid and brand only.
prd = revise_brand(prd, brand)

hd.out(prd[['pid', 'p_brand']], '01att_brand')
        return True
    return False


def is_contain_words(li, words):
    """Return True when every element of ``words`` is found in ``li``.

    An empty ``words`` gives True.
    """
    for w in words:
        if w not in li:
            return False
    return True


# Run every s_ token list through del_num (defined outside this view).
tt.s_ = tt.s_.map(lambda x: del_num(x))

# Distinct queries as space-joined strings ...
query = tt.s_.map(lambda x: ' '.join(x)).unique().tolist()

# ... split back into token lists.
query = map(lambda x: x.split(), query)

# For each query: [query string, number of prd rows whose t_ contains every
# one of the query's tokens].
li = []
for qq in query:
    li.append([
        ' '.join(qq),
        len(prd[prd.t_.map(lambda x: is_contain_words(x, qq))])
    ])

df = pd.DataFrame(li, columns=['s', 'w'])

hd.out(df, '05tfidf_weight_allquery')
예제 #11
0
            items.remove(i)
            items.insert(j, i)
            base['item'] = base.t_.map(
                lambda x: get_item_list_in_list(items, x))
            base['match'] = set_flg(base)
            score = fitness_function(base)
            print 'item:', i, '   score_best:', score_best, '   score:', score
            if score_best < score:
                score_best = score
            elif score_best == score:
                ng.append(i)
            else:
                items.remove(i)
                items.insert(ind_bk, i)
    df = pd.DataFrame(items)
    hd.out(df, '06sort_items_title', True)
    df = pd.DataFrame(ng)
    hd.out(df, '06sort_items_title_ng', True)

raise
for step in steps:
    for i in items[step:]:
        if i not in ng:
            sw = True
            ins_pos = items.index(i)
            while sw == True:
                ins_pos -= step
                if ins_pos < 0:
                    sw = False
                    break
                ind_bk = items.index(i)
        mate += prd['att_material'][i]
        
    li_color.append(' '.join(color))
    li_power.append(' '.join(power))
    li_mate.append(' '.join(mate))
    
# Store the three per-row joined strings as new columns; this overwrites any
# existing att_material column.
prd['att_color'] = li_color
prd['att_power'] = li_power
prd['att_material'] = li_mate
# Drop the nine listed raw attribute columns.
prd.drop(['Color','Color Family','Color/Finish','Finish','Finish Family',
          'Mount Type','Fuel Type','Power Type','Bulb Type'],axis=1, 
          inplace=True)

# Pass the material text through hd.str_stem.
prd['att_material'] = prd.att_material.map(lambda x:hd.str_stem(x))

hd.out(prd[['pid','att_color','att_power','att_material']],'02att_color_power_material')














예제 #13
0
words = [x for x in hd.mk_freq_table(merged.s_).index.tolist() if len(x) > 2]

li = []
for w in words:
    #    other = list(words)
    #    other.remove(w)

    contain_s = merged[merged.t_.map(lambda x:is_contain_w(x,w))]\
                      [merged.s_.map(lambda x:is_contain_w(x,w))]
    print w, ':', 'MEAN', round(contain_s.r.mean(),
                                3), 'STD', round(contain_s.r.std(), 3)
    li.append([w, round(contain_s.r.mean(), 3)])
df = pd.DataFrame(li, columns=['s', 'imp'])

df.fillna(0, inplace=True)
hd.out(df, '04item_imp')

#==============================================================================
#
#==============================================================================


def flaten_list(lili):
    """Flatten a list of token lists into one list.

    Only purely alphabetic tokens longer than two characters are kept, in
    their original order.
    """
    return [tok for group in lili for tok in group
            if tok.isalpha() and len(tok) > 2]

예제 #14
0
# -*- coding: utf-8 -*-
"""
Created on Mon Mar  7 22:23:34 2016

@author: Kazuki
"""

import homedepot as hd  #comment out syn_tbl
import numpy as np
import pandas as pd
import hdpath as hdp

# Load both tables through the project loader and hand them to hd.out under
# the names 'tt' and 'prd'.
tt, prd = hd.load_all(onlytrain=True, mk_csv=True)

hd.out(tt, 'tt')
hd.out(prd, 'prd')

# Reload tt and prd from CSV files under the `ps` path returned by hdp.load()
# (presumably the files hd.out just wrote - TODO confirm).
po, ps = hdp.load()
tt = pd.read_csv(ps + 'tt.csv')
# s_: sfix passed through hd.str_stem, split on whitespace, then through
# hd.fix_typo.
tt['s_'] = tt.sfix.map(lambda x: hd.str_stem(x)).str.split()
tt['s_'] = tt.s_.map(lambda x: hd.fix_typo(x))
prd = pd.read_csv(ps + 'prd.csv')
# t_: title passed through hd.str_stem and split on whitespace.
prd['t_'] = prd.t.map(lambda x: hd.str_stem(x)).str.split()


def is_contain_w(li, w):
    """Return True when ``w`` is found in ``li``, False otherwise."""
    return w in li

예제 #15
0
                      watchlist,
                      early_stopping_rounds=50,
                      verbose_eval=True)
    if i == 0:
        pred = model.predict(xgb.DMatrix(test))
    else:
        pred += model.predict(xgb.DMatrix(test))

# Divide the summed predictions by cnt and store them as the relevance.
sub = hd.load_sub()
sub.relevance = pred / cnt

# Keep only the rows whose id is in test_id.
sub = sub[sub.id.isin(test_id)]

# Clamp to [1, 3].  .loc replaces the .ix indexer (deprecated, then removed in
# pandas 1.0); with a boolean row mask and a column label both address the
# same cells.
sub.loc[sub.relevance > 3, 'relevance'] = 3
sub.loc[sub.relevance < 1, 'relevance'] = 1
hd.out(sub, 'xgb10_words5-_328_1')

# Deliberate stop: in a top-to-bottom run the merge below does not execute.
raise Exception("SUCCESS!!!")

#==============================================================================
# merge
#==============================================================================

# Stack the two partial prediction files and order the rows by id.
pred = pd.concat([
    pd.read_csv(hd.path_sub + 'xgb10_words1-4_328_1.csv'),
    pd.read_csv(hd.path_sub + 'xgb10_words5-_328_1.csv')
])
pred.sort_values(by='id', inplace=True)

#sub = hd.load_sub()
#sub.relevance = pred
예제 #16
0
            items.remove(i)
            items.insert(j, i)
            base['item'] = base.s_.map(
                lambda x: get_item_list_in_list(items, x))
            base['match'] = set_flg(base)
            score = fitness_function(base)
            print 'item:', i, '   score_best:', score_best, '   score:', score
            if score_best < score:
                score_best = score
            elif score_best == score:
                ng.append(i)
            else:
                items.remove(i)
                items.insert(ind_bk, i)
    df = pd.DataFrame(items)
    hd.out(df, '06sort_items_query', True)
    df = pd.DataFrame(ng)
    hd.out(df, '06sort_items_query_ng', True)

raise
for step in steps:
    for i in items[step:]:
        if i not in ng:
            sw = True
            ins_pos = items.index(i)
            while sw == True:
                ins_pos -= step
                if ins_pos < 0:
                    sw = False
                    break
                ind_bk = items.index(i)
예제 #17
0
#==============================================================================
# Frequency table over train.s_ from the project helper, as a DataFrame.
s_table = hd.mk_freq_table(train.s_)

s_table = s_table.to_frame()

# For every label i of the table, average r over the train rows whose s_
# contains i.  This scans all of train once per label.
li = []
for i in s_table.index:
    sum_r = 0
    cnt = 0
    for j in train.index:
        if i in train.s_[j]:
            sum_r += train.r[j]
            cnt += 1
    # NOTE(review): raises ZeroDivisionError if no row contains i; presumably
    # impossible because the labels come from train.s_ itself - confirm.
    li.append(float(sum_r) / cnt)
s_table['ave_r'] = li
hd.out(s_table, 's_tbl')

#==============================================================================
#
#==============================================================================
t_table = hd.mk_freq_table(train.t_)

t_table = t_table.to_frame()

li = []
for i in t_table.index:
    sum_r = 0
    cnt = 0
    for j in train.index:
        if i in train.t_[j]:
            sum_r += train.r[j]
예제 #18
0
# t2: the t_ token list joined back into one space-separated string.
prd['t2'] = prd.t_.map(lambda x:' '.join(x))

# Frequency table over tt.s_ from the project helper, as a DataFrame, plus its
# index labels as a plain list.
s_freq = hd.mk_freq_table(tt.s_)
s_freq = s_freq.to_frame()
s_ind = s_freq.index.tolist()

def is_contain_w(li, w):
    """Membership test: True if ``w`` occurs in ``li``, else False."""
    return w in li
    
def is_contain_words(li, words):
    """Return True if each entry of ``words`` occurs in ``li``.

    With no words to check the answer is True.
    """
    return all(w in li for w in words)


# For each word of s_ind, count the prd rows whose t_ contains it.  Every word
# triggers a full pass over prd.
li = []
for w in s_ind:
    li.append(len(prd[prd.t_.map(lambda x:is_contain_w(x,w))]))
# li is in s_ind order, which is the index order of s_freq.
s_freq['w'] = li

# Expose the index label as an ordinary column for the output.
s_freq['s'] = s_freq.index

hd.out(s_freq[['s','w']],'05tfidf_weight')