df['st'] = df.s_item + ' ' + df.t_item
freq = df.st.value_counts()
df = freq.to_frame()


def ret0(s):
    return s.split()[0]


def ret1(s):
    return s.split()[1]

df['s'] = df.index.map(lambda x: ret0(x))
df['t'] = df.index.map(lambda x: ret1(x))
# keep pairs that co-occur more than twice, are not self-pairs, and whose
# source token is longer than 2 characters
df = df[(df.st > 2) & (df.s != df.t) & (df.s.map(len) > 2)]
hd.out(df[['s', 't']], '07syn')
raise  # stop here when running the script top to bottom

for i in train.index:
    if 'screww' in train.s[i]:
        print train.s_[i], i

for i in range(30):
    print train.s_item1[i], hd.get_syn(train.s_item1[i])
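#==============================================================================
# hd.get_syn's implementation lives in homedepot.py and is not shown here.
# A minimal sketch of how such a lookup could be backed by the '07syn' table
# written above; the dict construction and function names are assumptions for
# illustration, not hd's actual code:
#==============================================================================
import pandas as pd


def load_syn_dict(path):
    # map each source token 's' to the list of 't' tokens it co-occurs with
    syn = pd.read_csv(path)
    return syn.groupby('s')['t'].apply(list).to_dict()


def get_syn_sketch(word, syn_dict):
    return syn_dict.get(word, [])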
        model = xgb.train(param, xgtrain, nround, watchlist,
                          early_stopping_rounds=50, verbose_eval=True)
        if i == 0:
            pred = model.predict(xgb.DMatrix(test))
        else:
            pred += model.predict(xgb.DMatrix(test))

raise Exception("SUCCESS!!!")  # intentional stop; the lines below are run by hand

sub = hd.load_sub()
sub.relevance = pred / cnt
sub.ix[sub.relevance > 3, 'relevance'] = 3
sub.ix[sub.relevance < 1, 'relevance'] = 1
hd.out(sub, 'xgb10_327_2')

#==============================================================================
#
#==============================================================================
from sklearn.ensemble import RandomForestRegressor
import stacking as st
from sklearn.metrics import mean_squared_error

clf = RandomForestRegressor(
    n_estimators=np.random.randint(5000, 5300),
    criterion="mse",
    max_features=0.8,
    max_depth=np.random.randint(10, 12),
    n_jobs=-1,
)
pred = st.predict(train, labels, clf)
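#==============================================================================
# st.predict comes from the local 'stacking' module, whose source is not in
# this file. A hedged sketch of a typical out-of-fold stacking predictor under
# that assumption (function name and fold count are illustrative; train and
# labels are assumed to be pandas objects):
#==============================================================================
import numpy as np
from sklearn.cross_validation import KFold  # sklearn API of this era


def predict_oof_sketch(train, labels, clf, n_folds=5):
    # fit on k-1 folds and score the held-out fold, so every train row gets
    # a prediction from a model that never saw it
    oof = np.zeros(len(train))
    for tr_idx, va_idx in KFold(len(train), n_folds=n_folds, shuffle=True):
        clf.fit(train.iloc[tr_idx], labels.iloc[tr_idx])
        oof[va_idx] = clf.predict(train.iloc[va_idx])
    return oof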
        p1, p2 = np.random.randint(GLENGTH, size=2)
        self.gtype[p1], self.gtype[p2] = self.gtype[p2], self.gtype[p1]
        self.f = 0.0  # invalidate cached fitness after the swap mutation

#==============================================================================
#
#==============================================================================
# initialize
pop = Pop()
for i in range(GENERATION):
    pop.kill_genes()
    pop.calc_f()
    pop.print_f()
    pop.copy_top_gene()
    pop.generate_population()

elapsed_time = time.time() - start
print 'elapsed_time:', (elapsed_time / 60), "min"

items['order'] = pop.genes[0].gtype
items.sort_values(by='order', inplace=True)
hd.out(items, 'items_opted_' + str(ospid), True)
items.sort_index(inplace=True)

# main
if __name__ == "__main__":
    print pop.genes[0].f
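#==============================================================================
# Pop and Gene are defined earlier in this script; the fragment above is the
# tail of a swap-mutation method. A self-contained sketch of that operator,
# under the assumption that a genotype is an integer ordering of length
# GLENGTH (class and method names here are illustrative):
#==============================================================================
import numpy as np


class GeneSketch(object):
    def __init__(self, glength):
        self.gtype = np.random.permutation(glength)
        self.f = 0.0  # cached fitness

    def mutate_swap(self):
        # exchange two random loci and invalidate the cached fitness
        p1, p2 = np.random.randint(len(self.gtype), size=2)
        self.gtype[p1], self.gtype[p2] = self.gtype[p2], self.gtype[p1]
        self.f = 0.0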
    weight.value = weight.value.map(lambda x: rep_alpha(x))
    # escape the dot: str.replace treats the pattern as a regex, and a bare
    # '.' would match (and strip) every character
    weight = weight[weight.value.str.replace(r'\.', '').str.isdigit()]
    del weight['name']

    merged = pd.concat([depth, height, width])
    del merged['name']
    merged.sort_values(by='pid', inplace=True)
    merged.reset_index(drop=True, inplace=True)

    li = []
    for pid in merged.pid.unique().tolist():
        li.append(
            [int(pid), ' '.join(merged[merged.pid == pid].value.tolist())])
    size = pd.DataFrame(li, columns=['pid', 'att_size'])
    prd = pd.merge(prd, size, on='pid', how='left')

    li = []
    for pid in weight.pid.unique().tolist():
        li.append(
            [int(pid), ' '.join(weight[weight.pid == pid].value.tolist())])
    weight = pd.DataFrame(li, columns=['pid', 'att_weight'])
    prd = pd.merge(prd, weight, on='pid', how='left')
    return prd

prd = get_size(prd)
prd.fillna('', inplace=True)
hd.out(prd[['pid', 'att_size', 'att_weight']], '03att_size_weight')
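#==============================================================================
# The two per-pid append loops above can each be expressed as a single
# groupby; an equivalent sketch (same output, modulo the int() cast on pid),
# shown as an alternative only:
#==============================================================================
def join_by_pid_sketch(frame, out_col):
    # one space-joined string of 'value' entries per pid
    g = frame.groupby('pid')['value'].apply(' '.join).reset_index()
    g.columns = ['pid', out_col]
    return g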
        cross = pd.crosstab(df.s, df.t)
        if w == words[0]:
            cross_tbl = cross
        else:
            cross_tbl = pd.concat([cross_tbl, cross])

cross_tbl.fillna(0, inplace=True)
cross_tbl = cross_tbl.div(cross_tbl.sum(1), axis=0)  # normalize rows to sum to 1
cross_tbl['w'] = get_entropy(cross_tbl)
subset = cross_tbl['w']
subset = subset.to_frame()
subset['s'] = subset.index
hd.out(subset[['s', 'w']], '06entropy_weight1')
#cross_tbl['sum'] = cross_tbl.sum(axis=1)
raise Exception("!!!!!!!!")  # intentional stop

merged = tt.merge(prd, on='pid')
merged = merged[merged.r < 1.5]
words = hd.mk_freq_table(merged.s_).index.tolist()
for w in words:
    tmp = merged[merged.t_.map(lambda x: is_contain_w(x, w))].t_.tolist()
    if len(tmp) > 0:
        tmp = flaten_list(tmp, w)
        df = pd.DataFrame(tmp, columns=['s', 't'])
        cross = pd.crosstab(df.s, df.t)
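#==============================================================================
# get_entropy is defined elsewhere in this script. Since cross_tbl rows are
# normalized to sum to 1 right before the call, a plausible sketch is the
# Shannon entropy of each row's co-occurrence distribution (an assumption,
# natural log):
#==============================================================================
import numpy as np


def get_entropy_sketch(tbl):
    p = tbl.replace(0, np.nan)  # drop zero cells so 0 * log(0) contributes 0
    return -(p * np.log(p)).sum(axis=1)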
============================"""

print '========== PRODUCT =========='
prd = prd.sort_values(by='pid')

"TITLE"
prd.reset_index(drop=True, inplace=True)
prd['tlen'] = prd.t.str.len()
prd['t_'] = prd.t.map(lambda x: hd.str_stem(x)).str.split()
prd.loc[:, 't_len'] = prd.t_.str.len()

merged = pd.merge(tt, prd[['pid', 't_']], on='pid', how='left')
tt = hd.rep_comb(merged)
tt.s = tt.s_.map(lambda x: ' '.join(x))
# flag queries whose token count changed after combining words
tt['comb'] = 1 - (tt['s_len'] == tt.s_.str.len()) * 1
hd.out(tt[['id', 's', 'typo', 'comb']], '04query_comb' + str(mod))
raise Exception('SUCCESS')  # intentional stop

#==============================================================================
# CONCAT
#==============================================================================
for i in range(4):
    if i == 0:
        tt = pd.read_csv(path_sub + '04query_comb' + str(i) + '.csv')
    else:
        tt = pd.concat([tt, pd.read_csv(path_sub + '04query_comb' + str(i) + '.csv')])
tt.sort_values(by='id', inplace=True)
hd.out(tt[['id', 's', 'typo', 'comb']], '04query_comb')
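#==============================================================================
# The CONCAT loop above can also be written as a single expression;
# equivalent sketch, shown for clarity only:
#==============================================================================
tt_alt = pd.concat([pd.read_csv(path_sub + '04query_comb' + str(i) + '.csv')
                    for i in range(4)]).sort_values(by='id')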
        xgval = xgb.DMatrix(x_valid, label=y_valid)
        # train using early stopping and predict
        watchlist = [(xgtrain, 'train'), (xgval, 'val')]
        model = xgb.train(param, xgtrain, nround, watchlist,
                          early_stopping_rounds=50, verbose_eval=True)
        if i == 0:
            pred = model.predict(xgb.DMatrix(test))
        else:
            pred += model.predict(xgb.DMatrix(test))

    test_id['relevance'] = pred / cnt
    if w == words[0]:
        stack = test_id
    else:
        stack = pd.concat([stack, test_id])

sub = pd.merge(sub, stack, on='id', how='left')
sub = sub[sub.relevance > 0]
sub.ix[sub.relevance > 3, 'relevance'] = 3
sub.ix[sub.relevance < 1, 'relevance'] = 1
hd.out(sub, 'xgb10_words1-4_328_1')
raise Exception("SUCCESS!!!")  # intentional stop
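#==============================================================================
# The two .ix assignments above clip predictions to the valid [1, 3]
# relevance range; pandas' clip is an equivalent one-liner:
#==============================================================================
sub['relevance'] = sub.relevance.clip(lower=1, upper=3)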
#==============================================================================
li = []
for q in qq:
    base['item'] = base.s_.map(lambda x: get_item_list_in_list([q], x))
    base['match'] = set_flg(base)
    print q
    li.append(get_dict(base))

df = pd.DataFrame(li)
col = ['col_' + str(x) for x in rr]
df.columns = col
df.index = qq
hd.out(df, '05item_tbl_query', True)

df = pd.read_csv(hd.path_sub + '05item_tbl_query.csv', index_col=0)
df['sum'] = df[col].sum(1)
df['ave'] = df[col].mean(1)
df.fillna(0, inplace=True)
# keep queries whose match rate is high for the high-relevance bins
df = df[(df['col_2.33'] > 0.4) & (df['col_2.67'] > 0.5)
        & (df['col_3.0'] > 0.5) & (df['sum'] != 1)]
hd.out(df, '05item_tbl_query_selected', True)

#==============================================================================
# EXTRACT ITEMS 2
#==============================================================================
li = []
att = att[att.name == name]
del att['name']
att.columns = ['pid', name]  # a flat list, not a nested one, keeps a plain Index
att = att[att.pid.isin(pid)]
att[name] = att[name].map(lambda x: hd.str_stem(x))
prd = pd.merge(prd, att, on='pid', how='left')
prd = prd.drop_duplicates(subset='pid')
prd.reset_index(drop=True, inplace=True)
prd.rename(columns={'MFG Brand Name': 'p_brand'}, inplace=True)
prd['p_brand'] = prd.p_brand.fillna('null')
brand = prd.p_brand.str.lower().unique().tolist()
brand.remove('null')


def revise_brand(prd, brand):
    # fill a missing brand from the first up-to-three title tokens when that
    # prefix is a known brand
    for i in prd.index:
        if prd.p_brand[i] == 'null':
            for j in range(4):
                if ' '.join(prd.t_[i][:j]) in brand:
                    br = ' '.join(prd.t_[i][:j])
                    prd.loc[i, 'p_brand'] = br  # .loc avoids chained assignment
                    break
    return prd

prd = revise_brand(prd, brand)
hd.out(prd[['pid', 'p_brand']], '01att_brand')
        return True
    return False


def is_contain_words(li, words):
    cnt = 0
    for w in words:
        if w in li:
            cnt += 1
    if cnt == len(words):
        return True
    return False

tt.s_ = tt.s_.map(lambda x: del_num(x))
query = tt.s_.map(lambda x: ' '.join(x)).unique().tolist()
query = map(lambda x: x.split(), query)

li = []
for qq in query:
    li.append([
        ' '.join(qq),
        len(prd[prd.t_.map(lambda x: is_contain_words(x, qq))])
    ])
df = pd.DataFrame(li, columns=['s', 'w'])
hd.out(df, '05tfidf_weight_allquery')
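#==============================================================================
# is_contain_words checks that every query token appears in the title token
# list; an equivalent set-based sketch (duplicates behave the same way):
#==============================================================================
def is_contain_words_set(li, words):
    return set(words) <= set(li)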
                items.remove(i)
                items.insert(j, i)
                base['item'] = base.t_.map(
                    lambda x: get_item_list_in_list(items, x))
                base['match'] = set_flg(base)
                score = fitness_function(base)
                print 'item:', i, ' score_best:', score_best, ' score:', score
                if score_best < score:
                    score_best = score
                elif score_best == score:
                    ng.append(i)
                else:
                    # the move did not help: undo it
                    items.remove(i)
                    items.insert(ind_bk, i)

df = pd.DataFrame(items)
hd.out(df, '06sort_items_title', True)
df = pd.DataFrame(ng)
hd.out(df, '06sort_items_title_ng', True)
raise  # stop here when running the script top to bottom

for step in steps:
    for i in items[step:]:
        if i not in ng:
            sw = True
            ins_pos = items.index(i)
            while sw:
                ins_pos -= step
                if ins_pos < 0:
                    sw = False
                    break
                ind_bk = items.index(i)
        mate += prd['att_material'][i]
    li_color.append(' '.join(color))
    li_power.append(' '.join(power))
    li_mate.append(' '.join(mate))

prd['att_color'] = li_color
prd['att_power'] = li_power
prd['att_material'] = li_mate
prd.drop(['Color', 'Color Family', 'Color/Finish', 'Finish', 'Finish Family',
          'Mount Type', 'Fuel Type', 'Power Type', 'Bulb Type'], axis=1,
         inplace=True)
prd['att_material'] = prd.att_material.map(lambda x: hd.str_stem(x))
hd.out(prd[['pid', 'att_color', 'att_power', 'att_material']],
       '02att_color_power_material')
words = [x for x in hd.mk_freq_table(merged.s_).index.tolist() if len(x) > 2]
li = []
for w in words:
    # other = list(words)
    # other.remove(w)
    contain_s = merged[merged.t_.map(lambda x: is_contain_w(x, w))
                       & merged.s_.map(lambda x: is_contain_w(x, w))]
    print w, ':', 'MEAN', round(contain_s.r.mean(), 3), 'STD', round(contain_s.r.std(), 3)
    li.append([w, round(contain_s.r.mean(), 3)])
df = pd.DataFrame(li, columns=['s', 'imp'])
df.fillna(0, inplace=True)
hd.out(df, '04item_imp')

#==============================================================================
#
#==============================================================================


def flaten_list(lili):
    li = []
    for i in lili:
        for j in i:
            if j.isalpha() and len(j) > 2:
                li.append(j)
    return li
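#==============================================================================
# An equivalent, flatter version of flaten_list using itertools (same
# alpha/length filter), shown as an alternative only:
#==============================================================================
from itertools import chain


def flaten_list_alt(lili):
    return [j for j in chain.from_iterable(lili)
            if j.isalpha() and len(j) > 2]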
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 7 22:23:34 2016

@author: Kazuki
"""
import homedepot as hd  # comment out syn_tbl
import numpy as np
import pandas as pd
import hdpath as hdp

tt, prd = hd.load_all(onlytrain=True, mk_csv=True)
hd.out(tt, 'tt')
hd.out(prd, 'prd')

po, ps = hdp.load()
tt = pd.read_csv(ps + 'tt.csv')
tt['s_'] = tt.sfix.map(lambda x: hd.str_stem(x)).str.split()
tt['s_'] = tt.s_.map(lambda x: hd.fix_typo(x))
prd = pd.read_csv(ps + 'prd.csv')
prd['t_'] = prd.t.map(lambda x: hd.str_stem(x)).str.split()


def is_contain_w(li, w):
    if w in li:
        return True
    return False
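#==============================================================================
# hd.str_stem is the project's text normalizer (defined in homedepot.py, not
# shown here). A minimal sketch of the usual shape of such a function --
# lowercasing plus Snowball stemming -- purely as an assumption:
#==============================================================================
from nltk.stem.snowball import SnowballStemmer

_stemmer = SnowballStemmer('english')


def str_stem_sketch(s):
    # lowercase, split on whitespace, stem each token, rejoin
    return ' '.join(_stemmer.stem(w) for w in str(s).lower().split())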
        model = xgb.train(param, xgtrain, nround, watchlist,
                          early_stopping_rounds=50, verbose_eval=True)
        if i == 0:
            pred = model.predict(xgb.DMatrix(test))
        else:
            pred += model.predict(xgb.DMatrix(test))

sub = hd.load_sub()
sub.relevance = pred / cnt
sub = sub[sub.id.isin(test_id)]
sub.ix[sub.relevance > 3, 'relevance'] = 3
sub.ix[sub.relevance < 1, 'relevance'] = 1
hd.out(sub, 'xgb10_words5-_328_1')
raise Exception("SUCCESS!!!")  # intentional stop

#==============================================================================
# merge
#==============================================================================
pred = pd.concat([
    pd.read_csv(hd.path_sub + 'xgb10_words1-4_328_1.csv'),
    pd.read_csv(hd.path_sub + 'xgb10_words5-_328_1.csv')
])
pred.sort_values(by='id', inplace=True)
#sub = hd.load_sub()
#sub.relevance = pred
                items.remove(i)
                items.insert(j, i)
                base['item'] = base.s_.map(
                    lambda x: get_item_list_in_list(items, x))
                base['match'] = set_flg(base)
                score = fitness_function(base)
                print 'item:', i, ' score_best:', score_best, ' score:', score
                if score_best < score:
                    score_best = score
                elif score_best == score:
                    ng.append(i)
                else:
                    # the move did not help: undo it
                    items.remove(i)
                    items.insert(ind_bk, i)

df = pd.DataFrame(items)
hd.out(df, '06sort_items_query', True)
df = pd.DataFrame(ng)
hd.out(df, '06sort_items_query_ng', True)
raise  # stop here when running the script top to bottom

for step in steps:
    for i in items[step:]:
        if i not in ng:
            sw = True
            ins_pos = items.index(i)
            while sw:
                ins_pos -= step
                if ins_pos < 0:
                    sw = False
                    break
                ind_bk = items.index(i)
#==============================================================================
s_table = hd.mk_freq_table(train.s_)
s_table = s_table.to_frame()
li = []
for i in s_table.index:
    sum_r = 0
    cnt = 0
    for j in train.index:
        if i in train.s_[j]:
            sum_r += train.r[j]
            cnt += 1
    li.append(float(sum_r) / cnt)
s_table['ave_r'] = li
hd.out(s_table, 's_tbl')

#==============================================================================
#
#==============================================================================
t_table = hd.mk_freq_table(train.t_)
t_table = t_table.to_frame()
li = []
for i in t_table.index:
    sum_r = 0
    cnt = 0
    for j in train.index:
        if i in train.t_[j]:
            sum_r += train.r[j]
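#==============================================================================
# The s_table loop above rescans train once per vocabulary word; the same
# per-word mean relevance can be accumulated in a single pass over the rows
# (equivalent result, shown as a sketch):
#==============================================================================
from collections import defaultdict

sums, cnts = defaultdict(float), defaultdict(int)
for j in train.index:
    for w in set(train.s_[j]):  # each word counted once per row
        sums[w] += train.r[j]
        cnts[w] += 1
ave_r = dict((w, sums[w] / cnts[w]) for w in sums)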
prd['t2'] = prd.t_.map(lambda x: ' '.join(x))
s_freq = hd.mk_freq_table(tt.s_)
s_freq = s_freq.to_frame()
s_ind = s_freq.index.tolist()


def is_contain_w(li, w):
    if w in li:
        return True
    return False


def is_contain_words(li, words):
    cnt = 0
    for w in words:
        if w in li:
            cnt += 1
    if cnt == len(words):
        return True
    return False

li = []
for w in s_ind:
    li.append(len(prd[prd.t_.map(lambda x: is_contain_w(x, w))]))
s_freq['w'] = li
s_freq['s'] = s_freq.index
hd.out(s_freq[['s', 'w']], '05tfidf_weight')
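#==============================================================================
# Equivalent document-frequency count in one pass over the titles, instead of
# one full scan of prd per query word (sketch; same counts for words that
# appear):
#==============================================================================
from collections import Counter

doc_freq = Counter()
for t in prd.t_:
    doc_freq.update(set(t))  # each title contributes each word once
s_freq['w_alt'] = [doc_freq.get(w, 0) for w in s_ind]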