def get_att_sub(base, name):
    """Left-merge the tokenised values of one attribute onto ``base``.

    The attribute table from ``hd.load_att()`` is restricted to rows whose
    ``name`` column equals *name* and whose ``pid`` occurs in ``base``.  Each
    value is passed through ``hd.str_stem`` and split on whitespace.  When a
    pid has several rows for the attribute, their token lists are
    concatenated in row order so that the pid ends up with exactly one row.

    Args:
        base: DataFrame with a ``pid`` column.
        name: Attribute name to extract; also used as the new column label.

    Returns:
        ``base`` left-merged on ``pid`` with a column called *name* that
        holds a list of tokens (missing where the pid has no such attribute).
    """
    wanted_pids = base['pid'].tolist()

    att = hd.load_att()
    att = att[att['name'] == name].drop(columns='name')
    # Assumes exactly two columns (pid and the attribute value) remain here.
    # Flat labels on purpose: a nested list would create a MultiIndex.
    att.columns = ['pid', name]
    # Explicit copy so the column assignments below do not write to a slice.
    att = att[att['pid'].isin(wanted_pids)].copy()
    att[name] = att[name].map(hd.str_stem).str.split()

    dup_mask = att['pid'].duplicated(keep=False)
    if dup_mask.any():
        # Concatenate every token list of a repeated pid exactly once.
        # Re-summing rows in place would count the tokens of an already
        # merged row again for pids that occur three or more times.
        combined = {}
        for pid, tokens in zip(att.loc[dup_mask, 'pid'],
                               att.loc[dup_mask, name]):
            combined.setdefault(pid, []).extend(tokens)
        att = att.drop_duplicates(subset='pid', keep='last').copy()
        att[name] = [
            combined.get(pid, tokens)
            for pid, tokens in zip(att['pid'], att[name])
        ]

    return pd.merge(base, att, on='pid', how='left')
def _numeric_attr_rows(att, keyword):
    """Return ``pid``/``value`` rows whose name contains *keyword* and whose
    value, after ``rep_alpha``, consists only of digits and dots."""
    # na=False treats a missing attribute name as "no match".
    rows = att[att['name'].str.contains(keyword, na=False)].copy()
    rows['value'] = rows['value'].map(rep_alpha)
    # regex=False: the dot must be removed literally, independent of the
    # pandas version's default for ``regex``.
    digits_only = rows['value'].str.replace('.', '', regex=False).str.isdigit()
    # eq(True) maps missing results (non-string values) to False.
    return rows.loc[digits_only.eq(True), ['pid', 'value']]


def _join_values_by_pid(rows, column):
    """Collapse *rows* to one row per pid, space-joining the values in row
    order, and return them as a ``pid``/*column* DataFrame."""
    grouped = {}
    for pid, value in zip(rows['pid'], rows['value']):
        grouped.setdefault(int(pid), []).append(value)
    # Explicit dtypes keep the merge key integer even when nothing matched.
    return pd.DataFrame({
        'pid': pd.Series(list(grouped), dtype='int64'),
        column: pd.Series(
            [' '.join(values) for values in grouped.values()], dtype=object
        ),
    })


def get_size(prd):
    """Add ``att_size`` and ``att_weight`` text columns to a product table.

    Attribute rows from ``hd.load_att()`` are restricted to the pids present
    in ``prd``.  Rows whose name contains ``Depth``, ``Height`` or ``Width``
    supply ``att_size``; rows whose name contains ``Weight`` supply
    ``att_weight``.  Values are cleaned with ``rep_alpha`` and kept only when
    the result is made of digits and dots.  All kept values of a pid are
    joined with single spaces; for ``att_size`` the order is depth values,
    then height values, then width values.

    Args:
        prd: DataFrame with a ``pid`` column.

    Returns:
        ``prd`` left-merged on ``pid`` with the two new columns (missing
        where a pid has no usable value).
    """
    att = hd.load_att()
    att = att[att['pid'].isin(prd['pid'].tolist())]

    dimensions = pd.concat(
        [_numeric_attr_rows(att, key) for key in ('Depth', 'Height', 'Width')]
    )
    weight = _numeric_attr_rows(att, 'Weight')

    prd = pd.merge(
        prd, _join_values_by_pid(dimensions, 'att_size'), on='pid', how='left'
    )
    prd = pd.merge(
        prd, _join_values_by_pid(weight, 'att_weight'), on='pid', how='left'
    )
    return prd
# Flat column labels for the test frame; a nested list would create a
# MultiIndex on current pandas.
test.columns = ['id', 'pid', 't', 's']

# --- Product table: one row per pid, built from train and test together ---
prd = pd.concat([train, test])
del train
del test
prd = prd.drop_duplicates(subset='pid')
prd.drop(['id', 's', 'r'], axis=1, inplace=True)
prd = prd.sort_values(by='pid')

# --- Title: stemmed, whitespace-tokenised copy of column 't' ---
prd.loc[:, 't_'] = prd.t.map(lambda x: hd.str_stem(x)).str.split()
prd.reset_index(drop=True, inplace=True)

# --- Brand: attach the stemmed 'MFG Brand Name' attribute as 'p_brand' ---
name = 'MFG Brand Name'
pid = prd.pid.tolist()
att = hd.load_att()
att = att[att.name == name]
del att['name']
# Assumes exactly two columns (pid and the attribute value) remain here.
att.columns = ['pid', name]
# Explicit copy so the column assignment below does not write to a slice.
att = att[att.pid.isin(pid)].copy()
att[name] = att[name].map(lambda x: hd.str_stem(x))
prd = pd.merge(prd, att, on='pid', how='left')
# A pid with several brand rows is duplicated by the merge; keep the first.
prd = prd.drop_duplicates(subset='pid')
prd.reset_index(drop=True, inplace=True)
prd.rename(columns={'MFG Brand Name': 'p_brand'}, inplace=True)
prd['p_brand'] = prd.p_brand.fillna('null')

# Distinct lower-cased brand names without the 'null' placeholder.
brand = prd.p_brand.str.lower().unique().tolist()
# Guarded: list.remove raises ValueError when every product has a brand.
if 'null' in brand:
    brand.remove('null')