Пример #1
0
def get_att_sub(base, name):
    """Attach the stemmed, tokenised values of one attribute to ``base``.

    Loads the attribute table with ``hd.load_att()``, keeps the rows whose
    ``name`` column equals ``name`` and whose product id occurs in
    ``base.pid``, runs each value through ``hd.str_stem`` and splits the
    result on whitespace.  When a product has several rows for the
    attribute, their token lists are concatenated in row order, so every
    product contributes at most one row to the merge.

    Args:
        base: DataFrame with a ``pid`` column.
        name: Attribute name to select; also the label of the added column.

    Returns:
        ``base`` left-merged on ``pid`` with a column labelled ``name`` that
        holds a list of tokens (missing where the product has no such
        attribute).
    """
    att = hd.load_att()
    att = att[att.name == name].drop('name', axis=1)
    # Positional rename: assumes exactly two columns remain (product id, then
    # value) -- TODO confirm against hd.load_att().  The labels must be a flat
    # list; a nested list makes pandas build a MultiIndex, which breaks the
    # ``att.pid`` lookup and the merge below.
    att.columns = ['pid', name]
    # Copy so the column assignment below writes to an independent frame
    # rather than to a slice of the loaded table.
    att = att[att.pid.isin(base.pid.tolist())].copy()
    att[name] = att[name].map(hd.str_stem).str.split()

    repeated = att['pid'].duplicated(keep=False)
    if repeated.any():
        # Gather the tokens of every repeated pid in a single pass over the
        # untouched rows, so a pid with three or more rows does not get
        # tokens counted more than once.
        combined = {}
        for pid, tokens in zip(att.loc[repeated, 'pid'],
                               att.loc[repeated, name]):
            # ``str.split`` leaves missing values as NaN; only real token
            # lists can be concatenated.
            if isinstance(tokens, list):
                combined.setdefault(pid, []).extend(tokens)
        att = att.drop_duplicates(subset='pid', keep='last').copy()
        att[name] = [combined.get(pid, tokens)
                     for pid, tokens in zip(att['pid'], att[name])]

    return pd.merge(base, att, on='pid', how='left')
Пример #2
0
def _numeric_att_rows(att, keyword):
    """Return the rows of ``att`` named like ``keyword`` with digit-only values."""
    rows = att[att.name.str.contains(keyword) == True].copy()  # noqa: E712
    # rep_alpha is defined elsewhere in this file; presumably it strips the
    # non-numeric text (units etc.) from the raw value -- TODO confirm.
    rows['value'] = rows['value'].map(rep_alpha)
    # regex=False: the dot has to be removed literally.  Interpreted as a
    # regular expression it would match every character.
    only_digits = rows['value'].str.replace('.', '', regex=False).str.isdigit()
    # Comparing with True also discards rows where the check yielded NaN.
    return rows[only_digits == True]  # noqa: E712


def _join_values_by_pid(rows, column):
    """Collapse ``rows`` to one row per pid, values space-joined as ``column``."""
    grouped = {}
    for pid, value in zip(rows['pid'], rows['value']):
        grouped.setdefault(int(pid), []).append(value)
    # Explicit dtypes keep ``pid`` integer even when no row matched, so the
    # merge against the product table never mixes integer and object keys.
    return pd.DataFrame(
        {
            'pid': pd.Series(list(grouped), dtype='int64'),
            column: pd.Series([' '.join(v) for v in grouped.values()],
                              dtype=object),
        },
        columns=['pid', column],
    )


def get_size(prd):
    """Add ``att_size`` and ``att_weight`` text columns to ``prd``.

    Loads the attribute table with ``hd.load_att()`` and restricts it to the
    products present in ``prd``.  Values of attributes whose name contains
    ``Depth``, ``Height`` or ``Width`` are cleaned with ``rep_alpha``,
    filtered to digit-only strings (dots ignored) and space-joined per
    product into ``att_size``; within one product the values appear in the
    order depth, height, width.  Attributes whose name contains ``Weight``
    are handled the same way and stored in ``att_weight``.

    Args:
        prd: DataFrame with a ``pid`` column.

    Returns:
        ``prd`` left-merged with the two new columns; products without a
        matching attribute get missing values.
    """
    att = hd.load_att()
    att = att[att.pid.isin(prd.pid.tolist())]

    dimensions = pd.concat(
        [_numeric_att_rows(att, keyword)
         for keyword in ('Depth', 'Height', 'Width')]
    )
    size = _join_values_by_pid(dimensions, 'att_size')
    prd = pd.merge(prd, size, on='pid', how='left')

    weight = _join_values_by_pid(_numeric_att_rows(att, 'Weight'),
                                 'att_weight')
    prd = pd.merge(prd, weight, on='pid', how='left')

    return prd
Пример #3
0
# NOTE: fragment of a longer script -- ``train`` and ``test`` are expected to
# have been created before this point.
# Column labels must be a flat list; a nested list makes pandas build a
# MultiIndex, which breaks the attribute-style column access used below.
test.columns = ['id', 'pid', 't', 's']

# --- product table: one row per product id ---
prd = pd.concat([train, test])
del train
del test
prd = prd.drop_duplicates(subset='pid')
prd.drop(['id', 's', 'r'], axis=1, inplace=True)

prd = prd.sort_values(by='pid')

# --- title: stemmed and split into a token list ---
prd.loc[:, 't_'] = prd.t.map(lambda x: hd.str_stem(x)).str.split()
prd.reset_index(drop=True, inplace=True)

# --- brand: taken from the 'MFG Brand Name' attribute ---
name = 'MFG Brand Name'
pid = prd.pid.tolist()
att = hd.load_att()
# Copy so the edits below work on an independent frame, not a slice.
att = att[att.name == name].copy()
del att['name']
# Positional rename: assumes two columns remain (product id, then value)
# -- TODO confirm against hd.load_att().
att.columns = ['pid', name]
att = att[att.pid.isin(pid)].copy()
att[name] = att[name].map(lambda x: hd.str_stem(x))

prd = pd.merge(prd, att, on='pid', how='left')

# A product with several brand rows is duplicated by the merge; keep the
# first row for each pid.
prd = prd.drop_duplicates(subset='pid')
prd.reset_index(drop=True, inplace=True)
prd.rename(columns={'MFG Brand Name': 'p_brand'}, inplace=True)
prd['p_brand'] = prd.p_brand.fillna('null')
# Distinct lower-cased brands without the 'null' placeholder.  Filtering
# (rather than list.remove) does not raise when every product has a brand.
brand = [b for b in prd.p_brand.str.lower().unique().tolist() if b != 'null']