示例#1
0
def union_features_one(x):
    """Compute query-overlap features for one record.

    ``x`` is indexed by the keys ``'query'``, ``'product_title'`` and
    ``'product_description'``. Each value is passed through
    ``process_str_replace`` and the result is turned into a set
    (presumably a collection of tokens -- confirm against the helper).

    Returns a ``pd.Series`` with four entries:

    * ``qt``   -- number of query items also present in the title set
    * ``qtd``  -- number of query items present in the title or the
      description set
    * ``qtn``  -- ``qt`` divided by the size of the query set
    * ``qtdn`` -- ``qtd`` divided by the size of the query set

    When the query set is empty the two ratios are reported as ``0.0``
    instead of raising ``ZeroDivisionError``.
    """
    q = set(process_str_replace(x['query']))
    t = set(process_str_replace(x['product_title']))
    d = set(process_str_replace(x['product_description']))
    qt = len(q & t)
    qtd = len(q & (t | d))
    # An empty query set has no overlap with anything; guard the division
    # so one degenerate record cannot abort the whole feature computation.
    n_query = len(q)
    qtn = qt / n_query if n_query else 0.0
    qtdn = qtd / n_query if n_query else 0.0
    return pd.Series({'qt': qt, 'qtd': qtd,
                      'qtn': qtn,
                      'qtdn': qtdn})
示例#2
0
def union_features_one(x):
    """Return overlap counts and ratios between a query and product text.

    The values ``x['query']``, ``x['product_title']`` and
    ``x['product_description']`` are each run through
    ``process_str_replace`` and converted to sets (the element type depends
    on that helper -- presumably tokens, to be confirmed).

    The result is a ``pd.Series`` with the keys:

    * ``'qt'``: size of the intersection of the query and title sets.
    * ``'qtd'``: size of the intersection of the query set with the union
      of the title and description sets.
    * ``'qtn'``: ``qt`` normalised by the query set size.
    * ``'qtdn'``: ``qtd`` normalised by the query set size.

    If the query set is empty, both normalised values are ``0.0`` rather
    than triggering a ``ZeroDivisionError``.
    """
    q = set(process_str_replace(x['query']))
    t = set(process_str_replace(x['product_title']))
    d = set(process_str_replace(x['product_description']))
    qt = len(q & t)
    qtd = len(q & (t | d))

    query_size = len(q)
    if query_size:
        qtn = qt / query_size
        qtdn = qtd / query_size
    else:
        # Nothing to normalise by: an empty query overlaps with nothing.
        qtn = 0.0
        qtdn = 0.0

    return pd.Series({
        'qt': qt,
        'qtd': qtd,
        'qtn': qtn,
        'qtdn': qtdn
    })
def get_str_for_query7(train, q, median_relevance, id=None):
    """Gather the processed ``product_names`` items of matching rows.

    A row of ``train`` is kept when its ``'query'`` value equals ``q``, its
    ``'median_relevance'`` value equals ``median_relevance`` and its index
    label differs from ``id``.

    Returns a set holding every element produced by
    ``process_str_replace`` for the ``'product_names'`` value of each kept
    row; the set is empty when no row is kept.
    """
    same_query = train['query'] == q
    same_relevance = train['median_relevance'] == median_relevance
    not_excluded = train.index != id
    matches = train[same_query & same_relevance & not_excluded]
    return {
        item
        for names in matches['product_names'].values
        for item in process_str_replace(names)
    }
def create_similarity_features7(train, row, id=None):
    """Count how many of a row's items appear at each relevance level.

    For each relevance level 1 through 4, ``get_str_for_query7`` is asked
    for the item set of rows in ``train`` that share ``row['query']`` at
    that level, with ``id`` forwarded as the index label to leave out. The
    row's own ``'product_names'`` value is processed with
    ``process_str_replace`` and turned into a set.

    Returns a 4-tuple of intersection sizes, ordered by relevance level
    1, 2, 3, 4. ``clr_print`` is called first with ``id`` and the row's
    ``'product_title'``.
    """
    clr_print(id, row['product_title'])
    query = row['query']
    # Collect the per-level sets before processing the row itself, in
    # ascending relevance order.
    per_level = [
        get_str_for_query7(train, query, level, id)
        for level in (1, 2, 3, 4)
    ]
    own_items = set(process_str_replace(row['product_names']))

    return tuple(len(level_items & own_items) for level_items in per_level)