def union_features_one(x):
    """Measure how much of the query overlaps the product title/description.

    NOTE: a second definition with the same name follows this one in the
    file; at import time the later definition is the one that stays bound.

    Args:
        x: Mapping-like record (presumably a DataFrame row -- confirm against
            the caller) indexable by 'query', 'product_title' and
            'product_description'.

    Returns:
        pd.Series with four entries:
            'qt'   -- number of distinct query items also found in the title,
            'qtd'  -- number of distinct query items found in the title or
                      the description,
            'qtn'  -- 'qt' divided by the number of distinct query items,
            'qtdn' -- 'qtd' divided by the number of distinct query items.
        When the query yields no items at all, both ratios are 0.0.
    """
    # process_str_replace is defined elsewhere; its output is only required
    # to be an iterable of hashable items (presumably word tokens).
    q = set(process_str_replace(x['query']))
    t = set(process_str_replace(x['product_title']))
    d = set(process_str_replace(x['product_description']))

    qt = len(q & t)
    qtd = len(q & (t | d))

    # An empty query set used to raise ZeroDivisionError here; nothing can
    # overlap with an empty query, so the normalised overlap is reported as 0.
    q_size = len(q)
    qtn = qt / q_size if q_size else 0.0
    qtdn = qtd / q_size if q_size else 0.0

    return pd.Series({'qt': qt, 'qtd': qtd, 'qtn': qtn, 'qtdn': qtdn})
def union_features_one(x):
    """Measure how much of the query overlaps the product title/description.

    NOTE: this redefines the same-named function that appears immediately
    before it in the file; this later definition is the one that stays bound.

    Args:
        x: Mapping-like record (presumably a DataFrame row -- confirm against
            the caller) indexable by 'query', 'product_title' and
            'product_description'.

    Returns:
        pd.Series with four entries:
            'qt'   -- number of distinct query items also found in the title,
            'qtd'  -- number of distinct query items found in the title or
                      the description,
            'qtn'  -- 'qt' divided by the number of distinct query items,
            'qtdn' -- 'qtd' divided by the number of distinct query items.
        When the query yields no items at all, both ratios are 0.0.
    """
    # process_str_replace is defined elsewhere; its output is only required
    # to be an iterable of hashable items (presumably word tokens).
    q = set(process_str_replace(x['query']))
    t = set(process_str_replace(x['product_title']))
    d = set(process_str_replace(x['product_description']))

    qt = len(q & t)
    qtd = len(q & (t | d))

    # Guard the denominator: an empty query set would otherwise raise
    # ZeroDivisionError. No overlap is possible, so the ratios are 0.
    q_size = len(q)
    if q_size:
        qtn = qt / q_size
        qtdn = qtd / q_size
    else:
        qtn = 0.0
        qtdn = 0.0

    return pd.Series({
        'qt': qt,
        'qtd': qtd,
        'qtn': qtn,
        'qtdn': qtdn,
    })
def get_str_for_query7(train, q, median_relevance, id=None):
    """Pool the processed 'product_names' items of matching training rows.

    A row of `train` matches when its 'query' equals `q`, its
    'median_relevance' equals `median_relevance`, and its index label differs
    from `id` (presumably so the caller can leave the current row out --
    confirm against the caller).

    Args:
        train: DataFrame with 'query', 'median_relevance' and
            'product_names' columns.
        q: Query value to match.
        median_relevance: Relevance value to match.
        id: Index label to exclude from the selection.

    Returns:
        set: Union of the items process_str_replace yields for every matching
        row's 'product_names' value; empty when no row matches.
    """
    same_query = train['query'] == q
    same_relevance = train['median_relevance'] == median_relevance
    other_rows = train.index != id
    matching = train[same_query & same_relevance & other_rows]

    # Flatten the per-row items straight into one set.
    return {
        item
        for names in matching['product_names'].values
        for item in process_str_replace(names)
    }
def create_similarity_features7(train, row, id=None):
    """Count overlaps between a row's names and same-query rows per relevance.

    For each relevance level 1 through 4, the pooled item set of the training
    rows sharing this row's query (as returned by get_str_for_query7, with
    index label `id` excluded) is intersected with the item set of this row's
    own 'product_names'.

    Args:
        train: DataFrame handed through to get_str_for_query7.
        row: Mapping-like record indexable by 'product_title', 'query' and
            'product_names'.
        id: Index label forwarded to get_str_for_query7 for exclusion.

    Returns:
        tuple: Four ints -- the intersection sizes for relevance levels
        1, 2, 3 and 4, in that order.
    """
    # Presumably progress/debug output; clr_print is defined elsewhere.
    clr_print(id, row['product_title'])

    query = row['query']
    # Pooled sets are built before the row's own set, as in the original.
    pooled_by_level = [
        get_str_for_query7(train, query, level, id)
        for level in (1, 2, 3, 4)
    ]
    own_items = set(process_str_replace(row['product_names']))

    return tuple(len(pooled & own_items) for pooled in pooled_by_level)