Example #1
def assemble_counts(train):
    # Build count- and edit-distance-based features for each query/title pair.
    X = []
    titles = []
    for i in range(len(train.id)):
        query = correct_string(train['query'][i].lower())
        title = correct_string(train.product_title[i].lower())

        query = (" ").join(
            [z for z in BeautifulSoup(query).get_text(" ").split(" ")])
        title = (" ").join(
            [z for z in BeautifulSoup(title).get_text(" ").split(" ")])

        query = re.sub("[^a-zA-Z0-9]", " ", query)
        title = re.sub("[^a-zA-Z0-9]", " ", title)

        query = (" ").join([stemmer.stem(z) for z in query.split(" ")])
        title = (" ").join([stemmer.stem(z) for z in title.split(" ")])

        query = " ".join(query.split())
        title = " ".join(title.split())

        #dist_qt = compression_distance(query,title)
        dist_qt2 = 1 - seq_matcher(None, query, title).ratio()

        query_len = len(query.split())
        title_len = len(title.split())

        tmp_title = title
        word_counter_qt = 0
        lev_dist_arr = []
        for q in query.split():
            lev_dist_q = []
            for t in title.split():
                lev_dist = seq_matcher(None, q, t).ratio()
                if lev_dist > 0.9:
                    word_counter_qt += 1
                    tmp_title += ' ' + q  # add such words to title to increase their weights in tfidf
                lev_dist_q.append(lev_dist)
            lev_dist_arr.append(lev_dist_q)
        last_word_in = 0
        for t in title.split():
            lev_dist = seq_matcher(None, query.split()[-1], t).ratio()
            if lev_dist > 0.9:
                last_word_in = 1
        lev_max = 0
        for item in lev_dist_arr:
            lev_max_q = max(item)
            lev_max += lev_max_q
        lev_max = 1 - lev_max / len(lev_dist_arr)
        word_counter_qt_norm = word_counter_qt / query_len
        X.append([
            query_len, title_len, word_counter_qt, lev_max, last_word_in,
            word_counter_qt_norm, dist_qt2
        ])
        titles.append(tmp_title)

    X = np.array(X).astype(float)  # np.float was removed in NumPy 1.24
    return X, np.array(titles)
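Example #1 assumes several names defined elsewhere in its source repo. A plausible preamble, as a sketch (correct_string is repo-specific and stubbed here; PorterStemmer is one plausible choice of stemmer):

import re
import numpy as np
from bs4 import BeautifulSoup
from difflib import SequenceMatcher as seq_matcher
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def correct_string(s):
    # Hypothetical stub; the source repo applies spelling corrections here.
    return s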
Example #2

def distance_between_query_and_title():

    for cata in catagories:

        with open("%s/query_unigram_%s.pickle" % (ngramFolderPath, cata),
                  "rb") as f:
            query_unigram = pickle.load(f)

        with open("%s/title_unigram_%s.pickle" % (ngramFolderPath, cata),
                  "rb") as f:
            title_unigram = pickle.load(f)

        sz = len(query_unigram)

        compression_distance = np.zeros([sz, 1])
        edit_distance = np.zeros([sz, 1])
        mean_maximum_edit_distance = np.zeros([sz, 1])

        for i in range(sz):
            query = " ".join(query_unigram[i])
            title = " ".join(title_unigram[i])

            compression_distance[i][0] = compressionDistance(query, title)
            edit_distance[i][0] = 1 - seq_matcher(None, query, title).ratio()

            lev_dist_arr = []
            for q in query.split():
                lev_dist_q = []
                for t in title.split():
                    lev_dist = seq_matcher(None, q, t).ratio()
                    lev_dist_q.append(lev_dist)
                lev_dist_arr.append(lev_dist_q)

            lev_max = 0
            for item in lev_dist_arr:
                lev_max_q = max(item)
                lev_max += lev_max_q
            lev_max = 1 - lev_max / len(lev_dist_arr)
            mean_maximum_edit_distance[i][0] = lev_max

            if (i % 1000 == 0): print(i)

        with open("%s/compression_distance_%s.pickle" % (folderPath, cata),
                  "wb") as f:
            pickle.dump(compression_distance, f)

        with open("%s/edit_distance_%s.pickle" % (folderPath, cata),
                  "wb") as f:
            pickle.dump(edit_distance, f)

        with open(
                "%s/mean_maximum_edit_distance_%s.pickle" % (folderPath, cata),
                "wb") as f:
            pickle.dump(mean_maximum_edit_distance, f)
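This function also relies on module-level names (catagories, ngramFolderPath, folderPath) and a compressionDistance helper that the snippet does not show. A minimal sketch of a zlib-based normalized compression distance, assuming that is what the repo's helper computes:

import zlib

def compressionDistance(x, y):
    # Normalized compression distance: strings that share structure compress
    # better together than apart, so similar pairs yield smaller values.
    cx = len(zlib.compress(x.encode()))
    cy = len(zlib.compress(y.encode()))
    cxy = len(zlib.compress((x + " " + y).encode()))
    return (cxy - min(cx, cy)) / max(cx, cy)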
Example #4
def assemble_counts2(train):
    # Query-side features: match counts, compression/edit distances, match rank.
    X = []
    queries = []

    for i in range(len(train.id)):
        query = train['query'][i]
        title = train.product_title[i]

        dist_qt = compression_distance(query, title)
        dist_qt2 = 1 - seq_matcher(None, query, title).ratio()

        query_len = len(query.split())

        lev_dist_arr = []
        word_rank_list = []
        word_q_ind = 0
        word_counter_qt = 0
        for q in query.split():
            word_q_ind += 1
            lev_dist_q = []
            for t in title.split():
                lev_dist = seq_matcher(None, q, t).ratio()
                if lev_dist > 0.9:
                    word_counter_qt += 1
                    word_rank_list.append(word_q_ind)
                    #tmp_title += ' '+q # add such words to title to increase their weights in tfidf
                lev_dist_q.append(lev_dist)
            lev_dist_arr.append(lev_dist_q)
        if word_counter_qt == 0:
            maxrank = 0
        else:
            # 26 appears to serve as an assumed maximum query length, so a
            # match on an earlier query word produces a larger maxrank.
            maxrank = 26 - min(word_rank_list)

        lev_max = 0
        for item in lev_dist_arr:
            lev_max_q = max(item)
            lev_max += lev_max_q
        lev_max = 1 - lev_max / len(lev_dist_arr)
        word_counter_qt_norm = word_counter_qt / query_len

        X.append([
            word_counter_qt, dist_qt, dist_qt2, lev_max, word_counter_qt_norm,
            maxrank
        ])
        queries.append(query)

    X = np.array(X).astype(float)  # np.float was removed in NumPy 1.24

    return X, np.array(queries)
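A hypothetical invocation, assuming a pandas DataFrame with id, query and product_title columns and the compression_distance/seq_matcher helpers in scope:

import pandas as pd

toy = pd.DataFrame({
    "id": [1],
    "query": ["metal bird feeder"],
    "product_title": ["metal bird feeder pole"],
})
X, queries = assemble_counts2(toy)
print(X.shape)  # (1, 6): one row, six features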
Example #6

def last_word_from_query_present_title():
    for cata in catagories:
        with open("%s/query_unigram_%s.pickle" % (ngramFolderPath, cata),
                  "rb") as f:
            query_unigram = pickle.load(f)

        with open("%s/title_unigram_%s.pickle" % (ngramFolderPath, cata),
                  "rb") as f:
            title_unigram = pickle.load(f)

        sz = len(query_unigram)
        output = np.zeros([sz, 1])

        for i in range(sz):
            cnt = 0

            for word in title_unigram[i]:
                lev_dist = seq_matcher(None, query_unigram[i][-1],
                                       word).ratio()
                if lev_dist > 0.9:
                    cnt = 1
                    break

            output[i][0] = cnt

        print(output.shape)

        with open("%s/last_word_from_query_present_title_%s.pickle"
                  % (folderPath, cata), "wb") as f:
            pickle.dump(output, f)
Example #7

def gen_count_word_query_in_title():
    for cata in catagories:

        with open("%s/query_unigram_%s.pickle" % (ngramFolderPath, cata),
                  "rb") as f:
            query_unigram = pickle.load(f)

        with open("%s/title_unigram_%s.pickle" % (ngramFolderPath, cata),
                  "rb") as f:
            title_unigram = pickle.load(f)

        sz = len(query_unigram)
        output = np.zeros([sz, 1])

        for i in range(sz):
            cnt = 0
            for qword in query_unigram[i]:
                for tword in title_unigram[i]:
                    lev_dist = seq_matcher(None, qword, tword).ratio()
                    if lev_dist > 0.9:
                        cnt += 1

            output[i][0] = cnt

        print(output.shape)

        with open(
                "%s/count_word_query_in_title_%s.pickle" % (folderPath, cata),
                "wb") as f:
            pickle.dump(output, f)
Example #8

def last_word(query, title):
	if len(query)==0 or len(title)==0:
		return 0
	for t in title:
		dist=seq_matcher(None, query[-1], t).ratio()
		if dist > 0.9:
			return 1

	return 0
Example #9

def edist_norm(query, title):
	w=0
	for q in query:
		for t in title:
			lev_dist = seq_matcher(None,q,t).ratio()
			if lev_dist>0.9:
				w+=1

	return try_divide(w, len(query))
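Both helpers take token lists rather than raw strings, and try_divide is assumed to be the repo's zero-safe division helper. A quick hypothetical check:

print(last_word(["bird", "feeder"], ["metal", "feeder", "pole"]))   # 1: "feeder" matches
print(edist_norm(["bird", "feeder"], ["metal", "feeder", "pole"]))  # 0.5 if try_divide is w / len(query)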
Example #10
def get_difflib_features(df):
    logging.info('get difflib features')
    feat = pd.DataFrame(index=df.index)
    seq_distances = []
    for a, b in zip(df.search_term, df.product_title):
        # Compare only the alphanumeric characters of each string.
        a = ''.join([c for c in a if c.isalnum()])
        b = ''.join([c for c in b if c.isalnum()])
        seq_distances.append(seq_matcher(None, a, b).ratio())
    feat['seq_match_ratio'] = 1.0 - np.array(seq_distances)  # stored as a distance, not a ratio
    return feat
Example #11
def mean_dist(data, col1, col2):
    # Mean over the words of col1 of the best match ratio against col2's words.
    mean_edit_s_t = []
    for i in range(len(data)):
        search = data[col1][i]
        title = data[col2][i]
        max_edit_s_t_arr = []
        for s in search.split():
            max_edit_s_t = []
            for t in title.split():
                a = seq_matcher(None, s, t).ratio()
                max_edit_s_t.append(a)
            max_edit_s_t_arr.append(max_edit_s_t)
        total = 0
        for item in max_edit_s_t_arr:
            total = total + max(item)
        mean_edit_s_t.append(total / len(max_edit_s_t_arr))
    return mean_edit_s_t
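Hypothetical usage; the positional lookups (data[col1][i]) assume a default RangeIndex on the DataFrame:

data['mean_max_match'] = mean_dist(data, 'search_term', 'product_title')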
Example #12
def compute_one_edit_distance(row):
    query = row['search_term']
    title = row['product_title']
    
    return 1 - seq_matcher(None, query, title).ratio()
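Presumably meant for row-wise application; a hypothetical call on a DataFrame with search_term and product_title columns:

df['edit_distance'] = df.apply(compute_one_edit_distance, axis=1)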
Example #13
def assemble_counts(train, m='train'):
    # Like Example #1's assemble_counts, plus a description flag and rater-variance weights.
    X = []
    titles = []
    queries = []
    weights = []
    train['isdesc'] = 1  # Description present flag
    train.loc[train['product_description'].isnull(), 'isdesc'] = 0

    for i in range(len(train.id)):
        query = correct_string(train['query'][i].lower())
        title = correct_string(train.product_title[i].lower())

        query = (" ").join(
            [z for z in BeautifulSoup(query).get_text(" ").split(" ")])
        title = (" ").join(
            [z for z in BeautifulSoup(title).get_text(" ").split(" ")])

        # text.re reaches the stdlib re module through the repo's imported text module.
        query = text.re.sub("[^a-zA-Z0-9]", " ", query)
        title = text.re.sub("[^a-zA-Z0-9]", " ", title)

        query = (" ").join([stemmer.stem(z) for z in query.split(" ")])
        title = (" ").join([stemmer.stem(z) for z in title.split(" ")])

        query = " ".join(query.split())
        title = " ".join(title.split())

        dist_qt = compression_distance(query, title)
        dist_qt2 = 1 - seq_matcher(None, query, title).ratio()

        query_len = len(query.split())
        title_len = len(title.split())
        isdesc = train.isdesc[i]

        tmp_title = title
        word_counter_qt = 0
        lev_dist_arr = []
        for q in query.split():
            lev_dist_q = []
            for t in title.split():
                lev_dist = seq_matcher(None, q, t).ratio()
                if lev_dist > 0.9:
                    word_counter_qt += 1
                    #tmp_title += ' '+q # add such words to title to increase their weights in tfidf
                lev_dist_q.append(lev_dist)
            lev_dist_arr.append(lev_dist_q)
        last_word_in = 0
        for t in title.split():
            lev_dist = seq_matcher(None, query.split()[-1], t).ratio()
            if lev_dist > 0.9:
                last_word_in = 1
        lev_max = 0
        for item in lev_dist_arr:
            lev_max_q = max(item)
            lev_max += lev_max_q
        lev_max = 1 - lev_max / len(lev_dist_arr)
        word_counter_qt_norm = word_counter_qt / query_len

        X.append([
            query_len, title_len, isdesc, word_counter_qt, dist_qt, dist_qt2,
            lev_max, last_word_in, word_counter_qt_norm
        ])
        titles.append(tmp_title)
        queries.append(query)
        if m == 'train':
            # Downweight rows whose relevance labels had high rater variance.
            weights.append(1 / (float(train["relevance_variance"][i]) + 1.0))
    X = np.array(X).astype(float)  # np.float was removed in NumPy 1.24
    if m == 'train':
        return X, np.array(weights).astype(float), np.array(titles), np.array(queries)
    else:
        return X, np.array(titles), np.array(queries)
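A hypothetical call, assuming a DataFrame with id, query, product_title, product_description and relevance_variance columns:

X, w, titles, queries = assemble_counts(train_df, m='train')
# w can be passed as sample_weight when fitting a downstream model.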
Example #15

def edist(q, t):
	return 1 - seq_matcher(None,q,t).ratio()