import numpy as np


def convert_attrs(arr_attrs, token_2_idx, attr_2_idx, token_zero_idx,
                  attr_zero_idx, unknown_map_func, unigram_max_seq_len,
                  bigram_max_seq_len, char_trigram_max_seq_len):
    attr_indices = []
    attr_in_product = []
    unigram_indices = []
    bigram_indices = []
    char_trigram_indices = []
    for attr_str in arr_attrs:
        if len(attr_str.strip()) == 0:
            # Product has no attributes: emit a single padding entry.
            attr_in_product.append(1)
            attr_indices.append(attr_zero_idx)
            unigram_indices.append([token_zero_idx] * unigram_max_seq_len)
            bigram_indices.append([token_zero_idx] * bigram_max_seq_len)
            char_trigram_indices.append(
                [token_zero_idx] * char_trigram_max_seq_len)
            continue
        count = 0
        for attr in attr_str.split("|"):
            # Each attribute is an "<id>#<name>#<value>" triple; the first
            # two fields form the attribute token, the last is free text.
            attr_token = "#".join(attr.split("#")[:2])
            if attr_token in attr_2_idx:
                count += 1
                attr_indices.append(attr_2_idx[attr_token])
                attr_name = query_preprocessing(attr.split("#")[-1])
                ui, bi, ci = convert_strings(
                    [attr_name], token_2_idx, token_zero_idx,
                    unigram_max_seq_len, bigram_max_seq_len,
                    char_trigram_max_seq_len, unknown_map_func)
                unigram_indices.append(ui[0])
                bigram_indices.append(bi[0])
                char_trigram_indices.append(ci[0])
        attr_in_product.append(count)
    return np.asarray(attr_indices, dtype=np.int32), \
        np.asarray(attr_in_product, dtype=np.int32), \
        np.asarray(unigram_indices, dtype=np.int32), \
        np.asarray(bigram_indices, dtype=np.int32), \
        np.asarray(char_trigram_indices, dtype=np.int32)
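# A minimal usage sketch (not from the source): convert_attrs takes each
# product's attributes as one "|"-separated string of "<id>#<name>#<value>"
# triples, plus the lookup tables built elsewhere in this module. The stub
# query_preprocessing/convert_strings below are hypothetical placeholders
# standing in for the real helpers, just to make the example self-contained.
def query_preprocessing(s):  # stub: the real helper normalizes text
    return s.lower().strip()

def convert_strings(strs, token_2_idx, zero_idx, u_len, b_len, c_len, unk):
    # stub: the real helper returns padded unigram/bigram/char-trigram ids
    pad = lambda n: [[zero_idx] * n for _ in strs]
    return pad(u_len), pad(b_len), pad(c_len)

attr_2_idx = {"1165#filter_ssd_storage": 0, "1166#filter_ssd_product_size": 1}
out = convert_attrs(
    ["1165#filter_ssd_storage#120 GB - 128 GB", ""],  # 2nd product: no attrs
    token_2_idx={}, attr_2_idx=attr_2_idx, token_zero_idx=0, attr_zero_idx=2,
    unknown_map_func=None, unigram_max_seq_len=10, bigram_max_seq_len=10,
    char_trigram_max_seq_len=20)
print(out[1])  # attr_in_product -> [1 1]: one matched attr, one padding entry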
def testCreateNgram(self):
    product_names = list(
        map(lambda x: query_preprocessing(x.get("product_name")),
            self.products))
    unigrams, bigrams, char_trigrams = create_ngrams(product_names[0])
    self.assertEqual(unigrams, [
        'ổ', 'cứng', 'ssd', 'kingston', 'hyperx', 'fury', '120gb', '-',
        'sata', 'iii', '-', 'hàng', 'chính', 'hãng'
    ])
    self.assertEqual(bigrams, [
        'ổ#cứng', 'cứng#ssd', 'ssd#kingston', 'kingston#hyperx',
        'hyperx#fury', 'fury#120gb', '120gb#-', '-#sata', 'sata#iii',
        'iii#-', '-#hàng', 'hàng#chính', 'chính#hãng'
    ])
    self.assertEqual(char_trigrams, [
        '#ổ#', '#cứ', 'cứn', 'ứng', 'ng#', '#ss', 'ssd', 'sd#', '#ki',
        'kin', 'ing', 'ngs', 'gst', 'sto', 'ton', 'on#', '#hy', 'hyp',
        'ype', 'per', 'erx', 'rx#', '#fu', 'fur', 'ury', 'ry#', '#12',
        '120', '20g', '0gb', 'gb#', '#-#', '#sa', 'sat', 'ata', 'ta#',
        '#ii', 'iii', 'ii#', '#-#', '#hà', 'hàn', 'àng', 'ng#', '#ch',
        'chí', 'hín', 'ính', 'nh#', '#hã', 'hãn', 'ãng', 'ng#'
    ])
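# The expected values above fully pin down create_ngrams' behavior. A minimal
# re-implementation consistent with this test (an inference, not the
# project's actual code): unigrams are whitespace tokens, bigrams join
# neighbours with '#', and char trigrams slide a width-3 window over each
# token padded with '#' on both sides.
def create_ngrams_sketch(text):
    unigrams = text.split()
    bigrams = ["%s#%s" % (a, b) for a, b in zip(unigrams, unigrams[1:])]
    char_trigrams = []
    for tok in unigrams:
        padded = "#%s#" % tok
        for i in range(len(padded) - 2):
            char_trigrams.append(padded[i:i + 3])
    return unigrams, bigrams, char_trigrams

# e.g. create_ngrams_sketch('ổ cứng ssd') ->
#   (['ổ', 'cứng', 'ssd'],
#    ['ổ#cứng', 'cứng#ssd'],
#    ['#ổ#', '#cứ', 'cứn', 'ứng', 'ng#', '#ss', 'ssd', 'sd#'])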
def __init__(self,
             pair_paths,
             precomputed_path,
             product_db,
             vocab_path,
             cat_tokens_path,
             attr_tokens_path,
             maximums_query=[25, 25, 125],  # unigram, bigram, char-trigram
             maximums_product_name=[50, 50, 250],
             maximums_brand=[10, 10, 50],
             maximums_author=[10, 10, 50],
             maximums_cat=[10, 10, 20],
             maximums_attr=[10, 10, 20],
             unknown_bin=8012):
    self.vocab = []
    with open(vocab_path, 'r') as fobj:
        for l in fobj:
            if len(l.strip()):
                self.vocab.append(l.strip())
    self.cat_tokens = []
    with open(cat_tokens_path, 'r') as fobj:
        for l in fobj:
            if len(l.strip()):
                self.cat_tokens.append(l.strip())
    self.attr_tokens = []
    with open(attr_tokens_path, 'r') as fobj:
        for l in fobj:
            if len(l.strip()):
                self.attr_tokens.append(l.strip())
    with open(precomputed_path, 'r') as fobj:
        self.precomputed = json.load(fobj)

    self.vocab_size = len(self.vocab)
    self.cat_tokens_size = len(self.cat_tokens)
    self.attr_tokens_size = len(self.attr_tokens)
    self.unknown_bin = unknown_bin

    self.maximums_query = maximums_query
    self.maximums_product_name = maximums_product_name
    self.maximums_brand = maximums_brand
    self.maximums_author = maximums_author
    self.maximums_cat = maximums_cat
    self.maximums_attr = maximums_attr

    # Index layout: known tokens occupy [0, vocab_size), unknown tokens are
    # hashed into the next `unknown_bin` slots, and the padding index comes
    # after both ranges.
    self.token_2_idx = {}
    self.cat_token_2_idx = {}
    self.attr_token_2_idx = {}
    self.zero_idx = len(self.vocab) + self.unknown_bin
    for i, w in enumerate(self.vocab):
        self.token_2_idx[w] = i
    self.cat_zero_idx = len(self.cat_tokens)
    for i, w in enumerate(self.cat_tokens):
        self.cat_token_2_idx[w] = i
    self.attr_zero_idx = len(self.attr_tokens)
    for i, w in enumerate(self.attr_tokens):
        self.attr_token_2_idx[w] = i
    self.hasher = pyhash.murmur3_32()

    # Initialize sampling pools.
    self.pair_paths = pair_paths
    self.precomputed_path = precomputed_path
    # self.conn = create_connection(product_db)
    # self.headers = get_fields(self.conn)
    if product_db:
        self.product_dict = {}
        with open(product_db, "r") as fobj:
            csv_reader = csv.DictReader(fobj)
            for i, r in enumerate(csv_reader):
                r = dict(r)
                r["name"] = query_preprocessing(r.get("name"))
                r["brand"] = query_preprocessing(r.get("brand"))
                r["author"] = query_preprocessing(r.get("author"))
                self.product_dict[r.get("product_id")] = r
                if i % 100000 == 0:
                    print("Loaded %d products" % i)
        self.product_ids = list(self.product_dict.keys())
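# The class stores a murmur3 hasher and an `unknown_bin` count, but the
# unknown-token mapping itself is defined elsewhere. A sketch consistent
# with the index layout above (an assumption, not the confirmed method):
# out-of-vocabulary tokens are hashed into one of the `unknown_bin`
# embedding rows reserved right after the vocabulary, which is why
# zero_idx = len(vocab) + unknown_bin.
def unknown_to_idx_sketch(self, token):
    # Hash OOV tokens into one of `unknown_bin` reserved embedding rows.
    return self.vocab_size + self.hasher(token) % self.unknown_bin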
import csv
import re

file_paths = [
    "product_impressions/product_impressions_20190805_000000000000.csv",
    "product_impressions/product_impressions_20190806_000000000000.csv"
]

predictor = SemRankerPredict()
interaction = {}
count = 0
for file_path in file_paths:
    with open(file_path, 'r') as fobj:
        for r in csv.DictReader(fobj):
            if r.get('product_id'):
                query = query_preprocessing(r.get('keyword'))
                # Skip queries that look like bare product ids (6+ digits).
                if re.match(r'\d{6,}', query):
                    continue
                product_id = int(r.get('product_id'))
                action = r.get('action')
                # Relevance grades: 0 = impression only, 2 = buy,
                # 1 = any other recorded action.
                rel = 1
                if action == "buy":
                    rel = 2
                if action == "" or action is None:
                    rel = 0
                # Keep the strongest interaction seen per (query, product).
                if (query, product_id) in interaction:
                    interaction[(query, product_id)] = max(
                        interaction[(query, product_id)], rel)
                else:
                    interaction[(query, product_id)] = rel
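# A compact restatement of the grading rule above as a hypothetical helper
# (not in the source), handy if the same rule is needed in several scripts:
# purchases outrank other actions, which outrank bare impressions, and
# repeated observations of a pair keep the maximum grade.
def grade(action):
    if action == "buy":
        return 2
    if action == "" or action is None:
        return 0
    return 1

assert [grade(a) for a in (None, "", "click", "buy")] == [0, 0, 1, 2]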
def create_placeholder_data(self):
    queries = list(map(lambda x: query_preprocessing(x), self.queries))
    product_names = list(
        map(lambda x: query_preprocessing(x.get("product_name")),
            self.products))
    brands = list(
        map(lambda x: query_preprocessing(x.get("brand")), self.products))
    authors = list(
        map(lambda x: " ".join(
            [query_preprocessing(z) for z in x.get("author")]),
            self.products))
    categories = list(map(lambda x: x.get('categories'), self.products))
    attributes = list(map(lambda x: x.get('attributes'), self.products))
    features = list(
        map(lambda x: [x.get(h) for h in self.header_fields],
            self.products))
    precomputed_features_min = [
        self.feature_precomputed.get(h)[0] for h in self.header_fields
    ]
    precomputed_features_max = [
        self.feature_precomputed.get(h)[1] for h in self.header_fields
    ]

    max_query_length = self.max_query_length
    query_unigram_indices, query_bigram_indices, query_char_trigram_indices = \
        convert_strings(
            queries, self.token_2_idx, self.zero_idx,
            max_query_length, max_query_length, max_query_length * 5,
            self.unknown_to_idx)
    max_product_length = self.max_product_length
    product_unigram_indices, product_bigram_indices, product_char_trigram_indices = \
        convert_strings(
            product_names, self.token_2_idx, self.zero_idx,
            max_product_length, max_product_length, max_product_length * 5,
            self.unknown_to_idx)
    max_brand_length = self.max_brand_length
    brand_unigram_indices, brand_bigram_indices, brand_char_trigram_indices = \
        convert_strings(
            brands, self.token_2_idx, self.zero_idx,
            max_brand_length, max_brand_length, max_brand_length * 5,
            self.unknown_to_idx)
    max_author_length = self.max_author_length
    author_unigram_indices, author_bigram_indices, author_char_trigram_indices = \
        convert_strings(
            authors, self.token_2_idx, self.zero_idx,
            max_author_length, max_author_length, max_author_length * 5,
            self.unknown_to_idx)
    max_cat_length = self.max_cat_length
    cat_tokens, cat_in_product, cat_unigram_indices, cat_bigram_indices, cat_char_trigram_indices = \
        convert_cats(
            categories, self.token_2_idx, self.cat_token_2_idx,
            self.zero_idx, self.cat_zero_idx, self.unknown_to_idx,
            max_cat_length, max_cat_length, max_cat_length * 5)
    max_attr_length = self.max_attr_length
    attr_tokens, attr_in_product, attr_unigram_indices, attr_bigram_indices, attr_char_trigram_indices = \
        convert_attrs(
            attributes, self.token_2_idx, self.attr_token_2_idx,
            self.zero_idx, self.attr_zero_idx, self.unknown_to_idx,
            max_attr_length, max_attr_length, max_attr_length * 5)
    features = convert_features(features, precomputed_features_min,
                                precomputed_features_max)

    return query_unigram_indices, query_bigram_indices, query_char_trigram_indices, \
        product_unigram_indices, product_bigram_indices, product_char_trigram_indices, \
        brand_unigram_indices, brand_bigram_indices, brand_char_trigram_indices, \
        author_unigram_indices, author_bigram_indices, author_char_trigram_indices, \
        cat_tokens, cat_in_product, cat_unigram_indices, cat_bigram_indices, cat_char_trigram_indices, \
        attr_tokens, attr_in_product, attr_unigram_indices, attr_bigram_indices, attr_char_trigram_indices, \
        features
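# convert_features is not shown in this section. Given that
# feature_precomputed stores a [min, max] pair per field, a plausible sketch
# is plain min-max scaling (an assumption about the real helper):
import numpy as np

def convert_features_sketch(features, mins, maxs):
    features = np.asarray(features, dtype=np.float32)
    mins = np.asarray(mins, dtype=np.float32)
    maxs = np.asarray(maxs, dtype=np.float32)
    # Scale each numeric field (reviews, rating, ...) into [0, 1].
    return (features - mins) / np.maximum(maxs - mins, 1e-8)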
def setUp(self):
    self.unigrams = set()
    self.bigrams = set()
    self.char_trigrams = set()
    self.cat_tokens = set()
    self.attr_tokens = set()
    self.hasher = pyhash.murmur3_32()
    self.unknown_bin = 16

    self.feature_precomputed = {
        "reviews": [0.0, 3437.0],
        "rating": [0.0, 100.0],
        "sales_monthly": [0.0, 14345.0],
        "sales_yearly": [0.0, 136592.0],
        "support_p2h_delivery": [0.0, 1.0]
    }
    self.header_fields = [
        "reviews", "rating", "sales_monthly", "sales_yearly",
        "support_p2h_delivery"
    ]

    p1 = {
        'product_name': 'Ổ Cứng SSD Kingston HyperX FURY 120GB - SATA III - Hàng Chính Hãng',
        'brand': 'Kingston',
        'author': '',
        'attributes': '1165#filter_ssd_storage#120 GB - 128 GB|1166#filter_ssd_product_size#2.5 inch',
        'categories': '1846#2#Laptop - Máy Vi Tính - Linh kiện|8060#3#Thiết bị lưu trữ',
        'reviews': 100,
        'rating': 80,
        'sales_monthly': 20,
        'sales_yearly': 100,
        'support_p2h_delivery': 1
    }
    p2 = {
        'product_name': 'Ổ Cứng SSD Sata III 2.5 Inch Samsung 850 EVO 120GB - Hàng Chính Hãng',
        'brand': 'Samsung',
        'author': '',
        'attributes': '1165#filter_ssd_storage#120 GB - 128 GB|1166#filter_ssd_product_size#2.5 inch',
        'categories': '1846#2#Laptop - Máy Vi Tính - Linh kiện|8060#3#Thiết bị lưu trữ',
        'reviews': 71,
        'rating': 95,
        'sales_monthly': 10,
        'sales_yearly': 50,
        'support_p2h_delivery': 1
    }
    p3 = {
        'product_name': 'Mai Em Vào Lớp 1 - Vở Tập Tô Chữ (Dành Cho Trẻ 5 - 6 Tuổi) - Tập 1',
        'brand': '',
        'author': ['Lê Hồng Đăng - Lê Thị Ngọc Ánh'],
        'attributes': '',
        'categories': '8322#2#Nhà Sách Tiki|316#3#Sách tiếng Việt |393#4#Sách thiếu nhi |853#5#Kiến thức - Bách khoa',
        'reviews': 0,
        'rating': 0,
        'sales_monthly': 3,
        'sales_yearly': 10,
        'support_p2h_delivery': 1
    }

    self.products = [p1, p2, p3]
    self.queries = ['ổ cứng', 'samsung 850', 'vở tập tô']
    self.target = [2, 1, 0]  # positive, impressed, negative
    self.weights = [1., 1., 1.]

    for p in self.products:
        self.add_to_vocab(query_preprocessing(p['product_name']))
        for z in p['attributes'].split('|'):
            t = "#".join(z.split("#")[:2])
            self.attr_tokens.add(t)
        for z in p['categories'].split('|'):
            t = "#".join(z.split("#")[:2])
            self.cat_tokens.add(t)

    self.vocab = self.unigrams.copy()
    self.vocab = self.vocab.union(self.bigrams, self.char_trigrams)
    self.vocab = list(self.vocab)
    self.zero_idx = len(self.vocab) + self.unknown_bin
    self.token_2_idx = {}
    for i, t in enumerate(self.vocab):
        self.token_2_idx[t] = i
    self.cat_token_2_idx = {}
    for i, t in enumerate(self.cat_tokens):
        self.cat_token_2_idx[t] = i
    self.cat_zero_idx = len(self.cat_tokens)
    self.attr_token_2_idx = {}
    for i, t in enumerate(self.attr_tokens):
        self.attr_token_2_idx[t] = i
    self.attr_zero_idx = len(self.attr_tokens)

    self.embed_size = 80
    self.attr_cat_embed_size = 10
    self.vocab_size = len(self.token_2_idx)
    self.max_query_length = 25
    self.max_product_length = 50
    self.max_brand_length = 25
    self.max_author_length = 25
    self.max_attr_length = 10
    self.max_cat_length = 10
    self.filter_sizes = [2, 3, 4, 5]
    self.num_filters = 5
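# add_to_vocab is not shown in this excerpt. Given the sets it fills and the
# create_ngrams behavior pinned down by testCreateNgram, it plausibly looks
# like this (a sketch, not the test's actual code):
def add_to_vocab_sketch(self, text):
    unigrams, bigrams, char_trigrams = create_ngrams(text)
    self.unigrams.update(unigrams)
    self.bigrams.update(bigrams)
    self.char_trigrams.update(char_trigrams)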
    product_id = row.get('product_id')
    action = row.get('action')
    # Guard against missing fields before using them.
    if keyword is None or product_id is None:
        continue
    # Skip queries that look like bare product ids (6+ digits).
    if re.match(r'\d{6,}', keyword):
        continue
    # Relevance grades: 0 = impression only, 2 = buy, 1 = anything else.
    rel = 1
    if action == "buy":
        rel = 2
    if action == "" or action is None:
        rel = 0
    # Drop 40% of negative samples to rebalance the data.
    if random.random() < 0.4 and rel == 0:
        continue
    keyword = query_preprocessing(keyword)
    if len(negatives_pool) < 5e6:  # cap the negatives pool at 5M pairs
        negatives_pool.add((keyword, product_id))
    if not len(keyword):
        continue
    count += 1
    if count % 10000 == 0:
        print("Processed %d (keyword, product) pairs" % count)
    # Keep the strongest interaction observed per (keyword, product).
    interactions[(keyword, product_id)] = max(
        interactions.get((keyword, product_id), 0), rel)
fobj.close()

negatives_pool = list(negatives_pool)
queries = {}
keep_q = {}
for (keyword, product_id), v in interactions.items():
            count_char_trigrams += 1
    # Count bigrams of adjacent tokens, joined with '#'.
    for i in range(0, max(len(tokens) - 1, 0)):
        t = "%s#%s" % (tokens[i], tokens[i + 1])
        if t in bigrams:
            bigrams[t] += 1
        else:
            bigrams[t] = 1
            count_bigrams += 1

# Build the vocabulary from catalog product names.
fobj = open("data/catalog.csv")
reader = csv.reader(fobj)
for r in reader:
    product_name = query_preprocessing(r[1])
    if not product_name:
        continue
    add_to_vocab(product_name)
fobj.close()

# Extend the vocabulary with queries from the triple files.
for f in os.listdir("triples"):
    if f.endswith(".csv"):
        fobj = open(os.path.join("triples", f))
        reader = csv.reader(fobj)
        for r in reader:
            query = query_preprocessing(r[0])
            if not query:
                continue
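# For context, a sketch of what the full counting function above plausibly
# looks like (the excerpt starts mid-function); it assumes each distinct
# n-gram bumps its count_* total the first time it is seen, matching the
# bigram branch visible in the fragment:
def add_to_vocab_sketch(text):
    global count_bigrams, count_char_trigrams
    tokens = text.split()
    for tok in tokens:
        unigrams[tok] = unigrams.get(tok, 0) + 1
        padded = "#%s#" % tok
        for i in range(len(padded) - 2):
            tri = padded[i:i + 3]
            if tri in char_trigrams:
                char_trigrams[tri] += 1
            else:
                char_trigrams[tri] = 1
                count_char_trigrams += 1
    for i in range(max(len(tokens) - 1, 0)):
        t = "%s#%s" % (tokens[i], tokens[i + 1])
        if t in bigrams:
            bigrams[t] += 1
        else:
            bigrams[t] = 1
            count_bigrams += 1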