import numpy as np


def convert_attrs(arr_attrs, token_2_idx, attr_2_idx, token_zero_idx,
                  attr_zero_idx, unknown_map_func, unigram_max_seq_len,
                  bigram_max_seq_len, char_trigram_max_seq_len):
    attr_indices = []
    attr_in_product = []
    unigram_indices = []
    bigram_indices = []
    char_trigram_indices = []
    for attr_str in arr_attrs:
        if len(attr_str.strip()) == 0:
            # Product has no attributes: emit a single padding entry.
            attr_in_product.append(1)
            attr_indices.append(attr_zero_idx)
            unigram_indices.append([token_zero_idx] * unigram_max_seq_len)
            bigram_indices.append([token_zero_idx] * bigram_max_seq_len)
            char_trigram_indices.append(
                [token_zero_idx] * char_trigram_max_seq_len)
            continue
        count = 0
        for attr in attr_str.split("|"):
            # Each attribute is an "<id>#<name>#<value>" triple; the first
            # two fields form the attribute token, the last is free text.
            attr_token = "#".join(attr.split("#")[:2])
            if attr_token in attr_2_idx:
                count += 1
                attr_indices.append(attr_2_idx[attr_token])
                attr_name = query_preprocessing(attr.split("#")[-1])
                ui, bi, ci = convert_strings(
                    [attr_name], token_2_idx, token_zero_idx,
                    unigram_max_seq_len, bigram_max_seq_len,
                    char_trigram_max_seq_len, unknown_map_func)
                unigram_indices.append(ui[0])
                bigram_indices.append(bi[0])
                char_trigram_indices.append(ci[0])
        attr_in_product.append(count)
    return np.asarray(attr_indices, dtype=np.int32), \
        np.asarray(attr_in_product, dtype=np.int32), \
        np.asarray(unigram_indices, dtype=np.int32), \
        np.asarray(bigram_indices, dtype=np.int32), \
        np.asarray(char_trigram_indices, dtype=np.int32)
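# A minimal usage sketch (not from the source): convert_attrs takes each
# product's attributes as one "|"-separated string of "<id>#<name>#<value>"
# triples, plus the lookup tables built elsewhere in this module. The stub
# query_preprocessing/convert_strings below are hypothetical placeholders
# standing in for the real helpers, just to make the example self-contained.
def query_preprocessing(s):  # stub: the real helper normalizes text
    return s.lower().strip()

def convert_strings(strs, token_2_idx, zero_idx, u_len, b_len, c_len, unk):
    # stub: the real helper returns padded unigram/bigram/char-trigram ids
    pad = lambda n: [[zero_idx] * n for _ in strs]
    return pad(u_len), pad(b_len), pad(c_len)

attr_2_idx = {"1165#filter_ssd_storage": 0, "1166#filter_ssd_product_size": 1}
out = convert_attrs(
    ["1165#filter_ssd_storage#120 GB - 128 GB", ""],  # 2nd product: no attrs
    token_2_idx={}, attr_2_idx=attr_2_idx, token_zero_idx=0, attr_zero_idx=2,
    unknown_map_func=None, unigram_max_seq_len=10, bigram_max_seq_len=10,
    char_trigram_max_seq_len=20)
print(out[1])  # attr_in_product -> [1 1]: one matched attr, one padding entry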
def testCreateNgram(self):
    product_names = list(
        map(lambda x: query_preprocessing(x.get("product_name")),
            self.products))
    unigrams, bigrams, char_trigrams = create_ngrams(product_names[0])
    self.assertEqual(unigrams, [
        'ổ', 'cứng', 'ssd', 'kingston', 'hyperx', 'fury', '120gb', '-',
        'sata', 'iii', '-', 'hàng', 'chính', 'hãng'
    ])
    self.assertEqual(bigrams, [
        'ổ#cứng', 'cứng#ssd', 'ssd#kingston', 'kingston#hyperx',
        'hyperx#fury', 'fury#120gb', '120gb#-', '-#sata', 'sata#iii',
        'iii#-', '-#hàng', 'hàng#chính', 'chính#hãng'
    ])
    self.assertEqual(char_trigrams, [
        '#ổ#', '#cứ', 'cứn', 'ứng', 'ng#', '#ss', 'ssd', 'sd#', '#ki',
        'kin', 'ing', 'ngs', 'gst', 'sto', 'ton', 'on#', '#hy', 'hyp',
        'ype', 'per', 'erx', 'rx#', '#fu', 'fur', 'ury', 'ry#', '#12',
        '120', '20g', '0gb', 'gb#', '#-#', '#sa', 'sat', 'ata', 'ta#',
        '#ii', 'iii', 'ii#', '#-#', '#hà', 'hàn', 'àng', 'ng#', '#ch',
        'chí', 'hín', 'ính', 'nh#', '#hã', 'hãn', 'ãng', 'ng#'
    ])
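# The expected values above fully pin down create_ngrams' behavior. A minimal
# re-implementation consistent with this test (an inference, not the
# project's actual code): unigrams are whitespace tokens, bigrams join
# neighbours with '#', and char trigrams slide a width-3 window over each
# token padded with '#' on both sides.
def create_ngrams_sketch(text):
    unigrams = text.split()
    bigrams = ["%s#%s" % (a, b) for a, b in zip(unigrams, unigrams[1:])]
    char_trigrams = []
    for tok in unigrams:
        padded = "#%s#" % tok
        for i in range(len(padded) - 2):
            char_trigrams.append(padded[i:i + 3])
    return unigrams, bigrams, char_trigrams

# e.g. create_ngrams_sketch('ổ cứng ssd') ->
#   (['ổ', 'cứng', 'ssd'],
#    ['ổ#cứng', 'cứng#ssd'],
#    ['#ổ#', '#cứ', 'cứn', 'ứng', 'ng#', '#ss', 'ssd', 'sd#'])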
def __init__(self,
             pair_paths,
             precomputed_path,
             product_db,
             vocab_path,
             cat_tokens_path,
             attr_tokens_path,
             maximums_query=[25, 25, 125],  # unigram, bigram, char-trigram
             maximums_product_name=[50, 50, 250],
             maximums_brand=[10, 10, 50],
             maximums_author=[10, 10, 50],
             maximums_cat=[10, 10, 20],
             maximums_attr=[10, 10, 20],
             unknown_bin=8012):
    self.vocab = []
    with open(vocab_path, 'r') as fobj:
        for l in fobj:
            if len(l.strip()):
                self.vocab.append(l.strip())
    self.cat_tokens = []
    with open(cat_tokens_path, 'r') as fobj:
        for l in fobj:
            if len(l.strip()):
                self.cat_tokens.append(l.strip())
    self.attr_tokens = []
    with open(attr_tokens_path, 'r') as fobj:
        for l in fobj:
            if len(l.strip()):
                self.attr_tokens.append(l.strip())
    with open(precomputed_path, 'r') as fobj:
        self.precomputed = json.load(fobj)

    self.vocab_size = len(self.vocab)
    self.cat_tokens_size = len(self.cat_tokens)
    self.attr_tokens_size = len(self.attr_tokens)
    self.unknown_bin = unknown_bin

    self.maximums_query = maximums_query
    self.maximums_product_name = maximums_product_name
    self.maximums_brand = maximums_brand
    self.maximums_author = maximums_author
    self.maximums_cat = maximums_cat
    self.maximums_attr = maximums_attr

    # Index layout: known tokens occupy [0, vocab_size), unknown tokens are
    # hashed into the next `unknown_bin` slots, and the padding index comes
    # after both ranges.
    self.token_2_idx = {}
    self.cat_token_2_idx = {}
    self.attr_token_2_idx = {}
    self.zero_idx = len(self.vocab) + self.unknown_bin
    for i, w in enumerate(self.vocab):
        self.token_2_idx[w] = i
    self.cat_zero_idx = len(self.cat_tokens)
    for i, w in enumerate(self.cat_tokens):
        self.cat_token_2_idx[w] = i
    self.attr_zero_idx = len(self.attr_tokens)
    for i, w in enumerate(self.attr_tokens):
        self.attr_token_2_idx[w] = i
    self.hasher = pyhash.murmur3_32()

    # Initialize sampling pools.
    self.pair_paths = pair_paths
    self.precomputed_path = precomputed_path
    # self.conn = create_connection(product_db)
    # self.headers = get_fields(self.conn)
    if product_db:
        self.product_dict = {}
        with open(product_db, "r") as fobj:
            csv_reader = csv.DictReader(fobj)
            for i, r in enumerate(csv_reader):
                r = dict(r)
                r["name"] = query_preprocessing(r.get("name"))
                r["brand"] = query_preprocessing(r.get("brand"))
                r["author"] = query_preprocessing(r.get("author"))
                self.product_dict[r.get("product_id")] = r
                if i % 100000 == 0:
                    print("Loaded %d products" % i)
        self.product_ids = list(self.product_dict.keys())
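# The class stores a murmur3 hasher and an `unknown_bin` count, but the
# unknown-token mapping itself is defined elsewhere. A sketch consistent
# with the index layout above (an assumption, not the confirmed method):
# out-of-vocabulary tokens are hashed into one of the `unknown_bin`
# embedding rows reserved right after the vocabulary, which is why
# zero_idx = len(vocab) + unknown_bin.
def unknown_to_idx_sketch(self, token):
    # Hash OOV tokens into one of `unknown_bin` reserved embedding rows.
    return self.vocab_size + self.hasher(token) % self.unknown_bin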
import csv
import re

file_paths = [
    "product_impressions/product_impressions_20190805_000000000000.csv",
    "product_impressions/product_impressions_20190806_000000000000.csv"
]

predictor = SemRankerPredict()
interaction = {}
count = 0
for file_path in file_paths:
    with open(file_path, 'r') as fobj:
        for r in csv.DictReader(fobj):
            if r.get('product_id'):
                query = query_preprocessing(r.get('keyword'))
                # Skip queries that look like bare product ids (6+ digits).
                if re.match(r'\d{6,}', query):
                    continue
                product_id = int(r.get('product_id'))
                action = r.get('action')
                # Relevance grades: 0 = impression only, 2 = buy,
                # 1 = any other recorded action.
                rel = 1
                if action == "buy":
                    rel = 2
                if action == "" or action is None:
                    rel = 0
                # Keep the strongest interaction seen per (query, product).
                if (query, product_id) in interaction:
                    interaction[(query, product_id)] = max(
                        interaction[(query, product_id)], rel)
                else:
                    interaction[(query, product_id)] = rel
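# A compact restatement of the grading rule above as a hypothetical helper
# (not in the source), handy if the same rule is needed in several scripts:
# purchases outrank other actions, which outrank bare impressions, and
# repeated observations of a pair keep the maximum grade.
def grade(action):
    if action == "buy":
        return 2
    if action == "" or action is None:
        return 0
    return 1

assert [grade(a) for a in (None, "", "click", "buy")] == [0, 0, 1, 2]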
def create_placeholder_data(self):
    queries = list(map(lambda x: query_preprocessing(x), self.queries))
    product_names = list(
        map(lambda x: query_preprocessing(x.get("product_name")),
            self.products))
    brands = list(
        map(lambda x: query_preprocessing(x.get("brand")), self.products))
    authors = list(
        map(lambda x: " ".join(
            [query_preprocessing(z) for z in x.get("author")]),
            self.products))
    categories = list(map(lambda x: x.get('categories'), self.products))
    attributes = list(map(lambda x: x.get('attributes'), self.products))
    features = list(
        map(lambda x: [x.get(h) for h in self.header_fields],
            self.products))
    precomputed_features_min = [
        self.feature_precomputed.get(h)[0] for h in self.header_fields
    ]
    precomputed_features_max = [
        self.feature_precomputed.get(h)[1] for h in self.header_fields
    ]

    max_query_length = self.max_query_length
    query_unigram_indices, query_bigram_indices, query_char_trigram_indices = \
        convert_strings(
            queries, self.token_2_idx, self.zero_idx,
            max_query_length, max_query_length, max_query_length * 5,
            self.unknown_to_idx)
    max_product_length = self.max_product_length
    product_unigram_indices, product_bigram_indices, product_char_trigram_indices = \
        convert_strings(
            product_names, self.token_2_idx, self.zero_idx,
            max_product_length, max_product_length, max_product_length * 5,
            self.unknown_to_idx)
    max_brand_length = self.max_brand_length
    brand_unigram_indices, brand_bigram_indices, brand_char_trigram_indices = \
        convert_strings(
            brands, self.token_2_idx, self.zero_idx,
            max_brand_length, max_brand_length, max_brand_length * 5,
            self.unknown_to_idx)
    max_author_length = self.max_author_length
    author_unigram_indices, author_bigram_indices, author_char_trigram_indices = \
        convert_strings(
            authors, self.token_2_idx, self.zero_idx,
            max_author_length, max_author_length, max_author_length * 5,
            self.unknown_to_idx)
    max_cat_length = self.max_cat_length
    cat_tokens, cat_in_product, cat_unigram_indices, cat_bigram_indices, cat_char_trigram_indices = \
        convert_cats(
            categories, self.token_2_idx, self.cat_token_2_idx,
            self.zero_idx, self.cat_zero_idx, self.unknown_to_idx,
            max_cat_length, max_cat_length, max_cat_length * 5)
    max_attr_length = self.max_attr_length
    attr_tokens, attr_in_product, attr_unigram_indices, attr_bigram_indices, attr_char_trigram_indices = \
        convert_attrs(
            attributes, self.token_2_idx, self.attr_token_2_idx,
            self.zero_idx, self.attr_zero_idx, self.unknown_to_idx,
            max_attr_length, max_attr_length, max_attr_length * 5)
    features = convert_features(features, precomputed_features_min,
                                precomputed_features_max)

    return query_unigram_indices, query_bigram_indices, query_char_trigram_indices, \
        product_unigram_indices, product_bigram_indices, product_char_trigram_indices, \
        brand_unigram_indices, brand_bigram_indices, brand_char_trigram_indices, \
        author_unigram_indices, author_bigram_indices, author_char_trigram_indices, \
        cat_tokens, cat_in_product, cat_unigram_indices, cat_bigram_indices, cat_char_trigram_indices, \
        attr_tokens, attr_in_product, attr_unigram_indices, attr_bigram_indices, attr_char_trigram_indices, \
        features
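# convert_features is not shown in this section. Given that
# feature_precomputed stores a [min, max] pair per field, a plausible sketch
# is plain min-max scaling (an assumption about the real helper):
import numpy as np

def convert_features_sketch(features, mins, maxs):
    features = np.asarray(features, dtype=np.float32)
    mins = np.asarray(mins, dtype=np.float32)
    maxs = np.asarray(maxs, dtype=np.float32)
    # Scale each numeric field (reviews, rating, ...) into [0, 1].
    return (features - mins) / np.maximum(maxs - mins, 1e-8)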
def setUp(self):
    self.unigrams = set()
    self.bigrams = set()
    self.char_trigrams = set()
    self.cat_tokens = set()
    self.attr_tokens = set()
    self.hasher = pyhash.murmur3_32()
    self.unknown_bin = 16

    self.feature_precomputed = {
        "reviews": [0.0, 3437.0],
        "rating": [0.0, 100.0],
        "sales_monthly": [0.0, 14345.0],
        "sales_yearly": [0.0, 136592.0],
        "support_p2h_delivery": [0.0, 1.0]
    }
    self.header_fields = [
        "reviews", "rating", "sales_monthly", "sales_yearly",
        "support_p2h_delivery"
    ]

    p1 = {
        'product_name': 'Ổ Cứng SSD Kingston HyperX FURY 120GB - SATA III - Hàng Chính Hãng',
        'brand': 'Kingston',
        'author': '',
        'attributes': '1165#filter_ssd_storage#120 GB - 128 GB|1166#filter_ssd_product_size#2.5 inch',
        'categories': '1846#2#Laptop - Máy Vi Tính - Linh kiện|8060#3#Thiết bị lưu trữ',
        'reviews': 100,
        'rating': 80,
        'sales_monthly': 20,
        'sales_yearly': 100,
        'support_p2h_delivery': 1
    }
    p2 = {
        'product_name': 'Ổ Cứng SSD Sata III 2.5 Inch Samsung 850 EVO 120GB - Hàng Chính Hãng',
        'brand': 'Samsung',
        'author': '',
        'attributes': '1165#filter_ssd_storage#120 GB - 128 GB|1166#filter_ssd_product_size#2.5 inch',
        'categories': '1846#2#Laptop - Máy Vi Tính - Linh kiện|8060#3#Thiết bị lưu trữ',
        'reviews': 71,
        'rating': 95,
        'sales_monthly': 10,
        'sales_yearly': 50,
        'support_p2h_delivery': 1
    }
    p3 = {
        'product_name': 'Mai Em Vào Lớp 1 - Vở Tập Tô Chữ (Dành Cho Trẻ 5 - 6 Tuổi) - Tập 1',
        'brand': '',
        'author': ['Lê Hồng Đăng - Lê Thị Ngọc Ánh'],
        'attributes': '',
        'categories': '8322#2#Nhà Sách Tiki|316#3#Sách tiếng Việt |393#4#Sách thiếu nhi |853#5#Kiến thức - Bách khoa',
        'reviews': 0,
        'rating': 0,
        'sales_monthly': 3,
        'sales_yearly': 10,
        'support_p2h_delivery': 1
    }

    self.products = [p1, p2, p3]
    self.queries = ['ổ cứng', 'samsung 850', 'vở tập tô']
    self.target = [2, 1, 0]  # positive, impressed, negative
    self.weights = [1., 1., 1.]

    for p in self.products:
        self.add_to_vocab(query_preprocessing(p['product_name']))
        for z in p['attributes'].split('|'):
            t = "#".join(z.split("#")[:2])
            self.attr_tokens.add(t)
        for z in p['categories'].split('|'):
            t = "#".join(z.split("#")[:2])
            self.cat_tokens.add(t)

    self.vocab = self.unigrams.copy()
    self.vocab = self.vocab.union(self.bigrams, self.char_trigrams)
    self.vocab = list(self.vocab)
    self.zero_idx = len(self.vocab) + self.unknown_bin
    self.token_2_idx = {}
    for i, t in enumerate(self.vocab):
        self.token_2_idx[t] = i
    self.cat_token_2_idx = {}
    for i, t in enumerate(self.cat_tokens):
        self.cat_token_2_idx[t] = i
    self.cat_zero_idx = len(self.cat_tokens)
    self.attr_token_2_idx = {}
    for i, t in enumerate(self.attr_tokens):
        self.attr_token_2_idx[t] = i
    self.attr_zero_idx = len(self.attr_tokens)

    self.embed_size = 80
    self.attr_cat_embed_size = 10
    self.vocab_size = len(self.token_2_idx)
    self.max_query_length = 25
    self.max_product_length = 50
    self.max_brand_length = 25
    self.max_author_length = 25
    self.max_attr_length = 10
    self.max_cat_length = 10
    self.filter_sizes = [2, 3, 4, 5]
    self.num_filters = 5
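# add_to_vocab is not shown in this excerpt. Given the sets it fills and the
# create_ngrams behavior pinned down by testCreateNgram, it plausibly looks
# like this (a sketch, not the test's actual code):
def add_to_vocab_sketch(self, text):
    unigrams, bigrams, char_trigrams = create_ngrams(text)
    self.unigrams.update(unigrams)
    self.bigrams.update(bigrams)
    self.char_trigrams.update(char_trigrams)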
    product_id = row.get('product_id')
    action = row.get('action')
    # Guard against missing fields before using them.
    if keyword is None or product_id is None:
        continue
    # Skip queries that look like bare product ids (6+ digits).
    if re.match(r'\d{6,}', keyword):
        continue
    # Relevance grades: 0 = impression only, 2 = buy, 1 = anything else.
    rel = 1
    if action == "buy":
        rel = 2
    if action == "" or action is None:
        rel = 0
    # Drop 40% of negative samples to rebalance the data.
    if random.random() < 0.4 and rel == 0:
        continue
    keyword = query_preprocessing(keyword)
    if len(negatives_pool) < 5e6:  # cap the negatives pool at 5M pairs
        negatives_pool.add((keyword, product_id))
    if not len(keyword):
        continue
    count += 1
    if count % 10000 == 0:
        print("Processed %d (keyword, product) pairs" % count)
    # Keep the strongest interaction observed per (keyword, product).
    interactions[(keyword, product_id)] = max(
        interactions.get((keyword, product_id), 0), rel)
fobj.close()

negatives_pool = list(negatives_pool)
queries = {}
keep_q = {}
for (keyword, product_id), v in interactions.items():
            count_char_trigrams += 1
    # Count bigrams of adjacent tokens, joined with '#'.
    for i in range(0, max(len(tokens) - 1, 0)):
        t = "%s#%s" % (tokens[i], tokens[i + 1])
        if t in bigrams:
            bigrams[t] += 1
        else:
            bigrams[t] = 1
            count_bigrams += 1

# Build the vocabulary from catalog product names.
fobj = open("data/catalog.csv")
reader = csv.reader(fobj)
for r in reader:
    product_name = query_preprocessing(r[1])
    if not product_name:
        continue
    add_to_vocab(product_name)
fobj.close()

# Extend the vocabulary with queries from the triple files.
for f in os.listdir("triples"):
    if f.endswith(".csv"):
        fobj = open(os.path.join("triples", f))
        reader = csv.reader(fobj)
        for r in reader:
            query = query_preprocessing(r[0])
            if not query:
                continue
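# For context, a sketch of what the full counting function above plausibly
# looks like (the excerpt starts mid-function); it assumes each distinct
# n-gram bumps its count_* total the first time it is seen, matching the
# bigram branch visible in the fragment:
def add_to_vocab_sketch(text):
    global count_bigrams, count_char_trigrams
    tokens = text.split()
    for tok in tokens:
        unigrams[tok] = unigrams.get(tok, 0) + 1
        padded = "#%s#" % tok
        for i in range(len(padded) - 2):
            tri = padded[i:i + 3]
            if tri in char_trigrams:
                char_trigrams[tri] += 1
            else:
                char_trigrams[tri] = 1
                count_char_trigrams += 1
    for i in range(max(len(tokens) - 1, 0)):
        t = "%s#%s" % (tokens[i], tokens[i + 1])
        if t in bigrams:
            bigrams[t] += 1
        else:
            bigrams[t] = 1
            count_bigrams += 1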