示例#1
0
    def run(self):
        """
        Print a warning for every model whose regional prices diverge too much.

        Brands are taken from ``self.brand_list``; when that is empty, every
        distinct ``brand_id`` of the ``products`` table is used and stored
        back on ``self.brand_list``. For each brand the most recent price row
        of every product is fetched, grouped by model and converted with the
        rates from ``info.currency_info()``. A warning is printed when the
        ratio between the highest and lowest converted price of a model
        exceeds ``self.threshold``.

        ``self.tot`` and ``self.progress`` are updated while iterating. The
        database connection is closed even when a query or lookup raises.
        """
        db = RoseVisionDb()
        db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
        try:
            if not self.brand_list:
                rs = db.query_match(['brand_id'], 'products', distinct=True)
                self.brand_list = [
                    int(val[0]) for val in rs.fetch_row(maxrows=0)
                ]
            brand_list = self.brand_list

            self.progress = 0
            self.tot = len(brand_list)
            for brand in brand_list:
                brand_name = info.brand_info()[brand]['brandname_e']
                print(unicode.format(u'PROCESSING {0} / {1}', brand,
                                     brand_name))
                self.progress += 1
                rs = db.query(
                    str.format(
                        'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,'
                        'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON '
                        'p1.idproducts=p2.idproducts '
                        'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts',
                        brand))

                # Group the rows by model so that the prices of one model in
                # different regions end up side by side.
                records = rs.fetch_row(maxrows=0, how=1)
                price_data = {}
                for r in records:
                    # Every row with a price is checked, regardless of its
                    # offline flag.
                    if r['price']:
                        price_data.setdefault(r['model'], []).append(r)

                # A large gap between the highest and the lowest converted
                # price suggests that one of the prices is wrong.
                for model in price_data:
                    rates = info.currency_info()
                    for item in price_data[model]:
                        price = float(item['price'])
                        item['nprice'] = rates[item['currency']]['rate'] * price

                    # Order by converted price.
                    sorted_data = sorted(price_data[model],
                                         key=lambda item: item['nprice'])
                    cheapest = sorted_data[0]
                    priciest = sorted_data[-1]
                    min_price = cheapest['nprice']
                    max_price = priciest['nprice']
                    if min_price > 0 and max_price / min_price > self.threshold:
                        print(unicode.format(
                            u'WARNING: {0}:{6} MODEL={1}, {2} / {3} => {4} / {5}',
                            brand, model, cheapest['nprice'],
                            cheapest['region'], priciest['nprice'],
                            priciest['region'], brand_name))
        finally:
            # Release the connection on both the normal and the error path.
            db.close()
示例#2
0
def guess_currency(price, region=None):
    """
    Guess the currency code of a price string.

    The checks run in this order:

    1. an unambiguous symbol (``€``, ``HK$``, ``AU$``, ``CA$``, ``US$``,
       ``£``);
    2. a ``$`` that is not directly preceded by one capital letter, or by two
       capital letters and up to two whitespace characters, means ``USD``;
    3. ``¥`` together with a region of ``cn``, ``hk``, ``mo`` or ``tw`` means
       ``CNY``;
    4. the first run of three capital letters that is a key of
       ``info.currency_info()``.

    :param price: text that contains the price; ``None`` or an empty string
        yields ``None``.
    :param region: optional region code, only used for the ``¥`` rule.
    :return: the currency code, or ``None`` when nothing matched.
    """
    if not price:
        return None

    # Symbols that identify the currency on their own. A tuple keeps the
    # lookup order fixed, so a string holding two symbols always resolves the
    # same way.
    symbols = (
        (u'€', 'EUR'),
        ('HK$', 'HKD'),
        ('AU$', 'AUD'),
        ('CA$', 'CAD'),
        ('US$', 'USD'),
        (u'£', 'GBP'),
    )
    for symbol, code in symbols:
        if symbol in price:
            return code

    # A dollar sign without a leading capital-letter prefix (no "CA$",
    # "AU $", ...) is taken to be a US dollar.
    if '$' in price:
        mt1 = re.search(r'[A-Z]{2}\s{0,2}\$', price, flags=re.U)
        mt2 = re.search(r'[A-Z]{1}\$', price, flags=re.U)
        if not mt1 and not mt2:
            return 'USD'

    if u'¥' in price and region in ('cn', 'hk', 'mo', 'tw'):
        return 'CNY'

    # Look at every run of three capital letters, not only the first one, so
    # that text such as "PRICE 100 USD" still resolves. The currency table is
    # only fetched when there is at least one candidate.
    known = None
    for mt in re.finditer(r'[A-Z]{3}', price, flags=re.U):
        if known is None:
            known = info.currency_info()
        if mt.group(0) in known:
            return mt.group(0)

    # No currency information found.
    return None
示例#3
0
    def merge_prods(self, prods, db):
        """
        Merge several product rows into one ``products_release`` row.

        The primary record is the row whose region ranks first in
        ``self.region_order``; its descriptive fields are copied to the
        release entry. The price history of every row is collapsed to one
        price record per product id, the entry is passed through
        ``release_filter`` and finally inserted with ``db.insert``.

        Nothing is inserted (and ``None`` is returned) when ``prods`` is
        empty, when no product id has a usable price, when ``release_filter``
        leaves no price, or when no image is stored for the fingerprint.

        :param prods: list of product rows (dicts). The list is modified in
            place: each row is replaced by a copy whose values went through
            ``unicodify``.
        :param db: database handle used for every query and the final insert.
        """
        # Nothing to merge; avoids indexing into an empty list below.
        if not prods:
            return

        logger = get_logger()
        # Convert the values of every row to unicode.
        for idx in xrange(len(prods)):
            prods[idx] = {k: unicodify(prods[idx][k]) for k in prods[idx]}

        # Pick the primary record: the row whose region ranks first.
        sorted_prods = sorted(prods,
                              key=lambda k: self.region_order[k['region']])
        main_entry = sorted_prods[0]
        entry = {
            k: unicodify(main_entry[k])
            for k in ('brand_id', 'model', 'name', 'description', 'details',
                      'gender', 'category', 'color', 'url', 'fingerprint')
        }
        if not entry['name']:
            entry['name'] = u'单品'

        # Comma-separated product ids shared by the two IN (...) clauses.
        pid_csv = ','.join(val['idproducts'] for val in prods)

        mfashion_tags = [
            unicodify(val[0]) for val in db.query(
                str.format(
                    'SELECT DISTINCT p1.tag FROM mfashion_tags AS p1 '
                    'JOIN products_mfashion_tags AS p2 ON p1.idmfashion_tags=p2.id_mfashion_tags '
                    'WHERE p2.idproducts IN ({0})',
                    pid_csv)).fetch_row(maxrows=0)
        ]

        entry['mfashion_tags'] = json.dumps(mfashion_tags, ensure_ascii=False)
        # The original tags are published as an empty string.
        entry['original_tags'] = ''

        entry['region_list'] = json.dumps([val['region'] for val in prods],
                                          ensure_ascii=False)
        brand = info.brand_info()[int(entry['brand_id'])]
        entry['brandname_e'] = brand['brandname_e']
        entry['brandname_c'] = brand['brandname_c']

        # Per-product lookups keyed by the integer product id.
        url_dict = {int(val['idproducts']): val['url'] for val in prods}
        offline_dict = {
            int(val['idproducts']): int(val['offline'])
            for val in prods
        }
        price_change_dict = {
            int(val['idproducts']): val['price_change']
            for val in prods
        }
        update_time_dict = {
            int(val['idproducts']):
            datetime.datetime.strptime(val['update_time'], "%Y-%m-%d %H:%M:%S")
            for val in prods
        }
        # Product id -> region.
        region_dict = {int(val['idproducts']): val['region'] for val in prods}

        # Collect the complete price history, grouped by product id. The rows
        # arrive ordered by idprice_history DESC, so index 0 of every group is
        # the most recently inserted row.
        price_list = {}
        for item in db.query_match(
            ['price', 'price_discount', 'currency', 'date', 'idproducts'],
                self.price_hist, {},
                str.format('idproducts IN ({0})', pid_csv),
                tail_str='ORDER BY idprice_history DESC').fetch_row(maxrows=0,
                                                                    how=1):
            pid = int(item['idproducts'])
            region = region_dict[pid]
            offline = offline_dict[pid]
            if pid not in price_list:
                price_list[pid] = []
            price = float(item['price']) if item['price'] else None
            # The discount price is dropped for offline products.
            if offline == 0:
                price_discount = float(
                    item['price_discount']) if item['price_discount'] else None
            else:
                price_discount = None
            price_list[pid].append({
                'price': price,
                'price_discount': price_discount,
                'currency': item['currency'],
                'date': datetime.datetime.strptime(item['date'],
                                                   "%Y-%m-%d %H:%M:%S"),
                'price_change': price_change_dict[pid],
                'url': url_dict[pid],
                'offline': offline,
                'code': region,
                'country': info.region_info()[region]['name_c']
            })

        def currency_conv(val, currency):
            # Multiply an amount by the rate registered for its currency.
            return info.currency_info()[currency]['rate'] * val

        # Collapse every history to a single record.
        # Strategy: if the latest row has a price, use it. If the latest price
        # is None, fall back to the most recent row that has a price and clear
        # its discount price. If no row has a price, drop the product id.
        # A snapshot of the items is iterated because entries are replaced and
        # removed inside the loop.
        for pid, pid_data in list(price_list.items()):
            # Subset of the history that carries a price.
            valid_pid_data = [val for val in pid_data if val['price']]

            if pid_data[0]['price']:
                # Normal case: the latest row is priced.
                price_list[pid] = pid_data[0]
                # Without a discount price, check whether the list price was
                # lowered silently within the last seven days; if so, report
                # the previous price as list price and the current one as the
                # discount price.
                latest = valid_pid_data[0]
                currency = latest['currency']
                if price_change_dict[pid] == 'D' and len(
                        valid_pid_data
                ) > 1 and currency == valid_pid_data[1]['currency']:
                    previous = valid_pid_data[1]
                    if not pid_data[0]['price_discount'] and currency_conv(
                            previous['price'], currency) > currency_conv(
                                latest['price'],
                                currency) and (datetime.datetime.now() -
                                               latest['date']
                                               ) < datetime.timedelta(7):
                        price_list[pid]['price_discount'] = price_list[pid][
                            'price']
                        price_list[pid]['price'] = previous['price']
            else:
                if not valid_pid_data:
                    # No price at all: drop this product id.
                    price_list.pop(pid)
                else:
                    # Use the most recent priced row and clear its discount.
                    tmp = valid_pid_data[0]
                    tmp['price_discount'] = None
                    price_list[pid] = tmp

            # fetch_time is the date of the last priced row of the history.
            if valid_pid_data and pid in price_list:
                price_list[pid]['fetch_time'] = valid_pid_data[-1]['date']
                price_list[pid]['idproducts'] = pid

        # Do not publish without any price information.
        if not price_list:
            return

        entry['price_list'] = sorted(
            price_list.values(),
            key=lambda val: self.region_order[val['code']])
        entry = release_filter(entry, logger)

        if not entry['price_list']:
            return

        entry['offline'] = entry['price_list'][0]['offline']

        # fetch_time of the entry: the earliest fetch_time of its prices.
        entry['fetch_time'] = min(
            tmp['fetch_time']
            for tmp in entry['price_list']).strftime("%Y-%m-%d %H:%M:%S")

        # One row per price record:
        # [converted price, converted discount or None, price_change,
        #  price, price_discount, currency, date, idproducts]
        alt_prices = []
        for price_item in entry['price_list']:
            # Serialize the datetimes so the record can be stored as JSON.
            price_item['date'] = price_item['date'].strftime(
                "%Y-%m-%d %H:%M:%S")
            price_item['fetch_time'] = price_item['fetch_time'].strftime(
                "%Y-%m-%d %H:%M:%S")

            converted = currency_conv(price_item['price'],
                                      price_item['currency'])
            # The converted discount only counts for online products.
            if price_item['offline'] == 0 and price_item['price_discount']:
                converted_discount = currency_conv(
                    price_item['price_discount'], price_item['currency'])
            else:
                converted_discount = None
            alt_prices.append([
                converted, converted_discount, price_item['price_change'],
                price_item['price'], price_item['price_discount'],
                price_item['currency'], price_item['date'],
                price_item['idproducts']
            ])

        # Rank by the discount price when there is one, else by the price.
        alt_prices = sorted(alt_prices,
                            key=lambda val: val[1] if val[1] else val[0])

        # entry['price_list'] is non-empty here and every record appended one
        # row, so alt_prices always has a first element.
        best = alt_prices[0]
        entry['price'], entry['price_discount'] = best[:2]
        entry['price_change'] = best[2]
        entry['o_price'], entry['o_discount'], entry['o_currency'] = best[3:6]

        # Remove idproducts from the published price records.
        for i in xrange(len(entry['price_list'])):
            entry['price_list'][i].pop('idproducts')
        entry['price_list'] = json.dumps(entry['price_list'],
                                         ensure_ascii=False)

        entry['last_price_ts'] = best[6]
        entry['product_update_ts'] = update_time_dict[best[7]].strftime(
            "%Y-%m-%d %H:%M:%S")

        # Search fields.
        search_text = u' '.join(entry[tmp] if entry[tmp] else ''
                                for tmp in ('name', 'description', 'details',
                                            'model', 'brandname_e',
                                            'brandname_c'))

        # The colour arrives as text. Joining a string would put a space
        # between its characters, so decode it as JSON and join the names;
        # text that is not a JSON list is used as it is.
        color = entry['color']
        if not color:
            search_color = u''
        elif isinstance(color, basestring):
            try:
                color_names = json.loads(color)
            except ValueError:
                color_names = None
            if isinstance(color_names, list):
                search_color = u' '.join(
                    u'{0}'.format(name) for name in color_names if name)
            else:
                search_color = color
        else:
            # Already a sequence of names.
            search_color = u' '.join(color)

        rs = db.query_match(
            ['description_cn', 'description_en', 'details_cn', 'details_en'],
            'products_translate', {
                'fingerprint': entry['fingerprint']
            }).fetch_row()
        part_translate = u' ' + u' '.join(
            unicodify(tmp) for tmp in rs[0] if tmp) if rs else ' '
        search_tags = u' '.join(set(mfashion_tags))
        entry['searchtext'] = unicode.format(u'{0} {1} {2} {3}', search_text,
                                             part_translate, search_tags,
                                             search_color)

        p = prods[0]
        checksums = []
        # Keep the image rows unique while preserving the idproducts_image
        # order of the query.
        for tmp in db.query(
                str.format(
                    '''
          SELECT p1.checksum, p3.width, p3.height, p3.path FROM products_image AS p1
          JOIN products AS p2 ON p1.fingerprint=p2.fingerprint
          JOIN images_store AS p3 ON p1.checksum=p3.checksum
          WHERE p2.fingerprint="{0}" ORDER BY p1.idproducts_image
          ''', p['fingerprint'])).fetch_row(maxrows=0, how=1):
            if tmp not in checksums:
                checksums.append(tmp)

        # Without any image the product is not released for now.
        if not checksums:
            return

        image_list = [{
            'path': val['path'],
            'width': int(val['width']),
            'height': int(val['height'])
        } for val in checksums]
        # The first image doubles as the cover.
        entry['cover_image'] = json.dumps(image_list[0], ensure_ascii=False)
        entry['image_list'] = json.dumps(image_list[:self.max_images],
                                         ensure_ascii=False)

        db.insert(entry, 'products_release')
示例#4
0
 def func(idx):
     """
     Convert two fields of ``price_history[idx]`` with the record's rate.

     The second-to-last field selects the entry of ``info.currency_info()``
     whose ``rate`` is used. The fourth- and third-to-last fields are turned
     into floats and multiplied by that rate; a falsy field yields ``None``.

     :param idx: index into ``price_history``.
     :return: tuple of the two converted values, in field order.
     """
     rates = info.currency_info()
     record = price_history[idx]
     rate = rates[record[-2]]['rate']

     def convert(raw):
         # Falsy values (None, empty string, 0) stay unset.
         if raw:
             return float(raw) * rate
         return None

     first = convert(record[-4])
     second = convert(record[-3])
     return (first, second)
示例#5
0
 def price_conv(price, currency):
     """
     Multiply ``price`` by the rate stored for ``currency``.

     :param price: amount to convert.
     :param currency: key into ``info.currency_info()``.
     :return: ``price`` times the ``rate`` of that currency.
     """
     currency_entry = info.currency_info()[currency]
     rate = currency_entry['rate']
     return price * rate
示例#6
0
    def run(cls, logger=None, **kwargs):
        """
        Run the product check and the price check for a list of brands.

        Results go to the root ``logging`` logger, which is configured to
        write to ``log/check/DataCheck<YYYYMMDD>.log`` below
        ``gs.STORAGE_PATH``.

        For every brand:

        * product check -- each product row is inspected for HTML escapes in
          name/description/details (printed), for text that ``region_pass``
          rejects in the ``cn``/``us``/``uk`` regions, for URLs flagged by
          ``check_url`` and for colour values that are not valid JSON.
          Faulty rows are logged as errors.
        * price check -- run once per brand: the latest price of every online
          product is grouped by model and converted with the rates from
          ``info.currency_info()``; a warning is logged when the highest
          converted price of a model exceeds the lowest by more than
          ``threshold`` times.

        :param logger: accepted for interface compatibility; not used here.
        :param kwargs: ``threshold`` (max/min price ratio, default 10) and
            ``brand_list`` (brand ids to check; defaults to every distinct
            ``brand_id`` of the ``products`` table).
        """
        log_path_name = os.path.normpath(
            os.path.join(
                getattr(gs, 'STORAGE_PATH'), 'log/check/DataCheck%s.log' %
                datetime.datetime.now().strftime('%Y%m%d')))
        logging.basicConfig(filename=log_path_name, level=logging.DEBUG)
        logging.info('PRODUCT CHECK STARTED!!!!')

        threshold = kwargs.get('threshold', 10)

        if 'brand_list' in kwargs:
            brand_list = kwargs['brand_list']
        else:
            with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
                brand_list = db.query_match(['brand_id'],
                                            'products',
                                            distinct=True).fetch_row(maxrows=0)
                # NOTE(review): a transaction is started after the read and
                # nothing is written afterwards -- confirm it is needed.
                db.start_transaction()
                brand_list = [int(val[0]) for val in brand_list]

        for brand in brand_list:
            with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
                brand_name = info.brand_info()[brand]['brandname_e']

                # ============================= product check =============================
                logging.info(
                    unicode.format(
                        u'{0} PROCESSING product check {1} / {2}',
                        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        brand, brand_name))
                rs = db.query_match([
                    'idproducts', 'region', 'name', 'url', 'color',
                    'description', 'details', 'price_change'
                ], 'products', {
                    'brand_id': brand
                }).fetch_row(maxrows=0)
                for idproducts, region, name, url, color, desc, details, price_change in rs:
                    # details_err is never raised by the checks below; it is
                    # kept so the logged tuple keeps its shape.
                    name_err = url_err = color_err = desc_err = details_err = False

                    # Report HTML escape sequences left in the text fields.
                    if name and has_escape(name):
                        print(idproducts, name)
                    if desc and has_escape(desc):
                        print(idproducts, desc)
                    if details and has_escape(details):
                        print(idproducts, details)

                    # name/description check for the cn, us and uk regions;
                    # region_pass decides whether the text is acceptable.
                    if region in ['cn', 'us', 'uk']:
                        name_err = not region_pass(name)
                        desc_err = not region_pass(desc)

                    # A URL flagged by check_url is an error; the quoted form
                    # is computed for later use and is not stored yet.
                    url_err = check_url(url)
                    if url_err:
                        url = urllib.quote(url, ":?=/")

                    # color has to be '[]', NULL or a JSON-decodable string.
                    if color != '[]' and color is not None:
                        try:
                            json.loads(color)
                        except (TypeError, ValueError):
                            color_err = True
                            print('color: %s' % (color,))

                    if name_err or url_err or color_err or desc_err or details_err:
                        logging.error(
                            (datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'),
                             'Detail info--------------idproducts:',
                             idproducts, 'name_err' if name_err else None,
                             'url_err' if url_err else None,
                             'color_err' if color_err else None,
                             'desc_err' if desc_err else None,
                             'details' if details_err else None))

                # ============================= price check ===============================
                # Runs once per brand, independent of the product errors
                # found above.
                logging.info(
                    unicode.format(
                        u'{0} PROCESSING price check {1} / {2}',
                        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        brand, brand_name))
                prs = db.query(
                    str.format(
                        'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,'
                        'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON '
                        'p1.idproducts=p2.idproducts '
                        'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts',
                        brand))
                # Group the rows by model so that the prices of one model in
                # different regions end up side by side.
                records = prs.fetch_row(maxrows=0, how=1)
                price_data = {}
                for r in records:
                    # Only rows with a price and offline == 0 take part in
                    # the price check.
                    if r['price'] and int(r['offline']) == 0:
                        price_data.setdefault(r['model'], []).append(r)

                # A large gap between the highest and the lowest converted
                # price suggests that one of the prices is wrong.
                for model in price_data:
                    rates = info.currency_info()
                    for item in price_data[model]:
                        price = float(item['price'])
                        item['nprice'] = rates[item['currency']]['rate'] * price

                    # Order by converted price.
                    sorted_data = sorted(price_data[model],
                                         key=lambda item: item['nprice'])
                    cheapest = sorted_data[0]
                    priciest = sorted_data[-1]
                    min_price = cheapest['nprice']
                    max_price = priciest['nprice']
                    if min_price > 0 and max_price / min_price > threshold:
                        logging.warning(
                            unicode.format(
                                u'{0} WARNING: {1}:{7} MODEL={2}, {3} / {4} => {5} / {6}',
                                datetime.datetime.now().strftime(
                                    '%Y-%m-%d %H:%M:%S'), brand, model,
                                cheapest['nprice'], cheapest['region'],
                                priciest['nprice'], priciest['region'],
                                brand_name))
        logging.info('PRODUCT CHECK ENDED!!!!')