Пример #1
0
    def run(self):
        db_src = RoseVisionDb()
        db_src.conn(self.src_spec)
        db_dst = RoseVisionDb()
        db_dst.conn(self.dst_spec)

        # 备选记录
        idproducts_list = [
            int(val[0]) for val in db_src.query(
                unicode.format(u'SELECT idproducts FROM products WHERE {0}',
                               u' AND '.join(self.cond)).encode(
                                   'utf-8')).fetch_row(maxrows=0)
        ]

        self.tot = len(idproducts_list)
        self.progress = 0

        db_dst.execute('SET AUTOCOMMIT=0')
        # db_dst.execute('ALTER TABLE products DISABLE KEYS')

        for pid_src in idproducts_list:
            self.progress += 1
            record = db_src.query(
                str.format('SELECT * FROM products WHERE idproducts={0}',
                           pid_src)).fetch_row(how=1)[0]

            db_dst.start_transaction()
            try:
                rs = db_dst.query(
                    str.format(
                        'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                        'AND region="{2}"', record['brand_id'],
                        record['model'], record['region']))
                pid_dst = int(
                    rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None
                entry = {k: record[k] for k in record if k != 'idproducts'}

                price = process_price(record['price'], record['region'])
                if price:
                    entry['price_rev'] = price['price']
                    entry['currency_rev'] = price['currency']

                if entry['details']:
                    entry['details'] = self.process_text(
                        unicodify(entry['details']))
                if entry['description']:
                    entry['description'] = self.process_text(
                        unicodify(entry['description']))
                if entry['name']:
                    entry['name'] = self.process_text(unicodify(entry['name']))
                if entry['category']:
                    entry['category'] = self.process_text(
                        unicodify(entry['category']))
                if entry['extra']:
                    entry['extra'] = self.process_text(
                        unicodify(entry['extra']))

                if pid_dst:
                    db_dst.update(entry, 'products',
                                  str.format('idproducts={0}', pid_dst))
                else:
                    db_dst.insert(entry, 'products')
                    pid_dst = int(
                        db_dst.query(
                            str.format(
                                'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                                'AND region="{2}"', record['brand_id'],
                                record['model'],
                                record['region'])).fetch_row()[0][0])

                # 是否需要处理价格信息
                if price:
                    record_price = db_dst.query(
                        str.format(
                            'SELECT price,currency FROM products_price_history '
                            'WHERE idproducts={0} ORDER BY date DESC LIMIT 1',
                            pid_dst)).fetch_row(how=1)
                    if not record_price or float(record_price[0]['price']) != price['price'] or \
                                    record_price[0]['currency'] != price['currency']:
                        db_dst.insert(
                            {
                                'idproducts': pid_dst,
                                'date': record['update_time'],
                                'brand_id': record['brand_id'],
                                'region': record['region'],
                                'model': record['model'],
                                'price': price['price'],
                                'currency': price['currency']
                            }, 'products_price_history')

                # 处理图像信息
                tmp = db_src.query(
                    str.format(
                        'SELECT checksum,brand_id,url,path,width,height,format FROM products_image '
                        'WHERE brand_id={0} AND model="{1}"',
                        record['brand_id'],
                        record['model'])).fetch_row(maxrows=0, how=1)
                image_record = {val['checksum']: val for val in tmp}
                checksum_src = set(image_record.keys())

                # 完善images_store信息。如果checksum没有在images_store中出现,则添加之。
                for checksum in checksum_src:
                    if db_dst.query(
                            str.format(
                                'SELECT checksum FROM images_store WHERE checksum="{0}"',
                                checksum)).num_rows() == 0:
                        db_dst.insert(
                            {
                                'checksum': checksum,
                                'brand_id': image_record[checksum]['brand_id'],
                                'url': image_record[checksum]['url'],
                                'path': image_record[checksum]['path'],
                                'width': image_record[checksum]['width'],
                                'height': image_record[checksum]['height'],
                                'format': image_record[checksum]['format']
                            }, 'images_store')

                # 补充目标数据库的products_image表,添加相应的checksum
                checksum_dst = set([
                    val[0] for val in db_dst.query(
                        str.format(
                            'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"',
                            record['brand_id'], record['model'])).fetch_row(
                                maxrows=0)
                ])
                for checksum in checksum_src - checksum_dst:
                    db_dst.insert(
                        {
                            'checksum': checksum,
                            'brand_id': record['brand_id'],
                            'model': record['model']
                        }, 'products_image')

                db_dst.commit()
            except:
                db_dst.rollback()
                raise
Пример #2
0
class ProcessTags(object):
    """
    标签的映射规则有变动,需要更新
    """
    original_tags_tbl = 'original_tags'
    mfashion_tags_tbl = 'mfashion_tags'
    prod_tag_tbl = 'products_original_tags'
    prod_mt_tbl = 'products_mfashion_tags'
    products = 'products'

    db_spec = getattr(gs, 'DATABASE')['DB_SPEC']
    tot = 1
    progress = 0

    def __init__(self, last_update=None, extra_cond=None):
        print str.format(
            'Processing tags (last_update="{0}", extra_cond="{1}")...',
            last_update, extra_cond)
        self.db = RoseVisionDb()
        self.db.conn(self.db_spec)
        self.last_update = last_update
        self.extra_cond = extra_cond

    def get_msg(self):
        return str.format(
            '{0}/{1}({2:.1%}) PROCESSED', self.progress, self.tot,
            float(self.progress) / self.tot) if self.tot > 0 else 'IDLE'

    def run(self):
        last_update = self.last_update
        extra_cond = self.extra_cond

        if not extra_cond:
            extra_cond = []
        elif not iterable(extra_cond):
            extra_cond = [extra_cond]
        if last_update:
            extra_cond.append(
                unicode.format(u'update_time > "{0}"', last_update))
        extra_cond.append('mapping_list IS NOT NULL')

        # MFashion标签的缓存记录
        cached_mfashion = {}

        # 标签更新原理:original_tags存放原始标签。根据update_time字段可以得到最近更新过的标签。由于标签系统具备一定传染性,所以
        # 该标签对应brand/region下的所有标签都必须重做
        rs = self.db.query_match(['brand_id', 'region'],
                                 self.original_tags_tbl, {},
                                 extra=extra_cond,
                                 distinct=True)
        # 需要处理的标签
        tag_dict = {}
        for i in xrange(rs.num_rows()):
            brand_id, region = rs.fetch_row()[0]
            for val in self.db.query_match(
                ['idmappings', 'mapping_list'],
                    self.original_tags_tbl, {
                        'brand_id': brand_id,
                        'region': region
                    },
                    extra='mapping_list IS NOT NULL').fetch_row(maxrows=0):
                tag_dict[val[0]] = json.loads(val[1].replace("'", '"'))

            # 删除旧单品/标签关系
            self.db.execute(
                str.format(
                    'DELETE FROM p2 USING {0} AS p1, {1} AS p2 WHERE p1.idproducts=p2.idproducts '
                    'AND p1.brand_id={2} AND region="{3}"', self.products,
                    self.prod_mt_tbl, brand_id, region))

        self.tot = len(tag_dict)
        self.progress = 0
        for tid, rule in tag_dict.items():
            self.progress += 1
            self.db.start_transaction()
            try:
                # 所有相关的单品
                pid_list = [
                    int(val[0]) for val in self.db.query_match(
                        ['idproducts'], self.prod_tag_tbl, {
                            'id_original_tags': tid
                        }).fetch_row(maxrows=0)
                ]

                # 添加MFashion标签
                for tag in rule:
                    if tag not in cached_mfashion:
                        self.db.insert({'tag': tag},
                                       self.mfashion_tags_tbl,
                                       ignore=True)
                        tid = int(
                            self.db.query_match(['idmfashion_tags'],
                                                self.mfashion_tags_tbl, {
                                                    'tag': tag
                                                }).fetch_row()[0][0])
                        cached_mfashion[tag] = tid

                    self.db.insert([{
                        'idproducts': pid,
                        'id_mfashion_tags': cached_mfashion[tag]
                    } for pid in pid_list],
                                   self.prod_mt_tbl,
                                   ignore=True)

                self.db.commit()
            except ValueError:
                self.db.rollback()
            except:
                self.db.rollback()
                raise