예제 #1
0
def process_editor_tags(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'],
                        db_spider_spec=getattr(glob, 'SPIDER_SPEC'),
                        table='products',
                        extra_cond=None):
    """
    给editor库的数据添加tags字段
    """
    db = RoseVisionDb()
    db.conn(db_spider_spec)
    try:
        extra_cond = ' AND '.join(
            unicode.format(u'({0})', tuple(unicodify(v)))
            for v in extra_cond) if extra_cond else '1'

        rs = db.query(
            unicode.format(
                u'SELECT tag_name,mapping_list FROM products_tag_mapping WHERE {1}',
                extra_cond).encode('utf-8'))
        temp = rs.fetch_row(maxrows=0)
        mapping_rules = dict(temp)
    finally:
        db.close()

    db.conn(db_spec)
    db.start_transaction()
    try:

        rs = db.query(
            unicode.format(u'SELECT * FROM {0} WHERE {1}', table, extra_cond))
        for i in xrange(rs.num_rows()):
            record = rs.fetch_row(how=1)[0]
            extra = json.loads(record['extra'])
            tags = []
            for k in extra:
                tags.extend(extra[k])
            tags = set(tags)
            tag_names = []
            for v in tags:
                if v in mapping_rules:
                    tag_names.extend(json.loads(mapping_rules[v]))
            tag_names = list(set(tag_names))

            db.update({'tags': json.dumps(tag_names, ensure_ascii=False)},
                      str.format('idproducts={0}', record['idproducts']))

        db.commit()
        pass
    except OperationalError:
        db.rollback()
    finally:
        db.close()
예제 #2
0
def process_editor_price(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'],
                         table='products',
                         extra_cond=None):
    """
    处理editor库中的价格信息
    :param table: 需要操作的表。默认为products。
    :param db_spec: 需要操作的数据库,默认为editor库。
    :param extra_cond: 筛选条件。
    """
    db = RoseVisionDb()
    db.conn(db_spec)
    extra_cond = ' AND '.join(
        unicode.format(u'({0})', tuple(unicodify(v)))
        for v in extra_cond) if extra_cond else '1'

    db.lock([table])
    db.start_transaction()
    try:
        # 根据update_flag字段判断哪些记录是需要更新的
        obj = EditorPriceProcessor(
            db.query(
                unicode.format(
                    u'SELECT idproducts,price,region FROM {0} WHERE price IS NOT NULL AND {1}',
                    table, extra_cond).encode('utf-8')), db)
        core.func_carrier(obj, 1)
        db.commit()
    except OperationalError:
        db.rollback()
    finally:
        db.unlock()
        db.close()
예제 #3
0
    def run(self):
        db = RoseVisionDb()
        db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])

        if not self.brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]
            self.brand_list = brand_list
        else:
            brand_list = self.brand_list
        if not brand_list:
            # 如果没有任何品牌,则直接退出
            return self.report

        self.progress = 0
        # 获得检查总数
        self.tot = int(
            db.query(
                str.format(
                    'SELECT COUNT(*) FROM products WHERE brand_id IN ({0})',
                    ','.join(str(tmp)
                             for tmp in brand_list))).fetch_row()[0][0])
        for brand in brand_list:
            if not self.silent:
                print unicode.format(u'\nPROCESSING {0} / {1}\n', brand,
                                     info.brand_info()[brand]['brandname_e'])

            db.start_transaction()
            try:
                for model, pid, fingerprint in db.query_match(
                    ['model', 'idproducts', 'fingerprint'], 'products', {
                        'brand_id': brand
                    }).fetch_row(maxrows=0):
                    self.progress += 1
                    new_fp = gen_fingerprint(brand, model)
                    if fingerprint != new_fp:
                        self.report.append({
                            'model': model,
                            'idproducts': pid,
                            'fingerprint_db': fingerprint,
                            'fingerprint_gen': new_fp,
                            'brand_id': brand
                        })
                        if not self.silent:
                            print unicode.format(
                                u'\nMismatched fingerprints! model={0}, idproducts={1}, brand_id={2}, '
                                u'fingerprints: {3} => {4}\n', model, pid,
                                brand, fingerprint, new_fp)
                        if self.update_fingerprint:
                            # 自动更新MD5指纹
                            db.update({'fingerprint': new_fp},
                                      'products',
                                      str.format('idproducts={0}', pid),
                                      timestamps=['update_time'])
            except:
                db.rollback()
                raise
            finally:
                db.commit()
        db.close()
예제 #4
0
    def run(self):
        db = RoseVisionDb()
        db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])

        if not self.brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]
            self.brand_list = brand_list
        else:
            brand_list = self.brand_list

        self.progress = 0
        self.tot = len(brand_list)
        for brand in brand_list:
            print unicode.format(u'PROCESSING {0} / {1}', brand,
                                 info.brand_info()[brand]['brandname_e'])
            self.progress += 1
            rs = db.query(
                str.format(
                    'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,'
                    'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON '
                    'p1.idproducts=p2.idproducts '
                    'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts',
                    brand))

            # 以model为键值,将同一个model下,不同区域的价格放在一起。
            records = rs.fetch_row(maxrows=0, how=1)
            price_data = {}
            for r in records:
                model = r['model']
                # # 仅有那些price不为None,且offline为0的数据,才加入到price check中。
                # if r['price'] and int(r['offline']) == 0:
                # 这里更改为不管offline,全检查
                if r['price']:
                    # 首先检查model是否已存在
                    if model not in price_data:
                        price_data[model] = []
                    price_data[model].append(r)

            # 最大值和最小值之间,如果差别过大,则说明价格可能有问题
            for model in price_data:
                for item in price_data[model]:
                    price = float(item['price'])
                    item['nprice'] = info.currency_info()[
                        item['currency']]['rate'] * price

                # 按照nprice大小排序
                sorted_data = sorted(price_data[model],
                                     key=lambda item: item['nprice'])
                max_price = sorted_data[-1]['nprice']
                min_price = sorted_data[0]['nprice']
                if min_price > 0 and max_price / min_price > self.threshold:
                    print unicode.format(
                        u'WARNING: {0}:{6} MODEL={1}, {2} / {3} => {4} / {5}',
                        brand, model, sorted_data[0]['nprice'],
                        sorted_data[0]['region'], sorted_data[-1]['nprice'],
                        sorted_data[-1]['region'],
                        info.brand_info()[brand]['brandname_e'])

        db.close()
예제 #5
0
    def run(self):
        db_src = RoseVisionDb()
        db_src.conn(self.src_spec)
        db_dst = RoseVisionDb()
        db_dst.conn(self.dst_spec)

        # 备选记录
        idproducts_list = [
            int(val[0]) for val in db_src.query(
                unicode.format(u'SELECT idproducts FROM products WHERE {0}',
                               u' AND '.join(self.cond)).encode(
                                   'utf-8')).fetch_row(maxrows=0)
        ]

        self.tot = len(idproducts_list)
        self.progress = 0

        db_dst.execute('SET AUTOCOMMIT=0')
        # db_dst.execute('ALTER TABLE products DISABLE KEYS')

        for pid_src in idproducts_list:
            self.progress += 1
            record = db_src.query(
                str.format('SELECT * FROM products WHERE idproducts={0}',
                           pid_src)).fetch_row(how=1)[0]

            db_dst.start_transaction()
            try:
                rs = db_dst.query(
                    str.format(
                        'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                        'AND region="{2}"', record['brand_id'],
                        record['model'], record['region']))
                pid_dst = int(
                    rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None
                entry = {k: record[k] for k in record if k != 'idproducts'}

                price = process_price(record['price'], record['region'])
                if price:
                    entry['price_rev'] = price['price']
                    entry['currency_rev'] = price['currency']

                if entry['details']:
                    entry['details'] = self.process_text(
                        unicodify(entry['details']))
                if entry['description']:
                    entry['description'] = self.process_text(
                        unicodify(entry['description']))
                if entry['name']:
                    entry['name'] = self.process_text(unicodify(entry['name']))
                if entry['category']:
                    entry['category'] = self.process_text(
                        unicodify(entry['category']))
                if entry['extra']:
                    entry['extra'] = self.process_text(
                        unicodify(entry['extra']))

                if pid_dst:
                    db_dst.update(entry, 'products',
                                  str.format('idproducts={0}', pid_dst))
                else:
                    db_dst.insert(entry, 'products')
                    pid_dst = int(
                        db_dst.query(
                            str.format(
                                'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                                'AND region="{2}"', record['brand_id'],
                                record['model'],
                                record['region'])).fetch_row()[0][0])

                # 是否需要处理价格信息
                if price:
                    record_price = db_dst.query(
                        str.format(
                            'SELECT price,currency FROM products_price_history '
                            'WHERE idproducts={0} ORDER BY date DESC LIMIT 1',
                            pid_dst)).fetch_row(how=1)
                    if not record_price or float(record_price[0]['price']) != price['price'] or \
                                    record_price[0]['currency'] != price['currency']:
                        db_dst.insert(
                            {
                                'idproducts': pid_dst,
                                'date': record['update_time'],
                                'brand_id': record['brand_id'],
                                'region': record['region'],
                                'model': record['model'],
                                'price': price['price'],
                                'currency': price['currency']
                            }, 'products_price_history')

                # 处理图像信息
                tmp = db_src.query(
                    str.format(
                        'SELECT checksum,brand_id,url,path,width,height,format FROM products_image '
                        'WHERE brand_id={0} AND model="{1}"',
                        record['brand_id'],
                        record['model'])).fetch_row(maxrows=0, how=1)
                image_record = {val['checksum']: val for val in tmp}
                checksum_src = set(image_record.keys())

                # 完善images_store信息。如果checksum没有在images_store中出现,则添加之。
                for checksum in checksum_src:
                    if db_dst.query(
                            str.format(
                                'SELECT checksum FROM images_store WHERE checksum="{0}"',
                                checksum)).num_rows() == 0:
                        db_dst.insert(
                            {
                                'checksum': checksum,
                                'brand_id': image_record[checksum]['brand_id'],
                                'url': image_record[checksum]['url'],
                                'path': image_record[checksum]['path'],
                                'width': image_record[checksum]['width'],
                                'height': image_record[checksum]['height'],
                                'format': image_record[checksum]['format']
                            }, 'images_store')

                # 补充目标数据库的products_image表,添加相应的checksum
                checksum_dst = set([
                    val[0] for val in db_dst.query(
                        str.format(
                            'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"',
                            record['brand_id'], record['model'])).fetch_row(
                                maxrows=0)
                ])
                for checksum in checksum_src - checksum_dst:
                    db_dst.insert(
                        {
                            'checksum': checksum,
                            'brand_id': record['brand_id'],
                            'model': record['model']
                        }, 'products_image')

                db_dst.commit()
            except:
                db_dst.rollback()
                raise
예제 #6
0
class UpdateSpider(MFashionBaseSpider):
    handle_httpstatus_list = [404]

    def __init__(self, brand_list, region_list, db_spec, *a, **kw):
        self.name = str.format(
            'update-{0}-{1}',
            '-'.join(str(tmp) for tmp in brand_list) if brand_list else 'all',
            '-'.join(region_list) if region_list else 'all')
        super(UpdateSpider, self).__init__(*a, **kw)
        self.brand_list = brand_list
        self.region_list = region_list
        self.db = RoseVisionDb()
        self.db.conn(db_spec)

        # self.python_logger = get_logger()
        # PythonLoggingObserver(loggerName='rosevision').start()

    def start_requests(self):
        # 如果未指定brand_list,则默认对所有的品牌进行更新
        # 获得所有的品牌数据
        if not self.brand_list:
            self.brand_list = info.brand_info().keys()
        # self.log('TEST', log.INFO)
        #
        # return

        # UpdateSpider的可选区域参数
        region_cond = str.format(
            'region IN ({0})', ','.join(
                "'" + tmp + "'"
                for tmp in self.region_list)) if self.region_list else '1'

        rs = self.db.query(
            str.format(
                'SELECT COUNT(*) FROM products WHERE brand_id IN ({0}) AND {1}',
                ','.join(str(tmp) for tmp in self.brand_list), region_cond))
        tot_num = int(rs.fetch_row()[0][0])
        self.log(str.format('Total number of records to update: {0}', tot_num),
                 level=log.INFO)

        for brand in self.brand_list:
            # 获得该品牌下所有记录
            # 如果未指定region_list,则默认对所有的的确进行更新
            if self.region_list:
                region_list = self.region_list
            else:
                rs = self.db.query(
                    str.format(
                        'SELECT DISTINCT region FROM products WHERE brand_id={0}',
                        brand))
                region_list = [
                    tmp['region'] for tmp in rs.fetch_row(maxrows=0, how=1)
                ]

            region_info = info.region_info()
            region_list = filter(lambda val: int(region_info[val]['status']),
                                 region_list)

            for region in region_list:
                # 对该brand,该区域的所有商品,无论其下线状态是什么,都进行更新。
                rs = self.db.query_match(
                    {'idproducts', 'url', 'region', 'model'}, 'products', {
                        'brand_id': brand,
                        'region': region
                    })
                products_map = {
                    int(tmp['idproducts']): {
                        'url': tmp['url'],
                        'region': tmp['region'],
                        'model': tmp['model']
                    }
                    for tmp in rs.fetch_row(maxrows=0, how=1)
                }
                for pid, data in products_map.items():
                    url = data['url']
                    region = data['region']
                    model = data['model']

                    # url = 'http://www.gucci.com/us/styles/3085353G0109060'
                    # region = 'us'
                    # pid = 196907
                    #
                    # return [Request(url=url,
                    #                 callback=self.parse,
                    #                 meta={'brand': brand, 'pid': pid, 'region': region},
                    #                 errback=self.onerror,
                    #                 dont_filter=True)]
                    if url:
                        try:
                            yield Request(url=url,
                                          callback=self.parse,
                                          meta={
                                              'brand': brand,
                                              'pid': pid,
                                              'region': region,
                                              'model': model
                                          },
                                          errback=self.onerror,
                                          dont_filter=True)
                        except TypeError:
                            continue
                    else:
                        continue

    def parse(self, response):
        brand = response.meta['brand']
        item = UpdateItem()
        item['idproduct'] = response.meta['pid']
        item['brand'] = brand
        item['region'] = response.meta['region']
        sc = info.spider_info()[brand]['spider_class']
        metadata = {}
        item['metadata'] = metadata

        metadata['url'] = response.url

        if response.status == 404:
            item['offline'] = 1
            return item
        elif response.status < 200 or response.status > 300:
            return
        else:
            func = getattr(sc, 'is_offline')
            item['offline'] = 1 if func(response, self) else 0
            if item['offline'] == 1:
                return item

        if 'fetch_price' in dir(sc):
            ret = getattr(sc, 'fetch_price')(response, self)
            if isinstance(ret, Request):
                metadata['price'] = ret
            else:
                if 'price' in ret:
                    metadata['price'] = ret['price']
                if 'price_discount' in ret:
                    metadata['price_discount'] = ret['price_discount']

        if 'fetch_name' in dir(sc):
            name = getattr(sc, 'fetch_name')(response, self)
            if name:
                metadata['name'] = name

        if 'fetch_model' in dir(sc):
            model = getattr(sc, 'fetch_model')(response, self)
            if model:
                metadata['model'] = model

        if 'fetch_description' in dir(sc):
            description = getattr(sc, 'fetch_description')(response, self)
            if description:
                metadata['description'] = description

        if 'fetch_details' in dir(sc):
            details = getattr(sc, 'fetch_details')(response, self)
            if details:
                metadata['details'] = details

        if 'fetch_color' in dir(sc):
            color = getattr(sc, 'fetch_color')(response, self)
            if color:
                metadata['color'] = color

        return self.resolve_requests(item)

    def update_callback(self, response):
        spider_cb = response.meta['spider_cb']
        item = response.meta['item']
        key = response.meta['key']
        ret = spider_cb(response)

        # key对应的
        item['metadata'].pop(key)

        if ret:
            if key == 'price':
                for tmp in ('price', 'price_discount'):
                    if tmp in ret:
                        item['metadata'][tmp] = ret[tmp]
            else:
                item['metadata'][key] = ret
        else:
            # 如果返回值为None,说明没有相对应的信息。
            if key == 'price':
                for tmp in ('price', 'price_discount'):
                    if tmp in item['metadata']:
                        item['metadata'].pop(tmp)
        return self.resolve_requests(item)

    def resolve_requests(self, item):
        """
        检查metadata里面是否有Request对象。如果有,需要进一步提交这些request。
        仅当所有的对象都不是Request时,该过程才结束。
        应该注意的是,metadata中的键只会处理一次。
        @param metadata:
        """
        metadata = item['metadata']
        while True:
            resolved = True
            for func_key, value in metadata.items():
                if isinstance(value, Request):
                    value.meta['spider_cb'] = value.callback
                    value.meta['item'] = item
                    value.meta['key'] = func_key
                    value.callback = self.update_callback
                    value.dont_filter = True
                    return value

            if resolved:
                break

        return item

    @staticmethod
    def onerror(reason):
        meta = None
        try:
            if hasattr(reason.value, 'response'):
                response = reason.value.response
                # 这里response可能为None,比如出现
                # ERROR: Error downloading <GET xxx>:
                # [<twisted.python.failure.Failure <class 'twisted.internet.error.ConnectionDone'>>]
                if response:
                    meta = response.meta
        except AttributeError:
            pass
        try:
            if not meta:
                meta = reason.request.meta
        except AttributeError:
            pass

        if meta:
            try:
                brand = meta['brand']
                item = UpdateItem()
                item['idproduct'] = meta['pid']
                item['brand'] = brand
                item['region'] = meta['region']
                # sc = glob.spider_info()[brand]
                metadata = {}
                item['metadata'] = metadata
                item['offline'] = 1
                return item
            except KeyError:
                return
예제 #7
0
    def run(self):
        db = RoseVisionDb()
        db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])

        # 如果没有指定brand_list,则默认使用数据库中所有的brand_list
        if not self.brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]
            self.brand_list = brand_list
        else:
            brand_list = self.brand_list

        self.progress = 0
        self.tot = len(brand_list)

        # 最终生成的表格
        tot_results = []

        for brand in brand_list:
            results = {}

            print unicode.format(u'PROCESSING {0} / {1}', brand,
                                 info.brand_info()[brand]['brandname_e'])
            brand_name = info.brand_info()[brand]['brandname_e']
            self.progress += 1

            rs = db.query(
                str.format(
                    '''SELECT p1.idproducts,p1.brand_id,p1.model,p1.region,p2.price,p2.price_discount,p2.currency,p2.date,p1.name,p4.tag,p1.url FROM products AS p1
                            JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts
                            LEFT JOIN products_mfashion_tags AS p3 ON p3.idproducts=p1.idproducts
                            LEFT JOIN mfashion_tags AS p4 ON p3.id_mfashion_tags=p4.idmfashion_tags
                            WHERE p1.brand_id={0} AND p1.offline=0''', brand))
            records = rs.fetch_row(maxrows=0, how=1)
            for r in records:
                pid = int(r['idproducts'])
                timestamp = datetime.datetime.strptime(r['date'],
                                                       '%Y-%m-%d %H:%M:%S')
                tag = unicodify(r['tag'])

                if pid in results:
                    # 如果已经存在相应单品的记录
                    old_rec = results[pid]
                    old_rec['tag'].add(tag)
                    old_t = datetime.datetime.strptime(old_rec['date'],
                                                       '%Y-%m-%d %H:%M:%S')
                    if timestamp > old_t:
                        old_rec['price'] = unicodify(r['price'])
                        old_rec['price_discount'] = unicodify(
                            r['price_discount'])
                        old_rec['currency'] = unicodify(r['currency'])
                        old_rec['date'] = unicodify(r['date'])
                else:
                    # 如果该单品的记录不存在
                    results[pid] = {k: unicodify(r[k]) for k in r}
                    tmp = results[pid]['tag']
                    if tmp:
                        results[pid]['tag'] = {tmp}
                    else:
                        results[pid]['tag'] = set({})
                    results[pid]['brand'] = brand_name
                    results[pid].pop('idproducts')

            tot_results.extend(self.random_extract(results.values()))

        db.close()

        # 将所有的tag转换为[]
        data = []
        for r in tot_results:
            r['tag'] = json.dumps(list(r['tag']), ensure_ascii=False)
            data.append(
                {k: r[k].encode('utf-8') if r[k] else 'NULL'
                 for k in r})

        # 写入CSV文件
        with open(
                str.format('extract_{0}.csv',
                           datetime.datetime.now().strftime('%Y%m%d%H%M%S')),
                'wb') as f:
            f.write(u'\ufeff'.encode('utf8'))
            dict_writer = csv.DictWriter(f,
                                         fieldnames=[
                                             'brand_id', 'brand', 'model',
                                             'region', 'price',
                                             'price_discount', 'currency',
                                             'date', 'name', 'tag', 'url'
                                         ])
            dict_writer.writeheader()
            dict_writer.writerows(data)
# coding=utf-8

from utils.db import RoseVisionDb
import global_settings as gs

__author__ = 'Ryan'

db = RoseVisionDb()
db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
rs = db.query(str.format('select currency, rate from region_info'))
exchange_dic = {
    val['currency']: val['rate']
    for val in
    rs.fetch_row(maxrows=0, how=1)
}


def check_price_history_region_diff(price_dic, max_range=3):
    """
    price_dic = {
        'CNY': 1500.0
        'USD': 500.0
    }
    """
    stand_value = None
    for key, value in price_dic.items():
        rate = float(exchange_dic[key])
        if stand_value:
            rate_value = value * rate
            if rate_value > max_range * stand_value or rate_value < max_range * stand_value:
                return False