def run(self): db_src = RoseVisionDb() db_src.conn(self.src_spec) db_dst = RoseVisionDb() db_dst.conn(self.dst_spec) # 备选记录 idproducts_list = [ int(val[0]) for val in db_src.query( unicode.format(u'SELECT idproducts FROM products WHERE {0}', u' AND '.join(self.cond)).encode( 'utf-8')).fetch_row(maxrows=0) ] self.tot = len(idproducts_list) self.progress = 0 db_dst.execute('SET AUTOCOMMIT=0') # db_dst.execute('ALTER TABLE products DISABLE KEYS') for pid_src in idproducts_list: self.progress += 1 record = db_src.query( str.format('SELECT * FROM products WHERE idproducts={0}', pid_src)).fetch_row(how=1)[0] db_dst.start_transaction() try: rs = db_dst.query( str.format( 'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" ' 'AND region="{2}"', record['brand_id'], record['model'], record['region'])) pid_dst = int( rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None entry = {k: record[k] for k in record if k != 'idproducts'} price = process_price(record['price'], record['region']) if price: entry['price_rev'] = price['price'] entry['currency_rev'] = price['currency'] if entry['details']: entry['details'] = self.process_text( unicodify(entry['details'])) if entry['description']: entry['description'] = self.process_text( unicodify(entry['description'])) if entry['name']: entry['name'] = self.process_text(unicodify(entry['name'])) if entry['category']: entry['category'] = self.process_text( unicodify(entry['category'])) if entry['extra']: entry['extra'] = self.process_text( unicodify(entry['extra'])) if pid_dst: db_dst.update(entry, 'products', str.format('idproducts={0}', pid_dst)) else: db_dst.insert(entry, 'products') pid_dst = int( db_dst.query( str.format( 'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" ' 'AND region="{2}"', record['brand_id'], record['model'], record['region'])).fetch_row()[0][0]) # 是否需要处理价格信息 if price: record_price = db_dst.query( str.format( 'SELECT price,currency FROM products_price_history ' 'WHERE idproducts={0} ORDER BY date DESC LIMIT 1', pid_dst)).fetch_row(how=1) if not record_price or float(record_price[0]['price']) != price['price'] or \ record_price[0]['currency'] != price['currency']: db_dst.insert( { 'idproducts': pid_dst, 'date': record['update_time'], 'brand_id': record['brand_id'], 'region': record['region'], 'model': record['model'], 'price': price['price'], 'currency': price['currency'] }, 'products_price_history') # 处理图像信息 tmp = db_src.query( str.format( 'SELECT checksum,brand_id,url,path,width,height,format FROM products_image ' 'WHERE brand_id={0} AND model="{1}"', record['brand_id'], record['model'])).fetch_row(maxrows=0, how=1) image_record = {val['checksum']: val for val in tmp} checksum_src = set(image_record.keys()) # 完善images_store信息。如果checksum没有在images_store中出现,则添加之。 for checksum in checksum_src: if db_dst.query( str.format( 'SELECT checksum FROM images_store WHERE checksum="{0}"', checksum)).num_rows() == 0: db_dst.insert( { 'checksum': checksum, 'brand_id': image_record[checksum]['brand_id'], 'url': image_record[checksum]['url'], 'path': image_record[checksum]['path'], 'width': image_record[checksum]['width'], 'height': image_record[checksum]['height'], 'format': image_record[checksum]['format'] }, 'images_store') # 补充目标数据库的products_image表,添加相应的checksum checksum_dst = set([ val[0] for val in db_dst.query( str.format( 'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"', record['brand_id'], record['model'])).fetch_row( maxrows=0) ]) for checksum in checksum_src - checksum_dst: db_dst.insert( { 'checksum': checksum, 'brand_id': record['brand_id'], 'model': record['model'] }, 'products_image') db_dst.commit() except: db_dst.rollback() raise
class ProcessTags(object): """ 标签的映射规则有变动,需要更新 """ original_tags_tbl = 'original_tags' mfashion_tags_tbl = 'mfashion_tags' prod_tag_tbl = 'products_original_tags' prod_mt_tbl = 'products_mfashion_tags' products = 'products' db_spec = getattr(gs, 'DATABASE')['DB_SPEC'] tot = 1 progress = 0 def __init__(self, last_update=None, extra_cond=None): print str.format( 'Processing tags (last_update="{0}", extra_cond="{1}")...', last_update, extra_cond) self.db = RoseVisionDb() self.db.conn(self.db_spec) self.last_update = last_update self.extra_cond = extra_cond def get_msg(self): return str.format( '{0}/{1}({2:.1%}) PROCESSED', self.progress, self.tot, float(self.progress) / self.tot) if self.tot > 0 else 'IDLE' def run(self): last_update = self.last_update extra_cond = self.extra_cond if not extra_cond: extra_cond = [] elif not iterable(extra_cond): extra_cond = [extra_cond] if last_update: extra_cond.append( unicode.format(u'update_time > "{0}"', last_update)) extra_cond.append('mapping_list IS NOT NULL') # MFashion标签的缓存记录 cached_mfashion = {} # 标签更新原理:original_tags存放原始标签。根据update_time字段可以得到最近更新过的标签。由于标签系统具备一定传染性,所以 # 该标签对应brand/region下的所有标签都必须重做 rs = self.db.query_match(['brand_id', 'region'], self.original_tags_tbl, {}, extra=extra_cond, distinct=True) # 需要处理的标签 tag_dict = {} for i in xrange(rs.num_rows()): brand_id, region = rs.fetch_row()[0] for val in self.db.query_match( ['idmappings', 'mapping_list'], self.original_tags_tbl, { 'brand_id': brand_id, 'region': region }, extra='mapping_list IS NOT NULL').fetch_row(maxrows=0): tag_dict[val[0]] = json.loads(val[1].replace("'", '"')) # 删除旧单品/标签关系 self.db.execute( str.format( 'DELETE FROM p2 USING {0} AS p1, {1} AS p2 WHERE p1.idproducts=p2.idproducts ' 'AND p1.brand_id={2} AND region="{3}"', self.products, self.prod_mt_tbl, brand_id, region)) self.tot = len(tag_dict) self.progress = 0 for tid, rule in tag_dict.items(): self.progress += 1 self.db.start_transaction() try: # 所有相关的单品 pid_list = [ int(val[0]) for val in self.db.query_match( ['idproducts'], self.prod_tag_tbl, { 'id_original_tags': tid }).fetch_row(maxrows=0) ] # 添加MFashion标签 for tag in rule: if tag not in cached_mfashion: self.db.insert({'tag': tag}, self.mfashion_tags_tbl, ignore=True) tid = int( self.db.query_match(['idmfashion_tags'], self.mfashion_tags_tbl, { 'tag': tag }).fetch_row()[0][0]) cached_mfashion[tag] = tid self.db.insert([{ 'idproducts': pid, 'id_mfashion_tags': cached_mfashion[tag] } for pid in pid_list], self.prod_mt_tbl, ignore=True) self.db.commit() except ValueError: self.db.rollback() except: self.db.rollback() raise