def __init__(self, last_update=None, extra_cond=None): print str.format( 'Processing tags (last_update="{0}", extra_cond="{1}")...', last_update, extra_cond) self.db = RoseVisionDb() self.db.conn(self.db_spec) self.last_update = last_update self.extra_cond = extra_cond
def __init__(self, brand_list, region_list, db_spec, *a, **kw): self.name = str.format( 'update-{0}-{1}', '-'.join(str(tmp) for tmp in brand_list) if brand_list else 'all', '-'.join(region_list) if region_list else 'all') super(UpdateSpider, self).__init__(*a, **kw) self.brand_list = brand_list self.region_list = region_list self.db = RoseVisionDb() self.db.conn(db_spec)
def run(self): db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) if not self.brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] self.brand_list = brand_list else: brand_list = self.brand_list self.progress = 0 self.tot = len(brand_list) for brand in brand_list: print unicode.format(u'PROCESSING {0} / {1}', brand, info.brand_info()[brand]['brandname_e']) self.progress += 1 rs = db.query( str.format( 'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,' 'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON ' 'p1.idproducts=p2.idproducts ' 'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts', brand)) # 以model为键值,将同一个model下,不同区域的价格放在一起。 records = rs.fetch_row(maxrows=0, how=1) price_data = {} for r in records: model = r['model'] # # 仅有那些price不为None,且offline为0的数据,才加入到price check中。 # if r['price'] and int(r['offline']) == 0: # 这里更改为不管offline,全检查 if r['price']: # 首先检查model是否已存在 if model not in price_data: price_data[model] = [] price_data[model].append(r) # 最大值和最小值之间,如果差别过大,则说明价格可能有问题 for model in price_data: for item in price_data[model]: price = float(item['price']) item['nprice'] = info.currency_info()[ item['currency']]['rate'] * price # 按照nprice大小排序 sorted_data = sorted(price_data[model], key=lambda item: item['nprice']) max_price = sorted_data[-1]['nprice'] min_price = sorted_data[0]['nprice'] if min_price > 0 and max_price / min_price > self.threshold: print unicode.format( u'WARNING: {0}:{6} MODEL={1}, {2} / {3} => {4} / {5}', brand, model, sorted_data[0]['nprice'], sorted_data[0]['region'], sorted_data[-1]['nprice'], sorted_data[-1]['region'], info.brand_info()[brand]['brandname_e']) db.close()
def run(cls, logger=None, **kwargs): #必须指定brand-id if 'brand_id' in kwargs.keys(): id = kwargs['brand_id'] else: return #todo 服务器需安装jre+selenium+phantomjs sel = webdriver.PhantomJS( executable_path= u'C:\phantomjs-1.9.7-windows\phantomjs-1.9.7-windows\phantomjs.exe' ) with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: rs = db.query_match([ 'idproducts', 'brand_id', 'model', 'name', 'url', 'description', 'price' ], 'products', { 'brand_id': id, 'offline': '0' }).fetch_row(maxrows=0) db.start_transaction() #随机抽取500个单品 rs = random.sample(rs, 500) try: for idproducts, brand_id, model, name, url, description, price in rs: name_err = desc_err = price_err = False brand_id = unicodify(brand_id) model = unicodify(model) name = unicodify(name) url = unicodify(url) description = unicodify(description) price = unicodify(price) sel.get(url) content = sel.find_element_by_xpath("//*").get_attribute( 'outerHTML') #check name if name != None and name not in content: name_err = True #check description if description != None and False in map( lambda x: x in content, (word for word in seperate(description))): desc_err = True #check price if price != None and price not in content: price_err = True if name_err or desc_err or price_err: print 'error!! pk:%s' % idproducts, \ ',name error' if name_err else None, \ ',desc error' if desc_err else None, \ ',price error' if price_err else None else: print 'pass' except: raise
def brand_info(refetch=False): """ 返回品牌信息。 @param refetch: 强制重新读取信息。 """ info = getattr(brand_info, 'brand_info') if info and not refetch: return info info = {} with RoseVisionDb(getattr(global_settings, 'DATABASE')['DB_SPEC']) as db: for brand_id, name_e, name_c, name_s in db.query_match( ['brand_id', 'brandname_e', 'brandname_c', 'brandname_s'], 'brand_info', {}).fetch_row(maxrows=0): brand_id = int(brand_id) name_e = name_e.decode('utf-8') if name_e else None name_c = name_c.decode('utf-8') if name_c else None name_s = name_s.decode('utf-8') if name_s else None info[brand_id] = { 'brandname_e': name_e, 'brandname_c': name_c, 'brandname_s': name_s } setattr(brand_info, 'brand_info', info) return info
def tag_outdated(cls, **kwargs): """ 将价格过期的单品在数据库中标记出来。 @param kwargs: duration:多少天以后,单品的价格趋势信息就算是过期了?默认为7天。 """ duration = int(kwargs['duration'][0]) if 'duration' in kwargs else 7 ts = (datetime.datetime.now() - datetime.timedelta(duration)).strftime('"%Y-%m-%d %H:%M:%S"') with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: pid_list = [ int(tmp[0]) for tmp in db.query( str.format( ''' SELECT prod.idproducts FROM products AS prod JOIN products_price_history AS price ON prod.idproducts=price.idproducts WHERE prod.price_change!='0' AND price.date<'{0}' ''', ts)).fetch_row(maxrows=0) ] max_bulk = 1000 offset = 0 while offset < len(pid_list): tmp_list = pid_list[offset:offset + max_bulk] offset += max_bulk db.query( str.format( ''' UPDATE products SET price_change='0', update_time='{1}' WHERE idproducts IN ({0}) ''', ', '.join(str(tmp) for tmp in tmp_list), datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))) db.query( 'UPDATE products SET price_change="0" WHERE price_change!="0" AND offline!=0' )
def region_info(refetch=False): """ 返回国家/地区信息。 @param refetch: 强制重新读取信息。 """ info = getattr(region_info, 'region_info') if info and not refetch: return info info = {} with RoseVisionDb(getattr(global_settings, 'DATABASE')['DB_SPEC']) as db: for code, code3, name_e, name_c, currency, weight, status in db.query_match( [ 'iso_code', 'iso_code3', 'name_e', 'name_c', 'currency', 'weight', 'status' ], 'region_info', {}).fetch_row(maxrows=0): weight = int(weight) status = int(status) name_e = name_e.decode('utf-8') if name_e else None name_c = name_c.decode('utf-8') if name_c else None info[code] = { 'iso_code3': code3, 'name_e': name_e, 'name_c': name_c, 'currency': currency, 'weight': weight, 'status': status } setattr(region_info, 'region_info', info) return info
def currency_info(refetch=False): """ 返回货币信息。 @param refetch: 强制重新读取信息。 """ info = getattr(currency_info, 'currency_info') if info and not refetch: return info info = {} with RoseVisionDb(getattr(global_settings, 'DATABASE')['DB_SPEC']) as db: for currency, symbol, name, rate, update_time in db.query_match( ['currency', 'symbol', 'name', 'rate', 'update_time'], 'currency_info', {}).fetch_row(maxrows=0): currency = currency.decode('utf-8') symbol = symbol.decode('utf-8') if symbol else None name = name.decode('utf-8') if name else None rate = float(rate) if rate else None update_time = datetime.datetime.strptime( update_time, '%Y-%m-%d %H:%M:%S') if update_time else None info[currency] = { 'symbol': symbol, 'name': name, 'rate': rate, 'update_time': update_time } setattr(currency_info, 'currency_info', info) return info
def main(): with RoseVisionDb(getattr(global_settings, 'DATABASE')['DB_SPEC']) as db: db.start_transaction() try: for brand_id, region, modname in spider_generator(): if info.region_info()[region]['status'] != 1: continue parameter = {'brand_id': brand_id, 'region': region} # 检查是否存在 ret = db.query( str.format( 'SELECT * FROM monitor_status WHERE parameter LIKE "%{0}%{1}%"', brand_id, region)).fetch_row(maxrows=0) if ret: continue db.insert( { 'parameter': json.dumps(parameter, ensure_ascii=True), 'enabled': 0 }, 'monitor_status', replace=True) db.commit() except: db.rollback() raise
def get_ticket(): # 所有的ticket server列表 """ 生成全局唯一ID @rtype : @raise : TicketsError """ ticket_servers = getattr(global_settings, 'TICKETS_SERVER') if not ticket_servers: raise TicketsError('No ticket server can be found in the system.') last_error = None for offset in xrange(len(ticket_servers)): idx = (getattr(get_ticket, 'ticket_idx') + offset) % len(ticket_servers) server = ticket_servers.values()[idx] with RoseVisionDb(spec=server) as db: try: db.query('REPLACE INTO tickets64 (stub) VALUES ("a")') return int( db.query('SELECT LAST_INSERT_ID()').fetch_row()[0][0]) except _mysql_exceptions as e: last_error = TicketsError('Database server error.', cause=e) finally: setattr(get_ticket, 'ticket_idx', idx) raise last_error
def run(cls, logger=None, **kwargs): """ 更新货币的汇率信息 @param param_dict: """ logger = logger if 'logger' in kwargs else get_logger() logger.info('Update currency STARTED!!!!') with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: for currency in db.query_match('currency', 'currency_info').fetch_row(maxrows=0): currency = currency[0] try: logger.debug(str.format('Fetching for currency data for {0}...', currency)) data = cm.get_data(url=str.format('http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X' '&f=sl1d1t1ba&e=.json', currency)) rate, d, t = [val for val in csv.reader(StringIO(data['body']))][0][1:4] rate = float(rate) timestamp = datetime.strptime(' '.join((d, t)), '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%d %H:%M:%S') db.update({'rate': rate, 'update_time': timestamp}, 'currency_info', str.format('currency="{0}"', currency)) except (ValueError, IOError): continue except: raise logger.info('Update currency ENDED!!!!')
def main(): data = cm.get_data('http://www.xe.com/symbols.php', proxy={'url': '127.0.0.1:8087'}) with RoseVisionDb(getattr(global_settings, 'DATABASE')['DB_SPEC']) as db: for node in Selector(text=data['body']).xpath( '//table[@class="cSymbl_tbl"]/tr[@class="row1" or @class="row2"]'): tmp = node.xpath('./td/text()').extract() name = tmp[0].strip() code = tmp[1].upper().strip() symbol = tmp[3].strip() if len(tmp) > 3 else '' db.insert({'name': name, 'currency': code, 'symbol': symbol}, 'currency_info', replace=True)
def run(cls, **kwargs): logger = kwargs['logger'] if 'logger' in kwargs else get_logger() logger.info('IMAGE CHECK ALERT STARTED') with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: rs = db.query( 'SELECT fingerprint, brand_id, image_list, cover_image FROM products_release', use_result=True) while True: bulk = rs.fetch_row(maxrows=100) if not bulk: break is_err = False for fingerprint, brand_id, jlist, jcover in bulk: try: image_list = json.loads(jlist) for path in [tmp['path'] for tmp in image_list]: if not re.search(str.format(r'^{0}_', brand_id), path): content = str.format( 'fingerprint={0}, image_list={1}', fingerprint, jlist) logger.error(content) cls.alert( str.format('INVALID IMAGES: {0}!!!', fingerprint), content) is_err = True break cover = json.loads(jcover) if not re.search(str.format(r'^{0}_', brand_id), cover['path']): content = str.format('fingerprint={0}, jcover={1}', fingerprint, jcover) logger.error(content) cls.alert( str.format('INVALID IMAGES: {0}!!!', fingerprint), content) is_err = True break except: cls.alert( str.format('INVALID IMAGES: {0}!!!', fingerprint), str.format( 'fingerprint={0}, jlist={1}, jcover={2}', fingerprint, jlist, jcover)) raise if is_err: break logger.info('DONE!')
def run(self): logger = get_logger() # 只处理关键国家的数据 tmp = info.region_info() key_regions = filter(lambda val: tmp[val]['status'] == 1, tmp) with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: # 删除原有的数据 logger.info( str.format('DELETING OLD RECORDS: brand_id={0}', self.brand_id)) db.execute( str.format('DELETE FROM products_release WHERE brand_id={0}', self.brand_id)) fp_list = [ tmp[0] for tmp in db.query( str.format( 'SELECT fingerprint FROM products WHERE brand_id={0} GROUP BY fingerprint', self.brand_id)).fetch_row(maxrows=0) ] self.tot = len(fp_list) self.progress = 0 # 最多每100次就需要提交一次事务 transaction_max = 100 logger.info( str.format('TOT: {0} fingerprints, brand_id={1}', self.tot, self.brand_id)) db.start_transaction() for self.progress in xrange(self.tot): if self.progress % transaction_max == 0: db.commit() db.start_transaction() logger.info( str.format( 'PROCESSED {0}/{1} fingerprints, brand_id={2}', self.progress, self.tot, self.brand_id)) fp = fp_list[self.progress] model_list = list( filter( lambda val: val['region'] in key_regions, db.query_match(['*'], 'products', { 'fingerprint': fp }).fetch_row(maxrows=0, how=1))) if model_list: self.merge_prods(model_list, db) db.commit() logger.info(str.format('DONE, brand_id={0}', self.brand_id))
def newly_fetched(brand_list=None, start=None, end=None): """ 获得start到end时间区间内新增加的单品记录。如果start和end中有任何一个为None,则默认采用过去一天的时间区间。 假设2014/02/25 02:00调用该函数,则默认查找2014/02/24 00:00:00至2014/02/25 00:00:00之间新添加的数据。 @param brand_list: 查找的品牌。如果为None,则默认对数据库中的所有品牌进行处理 @param start: datetime.date或datetime.datetime对象 @param end: """ with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: if not brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] if not (start and end): # 获得默认的时间区间 start = (datetime.datetime.now() - datetime.timedelta(1)).date() end = datetime.datetime.now().date() results = {} processed = set({}) for brand in brand_list: records = db.query_match(['*'], 'products', { 'brand_id': brand }, [ str.format( 'fetch_time BETWEEN {0} AND {1}', *map(lambda val: val.strftime('"%Y-%m-%d %H:%M:%S"'), (start, end))), 'offline=0' ]).fetch_row(maxrows=0, how=1) for r in records: fp = r['fingerprint'] if fp not in processed: if brand not in results: results[brand] = [] # 按照fingerprint进行归并,并按照国家的权重进行排序 tmp = sorted(filter(lambda val: val['fingerprint'] == fp, records), key=lambda val: info.region_info()[val[ 'region']]['weight']) results[brand].append({ 'model': tmp[0]['model'], 'fingerprint': fp, 'name': unicodify(tmp[0]['name']) }) processed.add(fp) return results
def urlprocess_main(): db_spec = { "host": "127.0.0.1", "port": 3306, "username": "******", "password": "******", "schema": "editor_stores" } db = RoseVisionDb() db.conn(db_spec) idproducts_start = 0 idproducts_count = 100 opts, args = getopt.getopt(sys.argv[1:], "s:c:") for opt, arg in opts: if opt == '-s': idproducts_start = int(arg) elif opt == '-c': idproducts_count = int(arg) logger.info(str.format("Url process start")) while 1: products = get_products(db, idproducts_start, idproducts_count) if not products: logger.info(str.format("Url process end")) break else: logger.info( str.format("Url process offset : {0} count : {1}", idproducts_start, len(products))) idproducts_start += idproducts_count for product in products: origin_url = product['url'] url = None try: url = urlencode(origin_url) except: url = None logger.info( str.format("Error: {0} encode {1} failed", product['idproducts'], origin_url)) pass if url: try: db.update({'url': url}, 'products', str.format('idproducts="{0}"', product['idproducts'])) except: logger.info( str.format("Error: {0} update {1} failed", product['idproducts'], url)) pass
def run(self): change_detection = price_changed(self.brand_list, self.start_ts, self.end_ts) changes = {'U': [], 'D': []} for change_type in [ 'discount_down', 'price_down', 'discount_up', 'price_up' ]: for brand in change_detection[change_type]: for fingerprint, model_data in change_detection[change_type][ brand].items(): for product in model_data['products']: pid = product['idproducts'] c = '0' if change_type in ['discount_down', 'price_down']: c = 'D' elif change_type in ['discount_up', 'price_up']: c = 'U' if c != '0': changes[c].append(pid) with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: db.start_transaction() try: for change_type in ['U', 'D']: db.update({'price_change': change_type}, 'products', str.format( 'idproducts IN ({0})', ','.join( str(tmp) for tmp in changes[change_type])), timestamps=['update_time']) except: db.rollback() raise finally: db.commit() self.change_detection = change_detection return change_detection
def spider_prog_report(param_dict): """ 产生爬虫进度报告 """ report_tpl = u''' <h1>全球扫货指南:单品信息抓取系统简报</h1> <p>系统自动邮件,请勿回复。</p> <p>统计时间段:{STAT-DATE-RANGE}</p> <h2>新抓取的单品:</h2> <p><table style="width:100%;" cellpadding="2" cellspacing="0" border="1" bordercolor="#000000"> <tbody> <tr> <td>品牌编号</td> <td>品牌名称</td> <td>单品总数</td> <td>主要市场的分布情况(中/美/法/英/意)</td> </tr> {NEW-PRODUCTS} </tbody> </table></p> <h2>已经发布的单品:</h2> <p><table style="width:100%;" cellpadding="2" cellspacing="0" border="1" bordercolor="#000000"> <tbody> <tr> <td>品牌编号</td> <td>品牌名称</td> <td>单品总数</td> <td>主要市场的分布情况(中/美/法/英/意)</td> </tr> {FETCHED-PRODUCTS} </tbody> </table></p> <h2>价格变化趋势:</h2> <p>暂无</p> ''' # 确定收信人 try: group = getattr(gs, 'REPORTS')['DATA_STATUS'] if not isinstance(group, list): group = [group] recipients = {} for g in group: for key, value in getattr(gs, 'EMAIL_GROUPS')[g].items(): recipients[key] = value # recipent_addrs = gs.EMAIL_ADDR.values() # ['*****@*****.**', '*****@*****.**'] except (TypeError, AttributeError, KeyError): return cur = datetime.datetime.now() # 时间跨度:一天 delta = datetime.timedelta(1) from_time = cur - delta time_range_str = map(lambda v: v.strftime('%Y-%m-%d %H:%M:%S'), [from_time, cur]) stat_date_range = unicode.format(u'从{0}至{1}', time_range_str[0], time_range_str[1]) with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: new_products = get_fetched_report(db, time_range_str) fetched_products = get_fetched_report(db, None) msg = MIMEText(report_tpl.replace( '{STAT-DATE-RANGE}', stat_date_range).replace('{NEW-PRODUCTS}', new_products).replace('{FETCHED-PRODUCTS}', fetched_products), _subtype='html', _charset='utf-8') msg['Subject'] = u'单品信息抓取系统简报' msg['From'] = 'MStore Admin <*****@*****.**>' msg['To'] = ', '.join([ unicode.format(u'{0} <{1}>', item[0], item[1]) for item in recipients.items() ]) server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465) # 使用gmail发送邮件 #server = smtplib.SMTP('smtp.gmail.com', 587) #server.ehlo() #server.starttls() server.login('*****@*****.**', 'rose123') server.sendmail('*****@*****.**', recipients.values(), msg.as_string()) server.quit()
def process_editor_price(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'], table='products', extra_cond=None): """ 处理editor库中的价格信息 :param table: 需要操作的表。默认为products。 :param db_spec: 需要操作的数据库,默认为editor库。 :param extra_cond: 筛选条件。 """ db = RoseVisionDb() db.conn(db_spec) extra_cond = ' AND '.join( unicode.format(u'({0})', tuple(unicodify(v))) for v in extra_cond) if extra_cond else '1' db.lock([table]) db.start_transaction() try: # 根据update_flag字段判断哪些记录是需要更新的 obj = EditorPriceProcessor( db.query( unicode.format( u'SELECT idproducts,price,region FROM {0} WHERE price IS NOT NULL AND {1}', table, extra_cond).encode('utf-8')), db) core.func_carrier(obj, 1) db.commit() except OperationalError: db.rollback() finally: db.unlock() db.close()
# coding=utf-8 import os import sys import json from utils.db import RoseVisionDb import global_settings as gs import scripts from utils.utils_core import get_logger idm, brand, region = sys.argv[1].split('|') brand = int(brand) idm = int(idm) parameter = {'idmonitor': idm, 'brand_id': brand, 'region': region} with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: db.update({'monitor_status': 0, 'monitor_pid': None, 'recrawl_pid': os.getpid()}, 'monitor_status', str.format('idmonitor={0}', parameter['idmonitor'])) #todo hardcode for DKNY, need to add 'is_offline' for DknySpider if brand == 10108: os.system( 'python %s %s -r %s' % ( os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region'])) else: os.system('python %s update --brand %s -r %s' % ( os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region'])) os.system( 'python %s %s -r %s' % ( os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region'])) # os.system('python %s process-tags --cond brand_id=%s' % parameter['brand_id']) # os.system('python %s release --brand %s' % parameter['brand_id'])
def run(cls, logger=None, **kwargs): logger = logger if 'logger' in kwargs else get_logger(logger_name='monitor') logger.info('Monitor STARTED!!!') #monitor process quantity, recrawl process quantity,limit interval for recrawl spider try: monitor_no = getattr(gs, 'MONITOR')['MAX_MONITOR'] except (AttributeError, KeyError): monitor_no = 6 try: recrawl_no = getattr(gs, 'MONITOR')['MAX_RECRAWLER'] except (AttributeError, KeyError): recrawl_no = 12 interval = kwargs['interval'] if 'interval' in kwargs else 7 limit_time = datetime.datetime.now() - datetime.timedelta(interval) with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: pid_list = psutil.pids() rs = db.query_match(['idmonitor', 'parameter', 'monitor_status', 'monitor_pid', 'recrawl_pid'], 'monitor_status', {'enabled': 1}).fetch_row(maxrows=0) #update monitor_status for idmonitor, parameter, monitor_status, monitor_pid, recrawl_pid in rs: #更新monitor_pid,recrawl_pid if monitor_pid and int(monitor_pid) not in pid_list: db.update({'monitor_pid': None}, 'monitor_status', str.format('idmonitor="{0}"', idmonitor)) if recrawl_pid and int(recrawl_pid) not in pid_list: db.update({'recrawl_pid': None}, 'monitor_status', str.format('idmonitor="{0}"', idmonitor)) #更新 rs_new = db.query_match( ['idmonitor', 'parameter', 'monitor_status', 'monitor_pid', 'recrawl_pid', 'timestamp'], 'monitor_status', {'enabled': 1}, tail_str='ORDER BY priority desc, timestamp').fetch_row(maxrows=0) #空闲product列表,排序后,最早更新的等待monitor idle_monitor_list = [] idle_recrawl_list = [] monitor_list = [] recrawl_list = [] for idmonitor, parameter, monitor_status, monitor_pid, recrawl_pid, timestamp in rs_new: #生成monitor_pid、recrawl_pid列表,用于监控和重爬,保证数量 if monitor_pid is not None: monitor_list.append(int(monitor_pid)) if recrawl_pid is not None: recrawl_list.append(int(recrawl_pid)) if monitor_status == '0' and monitor_pid is None and recrawl_pid is None: idle_monitor_list.append((idmonitor, parameter, timestamp)) if monitor_status == '1' and monitor_pid is None and recrawl_pid is None: idle_recrawl_list.append(( idmonitor, parameter, timestamp)) # #爬虫最后更新时间早于最大限制时间,重爬。 # update_time = datetime.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S') # if update_time <= limit_time: # db.update({'monitor_status': 1}, 'monitor_status', # str.format('idmonitor={0}', idmonitor)) # idle_monitor_list = sorted(idle_monitor_list, key=lambda m: m[2]) # idle_recrawl_list = sorted(idle_recrawl_list, key=lambda m: m[2]) #start monitor and set monitor_status if find update if len(monitor_list) < monitor_no: if len(idle_monitor_list) > monitor_no - len(monitor_list): ready_monitor = idle_monitor_list[:(monitor_no - len(monitor_list))] else: ready_monitor = idle_monitor_list for idmonitor, parameter, timestamp in ready_monitor: args = json.loads(parameter) #monitor --brand 10009 --region fr --idmonitor 1931 -v logger.info('Monitor started--> idmonitor:%s, brand_id:%s, region:%s' % ( idmonitor, args['brand_id'], args['region'])) spawn_process( os.path.join(scripts.__path__[0], 'run_crawler.py'), 'monitor --brand %s --region %s --idmonitor %s' % (args['brand_id'], args['region'], idmonitor)) #start recrawl and reset monitor_status after recrawl ended if len(recrawl_list) < recrawl_no: if len(idle_recrawl_list) > recrawl_no - len(recrawl_list): ready_recrawl = idle_recrawl_list[:(recrawl_no - len(recrawl_list))] else: ready_recrawl = idle_recrawl_list for idmonitor, parameter, timestamp in ready_recrawl: args = json.loads(parameter) args['idmonitor'] = idmonitor para = '|'.join([str(idmonitor), str(args['brand_id']), args['region']]) logger.info('Recrawl started--> idmonitor:%s, brand_id:%s, region:%s' % ( idmonitor, args['brand_id'], args['region'])) spawn_process(os.path.join(scripts.__path__[0], 'recrawler.py'), para)
} brand_id = 10226 categories = { 'books--stationery', 'handbags', 'travel', 'watches', 'timepieces', 'shoes', 'fine-jewelry', 'ready-to-wear', 'show-fall-winter-2013', 'mens-bags', 'small-leather-goods', 'icons', 'accessories/scarves-and-more', 'accessories/belts', 'accessories/sunglasses', 'accessories/fashion-jewelry', 'accessories/key-holders-bag-charms-and-more', 'accessories/scarves-ties-and-more', 'accessories/key-holders-and-other-accessories' } db = RoseVisionDb() db.conn(getattr(glob, 'DATABASE')['DB_SPEC']) def make_post_str(post_data): """ 给定post数据,生成post字符串。 :param post_data: """ def func(val): if val == "+": return val else: return urllib.quote(val, safe="") return u'&'.join(
def price_changed(brand_list=None, start=None, end=None, start_delta=datetime.timedelta(0), end_delta=datetime.timedelta(0)): """ 获得start到end时间区间内价格发生变化的单品记录。如果start和end中有任何一个为None,则默认采用过去一天的时间区间。 假设2014/02/25 02:00调用该函数,则默认查找2014/02/24 00:00:00至2014/02/25 00:00:00之间新添加的数据。 @param brand_list: 查找的品牌。如果为None,则默认对数据库中的所有品牌进行处理 @param start: datetime.date或datetime.datetime对象 @param end: """ def price_check(old, new): """ 对两组价格进行有效性检查。该函数的主要目的是:通过检查,查找可能存在的代码bug 检查策略: 1. 如果两条记录一样 2. 如果price为None,而price_discount不为None 3. 如果price<=price_discount 4. 如果old和new两项price的差别过大 则可能存在bug或错误,需要返回warning。 @param old: @param new: """ warnings = { -1: 'EQUAL RECORDS', -2: 'NULL PRICE', -3: 'PRICE IS EQUAL OR LESS THAN PRICE_DISCOUNT', -4: 'TOO MUCH GAP BETWEEN THE OLD AND THE NEW' } price1, discount1 = old price2, discount2 = new # 如果价格变化超过threshold,则认为old和new差异过大 threshold = 5 # if price1 == price2 and discount1 == discount2: # err_no = -1 if (not price1 and discount1) or (not price2 and discount2): err_no = -2 elif (price1 and discount1 and price1 <= discount1) or (price2 and discount2 and price2 <= discount2): err_no = -3 elif price1 > 0 and price2 > 0 and (price1 / price2 > threshold or price2 / price1 > threshold): err_no = -4 else: err_no = 0 if err_no != 0: return (err_no, warnings[err_no]) else: return err_no # 主要国家列表。只监控这些国家的单品的价格变化过程。 main_countries = [ tmp[0] for tmp in filter(lambda val: val[1]['weight'] < 999999, info.region_info().items()) ] with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: if not brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] # 获得默认的时间区间 if start: try: start = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S') except ValueError: start = datetime.datetime.strptime(start, '%Y-%m-%d') else: start = datetime.datetime.fromordinal( (datetime.datetime.now() - datetime.timedelta(1)).toordinal()) if end: try: end = datetime.datetime.strptime(end, '%Y-%m-%d %H:%M:%S') except ValueError: end = datetime.datetime.strptime(end, '%Y-%m-%d') else: end = datetime.datetime.fromordinal( datetime.datetime.now().date().toordinal()) start += start_delta end += end_delta results = { 'warnings': [], 'price_up': {}, 'discount_up': {}, 'price_down': {}, 'discount_down': {} } for brand in brand_list: pid_list = db.query( str.format( ''' SELECT p1.model,p1.idproducts,p1.region,p1.fingerprint FROM products AS p1 JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts WHERE p1.offline=0 AND p2.price IS NOT NULL AND brand_id={0} AND p1.region IN ({1}) AND (p2.date BETWEEN {2} AND {3}) ''', brand, ','.join( str.format('"{0}"', tmp) for tmp in main_countries), *map(lambda val: val.strftime('"%Y-%m-%d %H:%M:%S"'), (start, end)))).fetch_row(maxrows=0) if not pid_list: continue tmp = db.query( str.format( ''' SELECT p1.idproducts,p1.model,p1.region,p1.fingerprint,p2.price,p2.price_discount,p2.currency,p2.date FROM products AS p1 JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts WHERE p1.idproducts IN ({0}) ORDER BY p2.date DESC''', ','.join(tmp[1] for tmp in pid_list))).fetch_row(maxrows=0) rs = {} # 按照pid归并,即rs[pid] = [该pid所对应的价格历史] # 开始的时候,pid_set保留了所有需要处理的pid。归并的原则是,每个pid,取最近的最多两条有效记录。如果两条记录取满, # 该pid从pid_set中移除。今后,就算再次遇到这个pid,也不作处理了。 pid_set = set([val[0] for val in tmp]) for pid, model, region, fp, price, discount, currency, date in tmp: # 如果pid不在pid_set中,说明该pid对应的两条记录都已经取到。 # 如果price为None,说明该记录不包含有效价格数据,跳过不处理。 if pid not in pid_set or not price: continue if int(pid) in rs and len(rs[int(pid)]) >= 2: # 最近两条数据已满,跳过该pid pid_set.remove(pid) continue pid = int(pid) if pid not in rs: rs[pid] = [] rs[pid].append( [model, region, fp, price, discount, currency, date]) for pid, price_history in rs.items(): if len(price_history) < 2: continue def func(idx): rate = info.currency_info()[price_history[idx][-2]]['rate'] return (float(price_history[idx][-4]) * rate if price_history[idx][-4] else None, float(price_history[idx][-3]) * rate if price_history[idx][-3] else None) price1, discount1 = func(0) price2, discount2 = func(1) # 是否可能有错误? ret = price_check((price2, discount2), (price1, discount1)) if ret != 0: results['warnings'].append({ 'idproducts': pid, 'model': price_history[0][0], 'msg': ret[1] }) continue if price1 and price2 and price1 < price2: key = 'price_down' elif price1 and price2 and price1 > price2: key = 'price_up' elif discount1 and discount2 and discount1 < discount2: key = 'discount_down' elif not discount2 and discount1: key = 'discount_down' elif discount1 and discount2 and discount1 > discount2: key = 'discount_up' elif not discount1 and discount2: key = 'discount_up' else: key = None if key: if brand not in results[key]: results[key][brand] = {} fp = price_history[0][2] if fp not in results[key][brand]: results[key][brand][fp] = { 'model': price_history[0][0], 'brand_id': brand, 'fingerprint': fp, 'products': [] } # 获得单品的优先名称 region = price_history[0][1] price_new = float( price_history[0][3]) if price_history[0][3] else None price_old = float( price_history[1][3]) if price_history[1][3] else None discount_new = float( price_history[0][4]) if price_history[0][4] else None discount_old = float( price_history[1][4]) if price_history[1][4] else None currency_new = price_history[0][5] currency_old = price_history[1][5] results[key][brand][fp]['products'].append({ 'idproducts': int(pid), 'region': region, 'old_price': { 'price': price_old, 'price_discount': discount_old, 'currency': currency_old }, 'new_price': { 'price': price_new, 'price_discount': discount_new, 'currency': currency_new } }) # results中的记录,还需要单品名称信息。首先获得result中的所有fingerprint,并从数据库中查找对应的名称 fp_list = [] for change_type in [ 'price_up', 'price_down', 'discount_up', 'discount_down' ]: if brand in results[change_type]: fp_list.extend(results[change_type][brand].keys()) fp_list = list(set(fp_list)) # 获得fingerprint和name的关系 fp_name_map = {} if fp_list: for fp, name, region in db.query_match( ['fingerprint', 'name', 'region'], 'products', extra=str.format( 'fingerprint IN ({0})', ','.join( str.format('"{0}"', tmp) for tmp in fp_list))).fetch_row(maxrows=0): if fp not in fp_name_map: fp_name_map[fp] = { 'name': unicodify(name), 'region': region } elif info.region_info( )[region]['weight'] < info.region_info()[ fp_name_map[fp]['region']]['weight']: # 更高优先级的国家,替换: fp_name_map[fp] = { 'name': unicodify(name), 'region': region } for change_type in [ 'price_up', 'price_down', 'discount_up', 'discount_down' ]: if brand not in results[change_type]: continue for fp in results[change_type][brand]: results[change_type][brand][fp]['name'] = fp_name_map[ fp]['name'] return results
def currency_update(param_dict): """ 更新货币的汇率信息 @param param_dict: """ db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) rs = db.query_match(['iso_code', 'currency'], 'region_info').fetch_row(maxrows=0) db.start_transaction() try: for code, currency in rs: print str.format('Fetching for currency data for {0}...', currency) data = cm.get_data(url=str.format( 'http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X' '&f=sl1d1t1ba&e=.json', currency)) rdr = csv.reader(StringIO(data['body'])) line_data = [val for val in rdr][0] timestamp = datetime.datetime.strptime( str.format('{0} {1}', line_data[2], line_data[3]), '%m/%d/%Y %I:%M%p') db.update( { 'rate': line_data[1], 'update_time': timestamp.strftime('%Y-%m-%d %H:%M:%S') }, 'region_info', str.format('iso_code="{0}"', code)) db.commit() except: db.rollback() raise
def tag_changed(cls, **kwargs): """ 将价格发生变化的单品在数据库中标记出来,同时返回 @param kwargs: brand: 需要处理的品牌。如果为None,则对所有的品牌进行处理。 start:价格变化检查的开始时间。如果为None,则为昨天凌晨。 end:价格变化检查的结束时间。如果为None,则为今天凌晨。 start_delta: 调整时间(单位为天)。比如:delta为0.5,则表示在start/end的基础上,再往后延长半天。 end_delta """ # 是否处于静默模式 silent = 's' in kwargs # 如果没有指定brand,则对数据库中存在的所有brand进行处理 brand_list = [int(val) for val in kwargs['brand'] ] if 'brand' in kwargs else None start_ts = kwargs['start'] if 'start' in kwargs else None end_ts = kwargs['end'] if 'end' in kwargs else None start_delta = datetime.timedelta( kwargs['start_delta'] ) if 'start_delta' in kwargs else datetime.timedelta(0) end_delta = datetime.timedelta( kwargs['end_delta'] ) if 'end_delta' in kwargs else datetime.timedelta(0) # 得到价格变化的信息列表 change_detection = price_changed(brand_list, start_ts, end_ts, start_delta, end_delta) changes = {'U': [], 'D': []} for change_type in [ 'discount_down', 'price_down', 'discount_up', 'price_up' ]: for brand in change_detection[change_type]: for fingerprint, model_data in change_detection[change_type][ brand].items(): for product in model_data['products']: pid = product['idproducts'] c = '0' if change_type in ['discount_down', 'price_down']: c = 'D' elif change_type in ['discount_up', 'price_up']: c = 'U' if c != '0': changes[c].append(pid) with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: db.start_transaction() try: for change_type in ['U', 'D']: if not changes[change_type]: continue db.update({'price_change': change_type}, 'products', str.format( 'idproducts IN ({0})', ','.join( str(tmp) for tmp in changes[change_type])), timestamps=['update_time']) except: db.rollback() raise finally: db.commit() return change_detection
def run(self): db = RoseVisionDb() db.conn(getattr(gs, 'DATABASE')['DB_SPEC']) if not self.brand_list: rs = db.query_match(['brand_id'], 'products', distinct=True) brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)] self.brand_list = brand_list else: brand_list = self.brand_list if not brand_list: # 如果没有任何品牌,则直接退出 return self.report self.progress = 0 # 获得检查总数 self.tot = int( db.query( str.format( 'SELECT COUNT(*) FROM products WHERE brand_id IN ({0})', ','.join(str(tmp) for tmp in brand_list))).fetch_row()[0][0]) for brand in brand_list: if not self.silent: print unicode.format(u'\nPROCESSING {0} / {1}\n', brand, info.brand_info()[brand]['brandname_e']) db.start_transaction() try: for model, pid, fingerprint in db.query_match( ['model', 'idproducts', 'fingerprint'], 'products', { 'brand_id': brand }).fetch_row(maxrows=0): self.progress += 1 new_fp = gen_fingerprint(brand, model) if fingerprint != new_fp: self.report.append({ 'model': model, 'idproducts': pid, 'fingerprint_db': fingerprint, 'fingerprint_gen': new_fp, 'brand_id': brand }) if not self.silent: print unicode.format( u'\nMismatched fingerprints! model={0}, idproducts={1}, brand_id={2}, ' u'fingerprints: {3} => {4}\n', model, pid, brand, fingerprint, new_fp) if self.update_fingerprint: # 自动更新MD5指纹 db.update({'fingerprint': new_fp}, 'products', str.format('idproducts={0}', pid), timestamps=['update_time']) except: db.rollback() raise finally: db.commit() db.close()
def run(cls, logger=None, **kwargs): logging.info('IMAGE CHECK STARTED!!!!') #check flag for urls and images url_flag = True if 'url_flag' not in kwargs else kwargs['url_flag'] img_flag = True if 'img_flag' not in kwargs else kwargs['img_flag'] storage_path = os.path.normpath(os.path.join(getattr(gs, 'STORAGE')['STORAGE_PATH'], 'products/images')) with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: rs = db.query_match(['checksum', 'url', 'path', 'width', 'height', 'format', 'size'], 'images_store',).fetch_row( maxrows=0) db.start_transaction() total = len(rs) count = 0 try: for checksum, url, path, width, height, fmt, size in rs: full_path = os.path.normpath(os.path.join(storage_path, path)) #错误标志 url_err = checksum_err = path_err = width_err = height_err = fmt_err = size_err = False #check """ 可能出现的错误:height_err or width_err and size_err代表图片更新过,数据库size_err未更新 height_err or width_err 无size_err代表图片过小,小于128X128 checksum_err and path_err出现代表文件不存在,打不开 """ #todo url_err待测,需要打开连接验证是否正常,默认正常 #check url if url_flag: if not url or not url.strip(): url_err = True # check img if img_flag: try: f = open(full_path, 'rb') cur_check = hashlib.md5(f.read()).hexdigest() if cur_check != checksum: checksum_err = True except: checksum_err = True path_err = True #check width height format,size img = Image.open(full_path) (cur_width, cur_height) = img.size cur_format = img.format cur_size = os.path.getsize(full_path) if cur_width != int(width) or int(width) < 128: width_err = True if cur_height != int(height) or int(height) < 128: height_err = True if cur_format != fmt: fmt_err = True if cur_size != int(size): size_err = True if url_err or checksum_err or path_err or width_err or height_err or fmt_err or size_err: logging.error((datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Detail:', ( checksum, 'url_err' if url_err else None, 'path_err' if path_err else None, 'width_err' if width_err else None, 'height_err' if height_err else None, 'fmt_err' if fmt_err else None, 'size_err' if size_err else None, '-------checked value:', cur_check if cur_check else 'NA---', cur_width if cur_width else 'NA---', cur_height if cur_height else 'NA---', cur_format if cur_format else 'NA---', cur_size if cur_size else 'NA---', ))) pass #运行脚本显示百分比进度 count += 1 percent = float(count) / total * 100 print '\r%.3f%% :%s>' % (percent, int(percent) * '='), except: raise logging.info('IMAGE CHECK ENDED!!!!')
def run(self): db_src = RoseVisionDb() db_src.conn(self.src_spec) db_dst = RoseVisionDb() db_dst.conn(self.dst_spec) # 备选记录 idproducts_list = [ int(val[0]) for val in db_src.query( unicode.format(u'SELECT idproducts FROM products WHERE {0}', u' AND '.join(self.cond)).encode( 'utf-8')).fetch_row(maxrows=0) ] self.tot = len(idproducts_list) self.progress = 0 db_dst.execute('SET AUTOCOMMIT=0') # db_dst.execute('ALTER TABLE products DISABLE KEYS') for pid_src in idproducts_list: self.progress += 1 record = db_src.query( str.format('SELECT * FROM products WHERE idproducts={0}', pid_src)).fetch_row(how=1)[0] db_dst.start_transaction() try: rs = db_dst.query( str.format( 'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" ' 'AND region="{2}"', record['brand_id'], record['model'], record['region'])) pid_dst = int( rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None entry = {k: record[k] for k in record if k != 'idproducts'} price = process_price(record['price'], record['region']) if price: entry['price_rev'] = price['price'] entry['currency_rev'] = price['currency'] if entry['details']: entry['details'] = self.process_text( unicodify(entry['details'])) if entry['description']: entry['description'] = self.process_text( unicodify(entry['description'])) if entry['name']: entry['name'] = self.process_text(unicodify(entry['name'])) if entry['category']: entry['category'] = self.process_text( unicodify(entry['category'])) if entry['extra']: entry['extra'] = self.process_text( unicodify(entry['extra'])) if pid_dst: db_dst.update(entry, 'products', str.format('idproducts={0}', pid_dst)) else: db_dst.insert(entry, 'products') pid_dst = int( db_dst.query( str.format( 'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" ' 'AND region="{2}"', record['brand_id'], record['model'], record['region'])).fetch_row()[0][0]) # 是否需要处理价格信息 if price: record_price = db_dst.query( str.format( 'SELECT price,currency FROM products_price_history ' 'WHERE idproducts={0} ORDER BY date DESC LIMIT 1', pid_dst)).fetch_row(how=1) if not record_price or float(record_price[0]['price']) != price['price'] or \ record_price[0]['currency'] != price['currency']: db_dst.insert( { 'idproducts': pid_dst, 'date': record['update_time'], 'brand_id': record['brand_id'], 'region': record['region'], 'model': record['model'], 'price': price['price'], 'currency': price['currency'] }, 'products_price_history') # 处理图像信息 tmp = db_src.query( str.format( 'SELECT checksum,brand_id,url,path,width,height,format FROM products_image ' 'WHERE brand_id={0} AND model="{1}"', record['brand_id'], record['model'])).fetch_row(maxrows=0, how=1) image_record = {val['checksum']: val for val in tmp} checksum_src = set(image_record.keys()) # 完善images_store信息。如果checksum没有在images_store中出现,则添加之。 for checksum in checksum_src: if db_dst.query( str.format( 'SELECT checksum FROM images_store WHERE checksum="{0}"', checksum)).num_rows() == 0: db_dst.insert( { 'checksum': checksum, 'brand_id': image_record[checksum]['brand_id'], 'url': image_record[checksum]['url'], 'path': image_record[checksum]['path'], 'width': image_record[checksum]['width'], 'height': image_record[checksum]['height'], 'format': image_record[checksum]['format'] }, 'images_store') # 补充目标数据库的products_image表,添加相应的checksum checksum_dst = set([ val[0] for val in db_dst.query( str.format( 'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"', record['brand_id'], record['model'])).fetch_row( maxrows=0) ]) for checksum in checksum_src - checksum_dst: db_dst.insert( { 'checksum': checksum, 'brand_id': record['brand_id'], 'model': record['model'] }, 'products_image') db_dst.commit() except: db_dst.rollback() raise
def process_editor_tags(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'], db_spider_spec=getattr(glob, 'SPIDER_SPEC'), table='products', extra_cond=None): """ 给editor库的数据添加tags字段 """ db = RoseVisionDb() db.conn(db_spider_spec) try: extra_cond = ' AND '.join( unicode.format(u'({0})', tuple(unicodify(v))) for v in extra_cond) if extra_cond else '1' rs = db.query( unicode.format( u'SELECT tag_name,mapping_list FROM products_tag_mapping WHERE {1}', extra_cond).encode('utf-8')) temp = rs.fetch_row(maxrows=0) mapping_rules = dict(temp) finally: db.close() db.conn(db_spec) db.start_transaction() try: rs = db.query( unicode.format(u'SELECT * FROM {0} WHERE {1}', table, extra_cond)) for i in xrange(rs.num_rows()): record = rs.fetch_row(how=1)[0] extra = json.loads(record['extra']) tags = [] for k in extra: tags.extend(extra[k]) tags = set(tags) tag_names = [] for v in tags: if v in mapping_rules: tag_names.extend(json.loads(mapping_rules[v])) tag_names = list(set(tag_names)) db.update({'tags': json.dumps(tag_names, ensure_ascii=False)}, str.format('idproducts={0}', record['idproducts'])) db.commit() pass except OperationalError: db.rollback() finally: db.close()
class ProcessTags(object): """ 标签的映射规则有变动,需要更新 """ original_tags_tbl = 'original_tags' mfashion_tags_tbl = 'mfashion_tags' prod_tag_tbl = 'products_original_tags' prod_mt_tbl = 'products_mfashion_tags' products = 'products' db_spec = getattr(gs, 'DATABASE')['DB_SPEC'] tot = 1 progress = 0 def __init__(self, last_update=None, extra_cond=None): print str.format( 'Processing tags (last_update="{0}", extra_cond="{1}")...', last_update, extra_cond) self.db = RoseVisionDb() self.db.conn(self.db_spec) self.last_update = last_update self.extra_cond = extra_cond def get_msg(self): return str.format( '{0}/{1}({2:.1%}) PROCESSED', self.progress, self.tot, float(self.progress) / self.tot) if self.tot > 0 else 'IDLE' def run(self): last_update = self.last_update extra_cond = self.extra_cond if not extra_cond: extra_cond = [] elif not iterable(extra_cond): extra_cond = [extra_cond] if last_update: extra_cond.append( unicode.format(u'update_time > "{0}"', last_update)) extra_cond.append('mapping_list IS NOT NULL') # MFashion标签的缓存记录 cached_mfashion = {} # 标签更新原理:original_tags存放原始标签。根据update_time字段可以得到最近更新过的标签。由于标签系统具备一定传染性,所以 # 该标签对应brand/region下的所有标签都必须重做 rs = self.db.query_match(['brand_id', 'region'], self.original_tags_tbl, {}, extra=extra_cond, distinct=True) # 需要处理的标签 tag_dict = {} for i in xrange(rs.num_rows()): brand_id, region = rs.fetch_row()[0] for val in self.db.query_match( ['idmappings', 'mapping_list'], self.original_tags_tbl, { 'brand_id': brand_id, 'region': region }, extra='mapping_list IS NOT NULL').fetch_row(maxrows=0): tag_dict[val[0]] = json.loads(val[1].replace("'", '"')) # 删除旧单品/标签关系 self.db.execute( str.format( 'DELETE FROM p2 USING {0} AS p1, {1} AS p2 WHERE p1.idproducts=p2.idproducts ' 'AND p1.brand_id={2} AND region="{3}"', self.products, self.prod_mt_tbl, brand_id, region)) self.tot = len(tag_dict) self.progress = 0 for tid, rule in tag_dict.items(): self.progress += 1 self.db.start_transaction() try: # 所有相关的单品 pid_list = [ int(val[0]) for val in self.db.query_match( ['idproducts'], self.prod_tag_tbl, { 'id_original_tags': tid }).fetch_row(maxrows=0) ] # 添加MFashion标签 for tag in rule: if tag not in cached_mfashion: self.db.insert({'tag': tag}, self.mfashion_tags_tbl, ignore=True) tid = int( self.db.query_match(['idmfashion_tags'], self.mfashion_tags_tbl, { 'tag': tag }).fetch_row()[0][0]) cached_mfashion[tag] = tid self.db.insert([{ 'idproducts': pid, 'id_mfashion_tags': cached_mfashion[tag] } for pid in pid_list], self.prod_mt_tbl, ignore=True) self.db.commit() except ValueError: self.db.rollback() except: self.db.rollback() raise