def run(self):
    """Print a warning for every model whose regional prices diverge too much.

    Opens a ``RoseVisionDb`` connection using ``gs.DATABASE['DB_SPEC']``.
    When ``self.brand_list`` is empty it is filled with every distinct
    ``brand_id`` found in ``products``. ``self.tot`` is set to the number of
    brands and ``self.progress`` is incremented once per brand.

    For each brand, one price-history row per product is loaded (the query
    orders by date descending and groups by product, presumably to keep the
    newest row), rows with a truthy price are grouped by model, and each
    price is multiplied by the ``rate`` that ``info.currency_info()`` gives
    for its currency. A model is reported when the largest converted price
    divided by the smallest exceeds ``self.threshold``.

    The connection is closed even if a query or lookup raises.
    """
    db = RoseVisionDb()
    db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
    try:
        if not self.brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            self.brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]
        brand_list = self.brand_list

        self.progress = 0
        self.tot = len(brand_list)
        for brand in brand_list:
            brand_name = info.brand_info()[brand]['brandname_e']
            # A single parenthesised argument prints identically whether
            # ``print`` is the Python 2 statement or the function.
            print(unicode.format(u'PROCESSING {0} / {1}', brand, brand_name))
            self.progress += 1
            rs = db.query(
                str.format(
                    'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,'
                    'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON '
                    'p1.idproducts=p2.idproducts '
                    'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts',
                    brand))

            # Group the rows by model so that the prices of one model in
            # different regions end up together. Rows are included
            # regardless of their offline flag; only a missing/zero price
            # excludes a row.
            price_data = {}
            for r in rs.fetch_row(maxrows=0, how=1):
                if r['price']:
                    price_data.setdefault(r['model'], []).append(r)

            # A large gap between the highest and lowest converted price of
            # one model suggests that one of the prices is wrong.
            for model in price_data:
                for item in price_data[model]:
                    price = float(item['price'])
                    item['nprice'] = info.currency_info()[
                        item['currency']]['rate'] * price

                # Sort by converted price; first is cheapest, last is dearest.
                sorted_data = sorted(price_data[model],
                                     key=lambda item: item['nprice'])
                cheapest = sorted_data[0]
                dearest = sorted_data[-1]
                min_price = cheapest['nprice']
                max_price = dearest['nprice']
                if min_price > 0 and max_price / min_price > self.threshold:
                    print(unicode.format(
                        u'WARNING: {0}:{6} MODEL={1}, {2} / {3} => {4} / {5}',
                        brand, model, cheapest['nprice'], cheapest['region'],
                        dearest['nprice'], dearest['region'], brand_name))
    finally:
        # Release the connection on every exit path, not only on success.
        db.close()
def guess_currency(price, region=None):
    """Guess the currency code of a price string.

    The checks run in this order, and the first hit wins:

    1. An unambiguous symbol (``€``, ``£``, ``HK$``, ``AU$``, ``CA$``,
       ``US$``) anywhere in the text.
    2. A ``$`` that is not preceded by one uppercase letter (directly) or by
       two uppercase letters (with up to two whitespace characters between)
       is taken as ``USD``.
    3. ``¥`` together with a region of ``cn``, ``hk``, ``mo`` or ``tw`` is
       taken as ``CNY``.
    4. The first run of three uppercase letters, if it is a key of
       ``info.currency_info()``.

    :param price: price text as scraped; ``None`` or empty yields ``None``.
    :param region: optional lower-case region code, used only for ``¥``.
    :return: the currency code, or ``None`` when nothing matched.
    """
    if not price:
        # No text means no currency information; report "unknown" instead of
        # failing on the ``in`` tests below.
        return None

    # Symbols that identify the currency on their own.
    symbols = {
        u'€': 'EUR',
        'HK$': 'HKD',
        'AU$': 'AUD',
        'CA$': 'CAD',
        'US$': 'USD',
        u'£': 'GBP'
    }
    for symbol, code in symbols.items():
        if symbol in price:
            return code

    # A bare "$" (no "CA$", "AU $", "C$" style prefix) means US dollars.
    if '$' in price:
        mt1 = re.search(r'[A-Z]{2}\s{0,2}\$', price, flags=re.U)
        mt2 = re.search(r'[A-Z]{1}\$', price, flags=re.U)
        if not mt1 and not mt2:
            return 'USD'

    if u'¥' in price and region in ('cn', 'hk', 'mo', 'tw'):
        return 'CNY'

    # Three consecutive uppercase letters that name a known currency are
    # taken as the currency code.
    mt = re.search(r'([A-Z]{3})', price, flags=re.U)
    if mt and mt.group(1) in info.currency_info():
        return mt.group(1)

    # No currency information found.
    return None
def merge_prods(self, prods, db):
    """
    Merge the per-region records of one product into a single release row.

    The record whose region ranks first in ``self.region_order`` supplies the
    descriptive fields. Tags, price history, translations and images are then
    read through ``db`` and the resulting entry is inserted into
    ``products_release``. Nothing is inserted (the method just returns) when
    no usable price or no image is found.

    ``prods`` is modified in place: every record is replaced by a copy whose
    values went through ``unicodify``.

    NOTE(review): the code relies on Python 2 behaviour -- ``filter`` and
    ``map`` results are indexed / extended as lists, and ``price_list`` is
    popped from while looping over ``price_list.items()``.

    :param prods: list of dict records for the same product, one per
        ``idproducts``; each is read for at least ``region``, ``idproducts``,
        ``url``, ``offline``, ``price_change`` and ``update_time``.
    :param db: database handle providing ``query``, ``query_match`` and
        ``insert``.
    :return: always ``None``.
    """
    logger = get_logger()

    # Convert every value in prods to unicode.
    for idx in xrange(len(prods)):
        prods[idx] = {k: unicodify(prods[idx][k]) for k in prods[idx]}

    # Pick the primary record: the one whose region comes first in
    # self.region_order.
    sorted_prods = sorted(prods, key=lambda k: self.region_order[k['region']])
    main_entry = sorted_prods[0]
    entry = {
        k: unicodify(main_entry[k])
        for k in ('brand_id', 'model', 'name', 'description', 'details',
                  'gender', 'category', 'color', 'url', 'fingerprint')
    }
    # Fall back to a generic placeholder name when the record has none.
    if not entry['name']:
        entry['name'] = u'单品'

    # Distinct mfashion tags attached to any of the merged products.
    # NOTE(review): the id list is spliced into the SQL text; assumes the
    # idproducts values are trusted numeric strings -- confirm.
    mfashion_tags = [
        unicodify(val[0]) for val in db.query(
            str.format(
                'SELECT DISTINCT p1.tag FROM mfashion_tags AS p1 '
                'JOIN products_mfashion_tags AS p2 ON p1.idmfashion_tags=p2.id_mfashion_tags '
                'WHERE p2.idproducts IN ({0})', ','.join(
                    val['idproducts'] for val in prods))).fetch_row(maxrows=0)
    ]
    # (Disabled) original_tags used to be collected here from
    # products_original_tags with a query of the same shape.

    entry['mfashion_tags'] = json.dumps(mfashion_tags, ensure_ascii=False)
    # Original tags are currently not collected; an empty string is stored.
    entry['original_tags'] = ''

    entry['region_list'] = json.dumps([val['region'] for val in prods],
                                      ensure_ascii=False)
    entry['brandname_e'] = info.brand_info()[int(
        entry['brand_id'])]['brandname_e']
    entry['brandname_c'] = info.brand_info()[int(
        entry['brand_id'])]['brandname_c']
    # (Disabled) fetch_time used to be the earliest fetch_time among prods;
    # it is now derived from the price history further down.

    # Per-product lookup tables, all keyed by the integer idproducts.
    url_dict = {int(val['idproducts']): val['url'] for val in prods}
    offline_dict = {
        int(val['idproducts']): int(val['offline'])
        for val in prods
    }
    price_change_dict = {
        int(val['idproducts']): val['price_change']
        for val in prods
    }
    update_time_dict = {
        int(val['idproducts']):
        datetime.datetime.strptime(val['update_time'], "%Y-%m-%d %H:%M:%S")
        for val in prods
    }
    # Mapping from pid to region.
    region_dict = {int(val['idproducts']): val['region'] for val in prods}
    price_list = {}
    # Collect the complete price history of every pid, keyed by pid. Rows
    # arrive ordered by idprice_history descending, so index 0 of each list
    # is the most recently inserted row.
    for item in db.query_match(
            ['price', 'price_discount', 'currency', 'date', 'idproducts'],
            self.price_hist, {},
            str.format('idproducts IN ({0})',
                       ','.join(val['idproducts'] for val in prods)),
            tail_str='ORDER BY idprice_history DESC').fetch_row(maxrows=0,
                                                                how=1):
        pid = int(item['idproducts'])
        region = region_dict[pid]
        offline = offline_dict[pid]
        if pid not in price_list:
            price_list[pid] = []
        price = float(item['price']) if item['price'] else None
        # A discount price is only kept for products that are not offline.
        if offline == 0:
            price_discount = float(
                item['price_discount']) if item['price_discount'] else None
        else:
            price_discount = None
        price_list[pid].append({
            'price': price,
            'price_discount': price_discount,
            'currency': item['currency'],
            'date': datetime.datetime.strptime(item['date'],
                                               "%Y-%m-%d %H:%M:%S"),
            'price_change': price_change_dict[pid],
            'url': url_dict[pid],
            'offline': offline,
            'code': region,
            'country': info.region_info()[region]['name_c']
        })

    # Multiply an amount by the 'rate' of its currency.
    currency_conv = lambda val, currency: info.currency_info()[currency][
        'rate'] * val

    # Reduce each pid's history to a single price record.
    # Strategy: if the newest row has a price, use that row.
    # If the newest price is None, fall back to the first earlier row that
    # has a price and clear its price_discount.
    # If no row has a price at all, drop the pid.
    for pid, pid_data in price_list.items():
        # Subset of pid_data that has a price, in the same (newest first)
        # order.
        valid_pid_data = filter(lambda val: val['price'], pid_data)

        if pid_data[0]['price']:
            # Normal case: the newest row carries a price.
            price_list[pid] = pid_data[0]
            # If there is no discount price right now, check whether the
            # regular price was quietly lowered within the last week; if so,
            # present the new price as a discount on the previous one.
            currency = valid_pid_data[0]['currency']
            if price_change_dict[pid] == 'D' and len(
                    valid_pid_data
            ) > 1 and currency == valid_pid_data[1]['currency']:
                if not pid_data[0]['price_discount'] and currency_conv(
                        valid_pid_data[1]['price'], currency) > currency_conv(
                            valid_pid_data[0]['price'],
                            currency) and (datetime.datetime.now() -
                                           valid_pid_data[0]['date']
                                           ) < datetime.timedelta(7):
                    price_list[pid]['price_discount'] = price_list[pid][
                        'price']
                    price_list[pid]['price'] = valid_pid_data[1]['price']
        else:
            # Look back for the first row whose price is not None.
            if not valid_pid_data:
                # No price information at all: drop this pid.
                price_list.pop(pid)
            else:
                # Use the most recent priced row and clear its discount.
                tmp = valid_pid_data[0]
                tmp['price_discount'] = None
                price_list[pid] = tmp

        # fetch_time is the date of the last element of the priced rows,
        # i.e. the oldest one given the descending query order.
        if valid_pid_data and pid in price_list:
            price_list[pid]['fetch_time'] = valid_pid_data[-1]['date']
            price_list[pid]['idproducts'] = pid

    # Without any price information the product is not released.
    if not price_list:
        return

    entry['price_list'] = sorted(
        price_list.values(), key=lambda val: self.region_order[val['code']])
    entry = release_filter(entry, logger)

    # release_filter may leave the price list empty; nothing to release then.
    if not entry['price_list']:
        return

    entry['offline'] = entry['price_list'][0]['offline']

    # The model's fetch_time is the earliest fetch_time among its pids.
    entry['fetch_time'] = min(
        tmp['fetch_time']
        for tmp in entry['price_list']).strftime("%Y-%m-%d %H:%M:%S")

    # Rows used for ranking prices. Layout of each row:
    # [converted price, converted discount or None, price_change, price,
    #  price_discount, currency, date, idproducts]
    alt_prices = []
    for price_item in entry['price_list']:
        # Serialise the datetimes so the list can be stored as JSON in the
        # release table.
        price_item['date'] = price_item['date'].strftime(
            "%Y-%m-%d %H:%M:%S")
        price_item['fetch_time'] = price_item['fetch_time'].strftime(
            "%Y-%m-%d %H:%M:%S")
        if price_item['offline'] == 0:
            if price_item['price_discount']:
                tmp = map(
                    lambda key_name: currency_conv(price_item[key_name],
                                                   price_item['currency']),
                    ('price', 'price_discount'))
                tmp.extend([
                    price_item[key]
                    for key in ('price_change', 'price', 'price_discount',
                                'currency', 'date', 'idproducts')
                ])
                alt_prices.append(tmp)
            else:
                alt_prices.append([
                    currency_conv(price_item['price'],
                                  price_item['currency']), None,
                    price_item['price_change'], price_item['price'],
                    price_item['price_discount'], price_item['currency'],
                    price_item['date'], price_item['idproducts']
                ])
        else:
            alt_prices.append([
                currency_conv(price_item['price'], price_item['currency']),
                None, price_item['price_change'], price_item['price'],
                price_item['price_discount'], price_item['currency'],
                price_item['date'], price_item['idproducts']
            ])

    # Rank by the converted discount price when there is one, otherwise by
    # the converted regular price; the lowest row becomes the headline price.
    alt_prices = sorted(alt_prices,
                        key=lambda val: val[1] if val[1] else val[0])

    entry['price'], entry['price_discount'] = alt_prices[
        0][:2] if alt_prices else (None, ) * 2
    entry['price_change'] = alt_prices[0][2] if alt_prices else '0'
    # NOTE(review): unlike the two lines above, this and the later
    # alt_prices[0] reads are not guarded against an empty alt_prices.
    entry['o_price'], entry['o_discount'], entry[
        'o_currency'] = alt_prices[0][3:6]

    # Remove idproducts from the items of entry['price_list'] before they
    # are serialised.
    for i in xrange(len(entry['price_list'])):
        entry['price_list'][i].pop('idproducts')
    entry['price_list'] = json.dumps(entry['price_list'], ensure_ascii=False)

    entry['last_price_ts'] = alt_prices[0][6]
    entry['product_update_ts'] = update_time_dict[
        alt_prices[0][7]].strftime("%Y-%m-%d %H:%M:%S")

    # Search fields.
    search_text = u' '.join(entry[tmp] if entry[tmp] else ''
                            for tmp in ('name', 'description', 'details',
                                        'model', 'brandname_e', 'brandname_c'))
    # NOTE(review): joins the elements of entry['color'] with spaces; if the
    # value is a string this separates its characters -- confirm intent.
    search_color = u' '.join(entry['color']) if entry['color'] else u''
    rs = db.query_match(
        ['description_cn', 'description_en', 'details_cn', 'details_en'],
        'products_translate', {
            'fingerprint': entry['fingerprint']
        }).fetch_row()
    # Non-empty translated texts of the first matching row, if any.
    part_translate = u' ' + u' '.join(
        unicodify(tmp) for tmp in filter(lambda v: v, rs[0])) if rs else ' '
    search_tags = u' '.join(list(set(mfashion_tags)))
    entry['searchtext'] = unicode.format(u'{0} {1} {2} {3}', search_text,
                                         part_translate, search_tags,
                                         search_color)

    p = prods[0]
    checksums = []
    # Keep the rows in checksums unique while preserving the
    # idproducts_image order of the query.
    for tmp in db.query(
            str.format(
                '''
        SELECT p1.checksum, p3.width, p3.height, p3.path FROM products_image AS p1
        JOIN products AS p2 ON p1.fingerprint=p2.fingerprint
        JOIN images_store AS p3 ON p1.checksum=p3.checksum
        WHERE p2.fingerprint="{0}" ORDER BY p1.idproducts_image
        ''', p['fingerprint'])).fetch_row(maxrows=0, how=1):
        if tmp not in checksums:
            checksums.append(tmp)

    # Products without any image are not added to the release table for now.
    if not checksums:
        return

    image_list = []
    for val in checksums:
        tmp = {
            'path': val['path'],
            'width': int(val['width']),
            'height': int(val['height'])
        }
        # The first image doubles as the cover image.
        if not image_list:
            entry['cover_image'] = json.dumps(tmp, ensure_ascii=False)
        image_list.append(tmp)

    entry['image_list'] = json.dumps(image_list[:self.max_images],
                                     ensure_ascii=False)

    db.insert(entry, 'products_release')
def func(idx):
    """Return the two converted amounts of ``price_history[idx]``.

    The row's second-to-last field selects the ``rate`` in
    ``info.currency_info()``; the fields at positions -4 and -3 are each
    converted to ``float`` and multiplied by that rate. A falsy field yields
    ``None`` in its place.

    :param idx: index into the enclosing ``price_history`` sequence.
    :return: 2-tuple ``(field[-4] * rate or None, field[-3] * rate or None)``.
    """
    rates = info.currency_info()
    row = price_history[idx]
    rate = rates[row[-2]]['rate']

    def _scaled(raw):
        # Missing/empty amounts stay None instead of being converted.
        if raw:
            return float(raw) * rate
        return None

    first = _scaled(row[-4])
    second = _scaled(row[-3])
    return (first, second)
def price_conv(price, currency):
    """Multiply ``price`` by the ``rate`` listed for ``currency``.

    :param price: numeric amount.
    :param currency: key into ``info.currency_info()``.
    :return: ``price * rate``.
    """
    rate = info.currency_info()[currency]['rate']
    return price * rate
def run(cls, logger=None, **kwargs):
    """Run the product-field check and the cross-region price check per brand.

    Results are written with the root ``logging`` module to
    ``<gs.STORAGE_PATH>/log/check/DataCheck<YYYYMMDD>.log``; a few
    diagnostics are also printed.

    Product check, per product of the brand:
      * name / description / details containing HTML escapes are printed;
      * for regions ``cn``, ``us`` and ``uk`` the name and description must
        pass ``region_pass``;
      * the url is tested with ``check_url``;
      * ``color`` must be ``None``, ``'[]'`` or parseable JSON.
    Any failure is logged as an error together with the product id.

    Price check, per brand: products that have a price and are not offline
    are grouped by model, prices are multiplied by the ``rate`` from
    ``info.currency_info()``, and a warning is logged when the largest
    converted price divided by the smallest exceeds the threshold.

    :param logger: accepted for interface compatibility; not used here.
    :param kwargs: ``threshold`` (default 10) is the max/min ratio limit;
        ``brand_list`` is the list of brand ids to check. When
        ``brand_list`` is absent every distinct ``brand_id`` in ``products``
        is checked.
    """
    log_path_name = os.path.normpath(
        os.path.join(
            getattr(gs, 'STORAGE_PATH'), 'log/check/DataCheck%s.log' %
            datetime.datetime.now().strftime('%Y%m%d')))
    logging.basicConfig(filename=log_path_name, level=logging.DEBUG)

    logging.info('PRODUCT CHECK STARTED!!!!')

    threshold = kwargs.get('threshold', 10)
    if 'brand_list' in kwargs:
        brand_list = kwargs['brand_list']
    else:
        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            brand_list = db.query_match(['brand_id'], 'products',
                                        distinct=True).fetch_row(maxrows=0)
            db.start_transaction()
            brand_list = [int(val[0]) for val in brand_list]

    for brand in brand_list:
        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            # ============================= product check ==================
            logging.info(
                unicode.format(
                    u'{0} PROCESSING product check {1} / {2}',
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    brand,
                    info.brand_info()[brand]['brandname_e']))
            rs = db.query_match([
                'idproducts', 'region', 'name', 'url', 'color', 'description',
                'details', 'price_change'
            ], 'products', {
                'brand_id': brand
            }).fetch_row(maxrows=0)
            for idproducts, region, name, url, color, desc, details, price_change in rs:
                name_err = url_err = color_err = desc_err = details_err = False

                # Report text fields that still contain HTML escapes.
                # NOTE(review): with the Python 2 print statement this emits
                # the tuple repr -- confirm that is intended.
                for text in (name, desc, details):
                    if text and has_escape(text):
                        print(idproducts, text)

                # For the cn / us / uk regions, name and description may only
                # contain Chinese/English characters and punctuation; any
                # other script or symbol is flagged as an error.
                if region in ['cn', 'us', 'uk']:
                    name_err = not region_pass(name)
                    desc_err = not region_pass(desc)

                # The url must not contain CJK characters; when it does, a
                # quoted url is built and kept for later use.
                url_err = check_url(url)
                if url_err:
                    url = urllib.quote(url, ":?=/")

                # color must be '[]' or a JSON-parseable string.
                if color != '[]' and color is not None:
                    try:
                        json.loads(color)
                        color_err = False
                    except (TypeError, ValueError):
                        # Only parse failures count as a colour error;
                        # interrupts and exits are no longer swallowed.
                        color_err = True
                        print('color: %s' % (color,))

                if name_err or url_err or color_err or desc_err or details_err:
                    logging.error(
                        (datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S'),
                         'Detail info--------------idproducts:', idproducts,
                         'name_err' if name_err else None,
                         'url_err' if url_err else None,
                         'color_err' if color_err else None,
                         'desc_err' if desc_err else None,
                         'details' if details_err else None))

            # ============================= price check ====================
            # NOTE(review): taken to run once per brand, after the product
            # loop -- confirm against the intended layout.
            logging.info(
                unicode.format(
                    u'{0} PROCESSING price check {1} / {2}',
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    brand,
                    info.brand_info()[brand]['brandname_e']))
            prs = db.query(
                str.format(
                    'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,'
                    'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON '
                    'p1.idproducts=p2.idproducts '
                    'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts',
                    brand))

            # Group the rows by model so that the prices of one model in
            # different regions end up together. Only rows with a price and
            # offline == 0 take part in the price check.
            price_data = {}
            for r in prs.fetch_row(maxrows=0, how=1):
                if r['price'] and int(r['offline']) == 0:
                    price_data.setdefault(r['model'], []).append(r)

            # A large gap between the highest and lowest converted price of
            # one model suggests that one of the prices is wrong.
            for model in price_data:
                for item in price_data[model]:
                    price = float(item['price'])
                    item['nprice'] = info.currency_info()[
                        item['currency']]['rate'] * price

                # Sort by converted price; first is cheapest, last is dearest.
                sorted_data = sorted(price_data[model],
                                     key=lambda item: item['nprice'])
                max_price = sorted_data[-1]['nprice']
                min_price = sorted_data[0]['nprice']
                if min_price > 0 and max_price / min_price > threshold:
                    logging.warning(
                        unicode.format(
                            u'{0} WARNING: {1}:{7} MODEL={2}, {3} / {4} => {5} / {6}',
                            datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'), brand, model,
                            sorted_data[0]['nprice'],
                            sorted_data[0]['region'],
                            sorted_data[-1]['nprice'],
                            sorted_data[-1]['region'],
                            info.brand_info()[brand]['brandname_e']))

    logging.info('PRODUCT CHECK ENDED!!!!')