def run(cls, logger=None, **kwargs):
    """Refresh the exchange rates stored in the currency_info table.

    For every currency listed in currency_info, fetch its quote against
    CNY from Yahoo Finance and write rate + quote timestamp back.

    :param logger: optional logger; a default one is created when omitted.
    """
    # BUG FIX: the old code tested `'logger' in kwargs`, which is never
    # true because `logger` is a named parameter, so an explicitly
    # supplied logger was silently discarded.
    logger = logger if logger else get_logger()
    logger.info('Update currency STARTED!!!!')

    with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
        for currency in db.query_match('currency', 'currency_info').fetch_row(maxrows=0):
            currency = currency[0]
            try:
                logger.debug(str.format('Fetching for currency data for {0}...', currency))
                data = cm.get_data(url=str.format('http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X'
                                                  '&f=sl1d1t1ba&e=.json', currency))
                # CSV row layout: symbol, rate, date, time, bid, ask.
                rate, d, t = [val for val in csv.reader(StringIO(data['body']))][0][1:4]
                rate = float(rate)
                timestamp = datetime.strptime(' '.join((d, t)),
                                              '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%d %H:%M:%S')
                db.update({'rate': rate, 'update_time': timestamp}, 'currency_info',
                          str.format('currency="{0}"', currency))
            except (ValueError, IOError):
                # Malformed quote or transient network failure: skip this
                # currency and keep processing the rest.
                continue

    logger.info('Update currency ENDED!!!!')
def run(cls, **kwargs):
    """Dump the configured MySQL schema, compress it with 7z, and
    optionally scp the archive to a remote host as an off-site backup.

    Expected kwargs: DATABASE (key into glob.DATABASE), plus optional
    SSH_USER / SSH_HOST / SSH_PORT / SSH_DST for the remote upload.
    """
    logger = kwargs['logger'] if 'logger' in kwargs else get_logger()
    logger.info('BACKUP STARTED')
    try:
        db_spec = getattr(glob, 'DATABASE')[kwargs['DATABASE']]
        # e.g. {"host": "127.0.0.1", "port": 1228, "schema": "editor_stores", "username": "******", "password": "******"}
        host = db_spec['host'] if 'host' in db_spec else '127.0.0.1'
        port = db_spec['port'] if 'port' in db_spec else 3306
        schema = db_spec['schema']
        db_user = db_spec['username']
        db_pwd = db_spec['password']
        ssh_user, ssh_host, ssh_port, dst = (None, None, 22, '')
        if 'SSH_USER' in kwargs:
            ssh_user = kwargs['SSH_USER']
        if 'SSH_HOST' in kwargs:
            ssh_host = kwargs['SSH_HOST']
        if 'SSH_PORT' in kwargs:
            ssh_port = int(kwargs['SSH_PORT'])
        if 'SSH_DST' in kwargs:
            dst = kwargs['SSH_DST']
    except (AttributeError, KeyError):
        logger.exception('Invalid database specification.')
        return

    host_str = str.format('-h{0}', host) if host else ''
    port_str = str.format('-P{0}', port) if port else ''
    tmp_file = '/tmp/single_backup.sql'

    # Dump with --single-transaction so InnoDB tables export consistently
    # without locking.
    # NOTE(review): the password is passed on the mysqldump command line,
    # which exposes it to `ps` -- consider a defaults file.
    logger.info('EXPORTING...')
    os.system(
        str.format(
            'mysqldump {3} {4} -u {0} -p{1} --single-transaction {2} > {5}',
            db_user, db_pwd, schema, host_str, port_str, tmp_file))

    # Compress the dump into a timestamped 7z archive.
    logger.info('ZIPPING...')
    backup_name = os.path.join(
        getattr(glob, 'STORAGE')['STORAGE_PATH'], 'backups',
        str.format('{0}_auto_backup.7z',
                   datetime.datetime.now().strftime('%Y%m%d_%H%M%S')))
    os.system(
        str.format('7z a -mx7 {0} {1} > /dev/null', backup_name, tmp_file))

    # Remove the temporary SQL dump.
    logger.info('REMOVING TEMPORARY SQL FILES...')
    os.remove(tmp_file)

    # If SSH details were supplied, upload the archive to the remote
    # server as an off-site copy.
    if ssh_user and ssh_host and ssh_port:
        logger.info('UPLOADING...')
        ssh_port_str = str.format('-P {0}', ssh_port) if ssh_port else ''
        os.system(
            str.format('scp {0} {4} {1}@{2}:{3} > /dev/null', ssh_port_str,
                       ssh_user, ssh_host, dst, backup_name))

    logger.info(str.format('AUTO BACKUP COMPLETED: {0}', backup_name))
def translate_text_to(gs, text, to, source='', backup_gs=None):
    """Translate *text* into language *to* with the Goslate client *gs*,
    falling back to *backup_gs* (e.g. a proxied client) on failure.

    :param gs: primary Goslate-like client exposing translate(text, to, source).
    :param text: text to translate (unicode or utf-8 bytes).
    :param to: target language code.
    :param source: source language code; auto-set to 'zh-cn' for
        Traditional Chinese input when empty.
    :param backup_gs: optional fallback client.
    :return: translated text, or None when both clients fail.
    """
    logger = get_logger()
    try:
        text = text.encode('utf-8')
    except Exception:
        # Already a byte string (or not encodable) -- use it as-is.
        pass
    if not source and is_cht(text):
        source = 'zh-cn'
    result = None
    # Narrowed from bare `except:` so KeyboardInterrupt/SystemExit are not
    # swallowed; translation failure is still best-effort.
    try:
        result = gs.translate(text, to, source)
    except Exception:
        if not backup_gs:
            logger.info(
                str.format(
                    "Error: gs translate error with text : {0} source : {1} target : {2}",
                    text, source, to))
    if not result and backup_gs:
        try:
            result = backup_gs.translate(text, to, source)
        except Exception:
            logger.info(
                str.format(
                    "Error: backupgs translate error with text : {0} source : {1} target : {2}",
                    text, source, to))
    return result
def run(cls, **kwargs):
    """Detect price/discount changes, optionally upload the trend report
    over scp, and republish every brand that had a change.

    Optional kwargs: logger, dst (remote path for the report),
    ssh ("user@host" or "user@host:port").
    """
    logger = kwargs['logger'] if 'logger' in kwargs else get_logger()
    logger.info('PRICE-CHANGE DETECTION STARTED')
    logger.info('CLEARING OUTDATED RECORDS')
    cls.tag_outdated(**kwargs)
    logger.info('GENERATING PRICE TRENDS')
    result = cls.tag_changed(**kwargs)
    if not result:
        logger.info('NO PRICE TRENDS DETECTED')
        return

    dst = kwargs['dst'] if 'dst' in kwargs and kwargs[
        'dst'] else '~/push_works/push.log'
    ssh_user, ssh_host, ssh_port = [None] * 3
    if 'ssh' in kwargs and kwargs['ssh']:
        # Accepts "user@host" or "user@host:port".
        ssh_str = kwargs['ssh']
        ssh_user, ssh = ssh_str.split('@')
        if ':' in ssh:
            ssh_host, ssh_port = ssh.split(':')
        else:
            ssh_host = ssh
            ssh_port = 22  # default ssh port

    if ssh_host:
        # No SSH info means the result does not need to be pushed to a
        # remote server.
        logger.info('UPLOADING PRICE TRENDS')
        # Write the detected trends to a temporary file first.
        file_name = str.format(
            '/tmp/price_change_{0}.log',
            datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        with open(file_name, 'wb') as f:
            f.write(json.dumps(result, ensure_ascii=False).encode('utf-8'))
        ssh_port_str = str.format('-P {0}', ssh_port) if ssh_port else ''
        ssh_cmd = str.format('scp {0} {4} {1}@{2}:{3} > /dev/null',
                             ssh_port_str, ssh_user, ssh_host, dst, file_name)
        logger.info(str.format('UPLOADING: {0}', ssh_cmd))
        os.system(ssh_cmd)
        os.remove(file_name)

    # Republish the products of every brand that had any kind of change.
    updated_brands = set([])
    for k in ('discount_down', 'discount_up', 'price_down', 'price_up'):
        # BUG FIX: the old code always read result['discount_down'] here,
        # so brands with only discount-up / price-up / price-down changes
        # were never republished.
        updated_brands = updated_brands.union(result.get(k, {}).keys())
    for brand in updated_brands:
        PublishRelease(brand).run()

    logger.info('DONE')
def run(cls, **kwargs):
    """Scan products_release and raise an alert whenever an image path
    does not carry the owning brand_id prefix, i.e. the record points at
    another brand's images.
    """
    logger = kwargs['logger'] if 'logger' in kwargs else get_logger()
    logger.info('IMAGE CHECK ALERT STARTED')
    with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
        rs = db.query(
            'SELECT fingerprint, brand_id, image_list, cover_image FROM products_release',
            use_result=True)
        while True:
            # Walk the result set in batches of 100 rows.
            bulk = rs.fetch_row(maxrows=100)
            if not bulk:
                break
            is_err = False
            for fingerprint, brand_id, jlist, jcover in bulk:
                try:
                    # Every image path is expected to start with "<brand_id>_".
                    image_list = json.loads(jlist)
                    for path in [tmp['path'] for tmp in image_list]:
                        if not re.search(str.format(r'^{0}_', brand_id), path):
                            content = str.format(
                                'fingerprint={0}, image_list={1}', fingerprint,
                                jlist)
                            logger.error(content)
                            cls.alert(
                                str.format('INVALID IMAGES: {0}!!!',
                                           fingerprint), content)
                            is_err = True
                            break
                    # The cover image must obey the same naming rule.
                    cover = json.loads(jcover)
                    if not re.search(str.format(r'^{0}_', brand_id),
                                     cover['path']):
                        content = str.format('fingerprint={0}, jcover={1}',
                                             fingerprint, jcover)
                        logger.error(content)
                        cls.alert(
                            str.format('INVALID IMAGES: {0}!!!', fingerprint),
                            content)
                        is_err = True
                        break
                except:
                    # Unparsable JSON or missing keys: alert with the raw
                    # payloads and re-raise so the failure stays visible.
                    cls.alert(
                        str.format('INVALID IMAGES: {0}!!!', fingerprint),
                        str.format(
                            'fingerprint={0}, jlist={1}, jcover={2}',
                            fingerprint, jlist, jcover))
                    raise
            # Stop scanning after the first batch containing an error.
            if is_err:
                break
    logger.info('DONE!')
def run(self): logger = get_logger() # 只处理关键国家的数据 tmp = info.region_info() key_regions = filter(lambda val: tmp[val]['status'] == 1, tmp) with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: # 删除原有的数据 logger.info( str.format('DELETING OLD RECORDS: brand_id={0}', self.brand_id)) db.execute( str.format('DELETE FROM products_release WHERE brand_id={0}', self.brand_id)) fp_list = [ tmp[0] for tmp in db.query( str.format( 'SELECT fingerprint FROM products WHERE brand_id={0} GROUP BY fingerprint', self.brand_id)).fetch_row(maxrows=0) ] self.tot = len(fp_list) self.progress = 0 # 最多每100次就需要提交一次事务 transaction_max = 100 logger.info( str.format('TOT: {0} fingerprints, brand_id={1}', self.tot, self.brand_id)) db.start_transaction() for self.progress in xrange(self.tot): if self.progress % transaction_max == 0: db.commit() db.start_transaction() logger.info( str.format( 'PROCESSED {0}/{1} fingerprints, brand_id={2}', self.progress, self.tot, self.brand_id)) fp = fp_list[self.progress] model_list = list( filter( lambda val: val['region'] in key_regions, db.query_match(['*'], 'products', { 'fingerprint': fp }).fetch_row(maxrows=0, how=1))) if model_list: self.merge_prods(model_list, db) db.commit() logger.info(str.format('DONE, brand_id={0}', self.brand_id))
def run(cls, **kwargs):
    """Run one translation pass over the products table.

    Optional kwargs: logger, db_spec (default 'DB_SPEC'),
    start (default 0), count (default 100).
    """
    cls.running = True
    if 'logger' in kwargs:
        logger = kwargs['logger']
    else:
        logger = get_logger()
    logger.info("Translate tasker start")

    db_spec = kwargs.get('db_spec', 'DB_SPEC')
    start = kwargs.get('start', 0)
    count = kwargs.get('count', 100)
    translate_main(start, count, logger, db_spec)

    logger.info("Translate tasker end")
    cls.running = False
def run(cls, logger=None, **kwargs):
    """Monitor scheduler: clear pids of dead monitor/recrawl processes,
    then spawn new monitor and recrawl processes up to the configured
    limits (gs.MONITOR['MAX_MONITOR'], default 6, and
    gs.MONITOR['MAX_RECRAWLER'], default 12).
    """
    # BUG FIX: the old code tested `'logger' in kwargs`, which is never
    # true because `logger` is a named parameter -- a caller-supplied
    # logger was silently replaced.
    logger = logger if logger else get_logger(logger_name='monitor')
    logger.info('Monitor STARTED!!!')

    # Monitor process quota, recrawl process quota.
    try:
        monitor_no = getattr(gs, 'MONITOR')['MAX_MONITOR']
    except (AttributeError, KeyError):
        monitor_no = 6
    try:
        recrawl_no = getattr(gs, 'MONITOR')['MAX_RECRAWLER']
    except (AttributeError, KeyError):
        recrawl_no = 12
    # NOTE(review): interval/limit_time fed a re-crawl age trigger that is
    # currently disabled; kept so the kwargs interface stays unchanged.
    interval = kwargs['interval'] if 'interval' in kwargs else 7
    limit_time = datetime.datetime.now() - datetime.timedelta(interval)

    with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
        pid_list = psutil.pids()
        rs = db.query_match(
            ['idmonitor', 'parameter', 'monitor_status', 'monitor_pid', 'recrawl_pid'],
            'monitor_status', {'enabled': 1}).fetch_row(maxrows=0)
        # Clear recorded pids whose processes are no longer alive.
        for idmonitor, parameter, monitor_status, monitor_pid, recrawl_pid in rs:
            if monitor_pid and int(monitor_pid) not in pid_list:
                db.update({'monitor_pid': None}, 'monitor_status',
                          str.format('idmonitor="{0}"', idmonitor))
            if recrawl_pid and int(recrawl_pid) not in pid_list:
                db.update({'recrawl_pid': None}, 'monitor_status',
                          str.format('idmonitor="{0}"', idmonitor))

        # Re-read, ordered by priority then timestamp, so free slots are
        # filled oldest-first within each priority level.
        rs_new = db.query_match(
            ['idmonitor', 'parameter', 'monitor_status', 'monitor_pid',
             'recrawl_pid', 'timestamp'],
            'monitor_status', {'enabled': 1},
            tail_str='ORDER BY priority desc, timestamp').fetch_row(maxrows=0)

        idle_monitor_list = []
        idle_recrawl_list = []
        monitor_list = []
        recrawl_list = []
        for idmonitor, parameter, monitor_status, monitor_pid, recrawl_pid, timestamp in rs_new:
            # Track live pids so we know how many slots remain free.
            if monitor_pid is not None:
                monitor_list.append(int(monitor_pid))
            if recrawl_pid is not None:
                recrawl_list.append(int(recrawl_pid))
            # Entries with no running process are candidates for work:
            # status '0' -> needs monitoring, status '1' -> needs recrawl.
            if monitor_status == '0' and monitor_pid is None and recrawl_pid is None:
                idle_monitor_list.append((idmonitor, parameter, timestamp))
            if monitor_status == '1' and monitor_pid is None and recrawl_pid is None:
                idle_recrawl_list.append((idmonitor, parameter, timestamp))

        # Start monitors for idle entries until the quota is reached.
        if len(monitor_list) < monitor_no:
            if len(idle_monitor_list) > monitor_no - len(monitor_list):
                ready_monitor = idle_monitor_list[:(monitor_no - len(monitor_list))]
            else:
                ready_monitor = idle_monitor_list
            for idmonitor, parameter, timestamp in ready_monitor:
                args = json.loads(parameter)
                # e.g. monitor --brand 10009 --region fr --idmonitor 1931 -v
                logger.info('Monitor started--> idmonitor:%s, brand_id:%s, region:%s' % (
                    idmonitor, args['brand_id'], args['region']))
                spawn_process(
                    os.path.join(scripts.__path__[0], 'run_crawler.py'),
                    'monitor --brand %s --region %s --idmonitor %s' %
                    (args['brand_id'], args['region'], idmonitor))

        # Start recrawlers; monitor_status is reset when a recrawl ends.
        if len(recrawl_list) < recrawl_no:
            if len(idle_recrawl_list) > recrawl_no - len(recrawl_list):
                ready_recrawl = idle_recrawl_list[:(recrawl_no - len(recrawl_list))]
            else:
                ready_recrawl = idle_recrawl_list
            for idmonitor, parameter, timestamp in ready_recrawl:
                args = json.loads(parameter)
                args['idmonitor'] = idmonitor
                para = '|'.join([str(idmonitor), str(args['brand_id']), args['region']])
                logger.info('Recrawl started--> idmonitor:%s, brand_id:%s, region:%s' % (
                    idmonitor, args['brand_id'], args['region']))
                spawn_process(os.path.join(scripts.__path__[0], 'recrawler.py'), para)
brand = int(brand) idm = int(idm) parameter = {'idmonitor': idm, 'brand_id': brand, 'region': region} with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: db.update({'monitor_status': 0, 'monitor_pid': None, 'recrawl_pid': os.getpid()}, 'monitor_status', str.format('idmonitor={0}', parameter['idmonitor'])) #todo hardcode for DKNY, need to add 'is_offline' for DknySpider if brand == 10108: os.system( 'python %s %s -r %s' % ( os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region'])) else: os.system('python %s update --brand %s -r %s' % ( os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region'])) os.system( 'python %s %s -r %s' % ( os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region'])) # os.system('python %s process-tags --cond brand_id=%s' % parameter['brand_id']) # os.system('python %s release --brand %s' % parameter['brand_id']) with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db: db.update({'recrawl_pid': None, 'priority': None}, 'monitor_status', str.format('idmonitor={0}', parameter['idmonitor'])) logger = get_logger(logger_name='monitor') logger.info('Recrawl ended--> idmonitor:%s, brand_id:%s, region:%s' % ( parameter['idmonitor'], parameter['brand_id'], parameter['region']))
def merge_prods(self, prods, db):
    """Merge the per-region product rows for one fingerprint into a single
    products_release record, choosing the main record by region priority.

    :param prods: list of product rows (dicts) sharing one fingerprint,
        one row per region.
    :param db: open database wrapper used for auxiliary queries and the
        final insert.
    """
    logger = get_logger()

    # Convert every value in prods to unicode.
    for idx in xrange(len(prods)):
        prods[idx] = {k: unicodify(prods[idx][k]) for k in prods[idx]}

    # Pick the primary record: the row whose region ranks first in
    # self.region_order.
    sorted_prods = sorted(prods,
                          key=lambda k: self.region_order[k['region']])
    main_entry = sorted_prods[0]

    entry = {
        k: unicodify(main_entry[k])
        for k in ('brand_id', 'model', 'name', 'description', 'details',
                  'gender', 'category', 'color', 'url', 'fingerprint')
    }
    if not entry['name']:
        entry['name'] = u'单品'

    # MFashion tags attached to any of the merged rows.
    mfashion_tags = [
        unicodify(val[0]) for val in db.query(
            str.format(
                'SELECT DISTINCT p1.tag FROM mfashion_tags AS p1 '
                'JOIN products_mfashion_tags AS p2 ON p1.idmfashion_tags=p2.id_mfashion_tags '
                'WHERE p2.idproducts IN ({0})', ','.join(
                    val['idproducts'] for val in prods))).fetch_row(maxrows=0)
    ]
    # original_tags lookup is currently disabled:
    # original_tags = [int(val[0]) for val in
    #                  db.query(str.format('SELECT DISTINCT id_original_tags FROM products_original_tags '
    #                                      'WHERE idproducts IN ({0})',
    #                                      ','.join(val['idproducts'] for val in prods))).fetch_row(
    #                      maxrows=0)]
    entry['mfashion_tags'] = json.dumps(mfashion_tags, ensure_ascii=False)
    entry[
        'original_tags'] = ''  #json.dumps(original_tags, ensure_ascii=False)
    entry['region_list'] = json.dumps([val['region'] for val in prods],
                                      ensure_ascii=False)
    entry['brandname_e'] = info.brand_info()[int(
        entry['brand_id'])]['brandname_e']
    entry['brandname_c'] = info.brand_info()[int(
        entry['brand_id'])]['brandname_c']

    # Per-pid lookup tables built from the merged rows.
    url_dict = {int(val['idproducts']): val['url'] for val in prods}
    offline_dict = {
        int(val['idproducts']): int(val['offline'])
        for val in prods
    }
    price_change_dict = {
        int(val['idproducts']): val['price_change']
        for val in prods
    }
    update_time_dict = {
        int(val['idproducts']):
        datetime.datetime.strptime(val['update_time'], "%Y-%m-%d %H:%M:%S")
        for val in prods
    }
    # Mapping between pid and region.
    region_dict = {int(val['idproducts']): val['region'] for val in prods}

    price_list = {}
    # Merge the complete price history, keyed by pid, newest rows first.
    for item in db.query_match(
            ['price', 'price_discount', 'currency', 'date', 'idproducts'],
            self.price_hist, {},
            str.format('idproducts IN ({0})',
                       ','.join(val['idproducts'] for val in prods)),
            tail_str='ORDER BY idprice_history DESC').fetch_row(maxrows=0,
                                                                how=1):
        pid = int(item['idproducts'])
        region = region_dict[pid]
        offline = offline_dict[pid]
        if pid not in price_list:
            price_list[pid] = []
        price = float(item['price']) if item['price'] else None
        # Offline products never expose a discount price.
        if offline == 0:
            price_discount = float(
                item['price_discount']) if item['price_discount'] else None
        else:
            price_discount = None
        price_list[pid].append({
            'price': price,
            'price_discount': price_discount,
            'currency': item['currency'],
            'date': datetime.datetime.strptime(item['date'],
                                               "%Y-%m-%d %H:%M:%S"),
            'price_change': price_change_dict[pid],
            'url': url_dict[pid],
            'offline': offline,
            'code': region,
            'country': info.region_info()[region]['name_c']
        })

    # Convert an amount to its CNY-equivalent via the stored rate.
    currency_conv = lambda val, currency: info.currency_info()[currency][
        'rate'] * val

    # Collapse each pid's history to one representative record.
    # Strategy: if the latest row is priced, use it as-is; if the latest
    # price is None, fall back to the most recent priced row and blank
    # its discount; if no row is priced at all, drop the pid.
    for pid, pid_data in price_list.items():
        # Rows arrive newest-first; valid_pid_data keeps priced rows only.
        valid_pid_data = filter(lambda val: val['price'], pid_data)
        if pid_data[0]['price']:
            # Normal case: the latest row is priced.
            price_list[pid] = pid_data[0]
            # No current discount: check whether the base price quietly
            # dropped within the last week (treated as a discount).
            currency = valid_pid_data[0]['currency']
            if price_change_dict[pid] == 'D' and len(
                    valid_pid_data
            ) > 1 and currency == valid_pid_data[1]['currency']:
                if not pid_data[0]['price_discount'] and currency_conv(
                        valid_pid_data[1]['price'],
                        currency) > currency_conv(
                            valid_pid_data[0]['price'],
                            currency) and (datetime.datetime.now() -
                                           valid_pid_data[0]['date']
                                           ) < datetime.timedelta(7):
                    price_list[pid]['price_discount'] = price_list[pid][
                        'price']
                    price_list[pid]['price'] = valid_pid_data[1]['price']
        else:
            # Latest row unpriced: backtrack to the first priced row.
            if not valid_pid_data:
                # No price information at all: drop this pid.
                price_list.pop(pid)
            else:
                # Take the most recent priced row, clear the discount,
                # keep the latest offline state.
                tmp = valid_pid_data[0]
                tmp['price_discount'] = None
                price_list[pid] = tmp
        # The oldest priced row's date becomes this pid's fetch_time.
        if valid_pid_data and pid in price_list:
            price_list[pid]['fetch_time'] = valid_pid_data[-1]['date']
            price_list[pid]['idproducts'] = pid

    # Without any price information the product is not released.
    if not price_list:
        return

    entry['price_list'] = sorted(
        price_list.values(),
        key=lambda val: self.region_order[val['code']])
    entry = release_filter(entry, logger)
    if not entry['price_list']:
        return
    entry['offline'] = entry['price_list'][0]['offline']
    # The model's fetch_time is the earliest fetch_time among its pids.
    entry['fetch_time'] = min(
        tmp['fetch_time']
        for tmp in entry['price_list']).strftime("%Y-%m-%d %H:%M:%S")

    # Build the sortable price candidate list.
    alt_prices = []
    for price_item in entry['price_list']:
        # Serialize datetimes so they can be stored in the release row.
        price_item['date'] = price_item['date'].strftime(
            "%Y-%m-%d %H:%M:%S")
        price_item['fetch_time'] = price_item['fetch_time'].strftime(
            "%Y-%m-%d %H:%M:%S")
        if price_item['offline'] == 0:
            if price_item['price_discount']:
                tmp = map(
                    lambda key_name: currency_conv(price_item[key_name],
                                                   price_item['currency']),
                    ('price', 'price_discount'))
                tmp.extend([
                    price_item[key]
                    for key in ('price_change', 'price', 'price_discount',
                                'currency', 'date', 'idproducts')
                ])
                alt_prices.append(tmp)
            else:
                alt_prices.append([
                    currency_conv(price_item['price'],
                                  price_item['currency']), None,
                    price_item['price_change'], price_item['price'],
                    price_item['price_discount'], price_item['currency'],
                    price_item['date'], price_item['idproducts']
                ])
        else:
            alt_prices.append([
                currency_conv(price_item['price'],
                              price_item['currency']), None,
                price_item['price_change'], price_item['price'],
                price_item['price_discount'], price_item['currency'],
                price_item['date'], price_item['idproducts']
            ])

    # Published price: the discount price when present, else base price.
    alt_prices = sorted(alt_prices,
                        key=lambda val: val[1] if val[1] else val[0])
    entry['price'], entry['price_discount'] = alt_prices[
        0][:2] if alt_prices else (None, ) * 2
    entry['price_change'] = alt_prices[0][2] if alt_prices else '0'
    entry['o_price'], entry['o_discount'], entry[
        'o_currency'] = alt_prices[0][3:6]
    # Strip the internal idproducts field before serializing.
    for i in xrange(len(entry['price_list'])):
        entry['price_list'][i].pop('idproducts')
    entry['price_list'] = json.dumps(entry['price_list'],
                                     ensure_ascii=False)
    entry['last_price_ts'] = alt_prices[0][6]
    entry['product_update_ts'] = update_time_dict[
        alt_prices[0][7]].strftime("%Y-%m-%d %H:%M:%S")

    # Full-text search field.
    search_text = u' '.join(entry[tmp] if entry[tmp] else ''
                            for tmp in ('name', 'description', 'details',
                                        'model', 'brandname_e',
                                        'brandname_c'))
    search_color = u' '.join(entry['color']) if entry['color'] else u''
    rs = db.query_match(
        ['description_cn', 'description_en', 'details_cn', 'details_en'],
        'products_translate', {
            'fingerprint': entry['fingerprint']
        }).fetch_row()
    part_translate = u' ' + u' '.join(
        unicodify(tmp)
        for tmp in filter(lambda v: v, rs[0])) if rs else ' '
    search_tags = u' '.join(list(set(mfashion_tags)))
    entry['searchtext'] = unicode.format(u'{0} {1} {2} {3}', search_text,
                                         part_translate, search_tags,
                                         search_color)

    p = prods[0]
    checksums = []
    # Keep checksum rows unique while preserving idproducts_image order.
    for tmp in db.query(
            str.format(
                '''
                SELECT p1.checksum, p3.width, p3.height, p3.path FROM products_image AS p1
                JOIN products AS p2 ON p1.fingerprint=p2.fingerprint
                JOIN images_store AS p3 ON p1.checksum=p3.checksum
                WHERE p2.fingerprint="{0}" ORDER BY p1.idproducts_image
                ''', p['fingerprint'])).fetch_row(maxrows=0, how=1):
        if tmp not in checksums:
            checksums.append(tmp)
    # Without images the product is not added to the release table yet.
    if not checksums:
        return

    image_list = []
    for val in checksums:
        tmp = {
            'path': val['path'],
            'width': int(val['width']),
            'height': int(val['height'])
        }
        # The first image doubles as the cover image.
        if not image_list:
            entry['cover_image'] = json.dumps(tmp, ensure_ascii=False)
        image_list.append(tmp)
    entry['image_list'] = json.dumps(image_list[:self.max_images],
                                     ensure_ascii=False)

    db.insert(entry, 'products_release')
def run(cls, logger=None, **param):
    """Parse crawler log files, collect ERROR entries (with tracebacks)
    that fall inside the [start, stop] window, and e-mail a report.

    Creation time, model and the new/update kind are derived from the
    file NAME, not from filesystem attributes, so an accidental
    delete/restore cannot skew the timestamps.

    Optional params: log-path, interval (days back, default 1),
    start/stop ('%Y%m%d%H%M%S' strings).
    """
    # BUG FIX: the old code tested `'logger' in param`, which is never
    # true because `logger` is a named parameter -- a caller-supplied
    # logger was silently replaced.
    logger = logger if logger else get_logger()
    logger.info('PARSE LOG STARTED!!!')
    max_err_cnt = 10  # at most this many distinct errors reported per file
    if 'log-path' in param:
        log_path = param['log-path']
    else:
        log_path = os.sep.join(
            (getattr(gs, 'STORAGE')['STORAGE_PATH'], 'products', 'log'))
    if 'interval' in param:
        interval = param['interval']
    else:
        interval = 1
    if 'start' in param and 'stop' in param:
        start_time = param['start']
        stop_time = param['stop']
    else:
        # Default window: the last `interval` days up to now.
        cur = datetime.datetime.now()
        to_time = cur.strftime('%Y%m%d%H%M%S')
        from_time = (cur - datetime.timedelta(interval)).strftime('%Y%m%d%H%M%S')
        start_time = from_time
        stop_time = to_time

    # Resolve report recipients from the configured e-mail groups.
    try:
        group = getattr(gs, 'REPORTS')['CRAWLER_STATUS']
        if not isinstance(group, list):
            group = [group]
        recipients = {}
        for g in group:
            for key, value in getattr(gs, 'EMAIL_GROUPS')[g].items():
                recipients[key] = value
    except (TypeError, AttributeError, KeyError):
        return

    files = os.listdir(log_path)
    process_error = {}
    for file in files:
        # Derive metadata from the file name,
        # e.g. 'update_10192_20140304172558.log'.
        file_name = os.path.basename(os.path.splitext(file)[0])
        if os.path.splitext(file)[1] != '.log':
            continue
        if re.search(r'^update', file_name) and re.findall(
                r'_\d{14}', file_name):
            tmp = re.split(r'_', file_name)
            if len(tmp) == 2:
                (update, dt) = tmp
                model = 'MIX'
            elif len(tmp) == 3:
                (update, model, dt) = tmp
            else:
                # BUG FIX: other shapes used to fall through with `dt`
                # unbound (NameError or stale value from a prior file).
                continue
        elif re.search(r'^\d{5}_', file_name) and re.findall(
                r'_\d{8}', file_name):
            update = ''
            dt = re.findall(r'_(\d{8})', file_name)[0]
            dt += '000000'
            model = '_'.join(file_name.split('_')[1:-1])
        else:
            # BUG FIX: names matching neither pattern used to fall through
            # with `dt` unbound.
            continue
        # Skip files created after stop_time.
        if int(stop_time) < int(dt):
            continue
        # BUG FIX: the file handle used to leak; read inside a with-block.
        with open('%s%s%s' % (log_path, os.sep, file), 'rb') as f:
            lines = f.readlines()
        error_count = 0
        error_time = ''
        error_info = ''
        error_line = 0
        # Default "last written" time is now, in case no timestamped line
        # is found below.
        last_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        i = 0
        error_list = []
        # Find the timestamp of the last timestamped log line.
        for l in xrange(len(lines) - 1, 0, -1):
            last = re.findall(
                r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[[^\[\]]*\]',
                lines[l])
            if last:
                last_time = ''.join(last[0])
                break
        # Skip files that finished before start_time.
        if int(start_time) > int(last_time):
            continue
        process_error[file] = []
        # Collect indices of in-window ERROR lines; tab-indented
        # continuation lines (tracebacks) are appended as a nested list.
        while i < len(lines):
            if re.findall(
                    r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\+\d{4} \[\S+\] (ERROR|Error)',
                    lines[i]):
                get_time = ''.join(
                    re.findall(
                        r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[\S+\] ERROR:',
                        lines[i])[0])
                if int(get_time) > int(stop_time) or int(
                        get_time) < int(start_time):
                    pass
                else:
                    error_list.append(i)
                    t = []
                    while i < len(lines) - 1 and lines[
                            i + 1].startswith('\t') and (
                                lines[i + 1].strip() != ''):
                        t.append(i + 1)
                        i += 1
                    if t:
                        error_list.append(t)
            i += 1
        process_error[file] = []
        for index in xrange(0, len(error_list)):
            if len(process_error[file]) >= max_err_cnt:
                break
            if type(error_list[index]) is not list:
                # Plain ERROR line: extract time and message; runs of
                # identical consecutive messages collapse into one entry
                # with an incremented error_count.
                error_lineno = error_list[index] + 1
                error_time = ''.join(
                    re.findall(
                        r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[\S+\] ERROR:',
                        lines[error_list[index]])[0])
                tmp2 = re.findall(
                    r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\+\d{4} \[\S+\] (ERROR: .*) <GET.*?>(.*)',
                    lines[error_list[index]])
                if tmp2:
                    error_info = ''.join(tmp2[0])
                else:
                    tmp2 = re.findall(
                        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\+\d{4} \[\S+\] (ERROR: .*) (.*)',
                        lines[error_list[index]])
                    if tmp2:
                        error_info = ''.join(tmp2[0])
                    else:
                        error_info = 'UNKNOWN ERROR'
                if process_error[file] and process_error[file][-1][
                        'error_info'] != error_info:
                    process_error[file].append({
                        'line_no': error_lineno,
                        'error_time': error_time,
                        'error_info': error_info,
                        'error_count': 1,
                        'Traceback': []
                    })
                elif process_error[file] and process_error[file][-1][
                        'error_info'] == error_info:
                    process_error[file][-1]['error_count'] += 1
                else:
                    process_error[file].append({
                        'line_no': error_lineno,
                        'error_time': error_time,
                        'error_info': error_info,
                        'error_count': 1,
                        'Traceback': []
                    })
            else:
                # Traceback block: walk it bottom-up to find the deepest
                # project frame plus the raised exception type/detail.
                for i in range(error_list[index][-1],
                               error_list[index][0] + 1, -1):
                    detail = re.findall(
                        r'File "(/home/rose/MStore\S+)", line (\S+), in (\S+)',
                        lines[i])
                    if detail:
                        (error_file, error_file_line,
                         error_function) = detail[0]
                        error_detail = lines[i + 1].strip()
                        if re.findall(r'exceptions.(\S+): ',
                                      lines[error_list[index][-1]]):
                            exception = re.findall(
                                r'exceptions.(\S+): ',
                                lines[error_list[index][-1]])[0]
                        else:
                            exception = ''
                        if re.findall(r'exceptions.\S+: (.*)',
                                      lines[error_list[index][-1]]):
                            exception_detail = \
                                re.findall(r'exceptions.\S+: (.*)',
                                           lines[error_list[index][-1]])[0]
                        else:
                            exception_detail = ''
                        process_error[file][-1]['Traceback'].append([
                            error_file, error_file_line, error_function,
                            error_detail, exception, exception_detail
                        ])
                        break
    cls.sendemail(process_error, recipients)
    logger.info('PARSE LOG EMAIL SENDED!!!')
    logger.info('PARSE LOG ENDED!!!')
        kclass = __import__(name)
    else:
        # Dotted path: import the module, then pull the class off it.
        mod_name, mod_class = '.'.join(tmp[:-1]), tmp[-1]
        mod = __import__(mod_name, fromlist=[mod_class])
        kclass = getattr(mod, mod_class)
    return kclass


if __name__ == "__main__":
    # Entry point: run every task configured under the requested section
    # (default CRON_TASK_DEFAULT) by importing its class and calling run().
    ret = parse_args(sys.argv)
    section = ret['cmd']
    if not section:
        section = 'CRON_TASK_DEFAULT'
    logger = get_logger()
    logger.info(str.format('TASK {0} STARTED.', section))
    for task_name, task_param in getattr(glob, section, {}).items():
        try:
            class_name = task_param['classname']
            func = getattr(my_import(class_name), 'run')
            if 'param' in task_param:
                func(**task_param['param'])
            else:
                func()
        except (KeyError,):
            # Misconfigured task entry: log with traceback and move on.
            # NOTE(review): this rebinds `logger` to None (.exception()
            # returns None) -- looks unintended, though `logger` is not
            # used again afterwards.
            logger = get_logger().exception(unicode.format(u'Invalid task name: {0}', unicodify(task_name)).encode('utf-8'))
def run(cls, logger=None, **kwargs):
    """Compute crawl durations from spider log files and upsert them into
    the brand_duration table.

    For every brand id (5-digit log-file prefix) found in the log
    directory, take the newest log whose first line marks the spider
    start and whose last line marks a clean finish, and record
    start/end time, duration (seconds) and region count.
    """
    # BUG FIX: the old code tested `'logger' in kwargs`, which is never
    # true because `logger` is a named parameter -- a caller-supplied
    # logger was silently replaced.
    logger = logger if logger else get_logger()
    logger.info('Parse Duration Check STARTED')
    log_path = os.sep.join((getattr(gs, 'STORAGE')['STORAGE_PATH'], 'products', 'log'))

    # Brand ids are the 5-digit prefixes of the log file names.
    tmp = []
    for x in os.listdir(log_path):
        if re.findall(r'^\d{5}', x):
            tmp.append(re.findall(r'^(\d{5})', x)[0])
    brands = list(set(tmp))

    if brands:
        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            for brand in brands:
                # Newest-first list of this brand's log files.
                file_list = []
                files = os.listdir(log_path)
                for file in files:
                    if re.search(r'^%s' % brand, file):
                        file_list.append(file)
                file_list.sort(reverse=True)
                for log in file_list:
                    # BUG FIX: file handles used to leak; read inside a
                    # with-block.
                    with open(log_path + os.sep + log, 'rb') as f:
                        lines = f.readlines()
                    if not lines:
                        continue
                    st = re.findall(
                        r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[[^\[\]]*\] INFO: Spider started, processing the following regions:',
                        lines[0])
                    ed = re.findall(
                        r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[[^\[\]]*\] INFO: Spider closed \(finished\)',
                        lines[-1])
                    regions = re.findall(
                        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\+\d{4} \[[^\[\]]*\] INFO: Spider started, processing the following regions: (.*)',
                        lines[0])
                    if st and ed and regions:
                        start_time = '%s-%s-%s %s:%s:%s' % (st[0])
                        end_time = '%s-%s-%s %s:%s:%s' % (ed[0])
                        regions_no = len([x.strip() for x in regions[0].split(',')])
                        duration = datetime.datetime.strptime(
                            ''.join(ed[0]), '%Y%m%d%H%M%S') - datetime.datetime.strptime(
                                ''.join(st[0]), '%Y%m%d%H%M%S')
                        db.insert({'start_time': start_time, 'end_time': end_time,
                                   'duration': duration.seconds,
                                   'country_cnt': regions_no, 'brand_id': brand},
                                  'brand_duration', replace=True)
                        logger.info('%s brand-id:%s duration time update' % (
                            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), brand))
                        # Only the newest complete log per brand counts.
                        break
    logger.info('Parse Duration Check ENDED!!!')
def translate_main(start=0, count=100, logger=None, db_spec='DB_SPEC'):
    """Walk product fingerprints in batches and fill products_translate
    with Chinese/English descriptions and details, translating via
    Goslate (with a proxied backup client) where needed.

    :param start: fingerprint offset to begin at.
    :param count: batch size per DB query.
    :param logger: optional logger; created when omitted.
    :param db_spec: key into global_settings.DATABASE.
    """
    if not logger:
        logger = get_logger()
    with RoseVisionDb(getattr(global_settings, 'DATABASE')[db_spec]) as db:
        gs = goslate.Goslate()
        # The backup client goes through an HTTP proxy, for when the
        # direct client is blocked or throttled.
        proxy_name = get_proxy()
        proxy = urllib2.ProxyHandler({'http': proxy_name
                                      }) if proxy_name else None
        opener = urllib2.build_opener(proxy)
        backup_gs = goslate.Goslate(opener=opener, debug=True)
        sorted_region = get_sorted_region(db)
        fingerprint_start = start
        fingerprint_count = count
        logger.info(str.format("Translate process start"))
        while 1:
            fingerprints = get_fingerprints(db, fingerprint_start,
                                            fingerprint_count)
            if not fingerprints:
                logger.info(str.format("Translate process end"))
                break
            else:
                logger.info(
                    str.format("Translate process offset : {0} count : {1}",
                               fingerprint_start, len(fingerprints)))
                fingerprint_start += fingerprint_count
            for fingerprint in fingerprints:
                # Skip fingerprints that already have a translation row.
                is_exist = db.query_match({'fingerprint'},
                                          'products_translate', {
                                              'fingerprint': fingerprint
                                          }).num_rows()
                if is_exist:
                    continue
                product_infos = get_product(db, fingerprint)
                # Order regions by configured weight.
                product_infos = sorted(
                    product_infos.items(),
                    key=lambda e: sorted_region.index(e[0]))
                product_infos = {e[0]: e[1] for e in product_infos}
                final_description_cn = None
                final_details_cn = None
                final_description_en = None
                final_details_en = None
                # Prefer native Simplified Chinese; convert Traditional
                # Chinese through the translator.
                description_cn = check_cns_region(product_infos,
                                                  'description')
                details_cn = check_cns_region(product_infos, 'details')
                if is_chs(description_cn):
                    final_description_cn = description_cn
                elif is_cht(description_cn):
                    final_description_cn = translate_text_to(
                        gs,
                        description_cn,
                        'zh-cn',
                        source='zh-cn',
                        backup_gs=backup_gs)
                if is_chs(details_cn):
                    final_details_cn = details_cn
                elif is_cht(details_cn):
                    final_details_cn = translate_text_to(gs,
                                                         details_cn,
                                                         'zh-cn',
                                                         source='zh-cn',
                                                         backup_gs=backup_gs)
                # English text is taken verbatim when already English.
                description_en = check_ens_region(product_infos,
                                                  'description')
                details_en = check_ens_region(product_infos, 'details')
                if is_eng(description_en):
                    final_description_en = description_en
                if is_eng(details_en):
                    final_details_en = details_en
                try:
                    # Fall back to machine translation from whichever
                    # region has content (regions already weight-ordered).
                    if not final_description_cn:
                        for region, info in product_infos.items():
                            if product_infos[region]['description']:
                                final_description_cn = translate_text_to(
                                    gs,
                                    product_infos[region]['description'],
                                    'zh-cn',
                                    backup_gs=backup_gs)
                                break
                    if not final_details_cn:
                        for region, info in product_infos.items():
                            if product_infos[region]['details']:
                                final_details_cn = translate_text_to(
                                    gs,
                                    product_infos[region]['details'],
                                    'zh-cn',
                                    backup_gs=backup_gs)
                                break
                    if not final_description_en:
                        for region, info in product_infos.items():
                            if region != 'cn':
                                # Avoid translating from Chinese into
                                # other languages when possible.
                                if product_infos[region]['description']:
                                    final_description_en = translate_text_to(
                                        gs,
                                        product_infos[region]['description'],
                                        'en',
                                        backup_gs=backup_gs)
                                    break
                    if not final_details_en:
                        for region, info in product_infos.items():
                            if region != 'cn':
                                # Avoid translating from Chinese into
                                # other languages when possible.
                                if product_infos[region]['details']:
                                    final_details_en = translate_text_to(
                                        gs,
                                        product_infos[region]['details'],
                                        'en',
                                        backup_gs=backup_gs)
                                    break
                    # Last resort: derive English from the Chinese text.
                    if not final_description_en and final_description_cn:
                        final_description_en = translate_text_to(
                            gs,
                            final_description_cn,
                            'en',
                            'zh-cn',
                            backup_gs=backup_gs)
                    if not final_details_en and final_details_cn:
                        final_details_en = translate_text_to(
                            gs,
                            final_details_cn,
                            'en',
                            'zh-cn',
                            backup_gs=backup_gs)
                except:
                    # Best-effort: any translation failure simply leaves
                    # the corresponding field empty.
                    pass
                insert_dict = {}
                if final_description_cn:
                    insert_dict['description_cn'] = final_description_cn
                if final_details_cn:
                    insert_dict['details_cn'] = final_details_cn
                if final_description_en:
                    insert_dict['description_en'] = final_description_en
                if final_details_en:
                    insert_dict['details_en'] = final_details_en
                if insert_dict:
                    insert_dict['fingerprint'] = fingerprint
                    # Insert or update depending on whether a row
                    # appeared meanwhile.
                    result = db.query_match({'fingerprint'},
                                            'products_translate',
                                            {'fingerprint': fingerprint})
                    try:
                        if result.num_rows() == 0:
                            db.insert(insert_dict, 'products_translate')
                        else:
                            db.update(
                                insert_dict, 'products_translate',
                                str.format('fingerprint="{0}"',
                                           fingerprint))
                    except:
                        logger.info(
                            str.format(
                                "Error: Insert or update sql error with {0}",
                                insert_dict))
                        pass
                else:
                    logger.info(
                        str.format(
                            "Error: No insert_dict for fingerprint : {0}",
                            fingerprint))