def _inner(self, *args, **kwargs):
    """Log the state action, then delegate to the wrapped ``func``.

    ``dkwargs`` and ``func`` come from the enclosing decorator scope.
    When ``dkwargs['level']`` is ``'debug'`` a record describing the call
    (state name, locator strategy, xpath, operator) is written; for any
    other level an empty info record is emitted.

    :returns: whatever ``func(self, *args, **kwargs)`` returns.
    :raises KeyError: at debug level, when ``locate_way``, ``xpath`` or
        ``operator`` was not passed by keyword.
    :raises NoElementError: re-raised from ``func`` after being logged.
    """
    logger = log.getLogger(self.logger_name)
    # Equality, not `in 'debug'`: the substring test also matched values
    # such as '' or 'bug' and treated them as the debug level.
    if dkwargs['level'] == 'debug':
        logger.debug('State-%s %s(%s).%s()' % (
            self.__name__,
            kwargs['locate_way'],
            kwargs['xpath'],
            kwargs['operator'],
        ))
    else:
        logger.info('')
    try:
        return func(self, *args, **kwargs)
    except NoElementError as e:
        logger.debug(e)
        # Bare raise re-raises the active exception with its original
        # traceback instead of restarting it at this line.
        raise
def crawl_tmall_data(good_iid, main_data_queue, comment_data_queue, engine, platform):
    """Crawl one Tmall item page and run the detail-page state machine on it.

    Steps:
      1. Look up the row for ``good_iid``/``platform`` in
         ``huimei.dc_platform_products_main``; insert it when missing and
         read back its id.
      2. Open the item page in a Firefox session routed through a proxy,
         switching proxy whenever the page load times out.
      3. Run ``WorkState`` starting from ``DetailState``.

    The DB connection and the browser are released even when a step raises.

    :param good_iid: item id, interpolated into the URL and the SQL text.
    :param main_data_queue: passed through to ``DetailState``.
    :param comment_data_queue: passed through to ``DetailState``.
    :param engine: object whose ``vertica_engine()`` returns a DB connection.
    :param platform: platform value used to select/insert the product row.
    :raises IndexError: when no product row could be found or inserted.
    """
    # Rebinds the module-level logger; kept because ``call_proxy`` below (and
    # possibly other code in this module) reads it as a global.
    global logger
    url = r'https://detail.tmall.com/item.htm?id=%s' % good_iid
    logger = log.getLogger('spider_process')
    logger.info('iid(%s) spider start' % good_iid)

    conn = engine.vertica_engine()
    try:
        with conn.cursor() as crsr:
            # NOTE(review): these statements are built by string
            # interpolation. Switching to bound parameters would be safer,
            # but the quoting callers apply to good_iid/platform is not
            # visible here -- confirm before changing.
            crsr.execute(
                'SELECT id FROM huimei.dc_platform_products_main '
                'WHERE iid=%s AND platform=%s' % (good_iid, platform))
            ret = crsr.fetchall()
            if not ret:
                rowcount = crsr.execute(
                    'INSERT INTO huimei.dc_platform_products_main '
                    '(iid, platform) VALUES(%s, %s);' % (good_iid, platform))
                if rowcount:
                    # Read back the most recently modified row for this iid.
                    crsr.execute(
                        'SELECT s.id FROM huimei.dc_platform_products_main s '
                        'INNER JOIN (SELECT iid, MAX(gmt_modified) AS maxgmt '
                        'FROM huimei.dc_platform_products_main '
                        'where iid=%s GROUP BY iid) a ON s.iid=a.iid '
                        'where s.iid=%s AND gmt_modified>=a.maxgmt'
                        % (good_iid, good_iid))
                    ret = crsr.fetchall()
            primary_key = ret[0][0]
    finally:
        # Close the connection on failure as well as on success.
        conn.close()

    def call_proxy():
        """Return a browser that has loaded ``url``, retrying on timeouts."""
        # Loop instead of recursing so repeated timeouts do not grow the stack.
        while True:
            proxy = load_proxy_2()
            b = None
            try:
                b = firefox_with_proxy(proxy['host'], proxy['port'])
                b.set_page_load_timeout(120)
                b.get(url)
                return b
            except TimeoutException:
                # On timeout, switch to another proxy and retry.
                logger.warning(
                    'proxy host:%s port:%s TimeOut loading page.Retry another proxy'
                    % (proxy['host'], proxy['port']))
                # The browser may not exist yet if the timeout happened
                # before it was created.
                if b is not None:
                    b.close()

    b = call_proxy()
    try:
        w = WorkState(
            driver=b,
            default_state=DetailState(
                primary_key, good_iid, url, main_data_queue, comment_data_queue))
        w.run()
    finally:
        # Always release the browser, even when the state machine fails.
        b.close()
def work(self, driver):
    """Run ``self.do`` once and pick the next state.

    On success the result of ``self.do(driver=driver)`` is returned as is.
    On any ``Exception`` the error is logged, ``'redo'`` is printed, the
    attempt counter is incremented and the call sleeps for two seconds.
    While the counter is at most 15, ``self.fail_state`` is returned; past
    that the page is reloaded, the counter is reset and a new
    ``self.back_state`` built from this state's fields is returned.

    NOTE(review): the statement nesting was reconstructed from a collapsed
    source line -- confirm against the original file.
    """
    try:
        return self.do(driver=driver)
    except Exception as exc:
        state_logger = log.getLogger(self.logger_name)
        state_logger.error(exc)
        print('redo')
        self.try_cnt += 1
        time.sleep(2.0)

        if not self.try_cnt > 15:
            return self.fail_state

        # After roughly 30 seconds of failures, refresh the page and retry.
        driver.refresh()
        driver.get(driver.current_url)
        self.try_cnt = 0
        restart_args = dict(
            primary_key=self.primary_key,
            good_iid=self.good_iid,
            url=self.url,
            main_data_queue=self.main_data_queue,
            comment_data_queue=self.comment_data_queue,
        )
        return self.back_state(**restart_args)
def __init__(self):
    """Store the logger named by ``self.logger_name`` on ``self.logger``."""
    get_logger = log.getLogger
    self.logger = get_logger(self.logger_name)
# @Site : # @File : data_to_vertica.py # @Software: tmall spider # @Function: import os from .exceptions import DataBaseExecuteError import time import datetime import sys from util.logger import log reload(sys) sys.setdefaultencoding('utf8') logger = log.getLogger('database_process') def _class_logger(*dargs, **dkwargs): '''日志装饰器''' def decorator(func): def inner(self, *args, **kwargs): if dkwargs['level'] == 'debug': log = self.info.debug elif dkwargs['level'] == 'info': log = self.info.info if kwargs.has_key('tb_name'): log('%s %s' % (dkwargs['msg'], kwargs['tb_name'])) else: log('%s' % dkwargs['msg']) func(self, *args, **kwargs)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @Time    : 2018/03/19
# @Author  : kingsley kwong
# @Site    :
# @File    : test_get_engine.py
# @Software: tmall spider
# @Function:
from util.logger import log

# The same marker text goes to stdout and to the 'spider' logger at debug
# level, so both outputs can be checked when the script runs.
_MESSAGE = 'testing'

logger = log.getLogger('spider')
print(_MESSAGE)
logger.debug(_MESSAGE)