Пример #1
0
 def _inner(self, *args, **kwargs):
     """Log the call, then delegate to the wrapped ``func``.

     ``dkwargs`` (decorator options) and ``func`` (the wrapped callable)
     come from the enclosing decorator scope, which is outside this view.

     When ``dkwargs['level']`` is exactly ``'debug'`` a debug line is
     emitted built from ``kwargs['locate_way']``, ``kwargs['xpath']`` and
     ``kwargs['operator']``; otherwise an empty info record is logged.

     Returns:
         Whatever ``func(self, *args, **kwargs)`` returns.

     Raises:
         KeyError: at debug level, if one of the keyword arguments used
             in the log line is missing.
         NoElementError: logged at debug level and re-raised unchanged.
     """
     logger = log.getLogger(self.logger_name)
     # Exact comparison: the former substring test (``in 'debug'``) also
     # matched values such as '' or 'bug'.
     if dkwargs['level'] == 'debug':
         logger.debug('State-%s %s(%s).%s()' %
                      (self.__name__, kwargs['locate_way'],
                       kwargs['xpath'], kwargs['operator']))
     else:
         logger.info('')
     try:
         return func(self, *args, **kwargs)
     except NoElementError as e:
         logger.debug(e)
         # Bare ``raise`` re-raises the active exception with its original
         # traceback intact.
         raise
Пример #2
0
def crawl_tmall_data(good_iid, main_data_queue, comment_data_queue, engine,
                     platform):
    """Crawl one Tmall item detail page through a proxied Firefox driver.

    Steps:
      1. Look up the row for ``good_iid``/``platform`` in
         ``huimei.dc_platform_products_main``; insert it when absent and
         re-select the most recently modified row for that iid.
      2. Open the item page with a proxy, switching proxy on page-load
         timeout until a load succeeds.
      3. Run a ``WorkState`` machine starting from ``DetailState``.

    Args:
        good_iid: item id, interpolated into the detail URL and the SQL.
        main_data_queue: queue handed to ``DetailState``.
        comment_data_queue: queue handed to ``DetailState``.
        engine: object whose ``vertica_engine()`` returns a DB connection.
        platform: platform value, interpolated into the SQL.

    Raises:
        IndexError: if no row id could be obtained (lookup was empty and
            the insert/re-select produced nothing).

    Side effects:
        Rebinds the module-level ``logger`` to the 'spider_process' logger.
    """
    global logger
    url = r'https://detail.tmall.com/item.htm?id=%s' % good_iid
    logger = log.getLogger('spider_process')
    logger.info('iid(%s) spider start' % good_iid)

    conn = engine.vertica_engine()
    try:
        with conn.cursor() as crsr:
            # NOTE(review): the statements below are built by string
            # interpolation. If good_iid/platform can come from untrusted
            # input, switch to the driver's parameter binding (confirm the
            # driver's placeholder style and how ``platform`` is quoted).
            crsr.execute('''SELECT id FROM huimei.dc_platform_products_main 
                        WHERE iid=%s AND platform=%s''' % (good_iid, platform))
            ret = crsr.fetchall()
            if not ret:
                rowcount = crsr.execute(
                    '''INSERT INTO huimei.dc_platform_products_main (iid, platform)
                                          VALUES(%s, %s);''' %
                    (good_iid, platform))
                if rowcount:
                    crsr.execute(
                        '''SELECT s.id  FROM huimei.dc_platform_products_main s INNER JOIN
                                (SELECT iid, MAX(gmt_modified) AS maxgmt  FROM huimei.dc_platform_products_main  where iid=%s GROUP BY iid) a
                                ON s.iid=a.iid
                                where s.iid=%s
                                AND gmt_modified>=a.maxgmt''' %
                        (good_iid, good_iid))
                    ret = crsr.fetchall()
            primary_key = ret[0][0]
    finally:
        # Always release the connection, even when a query or the row
        # lookup above raises.
        conn.close()

    def call_proxy():
        """Return a driver that has loaded ``url``, retrying with new proxies."""
        # A loop instead of recursion, so a long run of bad proxies cannot
        # exhaust the recursion limit.
        while True:
            proxy = load_proxy_2()
            browser = firefox_with_proxy(proxy['host'], proxy['port'])
            browser.set_page_load_timeout(120)
            try:
                browser.get(url)
                return browser
            except TimeoutException:  # on timeout, retry with another proxy
                logger.warning(
                    'proxy host:%s port:%s TimeOut loading page.Retry another proxy'
                    % (proxy['host'], proxy['port']))
                browser.close()

    b = call_proxy()
    try:
        w = WorkState(driver=b,
                      default_state=DetailState(primary_key, good_iid, url,
                                                main_data_queue,
                                                comment_data_queue))
        w.run()
    finally:
        # Close the browser even if the state machine raises.
        b.close()
Пример #3
0
 def work(self, driver):
     """Run ``self.do`` once and choose the follow-up on failure.

     On success the result of ``self.do(driver=driver)`` is returned.
     On any ``Exception`` the error is logged, 'redo' is printed, the
     retry counter is incremented and the call sleeps for two seconds.
     While the counter is at most 15, ``self.fail_state`` is returned.
     Past that, the page is refreshed and reloaded, the counter is reset
     and ``self.back_state(...)`` built from this state's fields is
     returned.
     """
     try:
         return self.do(driver=driver)
     except Exception as err:
         log.getLogger(self.logger_name).error(err)
         print('redo')
         self.try_cnt += 1
         time.sleep(2.0)

         if self.try_cnt <= 15:
             return self.fail_state

         # More than 15 failed tries (about 30 s at 2 s each): refresh the
         # page and retry the operation.
         driver.refresh()
         driver.get(driver.current_url)
         self.try_cnt = 0
         restart_args = {
             'primary_key': self.primary_key,
             'good_iid': self.good_iid,
             'url': self.url,
             'main_data_queue': self.main_data_queue,
             'comment_data_queue': self.comment_data_queue,
         }
         return self.back_state(**restart_args)
Пример #4
0
 def __init__(self):
     """Attach the logger named by ``self.logger_name`` as ``self.logger``."""
     channel = self.logger_name
     self.logger = log.getLogger(channel)
Пример #5
0
# @Site    :
# @File    : data_to_vertica.py
# @Software: tmall spider
# @Function:

import os
from .exceptions import DataBaseExecuteError
import time
import datetime
import sys
from util.logger import log

# Python 2 only: ``reload`` is a builtin there, and reloading ``sys``
# makes ``sys.setdefaultencoding`` callable again so the process-wide
# default string encoding can be set to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Module-level logger for the 'database_process' channel.
logger = log.getLogger('database_process')

def _class_logger(*dargs, **dkwargs):
    '''Logging decorator.

    Reads ``dkwargs['level']`` ('debug' or 'info') and ``dkwargs['msg']``.
    Before calling the wrapped method it logs ``msg``, followed by the
    ``tb_name`` keyword argument when the call supplies one.

    NOTE(review): the definition appears cut off in this view; no
    ``return`` of ``inner`` or ``decorator`` is visible here.
    '''

    def decorator(func):
        def inner(self, *args, **kwargs):
            # Pick the logging callable from ``self.info`` by level.
            # NOTE(review): the local name ``log`` shadows the imported
            # ``log`` module inside this function, and stays unbound for
            # any level other than 'debug'/'info'.
            if dkwargs['level'] == 'debug':
                log = self.info.debug
            elif dkwargs['level'] == 'info':
                log = self.info.info
            # ``dict.has_key`` exists only in Python 2.
            if kwargs.has_key('tb_name'):
                log('%s %s' % (dkwargs['msg'], kwargs['tb_name']))
            else:
                log('%s' % dkwargs['msg'])
            # As visible here, the wrapped call's result is not returned.
            func(self, *args, **kwargs)
Пример #6
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @Time    : 2018/03/19
# @Author  : kingsley kwong
# @Site    :
# @File    : test_get_engine.py
# @Software: tmall spider
# @Function:

from util.logger import log

# Smoke check: obtain the 'spider' logger, print a marker to stdout, then
# emit one debug record through the logger.
logger = log.getLogger('spider')
print('testing')
logger.debug('testing')