# Example #1
# 0
class JHSActM(MyThread):
    '''A class of jhs activity item thread manager'''
    def __init__(self, jhs_type, thread_num=15, a_val=None):
        """Build the manager together with the clients used by crawl().

        jhs_type   -- work type; selects the JHSAct page handler that
                      crawl() runs (crawl() recognises the values 1 to 5)
        thread_num -- passed straight through to MyThread.__init__
        a_val      -- extra value, stored unchanged as self.a_val
        """
        # Initialise the MyThread base class with the thread count.
        MyThread.__init__(self, thread_num)

        # Lock held by push_back() while it appends to a list.
        self.mutex = threading.Lock()

        # Storage clients: MySQL access and MongoDB fs access.
        self.mysqlAccess = MysqlAccess()
        self.mongofsAccess = MongofsAccess()

        # Appendix value supplied by the caller.
        self.a_val = a_val

        # Work type: 1 = upcoming brand-group channel page,
        # 2 = check each day's activities that have not ended yet,
        # 3 = newly added activity.  crawl() also handles 4 and 5.
        self.jhs_type = jhs_type

        # Result tuples collected by crawl().
        self.items = []

        # Client through which dialRouter() sends its requests.
        self.dial_client = DialClient()

        # Local IP as reported by Common.local_ip(); sent by dialRouter().
        self._ip = Common.local_ip()

        # Router tag sent by dialRouter(); 'tpent' is the alternative value
        # the original author left commented out.
        self._tag = 'ikuai'

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back(self, L, v):
        """Append v to the list L while holding self.mutex.

        L -- the list to extend (crawl() passes self.items)
        v -- the value to append

        The lock is taken through a ``with`` block so that it is released
        even when L.append() raises; the previous manual acquire()/release()
        pair left the mutex locked forever in that case, blocking every
        later caller.  Entering the block waits for the lock exactly like
        the former blocking acquire(1) did.
        """
        with self.mutex:
            L.append(v)

    def putItem(self, _item):
        """Queue one item via put_q() as a (retry count, item) pair, count 0."""
        message = (0, _item)
        self.put_q(message)

    def putItems(self, _items):
        """Queue every element of _items via put_q(), each with retry count 0."""
        for entry in _items:
            message = (0, entry)
            self.put_q(message)

    # To crawl retry
    def crawlRetry(self, _data):
        if not _data: return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.act_crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            print "# retry too many times, no get item:", _val

    # insert act
    def insertAct(self, actsql_list, f=False):
        """Hand the buffered activity SQL to mysqlAccess when a batch is due.

        actsql_list -- buffered entries for mysqlAccess.insertJhsAct
        f           -- force flag; when true the size threshold is ignored

        A batch is due when f is true or the buffer holds at least
        Config.act_max_arg entries.  A due, non-empty buffer is passed to
        self.mysqlAccess.insertJhsAct().  The list itself is not cleared.

        Returns True when the batch was due, False otherwise.
        """
        batch_due = f or len(actsql_list) >= Config.act_max_arg
        if not batch_due:
            return False
        if len(actsql_list) > 0:
            self.mysqlAccess.insertJhsAct(actsql_list)
        return True

    # insert act day
    def insertActday(self, actdaysql_list, f=False):
        """Hand the buffered per-day SQL to mysqlAccess when a batch is due.

        actdaysql_list -- buffered entries for
                          mysqlAccess.insertJhsActDayalive
        f              -- force flag; when true the size threshold is ignored

        A batch is due when f is true or the buffer holds at least
        Config.act_max_arg entries.  A due, non-empty buffer is passed to
        self.mysqlAccess.insertJhsActDayalive().  The list is not cleared.

        Returns True when the batch was due, False otherwise.
        """
        batch_due = f or len(actdaysql_list) >= Config.act_max_arg
        if not batch_due:
            return False
        if len(actdaysql_list) > 0:
            self.mysqlAccess.insertJhsActDayalive(actdaysql_list)
        return True

    # insert act hour
    def insertActhour(self, acthoursql_list, f=False):
        """Hand the buffered per-hour SQL to mysqlAccess when a batch is due.

        acthoursql_list -- buffered entries for
                           mysqlAccess.insertJhsActHouralive
        f               -- force flag; when true the size threshold is ignored

        A batch is due when f is true or the buffer holds at least
        Config.act_max_arg entries.  A due, non-empty buffer is passed to
        self.mysqlAccess.insertJhsActHouralive().  The list is not cleared.

        Returns True when the batch was due, False otherwise.
        """
        batch_due = f or len(acthoursql_list) >= Config.act_max_arg
        if not batch_due:
            return False
        if len(acthoursql_list) > 0:
            self.mysqlAccess.insertJhsActHouralive(acthoursql_list)
        return True

    # insert act coming
    def insertActcoming(self, actcomingsql_list, f=False):
        """Hand the buffered upcoming-activity SQL to mysqlAccess when due.

        actcomingsql_list -- buffered entries for
                             mysqlAccess.insertJhsActComing
        f                 -- force flag; when true the size threshold is
                             ignored (crawl() forces a flush once the queue
                             is empty)

        A batch is due when f is true or the buffer holds at least
        Config.act_max_arg entries.  A due, non-empty buffer is passed to
        self.mysqlAccess.insertJhsActComing().  The list is not cleared
        here; crawl() starts a new list when this returns True.

        Returns True when the batch was due, False otherwise.
        """
        batch_due = f or len(actcomingsql_list) >= Config.act_max_arg
        if not batch_due:
            return False
        if len(actcomingsql_list) > 0:
            self.mysqlAccess.insertJhsActComing(actcomingsql_list)
        return True

    # To crawl item
    def crawl(self):
        # sql list
        #_actsql_list, _actdaysql_list, _acthoursql_list = [], [], []
        _actcomingsql_list = []
        while True:
            _data = None
            try:
                try:
                    # 取队列消息
                    _data = self.get_q()
                except Empty as e:
                    # 队列为空,退出
                    #print '# queue is empty', e
                    self.insertActcoming(_actcomingsql_list, True)
                    _actcomingsql_list = []

                    break

                item = None
                crawl_type = ''
                if self.jhs_type == 1:
                    # 品牌团实例 即将上线
                    item = JHSAct()

                    # 信息处理
                    _val  = _data[1]
                    item.antPageComing(_val)
                    print '# To crawl coming activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'coming'
                    # 汇聚
                    self.push_back(self.items, item.outTupleForComing())
                    crawling_confirm,sql = item.outTupleForComing()
                    # 入库
                    if crawling_confirm == 1:
                        _actcomingsql_list.append(sql)
                    if self.insertActcoming(_actcomingsql_list): _actcomingsql_list = []
                elif self.jhs_type == 2:
                    # 品牌团实例 检查活动新加商品
                    item = JHSAct()

                    # 信息处理
                    _val  = _data[1]
                    item.antPageHourcheck(_val)
                    #print '# To check activity val : ', Common.now_s(), _val[0], _val[1]
                    crawl_type = 'hourcheck'
                    # 汇聚
                    self.push_back(self.items, item.outTupleForHourcheck())
                elif self.jhs_type == 3:
                    # 品牌团实例
                    item = JHSAct()

                    # 信息处理
                    _val  = _data[1]
                    item.antPage(_val)
                    #print '# To crawl activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'brand'

                    # 汇聚
                    self.push_back(self.items, item.outTuple())

                elif self.jhs_type == 4:
                    # 还没有开团的品牌团实例
                    item = JHSAct()

                    # 信息处理
                    _val  = _data[1]
                    item.antPageMain(_val)
                    #print '# To crawl activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'main'

                    # 汇聚
                    self.push_back(self.items, item.outTuple())

                elif self.jhs_type == 5:
                    # 解析品牌团数据
                    item = JHSAct()

                    # 信息处理
                    _val  = _data[1]
                    item.antPageParser(_val)
                    #print '# To crawl activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'parser'

                    # 汇聚
                    self.push_back(self.items, item.outTupleParse())
                else:
                    self.queue.task_done()
                    continue

                # 存网页
                if item and crawl_type != '':
                    _pages = item.outItemPage(crawl_type)
                    self.mongofsAccess.insertJHSPages(_pages)

                    
                # 通知queue, task结束
                self.queue.task_done()

            except Common.NoActivityException as e:
                print 'Not activity exception :', e
                # 通知queue, task结束
                self.queue.task_done()

            except Common.NoPageException as e:
                print 'Not page exception :', e
                # 通知queue, task结束
                self.queue.task_done()

            except Common.InvalidPageException as e:
                self.crawlRetry(_data)
                print 'Invalid page exception :', e
                # 通知queue, task结束
                self.queue.task_done()

            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()

                self.crawlRetry(_data)
                # 通知queue, task结束
                self.queue.task_done()
                # 重新拨号
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'chn')
                    except Exception as e:
                        print '# DailClient Exception err:', e
                        time.sleep(10)
                time.sleep(random.uniform(10,30))