Exemplo n.º 1
0
    def __init__(self):
        # jhs brand type
        self.worker_type    = Config.JHS_Brand
        # DB
        self.jhs_type       = Config.JHS_TYPE   # queue type
        self.mysqlAccess    = MysqlAccess()     # mysql access
        self.redisQueue     = RedisQueue()      # redis queue
        self.redisAccess    = RedisAccess()     # redis db
        self.mongofsAccess  = MongofsAccess()   # mongodb fs access

        # 获取Json数据
        self.jsonpage       = Jsonpage()

        # 抓取设置
        self.crawler        = TBCrawler()

        # 页面模板解析
        self.brand_temp     = JHSBrandTEMP()

        # message
        self.message        = Message()

        # 抓取时间设定
        self.crawling_time  = Common.now() # 当前爬取时间
        self.begin_time     = Common.now()
        self.begin_date     = Common.today_s()
        self.begin_hour     = Common.nowhour_s()
Exemplo n.º 2
0
    def __init__(self):
        # jhs group item type
        self.worker_type    = Config.JHS_GroupItem

        self.jhs_type       = Config.JHS_TYPE   # queue type

        # message
        self.message        = Message()

        # 获取Json数据
        self.jsonpage       = Jsonpage()

        # 抓取设置
        self.crawler        = TBCrawler()

        # 抓取时间设定
        self.crawling_time  = Common.now() # 当前爬取时间
        self.begin_time     = Common.now()
        self.begin_date     = Common.today_s()
        self.begin_hour     = Common.nowhour_s()

        # DB
        # mysql access
        self.mysqlAccess    = MysqlAccess()

        # redis queue
        self.redisQueue     = RedisQueue()

        # redis access
        self.redisAccess    = RedisAccess()

        # mongodb fs access
        self.mongofsAccess  = MongofsAccess()
Exemplo n.º 3
0
    def __init__(self, _q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex          = threading.Lock()

        self.worker_type    = Config.JHS_Brand

        # message
        self.message        = Message()

        # db
        self.mysqlAccess    = MysqlAccess() # mysql access
        self.redisAccess    = RedisAccess()     # redis db
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # jhs queue type
        self._q_type        = _q_type # main:新增商品, day:每天一次的商品, hour:每小时一次的商品, update:更新

        # appendix val
        self.a_val          = a_val
        
        # activity items
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # give up item, retry too many times
        self.giveup_items   = []
Exemplo n.º 4
0
class JHSWorker():
    '''A class of jhs worker'''
    def __init__(self):
        # jhs brand type
        self.worker_type    = Config.JHS_Brand
        # DB
        self.jhs_type       = Config.JHS_TYPE   # queue type
        self.mysqlAccess    = MysqlAccess()     # mysql access
        self.redisQueue     = RedisQueue()      # redis queue
        self.redisAccess    = RedisAccess()     # redis db
        self.mongofsAccess  = MongofsAccess()   # mongodb fs access

        # 获取Json数据
        self.jsonpage       = Jsonpage()

        # 抓取设置
        self.crawler        = TBCrawler()

        # 页面模板解析
        self.brand_temp     = JHSBrandTEMP()

        # message
        self.message        = Message()

        # 抓取时间设定
        self.crawling_time  = Common.now() # 当前爬取时间
        self.begin_time     = Common.now()
        self.begin_date     = Common.today_s()
        self.begin_hour     = Common.nowhour_s()

    def init_crawl(self, _obj, _crawl_type):
        self._obj           = _obj
        self._crawl_type    = _crawl_type

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._router_tag    = 'ikuai'
        #self._router_tag   = 'tpent'

        # items
        self.items          = []

        # giveup items
        self.giveup_items   = []

        # giveup msg val
        self.giveup_val     = None

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            print '# To dial router exception :', e

    # To crawl retry
    def crawlRetry(self, _key, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'cat':
            max_time = Config.json_crawl_retry
        elif _obj == 'act':
            max_time = Config.act_crawl_retry
        elif _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retry too many time, no get:", msg

     # To crawl page
    def crawlPage(self, _obj, _crawl_type, _key, msg, _val):
        try:
            if _obj == 'cat':
                if _crawl_type == 'home' or _crawl_type == 'homeposition':
                    self.run_cat_home(msg, _val)
                else:
                    self.run_cat(msg, _val)
            elif _obj == 'act':
                self.run_act(msg)
            elif _obj == 'item':
                self.run_item(msg, _val)
            else:
                print '# crawlPage unknown obj = %s' % _obj
        except Common.InvalidPageException as e:
            print '# Invalid page exception:',e
            self.crawlRetry(_key,msg)
        except Common.DenypageException as e:
            print '# Deny page exception:',e
            self.crawlRetry(_key,msg)
            # 重新拨号
            try:
                self.dialRouter(4, 'chn')
            except Exception as e:
                print '# DailClient Exception err:', e
                time.sleep(random.uniform(10,30))
            time.sleep(random.uniform(10,30))
        except Common.SystemBusyException as e:
            print '# System busy exception:',e
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(10,30))
        except Common.RetryException as e:
            print '# Retry exception:',e
            if self.giveup_val:
                msg['val'] = self.giveup_val
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(20,30))
        except Exception as e:
            print '# exception err:',e
            self.crawlRetry(_key,msg)
            # 重新拨号
            if str(e).find('Read timed out') == -1:
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print '# DailClient Exception err:', e
            time.sleep(random.uniform(10,30))
            Common.traceback_log()

    def run_cat_home(self, msg, _val):
        msg_val = msg["val"]
        _url, refers = msg_val
        print '# brand home:',_url
        page = self.crawler.getData(_url, refers)
        # save to mongo
        # timeStr_jhstype_webtype_obj_crawltype
        time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time))
        key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1',self._obj,self._crawl_type)
        p_content = '<!-- url=%s --> %s' % (_url,page)
        self.mongofsAccess.insertJHSPages((key,p_content))

        c_url_val_list = self.brand_temp.temp(page)
        for c_url_val in c_url_val_list:
            c_url, c_name, c_id = c_url_val
            self.items.append((Common.fix_url(c_url),c_id,c_name,Config.ju_brand_home,Config.JHS_Brand))

        if self._crawl_type == 'homeposition':
            top_acts = self.brand_temp.activityTopbrandTemp(page)
            print top_acts
            self.save_top_acts(top_acts)

    def save_top_acts(self, top_acts):
        if top_acts:
            for key in top_acts.keys():
                act = top_acts[key]
                c_time, act_id, act_name, act_position, act_url, f_id, f_name, sub_nav = Common.now(), -1, '', -1, '', 0, '', ''
                c_date, c_hour = time.strftime("%Y-%m-%d", time.localtime(c_time)), time.strftime("%H", time.localtime(c_time))
                if act.has_key('act_id'):
                    act_id = act["act_id"]
                if act.has_key('position'):
                    act_position = act["position"]
                if act.has_key('url'):
                    act_url = act["url"]
                if act.has_key('datatype'):
                    f_name = act["datatype"]
                val = (Common.time_s(c_time),act_id,act_name,act_url,Config.JHS_Brand,sub_nav,act_position,f_id,f_name,'',c_date,c_hour)
                self.mysqlAccess.insertJhsActPosition_hour(val)

    def run_cat(self, msg, _val):
        msg_val = msg["val"]
        c_url, c_id, c_name, refers, pagetype = msg_val
        print '# category',c_name,c_id
        if pagetype == Config.JHS_Brand:
            a_val = (c_id, c_name)
            self.get_actjson(c_url, refers, a_val, _val, pagetype)
        elif pagetype == Config.JHS_GroupItem:
            self.get_cat_jsons(c_url, c_id, c_name, refers, _val, pagetype)
        else:
            print '# not get category pagetype...'

    def get_cat_jsons(self, c_url, c_id, c_name, refers, _val, pagetype):
        a_val = (c_id, c_name)
        page = self.crawler.getData(c_url, refers)
        page_val = (page,c_id,c_name)
        ajax_url_list = self.getAjaxurlList(page_val)
        if len(ajax_url_list) > 0:
            # process ajax url list
            for url_val in ajax_url_list:
                c_url,c_subNav = url_val
                self.get_actjson(c_url, refers, a_val, _val, pagetype, c_subNav)

    def get_actjson(self, c_url, refers, a_val, _val, pagetype, c_subNav=''):
        if self._crawl_type == 'position':
            _val = (pagetype,c_subNav) + _val

        Result_list = self.jsonpage.get_jsonPage(c_url,refers,a_val)
        if Result_list and len(Result_list) > 0:
            # parser act result
            act_valList = self.jsonpage.parser_brandjson(Result_list,_val)
            if act_valList and len(act_valList) > 0:
                print '# get brand act num:',len(act_valList)
                self.items.extend(act_valList)
            else:
                print '# not get brandjson parse val list...'

    # get json ajax url
    def getAjaxurlList(self, page_val):
        url_list = []
        page, c_id, c_name = page_val
        p = re.compile(r'<.+?id="(.+?)".+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S)
        i = 0
        for a_info in p.finditer(page):
            c_subNav = ''
            f_id = a_info.group(1)
            a_url = a_info.group(2).replace('amp;','')
            info = a_info.group(3)
            m = re.search(r'<span class="l-f-tbox">(.+?)</span>', info, flags=re.S)
            if m:
                c_subNav = m.group(1).strip()
            if c_subNav == '':
                m = re.search(r'<td.+?data-target="%s".+?>(.+?)</td>' % f_id, page, flags=re.S)
                if m:
                    c_subNav = re.sub(r'<.+?>','',m.group(1))
            #url_list.append((a_url,refers,a_val))
            url_list.append((a_url,c_subNav))
            i += 1
        return url_list

    # ACT queue
    def run_act(self, msg):
        # 默认数据
        msg_val = msg["val"]
        print '# act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        act_obj = None
        if self._crawl_type == 'main':
            act_obj = JHSAct()
            act_obj.antPageMain(msg_val)
        elif self._crawl_type == 'check':
            act_obj = JHSAct()
            act_obj.antPageCheck(msg_val)
        elif self._crawl_type == 'position':
            act_obj = JHSAct()
            act_obj.antPageParser(msg_val)
        print '# act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

        if self._crawl_type == 'position':
            brandact_id,brandact_name,brandact_url,brandact_sign,brandact_status,val = act_obj.outTupleForPosition()
            if int(brandact_sign) != 3:
                if act_obj.brandact_starttime and act_obj.brandact_starttime != 0.0 and 1 >= Common.subTS_hours(int(float(act_obj.brandact_starttime)/1000), self.crawling_time):
                    print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name)
                    self.mysqlAccess.insertJhsActPosition_hour(val)
                
                elif brandact_status != '' and brandact_status != 'blank':
                    print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name)
                    self.mysqlAccess.insertJhsActPosition_hour(val)
        else:
            act_keys = [self.worker_type, str(act_obj.brandact_id)]
            prev_act = self.redisAccess.read_jhsact(act_keys)
            # 是否需要抓取商品
            if act_obj and act_obj.crawling_confirm != 2:
                # 保存的活动信息
                self.putActDB(act_obj, prev_act)
                # 活动中的商品
                items_list = []
                # 只取非俪人购商品
                if int(act_obj.brandact_sign) != 3:
                    if act_obj.crawling_confirm == 0:
                        #更新马上开团活动中商品位置
                        self.update_actItems_position(act_obj)
                    # 多线程抓商品
                    items_list = self.run_actItems(act_obj, prev_act)
                else:
                    print '# ladygo activity id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name)

                #print '# pro act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                # 处理活动信息
                #self.procAct(act_obj, prev_act, items_list)
                # 处理活动redis信息
                self.procActRedis(act_obj, prev_act, items_list)
                #print '# pro act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            else:
                self.update_startact(act_obj, prev_act)
                print '# Already start activity, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) 

    # 更新开团后活动
    def update_startact(self, act, prev_act):
        if act.brandact_endtime and act.brandact_endtime != 0.0:
            end_time_s = Common.time_s(float(act.brandact_endtime)/1000)
            if prev_act and end_time_s != prev_act['end_time']:
                prev_act['end_time'] = end_time_s
                # redis
                keys = [self.worker_type, str(act.brandact_id)]
                self.redisAccess.write_jhsact(keys, prev_act)
                self.mysqlAccess.updateJhsActEndtime((end_time_s,str(act.brandact_id)))

    #更新马上开团活动中商品位置
    def update_actItems_position(self, act):
        update_val_list = []
        act_id = act.brandact_id
        for item in act.brandact_itemVal_list:
            if str(item[7]) != '':
                update_val_list.append((str(item[7]),str(act_id),item[4]))
        self.mysqlAccess.updateJhsItemPosition(update_val_list)

    # 并行获取品牌团商品
    def run_actItems(self, act, prev_act):
        print '# act items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # 需要抓取的item
        item_val_list = []
        # 过滤已经抓取过的商品ID列表
        item_ids = act.brandact_itemids
        if prev_act:
            prev_item_ids = prev_act["item_ids"]
            item_ids      = Common.diffSet(item_ids, prev_item_ids)

            # 如果已经抓取过的活动没有新上线商品,则退出
            if len(item_ids) == 0:
                print '# Activity no new Items'
                print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name
                return None

            for item in act.brandact_itemVal_list:
                if str(item[6]) in item_ids or str(item[7]) in item_ids:
                    item_val_list.append(item)
        else:
            item_val_list = act.brandact_itemVal_list

        # 如果活动没有商品, 则退出
        if len(item_ids) == 0:
            print '# run_brandItems: no items in activity, act_id=%s, act_name=%s' % (act.brandact_id,act.brandact_name)
            return None

        print '# Activity Items crawler start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name
        # 多线程 控制并发的线程数
        if len(item_val_list) > Config.item_max_th:
            m_itemsObj = JHSItemM('main', Config.item_max_th)
        else: 
            m_itemsObj = JHSItemM('main', len(item_val_list))
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_val_list)
        m_itemsObj.run()

        item_list = m_itemsObj.items
        print '# Activity find new Items num:', len(item_val_list)
        print '# Activity crawl Items num:', len(item_list)
        giveup_items = m_itemsObj.giveup_items
        if len(giveup_items) > 0:
            print '# Activity giveup Items num:',len(giveup_items)
            raise Common.RetryException('# run_actItems: actid:%s actname:%s some items retry more than max times..'%(str(act.brandact_id),str(act.brandact_name)))
        print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name
        return item_list

    # To merge activity
    def mergeAct(self, act, prev_act):
        if prev_act:
            # 合并本次和上次抓取的商品ID列表
            prev_item_ids  = prev_act["item_ids"]
            act.brandact_itemids   = Common.unionSet(act.brandact_itemids, prev_item_ids)

            # 取第一次的活动抓取时间
            act.crawling_time = Common.str2timestamp(prev_act["crawl_time"])

            if not act.brandact_name or act.brandact_name == '':
                act.brandact_name = prev_act["act_name"]
            if not act.brandact_url or act.brandact_url == '':
                act.brandact_url = prev_act["act_url"]
            if not act.brandact_position or str(act.brandact_position) == '0':
                act.brandact_position = prev_act["act_position"]
            if not act.brandact_enterpic_url or act.brandact_enterpic_url == '':
                act.brandact_enterpic_url = prev_act["act_enterpic_url"]
            if not act.brandact_remindNum or str(act.brandact_remindNum) == '0':
                act.brandact_remindNum = prev_act["act_remindnum"]
            if not act.brandact_coupons or act.brandact_coupons == []:
                act.brandact_coupon = prev_act["act_coupon"]
                act.brandact_coupons = prev_act["act_coupons"].split(Config.sep)
            if not act.brandact_starttime or act.brandact_starttime == 0.0: 
                act.brandact_starttime = Common.str2timestamp(prev_act["start_time"])
            if not act.brandact_endtime or act.brandact_endtime == 0.0:
                act.brandact_endtime = Common.str2timestamp(prev_act["end_time"])
            if not act.brandact_other_ids or act.brandact_other_ids == '':
                act.brandact_other_ids = prev_act["_act_ids"]

    # To put act db
    def putActDB(self, act, prev_act):
        # 预热信息
        if self._crawl_type == 'main':
            self.mysqlAccess.insertJhsActComing(act.outSql()) 

        # redis
        self.mergeAct(act, prev_act)
        
        if self._crawl_type == 'main':
            # mysql
            if prev_act:
                print '# update activity, id:%s name:%s'%(act.brandact_id, act.brandact_name)
                self.mysqlAccess.updateJhsAct(act.outSqlForUpdate())
            else:
                print '# insert activity, id:%s name:%s'%(act.brandact_id, act.brandact_name)
                self.mysqlAccess.insertJhsAct(act.outSql())

        # mongo
        # 存网页
        _pages = act.outItemPage(self._crawl_type)
        self.mongofsAccess.insertJHSPages(_pages)

    # To process activity in redis
    def procActRedis(self, act, prev_act, items_list):
        # 活动抓取的item ids
        act.brandact_itemids = []
        if items_list:
            for item in items_list:
                # item juid
                if str(item[1]) != '':
                    act.brandact_itemids.append(str(item[1]))
                # item id
                if str(item[10]) != '':
                    act.brandact_itemids.append(str(item[10]))

        # redis
        self.mergeAct(act, prev_act)
        keys = [self.worker_type, str(act.brandact_id)]
        val = act.outTupleForRedis()
        self.redisAccess.write_jhsact(keys, val)

    # To process activity
    def procAct(self, act, prev_act, items_list):
        # 活动抓取的item ids
        act.brandact_itemids = []
        if items_list:
            for item in items_list:
                # item juid
                if str(item[1]) != '':
                    act.brandact_itemids.append(str(item[1]))
                # item id
                if str(item[10]) != '':
                    act.brandact_itemids.append(str(item[10]))
        # 将抓取的活动信息存入redis
        self.putActDB(act, prev_act)

    # ITEM queue
    def run_item(self, msg, _val):
        # 默认数据
        msg_val = msg["val"]
        brandact_id, brandact_name, item_val_list = msg_val
        print '# Activity Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name
        # 多线程 控制并发的线程数
        max_th = Config.item_max_th
        if len(item_val_list) > max_th:
            m_itemsObj = JHSItemM(self._crawl_type, max_th, _val)
        else:
            m_itemsObj = JHSItemM(self._crawl_type, len(item_val_list), _val)
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_val_list)
        m_itemsObj.run()

        item_list = m_itemsObj.items
        print '# Activity Items num:', len(item_val_list)
        print '# Activity crawl Items num:', len(item_list)
        giveup_items = m_itemsObj.giveup_items
        if len(giveup_items) > 0:
            print '# Activity giveup Items num:',len(giveup_items)
            self.giveup_val = (brandact_id, brandact_name, giveup_items)
            raise Common.RetryException('# run_item: actid:%s actname:%s some items retry more than max times..'%(str(brandact_id),str(brandact_name)))
        print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name

    def process(self, _obj, _crawl_type, _val=None):
        self.init_crawl(_obj, _crawl_type)

        i, M = 0, 20
        if _obj == 'cat':
            M = 10
        n = 0
        while True: 
            if _crawl_type and _crawl_type != '':
                _key = '%s_%s_%s' % (self.jhs_type,_obj,_crawl_type)
            else:
                _key = '%s_%s' % (self.jhs_type,_obj)
            _msg = self.redisQueue.get_q(_key)

            # 队列为空
            if not _msg:
                i += 1
                if i > M:
                    print '# not get queue of key:',_key,time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    print '# all get num of item in queue:',n
                    break
                time.sleep(10)
                continue
            n += 1
            try:
                self.crawlPage(_obj, _crawl_type, _key, _msg, _val)
            except Exception as e:
                print '# exception err in process of JHSWorker:',e,_key,_msg

    # 删除redis数据库过期活动
    def delAct(self, _acts):
        i = 0
        for _act in _acts:
            keys = [self.worker_type, str(_act[0])]

            item = self.redisAccess.read_jhsact(keys)
            if item:
                end_time = item["end_time"]
                now_time = Common.time_s(self.crawling_time)
                # 删除过期的活动
                if now_time > end_time: 
                    i += 1
                    self.redisAccess.delete_jhsact(keys)
        print '# delete acts num:',i

    def delItem(self, _items):
        i = 0
        for _item in _items:
            keys = [self.worker_type, str(_item[0])]

            item = self.redisAccess.read_jhsitem(keys)
            if item:
                end_time = item["end_time"]
                now_time = Common.time_s(self.crawling_time)
                # 删除过期的商品
                if now_time > end_time: 
                    i += 1
                    self.redisAccess.delete_jhsitem(keys)
        print '# delete items num:',i

    # 查找结束的活动
    def scanEndActs(self, val):
        _acts = self.mysqlAccess.selectJhsActEnd(val)
        print '# end acts num:',len(_acts)
        # 删除已经结束的活动
        self.delAct(_acts)

    # 查找结束的商品
    def scanEndItems(self, val):
        _items = self.mysqlAccess.selectJhsItemEnd(val)
        print '# end items num:',len(_items)
        # 删除已经结束的商品
        self.delItem(_items)

    # acts redis
    def actsRedis(self):
        _acts = self.mysqlAccess.selectActsRedisdata()
        print '# acts num:',len(_acts)
        i = 0
        for _act in _acts:
            act_id = _act[2]
            #_itemids = self.mysqlAccess.selectItemsids(str(act_id))
            #item_ids = []
            #for _itemid in _itemids:
            #    item_ids.append(str(_itemid[0]))
            #    item_ids.append(str(_itemid[1]))
            #act_val = _act + (item_ids,)
            #print act_val
            #keys = [self.worker_type, str(act_id)]
            #print keys
            #if self.redisAccess.exist_jhsact(keys):
                #act_redis = self.redisAccess.read_jhsact(keys)
                #if len(act_redis) != 15:
                #    print act_redis
                #    i += 1
                #print self.redisAccess.read_jhsact(keys)
                #self.redisAccess.delete_jhsact(keys)
            #self.redisAccess.write_jhsact(keys, act_val)
            #i += 1
            #break
        print '# redis acts num:',i

    # items redis
    def itemsRedis(self):
        _items = self.mysqlAccess.selectItemRedisdata()
        print '# items num:', len(_items)
        i = 0
        #for _item in _items:
            #msg = self.message.jhsitemMsg(_item)
            #print msg
            #keys = [self.worker_type, str(_item[0])]
            #print keys
            #if self.redisAccess.exist_jhsitem(keys):
                #print self.redisAccess.read_jhsitem(keys)
                #self.redisAccess.delete_jhsitem(keys)
            #self.redisAccess.write_jhsitem(keys, msg)
            #i += 1 
            #break
        print '# redis items num:',i
Exemplo n.º 5
0
class JHSItemM(MyThread):
    '''A class of jhs item thread manager'''
    def __init__(self, _q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex          = threading.Lock()

        self.worker_type    = Config.JHS_Brand

        # message
        self.message        = Message()

        # db
        self.mysqlAccess    = MysqlAccess() # mysql access
        self.redisAccess    = RedisAccess()     # redis db
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # jhs queue type
        self._q_type        = _q_type # main:新增商品, day:每天一次的商品, hour:每小时一次的商品, update:更新

        # appendix val
        self.a_val          = a_val
        
        # activity items
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # give up item, retry too many times
        self.giveup_items   = []

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    def putItem(self, _item):
        self.put_q((0, _item))

    def putItems(self, _items):
        for _item in _items: self.put_q((0, _item))

    # To merge item
    def mergeAct(self, item, prev_item):
        if prev_item:
            if not item.item_position or item.item_position == 0:
                item.item_position      = prev_item["item_position"]
            if not item.item_juName or item.item_juName == '':
                item.item_juName        = prev_item["item_juname"]
            if not item.item_juDesc or item.item_juDesc == '':
                item.item_juDesc        = prev_item["item_judesc"]
            if not item.item_juPic_url or item.item_juPic_url == '':
                item.item_juPic_url     = prev_item["item_jupic_url"]
            if not item.item_url or item.item_url == '':
                item.item_url           = prev_item["item_url"]
            if not item.item_oriPrice or item.item_oriPrice == '':
                item.item_oriPrice      = prev_item["item_oriprice"]
            if not item.item_actPrice or item.item_actPrice == '':
                item.item_actPrice      = prev_item["item_actprice"]
            if not item.item_discount or item.item_discount == '':
                item.item_discount      = prev_item["item_discount"]
            if not item.item_coupons or item.item_coupons == []:
                item.item_coupons       = prev_item["item_coupons"].split(Config.sep)
            if not item.item_promotions or item.item_promotions == []:
                item.item_promotions    = prev_item["item_promotions"].split(Config.sep)
            if not item.item_remindNum or item.item_remindNum == '':
                item.item_remindNum     = prev_item["item_remindnum"]
            if not item.item_isLock_time or item.item_isLock_time == '':
                if prev_item["item_islock_time"] and prev_item["item_islock_time"] != '':
                    item.item_isLock_time   = Common.str2timestamp(prev_item["item_islock_time"])
                    item.item_isLock        = prev_item["item_islock"]
            if not item.item_starttime or item.item_starttime == 0.0:
                if prev_item["start_time"] and prev_item["start_time"] != '':
                    item.item_starttime     = Common.str2timestamp(prev_item["start_time"])
            if not item.item_endtime or item.item_endtime == 0.0:
                if prev_item["end_time"] and prev_item["end_time"] != '':
                    item.item_endtime       = Common.str2timestamp(prev_item["end_time"])

    # To put item redis db
    def putItemDB(self, item):
        # redis
        keys = [self.worker_type, str(item.item_juId)]
        prev_item = self.redisAccess.read_jhsitem(keys)
        self.mergeAct(item, prev_item)
        val = item.outTupleForRedis()
        msg = self.message.jhsitemMsg(val)
        self.redisAccess.write_jhsitem(keys, msg)

    # To crawl retry
    def crawlRetry(self, _data):
        if not _data: return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.item_crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            self.push_back(self.giveup_items, _val)
            print "# retry too many times, no get item:", _val

    # insert item info
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertJhsItemInfo(iteminfosql_list)
                #print '# insert data to database'
            return True
        return False

    # insert item day
    def insertItemday(self, itemdaysql_list, f=False):
        if f or len(itemdaysql_list) >= Config.item_max_arg:
            if len(itemdaysql_list) > 0:
                self.mysqlAccess.insertJhsItemForDay(itemdaysql_list)
                #print '# day insert data to database'
            return True
        return False

    # insert item hour
    def insertItemhour(self, itemhoursql_list, f=False):
        if f or len(itemhoursql_list) >= Config.item_max_arg:
            if len(itemhoursql_list) > 0:
                self.mysqlAccess.insertJhsItemForHour(itemhoursql_list)
                #print '# hour insert data to database'
            return True
        return False

    # update item lock start-end time
    def updateItemLockStartEndtime(self, itemsql):
        if itemsql:
            self.mysqlAccess.updateJhsItemLockStartEndtime(itemsql)
            #print '# update data to database'

    def updateItems(self, itemsql_list, f=False):
        if f or len(itemsql_list) >= Config.item_max_arg:
            if len(itemsql_list) > 0:
                self.mysqlAccess.updateJhsItems(itemsql_list)
                #print '# update data to database'
            return True
        return False

    # To crawl item
    def crawl(self):
        # item sql list
        _iteminfosql_list = []
        _itemdaysql_list = []
        _itemhoursql_list = []
        _itemupdatesql_list = []
        while True:
            _data = None
            try:
                try:
                    # 取队列消息
                    _data = self.get_q()
                except Empty as e:
                    # 队列为空,退出
                    #print '# queue is empty', e
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []

                    # day
                    self.insertItemday(_itemdaysql_list, True)
                    _itemdaysql_list = []

                    # hour
                    self.insertItemhour(_itemhoursql_list, True)
                    _itemhoursql_list = []

                    # update
                    #self.updateItems(_itemupdatesql_list, True)
                    #_itemupdatesql_list = []

                    break

                item = None
                if self._q_type == 'main':
                    # 新商品实例
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val
                    item.antPage(_val)
                    # 汇聚
                    # redis
                    self.putItemDB(item)
                    self.push_back(self.items, item.outTuple())
                    # 入库
                    iteminfoSql = item.outTuple()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list): _iteminfosql_list = []
                elif self._q_type == 'day':
                    # 每天商品实例
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val
                    item.antPageDay(_val)
                    # 汇聚
                    self.push_back(self.items, item.outSqlForDay())
                    # 入库
                    daySql = item.outSqlForDay()
                    _itemdaysql_list.append(daySql)
                    if self.insertItemday(_itemdaysql_list): _itemdaysql_list = []
                elif self._q_type == 'hour':
                    # 每小时商品实例
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val
                    item.antPageHour(_val)
                    # 汇聚
                    # redis
                    self.putItemDB(item)
                    self.push_back(self.items, item.outTupleHour())
                    # 入库
                    updateSql = item.outSqlForUpdate()
                    if updateSql:
                        self.mysqlAccess.updateJhsItem(updateSql)

                    hourSql = item.outSqlForHour()
                    _itemhoursql_list.append(hourSql)
                    if self.insertItemhour(_itemhoursql_list): _itemhoursql_list = []
                elif self._q_type == 'update':
                    # 更新商品
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val

                    item.antPageUpdate(_val)
                    # 汇聚
                    # redis
                    self.putItemDB(item)
                    self.push_back(self.items, item.outSqlForUpdate())
                    # 入库
                    updateSql = item.outSqlForUpdate()
                    if updateSql:
                        self.mysqlAccess.updateJhsItem(updateSql)
                elif self._q_type == 'check':
                    # check商品实例
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val
                    item.antPageUpdate(_val)
                    # 汇聚
                    # redis
                    self.putItemDB(item)
                    self.push_back(self.items, item.outSqlForUpdate())
                    # 入库
                    updateSql = item.outSqlForUpdate()
                    if updateSql:
                        self.mysqlAccess.updateJhsItem(updateSql)

                # 存网页
                if item:
                    _pages = item.outItemPage(self._q_type)
                    self.mongofsAccess.insertJHSPages(_pages)

                # 延时
                time.sleep(1)
                # 通知queue, task结束
                self.queue.task_done()

            except Common.NoItemException as e:
                print 'Not item exception :', e
                # 通知queue, task结束
                self.queue.task_done()

            except Common.NoPageException as e:
                print 'Not page exception :', e
                # 通知queue, task结束
                self.queue.task_done()

            except Common.InvalidPageException as e:
                self.crawlRetry(_data)
                print 'Invalid page exception :', e
                # 通知queue, task结束
                self.queue.task_done()

            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()
                self.crawlRetry(_data)
                # 通知queue, task结束
                self.queue.task_done()
                #if str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution') != -1:
                #    print _data
                # 重新拨号
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        print '# DailClient Exception err:', e 
                        time.sleep(10)
                time.sleep(random.uniform(10,40))
Exemplo n.º 6
0
class JHSGroupItemWorker():
    '''A class of JHS group item channel worker'''
    def __init__(self):
        # jhs group item type
        self.worker_type    = Config.JHS_GroupItem

        self.jhs_type       = Config.JHS_TYPE   # queue type

        # message
        self.message        = Message()

        # 获取Json数据
        self.jsonpage       = Jsonpage()

        # 抓取设置
        self.crawler        = TBCrawler()

        # 抓取时间设定
        self.crawling_time  = Common.now() # 当前爬取时间
        self.begin_time     = Common.now()
        self.begin_date     = Common.today_s()
        self.begin_hour     = Common.nowhour_s()

        # DB
        # mysql access
        self.mysqlAccess    = MysqlAccess()

        # redis queue
        self.redisQueue     = RedisQueue()

        # redis access
        self.redisAccess    = RedisAccess()

        # mongodb fs access
        self.mongofsAccess  = MongofsAccess()

    def init_crawl(self, _obj, _crawl_type):
        self._obj          = _obj
        self._crawl_type   = _crawl_type

        # dial client
        self.dial_client   = DialClient()

        # local ip
        self._ip           = Common.local_ip()

        # router tag
        self._router_tag   = 'ikuai'
        #self._router_tag  = 'tpent'

        # items
        self.items         = []

        # giveup items
        self.giveup_items  = []

        # giveup msg val
        self.giveup_val    = None

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back_list(self, L, v):
        L.extend(v)

    def push_back_val(self, L, v):
        L.append(v)

    # To crawl retry
    def crawlRetry(self, _key, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'groupitemcat':
            max_time = Config.json_crawl_retry
        elif _obj == 'groupitem':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retry too many time, no get:", msg

    def crawlPage(self, _key, msg, _val):
        try:
            if self._obj == 'groupitemcat':
                self.run_category(msg, _val)
            else:
                print '# crawlPage unknown obj = %s' % self._obj
        except Common.InvalidPageException as e:
            print '# Invalid page exception:',e
            self.crawlRetry(_key,msg)
        except Common.DenypageException as e:
            print '# Deny page exception:',e
            self.crawlRetry(_key,msg)
            # 重新拨号
            try:
                self.dialRouter(4, 'chn')
            except Exception as e:
                print '# DailClient Exception err:', e
                time.sleep(random.uniform(10,30))
            time.sleep(random.uniform(10,30))
        except Common.SystemBusyException as e:
            print '# System busy exception:',e
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(10,30))
        except Common.RetryException as e:
            print '# Retry exception:',e
            if self.giveup_val:
                msg['val'] = self.giveup_val
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(20,30))
        except Exception as e:
            print '# exception err:',e
            self.crawlRetry(_key,msg)
            # 重新拨号
            if str(e).find('Read timed out') == -1:
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print '# DailClient Exception err:', e
            time.sleep(random.uniform(10,30))
            Common.traceback_log()

    def run_category(self, msg, _val):
        category_val = msg["val"]
        refers = _val
        c_url,c_name,c_id = category_val
        print c_url,c_name,c_id
        page = self.crawler.getData(c_url, refers)
        page_val = (page,c_name,c_id)
        ajax_url_list = self.getAjaxurlList(page_val,c_url)
        if len(ajax_url_list) > 0:
            self.get_jsonitems(ajax_url_list)

    # get json ajax url
    def getAjaxurlList(self, page_val, refers=''):
        url_list = []
        page, c_name, c_id = page_val
        p = re.compile(r'<.+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S)
        i = 0
        for a_info in p.finditer(page):
            c_subNav = c_name
            a_url = a_info.group(1).replace('amp;','')
            info = a_info.group(2)
            m = re.search(r'<span class="l-f-tbox">(.+?)</span>',info,flags=re.S)
            if m:
                c_subNav = m.group(1).strip()
            a_val = (c_id,c_name,refers,c_subNav)
            url_list.append((a_url,refers,a_val))
            i += 1
        return url_list

    # get item json list in category page from ajax url
    def get_jsonitems(self, ajax_url_list):
        # today all items val
        todayall_item_val = []
        # other sub nav items val
        item_list = []
        # process ajax url list
        item_json_index = 0
        # mongo json pages
        cat_pages = {}
        for a_url in ajax_url_list:
            # get json from ajax url
            Result_list = self.jsonpage.get_json([a_url])
            # mongo page json
            _url,_refers,_val = a_url 
            _c_id = _val[0]
            time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time))
            # timeStr_jhstype_webtype_itemgroupcat_catid
            key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1','itemgroupcat',str(_c_id))
            cat_pages[key] = '<!-- url=%s --> %s' % (_url,str(Result_list))

            if Result_list and len(Result_list) > 0:
                item_result_valList = self.jsonpage.parser_itemjson(Result_list)
                if item_result_valList and len(item_result_valList) > 0:
                    item_json_index += 1
                    # the first item list is all online items
                    if item_json_index == 1:
                        if len(item_result_valList) > 0:
                            print '# all online items.....'
                            todayall_item_val = item_result_valList
                    else:
                        self.push_back_list(item_list, item_result_valList)
                else:
                    print '# not get itemjson parse val list...'
        if len(item_list) > 0:
            self.parseItems(item_list)

        # cat pages json 
        for key in cat_pages.keys():
            _pages = (key,cat_pages[key])
            self.mongofsAccess.insertJHSPages(_pages)

    # 解析从接口中获取的商品数据
    def parseItems(self, item_list):
        print '# parse Group Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

        # 附加信息
        a_val = (self.begin_time,)
        # 多线程 控制并发的线程数
        max_th = Config.item_max_th
        if len(item_list) > max_th:
            m_itemsObj = JHSGroupItemParserM(self._crawl_type, max_th, a_val)
        else:
            m_itemsObj = JHSGroupItemParserM(self._crawl_type, len(item_list), a_val)
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_list)
        m_itemsObj.run()

        _items = m_itemsObj.items
        self.push_back_list(self.items,_items)
        print '# queue item num:',len(self.items)
        print '# parse item num:',len(_items)
        print '# parse Group Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def process(self, _obj, _crawl_type, _val=None):
        self.init_crawl(_obj, _crawl_type)
        if _obj == 'groupitem':
            self.processMulti(_val)
        else:
            self.processOne(_val)

    def processOne(self, _val=None):
        i, M = 0, 10
        n = 0
        while True: 
            try:
                if self._crawl_type and self._crawl_type != '':
                    _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type)
                else:
                    _key = '%s_%s' % (self.jhs_type, self._obj)
                _msg = self.redisQueue.get_q(_key)

                # 队列为空
                if not _msg:
                    i += 1
                    if i > M:
                        print '# all get catQ item num:',n
                        print '# not get catQ of key:',_key
                        break
                    time.sleep(10)
                    continue
                n += 1
                self.crawlPage(_key, _msg, _val)

            except Exception as e:
                print '# exception err in process of JHSGroupItemWorker:',e,_key,_msg

    def processMulti(self, _val=None):
        if self._crawl_type and self._crawl_type != '':
            _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type)
        else:
            _key = '%s_%s' % (self.jhs_type, self._obj)

        try:
            self.crawlPageMulti(_key, _val)
        except Exception as e:
            print '# exception err in processMulti of JHSGroupItemWorker: %s, key: %s' % (e,_key)

    # To crawl page
    def crawlPageMulti(self, _key, _val):
        if self._obj == 'groupitem':
            self.run_groupitem(_key, _val)
        else:
            print '# crawlPageMulti unknown obj = %s' % self._obj

    def run_groupitem(self, _key, _val):
        m_itemQ = JHSGroupItemQM(self._obj, self._crawl_type, 20, _val)
        m_itemQ.createthread()
        m_itemQ.run()
        item_list = m_itemQ.items
        print '# crawl Items num: %d' % len(item_list)

    # 删除redis数据库过期商品
    def delItem(self, _items):
        for _item in _items:
            keys = [self.worker_type, _item["item_juId"]]
            
            item = self.redisAccess.read_jhsitem(keys)
            if item:
                end_time = item["end_time"]
                now_time = Common.time_s(self.begin_time)
                # 删除过期的商品
                if now_time > end_time: self.redisAccess.delete_jhsitem(keys)

    # 把商品信息存入redis数据库中
    def putItemDB(self, _items):
        for _item in _items:
            # 忽略已经存在的商品ID
            keys = [self.worker_type, _item["item_juId"]]
            if self.redisAccess.exist_jhsitem(keys): continue

            # 将商品基础数据写入redis
            item_val = self.message.itemInfo(_item["r_val"])
            val = self.message.itemMsg(item_val)
            self.redisAccess.write_jhsitem(keys, val)

    # 更新商品信息
    def updateItem(self, _item):
        keys = [self.worker_type, _item["item_juId"]]

        item = self.redisAccess.read_jhsitem(keys)
        if item:
            item_val = self.message.itemParseInfo(_item["r_val"])
            c = False
            if item["start_time"] != item_val["start_time"]:
                item["start_time"] = item_val["start_time"]
                c = True
            if item["end_time"] != item_val["end_time"]:
                item["end_time"] = item_val["end_time"]
                c = True
            if c:
                self.redisAccess.write_jhsitem(keys, item)

    # 查找新商品
    def selectNewItems(self, _items):
        new_items = []
        for _item in _items:
            keys = [self.worker_type, _item["item_juId"]]
            if self.redisAccess.exist_jhsitem(keys): 
                self.updateItem(_item)
                continue
            new_items.append(_item["val"])
        return new_items

    def scanEndItems(self):
        val = (Common.time_s(self.crawling_time),)
        _items = self.mysqlAccess.selectJhsGroupItemEnd(val)
        end_items = []
        # 遍历商品
        for _item in _items:
            item_juid = _item[0]
            end_items.append({"item_juId":str(item_juid)})
        print '# del item nums:',len(end_items)
        # 删除已经结束的商品
        self.delItem(end_items)

    def scanEndItemsLasthour(self):
        val = (Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -1))
        _items = self.mysqlAccess.selectJhsGroupItemEndLastOneHour(val)
        end_items = []
        # 遍历商品
        for _item in _items:
            item_juid = _item[0]
            end_items.append({"item_juId":str(item_juid)})
        print '# del item nums for last hour end:',len(end_items)
        # 删除已经结束的商品
        self.delItem(end_items)
            
    def scanAliveItems(self):
        # 到结束时间后的一个小时
        val = (Common.time_s(self.crawling_time), Common.add_hours(self.crawling_time, -1))
        # 查找已经开团但是没有结束的商品
        _items = self.mysqlAccess.selectJhsGroupItemAlive(val)
        print "# hour all item nums:",len(_items)
        return _items

    def scanNotEndItems(self):
        val = (Common.time_s(self.crawling_time),)
        # 查找没有结束的商品
        _items = self.mysqlAccess.selectJhsGroupItemNotEnd(val)
        i = 1
        for _item in _items:
            print i
            item_juid = str(_item[1])
            keys = [self.worker_type, item_juid]

            item = self.redisAccess.read_jhsitem(keys)
            print item
            #_new_item = {"crawling_time":item["crawling_time"],"item_juid":item["item_juId"],"groupcat_id":item["item_groupCatId"],"groupcat_name":item["item_groupCatName"],"item_ju_url":item["item_ju_url"],"item_juname":item["item_juName"],"item_id":item["item_id"],"start_time":item["start_time"],"end_time":item["end_time"]}
            #self.redisAccess.write_jhsitem(keys, _new_item)
            i += 1

    def scanCategories(self):
        category_list = self.mysqlAccess.selectJhsGroupItemCategory()
        return category_list