Exemplos de RetryCrawler em Python, exemplos de RetryCrawler.RetryCrawler em Python

Exemplo n.º 1

0

Exibir arquivo

    def __init__(self):
        '''Create the two crawlers and reset every channel field to an empty default.'''
        # Crawl settings
        self.crawler = XCCrawler()
        self.retrycrawler = RetryCrawler()
        current = Common.now()
        self.crawling_time = current  # time of the current crawl
        self.crawling_time_s = Common.time_s(current)
        # Start time, date and hour of this crawl run (blank for now)
        self.crawling_begintime = self.crawling_beginDate = self.crawling_beginHour = ''

        # Channel information
        self.platform = '携程-pc'  # platform the channel belongs to
        # Channel id, link, name and type (blank for now)
        self.channel_id = self.channel_url = self.channel_name = self.channel_type = ''

        # Geographic location of the channel: province/state id and name
        self.province_id, self.province_name = 0, ''

        # Raw data
        self.channel_page = ''  # HTML content of the channel page
        self.channel_pages = {}  # data requested from within the channel page

        # Channel items and channel list (two independent lists)
        self.channel_items, self.channel_list = [], []

Exemplo n.º 2

0

Exibir arquivo

Arquivo: JHSActPosition.py Projeto: xzhoutxd/jhs_v1

    def __init__(self):
        '''Create the DB access, crawler and parsers, and reset the collected state.'''
        self.mysqlAccess = MysqlAccess()  # mysql
        self.crawler = RetryCrawler()  # crawl settings
        self.brand_temp = JHSBrandTEMP()  # page template parsing
        self.jsonpage = Jsonpage()  # JSON data retrieval

        # Brand groups listed on the home page, and the top promotion slots
        # of the brand-group page (two independent dicts)
        self.home_brands, self.top_brands = {}, {}

        # Page contents: Juhuasuan home page, then Juhuasuan brand-group page
        self.ju_home_page = self.ju_brand_page = ''

        # Crawl start time
        self.begin_time = Common.now()

Exemplo n.º 3

0

Exibir arquivo

Arquivo: XCChannel.py Projeto: xzhoutxd/xc_ly

    def __init__(self):
        '''Create the two crawlers and reset every channel field to an empty default.'''
        # Crawl settings
        self.crawler = XCCrawler()
        self.retrycrawler = RetryCrawler()
        current = Common.now()
        self.crawling_time = current  # time of the current crawl
        self.crawling_time_s = Common.time_s(current)
        # Start time, date and hour of this crawl run (blank for now)
        self.crawling_begintime = self.crawling_beginDate = self.crawling_beginHour = ''

        # Channel information
        self.platform = '携程-pc'  # platform the channel belongs to
        # Channel id, link, name and type (blank for now)
        self.channel_id = self.channel_url = self.channel_name = self.channel_type = ''

        # Geographic location of the channel: province/state id and name
        self.province_id, self.province_name = 0, ''

        # Raw data
        self.channel_page = ''  # HTML content of the channel page
        self.channel_pages = {}  # data requested from within the channel page

        # Channel items and channel list (two independent lists)
        self.channel_items, self.channel_list = [], []

Exemplo n.º 4

0

Exibir arquivo

Arquivo: JHSBrand.py Projeto: xzhoutxd/jhs_v1

    def __init__(self, m_type):
        '''Create the crawler, DB access, queues and worker; store the host flag.

        m_type: distributed-host flag, stored unchanged on the instance.
        '''
        self.crawler = RetryCrawler()  # crawl settings
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.cat_queue = JHSQ('cat', 'main')  # cat queue
        self.act_queue = JHSQ('act', 'main')  # act queue
        self.work = JHSWorker()

        # Default categories, one (url, name, id) tuple per category
        self.category_list = [
            ("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav", "女装", "1000"),
            ("http://ju.taobao.com/jusp/nanzhuangpindao/tp.htm#J_FixedNav", "男装", "7000"),
            ("http://ju.taobao.com/jusp/xiebaopindao/tp.htm#J_FixedNav", "鞋包", "3000"),
            ("http://ju.taobao.com/jusp/neiyipindao/tp.htm#J_FixedNav", "内衣", "4000"),
            ("http://ju.taobao.com/jusp/zhubaoshipin/tp.htm#J_FixedNav", "饰品", "42000"),
            ("http://ju.taobao.com/jusp/yundongpindao/tp.htm#J_FixedNav", "运动", "38000"),
            ("http://ju.taobao.com/jusp/meizhuangpindao/tp.htm#J_FixedNav", "美妆", "2000"),
            ("http://ju.taobao.com/jusp/tongzhuangpindao/tp.htm#J_FixedNav", "童装", "23000"),
            ("http://ju.taobao.com/jusp/shipinpindao/tp.htm#J_FixedNav", "零食", "5000"),
            ("http://ju.taobao.com/jusp/muyingpindao/tp.htm#J_FixedNav", "母婴", "6000"),
            ("http://ju.taobao.com/jusp/baihuopindao/tp.htm#J_FixedNav", "百货", "37000"),
            ("http://ju.taobao.com/jusp/chepinpindao/tp.htm#J_FixedNav", "汽车", "36000"),
            ("http://ju.taobao.com/jusp/jiadianpindao/tp.htm#J_FixedNav", "家电", "34000"),
            ("http://ju.taobao.com/jusp/shumapindao/tp.htm#J_FixedNav", "数码", "43000"),
            ("http://ju.taobao.com/jusp/jiajunewpindao/tp.htm#J_FixedNav", "家装", "225000"),
            ("http://ju.taobao.com/jusp/jiajupindao/tp.htm#J_FixedNav", "家纺", "35000"),
        ]

        self.site_page = None  # page
        self.begin_time = Common.now()  # crawl start time
        self.m_type = m_type  # distributed-host flag

Exemplo n.º 5

0

Exibir arquivo

Arquivo: JHSBrand.py Projeto: xzhoutxd/jhs_v1

class JHSBrand():
    '''A class of JHS category channel'''
    def __init__(self, m_type):
        '''Create the crawler, DB access, queues and worker; store the host flag.

        m_type: distributed-host flag; antPage() treats 'm' as the master host.
        '''
        # Crawl settings
        self.crawler = RetryCrawler()

        # DB
        self.mysqlAccess = MysqlAccess()     # mysql access

        # cat queue
        self.cat_queue = JHSQ('cat', 'main')

        # act queue
        self.act_queue = JHSQ('act', 'main')

        self.work = JHSWorker()

        # Default categories as (url, name, id); antPage() falls back to them
        # when the database returns no categories
        self.category_list = [
                ("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav","女装","1000"),
                ("http://ju.taobao.com/jusp/nanzhuangpindao/tp.htm#J_FixedNav","男装","7000"),
                ("http://ju.taobao.com/jusp/xiebaopindao/tp.htm#J_FixedNav","鞋包","3000"),
                ("http://ju.taobao.com/jusp/neiyipindao/tp.htm#J_FixedNav","内衣","4000"),
                ("http://ju.taobao.com/jusp/zhubaoshipin/tp.htm#J_FixedNav","饰品","42000"),
                ("http://ju.taobao.com/jusp/yundongpindao/tp.htm#J_FixedNav","运动","38000"),
                ("http://ju.taobao.com/jusp/meizhuangpindao/tp.htm#J_FixedNav","美妆","2000"),
                ("http://ju.taobao.com/jusp/tongzhuangpindao/tp.htm#J_FixedNav","童装","23000"),
                ("http://ju.taobao.com/jusp/shipinpindao/tp.htm#J_FixedNav","零食","5000"),
                ("http://ju.taobao.com/jusp/muyingpindao/tp.htm#J_FixedNav","母婴","6000"),
                ("http://ju.taobao.com/jusp/baihuopindao/tp.htm#J_FixedNav","百货","37000"),
                ("http://ju.taobao.com/jusp/chepinpindao/tp.htm#J_FixedNav","汽车","36000"),
                ("http://ju.taobao.com/jusp/jiadianpindao/tp.htm#J_FixedNav","家电","34000"),
                ("http://ju.taobao.com/jusp/shumapindao/tp.htm#J_FixedNav","数码","43000"),
                ("http://ju.taobao.com/jusp/jiajunewpindao/tp.htm#J_FixedNav","家装","225000"),
                ("http://ju.taobao.com/jusp/jiajupindao/tp.htm#J_FixedNav","家纺","35000")
                ]

        # Page
        self.site_page = None

        # Crawl start time
        self.begin_time = Common.now()

        # Distributed-host flag
        self.m_type = m_type

    @staticmethod
    def _timestamp():
        '''Return the current local time as used in the progress messages.'''
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def antPage(self):
        '''Prepare the queues (master host only), run the 'cat' job and queue its items.

        Every exception is caught, printed and handed to Common.traceback_log();
        nothing is re-raised.
        '''
        try:
            # The master host has to set up the redis queues
            if self.m_type == 'm':
                # Fall back to the built-in defaults when the DB yields nothing
                category_list = self.mysqlAccess.selectJhsGroupItemCategory() or self.category_list
                if category_list:
                    # Note the reordering: (url, id, name, ...) from (url, name, id)
                    cate_val_list = [
                        (cate[0], cate[2], cate[1], Config.ju_home_today, Config.JHS_GroupItem)
                        for cate in category_list
                    ]
                    # Empty the category redis queue, then store the new entries
                    self.cat_queue.clearQ()
                    self.cat_queue.putlistQ(cate_val_list)

                    # Empty the act redis queue
                    self.act_queue.clearQ()
                    print('# category queue end: %s' % self._timestamp())
                else:
                    print('# not find category...')

            # Activity JSON of the categories
            obj = 'cat'
            crawl_type = 'main'
            # Ids of the activities that have not started yet
            val = (Common.time_s(Common.now()),)
            acts = self.mysqlAccess.selectJhsActNotStart(val)
            brandact_id_list = [str(act[1]) for act in acts] if acts else []
            _val = (self.begin_time, brandact_id_list)
            self.work.process(obj, crawl_type, _val)

            # Activity data
            act_val_list = self.work.items
            print('# act nums: %s' % len(act_val_list))

            # Store in the redis queue
            self.act_queue.putlistQ(act_val_list)
            print('# act queue end: %s' % self._timestamp())

            if self.m_type == 'm':
                val = (Common.add_hours(self.begin_time, -2),Common.add_hours(self.begin_time, -2),Common.add_hours(self.begin_time, -1))
                # Remove from Redis the activities that ended during the last hour
                _acts = self.mysqlAccess.selectJhsActEndLastOneHour(val)
                print('# end acts num: %s' % len(_acts))
                self.work.delAct(_acts)
                # Remove from Redis the items that ended during the last hour
                _items = self.mysqlAccess.selectJhsItemEndLastOneHour(val)
                print('# end items num: %s' % len(_items))
                self.work.delItem(_items)
        except Exception as e:
            print('# antpage error : %s' % (e,))
            Common.traceback_log()

    # Group-item channel
    def categoryListTEMP(self):
        '''Fetch the "today" page and return its category list.

        Returns a list of (url, name) tuples; the list is empty when the page
        could not be fetched or matches neither known layout.
        '''
        page = self.crawler.getData(Config.ju_home_today, Config.ju_home)
        if not page:
            print('# not get today page')
        return self._parse_category_page(page)

    def _parse_category_page(self, page):
        '''Extract (url, name) category tuples from the HTML of the "today" page.

        An empty or None page yields [] instead of being passed to the regex
        engine (re.search raises TypeError on None).
        '''
        if not page:
            return []
        # Layout 1: categories inside the table of the left column
        m = re.search(r'<div class="J_CatLeft layout-left">.+?<table>(.+?)</table>.+?</div>',page,flags=re.S)
        if m:
            return self.categoryListType1(m.group(1))
        # Layout 2: categories inside the horizontal menu list
        m = re.search(r'<div class="catbg">\s+<div class="ju-wrapper">\s+<div class="cat-menu-h".+?>.+?<ul class="clearfix">(.+?)</ul>',page,flags=re.S)
        if m:
            return self.categoryListType2(m.group(1))
        return []

    def categoryListType1(self,page):
        '''Return (url, name) for every link between the first two "h2" table rows.

        Returns [] when that pair of rows is not found in page.
        '''
        m = re.search(r'<tr class="h2">.+?</tr>(.+?)<tr class="h2">',page,flags=re.S)
        if not m:
            return []
        return self.categoryListType2(m.group(1))

    def categoryListType2(self,page):
        '''Return (url, stripped link text) for every anchor found in page.'''
        p = re.compile(r'<a.+?href="(.+?)".+?>(.+?)</a>',flags=re.S)
        return [(cate.group(1), cate.group(2).strip()) for cate in p.finditer(page)]

Exemplo n.º 6

0

Exibir arquivo

Arquivo: JHSActPosition.py Projeto: xzhoutxd/jhs_v1

class JHSActPosition():
    '''A class of brand position'''
    def __init__(self):
        '''Create the DB access, crawler and parsers, and reset the collected state.'''
        # mysql
        self.mysqlAccess = MysqlAccess()

        # Crawl settings
        self.crawler = RetryCrawler()

        # Page template parsing
        self.brand_temp = JHSBrandTEMP()

        # JSON data retrieval
        self.jsonpage = Jsonpage()

        # Brand groups listed on the home page
        self.home_brands = {}

        # Top promotion slots of the brand-group page
        self.top_brands = {}

        # Page contents
        self.ju_home_page = '' # Juhuasuan home page
        self.ju_brand_page = '' # Juhuasuan brand-group page

        # Crawl start time
        self.begin_time = Common.now()

    @staticmethod
    def _timestamp():
        '''Return the current local time as used in the progress messages.'''
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    @staticmethod
    def _position_info(brands, brandact_id, brandact_url, keys):
        '''Return the values of `keys` from the entry of `brands` for an activity.

        The entry is looked up by str(brandact_id) first and by brandact_url
        second; when neither is present a tuple of None of the same length is
        returned.
        '''
        for lookup in (str(brandact_id), brandact_url):
            if lookup in brands:
                entry = brands[lookup]
                return tuple(entry[key] for key in keys)
        return (None,) * len(keys)

    def antPage(self):
        '''Collect the home-page brand groups, then process the brand-group list page.

        Every exception is caught, printed and handed to Common.traceback_log();
        nothing is re-raised.
        '''
        try:
            # Brand groups shown on the home page
            page = self.crawler.getData(Config.ju_home, Config.tmall_home)
            hb = JHSHomeBrand()
            hb.antPage(page)
            if not hb.home_brands:
                # Nothing parsed from the home page: retry with the "today" page
                page = self.crawler.getData(Config.ju_home_today, Config.ju_home)
                hb.antPage(page)
            self.home_brands = hb.home_brands
            page_datepath = 'act/position/' + time.strftime("%Y/%m/%d/%H/", time.localtime(self.begin_time))
            Config.writefile(page_datepath,'home.htm',page)

            # Data of the brand-group list page
            page = self.crawler.getData(Config.ju_brand_home, Config.ju_home)
            self.activityList(page)
        except Exception as e:
            print('# exception err in antPage info: %s' % (e,))
            Common.traceback_log()

    # Brand-group list
    def activityList(self, page):
        '''Parse the brand-group page, fetch its JSON data and hand the activities on.

        Raises Common.InvalidPageException when page is empty or None.
        '''
        if not page:
            raise Common.InvalidPageException("# brand activityList: not get JHS brand home.")
        self.ju_brand_page = page
        # Save the html file
        page_datepath = 'act/marketing/' + time.strftime("%Y/%m/%d/%H/", time.localtime(self.begin_time))
        Config.writefile(page_datepath,'brand.htm',self.ju_brand_page)

        # Data-interface URL list
        self.top_brands = self.brand_temp.activityTopbrandTemp(page)

        b_url_valList = self.brand_temp.activityListTemp(page)
        if not b_url_valList:
            print('# err: not find activity json data URL list.')
            return

        # Data list obtained from the interface
        json_valList = [
            (b_url, Config.ju_brand_home, (f_catid, f_name))
            for b_url, f_name, f_catid in b_url_valList
        ]
        bResult_list = self.jsonpage.get_json(json_valList)

        act_valList = []
        if bResult_list:
            a_val = (Config.JHS_Brand,'',self.begin_time,)
            act_valList = self.jsonpage.parser_brandjson(bResult_list,a_val)

        if not act_valList:
            print('# err: not get brandjson parser val list.')
            return
        print('# get brand act num: %s' % len(act_valList))
        self.run_brandAct(act_valList)

    def run_brandAct(self, act_valList):
        '''Process the activities with JHSActM and store their position data.

        act_valList: activity values handed to JHSActM.putItems().
        Activities whose sign equals 3 are skipped and repeated ids are only
        counted; for the rest the home-page and top-slot position fields are
        appended to the value tuple before it is inserted in batches.
        '''
        repeatact_num = 0
        # Number of activities
        act_num = 0
        # Values of the activities that have to be saved
        act_sql_list = []
        # Ids already seen, used to drop repeated activities
        brandact_id_dict = {}
        print('# brand activities start: %s' % self._timestamp())
        # Multi-threading: cap the number of concurrent threads
        m_Obj = JHSActM(5, min(len(act_valList), Config.act_max_th))
        m_Obj.putItems(act_valList)
        m_Obj.createthread()
        m_Obj.run()

        for b in m_Obj.items:
            act_num += 1
            brandact_id,brandact_name,brandact_url,brandact_sign,val = b
            if int(brandact_sign) == 3:
                continue
            # De-duplicate
            if str(brandact_id) in brandact_id_dict:
                repeatact_num += 1
                print('# repeat brand act. activity id:%s name:%s'%(brandact_id, brandact_name))
                continue
            brandact_id_dict[str(brandact_id)] = brandact_name
            val = val + self._position_info(
                self.home_brands, brandact_id, brandact_url,
                ("position", "datatype", "typename"))
            val = val + self._position_info(
                self.top_brands, brandact_id, brandact_url,
                ("position", "datatype"))
            act_sql_list.append(val)
        print('# brand activities end: %s' % self._timestamp())

        # Store the brand-group activity positions in the database,
        # at most Config.act_max_arg rows per insert
        actsql_list = []
        for sql in act_sql_list:
            actsql_list.append(sql)
            if len(actsql_list) >= Config.act_max_arg:
                self.mysqlAccess.insertJhsActPosition(actsql_list)
                actsql_list = []
        if actsql_list:
            self.mysqlAccess.insertJhsActPosition(actsql_list)

        print('# Find act num: %s' % act_num)
        print('# Repeat brand activity num: %s' % repeatact_num)

Exemplo n.º 7

0

Exibir arquivo

Arquivo: XCChannel.py Projeto: xzhoutxd/xc_ly

class Channel():
    '''A class of XC channel'''
    def __init__(self):
        '''Create the two crawlers and reset every channel field to an empty default.'''
        # Crawl settings
        self.crawler            = XCCrawler()
        self.retrycrawler       = RetryCrawler()
        self.crawling_time      = Common.now() # time of the current crawl
        self.crawling_time_s    = Common.time_s(self.crawling_time)
        self.crawling_begintime = '' # start time of this crawl run
        self.crawling_beginDate = '' # date of this crawl run
        self.crawling_beginHour = '' # hour of this crawl run

        # Channel information
        self.platform           = '携程-pc' # platform the channel belongs to
        self.channel_id         = '' # channel id
        self.channel_url        = '' # channel link
        self.channel_name       = '' # channel name
        self.channel_type       = '' # channel type

        # Geographic location of the channel
        self.province_id        = 0  # province/state id
        self.province_name      = '' # province/state name

        # Raw data
        self.channel_page       = '' # HTML content of the channel page
        self.channel_pages      = {} # data requested from within the channel page

        # channel items
        self.channel_items      = []

        # channel list
        self.channel_list       = []

    # Channel page initialisation
    def init(self, channel_id, channel_url, channel_type, begin_time):
        '''Store the channel identity and derive the crawl date and hour.

        begin_time is passed to time.localtime(), i.e. seconds since the epoch.
        '''
        self.channel_id = channel_id
        self.channel_url = channel_url
        self.channel_type = channel_type
        self.crawling_begintime = begin_time
        begin = time.localtime(self.crawling_begintime)
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin)
        self.crawling_beginHour = time.strftime("%H", begin)

    def _is_spot_channel(self):
        '''Return True when channel_type denotes type 1, as an int or a numeric string.

        channelList() emits the type as a string, so a plain `== 1` comparison
        would miss '1'; values that cannot be converted count as "not type 1".
        '''
        try:
            return int(self.channel_type) == 1
        except (TypeError, ValueError):
            return False

    def config(self):
        '''Fetch the channel page and parse it according to the channel type.'''
        self.channelPage()
        if self._is_spot_channel():
            self.spot()
        else:
            Common.log('# not find this channel type...')

    def spot(self):
        '''Parse the channel name and the items of every result page.

        Does nothing when no channel page has been fetched.
        '''
        if not self.channel_page:
            return

        # Channel name: the selected categories joined with '-'
        m = re.search(r'<div class="cate_select">(.+?)</div>', self.channel_page, flags=re.S)
        if m:
            p = re.compile(r'<a.+?class="select">(.+?)</a>', flags=re.S)
            c_list = [re.sub(r'<.+?>', '', c.group(1)).strip() for c in p.finditer(m.group(1))]
            self.channel_name = '-'.join(c_list)

        i_page = 1
        m_page = 1
        # Prefer the main result area; fall back to the whole page
        m = re.search(r'<div id="base_bd">.+?<div class="bg_miancolor">.+?<div class="vacation_bd">(.+?)<div class="vacation_bd bottom_seo">', self.channel_page, flags=re.S)
        page_main = m.group(1) if m else self.channel_page

        Common.log(i_page)
        i_p = self.get_items(page_main, 1)

        # Total page count, read from the "current/total" indicator
        m = re.search(r'<span class="c_page2_numtop">(.+?)</span>', self.channel_page, flags=re.S)
        if m:
            m = re.search(r'\d+/(\d+)', m.group(1), flags=re.S)
            if m:
                m_page = int(m.group(1))

        # The last character of the channel URL is replaced by 'P<n>/'. Only the
        # suffix is %-formatted, so a '%' inside the URL itself stays untouched.
        url_prefix = self.channel_url[0:-1]
        while i_page < m_page:
            i_page += 1
            p_url = url_prefix + 'P%s/' % i_page
            Common.log(i_page)
            page = self.retrycrawler.getData(p_url, self.channel_url)
            i_p = self.get_items(page, i_p)

    def get_items(self, page_main, i_p):
        '''Append every item found in page_main to self.channel_items.

        page_main: HTML to scan; a falsy value is skipped.
        i_p: position number given to the first item found.
        Returns the position number to use for the next item.
        '''
        if not page_main:
            return i_p
        p = re.compile(r'<div class="searchresult_product04">\s+<div class="search_ticket_caption basefix">\s+<a href="(.+?)".+?>\s+<img src="(.+?)".+?/>.+?<div class="search_ticket_title">\s+<h2>\s+<a.+?>(.+?)</a>.+?</h2>\s+<div class="adress">(.+?)</div>\s+<div class="exercise">(.*?)</div>', flags=re.S)
        is_spot = self._is_spot_channel()
        for info in p.finditer(page_main):
            # Type 1 item links are resolved against Config.xc_piao_home,
            # all others against Config.xc_home
            if is_spot:
                i_url = Config.xc_piao_home + info.group(1)
            else:
                i_url = Config.xc_home + info.group(1)
            i_img, i_name, i_area, i_desc = info.group(2), info.group(3).strip(), info.group(4).strip(), info.group(5).strip()
            i_book = 1
            i_id = 0
            if i_url != '':
                m = re.search(r't(\d+).html', i_url)
                if m:
                    i_id = m.group(1)
                val = (self.channel_id, self.channel_name, self.channel_url, self.channel_type, (i_book, i_id, i_url, i_img, i_name, i_desc, i_area, i_p, self.crawling_begintime))
                self.channel_items.append(val)
            i_p += 1
        return i_p

    def channelList(self):
        '''Fetch the channel page and collect the area sub-channels it links to.

        Each entry appended to self.channel_list is
        (channel_id, channel_name, channel_url, str(channel_type),
        str(province_id), province_name); links without a 'D<digits>' id are
        skipped.
        '''
        self.channelPage()
        if not self.channel_page:
            return
        m = re.search(r'<ul class="search_cate">\s+<li class="cate_content.+?">\s+<span class="b">.+?<span class="area_box">(.+?)</span>', self.channel_page, flags=re.S)
        if not m:
            return
        p = re.compile(r'<a href="(.+?)".+?>(.+?)</a>', flags=re.S)
        for area in p.finditer(m.group(1)):
            channel_url, c_name = Config.xc_piao_home + area.group(1), area.group(2)
            id_match = re.search(r'D(\d+)', channel_url)
            channel_id = id_match.group(1) if id_match else 0
            # The name is the link text up to the first '(' when there is one;
            # assigned on every iteration so no stale value can leak through
            name_match = re.search(r'(.+?)\(', c_name, flags=re.S)
            channel_name = (name_match.group(1) if name_match else c_name).strip()
            if int(channel_id) != 0 and channel_url:
                self.channel_list.append((channel_id, channel_name, channel_url, str(self.channel_type), str(self.province_id), self.province_name))

    def channelPage(self):
        '''Fetch the channel page into self.channel_page and self.channel_pages.

        Does nothing when no channel URL is set.
        Raises Common.InvalidPageException when the crawler returns no content
        (None or an empty string).
        '''
        if not self.channel_url:
            return
        # NOTE(review): refers is computed but never passed to getData(), which
        # always receives Config.xc_home -- confirm which referer is intended.
        refers = Config.xc_home
        if int(self.channel_type) == 1:
            refers = Config.xc_piao_home
        data = self.crawler.getData(self.channel_url, Config.xc_home)
        if not data:
            raise Common.InvalidPageException("# channelPage:not find channel page,channel_id:%s,channel_url:%s"%(str(self.channel_id), self.channel_url))
        self.channel_page = data
        self.channel_pages['channel-home'] = (self.channel_url, data)

    def antPage(self, val):
        '''Initialise from (channel_id, channel_url, channel_type, begin_time) and crawl.'''
        channel_id, channel_url, channel_type, begin_time = val
        self.init(channel_id, channel_url, channel_type, begin_time)
        self.config()

    def antChannelList(self, val):
        '''Set (channel_url, channel_type, province_id, province_name) and list sub-channels.'''
        self.channel_url, self.channel_type, self.province_id, self.province_name = val
        self.channelList()

Exemplo n.º 8

0

Exibir arquivo

class Channel():
    '''A class of XC channel'''
    def __init__(self):
        '''Create the two crawlers and reset every channel field to an empty default.'''
        # Crawl settings
        self.crawler = XCCrawler()
        self.retrycrawler = RetryCrawler()
        self.crawling_time = Common.now()  # time of the current crawl
        self.crawling_time_s = Common.time_s(self.crawling_time)
        self.crawling_begintime = ''  # start time of this crawl run
        self.crawling_beginDate = ''  # date of this crawl run
        self.crawling_beginHour = ''  # hour of this crawl run

        # Channel information
        self.platform = '携程-pc'  # platform the channel belongs to
        self.channel_id = ''  # channel id
        self.channel_url = ''  # channel link
        self.channel_name = ''  # channel name
        self.channel_type = ''  # channel type

        # Geographic location of the channel
        self.province_id = 0  # province/state id
        self.province_name = ''  # province/state name

        # Raw data
        self.channel_page = ''  # HTML content of the channel page
        self.channel_pages = {}  # data requested from within the channel page

        # channel items
        self.channel_items = []

        # channel list
        self.channel_list = []

    # Channel page initialisation
    def init(self, channel_id, channel_url, channel_type, begin_time):
        '''Store the channel identity and derive the crawl date and hour.

        begin_time is passed to time.localtime(), i.e. seconds since the epoch.
        '''
        self.channel_id = channel_id
        self.channel_url = channel_url
        self.channel_type = channel_type
        self.crawling_begintime = begin_time
        begin = time.localtime(self.crawling_begintime)
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin)
        self.crawling_beginHour = time.strftime("%H", begin)

    def _is_spot_channel(self):
        '''Return True when channel_type denotes type 1, as an int or a numeric string.

        channelList() emits the type as a string, so a plain `== 1` comparison
        would miss '1'; values that cannot be converted count as "not type 1".
        '''
        try:
            return int(self.channel_type) == 1
        except (TypeError, ValueError):
            return False

    def config(self):
        '''Fetch the channel page and parse it according to the channel type.'''
        self.channelPage()
        if self._is_spot_channel():
            self.spot()
        else:
            Common.log('# not find this channel type...')

    def spot(self):
        '''Parse the channel name and the items of every result page.

        Does nothing when no channel page has been fetched.
        '''
        if not self.channel_page:
            return

        # Channel name: the selected categories joined with '-'
        m = re.search(r'<div class="cate_select">(.+?)</div>',
                      self.channel_page,
                      flags=re.S)
        if m:
            p = re.compile(r'<a.+?class="select">(.+?)</a>', flags=re.S)
            c_list = [
                re.sub(r'<.+?>', '', c.group(1)).strip()
                for c in p.finditer(m.group(1))
            ]
            self.channel_name = '-'.join(c_list)

        i_page = 1
        m_page = 1
        # Prefer the main result area; fall back to the whole page
        m = re.search(
            r'<div id="base_bd">.+?<div class="bg_miancolor">.+?<div class="vacation_bd">(.+?)<div class="vacation_bd bottom_seo">',
            self.channel_page,
            flags=re.S)
        page_main = m.group(1) if m else self.channel_page

        Common.log(i_page)
        i_p = self.get_items(page_main, 1)

        # Total page count, read from the "current/total" indicator
        m = re.search(r'<span class="c_page2_numtop">(.+?)</span>',
                      self.channel_page,
                      flags=re.S)
        if m:
            m = re.search(r'\d+/(\d+)', m.group(1), flags=re.S)
            if m:
                m_page = int(m.group(1))

        # The last character of the channel URL is replaced by 'P<n>/'. Only the
        # suffix is %-formatted, so a '%' inside the URL itself stays untouched.
        url_prefix = self.channel_url[0:-1]
        while i_page < m_page:
            i_page += 1
            p_url = url_prefix + 'P%s/' % i_page
            Common.log(i_page)
            page = self.retrycrawler.getData(p_url, self.channel_url)
            i_p = self.get_items(page, i_p)

    def get_items(self, page_main, i_p):
        '''Append every item found in page_main to self.channel_items.

        page_main: HTML to scan; a falsy value is skipped.
        i_p: position number given to the first item found.
        Returns the position number to use for the next item.
        '''
        if not page_main:
            return i_p
        p = re.compile(
            r'<div class="searchresult_product04">\s+<div class="search_ticket_caption basefix">\s+<a href="(.+?)".+?>\s+<img src="(.+?)".+?/>.+?<div class="search_ticket_title">\s+<h2>\s+<a.+?>(.+?)</a>.+?</h2>\s+<div class="adress">(.+?)</div>\s+<div class="exercise">(.*?)</div>',
            flags=re.S)
        is_spot = self._is_spot_channel()
        for info in p.finditer(page_main):
            # Type 1 item links are resolved against Config.xc_piao_home,
            # all others against Config.xc_home
            if is_spot:
                i_url = Config.xc_piao_home + info.group(1)
            else:
                i_url = Config.xc_home + info.group(1)
            i_img = info.group(2)
            i_name = info.group(3).strip()
            i_area = info.group(4).strip()
            i_desc = info.group(5).strip()
            i_book = 1
            i_id = 0
            if i_url != '':
                m = re.search(r't(\d+).html', i_url)
                if m:
                    i_id = m.group(1)
                val = (self.channel_id, self.channel_name,
                       self.channel_url, self.channel_type,
                       (i_book, i_id, i_url, i_img, i_name, i_desc, i_area,
                        i_p, self.crawling_begintime))
                self.channel_items.append(val)
            i_p += 1
        return i_p

    def channelList(self):
        '''Fetch the channel page and collect the area sub-channels it links to.

        Each entry appended to self.channel_list is
        (channel_id, channel_name, channel_url, str(channel_type),
        str(province_id), province_name); links without a 'D<digits>' id are
        skipped.
        '''
        self.channelPage()
        if not self.channel_page:
            return
        m = re.search(
            r'<ul class="search_cate">\s+<li class="cate_content.+?">\s+<span class="b">.+?<span class="area_box">(.+?)</span>',
            self.channel_page,
            flags=re.S)
        if not m:
            return
        p = re.compile(r'<a href="(.+?)".+?>(.+?)</a>', flags=re.S)
        for area in p.finditer(m.group(1)):
            channel_url = Config.xc_piao_home + area.group(1)
            c_name = area.group(2)
            id_match = re.search(r'D(\d+)', channel_url)
            channel_id = id_match.group(1) if id_match else 0
            # The name is the link text up to the first '(' when there is one;
            # assigned on every iteration so no stale value can leak through
            name_match = re.search(r'(.+?)\(', c_name, flags=re.S)
            channel_name = (name_match.group(1) if name_match else c_name).strip()
            if int(channel_id) != 0 and channel_url:
                self.channel_list.append(
                    (channel_id, channel_name, channel_url,
                     str(self.channel_type), str(self.province_id),
                     self.province_name))

    def channelPage(self):
        '''Fetch the channel page into self.channel_page and self.channel_pages.

        Does nothing when no channel URL is set.
        Raises Common.InvalidPageException when the crawler returns no content
        (None or an empty string).
        '''
        if not self.channel_url:
            return
        # NOTE(review): refers is computed but never passed to getData(), which
        # always receives Config.xc_home -- confirm which referer is intended.
        refers = Config.xc_home
        if int(self.channel_type) == 1:
            refers = Config.xc_piao_home
        data = self.crawler.getData(self.channel_url, Config.xc_home)
        if not data:
            raise Common.InvalidPageException(
                "# channelPage:not find channel page,channel_id:%s,channel_url:%s"
                % (str(self.channel_id), self.channel_url))
        self.channel_page = data
        self.channel_pages['channel-home'] = (self.channel_url, data)

    def antPage(self, val):
        '''Initialise from (channel_id, channel_url, channel_type, begin_time) and crawl.'''
        channel_id, channel_url, channel_type, begin_time = val
        self.init(channel_id, channel_url, channel_type, begin_time)
        self.config()

    def antChannelList(self, val):
        '''Set (channel_url, channel_type, province_id, province_name) and list sub-channels.'''
        self.channel_url, self.channel_type, self.province_id, self.province_name = val
        self.channelList()

Exemplo n.º 9

0

Exibir arquivo

Arquivo: JHSGroupItem.py Projeto: xzhoutxd/jhs_v1

class JHSGroupItem():
    '''Coordinator for crawling the JHS group-item channel.

    On the master host (``m_type == 'm'``) it fills the category queue, then
    every host lets the worker process the queue, detects newly listed items,
    crawls their details and stores them through the worker.
    '''

    # Anchor pattern shared by both category-list layouts: (href, link text).
    _CATEGORY_LINK_RE = re.compile(r'<a.+?href="(.+?)".+?>(.+?)</a>', flags=re.S)

    def __init__(self, m_type):
        # Distributed-host flag; 'm' enables the master-only steps in antPage.
        self.m_type = m_type

        # Crawler settings
        self.crawler = RetryCrawler()

        # Category queue
        self.cat_queue = JHSQ('groupitemcat', 'main')

        self.worker = JHSGroupItemWorker()

        # Default categories, used by antPage when the worker's scan returns
        # nothing. Each entry is (url, name, code); the third field is
        # presumably the category id -- TODO confirm.
        self.category_list = [
                ("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav","女装","1000"),
                ("http://ju.taobao.com/jusp/nanzhuangpindao/tp.htm#J_FixedNav","男装","7000"),
                ("http://ju.taobao.com/jusp/xiebaopindao/tp.htm#J_FixedNav","鞋包","3000"),
                ("http://ju.taobao.com/jusp/neiyipindao/tp.htm#J_FixedNav","内衣","4000"),
                ("http://ju.taobao.com/jusp/zhubaoshipin/tp.htm#J_FixedNav","饰品","42000"),
                ("http://ju.taobao.com/jusp/yundongpindao/tp.htm#J_FixedNav","运动","38000"),
                ("http://ju.taobao.com/jusp/meizhuangpindao/tp.htm#J_FixedNav","美妆","2000"),
                ("http://ju.taobao.com/jusp/tongzhuangpindao/tp.htm#J_FixedNav","童装","23000"),
                ("http://ju.taobao.com/jusp/shipinpindao/tp.htm#J_FixedNav","零食","5000"),
                ("http://ju.taobao.com/jusp/muyingpindao/tp.htm#J_FixedNav","母婴","6000"),
                ("http://ju.taobao.com/jusp/baihuopindao/tp.htm#J_FixedNav","百货","37000"),
                ("http://ju.taobao.com/jusp/chepinpindao/tp.htm#J_FixedNav","汽车","36000"),
                ("http://ju.taobao.com/jusp/jiadianpindao/tp.htm#J_FixedNav","家电","34000"),
                ("http://ju.taobao.com/jusp/shumapindao/tp.htm#J_FixedNav","数码","43000"),
                ("http://ju.taobao.com/jusp/jiajunewpindao/tp.htm#J_FixedNav","家装","225000"),
                ("http://ju.taobao.com/jusp/jiajupindao/tp.htm#J_FixedNav","家纺","35000")
                ]

        # Page
        self.site_page  = None

        # Crawl start time
        self.crawling_time = Common.now() # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()

    @staticmethod
    def _now_s():
        '''Return the current local time as "YYYY-mm-dd HH:MM:SS".'''
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    @classmethod
    def _parse_category_links(cls, html):
        '''Return a list of (href, stripped link text) for every anchor in html.'''
        return [(link.group(1), link.group(2).strip())
                for link in cls._CATEGORY_LINK_RE.finditer(html)]

    def antPage(self):
        '''Run one crawl round; any exception is printed and logged, not raised.'''
        try:
            # The master host has to (re)build the redis category queue.
            if self.m_type == 'm':
                # Fall back to the built-in defaults when the scan finds nothing.
                category_list = self.worker.scanCategories() or self.category_list
                if category_list:
                    # Clear, then refill, the category redis queue.
                    self.cat_queue.clearQ()
                    self.cat_queue.putlistQ(category_list)
                    print('# groupitem category queue end: %s' % self._now_s())
                else:
                    print('# groupitem not find category...')

            obj = 'groupitemcat'
            crawl_type = 'main'
            self.worker.process(obj, crawl_type, Config.ju_home_today)
            items = self.worker.items
            print('# all parser items num: %s' % len(items))
            # Find newly listed items and crawl them.
            self.get_newitems(items)

            if self.m_type == 'm':
                # Remove finished items from Redis (last-hour variant;
                # worker.scanEndItems() was the previously used alternative).
                self.worker.scanEndItemsLasthour()
        except Exception as e:
            # Top-level boundary of the crawl round: report and keep going.
            print('# antpage error : %s' % (e,))
            Common.traceback_log()

    # Find newly listed items and crawl their details
    def get_newitems(self, items):
        '''Select the new items among `items`, crawl them and store the result.

        Each element of `items` is unpacked as (status, item_val, o_val);
        item_val[1] is used as the item's ju id.
        '''
        candidates = [
            {"item_juId": str(item_val[1]), "val": o_val, "r_val": item_val}
            for _status, item_val, o_val in items
        ]
        new_item_list = self.worker.selectNewItems(candidates)
        print('# new items num: %s' % len(new_item_list))

        # Crawl the new items; the extra value tuple carries the start time.
        itemcrawl_type = 'new'
        a_val = (self.begin_time,)
        crawled = self.crawlNewItems(new_item_list, itemcrawl_type, a_val)

        # Hand the crawled rows to the worker for storage; row[1] is the ju id.
        new_items = [{"item_juId": row[1], "r_val": row} for row in crawled]
        self.worker.putItemDB(new_items)

    # Crawl the details of newly listed items
    def crawlNewItems(self, _new_items, itemcrawl_type, a_val):
        '''Crawl `_new_items` with the multi-threaded crawler and return its items.

        The thread count is the number of items, capped at Config.item_max_th.
        '''
        print('# crawl Group Items start: %s' % self._now_s())
        thread_num = min(len(_new_items), Config.item_max_th)
        m_itemsObj = JHSGroupItemCrawlerM(itemcrawl_type, thread_num, a_val)
        m_itemsObj.createthread()
        m_itemsObj.putItems(_new_items)
        m_itemsObj.run()

        _items = m_itemsObj.items
        print('# insert new item num: %s' % len(_items))
        print('# crawl Group Items end: %s' % self._now_s())
        return _items

    # Group-item channel categories
    def categoryListTEMP(self):
        '''Fetch today's page and return its categories as (url, name) tuples.

        Two page layouts are recognised; an empty list is returned when the
        page cannot be fetched or matches neither layout.
        '''
        page = self.crawler.getData(Config.ju_home_today, Config.ju_home)
        if not page:
            # Previously execution continued and re.search() raised on None.
            print('# not get today page')
            return []

        m = re.search(r'<div class="J_CatLeft layout-left">.+?<table>(.+?)</table>.+?</div>',page,flags=re.S)
        if m:
            return self.categoryListType1(m.group(1))

        m = re.search(r'<div class="catbg">\s+<div class="ju-wrapper">\s+<div class="cat-menu-h".+?>.+?<ul class="clearfix">(.+?)</ul>',page,flags=re.S)
        if m:
            return self.categoryListType2(m.group(1))

        return []

    def categoryListType1(self,page):
        '''Parse the table layout: links between the first two "h2" rows.'''
        m = re.search(r'<tr class="h2">.+?</tr>(.+?)<tr class="h2">',page,flags=re.S)
        if not m:
            return []
        return self._parse_category_links(m.group(1))

    def categoryListType2(self,page):
        '''Parse the menu layout: every link found in `page`.'''
        return self._parse_category_links(page)

    def getAjaxurlList(self,page_val,refers):
        '''Collect the ajax URLs declared in a category page.

        `page_val` is (page html, category name, category id). Returns a list
        of (ajax_url, refers, (c_id, c_name, refers, sub_nav)) where sub_nav is
        the text of the "l-f-tbox" span, or the category name when absent.
        '''
        url_list = []
        page, c_name, c_id = page_val
        p = re.compile(r'<.+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S)
        for a_info in p.finditer(page):
            # Dropping 'amp;' turns the HTML-escaped '&amp;' back into '&'.
            a_url = a_info.group(1).replace('amp;','')
            c_subNav = c_name
            m = re.search(r'<span class="l-f-tbox">(.+?)</span>',a_info.group(2),flags=re.S)
            if m:
                c_subNav = m.group(1).strip()
            url_list.append((a_url,refers,(c_id,c_name,refers,c_subNav)))
        return url_list

Exemplo n.º 10

0

Exibir arquivo

Arquivo: TCChannel.py Projeto: xzhoutxd/tc_ly

class Channel():
    '''Crawler for a TC (ly.com) scenery channel.

    Collects the scenery items listed on a channel (``channel_items``) and
    the district-level channels reachable from a channel page
    (``channel_list``).
    '''
    def __init__(self):
        # Crawler settings
        self.crawler            = TCCrawler()
        self.retrycrawler       = RetryCrawler()
        self.crawling_time      = Common.now() # current crawl time
        self.crawling_time_s    = Common.time_s(self.crawling_time)
        self.crawling_begintime = '' # start time of this crawl
        self.crawling_beginDate = '' # date of this crawl
        self.crawling_beginHour = '' # hour of this crawl

        # Channel information
        self.platform           = '同程-pc' # platform the channel belongs to
        self.channel_id         = '' # channel id
        self.channel_url        = '' # channel url
        self.channel_name       = '' # channel name
        self.channel_type       = '' # channel type

        # Raw data
        self.channel_page       = '' # html of the channel page
        self.channel_pages      = {} # pages requested for this channel, by key

        # channel items
        self.channel_items      = []

        # channel list
        self.channel_list       = []

    @staticmethod
    def _hidden_int(page, span_id):
        '''Return the integer inside <span id="span_id">, or 0 when the span
        is missing, empty or not numeric.'''
        m = re.search(r'<span id="%s">(.*?)</span>' % span_id, page, flags=re.S)
        if not m:
            return 0
        try:
            return int(m.group(1))
        except ValueError:
            # The capture may legitimately be empty; fall back to the default.
            return 0

    @classmethod
    def _hidden_ids(cls, page):
        '''Return the (hdPid, hdCid, hdCyid) hidden-span values of page as ints.'''
        return (cls._hidden_int(page, 'hdPid'),
                cls._hidden_int(page, 'hdCid'),
                cls._hidden_int(page, 'hdCyid'))

    # Channel page initialisation
    def init(self, channel_id, channel_url, channel_type, begin_time):
        '''Store the channel identity and derive the crawl date/hour strings
        from begin_time (seconds since the epoch, rendered in local time).'''
        self.channel_id = channel_id
        self.channel_url = channel_url
        self.channel_type = channel_type
        self.crawling_begintime = begin_time
        begin_local = time.localtime(self.crawling_begintime)
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin_local)
        self.crawling_beginHour = time.strftime("%H", begin_local)

    def config(self):
        '''Fetch the channel page and dispatch on the channel type.'''
        self.channelPage()
        # Compare as text so that both 1 and '1' select the scenery parser;
        # the type is stringified elsewhere in this class.
        if str(self.channel_type) == '1':
            self.spot()
        else:
            Common.log('# not find this channel type...')

    def spot(self):
        '''Parse the channel page and every further result page into
        channel_items; also sets channel_name from the page title.'''
        if not self.channel_page:
            return

        m = re.search(r'<title>(.+?)</title>', self.channel_page, flags=re.S)
        if m:
            self.channel_name = m.group(1)

        pid, cid, cyid = self._hidden_ids(self.channel_page)

        i_p = 1      # running position of the next item
        i_page = 1   # current result page
        m_page = 1   # total number of result pages
        m = re.search(r'<div class="scenery_main" id="sceneryListInfo">(.+?)<div id="pageNum_box" class="s_pager none">', self.channel_page, flags=re.S)
        # Fall back to the whole page when the list container is not found.
        page_main = m.group(1) if m else self.channel_page

        Common.log(i_page)
        i_p = self.get_items(page_main, i_p)

        m = re.search(r'<input type="hidden" id="txt_AllpageNumber" value="(.+?)">', page_main, flags=re.S)
        if m:
            try:
                m_page = int(m.group(1))
            except ValueError:
                # Unparseable page count: only the first page is processed.
                m_page = 1

        page_url = 'http://www.ly.com/scenery/SearchList.aspx?&action=getlist&page=%d&kw=&pid=%d&cid=%d&cyid=%d&theme=0&grade=0&money=0&sort=0&paytype=0&ismem=0&istuan=0&isnow=0&spType=&isyiyuan=0&lbtypes=&IsNJL=0&classify=0'
        while i_page < m_page:
            i_page += 1
            p_url = page_url % (i_page, pid, cid, cyid)
            Common.log(i_page)
            page = self.retrycrawler.getData(p_url, self.channel_url)
            i_p = self.get_items(page, i_p)

    def get_items(self, page_main, i_p):
        '''Append every scenery entry found in page_main to channel_items.

        i_p is the position assigned to the first entry; the position
        following the last entry is returned (i_p itself for an empty page).
        '''
        if not page_main:
            return i_p

        p = re.compile(r'<div class="scenery_list(.+?)">\s*<div class="s_info"><div class="img_con"><a class="a_img".+?href="(.+?)"><img.+?src="(.+?)".*?></a></div><div class="info_con"><dl class="info_top"><dt><a class="fir_name".+?>(.+?)</a>.+?<span class="s_level">(.*?)</span>.+?<dd class="scenery_area"><span>(.+?)</span>.+?</dl></div></div>', flags=re.S)
        for info in p.finditer(page_main):
            all_info = info.group()
            i_info = info.group(1)
            i_url = Config.tc_home + info.group(2)
            i_img = info.group(3)
            i_name = info.group(4)
            i_level = re.sub(r'<.+?>', '', info.group(5))  # drop markup
            i_area = ' '.join(info.group(6).split())       # collapse whitespace

            i_desc = ''
            m = re.search(r'<dd class="scenery_desc"><p>(.+?)</p>', all_info, flags=re.S)
            if m:
                i_desc = m.group(1)

            # An entry whose class attribute contains 'nobook' is stored with
            # i_book = 0; its state text stands in for a missing description.
            i_book = 1
            if i_info.find('nobook') != -1:
                i_book = 0
                if i_desc == '':
                    m = re.search(r'<dd class="scenery_state">(.+?)<a', all_info, flags=re.S)
                    if m:
                        i_desc = m.group(1)

            i_id = 0
            if i_url != '':
                m = re.search(r'BookSceneryTicket_(\d+)\.html', i_url)
                if m:
                    i_id = m.group(1)
                val = (self.channel_id, self.channel_name, self.channel_url, self.channel_type, (i_book, i_id, i_url, i_img, i_name, i_desc, i_level, i_area, i_p, self.crawling_begintime))
                self.channel_items.append(val)
            i_p += 1
        return i_p

    def channelList(self):
        '''Walk every city of the channel page and append its districts to
        channel_list as (district id, name, url, channel type, city id,
        city name, province id, province name).'''
        self.channelPage()
        if not self.channel_page:
            return

        for city in self.moreCity(self.channel_page, 'city'):
            city_url, _listed_name, province_id, city_id, _district_id = city
            if not city_url:
                continue

            # Names are taken from the city page itself and stay empty when
            # that page cannot be fetched or parsed.
            province_name, city_name = '', ''
            city_page = self.crawler.getData(city_url, self.channel_url)
            if city_page:
                m = re.search(r'<div class="search_screen_dl"><dl action="province">.+?<div class="right"><a href=".+?" class="current" tvalue="(\d+)" title="(.+?)">', city_page, flags=re.S)
                if m:
                    province_id, province_name = m.group(1), m.group(2)
                m = re.search(r'<div class="search_screen_dl">.+?<dl action="city">.+?<div class="right"><a href=".+?" class="current" tvalue="(\d+)" title="(.+?)">', city_page, flags=re.S)
                if m:
                    city_id, city_name = m.group(1), m.group(2)

            for channel_url, channel_name, _p_id, _c_id, cy_id in self.moreCity(city_page, 'district'):
                self.channel_list.append((str(cy_id), channel_name, channel_url, str(self.channel_type), city_id, city_name, province_id, province_name))

    def moreCity(self, page, action_key):
        '''List the options of one search filter of page.

        action_key selects the filter: 'city' or 'district'. Returns a list of
        (url, name, province id, city id, district id) with integer ids; the
        list is empty when page is falsy or the filter is not present.
        '''
        city_list = []
        if not page:
            return city_list

        p_id, c_id, cy_id = self._hidden_ids(page)

        p_url = 'http://www.ly.com/scenery/scenerysearchlist_%d_%d__0_0_0_%d_0_0_0.html'
        m = re.search(r'<div class="search_screen_box" id="searchScreenBox">.+?<dl action="%s".+?>.+?<dd>.+?<div class="right">(.+?)</div></dd>' % action_key, page, flags=re.S)
        if not m:
            return city_list

        p = re.compile(r'<a.+?tvalue="(\d+)" title="(.+?)">', flags=re.S)
        for city in p.finditer(m.group(1)):
            option_id, option_name = int(city.group(1)), city.group(2)
            # The option replaces the id slot that matches the filter level.
            if action_key == 'city':
                c_id = option_id
            elif action_key == 'district':
                cy_id = option_id
            city_list.append((p_url % (p_id, c_id, cy_id), option_name, p_id, c_id, cy_id))
        return city_list

    def channelPage(self):
        '''Fetch the channel page and cache it on the instance.

        Does nothing when channel_url is empty. Raises
        Common.InvalidPageException when the crawler returns no content
        (None or an empty string).
        '''
        if not self.channel_url:
            return

        data = self.crawler.getData(self.channel_url, Config.tc_home)
        # The original test (`not data and data == ''`) only caught '', so a
        # None result was silently ignored; treat every empty result alike.
        if not data:
            raise Common.InvalidPageException("# channelPage:not find channel page,channel_id:%s,channel_url:%s"%(str(self.channel_id), self.channel_url))

        self.channel_page = data
        self.channel_pages['channel-home'] = (self.channel_url, data)


    def antPage(self, val):
        '''Process one channel task: val is (channel_id, channel_url,
        channel_type, begin_time).'''
        channel_id, channel_url, channel_type, begin_time = val
        self.init(channel_id, channel_url, channel_type, begin_time)
        self.config()

    def antChannelList(self, val):
        '''Process one channel-list task: val is (channel_url, channel_type).'''
        self.channel_url, self.channel_type = val
        self.channelList()