Python QueueFactory示例，SpiderInterface.queue.QueueFactory.QueueFactory Python示例

示例#1

0

显示文件

文件： zhengwen_consumer.py 项目： xiangjunchen/NewsCollector

    def __init__(self):
        """Create the factories, then the logger, browser, proxy, queues and DB handle.

        Every connection parameter is read from ``config`` / ``private_config``;
        the objects are created in the same order as before.
        """
        # Factory objects.
        self.field_factory = FieldFactory(u'网易新闻')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # The DB handle is also produced by a QueueFactory instance.
        self.db_factory = QueueFactory()

        # Concrete objects.
        self.log = Logging('./Log/log_zhengwen').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(
            config.proxy_type, config.proxy_area,
            config.proxy_host, config.proxy_port)

        # Both queues use the same queue type, host and port; only the
        # queue name differs.
        new_queue = self.queue_factory.create
        self.queue_news = new_queue(
            config.queue_type, private_config.queue_news,
            config.queue_host, config.queue_port)
        self.queue_pinglun = new_queue(
            config.queue_type, private_config.queue_pinglun,
            config.queue_host, config.queue_port)

        self.db = self.db_factory.create(
            config.db_type, config.db_table_news_zhengwen,
            config.db_host, config.db_port)

示例#2

0

显示文件

文件： get_comment.py 项目： Nikita-Ting/tuan_shi_wei

    def __init__(self):
        """Create the factories, then the logger, browser, comment queue and DB handle.

        Connection parameters come from ``config`` / ``private_config``.
        """
        # Factory objects.
        self.queue_redis = QueueFactory()
        self.field_factory = FieldFactory(u'新浪四川')
        self.browser_factory = BrowserFactory()
        # The DB handle is also produced by a QueueFactory instance.
        self.db_factory = QueueFactory()

        # Concrete objects.
        self.log = Logging('./Log/log_pl').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.queue_comment = self.queue_redis.create(
            config.queue_type,
            private_config.queue_comment,
            config.queue_host,
            config.queue_port)
        self.db = self.db_factory.create(
            config.db_type,
            config.db_table_news_pinglun,
            config.db_host,
            config.db_port)

示例#3

0

显示文件

文件： Zhengwen_Pinglun_Consumer.py 项目： Nikita-Ting/tuan_shi_wei

    def __init__(self):
        """Create the factories, then the logger, browser, proxy, comment queue and DB handle.

        Connection parameters come from ``config`` / ``private_config``.
        """
        # Factory objects.
        self.queue = QueueFactory()
        self.field_factory = FieldFactory(u'人民网')
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # The DB handle is also produced by a QueueFactory instance.
        self.db_factory = QueueFactory()

        # Concrete objects.
        self.log = Logging('./Log/ZW_Pinglun').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(
            config.proxy_type,
            config.proxy_area,
            config.proxy_host,
            config.proxy_port)
        self.pinglun_queue = self.queue.create(
            config.queue_type,
            private_config.queue_table_pinglun,
            config.queue_host,
            config.queue_port)
        self.db = self.db_factory.create(
            config.db_type,
            config.db_table_news_pinglun,
            config.db_host,
            config.db_port)

示例#4

0

显示文件

文件： zhengwen_zw_consumer.py 项目： Nikita-Ting/tuan_shi_wei

    def __init__(self):
        """Create the factories, then the logger, browser, proxy, two queues and DB handle.

        Connection parameters come from ``config`` / ``private_config``.
        """
        # Factory objects.
        self.field_factory = FieldFactory(u'网易新闻')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # The DB handle is also produced by a QueueFactory instance.
        self.db_factory = QueueFactory()

        # Concrete objects.
        self.log = Logging('./Log/log_zhengwen').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(
            config.proxy_type, config.proxy_area,
            config.proxy_host, config.proxy_port)

        # Both queues use the same queue type, host and port; only the
        # queue name differs.
        new_queue = self.queue_factory.create
        self.queue_zhengwu = new_queue(
            config.queue_type, private_config.queue_zhengwu_zhenwgen,
            config.queue_host, config.queue_port)
        self.queue_pinglun = new_queue(
            config.queue_type, private_config.queue_pinglun,
            config.queue_host, config.queue_port)

        self.db = self.db_factory.create(
            config.db_type, config.db_table_news_zhengwen,
            config.db_host, config.db_port)

示例#5

0

显示文件

文件： zhi_wei_href_producer.py 项目： Nikita-Ting/tuan_shi_wei

class Producer(object):
    """Walk the zhiweidata category listing pages and queue the detail-page URLs.

    For every keyword in ``key_words`` the listing endpoint is requested page by
    page; each ``id`` found in a page's ``resultList`` is turned into a
    ``Baike?id=...`` URL and pushed onto ``self.queue``.
    """

    def __init__(self):
        # Factory objects.
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # Keywords sent as the ``first`` query parameter of the listing URL.
        self.key_words=[u'社会',u'互联网',u'政务',u'国际',u'体育',u'财经',u'谣言',u'企业']

        # Concrete objects.
        self.log = Logging('./Log/zhi_wei_producer').get_logging()
        self.queue = self.queue_factory.create(config.queue_type, private_config.queue_table,
                                          config.queue_host, config.queue_port)
        self.browser_2 = self.browser_factory.create(config.browser_type)

    def main(self,totalPage):
        """Request the listing pages of every keyword and queue what they contain.

        :param totalPage: exclusive upper bound of the page numbers; pages
            ``1 .. totalPage - 1`` are requested.
            NOTE(review): the name suggests page ``totalPage`` itself may have
            been intended as well -- confirm before changing the range.
        """
        cookie='nocookie'
        for words in self.key_words:
            # The quoted keyword is the same for every page of this keyword.
            quoted_words=urllib.quote(words.encode('utf-8'))
            for page in range(1,totalPage):
                url='http://ef.zhiweidata.com/CatItem?first='+quoted_words+'&second=%E5%85%A8%E9%83%A8&page='+str(page)+'&word=null'
                print(url)
                html=self.Get_page(url,cookie_j=cookie)
                if not html:
                    # Nothing usable came back for this page; try the next one.
                    continue
                try:
                    self.Parse_data(html)
                except Exception as e:
                    self.log.info('parse data wrong!%s'%e)
        self.log.info('put the url into queue succesfully! ')

    def Parse_data(self,data):
        """Queue one ``Baike?id=...`` URL per entry of ``data['resultList']``.

        :param data: decoded JSON of a listing page (a dict).

        Entries without an ``id`` are skipped so that one incomplete entry
        does not discard the remaining entries of the page.
        """
        resultList=data.get('resultList')
        for result in resultList:
            id_s=result.get('id')
            if id_s is None:
                continue
            # %s formatting also accepts a numeric id.
            page_href='http://ef.zhiweidata.com/Baike?id=%s'%id_s
            self.queue.put(page_href)

    def Get_page(self,url,cookie_j):
        """Fetch ``url`` and return the decoded JSON, or ``None`` if it is not JSON.

        :param url: listing URL to request.
        :param cookie_j: forwarded to :meth:`Get_header`.
        :return: the decoded JSON value, or ``None`` when the response body
            cannot be decoded (the failure is logged).
        """
        headers=self.Get_header(refer=url,cookie_j=cookie_j)
        html=self.browser_2.visit(url=url,headers=headers,timeout=60,retry=5)
        try:
            return json.loads(html)
        except (TypeError, ValueError) as e:
            # TypeError: the browser returned a non-string (e.g. None);
            # ValueError: the body is not valid JSON.
            self.log.info('get page wrong!%s %s'%(url,e))
            return None

    def Get_header(self,refer,cookie_j):
        """Return the fixed request headers.

        ``refer`` and ``cookie_j`` are accepted for interface compatibility
        but are not used when building the headers.
        """
        headers={ 'Host': 'ef.zhiweidata.com',
                 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
                 'Accept':'*/*',
                 'Accept-Language':'zh-Hans;q=1',
                 'Accept-Encoding':"gzip, deflate",
                 'Connection':"keep-alive"}
        return headers

示例#6

0

显示文件

文件： zhi_wei_href_producer.py 项目： Nikita-Ting/tuan_shi_wei

    def __init__(self):
        """Create the factories, the keyword list, the logger, the queue and the browser.

        Connection parameters come from ``config`` / ``private_config``.
        """
        # Factory objects.
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # Fixed list of keywords kept on the instance.
        self.key_words = [
            u'社会', u'互联网', u'政务', u'国际',
            u'体育', u'财经', u'谣言', u'企业',
        ]

        # Concrete objects.
        self.log = Logging('./Log/zhi_wei_producer').get_logging()
        self.queue = self.queue_factory.create(
            config.queue_type,
            private_config.queue_table,
            config.queue_host,
            config.queue_port)
        self.browser_2 = self.browser_factory.create(config.browser_type)

示例#7

0

显示文件

文件： Zhengwen_Producer.py 项目： Nikita-Ting/tuan_shi_wei

    def __init__(self):
        """Create the factories, then the logger, browser, proxy and URL queue.

        ``self.queue`` first holds the queue factory and is then rebound to
        the queue that factory creates.
        """
        # Factory objects.
        queue_factory = QueueFactory()
        self.queue = queue_factory
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()

        # Concrete objects.
        self.log = Logging('./Log/ZW_producer').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(
            config.proxy_type,
            config.proxy_area,
            config.proxy_host,
            config.proxy_port)
        # From here on ``self.queue`` is the queue itself, not the factory.
        self.queue = queue_factory.create(
            config.queue_type,
            private_config.queue_table,
            config.queue_host,
            config.queue_port)

示例#8

0

显示文件

文件： zhi_wei_href_consumer.py 项目： Nikita-Ting/tuan_shi_wei

    def __init__(self):
        """Create the factories, then the logger, queue, DB handle, browser and proxy.

        Connection parameters come from ``config`` / ``private_config``.
        """
        # Factory objects.
        self.field_factory = FieldFactory(u'zhi_wei_shi_jian')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()

        # Concrete objects.
        self.log = Logging('./Log/zhiwei_message').get_logging()
        # The queue and the DB handle are both created by the queue factory,
        # with different type/table/host/port settings.
        create_store = self.queue_factory.create
        self.queue = create_store(
            config.queue_type, private_config.queue_table,
            config.queue_host, config.queue_port)
        self.db = create_store(
            config.db_type, config.db_table_zhi_wei,
            config.db_host, config.db_port)
        self.browser_2 = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(
            config.proxy_type, config.proxy_area,
            config.proxy_host, config.proxy_port)

示例#9

0

显示文件

文件： zhengwen_zw_consumer.py 项目： Nikita-Ting/tuan_shi_wei

class Zhengwen_consumer():
    """Pop article URLs from a queue, scrape each article and store the result.

    For every article the comment count is looked up as well; when it is
    non-zero a ``<comment_url>@@@@@@<article_url>`` entry is pushed onto
    ``self.queue_pinglun``.
    """

    def __init__(self):
        # Factory objects.
        self.field_factory = FieldFactory(u'网易新闻')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        self.db_factory = QueueFactory()

        # Concrete objects.
        self.log = Logging('./Log/log_zhengwen').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area,
                                               config.proxy_host, config.proxy_port)
        self.queue_zhengwu = self.queue_factory.create(config.queue_type, private_config.queue_zhengwu_zhenwgen,
                                             config.queue_host, config.queue_port)
        self.queue_pinglun = self.queue_factory.create(config.queue_type, private_config.queue_pinglun,
                                             config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_zhengwen,
                                         config.db_host, config.db_port)

    def main(self):
        """Process queued article URLs until the queue returns a falsy value."""
        while(True):
            url = self.queue_zhengwu.get()
            if not url:
                self.log.info('new queue is empty!')
                break
            self.ParsePage(url)
            # Random pause of 10-20 seconds between two articles.
            time.sleep(random.randint(10,20))

    def ParsePage(self,url):
        """Fetch ``url``, extract the article fields and store them in ``self.db``.

        Any exception is logged, the URL is printed and the method sleeps for
        10 seconds; nothing is raised to the caller.
        """
        try:
            html = self.browser.visit(url,encoding='gbk')
            if not html:
                self.log.info('Parsing page wrong!')
                return
            field =self.field_factory.create('si_chuan_news')
            tree=etree.HTML(html)
            # Column / breadcrumb.
            lanmu = tree.xpath('.//span[@class="ep-crumb JS_NTES_LOG_FE"]/a/text()')
            # Title.
            biaoti = self.textxpath(tree,'.//head/title/text()')
            # Keywords.
            guanjianci = self.textxpath(tree,'.//head/meta[@name="keywords"]/@content')
            # Publish time; stays 0 when no date can be found in the text.
            shijian = self.textxpath(tree,'.//div[@class="ep-time-soure cDGray"]/text()')
            timestamp = self._publish_timestamp(shijian)
            # Article source.
            laiyuan = self.textxpath(tree,'.//div[@class="ep-time-soure cDGray"]/a/text()')
            # Body text.
            wen_zhang_zheng_wen = xpathutil.get_Node_text(tree,'.//div[@id="endText"]/p')
            # Image links.
            tu_pian_lian_jie = tree.xpath('.//div[@id="endText"]/p/img/@src')
            # Number of comments.
            ping_lun_shu_liang = self.GetPinglun(url)

            field.set('wen_zhang_wang_zhi',url)
            field.set('wen_zhang_lan_mu',' '.join(lanmu))
            field.set('wen_zhang_biao_ti',biaoti)
            field.set('guan_jian_ci',guanjianci)
            field.set('fa_bu_shi_jian',timestamp)  # stored as a timestamp
            field.set('wen_zhang_lai_yuan',laiyuan)
            field.set('wen_zhang_zheng_wen',wen_zhang_zheng_wen)
            field.set('tu_pian_lian_jie',tu_pian_lian_jie)
            field.set('ping_lun_shu_liang',ping_lun_shu_liang)
            field.set('id',url)
            data =field.make()
            if data:
                self.db.put(data)
                self.log.info('save data sucess!')
        except Exception as e:
            self.log.info(e)
            print(url)  # show the page that failed to parse
            time.sleep(10)

    def _publish_timestamp(self, text):
        """Return the timestamp of the first ``Y-m-d H:M:S`` date in ``text``, else 0.

        A text without a recognisable date yields 0 instead of raising, so an
        unexpected time format no longer discards the whole article.
        """
        if not text:
            return 0
        found = re.search(r'\d{4}-\d{1,2}-\d{1,2} \d{2}:\d{2}:\d{2}', text)
        if found is None:
            return 0
        return self.datatransform(found.group(0))

    def GetPinglun(self,url):
        """Return the comment count of the article at ``url`` (0 on any failure).

        Example article URL: http://news.163.com/16/0808/14/BTV3ABLP00014AEE.html
        When the count is non-zero, the comment page URL joined with the
        article URL by ``@@@@@@`` is pushed onto ``self.queue_pinglun``.
        """
        pinglunshu=0
        try:
            # The thread id is the file name of the article without ".html".
            uid = url.split('.html')[0].split('/')[-1]
            comment_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s'%uid
            html=self.browser.visit(comment_url)
            data =json.loads(html)
            if data["tcount"]!=0:
                pinglunshu =  data["tcount"]
                uid =data["docId"]
                comment_url = 'http://comment.news.163.com/news_gov_bbs/%s.html'%uid
                self.queue_pinglun.put(comment_url+'@@@@@@'+url)
        except Exception as e:
            self.log.info('get pinglunshu wrong!%s'%e)
        return pinglunshu

    def textxpath(self, tree, path, pos=0):
        """Return the ``pos``-th non-blank, stripped result of ``tree.xpath(path)``.

        :return: the stripped string, or ``None`` when there is no result, no
            non-blank result at ``pos``, or a result cannot be stripped.
        """
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            # A real list is built so that indexing works regardless of
            # whether map()/filter() return lists or iterators.
            cleaned = [text.strip() for text in texts if text.strip()]
            return cleaned[pos]
        except (IndexError, AttributeError, TypeError):
            return None

    def datatransform(self, data):
        """Convert a ``YYYY-MM-DD HH:MM:SS`` string (e.g. 2016-08-08 10:01:56)
        to a timestamp via ``time.mktime``.

        :raises ValueError: if ``data`` does not match the format.
        """
        timeStamp=time.mktime(time.strptime(data,'%Y-%m-%d %H:%M:%S'))
        return timeStamp

示例#10

0

显示文件

文件： Zhengwen_Pinglun_Consumer.py 项目： Nikita-Ting/tuan_shi_wei

class Pinglun_Consumer():
    """Consumer that takes comment-page links from a queue and stores their comments.

    Queue entries have the form ``<comment_link>@@@@@@<news_link>``.
    """

    def __init__(self):
        # Factory objects.
        self.queue= QueueFactory()
        self.field_factory = FieldFactory(u'人民网')
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        self.db_factory = QueueFactory()

        # Concrete objects.
        self.log = Logging('./Log/ZW_Pinglun').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area,
                                               config.proxy_host, config.proxy_port)
        self.pinglun_queue = self.queue.create(config.queue_type, private_config.queue_table_pinglun,
                                          config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_pinglun,
                                          config.db_host, config.db_port)

    def main(self):
        """Handle one queue entry, or sleep when the queue is empty."""
        href = self.pinglun_queue.get()
        if href:
            parts = href.split('@@@@@@')
            self.news_link = parts[-1]
            pinglun_link = parts[0]
            self.ParseData(pinglun_link)
        else:
            # Queue is empty. The logged duration is derived from the actual
            # sleep time (the old message said 1h while the code slept 2h).
            sleep_hours = 2
            self.log.info(u'评论链接队列为空，休息%dh！' % sleep_hours)
            time.sleep(60*60*sleep_hours)

    def ParseData(self,pinglun_link):
        """Download every comment page of ``pinglun_link`` and store each comment.

        :param pinglun_link: comment-page URL; it must contain ``?id=``.

        All failures are logged; nothing is raised to the caller.
        """
        try:
            topic_source_id = pinglun_link.split('?id=')[1]
        except IndexError:
            # Without the id the API URL cannot be built.
            self.log.info('%s page wrong!%s'%(pinglun_link,'no ?id= in link'))
            return
        pinglun_link= urllib.quote(pinglun_link)
        link_p= 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cyrhbddTW&topic_url=%s&page_size=30&topic_source_id=%s'%(pinglun_link,topic_source_id)
        try:
            html =self.browser.visit(link_p)
            topic = json.loads(html)
            topic_id =topic['topic_id']
            total_page_no =topic['total_page_no']
            for page in xrange(1,total_page_no+1):
                link_p='http://changyan.sohu.com/api/2/topic/comments?&client_id=cyrhbddTW&page_size=30&topic_id=%s&page_no=%s'%(topic_id,str(page))
                html =self.browser.visit(link_p)
                comments = json.loads(html)['comments']
                for comment in comments:
                    field = self.field_factory.create('ping_lun')
                    field.set('news_url',self.news_link)
                    field.set('ping_lun_nei_rong',comment['content'])
                    field.set('ping_lun_shi_jian',comment['create_time'])
                    field.set('hui_fu_shu',comment['reply_count'])
                    field.set('dian_zan_shu',comment['support_count'])
                    field.set('ping_lun_id',comment['comment_id'])
                    field.set('yong_hu_ming',comment['passport']['nickname'])
                    field.set('yong_hu_deng_ji',comment['userScore']['level'])
                    field.set('yong_hu_sheng_fen',comment['ip_location'])
                    # NOTE(review): relies on the field object exposing the
                    # value set above as an attribute -- confirm.
                    field.set('id', field.ping_lun_id)
                    record = field.make()
                    if record:
                        self.db.put(record)
                        self.log.info('save Pinglun success!')
        except Exception as e:
            self.log.info('%s page wrong!%s'%(pinglun_link,e))

    def textxpath(self, tree, path, pos=0):
        """Return the ``pos``-th non-blank, stripped result of ``tree.xpath(path)``.

        :return: the stripped string, or ``None`` when there is no result, no
            non-blank result at ``pos``, or a result cannot be stripped.
        """
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            # A real list is built so that indexing works regardless of
            # whether map()/filter() return lists or iterators.
            cleaned = [text.strip() for text in texts if text.strip()]
            return cleaned[pos]
        except (IndexError, AttributeError, TypeError):
            return None

    def datatransform(self, data):
        """Convert a ``YYYY-MM-DD HH:MM:SS`` string (e.g. 2016-08-08 10:01:56)
        to a timestamp via ``time.mktime``.

        :raises ValueError: if ``data`` does not match the format.
        """
        timeStamp=time.mktime(time.strptime(data,'%Y-%m-%d %H:%M:%S'))
        return timeStamp

示例#11

0

显示文件

文件： Zhengwen_Producer.py 项目： Nikita-Ting/tuan_shi_wei

class Zhengwen_Producer():
    """Producer that walks column listing pages and queues the article links."""

    def __init__(self):
        # Factory objects. ``self.queue`` holds the factory only until the
        # real queue is created at the end of this method.
        self.queue = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()

        # Concrete objects.
        self.log = Logging('./Log/ZW_producer').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area,
                                               config.proxy_host, config.proxy_port)
        self.queue = self.queue.create(config.queue_type, private_config.queue_table,
                                          config.queue_host, config.queue_port)

    def main(self,totalpage):
        '''Queue the article links of every column listed in ./lanmu_hrefs.txt.

        Original note: the mobile site sections (politics, world, technology,
        military, society, opinion, finance, education) are crawled, and each
        section only offers 20 pages of news.

        :param totalpage: forwarded to :meth:`GetPage` for every column link.
        '''
        # The file is closed deterministically and each line is stripped, so
        # the trailing newline never ends up inside a URL.
        with open('./lanmu_hrefs.txt','r') as href_file:
            hrefs=[line.strip() for line in href_file]
        for href in hrefs:
            self.GetPage(href,totalpage)
        self.log.info(u'文章链接灌入完毕！')

    def GetPage(self,href_1,totalpage):
        """Visit the listing pages of one column and queue every article link.

        :param href_1: column URL; the page number is inserted before
            ``.html``. An empty value is only logged.
        :param totalpage: exclusive upper bound of the page numbers; pages
            ``1 .. totalpage - 1`` are visited.
            NOTE(review): whether page ``totalpage`` itself should be
            included cannot be told from this file -- confirm.
        """
        if not href_1:
            self.log.info(u'栏目连接为空！')
            return
        http_h = 'http://m.people.cn'
        base = href_1.split('.html')[0]
        for page in xrange(1,totalpage):  # pagination
            href = base+str(page)+'.html'
            try:
                html = self.browser.visit(href)
                tree = etree.HTML(html)
                xpath_next = tree.xpath('.//ul[@class="news_list news_list_c"]/li')
                if not xpath_next:
                    self.log.info(u'栏目链接解析失败！')
                    continue
                for li in xpath_next:
                    news_link = self.textxpath(li, './/@href')
                    if not news_link:
                        # An item without a link must not abort the rest
                        # of the page.
                        continue
                    if http_h not in news_link:
                        # Link without the site prefix: prepend it.
                        news_link=http_h+news_link
                    self.queue.put(news_link)
            except Exception as e:
                self.log.info('%s page wrong!%s'%(href,e))

    def textxpath(self, tree, path, pos=0):
        """Return the ``pos``-th non-blank, stripped result of ``tree.xpath(path)``.

        :return: the stripped string, or ``None`` when there is no result, no
            non-blank result at ``pos``, or a result cannot be stripped.
        """
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            # A real list is built so that indexing works regardless of
            # whether map()/filter() return lists or iterators.
            cleaned = [text.strip() for text in texts if text.strip()]
            return cleaned[pos]
        except (IndexError, AttributeError, TypeError):
            return None

示例#12

0

显示文件

文件： zhengwen_pinglun_consumer.py 项目： xiangjunchen/NewsCollector

class Zhengwen_consumer():
    """Consumer that pages through the comment API of queued threads and stores each comment.

    Queue entries have the form ``<comment_url>@@@@@@<article_url>``.
    """

    def __init__(self):
        # Factory objects.
        self.field_factory = FieldFactory(u'网易新闻')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        self.db_factory = QueueFactory()

        # Concrete objects.
        self.log = Logging('./Log/log_pinglun').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type,
                                               config.proxy_area,
                                               config.proxy_host,
                                               config.proxy_port)
        self.queue_pinglun = self.queue_factory.create(
            config.queue_type, private_config.queue_pinglun, config.queue_host,
            config.queue_port)
        # NOTE(review): comments are written to ``db_table_news_zhengwen``
        # (the article table name) -- confirm this is not a copy-paste slip.
        self.db = self.db_factory.create(config.db_type,
                                         config.db_table_news_zhengwen,
                                         config.db_host, config.db_port)

    def main(self):
        """Process queue entries until the queue returns a falsy value."""
        while (True):
            url = self.queue_pinglun.get()
            if not url:
                self.log.info('new queue is empty!')
                break
            # Example comment URL:
            # http://comment.news.163.com/news_gov_bbs/BTHNT83600234IG8.html
            try:
                # The unpacking is inside the try so that a malformed entry
                # is logged instead of stopping the consumer.
                comment_url, self.docurl = url.split('@@@@@@')
                if comment_url:
                    self.GetPage(comment_url)
            except Exception as e:
                self.log.info(e)
            time.sleep(random.randint(10, 20))

    def GetPage(self, comment_url):
        """Request the comment list 30 entries at a time until a page reports no entries.

        :param comment_url: comment page URL; the thread id is its file name
            without ``.html``.
        """
        uid = comment_url.split('.html')[0].split('/')[-1]
        limit = 30
        offset = 0
        while (True):
            js_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867' \
                     'c3d71db5856/threads/%s/comments/newList?offset=%s&limit=%s'%(uid,str(offset),str(limit))
            html = self.browser.visit(js_url)
            datas = json.loads(html)
            if datas["newListSize"] == 0:
                break
            self.ParseData(datas)
            offset += limit
            time.sleep(random.randint(5, 10))

    def ParseData(self, datas):
        """Store one record per entry of ``datas['commentIds']``.

        A comment whose data is incomplete or whose time cannot be parsed is
        logged and skipped, so it no longer discards the remaining comments.
        """
        commentIds = datas['commentIds']
        comments = datas['comments']
        for ids in commentIds:
            try:
                data = self._build_record(ids.split(','), comments)
            except (KeyError, TypeError, ValueError) as e:
                self.log.info('parse comment wrong!%s' % e)
                continue
            if data:
                self.db.put(data)
                self.log.info('save data sucess!')

    def _build_record(self, idList, comments):
        """Build the storable record for the comment chain ``idList``."""
        # The last id is the id of the current post.
        id_n = idList[-1]
        current = comments[id_n]
        field = self.field_factory.create('ping_lun')
        # Link of the commented article.
        field.set('news_url', self.docurl)
        # Comment time as a timestamp.
        field.set('ping_lun_shi_jian', self.datatransform(current['createTime']))
        # Reply count (always 0 here).
        field.set('hui_fu_shu', 0)
        # Number of votes.
        field.set('dian_zan_shu', current['vote'])
        # Comment id.
        field.set('ping_lun_id', id_n)
        # User nickname.
        field.set('yong_hu_ming', current['user']['nickname'])
        # User location.
        field.set('yong_hu_sheng_fen', current['user']['location'])
        # Comment text: the contents of every id of the chain that is
        # present in ``comments``, concatenated in chain order.
        content_all = u''.join(
            comments[comment_id]['content']
            for comment_id in idList if comment_id in comments)
        field.set('ping_lun_nei_rong', content_all)
        field.set('id', id_n)
        return field.make()

    def datatransform(self, data):
        """Convert a ``YYYY-MM-DD HH:MM:SS`` string (e.g. 2016-08-08 10:01:56)
        to a timestamp via ``time.mktime``.

        :raises ValueError: if ``data`` does not match the format.
        """
        timeStamp = time.mktime(time.strptime(data, '%Y-%m-%d %H:%M:%S'))
        return timeStamp

示例#13

0

显示文件

文件： get_comment.py 项目： Nikita-Ting/tuan_shi_wei

class GetComment(object):
    """Consumer that downloads the comments of queued comment pages and stores them.

    Queue entries have the form ``<comment_page_url>@@@@@@<article_url>``.
    """

    # (marker, seconds per unit) pairs for relative times, checked in order.
    _RELATIVE_UNITS = (
        (u'月', 30*24*60*60),
        (u'天', 24*60*60),
        (u'小时', 60*60),
        (u'分钟', 60),
    )

    def __init__(self):
        # Factory objects.
        self.queue_redis= QueueFactory()
        self.field_factory = FieldFactory(u'新浪四川')
        self.browser_factory = BrowserFactory()
        self.db_factory = QueueFactory()

        # Concrete objects.
        self.log = Logging('./Log/log_pl').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.queue_comment = self.queue_redis.create(config.queue_type, private_config.queue_comment,
                                                            config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_pinglun,
                                          config.db_host, config.db_port)

    def main(self):
        """Run forever: handle one queue entry per iteration, sleep 1h when the queue is empty."""
        while(True):
            link = self.queue_comment.get()
            if link:
                parts = link.split('@@@@@@')
                pl_href = parts[0]
                self.zw_href = parts[-1]
                try:
                    html=self.browser.visit(pl_href,encoding='utf-8')
                    if html:
                        self.getContent(html)
                except Exception as e:
                    self.log.info(e)
            else:
                self.log.info('queue is empty!')
                time.sleep(60*60)

    def getContent(self,html):
        """Read the news id from the comment page and fetch its comment pages one by one.

        Example of the JSON endpoint family:
        http://cmnt.sina.cn/aj/cmnt/list?&index=fxsvenx3222366&page=1

        Paging stops at the first page without data or at the first error.
        """
        tree =etree.HTML(html)
        newsid = self.textxpath(tree,'.//div[@class="cmnt_list"]/div[@class="cmnt_item"]/@data-newsid')
        if not newsid:
            # Without the id no API URL can be built.
            self.log.info('get newsid fail!')
            return
        index = newsid.replace('comos-','')
        page_num=1
        while(True):
            try:
                json_link = 'http://cmnt.sina.cn/aj/v2/index?product=comos&group=0&index=%s&page=%s'%(index,str(page_num))
                page_content = self.browser.visit(json_link)
                datas=json.loads(page_content)
                page_items = datas['data']['data']
                if not page_items:
                    break
                self.save_comment(page_items)
                page_num+=1
            except Exception as e:
                self.log.info(e)
                break

    def save_comment(self,datas):
        """Store one record per comment item of ``datas``.

        :param datas: iterable of comment items; each item carries its
            values under the ``'main'`` key.
        """
        for item in datas:
            main_part = item['main']
            field = self.field_factory.create('ping_lun')
            # Comment text.
            ping_lun_nei_rong = main_part['content']
            # Comment time, converted to a timestamp.
            timestamp = self.datatransform(main_part['time'])
            # Number of likes.
            dian_zan_shu = main_part['agree']
            # Comment id.
            ping_lun_id = main_part['mid']
            # User nickname.
            yong_hu_ming = main_part['nick']
            # User province (read from the 'source' key).
            yong_hu_sheng_fen = main_part['source']

            field.set('news_url',self.zw_href)
            field.set('ping_lun_nei_rong',ping_lun_nei_rong)
            field.set('ping_lun_shi_jian',int(timestamp))
            field.set('dian_zan_shu',dian_zan_shu)
            field.set('ping_lun_id',ping_lun_id)
            field.set('yong_hu_ming',yong_hu_ming)
            field.set('yong_hu_sheng_fen',yong_hu_sheng_fen)
            field.set('id',ping_lun_id)
            data = field.make()
            if data:
                self.db.put(data)
                self.log.info('save pinglun success!')

    def textxpath(self, tree, path, pos=0):
        """Return the ``pos``-th non-blank, stripped result of ``tree.xpath(path)``.

        :return: the stripped string, or ``None`` when there is no result, no
            non-blank result at ``pos``, or a result cannot be stripped.
        """
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            # A real list is built so that indexing works regardless of
            # whether map()/filter() return lists or iterators.
            cleaned = [text.strip() for text in texts if text.strip()]
            return cleaned[pos]
        except (IndexError, AttributeError, TypeError):
            return None

    def datatransform(self, data):
        """Convert a relative or absolute time string to a timestamp.

        Strings containing a month/day/hour/minute marker are treated as
        "<number> <unit> ago" relative to now (a month counts as 30 days);
        anything else must match ``%Y-%m-%d %H:%M``.
        NOTE(review): a calendar date written with the month marker would
        also take the relative branch -- confirm the site never sends one.

        :raises ValueError: if an absolute string does not match the format.
        :raises IndexError: if a relative string contains no number.
        """
        for marker, unit_seconds in self._RELATIVE_UNITS:
            if marker in data:
                amount = int(re.findall(r'\d+',data)[0])
                dt = datetime.datetime.now() - datetime.timedelta(seconds=amount*unit_seconds)
                break
        else:
            dt = datetime.datetime.strptime(data, '%Y-%m-%d %H:%M')
        return time.mktime(dt.timetuple())

示例#14

0

显示文件

文件： Zhengwen_Consumer.py 项目： Nikita-Ting/tuan_shi_wei

class Zhengwen_Consumer():
    """Consumer that scrapes the article pages whose URLs a producer queued.

    Article URLs are read from ``news_queue``; each parsed article is stored
    through ``self.db`` and, when it has comments, its comment-page link is
    pushed onto ``pinglun_queue``.
    """

    def __init__(self):
        # Factory objects.
        self.queue = QueueFactory()
        self.field_factory = FieldFactory(u'人民网')
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        self.db_factory = QueueFactory()
        self.producer = Zhengwen_Producer()

        # Concrete objects built by the factories.
        self.log = Logging('./Log/ZW_consumer').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area,
                                               config.proxy_host, config.proxy_port)
        self.news_queue = self.queue.create(config.queue_type, private_config.queue_table,
                                            config.queue_host, config.queue_port)
        self.pinglun_queue = self.queue.create(config.queue_type, private_config.queue_table_pinglun,
                                               config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_zhengwen,
                                         config.db_host, config.db_port)

    def main(self):
        """Process one queued article URL, or idle when the queue is empty.

        On an empty queue this sleeps for four hours and then calls
        ``self.producer.main(10)``.
        """
        news_link = self.news_queue.get()
        if not news_link:
            # Queue is empty.
            self.log.info(u'文章链接队列为空，休息4h！')
            time.sleep(60*60*4)
            self.producer.main(10)
            return
        try:
            self.ParseData(news_link)
        except Exception as e:
            # Best effort: one broken page must not stop the consumer.
            self.log.info('%s page wrong!%s'%(news_link,e))

    def ParseData(self,news_link):
        """Scrape one article page, store it and queue its comment page.

        Any exception (for example a missing publish date, which makes
        ``datatransform`` raise) propagates to the caller.
        """
        html = self.browser.visit(news_link)
        tree = etree.HTML(html)

        field = self.field_factory.create('si_chuan_news')
        guan_jian_ci = self.textxpath(tree,'.//meta[@name="keywords"]/@content')
        if guan_jian_ci is not None:
            field.set('guan_jian_ci',guan_jian_ci.split())
        lai_yuan = self.textxpath(tree,'.//meta[@name="source"]/@content')
        shi_jian = self.textxpath(tree,'.//meta[@name="publishdate"]/@content')
        time_stamp = self.datatransform(shi_jian)
        field.set('fa_bu_shi_jian', int(time_stamp))
        field.set('wen_zhang_lai_yuan', lai_yuan)
        field.set('wen_zhang_lan_mu', self.textxpath(tree, './/header/em//a/text()'))
        tu_pian = tree.xpath('.//div[@class="wb_content"]/div[@id="p_content1"]//img/@src')
        field.set('tu_pian_lian_jie',tu_pian)
        ping_lun_shu = self.getPingLun(news_link)
        field.set('ping_lun_shu_liang',ping_lun_shu)
        # Article body text.
        text_all = xpathutil.get_all_text(tree, './/*[@id="p_content1"]')
        field.set('wen_zhang_zheng_wen',text_all)
        field.set('wen_zhang_biao_ti', self.textxpath(tree, './/div[@class="wb_content"]/h1/text()'))
        field.set('wen_zhang_wang_zhi', news_link)

        field.set('id', field.wen_zhang_wang_zhi)
        data = field.make()
        if data:
            self.db.put(data)
            self.log.info('save news success!')

        # Queue the comment page for the comment consumer.
        if ping_lun_shu != 0:
            pinglun_href = self.textxpath(tree,'.//p[@class="all_pinglun"]/a/@href')
            if pinglun_href:
                self.pinglun_queue.put(pinglun_href+'@@@@@@'+news_link)
            else:
                # textxpath returns None when the link is absent; concatenating
                # it would raise after the article has already been stored.
                self.log.info('%s has comments but no comment link!' % news_link)

    def textxpath(self, tree, path, pos=0):
        """Return the ``pos``-th non-blank, stripped string matched by ``path``.

        Returns None when nothing matches, when every match is blank, when
        ``pos`` is out of range, or when the matches are not strings.
        """
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            # Build a real list: indexing the result of map() only works
            # where map() returns a list, and fails on an iterator.
            cleaned = [text.strip() for text in texts if text.strip()]
            return cleaned[pos]
        except (AttributeError, IndexError, TypeError):
            return None

    def getPingLun(self,topic_url):
        """Return the 'cmt_sum' comment count reported for ``topic_url``.

        Returns 0 when the response has no 'cmt_sum' or the request/parse
        fails (the failure is logged).
        """
        pinglunshu = 0
        topic_url = urllib.quote(topic_url)
        link_p = 'http://changyan.sohu.com/api/2/topic/load?client_id=cyrhbddTW&topic_url=%s&page_size=3'%(topic_url)
        try:
            html = self.browser.visit(link_p)
            if 'cmt_sum' in html:
                data = json.loads(html)
                pinglunshu = data['cmt_sum']
        except Exception as e:
            self.log.info('get pinglunshu wrong!%s'%e)
        return pinglunshu

    def datatransform(self, data):
        """Convert a '%Y-%m-%d' date string to a local-time Unix timestamp.

        Raises ValueError when ``data`` does not match the format and
        TypeError when it is not a string (e.g. None).
        """
        return time.mktime(time.strptime(data,'%Y-%m-%d'))

示例#15

0

显示文件

文件： zhengwen_producer.py 项目： xiangjunchen/NewsCollector

class Zhengwen_producer(object):
    """Producer that collects article links from column pages and queues them.

    Column URLs come from the local file 'lanmu_href'. URLs containing 'gov'
    are handled as government-affairs listing pages (links go to
    ``queue_zhengwu``); every other URL is handled as a JSONP news feed
    (article links go to ``queue_news``, comment links to ``queue_pinglun``).
    """

    def __init__(self):
        # Factory objects.
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # Concrete objects built by the factories.
        self.log = Logging('./Log/log_zhengwen').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type,
                                               config.proxy_area,
                                               config.proxy_host,
                                               config.proxy_port)
        self.queue_news = self.queue_factory.create(config.queue_type,
                                                    private_config.queue_news,
                                                    config.queue_host,
                                                    config.queue_port)
        self.queue_zhengwu = self.queue_factory.create(
            config.queue_type, private_config.queue_zhengwu_zhenwgen,
            config.queue_host, config.queue_port)
        self.queue_pinglun = self.queue_factory.create(
            config.queue_type, private_config.queue_pinglun, config.queue_host,
            config.queue_port)

    def main(self):
        """Walk every column URL from ``get_lanmu`` and queue its links."""
        lanmu_list = self.get_lanmu()
        try:
            for url in lanmu_list:
                if 'gov' in url:
                    self.getGovHref(url)
                else:
                    self.getNewsHref(url)
            self.log.info('getting the zhengwen href successful!')
        except Exception as e:
            self.log.info(e)
            time.sleep(10)

    def getNewsHref(self, url):
        """Queue article and comment links from a NetEase news JSONP feed.

        Starts at ``url`` and continues with '<base>_02.js', '<base>_03.js',
        ... until the browser returns an empty response. Any error is logged
        and ends the walk for this column.
        """
        url_l = url
        page = 2
        try:
            while True:
                html = self.browser.visit(url_l, encoding='gbk')
                if not html:
                    break
                html = html.replace(' ', '').replace('\n', '')
                # Rewrite the JSONP wrapper 'data_callback([...])' into the
                # JSON object '{"data":[...]}'.
                html = str(html).replace('data_callback([',
                                         '{"data":[').replace('])', ']}')
                # JSON forbids a trailing comma before a closing bracket.
                html = re.sub(r",\s*?]", "]", html)
                # UTF-8 is already the default for json.loads; the explicit
                # ``encoding`` argument is rejected by newer json versions.
                datas = json.loads(html)
                for data in datas['data']:
                    docurl = data['docurl']  # article link
                    commenturl = data['commenturl']  # comment link
                    if not commenturl:
                        # Derive the comment page from the article id.
                        uid = docurl.split('.html')[0].split('/')[-1]
                        commenturl = 'http://comment.news.163.com/news3_bbs/%s.html' % uid
                    self.queue_pinglun.put(commenturl + '@@@@@@' + docurl)
                    self.queue_news.put(docurl)
                url_l = url.split('.js?')[0] + '_%s.js' % str(page).zfill(2)
                page += 1
        except Exception as e:
            self.log.info(e)

    def getGovHref(self, url):
        """Queue article links from a NetEase government-affairs listing.

        Page URLs look like http://gov.163.com/special/zwzx_n/ followed by
        http://gov.163.com/special/zwzx_n_02/ and so on. Any error is logged
        and ends the walk for this listing.
        """
        url_l = url
        try:
            totalpage = 17 if 'zwzx' in url else 5
            # Each pass fetches the current URL and then prepares page
            # ``page``, so pages 1 .. totalpage - 2 are fetched.
            # NOTE(review): confirm whether totalpage is meant as a page count.
            for page in range(2, totalpage):
                html = self.browser.visit(url_l, encoding='gbk')
                tree = etree.HTML(html)
                hrefs = tree.xpath('.//div[@class="cnt"]/ul/li/a/@href')
                for href in hrefs:
                    self.queue_zhengwu.put(href)
                url_l = url[0:-1] + '_%s/' % str(page).zfill(2)
        except Exception as e:
            self.log.info(e)

    def get_lanmu(self):
        """Read the column URLs, one per line, from the file 'lanmu_href'.

        Surrounding whitespace is stripped and blank lines are skipped.
        """
        lanmu_hrefs = []
        # ``with`` closes the file even if reading fails.
        with open('lanmu_href', 'r') as href_file:
            for line in href_file:
                href = line.strip()
                if href:
                    lanmu_hrefs.append(href)
        return lanmu_hrefs

    def textxpath(self, tree, path, pos=0):
        """Return the ``pos``-th non-blank, stripped string matched by ``path``.

        Returns None when nothing matches, when every match is blank, when
        ``pos`` is out of range, or when the matches are not strings.
        """
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            # Build a real list: indexing the result of map() only works
            # where map() returns a list, and fails on an iterator.
            cleaned = [text.strip() for text in texts if text.strip()]
            return cleaned[pos]
        except (AttributeError, IndexError, TypeError):
            return None

示例#16

0

显示文件

文件： zhengwen_zw_consumer.py 项目： xiangjunchen/NewsCollector

class Zhengwen_consumer():
    """Consumer that scrapes the government-affairs article pages queued in
    ``queue_zhengwu``, stores them through ``self.db`` and queues the comment
    page of every article that has comments.
    """

    def __init__(self):
        # Factory objects.
        self.field_factory = FieldFactory(u'网易新闻')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        self.db_factory = QueueFactory()

        # Concrete objects built by the factories.
        self.log = Logging('./Log/log_zhengwen').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type,
                                               config.proxy_area,
                                               config.proxy_host,
                                               config.proxy_port)
        self.queue_zhengwu = self.queue_factory.create(
            config.queue_type, private_config.queue_zhengwu_zhenwgen,
            config.queue_host, config.queue_port)
        self.queue_pinglun = self.queue_factory.create(
            config.queue_type, private_config.queue_pinglun, config.queue_host,
            config.queue_port)
        self.db = self.db_factory.create(config.db_type,
                                         config.db_table_news_zhengwen,
                                         config.db_host, config.db_port)

    def main(self):
        """Parse queued article URLs until the queue is empty.

        Sleeps 10-20 seconds (random) after each article.
        """
        while True:
            url = self.queue_zhengwu.get()
            if not url:
                self.log.info('new queue is empty!')
                break
            self.ParsePage(url)
            time.sleep(random.randint(10, 20))

    def ParsePage(self, url):
        """Scrape one article page and store the extracted fields.

        Errors are logged, the failing URL is printed and the method sleeps
        10 seconds; nothing is raised to the caller.
        """
        try:
            html = self.browser.visit(url, encoding='gbk')
            if not html:
                self.log.info('Parsing page wrong!')
                return
            field = self.field_factory.create('si_chuan_news')
            tree = etree.HTML(html)
            # Column / breadcrumb.
            lanmu = tree.xpath(
                './/span[@class="ep-crumb JS_NTES_LOG_FE"]/a/text()')
            # Title.
            biaoti = self.textxpath(tree, './/head/title/text()')
            # Keywords.
            guanjianci = self.textxpath(
                tree, './/head/meta[@name="keywords"]/@content')
            # Publish time; stays 0 when the page carries no timestamp.
            shijian = self.textxpath(
                tree, './/div[@class="ep-time-soure cDGray"]/text()')
            timestamp = 0
            if shijian:
                # re.search instead of findall(...)[0]: text without a
                # timestamp keeps the default instead of raising IndexError
                # and discarding the whole page.
                matched = re.search(
                    r'\d{4}-\d{1,2}-\d{1,2} \d{2}:\d{2}:\d{2}', shijian)
                if matched:
                    timestamp = self.datatransform(matched.group(0))
            # Article source.
            laiyuan = self.textxpath(
                tree, './/div[@class="ep-time-soure cDGray"]/a/text()')
            # Body text.
            wen_zhang_zheng_wen = xpathutil.get_Node_text(
                tree, './/div[@id="endText"]/p')
            # Image links.
            tu_pian_lian_jie = tree.xpath(
                './/div[@id="endText"]/p/img/@src')
            # Comment count (also queues the comment page when non-zero).
            ping_lun_shu_liang = self.GetPinglun(url)

            field.set('wen_zhang_wang_zhi', url)
            field.set('wen_zhang_lan_mu', ' '.join(lanmu))
            field.set('wen_zhang_biao_ti', biaoti)
            field.set('guan_jian_ci', guanjianci)
            field.set('fa_bu_shi_jian', timestamp)  # Unix timestamp
            field.set('wen_zhang_lai_yuan', laiyuan)
            field.set('wen_zhang_zheng_wen', wen_zhang_zheng_wen)
            field.set('tu_pian_lian_jie', tu_pian_lian_jie)
            field.set('ping_lun_shu_liang', ping_lun_shu_liang)
            field.set('id', url)
            data = field.make()
            if data:
                self.db.put(data)
                self.log.info('save data sucess!')
        except Exception as e:
            self.log.info(e)
            # Show which page failed; the parenthesised form prints the same
            # text whether print is a statement or a function.
            print(url)
            time.sleep(10)

    def GetPinglun(self, url):
        """Return the comment count for an article and queue its comment page.

        ``url`` looks like http://news.163.com/16/0808/14/BTV3ABLP00014AEE.html;
        the last path component without '.html' is the thread id. Returns 0
        when there are no comments or the lookup fails (failure is logged).
        """
        pinglunshu = 0
        try:
            uid = url.split('.html')[0].split('/')[-1]
            comment_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s' % uid
            html = self.browser.visit(comment_url)
            data = json.loads(html)
            if data["tcount"] != 0:
                pinglunshu = data["tcount"]
                uid = data["docId"]
                comment_url = 'http://comment.news.163.com/news_gov_bbs/%s.html' % uid
                self.queue_pinglun.put(comment_url + '@@@@@@' + url)
        except Exception as e:
            self.log.info('get pinglunshu wrong!%s' % e)
        return pinglunshu

    def textxpath(self, tree, path, pos=0):
        """Return the ``pos``-th non-blank, stripped string matched by ``path``.

        Returns None when nothing matches, when every match is blank, when
        ``pos`` is out of range, or when the matches are not strings.
        """
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            # Build a real list: indexing the result of map() only works
            # where map() returns a list, and fails on an iterator.
            cleaned = [text.strip() for text in texts if text.strip()]
            return cleaned[pos]
        except (AttributeError, IndexError, TypeError):
            return None

    def datatransform(self, data):
        """Convert a '%Y-%m-%d %H:%M:%S' string (e.g. '2016-08-08 10:01:56')
        to a local-time Unix timestamp.

        Raises ValueError when ``data`` does not match the format.
        """
        return time.mktime(time.strptime(data, '%Y-%m-%d %H:%M:%S'))

示例#17

0

显示文件

文件： zhengwen_pinglun_consumer.py 项目： Nikita-Ting/tuan_shi_wei

class Zhengwen_consumer():
    """Consumer that downloads the comments of queued articles.

    Items in ``queue_pinglun`` have the form
    '<comment page url>@@@@@@<article url>'. Every comment is stored through
    ``self.db``.
    """

    def __init__(self):
        # Factory objects.
        self.field_factory = FieldFactory(u'网易新闻')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        self.db_factory = QueueFactory()

        # Concrete objects built by the factories.
        self.log = Logging('./Log/log_pinglun').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area,
                                               config.proxy_host, config.proxy_port)
        self.queue_pinglun = self.queue_factory.create(config.queue_type, private_config.queue_pinglun,
                                                       config.queue_host, config.queue_port)
        self.db = self.db_factory.create(config.db_type, config.db_table_news_zhengwen,
                                         config.db_host, config.db_port)

    def main(self):
        """Process queued comment pages until the queue is empty.

        Sleeps 10-20 seconds (random) after each item. A failing or
        malformed item is logged and skipped.
        """
        while True:
            url = self.queue_pinglun.get()
            if not url:
                self.log.info('new queue is empty!')
                break
            try:
                # e.g. http://comment.news.163.com/news_gov_bbs/BTHNT83600234IG8.html@@@@@@<article url>
                # Unpacking happens inside the try so that one malformed
                # queue item is logged instead of killing the loop.
                comment_url, self.docurl = url.split('@@@@@@')
                if comment_url:
                    self.GetPage(comment_url)
            except Exception as e:
                self.log.info(e)
            time.sleep(random.randint(10,20))

    def GetPage(self,comment_url):
        """Fetch every page of comments for one thread and parse them.

        The thread id is the last path component of ``comment_url`` without
        '.html'. Pages of 30 comments are requested until the API reports
        'newListSize' == 0. Sleeps 5-10 seconds (random) between pages.
        """
        uid = comment_url.split('.html')[0].split('/')[-1]
        offset = 0
        limit = 30
        while True:
            js_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867' \
                     'c3d71db5856/threads/%s/comments/newList?offset=%s&limit=%s'%(uid,offset,limit)
            html = self.browser.visit(js_url)
            datas = json.loads(html)
            if datas["newListSize"] == 0:
                break
            self.ParseData(datas)
            offset += limit
            time.sleep(random.randint(5,10))

    def ParseData(self,datas):
        """Store every comment contained in one API response.

        ``datas['commentIds']`` holds comma-separated id chains; the last id
        of a chain is the comment itself. The stored text is the
        concatenation of the contents of every id in the chain that is
        present in ``datas['comments']``. Reads ``self.docurl``.
        """
        comments = datas['comments']
        for ids in datas['commentIds']:
            idList = ids.split(',')
            id_n = idList[-1]  # the last id is the current comment's id
            current = comments[id_n]
            field = self.field_factory.create('ping_lun')
            # URL of the article being commented on.
            field.set('news_url',self.docurl)
            # Comment time as a Unix timestamp.
            field.set('ping_lun_shi_jian',self.datatransform(current['createTime']))
            # Reply count.
            field.set('hui_fu_shu',0)
            # Like count.
            field.set('dian_zan_shu',current['vote'])
            # Comment id.
            field.set('ping_lun_id',id_n)
            # User nickname.
            field.set('yong_hu_ming',current['user']['nickname'])
            # User province.
            field.set('yong_hu_sheng_fen',current['user']['location'])
            # Comment text.
            content_all = u''.join(comments[comment_id]['content']
                                   for comment_id in idList
                                   if comment_id in comments)
            field.set('ping_lun_nei_rong',content_all)
            field.set('id',id_n)
            data = field.make()
            if data:
                self.db.put(data)
                self.log.info('save data sucess!')

    def datatransform(self, data):
        """Convert a '%Y-%m-%d %H:%M:%S' string (e.g. '2016-08-08 10:01:56')
        to a local-time Unix timestamp.

        Raises ValueError when ``data`` does not match the format.
        """
        return time.mktime(time.strptime(data,'%Y-%m-%d %H:%M:%S'))

示例#18

0

显示文件

文件： zhi_wei_href_consumer.py 项目： Nikita-Ting/tuan_shi_wei

class Consumer(object):
    """Consumer that downloads queued event pages as JSON and stores the
    extracted event fields through ``self.db``.
    """

    def __init__(self):
        # Factory objects.
        self.field_factory = FieldFactory(u'zhi_wei_shi_jian')
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()

        # Concrete objects built by the factories.
        self.log = Logging('./Log/zhiwei_message').get_logging()
        self.queue = self.queue_factory.create(config.queue_type, private_config.queue_table,
                                               config.queue_host, config.queue_port)
        self.db = self.queue_factory.create(config.db_type, config.db_table_zhi_wei,
                                            config.db_host, config.db_port)
        self.browser_2 = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area,
                                               config.proxy_host, config.proxy_port)

    def main(self):
        """Process queued links until the queue is empty.

        Sleeps 60 seconds after each link. A failing link is logged together
        with the exception and skipped.
        """
        while True:
            href = self.queue.get()
            if not href:
                self.log.info('queue is empty!')
                break
            try:
                html = self.Get_page(href,cookie_j='nocookie')
                if html:
                    self.Parse_data(html,href)
            except Exception as e:
                # Include the cause; the bare message hid why parsing failed.
                self.log.info('parse data wrong!%s' % e)
            time.sleep(60)

    def Parse_data(self,data,href):
        """Extract the event fields from the decoded JSON ``data`` and store them.

        ``data`` is read through its 'base', 'ration' and 'thread' keys.
        Raises AttributeError when 'base' or 'thread' is missing.
        """
        field = self.field_factory.create('zhi_wei_shi_jian')
        base = data.get('base')
        # Title.
        field.set('biao_ti',base.get('name'))
        # Page link.
        field.set('ye_mian_lian_jie',href)
        # Event tags.
        field.set('shi_jian_biao_qian',base.get('tag'))
        # Event trend.
        field.set('shi_jian_qu_shi',base.get('trend'))
        # Participating media. The separator is a unicode literal so it can
        # split the text strings json.loads produces without an implicit
        # byte-string decode.
        try:
            field.set('can_yu_mei_ti',base.get('media').split(u'、'))
        except Exception:
            self.log.info('get can_yu_mei_ti wrong!')
        # Participating verified Weibo accounts.
        try:
            field.set('can_yu_wei_bo_da_V',base.get('weibo').split(u'、'))
        except Exception:
            self.log.info('get can_yu_wei_bo_da_V wrong!')
        # Influence index.
        field.set('ying_xiang_li_zhi_shu',base.get('index'))
        # Similar events.
        field.set('tong_lei_shi_jian',data.get('ration'))
        # Related events: [times, time values].
        thread = data.get('thread')
        field.set('guan_lian_shi_jian',[thread.get('time'), thread.get('timevalue')])
        # Propagation chart.
        field.set('chuan_bo_tu_xing',thread.get('line'))
        # Key events.
        field.set('guan_jian_shi_jian',thread.get('info'))
        field.set('rawdata',data)
        field.set('id',href)
        save_data = field.make()
        if save_data:
            self.db.put(save_data)
            self.log.info('save data sucess!%s'%time.ctime())
        else:
            self.log.error(u'数据生成失败!')

    def Get_page(self,url,cookie_j):
        """Fetch ``url`` and return the response decoded from JSON."""
        headers = self.Get_header(refer=url,cookie_j=cookie_j)
        html = self.browser_2.visit(url=url,headers=headers,timeout=60,retry=5)
        return json.loads(html)

    def Get_header(self,refer,cookie_j):
        """Return the fixed request headers.

        ``refer`` and ``cookie_j`` are accepted but not used.
        """
        headers = { 'Host': 'ef.zhiweidata.com',
                 'User-Agent':'Explore/4.9.4 (iPhone; iOS 7.1.2; Scale/2.00)',
                 'Accept':'*/*',
                 'Accept-Language':'zh-Hans;q=1',
                 'Accept-Encoding':"gzip, deflate",
                 'Connection':"keep-alive"
        }
        return headers

示例#19

0

显示文件

文件： zhengwen_producer.py 项目： Nikita-Ting/tuan_shi_wei

class Zhengwen_producer(object):
    """Producer that collects article links from column pages and queues them.

    Column URLs come from the local file 'lanmu_href'. URLs containing 'gov'
    are handled as government-affairs listing pages (links go to
    ``queue_zhengwu``); every other URL is handled as a JSONP news feed
    (article links go to ``queue_news``, comment links to ``queue_pinglun``).
    """

    def __init__(self):
        # Factory objects.
        self.queue_factory = QueueFactory()
        self.browser_factory = BrowserFactory()
        self.proxy_factory = ProxyFactory()
        # Concrete objects built by the factories.
        self.log = Logging('./Log/log_zhengwen').get_logging()
        self.browser = self.browser_factory.create(config.browser_type)
        self.proxy = self.proxy_factory.create(config.proxy_type, config.proxy_area,
                                               config.proxy_host, config.proxy_port)
        self.queue_news = self.queue_factory.create(config.queue_type, private_config.queue_news,
                                                    config.queue_host, config.queue_port)
        self.queue_zhengwu = self.queue_factory.create(config.queue_type, private_config.queue_zhengwu_zhenwgen,
                                                       config.queue_host, config.queue_port)
        self.queue_pinglun = self.queue_factory.create(config.queue_type, private_config.queue_pinglun,
                                                       config.queue_host, config.queue_port)

    def main(self):
        """Walk every column URL from ``get_lanmu`` and queue its links."""
        lanmu_list = self.get_lanmu()
        try:
            for url in lanmu_list:
                if 'gov' in url:
                    self.getGovHref(url)
                else:
                    self.getNewsHref(url)
            self.log.info('getting the zhengwen href successful!')
        except Exception as e:
            self.log.info(e)
            time.sleep(10)

    def getNewsHref(self,url):
        """Queue article and comment links from a NetEase news JSONP feed.

        Starts at ``url`` and continues with '<base>_02.js', '<base>_03.js',
        ... until the browser returns an empty response. Any error is logged
        and ends the walk for this column.
        """
        url_l = url
        page = 2
        try:
            while True:
                html = self.browser.visit(url_l,encoding='gbk')
                if not html:
                    break
                html = html.replace(' ','').replace('\n','')
                # Rewrite the JSONP wrapper 'data_callback([...])' into the
                # JSON object '{"data":[...]}'.
                html = str(html).replace('data_callback([','{"data":[').replace('])',']}')
                # JSON forbids a trailing comma before a closing bracket.
                html = re.sub(r",\s*?]", "]", html)
                # UTF-8 is already the default for json.loads; the explicit
                # ``encoding`` argument is rejected by newer json versions.
                datas = json.loads(html)
                for data in datas['data']:
                    docurl = data['docurl']  # article link
                    commenturl = data['commenturl']  # comment link
                    if not commenturl:
                        # Derive the comment page from the article id.
                        uid = docurl.split('.html')[0].split('/')[-1]
                        commenturl = 'http://comment.news.163.com/news3_bbs/%s.html'%uid
                    self.queue_pinglun.put(commenturl+'@@@@@@'+docurl)
                    self.queue_news.put(docurl)
                url_l = url.split('.js?')[0]+'_%s.js' %str(page).zfill(2)
                page += 1
        except Exception as e:
            self.log.info(e)

    def getGovHref(self,url):
        """Queue article links from a NetEase government-affairs listing.

        Page URLs look like http://gov.163.com/special/zwzx_n/ followed by
        http://gov.163.com/special/zwzx_n_02/ and so on. Any error is logged
        and ends the walk for this listing.
        """
        url_l = url
        try:
            totalpage = 17 if 'zwzx' in url else 5
            # Each pass fetches the current URL and then prepares page
            # ``page``, so pages 1 .. totalpage - 2 are fetched.
            # NOTE(review): confirm whether totalpage is meant as a page count.
            for page in range(2,totalpage):
                html = self.browser.visit(url_l,encoding='gbk')
                tree = etree.HTML(html)
                hrefs = tree.xpath('.//div[@class="cnt"]/ul/li/a/@href')
                for href in hrefs:
                    self.queue_zhengwu.put(href)
                url_l = url[0:-1]+'_%s/'%str(page).zfill(2)
        except Exception as e:
            self.log.info(e)

    def get_lanmu(self):
        """Read the column URLs, one per line, from the file 'lanmu_href'.

        Surrounding whitespace is stripped and blank lines are skipped.
        """
        lanmu_hrefs = []
        # ``with`` closes the file even if reading fails.
        with open('lanmu_href','r') as href_file:
            for line in href_file:
                href = line.strip()
                if href:
                    lanmu_hrefs.append(href)
        return lanmu_hrefs

    def textxpath(self, tree, path, pos=0):
        """Return the ``pos``-th non-blank, stripped string matched by ``path``.

        Returns None when nothing matches, when every match is blank, when
        ``pos`` is out of range, or when the matches are not strings.
        """
        texts = tree.xpath(path)
        if not texts:
            return None
        try:
            # Build a real list: indexing the result of map() only works
            # where map() returns a list, and fails on an iterator.
            cleaned = [text.strip() for text in texts if text.strip()]
            return cleaned[pos]
        except (AttributeError, IndexError, TypeError):
            return None