Example #1
class GaoDunSpider(scrapy.spiders.Spider):
    name = "gaodun_fagui"
    start_urls = [
        "https://fagui.gaodun.com/",

    ]
    statuteData = StatuteData()
    # Spider entry point
    def parse(self, response):
        return self.parse_article_count(response)

    # Parse the article count (enqueue every list page)
    def parse_article_count(self, response):
        list_url='https://fagui.gaodun.com/index.php/Search/index/t/1/p/{0}.html'
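        # Paginated search listing; every page from 1 to 12925 is enqueued below (count hard-coded).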
        for page in range(1,12926):
            yield scrapy.Request(list_url.format(str(page)), callback=self.parse_article_list)


    # Fetch the article list from a given page
    def parse_article_list(self, response):
        lis=response.xpath('//div[@class="mesgebox"]/ul/li')
        for li in lis:
            article_url= 'https://fagui.gaodun.com'+li.xpath("div[@class='cb randwen randwen_2']/p/a/@href").extract_first()
            src = ''.join(li.css('.yxdbox img::attr(src)').extract())
            level=u'现行有效'
            if src != '':
                flag = int(re.match(r'.*/yx(?P<flag>\d+)\.png', src).group('flag'))
                if flag == 2:
                    level = u'已失效'
                elif flag == 3:
                    level = u'尚未生效'
                elif flag == 4:
                    level = u'已被修正'
            yield scrapy.Request(article_url,callback=self.parse_article,meta={'level':level})

    # Parse the article content
    def parse_article(self,response):
        item={}
        item["url"]=response.url
        top = response.xpath('//div[@class="topfond tac"]')
        item['title'] = ''.join(top.xpath('h1/text()').extract())
        item['pubish_org'] =''.join(top.xpath('h3/text()').extract())
        item['anNo'] = ''.join(top.xpath('p/text()').extract())
        item['pubish_time']= ''.join(response.xpath(u'//div[@class="towaltext"]/span[contains(text(),"发文时间:")]/text()').extract()).replace(u'发文时间:','').replace(' ','')
        item['effect_time']= ''.join(response.xpath(u'//div[@class="towaltext"]/span[contains(text(),"生效时间:")]/text()').extract()).replace(u'生效时间:','').replace(' ','')
        item['level']= u'行业团体规定'
        item['time_liness'] = response.meta['level']
        # Whether the record has been exported to the production database
        item['export'] = '0'
        item['source'] = u"高顿"
        content="".join(response.xpath('//div[@id="para"]').extract()).replace('\r\n', '')
        content=re.sub('(<a.*?>.*?</a>)|((class|style|color|href|target|align)="[^"]*?")|(<.*?>)|(<[/].*?>)', '', content)  # 内容'''
        item['content'] = content.replace("\r",'').replace("\n",'').replace("\t",'').replace(' ','').replace('附件下载:','')
        uid = str(uuid.uuid1()).replace('-', '')
        if item['content']!='' and item['title']!='':
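            # Column order: Id, Time_liness, Effect_time, Level, Pubish_time, Title, AnNo, Source, Pubish_org, Content, IsBuild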
            self.statuteData.insert_statute((uid, item['time_liness'], item['effect_time'], item['level'], item['pubish_time'],
                                             item['title'], item['anNo'], item['source'], item['pubish_org'],
                                             item["content"], 0))
        del item['content']
        print item
Example #2
class FaGuiWriterPipeline(object):
    def __init__(self):
        self.statuteData = StatuteData()

    def process_item(self, item, spider):
        line = dict(item)
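        # Column order: Id, Time_liness, Effect_time, Level, Pubish_time, Title, AnNo, Source,
        # Pubish_org, Content, ProvinceName, CityName, ProvinceCode, CityCode, SIndex, STypeName, IsBuild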
        if 'title' in line:
            self.statuteData.insert_statute(
                (item['Id'], item['time_liness'], item['effect_time'],
                 item['level'], item['pubish_time'], item['title'],
                 item['anNo'], item['source'], item['pubish_org'],
                 item["content"], item['provinceName'], item['cityName'],
                 item['provinceCode'], item['cityCode'], item['sIndex'],
                 item['sTypeName'], 0))
            return item

    def close_spider(self, spider):
        pass
Example #3
class BYNRSpider(scrapy.Spider):
    # Bayannur Municipal People's Government

    name = "bynr"
    statuteData = StatuteData()

    provinceName = u"内蒙古"
    cityName = u"巴彦淖尔市"
    provinceCode='15'
    cityCode='1502'
    level = u"地方法规"
    pubish_time='2019-04-24'

    allowed_domains=["www.bynr.gov.cn"]

    start_urls = [ 'http://www.bynr.gov.cn/xxgk/zwgkml/']

    page_domain = "http://www.bynr.gov.cn/xxgk/zwgkml/%s"

    def parse(self, response):
        send_requests = []
        pageSize = 19

        for item in response.css('.cont_left_zwgk_cont_foot ul li'):
            url = ''.join(item.css("a::attr(href)").extract())
            url=url.replace('../','').replace('./','')
            type=''.join(item.css("a::text").extract()).replace(' ','')
            dcount=''.join(item.css("::text").extract()).replace(' ','')
            dcount= re.findall("\d+",dcount)[0]
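            # Ceiling division: list pages needed for dcount documents at pageSize per page (Python 2 integer division).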
            page = (int(dcount) + pageSize - 1) / pageSize

            for index in range(1, page + 1):
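                # The first page is the bare section URL; page n (n >= 2) is index_<n-1>.html.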
                if index==1:
                    send_requests.append(scrapy.Request(self.page_domain % url, callback=self.parse_list, method='get', errback=self.handle_error ,meta={'sublevel': url}))
                else:
                    p=index-1
                    newurl=''.join((url, 'index_%d.html' % p ))
                    send_requests.append(scrapy.Request(self.page_domain % newurl, callback=self.parse_list, method='get', errback=self.handle_error ,meta={'sublevel': url}))

        return send_requests

    def parse_list(self, response):
        sublevel=response.meta['sublevel']
        for item in response.css(".cont_right_cont a"):
            detail_url=''.join((sublevel , ''.join(item.css("::attr(href)").extract()).replace('./','')))
            yield scrapy.Request(self.page_domain % detail_url, callback=self.parse_detail, method='get',errback=self.handle_error)
        pass

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('.zwgk_hei18::text').extract_first())
        if title != '':
            item['title'] = title
            item['anNo'] =''.join(response.css('.cont_right_cont_xilan table:nth-child(1) tr:nth-child(4) td:nth-child(2)::text').re('[^\s+]'))
            item['pubish_time'] =''.join(response.css('.cont_right_cont_xilan table:nth-child(1) tr:nth-child(2) td:nth-child(4)::text').re('[^\s+]'))
            item['effect_time'] = None
            item['pubish_org'] = ''.join(response.css('.cont_right_cont_xilan table:nth-child(1) tr:nth-child(2) td:nth-child(2)::text').extract())
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('.cont_right_cont_xilan table:nth-child(2) table:nth-child(2)').extract())
            item["content"] = re.sub('((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)|(?i)(<SCRIPT)[\\s\\S]*?((</SCRIPT>)|(/>))|(?i)(<style)[\\s\\S]*?((</style>)|(/>))', '',content)  # 内容'''
            #item["content"] = ''
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["cityName"] = self.cityName
            item["provinceCode"] = self.provinceCode
            item["cityCode"] = self.cityCode
            item['sIndex'] = ''.join(response.css('.cont_right_cont_xilan table:nth-child(1) tr:nth-child(1) td:nth-child(2)::text').extract_first())

            # Determine the document type from the title
            stype=''
            if '通知' in title:
                stype=u'通知'
            elif '通告' in title:
                stype=u'通告'
            elif '批复' in title:
                stype = u'批复'
            elif '命令' in title:
                stype = u'命令'
            elif '通报' in title:
                stype = u'通报'
            elif '意见' in title:
                stype = u'意见'
            elif '决定' in title:
                stype = u'决定'
            elif '公告' in title:
                stype = u'公告'
            else:
                stype = u'其他'

            item["sTypeName"] = stype
            item['source'] = u"巴彦淖尔市人民政府"
            # Whether the record has been exported to the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item


    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #4
class OrdosSpider(scrapy.Spider):
    # Ordos Municipal People's Government website

    name = "ordos"
    statuteData = StatuteData()

    provinceName = u"内蒙古"
    cityName = u"鄂尔多斯"
    provinceCode='15'
    cityCode='1505'
    level = u"地方法规"
    pubish_time='2019-04-23'

    allowed_domains=["ordos.gov.cn"]

    start_urls = [ 'http://xxgk.ordos.gov.cn/xxgk/channel/ordos_xxw/col10204f.html']

    page_domain = "http://xxgk.ordos.gov.cn%s"

    def parse(self, response):
        send_requests = []
        pageSize = 15

        for item in response.css('#tree4 div'):
            url=''.join(item.css("a::attr(href)").extract()).replace('../..','')
            type=''.join(item.css("a::text").extract()).replace(' ','')
            dcount=''.join(item.css("font::text").extract()).replace(' ','')
            dcount= re.findall("\d+",dcount)[0]
            page = (int(dcount) + pageSize - 1) / pageSize

            for index in range(1, page + 1):
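                # The first page is the bare column URL; later pages append &pos=<index>.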
                if index==1:
                    send_requests.append(scrapy.Request(self.page_domain % url, callback=self.parse_list, method='get', errback=self.handle_error,meta={'type':type}, ))
                else:
                    newurl=''.join((url, '&pos=%d' % index ))
                    send_requests.append(scrapy.Request(self.page_domain % newurl, callback=self.parse_list, method='get', errback=self.handle_error,meta={'type': type }, ))

        return send_requests



    def parse_list(self, response):
        for item in response.css(".recordlist a"):
            detail_url =  self.page_domain % ''.join(item.css("::attr(href)").extract()).replace('../..','')
            yield scrapy.Request(detail_url, callback=self.parse_detail, method='get',errback=self.handle_error,meta={'type': response.meta['type']})
        pass

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('#title::text').extract_first())
        if title != '':
            item['title'] = title
            item['anNo'] = ''.join(response.css('.detail table tr:nth-child(4) td:nth-child(2)::text').extract())
            item['pubish_time'] =''.join(response.css('.detail table tr:nth-child(4) td:nth-child(4)::text').extract())
            item['effect_time'] = None
            item['pubish_org'] = ''.join(response.css('.detail table tr:nth-child(2) td:nth-child(2)::text').extract())
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('#content').extract())
            item["content"] = re.sub('((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)', '',content)  # 内容'''
            #item["content"] = ''
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["cityName"] = self.cityName
            item["provinceCode"] = self.provinceCode
            item["cityCode"] = self.cityCode
            item['sIndex'] = ''.join(response.css('.detail table tr:nth-child(1) td:nth-child(2)::text').extract())
            item["sTypeName"] = response.meta["type"]
            item['source'] = u"鄂尔多斯市人民政府"
            # Whether the record has been exported to the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item


    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #5
class WFFGSpider(scrapy.Spider):
    """问法法律法规"""

    name = "wffagui_NoFilter"

    allowed_domains = ["www.51wf.com"]

    last_faguis = [
        {
            'source': u'部门规章',
            'pubish_time': '2018-04-27'
        },  # departmental rules
        #{'source':u'行政法规', 'pubish_time':'2012-11-12'},
    ]
    statuteData = StatuteData()
    start_urls = [
        "http://www.51wf.com/law/search--authority-1--page-1",  #宪法
        "http://www.51wf.com/law/search--authority-2--page-1",  # 行政行规
        "http://www.51wf.com/law/search--authority-3--page-1",  # 司法解释
        "http://www.51wf.com/law/search--authority-4--page-1",  # 部门规章
        "http://www.51wf.com/law/search--authority-5--page-1",  # 军事专项法
        "http://www.51wf.com/law/search--authority-6--page-1",  # 行政团体规定
        "http://www.51wf.com/law/search--authority-7--page-1",  # 地方法规规章
    ]

    def parse(self, response):
        for request in self.parse_list(response):
            yield request
        page_count = ''.join(
            response.xpath("//a[@name='last']/text()").extract())
        if page_count == "":
            return
        data_total = int(page_count)
        pageurl = str(response.url)[0:len(response.url) - 1]
        for page in range(2, data_total + 1):
            yield scrapy.Request(pageurl + str(page),
                                 callback=self.parse_list,
                                 method='get',
                                 errback=self.handle_error)

    def parse_list(self, response):
        domain = "http://www.51wf.com%s"
        for item in response.css(".lie_biao li"):
            detail_url = domain % item.css(
                ".xin_wen a::attr(href)").extract_first()
            #if datetime.strptime(self.last_faguis[0]['pubish_time'], "%Y-%m-%d") < datetime.strptime(item.css(".shi_jian span::text").extract_first(), "%Y-%m-%d"):
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 method='get',
                                 errback=self.handle_error)

    def parse_detail(self, response):
        item = {}
        item['title'] = "".join(response.css('.LL_bt_a::text').re('[^\s]'))
        # Document number (发文字号)
        item['anNo'] = "".join(
            response.xpath(
                u'//div[@class="LL_sx"]/p[contains(text(),"【发文字号】")]/text()').
            re('[^\s]')).replace(u'【发文字号】', '')
        # Promulgation date (颁布日期)
        item['pubish_time'] = "".join(
            response.xpath(
                u'//div[@class="LL_sx"]/p[contains(text(),"【颁布日期】")]/text()').
            re('[^\s]')).replace(u'【颁布日期】', '')
        # Validity status (时效性)
        if len(
                response.xpath(
                    u'//div[@class="LL_sx"]/p[contains(text(),"【时效性】")]')) > 1:
            item['time_liness'] = "".join(
                response.xpath(
                    u'//div[@class="LL_sx"]/p[contains(text(),"【时效性】")][2]/text()'
                ).re('[^\s]')).replace(u'【时效性】', '')
        else:
            item['time_liness'] = "".join(
                response.xpath(
                    u'//div[@class="LL_sx"]/p[contains(text(),"【时效性】")][1]/text()'
                ).re('[^\s]')).replace(u'【时效性】', '')
        # Effective date (生效日期)
        item['effect_time'] = "".join(
            response.xpath(
                u'//div[@class="LL_sx"]/p[contains(text(),"【生效日期】")]/text()').
            re('[^\s]')).replace(u'【生效日期】', '')
        # Legal-effect level (效力级别)
        item['level'] = "".join(
            response.xpath(
                u'//div[@class="LL_sx"]/p[contains(text(),"【效力级别】")]/span/a/text()'
            ).re('[^\s]'))
        # Issuing authority (颁布机构)
        item['pubish_org'] = "".join(
            response.xpath(
                u'//div[@class="LL_sx"]/p[contains(text(),"【颁布机构】")]/text()').
            re('[^\s]')).replace(u'【颁布机构】', '')
        #[@class!="law_realate"]
        content = ''.join(response.css('.law-content').extract())

        # Drop the embedded "related material" / "related entry" blocks
        for cxt in response.css('.law_realate').extract():
            content = content.replace(cxt, "")

        item["content"] = re.sub('(class|style|color|href|name)="[^"]*?"', '',
                                 content)  # strip presentation attributes
        item['url'] = response.url
        item['source'] = u"问法法规"
        uid = str(uuid.uuid1()).replace('-', '')
        self.statuteData.insert_statute(
            (uid, item['time_liness'], item['effect_time'], item['level'],
             item['pubish_time'], item['title'], item['anNo'], item['source'],
             item['pubish_org'], item["content"], 0))
        del item["content"]
        print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #6
class HuhhotSpider(scrapy.Spider):
    # Hohhot Municipal People's Government

    name = "huhhot"
    statuteData = StatuteData()

    provinceName = u"内蒙古"
    cityName = u"呼和浩特"
    provinceCode='15'
    cityCode='1506'
    level = u"地方法规"
    pubish_time='2019-04-23'

    allowed_domains=["www.huhhot.gov.cn"]

    start_urls = [ 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/']
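    # fgparamlist below maps each document type to its list-page count and URL template.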

    fgparamlist = [
        {'type': u'决定', 'page': '3', 'url':'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/219/index_74%s.html'},
        {'type': u'命令', 'page': '1', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/220/index_74%s.html'},
        {'type': u'通告', 'page': '4', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/221/index_74%s.html'},
        {'type': u'公告', 'page': '6', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/1888/index_74%s.html'},
        {'type': u'意见', 'page': '6', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/222/index_74%s.html'},
        {'type': u'通知', 'page': '67', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/223/index_74%s.html'},
        {'type': u'通报', 'page': '2', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/224/index_74%s.html'},
        {'type': u'批复', 'page': '30', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/225/index_74%s.html'},
        {'type': u'报告', 'page': '10', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/1020/index_74%s.html'},
        {'type': u'会议纪要', 'page': '3', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/1860/index_74%s.html'},
        {'type': u'其他', 'page': '67', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/226/index_74%s.html'},
    ]

    page_domain = "http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx%s"

    def parse(self, response):
        send_requests = []

        for item in self.fgparamlist:
            page = int(item['page'])
            listurl = item['url']
            for index in range(1, page + 1):
                if index==1:
                    newurl = listurl % ''
                    send_requests.append(scrapy.Request(newurl, callback=self.parse_list, method='get', errback=self.handle_error,meta={'type': item['type']}, ))
                else:
                    p = index - 1
                    newurl = (listurl % '_%s') % p
                    send_requests.append(scrapy.Request(newurl, callback=self.parse_list, method='get', errback=self.handle_error,meta={'type': item['type']}, ))

        return send_requests

    def parse_list(self, response):
        for item in response.css("#tbStu td a"):
            detail_url =  self.page_domain % ''.join(item.css("::attr(href)").extract()).replace('../..','')
            yield scrapy.Request(detail_url, callback=self.parse_detail, method='get',errback=self.handle_error,meta={'type': response.meta['type']})
        pass

    def parse_detail(self,response):
        item = {}
        title = ''.join(response.css('.zwgkxl_content h3::text').extract())
        if title != '':
            item['title'] = title
            item['anNo'] = ''.join(response.css('.xxgk_tlb tr:nth-child(2) td:nth-child(4)::text').extract())
            item['pubish_time'] = ''.join(response.css('.xxgk_tlb tr:nth-child(2) td:nth-child(6)::text').extract())
            item['effect_time'] = ''.join(response.css('.xxgk_tlb tr:nth-child(3) td:nth-child(6)::text').extract())
            item['pubish_org'] = ''.join(response.css('.xxgk_tlb tr:nth-child(3) td:nth-child(2)::text').extract())
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('.trs_word').extract())
            item["content"] = re.sub('((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)', '',content)  # 内容'''
            #item["content"] = ''
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["cityName"] = self.cityName
            item["provinceCode"] = self.provinceCode
            item["cityCode"] = self.cityCode
            item['sIndex'] = ''.join(response.css('.xxgk_tlb tr:nth-child(2) td:nth-child(2)::text').extract())
            item["sTypeName"] = response.meta["type"]
            item['source'] = u"呼和浩特市人民政府"
            # Whether the record has been exported to the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #7
class TongLiaoSpider(scrapy.Spider):
    # Tongliao Municipal Government website

    name = "tongliao"
    statuteData = StatuteData()

    provinceName = u"内蒙古"
    cityName = u"通辽市"
    provinceCode = '15'
    cityCode = '1508'
    level = u"地方法规"
    pubish_time = '2019-04-23'

    allowed_domains = ["www.tongliao.gov.cn"]

    start_urls = ['http://www.tongliao.gov.cn/tl/ztfl/gkml.shtml']

    page_domain = "http://www.tongliao.gov.cn%s"

    fgparamlist = [
        {'type': u'决定', 'page': '41', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=4f023e3455c2427d9a779bf9d5609b58&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'报告', 'page': '4', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=6c32e4a797504bf794249ca2282780bb&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'公告', 'page': '1', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=a2f5809614104fae871ffbe92629f036&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'通告', 'page': '2', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=6d92ebf491f445578629f29ae2d00acc&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'意见', 'page': '9', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=4d89a2fd76de4fb1a81eecad0ff3db1a&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'通知', 'page': '59', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=f50f51364fc9455181011222b2a67809&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'通报', 'page': '5', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=183816bd4c9843ab8ff177ccbd715321&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'批复', 'page': '1', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=93b71c1c6c624b059abfa9fb59f43990&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'其他', 'page': '32', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=8424888c9d614124ac179e6e0fd8569a&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
    ]

    def parse(self, response):
        send_requests = []

        for item in self.fgparamlist:
            page = int(item['page'])
            listurl = item['url']
            for index in range(1, page + 1):
                newurl = listurl % index
                send_requests.append(
                    scrapy.Request(
                        newurl,
                        callback=self.parse_list,
                        method='get',
                        errback=self.handle_error,
                        meta={'type': item['type']},
                    ))

        return send_requests

    def parse_list(self, response):
        for item in response.css(".dataList td a"):
            detail_url = self.page_domain % ''.join(
                item.css("::attr(href)").extract()).replace('../..', '')
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 method='get',
                                 errback=self.handle_error,
                                 meta={'type': response.meta['type']})
        pass

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('.textc::text').re('[^\s+]'))
        if title != '':
            item['title'] = title
            anNo = ''.join(
                response.css('.detail_ysj:nth-child(5) em::text').extract())
            if anNo == '':
                anNo = None
            item['anNo'] = anNo
            item['pubish_time'] = ''.join(
                response.css('.detail_ysj:nth-child(7) em::text').extract())
            item['effect_time'] = None
            item['pubish_org'] = ''.join(
                response.css('.detail_ysj:nth-child(2) em::text').extract())
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('#text01').extract())
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)',
                '', content)  # strip presentation attributes and images
            #item["content"] = ''
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["cityName"] = self.cityName
            item["provinceCode"] = self.provinceCode
            item["cityCode"] = self.cityCode
            item['sIndex'] = ''.join(
                response.css('.detail_ysj:nth-child(1) em::text').extract())
            item["sTypeName"] = response.meta["type"]
            item['source'] = u"通辽市人民政府"
            # Whether the record has been exported to the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #8
class NMGShengSpider(scrapy.Spider):
    # Inner Mongolia Autonomous Region People's Government website

    name = "nmgsheng"
    statuteData = StatuteData()

    provinceName = u"内蒙古"
    provinceCode = '15'
    level = u"地方法规"
    pubish_time = '2019-04-23'

    allowed_domains = ["www.nmg.gov.cn"]

    start_urls = ['http://www.nmg.gov.cn/col/col4191/index.html']

    fgparamlist = [{
        'type': u'决定',
        'page': '2',
        'params': {
            'infotypeId': '0',
            'jdid': '2',
            'nServiceid': '13',
            'vc_bm': 'NC1',
            'area': '1115000001151201XD'
        }
    }, {
        'type': u'命令',
        'page': '3',
        'params': {
            'infotypeId': '0',
            'jdid': '2',
            'nServiceid': '14',
            'vc_bm': 'NC2',
            'area': '1115000001151201XD'
        }
    }, {
        'type': u'通报',
        'page': '3',
        'params': {
            'infotypeId': '0',
            'jdid': '2',
            'nServiceid': '17',
            'vc_bm': 'NC5',
            'area': '1115000001151201XD'
        }
    }, {
        'type': u'意见',
        'page': '20',
        'params': {
            'infotypeId': '0',
            'jdid': '2',
            'nServiceid': '18',
            'vc_bm': 'NC6',
            'area': '1115000001151201XD'
        }
    }, {
        'type': u'批复',
        'page': '1',
        'params': {
            'infotypeId': '0',
            'jdid': '2',
            'nServiceid': '19',
            'vc_bm': 'NC7',
            'area': '1115000001151201XD'
        }
    }, {
        'type': u'通知',
        'page': '89',
        'params': {
            'infotypeId': '0',
            'jdid': '2',
            'nServiceid': '20',
            'vc_bm': 'NC8',
            'area': '1115000001151201XD'
        }
    }, {
        'type': u'公告',
        'page': '1',
        'params': {
            'infotypeId': '0',
            'jdid': '2',
            'nServiceid': '23',
            'vc_bm': 'NC11',
            'area': '1115000001151201XD'
        }
    }, {
        'type': u'通告',
        'page': '2',
        'params': {
            'infotypeId': '0',
            'jdid': '2',
            'nServiceid': '24',
            'vc_bm': 'NC12',
            'area': '1115000001151201XD'
        }
    }, {
        'type': u'其他',
        'page': '1',
        'params': {
            'infotypeId': '0',
            'jdid': '2',
            'nServiceid': '26',
            'vc_bm': 'NC14',
            'area': '1115000001151201XD'
        }
    }]

    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
    }

    def parse(self, response):
        send_requests = []

        for item in self.fgparamlist:
            page = int(item['page'])
            for index in range(1, page + 1):
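                # List pages are fetched by POSTing the category filter to serviceinfo.jsp, one request per currpage value.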
                url = "http://www.nmg.gov.cn/module/xxgk/serviceinfo.jsp?currpage=%d" % index
                send_requests.append(
                    scrapy.FormRequest(url=url,
                                       method="POST",
                                       formdata=item['params'],
                                       headers=self.headers,
                                       callback=self.parse_list,
                                       meta={'type': item['type']},
                                       errback=self.handle_error))
        return send_requests

    def parse_list(self, response):
        for item in response.css('table a[href*="http://www.nmg.gov.cn"]'):
            url = ''.join(item.css("::attr(href)").extract()).replace(
                '../..', '')
            yield scrapy.Request(url,
                                 callback=self.parse_detail,
                                 method='get',
                                 errback=self.handle_error,
                                 meta={'type': response.meta['type']})

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('.main-fl-tit::text').re('[^\s+]'))
        if title != '':
            item['title'] = title
            item['anNo'] = ''.join(
                response.css(
                    '.xxgk_table tr:nth-child(3) td:nth-child(2)::text').re(
                        '[^\s+]'))
            item['pubish_time'] = ''.join(
                response.css(
                    '.xxgk_table tr:nth-child(2) td:nth-child(4)::text').re(
                        '[^\s+]'))
            item['effect_time'] = None
            item['pubish_org'] = ''.join(
                response.css(
                    '.xxgk_table tr:nth-child(2) td:nth-child(2)::text').re(
                        '[^\s+]'))
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('#zoom').extract())
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)',
                '', content)  # strip presentation attributes and images
            #item["content"] = ''
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["provinceCode"] = self.provinceCode
            item["cityName"] = None
            item["cityCode"] = None
            item['sIndex'] = ''.join(
                response.css(
                    '.xxgk_table tr:nth-child(1) td:nth-child(2)::text').
                extract())
            item["sTypeName"] = response.meta["type"]
            item['source'] = u"内蒙古自治区人民政府"
            # Whether the record has been exported to the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item

        pass

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #9
class qfayuan_fagui_spider(scrapy.Spider):
    """中国法院网法规"""

    name = "qfyfagui"
    statuteData = StatuteData()
    allowed_domains = ["www.chinacourt.org"]

    start_urls = [
        'http://www.chinacourt.org/law/more/law_type_id/MzAwNEAFAA%3D%3D/page/1.shtml',  # national statute library
        'https://www.chinacourt.org/law/more/law_type_id/MzAwM0AFAA%3D%3D/page/1.shtml',  # judicial interpretations
        'https://www.chinacourt.org/law/more/law_type_id/MzAwMkAFAA%3D%3D/page/1.shtml',  # local regulations
        'https://www.chinacourt.org/law/more/law_type_id/MzAwMUAFAA%3D%3D/page/1.shtml'  # policy references
    ]
    page_domain = "http://www.chinacourt.org%s"

    def parse(self, response):
        for request in self.parse_list(response):
            yield request
        pageurlstr = ''.join(
            response.xpath(u'//a[text()="尾页"]/@href').extract())
        pagecount = int(
            re.match('.*/page/(?P<page>\d+)\.shtml', pageurlstr).group('page'))
        for page in range(2, pagecount + 1):
            pageurl = re.sub(r'(?P<page>\d+)\.shtml',
                             '{0}.shtml'.format(str(page)), response.url)
            yield scrapy.Request(pageurl,
                                 callback=self.parse_list,
                                 method='get',
                                 errback=self.handle_error)

    def parse_list(self, response):
        for item in response.xpath('//div[@class="law_list"]')[0].css("ul li"):
            detailurl = item.css('.left a::attr(href)').extract_first()
            title = item.css('.left a::text').extract_first()
            detail_url = self.page_domain % detailurl
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 method='get',
                                 errback=self.handle_error,
                                 meta={"title": title})

    def parse_detail(self, response):
        item = {}
        item['title'] = response.meta['title']
        if response.css(" .STitle") != None:
            STitle = "".join(
                response.css(".law_content .STitle").re('[^\s]')).split("<br>")
            item['anNo'] = ''
            item['pubish_time'] = None
            item['effect_time'] = None
            item['pubish_org'] = ''
            item['level'] = ''
            for st in STitle:
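                # Each metadata line has the form 【label】value; slice off everything up to the closing bracket.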
                sindex = st.find(u"】") + 1
                if st.find(u"发布文号") != -1:
                    # document number
                    item['anNo'] = st[sindex:len(st)]
                if st.find(u"发布日期") != -1:
                    # promulgation date
                    item['pubish_time'] = st[sindex:len(st)]

                if st.find(u"生效日期") != -1:
                    # effective date
                    item['effect_time'] = st[sindex:len(st)]

                # issuing authority
                if st.find(u"发布单位") != -1:
                    item['pubish_org'] = st[sindex:len(st)]

                # legal-effect level
                if st.find(u"所属类别") != -1:
                    item['level'] = st[sindex:len(st)]
            # validity status
            item['time_liness'] = ""
            content = ''.join(response.css('.content_text').extract())
            item["content"] = re.sub('(class|style|color|href)="[^"]*?"', '',
                                     content)  # strip presentation attributes
            item['url'] = response.url
            item['source'] = u"中国法院网"
            # Whether the record has been exported to the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            uid = str(uuid.uuid1()).replace('-', '')
            #Id,Time_liness,Effect_time,Level,Pubish_time,Title,AnNo,Source,Pubish_org,Content,IsBuild
            self.statuteData.insert_statute(
                (uid, u'', item['effect_time'], item['level'],
                 item['pubish_time'], item['title'], item['anNo'],
                 item['source'], item['pubish_org'], item["content"], 0))
            del item["content"]
            print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #10
class WYFGSpider(scrapy.Spider):
    """中国法院网法规"""

    name = "chinalawfagui"
    statuteData = StatuteData()
    allowed_domains = ["www.chinalaw.gov.cn"]

    start_urls = [
        'http://www.chinalaw.gov.cn',
    ]
    page_domain = "http://www.chinalaw.gov.cn%s"

    def parse(self, response):
        rqs = []
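        # chinalawData.json is a local export holding per-category entry lists (fvfg/df/xztt/bumen),
        # each entry carrying infostaticurl, listtitle and releasedate.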
        with open(os.path.abspath('./chinalawData.json'), 'r') as f:
            line = json.load(f)
            rqs.extend(self.parse_list(u"国家法律法规", line["fvfg"]))
            rqs.extend(self.parse_list(u"地方法规", line["df"]))
            rqs.extend(self.parse_list(u"行业团体规定", line["xztt"]))
            rqs.extend(self.parse_list(u"部门规章", line["bumen"]))
        return rqs

    def parse_list(self, level, source):
        for item in source:
            yield scrapy.Request(self.page_domain % item["infostaticurl"],
                                 callback=self.parse_detail,
                                 method='get',
                                 errback=self.handle_error,
                                 meta={
                                     'level': level,
                                     'title': item["listtitle"],
                                     'pub_time': item["releasedate"]
                                 })

    def parse_detail(self, response):
        item = {}
        title = response.meta['title']
        if title != '':
            item['title'] = title
            item['anNo'] = None
            item['pubish_time'] = response.meta['pub_time']
            item['effect_time'] = response.meta['pub_time']
            item['pubish_org'] = None
            item['level'] = response.meta['level']
            item['time_liness'] = "现行有效"
            content = ''.join(
                response.xpath('//div[@id="content"]/span').extract())
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)',
                '', content)  # strip presentation attributes and images
            item['url'] = response.url
            item['source'] = u"中国政府法制信息网"
            # Whether the record has been exported to the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            uid = str(uuid.uuid1()).replace('-', '')
            #Id,Time_liness,Effect_time,Level,Pubish_time,Title,AnNo,Source,Pubish_org,Content,IsBuild
            self.statuteData.insert_statute(
                (uid, u'现行有效', item['effect_time'], item['level'],
                 item['pubish_time'], item['title'], item['anNo'],
                 item['source'], item['pubish_org'], item["content"], 0))
            del item["content"]
            print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #11
class XAMSpider(scrapy.Spider):
    # Xing'an League Administrative Office

    name = "xinganmeng"
    statuteData = StatuteData()

    provinceName = u"内蒙古"
    cityName = u"兴安盟"
    provinceCode = '15'
    cityCode = '1512'
    level = u"地方法规"
    pubish_time = '2019-04-23'

    allowed_domains = ["xam.gov.cn"]

    start_urls = ['http://www.xam.gov.cn/xam/_300473/_300600/index.html']

    page_domain = "http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx%s"

    page_domain = "http://www.xam.gov.cn%s"

    def parse(self, response):
        send_requests = []

        for item in response.css('#tsxa_r a[href^="/xam"]'):
            url = ''.join(item.css("::attr(href)").extract()).replace(
                '../..', '')
            send_requests.append(
                scrapy.Request(self.page_domain % url,
                               callback=self.parse_list,
                               method='get',
                               errback=self.handle_error))

        return send_requests

    def parse_list(self, response):
        for item in response.css(".gkml_list a"):
            detail_url = ''.join(item.css("::attr(href)").extract()).replace(
                '../..', '')
            yield scrapy.Request(self.page_domain % detail_url,
                                 callback=self.parse_detail,
                                 method='get',
                                 errback=self.handle_error)

        # Handle pagination
        nexturl = ''.join(
            response.css('.gkml_con .page a:nth-child(5)::attr(tagname)').
            extract()).replace('../..', '')
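        # The fifth pager link apparently keeps the literal '[NEXTPAGE]' placeholder when there is no
        # further page; any other tagname value is treated as the next list URL.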
        if nexturl != '[NEXTPAGE]':
            yield scrapy.Request(self.page_domain % nexturl,
                                 callback=self.parse_list,
                                 method='get',
                                 errback=self.handle_error)

        pass

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('.qzqd_tit::text').re('[^\s+]'))
        if title != '':
            item['title'] = title
            item['anNo'] = ''.join(
                response.css('.xxgk_top li:nth-child(3)::text').re('[^\s+]'))
            item['pubish_time'] = ''.join(
                response.css('.xxgk_top li:nth-child(4)::text').re(
                    '[^\s+]')).replace(u"年",
                                       "-").replace(u"月",
                                                    "-").replace(u"日", "")
            item['effect_time'] = None
            item['pubish_org'] = ''.join(
                response.css('.xxgk_top li:nth-child(2)::text').re('[^\s+]'))
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('.content_xilan').extract())
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)|(?i)(<SCRIPT)[\\s\\S]*?((</SCRIPT>)|(/>))|(?i)(<style)[\\s\\S]*?((</style>)|(/>))',
                '', content)  # strip attributes, images, scripts and inline styles
            #item["content"] = ''
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["cityName"] = self.cityName
            item["provinceCode"] = self.provinceCode
            item["cityCode"] = self.cityCode
            item['sIndex'] = ''.join(
                response.css('.xxgk_top li:nth-child(1)::text').re('[^\s+]'))

            # Determine the document type from the title
            stype = ''
            if '通知' in title:
                stype = u'通知'
            elif '通告' in title:
                stype = u'通告'
            elif '批复' in title:
                stype = u'批复'
            elif '命令' in title:
                stype = u'命令'
            elif '通报' in title:
                stype = u'通报'
            elif '意见' in title:
                stype = u'意见'
            elif '决定' in title:
                stype = u'决定'
            elif '公告' in title:
                stype = u'公告'
            else:
                stype = u'其他'

            item["sTypeName"] = stype
            item['source'] = u"兴安盟行政公署"
            # Whether the record has been exported to the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #12
    def __init__(self):
        self.statuteData = StatuteData()
Example #13
class WYFGSpider(scrapy.Spider):
    """中国法院网法规"""

    name = "fyfagui"
    statuteData = StatuteData()
    allowed_domains = ["www.chinacourt.org"]
    # Latest publish date already crawled for each source
    last_faguis = [{
        'source': u'地方法规',
        'pubish_time': '1950-03-29'
    }, {
        'source': u'国家法律法规',
        'pubish_time': '1949-09-29'
    }, {
        'source': u'司法解释',
        'pubish_time': '2011-03-01'
    }, {
        'source': u'政策参考',
        'pubish_time': '1987-05-31'
    }]
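    # Only statutes published after these per-source checkpoint dates are fetched (incremental crawl).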

    start_urls = [
        'http://www.chinacourt.org/law/more/law_type_id/MzAwNEAFAA%3D%3D/page/1.shtml',  # national statute library
        'https://www.chinacourt.org/law/more/law_type_id/MzAwM0AFAA%3D%3D/page/1.shtml',  # judicial interpretations
        'https://www.chinacourt.org/law/more/law_type_id/MzAwMkAFAA%3D%3D/page/1.shtml',  # local regulations
        'https://www.chinacourt.org/law/more/law_type_id/MzAwMUAFAA%3D%3D/page/1.shtml'  # policy references
    ]
    page_domain = "http://www.chinacourt.org%s"

    def find_last_faguis(self, source, pub_time):
        has_next = 1
        for item in self.last_faguis:
            if item['source'] == source and datetime.strptime(
                    item['pubish_time'], "%Y-%m-%d") < datetime.strptime(
                        pub_time, "%Y-%m-%d"):
                has_next = 0
                break
        return has_next

    def find_spider_max_fagui(self, response):
        has_next = 1
        source = response.xpath('//div[@id="title"]/text()').extract_first()
        for item in response.xpath('//div[@class="law_list"]')[0].css(
                "ul li").css('.right::text').extract():
            has_next = self.find_last_faguis(source, item)
            if has_next == 1:
                break
        return has_next

    def parse(self, response):
        page_list = []
        page_list.extend(self.parse_list(response))
        if self.find_spider_max_fagui(response) == 0:
            pageIndex = int(
                re.match('.*/page/(?P<page>\d+)\.shtml',
                         response.url).group('page'))
            if response.xpath(u'//a[text()="下一页"]'):
                pageurl = re.sub(r'(?P<page>\d+)\.shtml',
                                 '{0}.shtml'.format(str(pageIndex + 1)),
                                 response.url)
                page_list.append(
                    scrapy.Request(pageurl,
                                   callback=self.parse,
                                   method='get',
                                   errback=self.handle_error))
        return page_list

    def parse_list(self, response):
        page_list = []
        source = response.xpath('//div[@id="title"]/text()').extract_first()
        for item in response.xpath('//div[@class="law_list"]')[0].css("ul li"):
            detailurl = item.css('.left a::attr(href)').extract_first()
            if self.find_last_faguis(
                    source,
                    ''.join(item.xpath('span[2]/text()').extract())) == 0:
                page_list.append(
                    scrapy.Request(self.page_domain % detailurl,
                                   callback=self.parse_detail,
                                   method='get',
                                   errback=self.handle_error))
        return page_list

    def parse_detail(self, response):
        item = {}
        title = "".join(response.xpath("//strong[1]/text()").re('[^\s]'))
        item['title'] = title
        if response.css(".law_content .STitle") != None and title != '':
            STitle = "".join(
                response.css(".law_content .STitle").re('[^\s]')).split("<br>")
            item['anNo'] = None
            item['pubish_time'] = None
            item['effect_time'] = None
            item['pubish_org'] = None
            item['level'] = None
            for st in STitle:
                sindex = st.find(u"】") + 1
                if st.find(u"发布文号") != -1:
                    # document number
                    item['anNo'] = st[sindex:len(st)]
                if st.find(u"发布日期") != -1:
                    # promulgation date
                    item['pubish_time'] = st[sindex:len(st)]

                if st.find(u"生效日期") != -1:
                    # effective date
                    item['effect_time'] = st[sindex:len(st)]

                # issuing authority
                if st.find(u"发布单位") != -1:
                    item['pubish_org'] = st[sindex:len(st)]

                # legal-effect level
                if st.find(u"所属类别") != -1:
                    item['level'] = st[sindex:len(st)]
            # validity status
            item['time_liness'] = ""
            content = ''.join(response.css('.content_text').extract())
            item["content"] = re.sub('(class|style|color|href)="[^"]*?"', '',
                                     content)  # strip presentation attributes
            item['url'] = response.url
            item['source'] = u"中国法院网"
            # Whether the record has been exported to the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            uid = str(uuid.uuid1()).replace('-', '')
            #Id,Time_liness,Effect_time,Level,Pubish_time,Title,AnNo,Source,Pubish_org,Content,IsBuild
            # self.statuteData.insert_statute((uid,u'',item['effect_time'],item['level'],item['pubish_time'],item['title'],item['anNo'],item['source'],item['pubish_org'],item["content"],0))
            del item["content"]
            print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #14
class LawtimeSpider(scrapy.spiders.Spider):
    name = "lawtime"
    start_urls = [
        "http://law.lawtime.cn/gjfg_2_100.html",
        "http://law.lawtime.cn/gjfg_3_101.html",
        "http://law.lawtime.cn/gjfg_4_102.html",
        "http://law.lawtime.cn/gjfg_5_117.html",
        "http://law.lawtime.cn/sfjs.html",
        "http://law.lawtime.cn/dffg_2_107.html",
        "http://law.lawtime.cn/dffg_3_114.html",
        "http://law.lawtime.cn/dffg_4_116.html",
        "http://law.lawtime.cn/dffg_5_115.html",
        "http://law.lawtime.cn/gjty.html", "http://law.lawtime.cn/hygf.html",
        "http://law.lawtime.cn/lfca.html",
        "http://law.lawtime.cn/lifadongtai.html"
    ]
    statuteData = StatuteData()

    # Spider entry point
    def parse(self, response):
        return self.parse_article_count(response)

    # Parse the number of pages and enqueue each list page
    def parse_article_count(self, response):
        page_count = response.xpath(
            '//div[@class="paging"]/a[@class="dot"]/following-sibling::*[1]/text()'
        ).extract_first()
        if page_count is None:
            return

        for index in range(1, int(page_count) + 1):
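            # Page n of a category is <base>_<n>.html, derived from the start URL.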
            extension_index = response.url.index(".html")
            url = response.url[0:extension_index] + "_" + str(index) + ".html"
            yield scrapy.Request(url, callback=self.parse_article_list)

    # Fetch the article list from a given page
    def parse_article_list(self, response):
        spans = response.xpath('//ul[@class="kc_complex_ul"]/li/span')
        for span in spans:
            article_url = span.xpath("a/@href").extract_first()
            if article_url is not None:
                yield scrapy.Request("http:" + article_url,
                                     callback=self.parse_article)

    # Parse the article content
    def parse_article(self, response):
        item = {}
        item["url"] = response.url
        top = response.xpath('//div[@class="a_cont_top"]')
        item['title'] = top.xpath('h1/text()').extract()[0]
        organization = top.xpath(
            'p/span[contains(text(),"颁布单位")]/following-sibling::span[1]/text()'
            .decode("utf-8")).extract()
        publish_time = top.xpath(
            'p/span[contains(text(),"颁布时间")]/following-sibling::span[1]/text()'
            .decode("utf-8")).extract()
        release_time = top.xpath(
            'p/span[contains(text(),"实施日期")]/following-sibling::span[1]/text()'
            .decode("utf-8")).extract()
        level = top.xpath(
            'p/span[contains(text(),"效力级别")]/following-sibling::span[1]/text()'
            .decode("utf-8")).extract()
        effect = top.xpath(
            'p/span[contains(text(),"时效性")]/following-sibling::span[1]/text()'.
            decode("utf-8")).extract()
        anNo = top.xpath(
            'p/span[contains(text(),"发文字号")]/following-sibling::span[1]/text()'
            .decode("utf-8")).extract()
        item['anNo'] = ''
        item['pubish_time'] = ''
        item['effect_time'] = ''
        item['pubish_org'] = ''
        item['level'] = ''
        item['time_liness'] = u'已失效'
        if (len(organization) != 0):
            item['pubish_org'] = organization[0].replace("\r\n",
                                                         "").split(u" ")[0]
        if (len(publish_time) != 0):
            item['pubish_time'] = publish_time[0]
        if (len(release_time) != 0):
            item['effect_time'] = release_time[0]
        if (len(level) != 0):
            item['level'] = level[0].replace("\r\n", "")
        if (len(effect) != 0):
            df = effect[0].replace(" ", "").replace("\n", "")
            if df == u'有效':
                item['time_liness'] = u'现行有效'
            else:
                item['time_liness'] = u'已失效'
        if (len(anNo) != 0):
            item['anNo'] = anNo[0].replace("\r\n", "").replace('-', '')
        # Whether the record has been exported to the production database
        item['export'] = '0'
        item['source'] = u"法律快车"
        content = "".join(
            response.xpath('//div[@class="a_cont_main"]').extract()).replace(
                '\r\n', '')
        content = re.sub('(class|style|color|href|target|align)="[^"]*?"', '',
                         content)  # strip presentation attributes
        item['content'] = content
        uid = str(uuid.uuid1()).replace('-', '')
        self.statuteData.insert_statute(
            (uid, item['time_liness'], item['effect_time'], item['level'],
             item['pubish_time'], item['title'], item['anNo'], item['source'],
             item['pubish_org'], item["content"], 0))
        del item['content']
        print item
Example #15
class cjfayuan_fagui_spider(scrapy.Spider):
    """财经法规"""

    name = "cjfyfagui"
    statuteData = StatuteData()
    allowed_domains = ["www.chinaacc.com"]

    start_urls = [
        # 'http://www.chinacourt.org/law/more/law_type_id/MzAwNEAFAA%3D%3D/page/1.shtml',  # national statute library
        # 'https://www.chinacourt.org/law/more/law_type_id/MzAwM0AFAA%3D%3D.shtml',  # judicial interpretations
        'http://www.chinaacc.com/fagui/search.shtm',  # local regulations
        # 'https://www.chinacourt.org/law/more/law_type_id/MzAwMUAFAA%3D%3D/page/1.shtml'  # policy references
    ]
    page_domain = "http://www.chinaacc.com%s"

    def parse(self, response):
        pageurl = 'http://www.chinaacc.com/dffg/page{0}.shtm'
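        # The local-regulation index is paged as /dffg/page<N>.shtm; pages 1..4119 are enqueued (count hard-coded).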
        for item in range(1, 4120):
            yield scrapy.Request(pageurl.format(str(item)),
                                 callback=self.parse_list,
                                 method='get',
                                 errback=self.handle_error)

    def parse_list(self, response):
        for item in response.xpath('//div[@class="lqnr clearfix"]/dl/dd'):
            detailurl = item.css('a::attr(href)').extract_first()
            detail_url = self.page_domain % detailurl
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 method='get',
                                 errback=self.handle_error)

    def parse_detail(self, response):
        item = {}
        item['title'] = response.xpath(
            "//div[@class='top clearfix']/h1/text()").extract_first()
        # Document number (发文字号)
        item['anNo'] = ''.join(
            response.xpath("//div[@class='top clearfix']").css(
                ".c::text").extract())
        # Promulgation date (颁布日期)
        item['pubish_time'] = ''.join(
            response.xpath("//div[@class='top clearfix']").css(".b::text").re(
                '[^\s]')).replace(u'颁布时间:', '').replace('\n', '').replace(
                    '\r', '').replace('\t', '').replace(' ', '')

        # Effective date (taken from the same 颁布时间 field as pubish_time)
        item['effect_time'] = ''.join(
            response.xpath("//div[@class='top clearfix']").css(".b::text").re(
                '[^\s]')).replace(u'颁布时间:', '').replace('\n', '').replace(
                    '\r', '').replace('\t', '').replace(' ', '')

        # Issuing authority (颁布机构)
        item['pubish_org'] = ''.join(
            response.xpath("//div[@class='top clearfix']").css(
                ".b span::text").re('[^\s]')).replace(u'发文单位:', '').replace(
                    '\n', '').replace('\r', '').replace('\t',
                                                        '').replace(' ', '')

        # Legal-effect level (效力级别)
        item['level'] = u'地方法规'
        # Validity status (时效性)
        item['time_liness'] = ""
        content = ''.join(
            response.xpath('//div[@class="cen clearfix"]').extract())
        item["content"] = re.sub(
            '(class|style|color|href|target|align)="[^"]*?"', '',
            content).replace(u'【', '').replace(u'我要纠错', '').replace(
                u'】 责任编辑:',
                '').replace(u'大白兔', '').replace(u'小海鸥',
                                                '').replace('qzz', '')  # strip attributes and site boilerplate strings
        item['url'] = response.url
        item['source'] = u"中华会计网校"
        # Whether the record has been exported to the production database
        item['export'] = '0'
        item['collection'] = 'fagui'
        uid = str(uuid.uuid1()).replace('-', '')
        # Id,Time_liness,Effect_time,Level,Pubish_time,Title,AnNo,Source,Pubish_org,Content,IsBuild
        self.statuteData.insert_statute(
            (uid, u'', item['effect_time'], item['level'], item['pubish_time'],
             item['title'], item['anNo'], item['source'], item['pubish_org'],
             item["content"], 0))
        del item["content"]
        print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)