Example #1
    def parse(self, response):
        print(self.hello)
        print('--' * 1000)
        li_list = response.xpath("//ul[@class='title-state-ul']/li")
        for li in li_list:
            item = YangguangItem()
            item['id'] = li.xpath(
                "./span[@class='state1']/text()").extract_first()
            item['title'] = li.xpath(
                "./span[@class='state3']/a/text()").extract_first()
            item['href'] = li.xpath(
                "./span[@class='state3']/a/@href").extract_first()
            item['href'] = 'http://wzzdg.sun0769.com/' + item['href']
            item['public_date'] = li.xpath(
                "./span[@class='state5']/text()").extract_first()
            item['sleep_date'] = li.xpath(
                "./span[@class='state4']/text()").extract_first()

            yield scrapy.Request(item['href'],
                                 callback=self.parse_detail,
                                 meta={'item': item})

        # pagination: follow the next-page link once per response, not once per item
        next_url = response.xpath(
            "//a[@class='arrow-page prov_rota']/@href").get()
        if next_url is not None:
            next_url = 'http://wzzdg.sun0769.com' + next_url
            yield scrapy.Request(next_url, callback=self.parse)
Example #2
File: yg.py Project: zhouzhou0/python_zhou
    def parse(self, response):
        # self.settings["MONGO_HOST"]
        # self.settings.get("MONGO_HOST", "")
        # group the rows
        print(self.hello, "*" * 100)
        tr_list = response.xpath(
            "//div[@class='greyframe']//table[2]//tr/td/table//tr")
        for tr in tr_list:
            # create a fresh item per row so yielded requests don't share one mutable item
            item = YangguangItem()
            item['id'] = tr.xpath("./td[1]/text()").extract_first()
            item['title'] = tr.xpath("./td[2]/a[2]/text()").extract_first()
            item['content_url'] = tr.xpath(
                "./td[2]/a[2]/@href").extract_first()
            item['city'] = tr.xpath("./td[2]/a[3]/text()").extract_first()
            item['state'] = tr.xpath("./td[3]/span/text()").extract_first()
            item['complainant'] = tr.xpath("./td[4]/text()").extract_first()
            item['time'] = tr.xpath("./td[5]/text()").extract_first()
            # logger.warning(item)
            # yield item
            yield scrapy.Request(item['content_url'],
                                 callback=self.parse_detail,
                                 meta={"item": item})
        next_url = response.xpath("//a[text()='>']/@href").extract_first()

        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)
Example #3
File: yg.py Project: wanghp1982/yangguang
 def parse(self, response):
     tr_list = response.xpath(
         '//div[@class="width-12"]//ul[@class="title-state-ul"]/li')
     for tr in tr_list:
         item = YangguangItem()
         item["title_NO"] = tr.xpath(
             './span[@class="state1"]/text()').extract_first()
         item["status"] = tr.xpath(
             './span[@class="state2"]/text()').extract_first()
         item["title"] = tr.xpath(
             './span[@class="state3"]/a/text()').extract_first()
         item["title_url"] = tr.xpath(
             './span[@class="state3"]/a/@href').extract_first()
         item["title_url"] = self.base_url + item["title_url"]
         item["askTime"] = tr.xpath(
             './span[@class="state4"]/text()').extract_first()
         item["answerTime"] = tr.xpath(
             './span[@class="state5 "]/text()').extract_first()
         # print(item)
         logger.warning(item)
         yield scrapy.Request(item["title_url"],
                              callback=self.parse_detail,
                              meta={"item": item})
     # pagination: look for the next-page link once per response
     next_url = response.xpath(
         '//div[@class="mr-three paging-box"]//a[@class="arrow-page prov_rota"]/@href'
     ).extract_first()
     if next_url is not None:
         yield scrapy.Request(self.base_url + next_url,
                              callback=self.parse)
Example #4
    def parse(self, response):
        # group the rows
        tr_list = response.xpath('//div[@id="morelist"]//table//table//tr')
        # print(len(tr_list))

        for tr in tr_list:
            item = YangguangItem()
            item['title'] = tr.xpath(
                './/a[@class="news14"]/text()').extract_first()
            item['author'] = tr.xpath('.//td[last()-1]/text()').extract_first()
            item['href'] = tr.xpath('.//td[2]/a[2]/@href').extract_first()
            item['status'] = tr.xpath(
                './/td[last()-2]/span/text()').extract_first()
            item['publish_time'] = tr.xpath(
                './/td[last()]/text()').extract_first()

            # handle the detail page
            yield scrapy.Request(item['href'],
                                 callback=self.parse_detail,
                                 meta={"item": item})

        # build the next-page url
        page = response.xpath(
            '//div[@class="pagination"]/span/text()').extract_first()
        next_url = response.xpath('//a[text()=">"]/@href').extract_first()
        if next_url is not None:
            print('page {} scraped'.format(page))
            yield scrapy.Request(next_url, callback=self.parse)
Example #5
    def parse_item(self, response):
        item = YangguangItem()

        # the detail page labels the question "提问:<title>" and the id "编号:<number>"
        item['title'] = re.search(r"提问:(.*)", response.css("div.wzy1 span.niae2_top::text").extract_first()).group(1)
        item['number'] = re.search(r"编号:(.*)", response.css("div.wzy1 span.niae2_top+span::text").extract_first()).group(1)
        # body text, with the trailing non-breaking space stripped
        item['content'] = response.css("div.wzy1 table:nth-child(2) tr:first-child td::text").extract_first().strip('\xa0')
        item['url'] = response.url

        yield item
Example #6
    def parse(self, response):
        tr_list = response.xpath("//div[@class='pagecenter']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath("./td[2]/a[@class='news14]/@title").extract_first()
            item["href"] = tr.xpath("./td[2]/a[@class='news14]/@href").extract_first()
            item["publish_date"] = tr.xpath("./td[last()]/text()").extract_first()

            yield scrapy.Request{
                item["href"]
                callback = self.parse_detail(meta)
                meta = {"item": item}
            }
Example #7
File: can.py Project: aimlv/-
    def parse(self, response):
        tr_list = response.xpath("//div[@class='newsHead clearfix']/table[2]/tr")
        for tr in tr_list:
            item = YangguangItem()
            item['title'] = tr.xpath("./td[3]/a[1]/text()").extract_first()
            item['href'] = tr.xpath("./td[3]/a[1]/@href").extract_first()
            item['publish_data'] = tr.xpath("./td[6]/text()").extract_first()

            yield scrapy.Request(
                item['href'],
                callback=self.parse_detail,
                meta={'item': item}
            )
Example #8
File: yg.py Project: avalonFate/python-
 def parse(self, response):
     tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
     for tr in tr_list:
         item = YangguangItem()  #实例化一个Itme
         item["num"] = tr.xpath("./td[1]/text()").extract_first()
         item["title"] = tr.xpath("./td[2]/a[2]/@title").extract_first()
         item["href"] = tr.xpath("./td[2]/a[2]/@href").extract_first()
         item["stats"] = tr.xpath("./td[3]/span/text()").extract_first()
         item["author_name"] = tr.xpath("./td[4]/text()").extract_first()
         yield scrapy.Request(  #发送关于详情页的请求
             item["href"],
             callback=self.parse_detail,
             meta={"item":item}  #meta携带参数,meta是个字典,给回调函数的parse传递参数
         )
Example #9
 def parse(self, response):
     # extract the data on the current page: group the rows, then pull fields
     tr_list = response.xpath(
         "//div[@class='greyframe']/table[2]/tr/td/table/tr")
     print(tr_list)
     for tr in tr_list:
         item = YangguangItem()
         item['num'] = tr.xpath("./td[1]/text()").extract_first()
         item['title'] = tr.xpath("./td[2]/a/text()").extract_first()
         item['href'] = tr.xpath("./td[2]/a/@href").extract_first()
         item['status'] = tr.xpath("./td[3]/span/text()").extract_first()
         item['name'] = tr.xpath("./td[4]/text()").extract_first()
         item['publish_data'] = tr.xpath("./td[5]/text()").extract_first()
         print(item)
Example #10
File: yg.py Project: lxconfig/web_spider
 def parse(self, response):
     tr_list = response.xpath('//div[@class="greyframe"]/table[2]/tr/td/table/tr')
     # 分组
     for tr in tr_list:
         item = YangguangItem()
         item["ID"] = tr.xpath('./td[1]/text()').extract_first()
         item["title"] = tr.xpath('./td[2]/a[2]/@title').extract_first()
         item["href"] = tr.xpath('./td[2]/a[2]/@href').extract_first()
         item["status"] = tr.xpath('./td[3]/span/text()').extract_first()
         item["user_name"] = tr.xpath('./td[4]/text()').extract_first()
         item["publish_time"] = tr.xpath('./td[5]/text()').extract_first()
         yield scrapy.Request(item["href"], callback=self.parse_detail, meta={"item": item})
     # pagination
     next_url = response.xpath('//div[@class="pagination"]/a[text()=">"]/@href').extract_first()
     if next_url is not None:
         yield scrapy.Request(next_url, callback=self.parse)
Example #11
File: yg.py Project: python-steven/spider
    def parse(self, response):
        tr_list = response.xpath(
            "//div[@class='greyframe']/table[2]/tr/td/table/tr")
        print(len(tr_list))
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath("./td[2]/a[@class='news14']/@title").get()
            item["href"] = tr.xpath("./td[2]/a[@class='news14']/@href").get()
            item["publish_date"] = tr.xpath("./td[last()]/text()").get()

            yield scrapy.Request(item['href'],
                                 callback=self.parse_detail,
                                 meta={"item": deepcopy(item)})
        # pagination: follow the next-page link once per response
        next_url = response.xpath("//a[text()='>']/@href").get()
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)
Example #12
 def parse(self, response):
     tr_list = response.xpath(
         "//div[@class='newsHead clearfix']/table[2]/tr")
     for tr in tr_list:
         item = YangguangItem()
         item['title'] = tr.xpath("./td[3]/a[1]/@title").extract_first()
         item['href'] = tr.xpath('./td[3]/a[1]/@href').extract_first()
         item['update_time'] = tr.xpath(
             './td[last()]/text()').extract_first()
         yield scrapy.Request(
             item['href'],
             callback=self.parse_detail,  # handler for the detail page
             meta={"item": item})
     # build the next-page url
     next_url = response.xpath("//a[text()='>']/@href").extract_first()
     if next_url is not None:
         yield scrapy.Request(next_url, callback=self.parse)
Example #13
 def parse(self, response):
     tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
     for tr in tr_list:
         item = YangguangItem()
         item['title'] = tr.xpath("./td[2]/a[@class='news14']/@title").extract_first()
         item['href'] = tr.xpath("./td[2]/a[@class='news14']/@href").extract_first()
         item['publish_date'] = tr.xpath("./td[last()]/text()").extract_first()
         yield scrapy.Request(
             item["href"],
             callback=self.parse_detail,
             meta={'item': item}
         )
     next_url = response.xpath("//a[text()='>']/@href").extract_first()
     if next_url is not None:
         yield scrapy.Request(next_url, callback=self.parse)
Example #14
    def parse(self, response):
        # self.hello is assigned by open_spider in pipelines.py (or when the spider is created), so it can be used here
        # print(self.hello, '*' * 100)
        li_list = response.css('body > div.public-content > div.width-12 > ul.title-state-ul > li')
        for li in li_list:
            item = YangguangItem()
            item['id'] = li.css('.state1::text').extract_first()
            item['title'] = li.css('.state3 .color-hover::text').extract_first()
            item["href"] = 'http://wz.sun0769.com' + li.css('.state3 .color-hover::attr(href)').extract_first()
            item['create_time'] = li.css('.state5::text').extract_first()
            # follow the extracted link; the callback handles it and meta carries the item
            yield scrapy.Request(url=item["href"], callback=self.parse_detail, meta={"item": item})

        # pagination
        next_url = response.css('.prov_rota::attr(href)').extract_first()
        if next_url is not None:
            next_url = 'http://wz.sun0769.com' + next_url
            print('next page: ' + next_url)
            yield scrapy.Request(url=next_url, callback=self.parse)
Example #15
 def parse(self, response):
     tr_list = response.xpath(
         "//div[@class='greyframe']/table[2]/tr/td/table/tr")
     for tr in tr_list:
         item = YangguangItem()
         item["number"] = tr.xpath("./td[1]/text()").extract_first()
         item["title"] = tr.xpath(
             "./td[2]/a[@class='news14']/text()").extract_first()
         item["href"] = tr.xpath(
             "./td[2]/a[@class='news14']/@href").extract_first()
         item["name"] = tr.xpath("./td[4]/text()").extract_first()
         item["public_time"] = tr.xpath("./td[5]/text()").extract_first()
         yield scrapy.Request(item["href"],
                              callback=self.parse_content,
                              meta={"item": item})
     next_url = response.xpath("//a[text()='>']/@href").extract_first()
     if next_url is not None:
         yield scrapy.Request(next_url, callback=self.parse)
Example #16
    def parse_item(self, response):
        item = YangguangItem()
        # title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract_first()
        # number: the last colon-separated token of the title
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # text content: first try the layout used when the post has images
        content = response.xpath('//div[@class="contentext"]/text()').extract_first()
        # if nothing matched, fall back to the layout without images
        if not content:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract_first()
        item['content'] = content
        # url
        item['url'] = response.url

        yield item
Example #17
 def parse(self, response):
     tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
     for tr in tr_list:
         item = YangguangItem()
         item['num'] = tr.xpath("./td[1]/text()").extract_first()
         item['title'] = tr.xpath("./td[2]/a[2]/text()").extract_first()
         item['href'] = tr.xpath("./td[2]/a[2]/@href").extract_first()
         item['status'] = tr.xpath("./td[3]/span/text()").extract_first()
         item['author'] = tr.xpath("./td[4]/text()").extract_first()
         item['publish_date'] = tr.xpath("./td[5]/text()").extract_first()
         # yield item
         yield scrapy.Request(
             item['href'],
             callback=self.parse_detail,
             meta={'item': item}
         )
     next_url = response.xpath("//a[text()='>']/@href").extract_first()
     if next_url is not None:
         yield scrapy.Request(next_url, callback=self.parse)
Example #18
 def parse(self, response):
     # group the rows
     tr_list = response.xpath(
         "//div[@class='greyframe']/table[2]/tr/td/table/tr")
     for tr in tr_list:
         item = YangguangItem()
         item["title"] = tr.xpath(
             "./td/a[@class='news14']/@title").extract_first()
         item["href"] = tr.xpath(
             "./td/a[@class='news14']/@href").extract_first()
         item["publish_date"] = tr.xpath(
             "./td[@class='t12wh']/text()").extract_first()
         yield scrapy.Request(item["href"],
                              callback=self.parse_detail,
                              meta={"item": item})
     # pagination
     next_url = response.xpath("//a[text()='>']/@href").extract_first()
     if next_url is not None:
         yield scrapy.Request(next_url, callback=self.parse)
Example #19
    def parse(self, response):
        tr_list = response.xpath('//div[@class="greyframe"]/table[2]/tbody/tr/td/table/tr')
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath('./td[2]/a[@class="news14"]/@title').extract_first()
            item["href"] = tr.xpath('./td[2]/a[@class="news14"]/@href').extract_first()
            item["publish_date"] = tr.xpath('//div[@class= "greyframe"]/table[2]/tbody/tr/td/table/tbody/tr/td[@class="t12wh"]/text()').extract_first()

            yield scrapy.Request(
                item["href"],
                callback=self.parse_detail,
                meta={"item":item}
            )

        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )
Example #20
    def parse(self, response):
        li_list = response.xpath(
            '//div[@class="width-12"]/ul[@class="title-state-ul"]/li')

        for li in li_list:
            item = YangguangItem()
            item['title'] = li.xpath(
                './span[@class="state3"]/a/text()').extract_first()
            item['href'] = 'http://wz.sun0769.com' + li.xpath(
                './span[@class="state3"]/a/@href').extract_first()

            yield scrapy.Request(url=item['href'],
                                 callback=self.parse_detail,
                                 meta={'item': item})

        next_url = response.xpath(
            '//div[@class="mr-three paging-box"]/a[2]/@href').extract_first()
        # print(next_url)
        if next_url is not None:
            yield scrapy.Request(url='http://wz.sun0769.com' + next_url,
                                 callback=self.parse)
Example #21
    def parse(self, response):
        html = response.xpath("//td[@valign='top']//tr")
        # print(html)

        for html_list in html:
            # item = {}
            item = YangguangItem()
            item["number"] = html_list.xpath(".//td[1]/text()").extract_first()
            item["href"] = html_list.xpath(
                ".//td[2]/a[2]/@href").extract_first()
            item["title"] = html_list.xpath(
                ".//td[2]/a[2]/@title").extract_first()
            item["time"] = html_list.xpath(
                ".//td[last()]/text()").extract_first()
            # print(item)
            yield scrapy.Request(item["href"],
                                 callback=self.details,
                                 meta={"item": item})

        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        print(next_url)
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)
Example #22
    def parse(self, response):
        # self.settings["MONGO_HOST"]
        # self.settings.get("MONGO_HOST","")
        # print(self.hello,"*"*100)
        # group the rows
        tr_list = response.xpath(
            "//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath(
                "./td[2]/a[@class='news14']/@title").extract_first()
            item["href"] = tr.xpath(
                "./td[2]/a[@class='news14']/@href").extract_first()
            item["publish_date"] = tr.xpath(
                "./td[last()]/text()").extract_first()

            yield scrapy.Request(item["href"],
                                 callback=self.parse_detail,
                                 meta={"item": item})
        # pagination
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)
Example #23
 def parse(self, response):
     li_list = response.xpath('//li[@class="clear"]')
     for li in li_list:
         item = YangguangItem()
         item['id'] = li.xpath('./span[@class="state1"]/text()').get()
         item['status'] = li.xpath(
             './span[@class="state2"]/text()').get().strip()
         item['title'] = li.xpath('./span[@class="state3"]/a/text()').get()
         item['href'] = BASE_URL + li.xpath(
             './span[@class="state3"]/a/@href').get()
         item['response_time'] = re.findall(
             '等待处理:(.*)',  # the state4 span reads "等待处理:<elapsed time>"
             li.xpath('./span[@class="state4"]/text()').get().strip())[0]
         item['publish_time'] = li.xpath(
             './span[@class="state5"]/text()').get()
         yield scrapy.Request(item['href'],
                              callback=self.parse_detail,
                              meta={'item': item})
     # pagination
     next_url = response.xpath(
         '//div[@class="mr-three paging-box"]/a[2]/@href').get()
     if next_url is not None:
         yield scrapy.Request(BASE_URL + next_url, callback=self.parse)
Example #24
    def parse(self, response):
        print('spider start')
        # group the rows
        try:
            li_list = response.css('li.clear')
            for li in li_list:
                item = YangguangItem()
                item['id'] = li.css('span.state1::text').get()
                item['state'] = li.css('span.state2::text').get()
                item['title'] = li.xpath(
                    "span[@class='state3']/a[@class='color-hover']/text()"
                ).get()
                item['href'] = li.xpath(
                    "span[@class='state3']/a[@class='color-hover']/@href").get(
                    )
                item['href'] = 'http://wz.sun0769.com' + item['href']
                # item['href'] = ['http://wz.sun0769.com'+ i for i in item['href']]
                item['sleepTime'] = li.css('span.state4::text').get()
                item['time'] = li.css('span.state5::text').get()
                # print(item)
                yield scrapy.Request(item['href'],
                                     callback=self.parseDetail,
                                     meta={"item": item})  # 处理详情页面

            # pagination
            next_url = response.xpath(
                "//div[@class='mr-three paging-box']/a/@href").get()
            if next_url is not None:
                next_url = 'http://wz.sun0769.com' + next_url
                print(next_url)
                yield scrapy.Request(next_url, callback=self.parse)
            # next_url = response.xpath("//a[text()='>']/@href")

        except TypeError:
            print('spider finished')
Example #25
    def parse(self, response):  # parse the start_urls response
        # host = self.settings["MONGO_HOST"]  # option 1
        # host = self.settings.get("MONGO_HOST")  # option 2, preferred
        tr_list = response.xpath(
            "//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath(
                "./td[2]/a[@class='news14']/@title").extract_first()
            item["href"] = tr.xpath(
                "./td[2]/a[@class='news14']/@href").extract_first()
            item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
            print(item)

            # yield a request for the detail page, passing the item along in meta
            yield scrapy.Request(item["href"],
                                 callback=self.parse_detail,
                                 meta={"item": item})

        # find the next-page url
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(next_url,
                                 callback=self.parse)  # hand the next page back to parse
Example #26
    def parse(self, response):

        tr_list = response.xpath(
            "//div[@class='newsHead clearfix']/table[2]/tr")

        for tr in tr_list:
            item = YangguangItem()
            item['title'] = tr.xpath("./td[3]/a[1]/@title").extract_first()
            item['href'] = tr.xpath("./td[3]/a[1]/@href").extract_first()
            item['publish_date'] = tr.xpath(
                "./td[last()]/text()").extract_first()

            yield scrapy.Request(item['href'],
                                 callback=self.parse_dateil,
                                 meta={'item': item})
        # pagination
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        # print(next_url)
        if next_url is not None:
            yield scrapy.Request(
                next_url,
                callback=self.parse,
                dont_filter=True,  # bypass the duplicate-request filter
            )
Example #27
File: yg.py Project: BrandonSong/spider
    def parse(self, response):

        tr_list = response.xpath(
            "//div[@class='greyframe']/table[2]/tr//table/tr")

        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath("./td[2]/a[2]/text()").extract_first()
            item["status"] = tr.xpath("./td[3]/span/text()").extract_first()
            item["name"] = tr.xpath("./td[4]/text()").extract_first()
            item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()

            # url of each question's detail page
            detail_url = tr.xpath(
                "./td[2]/a[@class='news14']/@href").extract_first()

            # request the detail page
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={"item": item})

        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)