Python LadItem示例，lad.items.LadItem Python示例

示例#1

0

显示文件

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``newsType``, ``title``, ``time`` and ``text``. Body text is
        appended to ``self.text``, copied into the item, and ``self.text`` is
        then reset to an empty string.
        """
        news = LadItem()

        news["newsType"] = '警事要闻'
        news["title"] = response.xpath(
            '/html/body/div/div/div[2]/div[3]/div/div/div[2]/text()'
        ).extract_first()

        # The time is the second '|'-separated field of this text node.
        info_line = response.xpath(
            '/html/body/div/div/div[2]/div[3]/div/div/div[3]/div[2]/text()'
        ).extract_first()
        news["time"] = info_line.split('|')[1].strip()

        # Candidate body locations, tried in order until one matches.
        body_queries = (
            '/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/div/div/p/font',
            '/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/div/p',
            '/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/div/span',
            '/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/div',
            '/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/p/span',
            '/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/span',
        )
        for query in body_queries:
            text_list = response.xpath(query)
            if len(text_list) != 0:
                break

        # Only the first direct text node of each matched node is used;
        # nodes without one contribute nothing.
        for node in text_list:
            fragment = node.xpath('text()').extract_first()
            if fragment is not None:
                self.text = self.text + fragment
        news["text"] = self.text
        self.text = ""

        yield news

示例#2

0

显示文件

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``city``, ``newsType``, ``title``, ``time`` and ``text``. Body
        text is appended to ``self.text``, copied into the item, and
        ``self.text`` is then reset to an empty string.

        Raises:
            IndexError: if the title node is missing or the time text has no
                full-width colon.
            AttributeError: if the time node is missing.
        """
        item = LadItem()

        item["city"] = "湖南"
        item["newsType"] = "警事要闻"
        item["title"] = response.xpath(
            '/html/body/div/div[1]/div[4]/div[1]/h4/text()').extract()[0]
        # Keep the value as text and split on the full-width colon as text:
        # encoding to bytes first and then splitting with a str separator
        # raises TypeError on Python 3.
        item["time"] = response.xpath(
            '/html/body/div/div[1]/div[4]/div[1]/div/p[2]/text()'
        ).extract_first().split(u'：')[1]

        # Candidate body locations, tried in order until one matches.
        body_queries = (
            '//*[@id="txtContent"]/div/div/div/p',
            '//*[@id="txtContent"]/div/p/span',
            '//*[@id="content"]/div/div/p',
            '//*[@id="txtContent"]/div/p',
            '//*[@id="artibody"]/p',
            '//*[@id="txtContent"]/div/div/p',
            '//*[@id="txtContent"]/p',
        )
        for query in body_queries:
            text_list = response.xpath(query)
            if len(text_list) != 0:
                break

        # Only the first direct text node of each matched node is used.
        for str_slt in text_list:
            fragment = str_slt.xpath('text()').extract_first()
            if fragment is not None:
                self.text = self.text + fragment
        item["text"] = self.text
        self.text = ""

        yield item

示例#3

0

显示文件

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``city``, ``newsType``, ``title``, ``time`` (a fixed string) and
        ``text``. Body text is appended to ``self.text``, copied into the
        item, and ``self.text`` is then reset to an empty string.
        """
        record = LadItem()

        record["city"] = "海南"
        record["newsType"] = "警事要闻"
        record["title"] = response.xpath(
            '//*[@id="artibody"]/table/tr[1]/td/font/text()'
        ).extract_first()
        record["time"] = '2015-9-24'

        cells = response.xpath('//*[@id="artibody"]/table/tr[4]/td')

        # Only the first direct text node of each matched cell is used;
        # cells without one contribute nothing.
        for cell in cells:
            piece = cell.xpath('text()').extract_first()
            if piece is not None:
                self.text = self.text + piece
        record["text"] = self.text
        self.text = ""

        yield record

示例#4

0

显示文件

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``city``, ``newsType``, ``title``, ``time`` and ``text``. The
        time is read from a primary location and, when that yields nothing,
        from an alternative one. Body text is appended to ``self.text``,
        copied into the item, and ``self.text`` is then reset to an empty
        string.

        Raises:
            AttributeError: if neither time location yields a text node.
        """
        item = LadItem()

        item["city"] = "西藏"
        item['newsType'] = '警事要闻'
        item["title"] = response.xpath(
            '//*[@id="container"]/div[2]/div/div/div[2]/div[1]/h1/text()'
        ).extract_first()

        # xpath() always returns a selector list, never None, so the fallback
        # has to be chosen from the extracted value rather than the query.
        time_text = response.xpath(
            '//*[@id="container"]/div[2]/div/div/div[2]/div[2]/span[2]/text()'
        ).extract_first()
        if time_text is None:
            time_text = response.xpath(
                '//*[@id="container"]/div[2]/div/div/div/div[2]/div[2]/span[2]/text()'
            ).extract_first()
        # Characters 5..14 of the first space-separated field.
        item["time"] = time_text.split(' ')[0][5:15]

        text_list = response.xpath(
            '//*[@id="container"]/div[2]/div/div/div[3]/p/span')

        # Only the first direct text node of each matched node is used.
        for str_slt in text_list:
            fragment = str_slt.xpath('text()').extract_first()
            if fragment is not None:
                self.text = self.text + fragment
        item["text"] = self.text
        self.text = ""

        yield item

示例#5

0

显示文件

文件： xinjiang_spider.py 项目： nicholaskh/lad-crawler

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``city``, ``newsType``, ``title``, ``time`` and ``text``. When
        the primary header yields no date field, both ``time`` and ``title``
        are read from the ``#right`` layout instead. ``self.text`` is set to
        the body text, copied into the item, and then reset to an empty
        string.

        Raises:
            AttributeError: if the primary header is empty and the ``#right``
                time node is missing as well.
        """
        item = LadItem()

        item["city"] = "新疆"
        item['newsType'] = '警事要闻'
        item["title"] = response.xpath(
            '/html/body/div[1]/div[2]/div[2]/div/div/div[1]/text()'
        ).extract_first()

        # A missing header node is treated like an empty one so that the
        # alternative layout below is reached instead of crashing on None.
        header = response.xpath(
            '/html/body/div[1]/div[2]/div[2]/div/div/div[2]/text()'
        ).extract_first()
        date_field = (header or '').strip().split(' ')[0]

        if date_field:
            # Keep digits only, then format the first eight as YYYY-MM-DD.
            digits = re.sub(r"\D", "", date_field)
            item["time"] = digits[0:4] + '-' + digits[4:6] + '-' + digits[6:8]
        else:
            item["time"] = response.xpath(
                '//*[@id="right"]/div[1]/div[1]/div[1]/text()').extract_first(
                ).strip().split(' ')[0]
            item["title"] = response.xpath(
                '//*[@id="right"]/div[1]/div[1]/h3/text()').extract_first()

        # Candidate body locations, tried in order until one yields more
        # than one node.
        body_queries = (
            '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/p/span',
            '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/p/font',
            '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/p',
            '//*[@id="right"]/div[1]/div[2]/p',
            '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/table/tbody/tr/td/p/span',
            '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/span',
        )
        for query in body_queries:
            text_list = response.xpath(query)
            if len(text_list) > 1:
                break

        if len(text_list) > 1:
            self.text = processText(text_list)
        else:
            # Nothing matched with several nodes: use the first text node of
            # the single span (may be None when absent).
            self.text = response.xpath(
                '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/span/text()'
            ).extract_first()
        item["text"] = self.text
        self.text = ""

        yield item

示例#6

0

显示文件

    def parse(self, response):
        """Parse a listing page; yield the next-page and article requests.

        Rows are read in page order. Each row's date is parsed and passed to
        ``self.update_last_time``; iteration stops at the first row whose
        date cannot be processed, or whose date is not newer than
        ``self.last_time`` (which also disables pagination). Every accepted
        row produces a request to ``self.parse_info`` carrying a ``LadItem``
        with its ``time`` in ``meta['item']``.
        """
        should_deep = True

        times = response.xpath(
            '/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/table/tr/td/div/text()'
        ).extract()
        urls = response.xpath(
            '/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/table/tr/td/div/a/@href'
        ).extract()
        valid_child_urls = []

        for time_text, url in zip(times, urls):
            try:
                time_now = datetime.strptime(time_text, '%Y-%m-%d')
                self.update_last_time(time_now)
            except Exception:
                # Best effort: stop at the first row that cannot be handled,
                # without swallowing KeyboardInterrupt/SystemExit.
                break

            if self.last_time is not None and self.last_time >= time_now:
                should_deep = False
                break

            valid_child_urls.append("http://www.njga.gov.cn/www/njga/2010/" +
                                    url)

        if should_deep:
            # There are new URLs, so move on to the next listing page.
            if len(response.url) == 45:
                next_url = 'http://www.njga.gov.cn/www/njga/2010/zabb_p1.htm'
            else:
                # File name looks like "zabb_p<N>.htm"; read the whole number
                # so that pages 10 and above advance correctly.
                page_file = response.url.split('/')[6]
                num = int(page_file[6:].split('.')[0])
                next_url = 'http://www.njga.gov.cn/www/njga/2010/zabb_p' + str(
                    num + 1) + ".htm"
            # Emitted exactly once (it used to be yielded and queued as well).
            yield scrapy.Request(url=next_url, callback=self.parse)

        # valid_child_urls lines up with the leading entries of times.
        for hit_time, temp_url in zip(times, valid_child_urls):
            req = scrapy.Request(url=temp_url, callback=self.parse_info)
            m_item = LadItem()
            m_item['time'] = hit_time
            # Attach the item to the request so the callback can read it.
            req.meta['item'] = m_item
            yield req

示例#7

0

显示文件

文件： jiangsu_spider_new.py 项目： nicholaskh/lad-crawler

    def parse(self, response):
        """Parse a listing page; yield the next-page and article requests.

        Rows are read in page order. Characters 1..10 of each row's time text
        are parsed as a date and passed to ``self.update_last_time``;
        iteration stops at the first row whose date cannot be processed, or
        whose date is not newer than ``self.last_time`` (which also disables
        pagination). Every accepted row produces a request to
        ``self.parse_info`` carrying a ``LadItem`` with its ``time`` in
        ``meta['item']``.
        """
        should_deep = True

        times = response.xpath('//*[@width="200px"]/text()').extract()
        urls = response.xpath(
            '/html/body/div[3]/div/div/div/div[3]/div/table/tbody/tr/td/div/a/@href'
        ).extract()
        valid_child_urls = []

        for time_text, url in zip(times, urls):
            try:
                time_now = datetime.strptime(time_text[1:11], '%Y-%m-%d')
                self.update_last_time(time_now)
            except Exception:
                # Best effort: stop at the first row that cannot be handled,
                # without swallowing KeyboardInterrupt/SystemExit.
                break

            if self.last_time is not None and self.last_time >= time_now:
                should_deep = False
                break

            valid_child_urls.append("http://www.jsga.gov.cn" + url)

        if should_deep:
            # There are new URLs, so move on to the next listing page.
            if len(response.url) == 43:
                next_url = "http://www.jsga.gov.cn/jwzx/aqff/index_2.html"
            else:
                # URL ends in "index_<N>.html"; read the whole number so that
                # pages 10 and above advance correctly.
                page_suffix = response.url.split('index')[1]
                num = int(page_suffix[1:].split('.')[0])
                next_url_part = "index_" + str(num + 1) + ".html"
                next_url = "http://www.jsga.gov.cn/jwzx/aqff/" + next_url_part
            # Emitted exactly once (it used to be yielded and queued as well).
            yield scrapy.Request(url=next_url, callback=self.parse)

        # valid_child_urls lines up with the leading entries of times.
        for hit_time, temp_url in zip(times, valid_child_urls):
            req = scrapy.Request(url=temp_url, callback=self.parse_info)
            m_item = LadItem()
            m_item['time'] = hit_time
            # Attach the item to the request so the callback can read it.
            req.meta['item'] = m_item
            yield req

示例#8

0

显示文件

文件： kunming_spider_new.py 项目： nicholaskh/lad-crawler

    def parse(self, response):
        """Parse a listing page; yield the next-page and article requests.

        Rows are read in page order. Characters 1..10 of each row's time text
        are parsed as a date and passed to ``self.update_last_time``;
        iteration stops at the first row whose date cannot be processed, or
        whose date is not newer than ``self.last_time`` (which also disables
        pagination). Every accepted row produces a request to
        ``self.parse_info`` carrying a ``LadItem`` with its ``time`` in
        ``meta['item']``.
        """
        should_deep = True

        times = response.xpath(
            '//*[@class="lists"]/ul/li/span/text()').extract()
        urls = response.xpath('//*[@class="lists"]/ul/li/a/@href').extract()

        valid_child_urls = []

        for time_text, url in zip(times, urls):
            try:
                time_now = datetime.strptime(time_text[1:11], '%Y-%m-%d')
                self.update_last_time(time_now)
            except Exception:
                # Best effort: stop at the first row that cannot be handled,
                # without swallowing KeyboardInterrupt/SystemExit.
                break

            if self.last_time is not None and self.last_time >= time_now:
                should_deep = False
                break
            valid_child_urls.append('http://gaj.km.gov.cn' + url)

        if should_deep:
            # There are new URLs, so move on to the next listing page.
            if len(response.url) == 31:
                next_url = "http://gaj.km.gov.cn/zxdt/jwdt/index_2.shtml"
            else:
                # File name looks like "index_<N>.shtml"; read the whole
                # number so that pages 10 and above advance correctly.
                part_str = response.url.split('/')[5]
                num = int(part_str[6:].split('.')[0])
                next_url_part = "index_" + str(num + 1) + ".shtml"
                next_url = response.url.split('index')[0] + next_url_part
            yield scrapy.Request(url=next_url, callback=self.parse)

        # valid_child_urls lines up with the leading entries of times.
        for hit_time, temp_url in zip(times, valid_child_urls):
            req = scrapy.Request(url=temp_url, callback=self.parse_info)
            m_item = LadItem()
            m_item['time'] = hit_time
            # Attach the item to the request so the callback can read it.
            req.meta['item'] = m_item
            yield req

示例#9

0

显示文件

文件： beijing_spider_new.py 项目： nicholaskh/lad-crawler

    def parse(self, response):
        """Parse a listing page; yield the next-page and article requests.

        The first extracted time entry is dropped before pairing times with
        links. Characters 3..12 of each remaining time text are parsed as a
        date and passed to ``self.update_last_time``; iteration stops at the
        first row whose date cannot be processed, or whose date is not newer
        than ``self.last_time`` (which also disables pagination). Every
        accepted row produces a request to ``self.parse_info`` carrying a
        ``LadItem`` with its ``time`` in ``meta['item']``.
        """
        should_deep = True

        times = response.xpath('//*[@id="yun1"]/tr/td/text()[2]').extract()[1:]
        urls = response.xpath('//*[@id="yun1"]/tr/td/a/@href').extract()

        valid_child_urls = []

        for time_text, url in zip(times, urls):
            try:
                time_now = datetime.strptime(time_text[3:13], '%Y-%m-%d')
                self.update_last_time(time_now)
            except Exception:
                # Best effort: stop at the first row that cannot be handled,
                # without swallowing KeyboardInterrupt/SystemExit.
                break

            if self.last_time is not None and self.last_time >= time_now:
                should_deep = False
                break

            valid_child_urls.append("http://www.bjgaj.gov.cn" + url)

        if should_deep:
            # There are new URLs, so move on to the next listing page.
            if len(response.url) <= 42:
                next_url = 'http://www.bjgaj.gov.cn/web/listPage_allJfts_col1167_30_2.html'
            else:
                # The page number starts at offset 56 and runs up to the
                # extension; read all of it so pages 10 and above advance.
                num = int(response.url[56:].split('.')[0])
                next_url = response.url[0:56] + str(num + 1) + ".html"
            yield scrapy.Request(url=next_url, callback=self.parse)

        # valid_child_urls lines up with the leading entries of times.
        for hit_time, temp_url in zip(times, valid_child_urls):
            req = scrapy.Request(url=temp_url, callback=self.parse_info)
            m_item = LadItem()
            m_item['time'] = hit_time
            # Attach the item to the request so the callback can read it.
            req.meta['item'] = m_item
            yield req

示例#10

0

显示文件

文件： chongqing_spider.py 项目： nicholaskh/lad-crawler

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``city``, ``newsType``, ``title``, ``time`` and ``text``. Body
        nodes are looked up at several locations in turn. One of them selects
        text nodes directly; ``self.flag`` is set to 1 for that case so the
        nodes are extracted as-is, and is reset to 0 afterwards. Body text is
        appended to ``self.text``, copied into the item, and ``self.text`` is
        then reset to an empty string.

        Raises:
            AttributeError: if the title or time node is missing.
        """
        item = LadItem()

        item["city"] = "重庆"
        item["newsType"] = "警事要闻"
        item["title"] = response.xpath(
            '/html/body/table[4]/tr/td/table[2]/tr/td/text()').extract_first(
            ).strip()
        item["time"] = response.xpath(
            '/html/body/table[4]/tr/td/table[4]/tr/td/text()[1]'
        ).extract_first().strip()[10:21]

        text_list = response.xpath(
            '//*[@id="Zoom"]/articlepagebegin/div/div/p/span')
        if len(text_list) == 0:
            text_list = response.xpath(
                '//*[@id="Zoom"]/articlepagebegin/div/div/p')
        if len(text_list) == 0:
            text_list = response.xpath(
                '//*[@id="Zoom"]/articlepagebegin/p/span')
        if len(text_list) == 0:
            text_list = response.xpath(
                '//*[@id="Zoom"]/articlepagebegin/p/font/text()')
            # Raise the flag only when this query matched: these are text
            # nodes, whereas the next fallback selects elements again.
            if len(text_list) != 0:
                self.flag = 1
        if len(text_list) == 0:
            text_list = response.xpath('//*[@id="Zoom"]/articlepagebegin/p')

        if self.flag == 1:
            # Text nodes: take each one as-is.
            for str_slt in text_list:
                fragment = str_slt.extract()
                if fragment is not None:
                    self.text = self.text + fragment
            self.flag = 0
        else:
            # Elements: take the first direct text node of each.
            for str_slt in text_list:
                fragment = str_slt.xpath('text()').extract_first()
                if fragment is not None:
                    self.text = self.text + fragment
        item["text"] = self.text
        self.text = ""

        yield item

示例#11

0

显示文件

文件： shandong_spider.py 项目： nicholaskh/lad-crawler

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``city``, ``newsType``, ``title``, ``time`` and ``text``. Body
        text is appended to ``self.text``, copied into the item, and
        ``self.text`` is then reset to an empty string.
        """
        entry = LadItem()

        entry["city"] = "南京"
        entry["newsType"] = "治安播报"
        entry["title"] = response.xpath(
            '/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/center/table[2]/tr[1]/td/div/strong/text()'
        ).extract_first().strip()

        # Text before the first ']' of the info line, trimmed.
        stamp = response.xpath(
            '/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/center/table[2]/tr[2]/td/div/text()[1]'
        ).extract_first().strip().split(']')[0].strip()
        time_leng = len(stamp)
        entry["time"] = stamp[time_leng - 10:time_leng]

        text_nodes = response.xpath(
            '/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/center/table[3]/tr/td/div/div/div/span/text()'
        )

        # Concatenate every extracted text node, skipping None values.
        for node in text_nodes:
            chunk = node.extract()
            if chunk is not None:
                self.text = self.text + chunk
        entry["text"] = self.text
        self.text = ""

        yield entry

示例#12

0

显示文件

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``city``, ``newsType``, ``title``, ``time`` and ``text``. Body
        text is appended to ``self.text``, copied into the item, and
        ``self.text`` is then reset to an empty string.
        """
        entry = LadItem()

        entry["city"] = "辽宁"
        entry["newsType"] = "警事要闻"

        raw_title = response.xpath(
            '//*[@id="activity-name"]/text()').extract_first()
        entry["title"] = raw_title.strip()
        entry["time"] = response.xpath(
            '//*[@id="post-date"]/text()').extract_first()

        text_nodes = response.xpath('//*[@id="js_content"]/p/span/text()')

        # Concatenate every extracted text node, skipping None values.
        for node in text_nodes:
            chunk = node.extract()
            if chunk is not None:
                self.text = self.text + chunk
        entry["text"] = self.text
        self.text = ""

        yield entry

示例#13

0

显示文件

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``city``, ``newsType``, ``title``, ``time`` and ``text``. Body
        text is appended to ``self.text``, copied into the item, and
        ``self.text`` is then reset to an empty string.

        Raises:
            AttributeError: if the time node is missing.
        """
        item = LadItem()

        item["city"] = "浙江"
        item["newsType"] = "警事要闻"
        item["title"] = response.xpath(
            '/html/body/table[6]/tr/td/table[2]/tr/td/table[2]/tr/td/text()'
        ).extract_first()

        time_text = response.xpath(
            '/html/body/table[6]/tr/td/table[2]/tr/td/table[3]/tr/td/table[2]/tr/td/text()'
        ).extract_first().strip()
        time_leng = len(time_text)
        item["time"] = time_text[time_leng - 10:time_leng]

        text_list = response.xpath(
            '/html/body/table[6]/tr/td/table[2]/tr/td/table[3]/tr/td/table[6]/tr/td[1]/div/p/font'
        )
        if len(text_list) == 0:
            text_list = response.xpath(
                '/html/body/table[6]/tr/td/table[2]/tr/td/table[3]/tr/td/table[6]/tr/td[1]/div/p'
            )

        # One loop covers every match count. A separate single-node branch
        # followed by this loop would append that node's text twice.
        for str_slt in text_list:
            fragment = str_slt.xpath('text()').extract_first()
            if fragment is not None:
                self.text = self.text + fragment
        item["text"] = self.text
        self.text = ""

        yield item

示例#14

0

显示文件

    def parse(self, response):
        """Parse a listing page and yield one request per new article.

        Rows are read in page order. Each row's date is parsed and passed to
        ``self.update_last_time``; iteration stops at the first row whose
        date cannot be processed, or whose date is not newer than
        ``self.last_time``. Every accepted row produces a request to
        ``self.parse_info`` carrying a ``LadItem`` with its ``time`` in
        ``meta['item']``. No next-page request is issued here.
        """
        times = response.xpath(
            '//*[@class="article_list"]/li/span/text()').extract()
        urls = response.xpath(
            '//*[@class="article_list"]/li/a/@href').extract()

        valid_child_urls = []

        for time_text, url in zip(times, urls):
            try:
                time_now = datetime.strptime(time_text, '%Y-%m-%d')
                self.update_last_time(time_now)
            except Exception:
                # Best effort: stop at the first row that cannot be handled,
                # without swallowing KeyboardInterrupt/SystemExit.
                break

            if self.last_time is not None and self.last_time >= time_now:
                break

            valid_child_urls.append("http://www.qhga.gov.cn" + url)

        # valid_child_urls lines up with the leading entries of times.
        for hit_time, temp_url in zip(times, valid_child_urls):
            req = scrapy.Request(url=temp_url, callback=self.parse_info)
            m_item = LadItem()
            m_item['time'] = hit_time
            # Attach the item to the request so the callback can read it.
            req.meta['item'] = m_item
            yield req

示例#15

0

显示文件

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``city``, ``newsType``, ``title``, ``time`` and ``text``. Body
        text is appended to ``self.text``, copied into the item, and
        ``self.text`` is then reset to an empty string.
        """
        article = LadItem()

        article["city"] = "北京"
        article["newsType"] = '警事要闻'
        article["title"] = response.xpath(
            '/html/body/table[3]/tr/td/table[2]/tr/td[3]/table/tr/td/table/tr[2]/td/table/tr[1]/td/font/b/text()'
        ).extract_first()

        # The time is whatever follows the site name in the info line.
        info_line = response.xpath(
            '/html/body/table[3]/tr/td/table[2]/tr/td[3]/table/tr/td/table/tr[2]/td/table/tr[2]/td/text()'
        ).extract_first()
        article["time"] = info_line.split('www.bjgaj.gov.cn')[1].strip()

        paragraphs = response.xpath('//*[@id="articleContent"]/p')

        # Only the first direct text node of each paragraph is used.
        for paragraph in paragraphs:
            chunk = paragraph.xpath('text()').extract_first()
            if chunk is not None:
                self.text = self.text + chunk
        article["text"] = self.text
        self.text = ""

        yield article

示例#16

0

显示文件

文件： haikou_spider.py 项目： nicholaskh/lad-crawler

    def parse_info(self, response):
        """Build a ``LadItem`` from an article page and yield it.

        Fills ``newsType``, ``title``, ``time`` and ``text``. Only the first
        text node found under the matched paragraphs is appended to
        ``self.text``, which is copied into the item and then reset to an
        empty string.
        """
        article = LadItem()

        article["newsType"] = '警事要闻'
        article["title"] = response.xpath(
            '/html/body/table[4]/tr/td/table/tr[2]/td/table/tr/td/table/tr[1]/td/div/span/text()'
        ).extract_first()

        raw_time = response.xpath(
            '/html/body/table[4]/tr/td/table/tr[2]/td/table/tr/td/table/tr[4]/td/div/text()[2]'
        ).extract_first()
        article["time"] = raw_time.strip()

        paragraphs = response.xpath(
            '/html/body/table[4]/tr/td/table/tr[2]/td/table/tr/td/table/tr[4]/td/table[1]/tr[2]/td/span/div/div/div/p'
        )

        first_text = paragraphs.xpath('text()').extract_first()
        if first_text is not None:
            self.text = self.text + first_text
        article["text"] = self.text
        self.text = ""

        yield article