예제 #1
0
    def parse(self, response):
        """Parse a Hainan provincial government (hainan.gov.cn) article page.

        Extracts title, author, publish time and body text, then yields a
        GovAllItem only when the body exceeds 50 characters and the publish
        time falls within the last one-to-three hours (per spiderUtil).
        Field-level extraction failures are reported via
        spiderUtil.log_level with a per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title = response.xpath(
                    "//div[@class='title_cen mar-t2 text']/ucaptitle/text()"
                ).extract()[0].strip()
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath("//span[@id='ly']/text()").extract()
                if not author:
                    # Fall back to the publishing agency when no byline exists.
                    author = "海南省人民政府"
                else:
                    author = author[0]
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                # Page carries time to minute precision; append seconds.
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    text).group(0) + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@id='zoom']/div[@id='font']/ucapcontent/p/text()"
                ).extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)

            source = "http://www.hainan.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item to the pipelines for persistence.
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
예제 #2
0
    def parse(self, response):
        """Parse a Fujian provincial government (fujian.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title_arr = response.xpath(
                    """//div[@class='xl-nr clearflx']//h3/text()""").extract()
                title = "".join(title_arr).strip()
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    """//div[@class='xl-nr clearflx']//h5/span/text()"""
                ).extract()
                # The byline is rendered as "[agency]"; strip the brackets.
                author = "".join(author_arr).replace("[",
                                                     "").replace("]",
                                                                 "").strip()
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                content_time = response.xpath(
                    """//div[@class='xl-nr clearflx']//h5/text()""").extract()
                # Drop the "字号:" (font size) label and separator, keep the
                # timestamp, then append seconds.
                public_time = "".join(content_time).replace("字号:", "").replace(
                    "|", "").strip()
                public_time = str(public_time) + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)

            source = "http://www.fujian.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='TRS_Editor']//text()""").extract()
                content = "".join(content_detail).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
예제 #3
0
    def parse(self, response):
        """Parse a Guangxi government (gxzf.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title_arr = response.xpath(
                    "//div[@class='article']/h1/text()").extract()
                title = "".join(title_arr).strip()
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    "//div[@class='article-inf-left']/text()").extract()
                author = "".join(author_arr)
                if not author:
                    # Fall back to the publishing agency when no byline exists.
                    author = "广西壮族自治区人民政府"
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    text).group(0)
            except Exception:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='article-con']/p/text()").extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)

            source = "http://www.gxzf.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item to the pipelines for persistence.
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
예제 #4
0
    def parse(self, response):
        """Parse a Chongqing government (cq.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title_arr = response.xpath(
                    "//h2[@class='title']/text()").extract()
                title = "".join(title_arr).strip()
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)
            try:
                author_arr = response.xpath(
                    "//span[@class='fl']/span/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    # Fall back to the publishing agency when no byline exists.
                    author = "重庆市人民政府"
                else:
                    # Byline reads "来源:<agency>"; keep the agency part.
                    author = author.split("来源:")[1]
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                # Page carries time to minute precision; append seconds.
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    text).group(0) + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='conTxt']")[0].xpath('string(.)').extract()
                # Drop the trailing "终审 :" (final review) editorial footer.
                content = "".join(content_arr).split("终审 :")[0].strip()
            except Exception:
                spiderUtil.log_level(7, response.url)

            source = "http://www.cq.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item to the pipelines for persistence.
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
예제 #5
0
    def parse(self, response):
        """Parse a Guizhou government (guizhou.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title = response.xpath("//h1/text()").extract()[0]
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath(
                    "//head/meta[@name='ContentSource']/@content").extract()
                if not author:
                    # Fall back to the publishing agency when no byline exists.
                    author = "贵州省人民政府"
                else:
                    author = author[0]
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    text).group(0)
            except Exception:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='view TRS_UEDITOR trs_paper_default trs_web']/p/text()"
                ).extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)

            source = "http://www.guizhou.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item to the pipelines for persistence.
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
예제 #6
0
    def parse(self, response):
        """Parse a Tibet government (xizang.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath("//div[@class='xz-xl-tit']/h3/text()"
                                       ).extract()[0].strip()
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath(
                    "//div[@class='xz-xl-info']/p/span/text()").extract()
                if len(author) == 2:
                    # Second span is the source/agency byline.
                    author = author[1]
                else:
                    # Fall back to the publishing agency.
                    author = "西藏自治区人民政府"
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    text).group(0)
            except Exception:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("//div[@class='xz-xl-article']"
                                             )[0].xpath("string(.)").extract()
                # Cut at the "//显示下载附件" (show download attachment) script
                # marker; the [:-106] slice trims a fixed-length page footer —
                # NOTE(review): magic length, breaks if the template changes.
                content = "".join(content_arr).split(
                    "//显示下载附件")[0][:-106].strip()
            except Exception:
                spiderUtil.log_level(7, response.url)

            source = "http://www.xizang.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item to the pipelines for persistence.
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
예제 #7
0
    def parse(self, response):
        """Parse an Anhui government (ah.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                titles = response.xpath(
                    """//div[@class='wztit']//text()""").extract()
                title = "".join(titles).strip()
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    """//div[@class='wzbjxx']/p/text()[3]""").extract()
                # Byline reads "来源:<agency>"; keep the agency part.
                author = "".join(author_arr).replace("来源:", "").strip()
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                # Date and time are in separate text nodes; join and append
                # seconds.
                content_date = response.xpath(
                    """//div[@class='wzbjxx']/p/text()[1]""").extract()[0]
                content_time = response.xpath(
                    """//div[@class='wzbjxx']/p/text()[2]""").extract()[0]
                public_time = str(content_date) + " " + str(
                    content_time) + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)

            source = "http://www.ah.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='wzcon']//text()""").extract()
                content = "".join(content_detail).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            # BUG FIX: Scrapy responses expose .status, not .status_code
            # (the original raised AttributeError on non-200 pages).
            spiderUtil.log_level(response.status, response.url)
예제 #8
0
    def parse(self, response):
        """Parse a Henan government (henan.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code (timestamp failures are intentionally silent).
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                titles = response.xpath(
                    """//*[@id='title']//text()""").extract()
                title = "".join(titles).strip()
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            try:
                authors = response.xpath(
                    """//*[@id='source']//text()""").extract()
                author = "".join(authors).strip()
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                # Page carries time to minute precision; append seconds.
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    text).group(0) + ":00"
            except Exception:
                # Logging (level 8) deliberately disabled for this site.
                pass
            source = "http://www.henan.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//*[@class='content']//text()""").extract()
                # join is linear; the original += loop was quadratic.
                content = "".join(content_detail)
            except Exception:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            # BUG FIX: Scrapy responses expose .status, not .status_code.
            spiderUtil.log_level(response.status, response.url)
예제 #9
0
    def parse(self, response):
        """Parse a Shanxi government (shanxi.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_title = response.xpath(
                    "//div[starts-with(@class,'detail-article-title')]//text()"
                ).extract()
                title = "".join(content_title).strip()
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            try:
                content_author = response.xpath(
                    """/html/body/div/div/div/div/ul/li/span[2]/text()"""
                ).extract()
                author = "".join(content_author).strip()
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                # First span holds the minute-precision timestamp; append
                # seconds.
                content_time = response.xpath(
                    """/html/body/div/div/div/div/ul/li/span[1]/text()"""
                ).extract()
                public_time = str(content_time[0].strip()) + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)

            source = "http://www.shanxi.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='TRS_Editor']//text()""").extract()
                content = "".join(content_detail).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            # BUG FIX: Scrapy responses expose .status, not .status_code.
            spiderUtil.log_level(response.status, response.url)
예제 #10
0
    def parse(self, response):
        """Parse a Jilin government (jl.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_title = response.xpath(
                    """//td[@align="center"]/text()""").extract()
                title = "".join(content_title)
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            try:
                # Byline reads "来源:<agency>"; keep the agency part.
                authors = response.xpath(
                    """//ul[@class="list"]/li[2]/text()""").extract()
                author = "".join(authors).replace("来源:", "").strip()
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                # Timestamp is prefixed with "发布日期:" (publish date).
                content_time = response.xpath(
                    """//ul[@class="list"]/li[1]/text()""").extract()
                public_time = str(content_time[0]).replace("发布日期:", "").strip()
            except Exception:
                spiderUtil.log_level(8, response.url)

            source = "http://www.jl.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@id="zoom"]//text()""").extract()
                # join is linear; the original += loop was quadratic.
                content = "".join(content_detail)
            except Exception:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            # BUG FIX: Scrapy responses expose .status, not .status_code.
            spiderUtil.log_level(response.status, response.url)
예제 #11
0
    def parse(self, response):
        """Parse a Hubei government (hubei.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    """//*[@class='text-center']/text()""").extract()[0]
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            try:
                # Byline reads "来源:<agency>"; keep the agency part.
                author = str(
                    response.xpath(
                        """//*[@class='list-unstyled list-inline']/li[2]/span/text()"""
                    ).extract()[0]).replace("来源:", "")
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                # Timestamp is prefixed with "发布时间:" (publish time); append
                # seconds for full precision.
                public_time = str(
                    response.xpath(
                        """//*[@class='list-unstyled list-inline']/li[1]/span/text()"""
                    ).extract()[0]).replace("发布时间:", "").strip() + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)

            source = "http://www.hubei.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//*[@class='TRS_Editor']//text()""").extract()
                # join is linear; the original += loop was quadratic.
                content = "".join(content_detail)
            except Exception:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            # BUG FIX: Scrapy responses expose .status, not .status_code.
            spiderUtil.log_level(response.status, response.url)
예제 #12
0
    def parse(self, response):
        """Parse a Jiangxi government (jiangxi.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_title = response.xpath(
                    """//div[@class='artile_zw']/div/p/text()""").extract()
                title = "".join(content_title).strip()
                # Removed leftover debug print(title).
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath(
                    """//div[@id='zoom']//font/text()[2]""").extract()[0]
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                # Timestamp is prefixed with "发布时间:" (publish time).
                content_time = response.xpath(
                    """//div[@class='sp_time screen']/font[1]/text()"""
                ).extract()[0]
                public_time = str(content_time).replace("发布时间:", "")
            except Exception:
                spiderUtil.log_level(8, response.url)

            source = "http://www.jiangxi.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@id='zoom']/p/text()""").extract()
                content = "".join(content_detail)
            except Exception:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            # BUG FIX: Scrapy responses expose .status, not .status_code.
            spiderUtil.log_level(response.status, response.url)
예제 #13
0
    def parse(self, response):
        """Parse a Jiangsu government (js.gov.cn) article page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    """//div[@class='sp_title']/text()""").extract()[0]
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)
            try:
                # Byline reads "来源:<agency>"; keep the agency part.
                authors = response.xpath(
                    """//div[@class='sp_time']/font[2]/text()""").extract()
                author = "".join(authors).replace("来源:", "").strip()
                if author == "":
                    # Fall back to the portal name when no byline exists.
                    author = "江苏人民政府网"
            except Exception:
                spiderUtil.log_level(9, response.url)

            try:
                # Timestamp is prefixed with "发布日期:" (publish date); append
                # seconds.  (Dropped a redundant double str() wrapping.)
                public_times = response.xpath(
                    """//div[@class='sp_time']/font[1]/text()""").extract()[0]
                public_time = str(public_times).replace("发布日期:", "") + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@id='zoom']//text()").extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)

            source = "http://www.js.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
예제 #14
0
    def parse(self, response):
        """Parse a Jilin government (jl.gov.cn) detail page.

        Yields a GovAllItem when the body exceeds 50 characters and the
        publish time is within the last one-to-three hours (per spiderUtil).
        Extraction failures are logged via spiderUtil.log_level with a
        per-field error code.
        """
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_title = response.xpath(
                    """//div[@id="dbt"]//text()""").extract()
                title = "".join(content_title).strip()
            except Exception:  # narrowed from bare except
                spiderUtil.log_level(6, response.url)

            # This site carries no byline; a constant assignment cannot fail,
            # so the original try/except around it was dead code.
            author = "吉林省人民政府"

            try:
                # Header line is "<label>   <timestamp>" separated by three
                # spaces; keep the timestamp half.
                content_time = response.xpath(
                    """//div[@class="c_xx"]//text()""").extract()
                public_times = str(content_time[0]).split("   ")
                public_time = str(public_times[1])
            except Exception:
                spiderUtil.log_level(8, response.url)

            source = "http://www.jl.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='TRS_Editor']//text()""").extract()
                content = "".join(content_detail).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # Also covers NameError when a required field failed to
                # extract above — the item is deliberately skipped.
                pass
        else:
            # BUG FIX: Scrapy responses expose .status, not .status_code.
            spiderUtil.log_level(response.status, response.url)
예제 #15
0
    def parse(self, response):
        """Parse a Macao SAR government (gov.mo) news article page.

        Extracts title, author, publish time and body text, yielding a
        GovAllItem only for bodies longer than 50 characters whose publish
        time starts with one of spiderUtil's recent-hour prefixes.
        Failed extractions and non-200 responses are reported through
        spiderUtil.log_level.
        """
        if response.status == 200:
            text = response.text
            # Shallow size of the decoded page text, recorded on the item.
            html_size = sys.getsizeof(text)
            try:
                # Keep only the part of <title> before the "–" separator.
                title = response.xpath(
                    "//head/title/text()").extract()[0].split("–")[0].strip()
            except Exception:
                spiderUtil.log_level(6, response.url)  # title extraction failed

            try:
                author = response.xpath("//dl/dd")[0].xpath(
                    'string(.)').extract()[0]
            except Exception:
                spiderUtil.log_level(9, response.url)  # author extraction failed

            try:
                # Full "YYYY-MM-DD HH:MM:SS" timestamp embedded in the HTML.
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except Exception:
                spiderUtil.log_level(8, response.url)  # time extraction failed

            source = "https://www.gov.mo/"

            try:
                content_arr = response.xpath("//article/p/text()").extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)  # body extraction failed

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines
                    yield item
            except Exception:
                # content/public_time may be unbound when extraction failed
                # above; skip the page silently in that case.
                pass
        else:
            # Bug fix: scrapy's Response exposes `status`, not `status_code`
            # (the old attribute access raised AttributeError here).
            spiderUtil.log_level(response.status, response.url)
예제 #16
0
    def parse(self, response):
        """Parse one article page from the Sichuan government site.

        Yields a GovAllItem for pages whose body text is longer than 50
        characters and whose publish time begins with one of spiderUtil's
        recent-hour prefixes; non-200 responses are only logged.
        """
        if response.status == 200:
            page = response.text
            html_size = sys.getsizeof(page)

            try:
                title = "".join(
                    response.xpath("//h2/ucaptitle/text()").extract()).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = "".join(
                    response.xpath(
                        "//ul[@id='articleattribute']/li/text()").extract()
                ).strip()
                if author == "":
                    # fall back to a fixed byline when the page has none
                    author = "四川省人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                raw_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}时\d{1,2}分)",
                    response.text).group(0)
                # "YYYY年MM月DD日 HH时MM分" -> "YYYY-MM-DD HH:MM:00"
                public_time = (raw_time.replace("年", "-").replace("月", "-")
                               .replace("日", "").replace("时", ":")
                               .replace("分", "") + ":00")
            except:
                # time-extraction failures are deliberately not logged here
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                body_node = response.xpath("//div[@id='cmsArticleContent']")[0]
                content = "".join(body_node.xpath('string(.)').extract())
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.sc.gov.cn/"

            try:
                if len(content) > 50:
                    if (public_time.startswith(spiderUtil.get_first_hour())
                            or public_time.startswith(
                                spiderUtil.get_first_twohour())
                            or public_time.startswith(
                                spiderUtil.get_first_threehour())):
                        item = GovAllItem()
                        item["source"] = source
                        item["content"] = content
                        item["public_time"] = public_time
                        item["url"] = response.url
                        item["title"] = title
                        item["author"] = author
                        item["crawl_time"] = spiderUtil.get_time()
                        item["html_size"] = html_size
                        # hand the item to the pipelines
                        yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
예제 #17
0
    def parse(self, response):
        """Parse a Shandong provincial government article page.

        Extracts title, source label ("author"), publish time and body text,
        and yields a GovAllItem for articles with more than 50 characters of
        body whose publish time starts with one of spiderUtil's recent-hour
        prefixes. Failed extractions and non-200 responses are reported
        through spiderUtil.log_level.
        """
        if response.status == 200:
            text = response.text
            # Shallow size of the decoded page text, recorded on the item.
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    """//div[@class='xq-tit']/span/text()""").extract()[0]
            except Exception:
                spiderUtil.log_level(6, response.url)  # title extraction failed

            try:
                # Strip the "来源:" prefix from the source label.
                author = str(response.xpath(
                    """//div[@class='R-tit']/span[2]/text()"""
                ).extract()[0]).replace("来源:", "")
            except Exception:
                spiderUtil.log_level(9, response.url)  # author extraction failed

            try:
                # Full "YYYY-MM-DD HH:MM:SS" timestamp embedded in the HTML.
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except Exception:
                spiderUtil.log_level(8, response.url)  # time extraction failed

            # NOTE(review): "shangdong" looks like a typo for "shandong";
            # kept as-is because stored data may depend on this exact value.
            source = "http://www.shangdong.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='article']//text()""").extract()
                content = "".join(content_detail)
            except Exception:
                spiderUtil.log_level(7, response.url)  # body extraction failed

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # content/public_time may be unbound when extraction failed
                # above; skip the page silently in that case.
                pass
        else:
            # Bug fix: scrapy's Response exposes `status`, not `status_code`
            # (the old attribute access raised AttributeError here).
            spiderUtil.log_level(response.status, response.url)
예제 #18
0
    def parse(self, response):
        """Parse a Shanghai municipal government article page.

        Yields a populated GovAllItem when the article body is longer than
        50 characters and its publish time begins with one of spiderUtil's
        recent-hour prefixes; non-200 responses are only logged.
        """
        if response.status == 200:
            page = response.text
            html_size = sys.getsizeof(page)
            try:
                raw_title = response.xpath(
                    "//div[@id='ivs_title']/text()").extract()
                title = raw_title[0].strip()
            except:
                spiderUtil.log_level(6, response.url)
            try:
                meta = response.xpath(
                    "//small[@class='PBtime']/text()").extract()
                author = meta[0].split("来源:")[1]
                if author == "":
                    # fall back to a fixed byline when the page has none
                    author = "上海市人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                matched = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2}日\s\s\s\d{1,2}\s:\s\d{1,2})",
                    response.text).group(0)
                # "YYYY年MM月DD日   HH : MM" -> "YYYY-MM-DD HH:MM:00"
                public_time = (matched.replace("年", "-").replace("月", "-")
                               .replace("日", "").replace("   ", " ")
                               .replace(" : ", ":") + ":00")
            except:
                spiderUtil.log_level(8, response.url)

            try:
                body_node = response.xpath("//div[@id='ivs_content']")[0]
                content = "".join(body_node.xpath('string(.)').extract())
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.shanghai.gov.cn/"

            try:
                if len(content) > 50:
                    if (public_time.startswith(spiderUtil.get_first_hour())
                            or public_time.startswith(
                                spiderUtil.get_first_twohour())
                            or public_time.startswith(
                                spiderUtil.get_first_threehour())):
                        item = GovAllItem()
                        item["source"] = source
                        item["content"] = content
                        item["public_time"] = public_time
                        item["url"] = response.url
                        item["title"] = title
                        item["author"] = author
                        item["crawl_time"] = spiderUtil.get_time()
                        item["html_size"] = html_size
                        # hand the item to the pipelines
                        yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
예제 #19
0
    def parse(self, response):
        """Parse a Hebei provincial government article page.

        Yields a GovAllItem when the body text exceeds 50 characters and the
        publish date starts with one of spiderUtil's recent-hour prefixes;
        non-200 responses are only logged.
        """
        if response.status == 200:
            page = response.text
            html_size = sys.getsizeof(page)
            try:
                headline = response.xpath(
                    """//h2[@class="cont_title"]/text()""").extract()
                title = headline[0]
            except:
                spiderUtil.log_level(6, response.url)

            # The listing carries no per-article byline; use a fixed author.
            author = "河北省人民政府网"

            try:
                date_text = response.xpath(
                    """//li[@class="xl_shijian"]//text()""").extract()[0]
                date_only = str(date_text).replace("年", "-").replace(
                    "月", "-").replace("日", "").strip()
                # Only a date is available, so pad it out to a timestamp.
                public_time = str(date_only) + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                paragraphs = response.xpath(
                    "//div[@id='zoom']/p/text()").extract()
                content = "".join(paragraphs)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.hebei.gov.cn/"

            try:
                if len(content) > 50:
                    if (public_time.startswith(spiderUtil.get_first_hour())
                            or public_time.startswith(
                                spiderUtil.get_first_twohour())
                            or public_time.startswith(
                                spiderUtil.get_first_threehour())):
                        item = GovAllItem()
                        item["source"] = source
                        item["content"] = content
                        item["public_time"] = public_time
                        item["url"] = response.url
                        item["title"] = title
                        item["author"] = author
                        item["crawl_time"] = spiderUtil.get_time()
                        item["html_size"] = html_size
                        yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
예제 #20
0
    def parse(self, response):
        """Parse a Liaoning provincial government (ln.gov.cn) article page.

        Extracts title, author, publish date and body text, yielding a
        GovAllItem when the body exceeds 50 characters and the publish time
        starts with one of spiderUtil's recent-hour prefixes. Failed
        extractions and non-200 responses are reported through
        spiderUtil.log_level.
        """
        if response.status == 200:
            text = response.text
            # Shallow size of the decoded page text, recorded on the item.
            html_size = sys.getsizeof(text)
            try:
                titles = response.xpath(
                    """//td[@align="center"]/text()""").extract()
                title = "".join(titles).replace("来源:", "").strip()
            except Exception:
                spiderUtil.log_level(6, response.url)  # title extraction failed

            try:
                content_author = response.xpath(
                    """//table[@class="time"]//td[@align="left"]/text()"""
                ).extract()
                # The cell combines date and source; the author is the part
                # after the "  信息来源:" separator.
                authors = content_author[0].split("  信息来源:")
                author = str(authors[1])
            except Exception:
                spiderUtil.log_level(9, response.url)  # author extraction failed

            try:
                content_time = response.xpath(
                    """//table[@class="time"]//td[@align="left"]/text()"""
                ).extract()
                public_times = str(content_time[0]).split("  信息来源:")
                # Normalise "发布时间:YYYY年MM月DD日" to "YYYY-MM-DD 00:00:00"
                # (the page exposes no time of day).
                public_time = str(
                    str(public_times[0]).replace("发布时间:", "").replace(
                        "年", "-").replace("月", "-").replace("日", "") +
                    " 00:00:00").strip()

            except Exception:
                spiderUtil.log_level(8, response.url)  # time extraction failed

            source = "http://www.ln.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='TRS_Editor']/div/p/text()""").extract()
                # str.join replaces the original quadratic += concatenation loop.
                content = "".join(content_detail)
            except Exception:
                spiderUtil.log_level(7, response.url)  # body extraction failed

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except Exception:
                # content/public_time may be unbound when extraction failed
                # above; skip the page silently in that case.
                pass
        else:
            # Bug fix: scrapy's Response exposes `status`, not `status_code`
            # (the old attribute access raised AttributeError here and the
            # failure was never logged).
            spiderUtil.log_level(response.status, response.url)