예제 #1
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body, author and title are extracted independently;
        a failed extraction is reported with code 8, 7, 9 or 6 respectively.
        An item is yielded only when all four fields were extracted and the
        body is longer than 50 characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://dz.jjckb.cn/www/pages/webpage2009/"

        # None marks a field whose extraction failed.
        public_time = content = author = title = None

        try:
            # re.search() returns None when the page has no date, so
            # .group() raises and the failure is reported below.
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    text).group(0) + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//founder-content/p//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            author_arr = response.xpath(
                "//td[@class='black12']//text()").extract()
            # Indexing [1] raises when the "来源:" marker is absent.
            author = ("".join(author_arr).split("来源:")[1].strip()
                      or "经济参考报")
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            title_arr = response.xpath(
                "//tr/td[@class='hei16b']//text()").extract()
            title_arr1 = response.xpath(
                "//tr/td[@class='hui12']//text()").extract()
            title = "".join(title_arr).strip() + "".join(
                title_arr1).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, author, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #2
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body, author and title are extracted independently;
        a failed extraction is reported with code 8, 7, 9 or 6 respectively.
        An item is yielded only when all four fields were extracted and the
        body is longer than 50 characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://szb.ddswcm.com/"

        # None marks a field whose extraction failed.
        public_time = content = author = title = None

        try:
            # re.search() returns None when the page has no date, so
            # .group() raises and the failure is reported below.
            matched = re.search(r"(\d{4}年\d{1,2}月\d{1,2})", text).group(0)
            # Turn "YYYY年M月D" into "YYYY-M-D" and add a midnight time.
            public_time = matched.replace("年", "-").replace(
                "月", "-") + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//div[@id='content']/p//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            property_text = response.xpath(
                "//div[@class='property']//text()").extract()[0]
            # Keep what sits between the "作者:" and "浏览次数" markers;
            # a missing node or missing "作者:" raises IndexError.
            author = (property_text.split("作者:")[1].split(
                "浏览次数")[0].strip() or "当代商报")
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            title_arr = response.xpath(
                "//div[@class='papertitle']//text()").extract()
            title = "".join(title_arr).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, author, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #3
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body, author and title are extracted independently;
        a failed extraction is reported with code 8, 7, 9 or 6 respectively.
        An item is yielded only when all four fields were extracted and the
        body is longer than 50 characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://epaper.21jingji.com/"

        # None marks a field whose extraction failed.
        public_time = content = author = title = None

        try:
            # The page carries "YYYY-M-D H:M"; seconds are appended as ":00".
            # re.search() returns None when absent, so .group() raises.
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                text).group(0) + ":00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//div[@class='txtContent']//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            author_arr = response.xpath(
                "//div[@class='newsInfo']//text()").extract()
            # The author is taken as the second space-separated token;
            # IndexError when the text contains no space.
            author = ("".join(author_arr).split(" ")[1].strip()
                      or "21世纪经济报道数字报")
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            title_arr1 = response.xpath(
                "//div[@class='titleHead']/h1//text()").extract()
            title = "".join(title_arr1).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, author, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #4
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body and title are extracted independently; a
        failed extraction is reported with code 8, 7 or 6 respectively.  The
        author is always the constant "杭州日报".  An item is yielded only when
        all fields were extracted and the body is longer than 50 characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "https://hzdaily.hangzhou.com.cn/hzrb/"
        author = "杭州日报"

        # None marks a field whose extraction failed.
        public_time = content = title = None

        try:
            # re.search() returns None when the page has no date, so
            # .group() raises and the failure is reported below.
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    text).group(0) + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//div[@class='content']//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            # The headline is the concatenation of the h1, h2 and h3 text
            # found under the "head" div, each stripped separately.
            heading_parts = [
                "".join(response.xpath(
                    "//div[@class='head']/%s//text()" % tag).extract()).strip()
                for tag in ("h1", "h2", "h3")
            ]
            title = "".join(heading_parts)
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #5
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body and title are extracted independently; a
        failed extraction is reported with code 8, 7 or 6 respectively.  The
        author is always the constant "半岛都市报".  An item is yielded only
        when all fields were extracted and the body is longer than 50
        characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://bddsb.bandao.cn/pc/bddsb/"
        author = "半岛都市报"

        # None marks a field whose extraction failed.
        public_time = content = title = None

        try:
            # re.search() returns None when the page has no date, so
            # .group() raises and the failure is reported below.
            matched = re.search(r"(\d{4}年\d{1,2}月\d{1,2})", text).group(0)
            # Turn "YYYY年M月D" into "YYYY-M-D" and add a midnight time.
            public_time = matched.replace("年", "-").replace(
                "月", "-") + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//span[@id='contenttext']//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            # The h3 text comes first in the title, followed by the h2 text.
            title_arr = response.xpath(
                "//div[@class='neirong']/h3//text()").extract()
            title_arr1 = response.xpath(
                "//div[@class='neirong']/h2//text()").extract()
            title = "".join(title_arr).strip() + "".join(
                title_arr1).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #6
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body, author and title are extracted independently;
        a failed extraction is reported with code 8, 7, 9 or 6 respectively.
        An item is yielded only when all four fields were extracted and the
        body is longer than 50 characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://www.qhdb.com.cn/"

        # None marks a field whose extraction failed.
        public_time = content = author = title = None

        try:
            # The page carries "YYYY/M/D H:M:S"; slashes become hyphens.
            # re.search() returns None when absent, so .group() raises.
            public_time = re.search(
                r"(\d{4}/\d{1,2}/\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                text).group(0).replace("/", "-")
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//div[@class='article_content']//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            author_arr = response.xpath(
                "//span[@id='sZuoZhe']/text()").extract()
            # An absent or empty author span falls back to the default name.
            author = "".join(author_arr) or "期货日报"
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            # Only the first text node is used; IndexError when none exists.
            title = response.xpath(
                "//div[@class='article_title']//text()").extract()[0].strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, author, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #7
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body, author and title are extracted independently;
        a failed extraction is reported with code 8, 7, 9 or 6 respectively.
        An item is yielded only when all four fields were extracted and the
        body is longer than 50 characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://epaper.qdcaijing.com/"

        # None marks a field whose extraction failed.
        public_time = content = author = title = None

        try:
            # re.search() returns None when the page has no date, so
            # .group() raises and the failure is reported below.
            matched = re.search(r"(\d{4}年\d{1,2}月\d{1,2})", text).group(0)
            # Turn "YYYY年M月D" into "YYYY-M-D" and add a midnight time.
            public_time = matched.replace("年", "-").replace(
                "月", "-") + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//p[@class='info_p']//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            byline = response.xpath(
                "//p[@class='txtc']/span/text()").extract()[0]
            # NOTE(review): this keeps the text *before* the "来源:" marker;
            # confirm that is the intended part of the byline.
            author = byline.split("来源:")[0].strip() or "青岛财经日报"
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            # Only the first text node is used; IndexError when none exists.
            title = response.xpath(
                "//h3[@class='txtc']//text()").extract()[0].strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, author, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #8
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body, author and title are extracted independently;
        a failed extraction is reported with code 8, 7, 9 or 6 respectively.
        An item is yielded only when all four fields were extracted and the
        body is longer than 50 characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://media.workercn.cn/"

        # None marks a field whose extraction failed.
        public_time = content = author = title = None

        try:
            # re.search() returns None when the page has no date, so
            # .group() raises and the failure is reported below.
            matched = re.search(r"(\d{4}年\d{1,2}月\d{1,2})", text).group(0)
            # Turn "YYYY年M月D" into "YYYY-M-D" and add a midnight time.
            public_time = matched.replace("年", "-").replace(
                "月", "-") + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//div[@id='ozoom']/div/span/p//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            author_arr = response.xpath(
                "//div[@class='lai']/span//text()").extract()
            # Drop anything from the first "(" onwards; an empty result
            # falls back to the default name.
            author = "".join(author_arr).strip().split("(")[0] or "劳动新闻"
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            # Only the first h1 text node is used; IndexError when none.
            title = response.xpath("//h1//text()").extract()[0].strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, author, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #9
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body, author and title are extracted independently;
        a failed extraction is reported with code 8, 7, 9 or 6 respectively.
        An item is yielded only when all four fields were extracted and the
        body is longer than 50 characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://fjrb.fjsen.com/"

        # None marks a field whose extraction failed.
        public_time = content = author = title = None

        try:
            # re.search() returns None when the page has no date, so
            # .group() raises and the failure is reported below.
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    text).group(0) + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//td[@class='xilan_content_tt']/p/text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            authors = response.xpath("//td[@class='bt4']/text()").extract()
            author_arr = "".join(authors).split("记者 ")
            # Exactly two pieces means the "记者 " marker occurred once, and
            # the text after it is the author; otherwise use the default.
            author = author_arr[1] if len(author_arr) == 2 else "福建日报"
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            title_arr = response.xpath("//td[@class='bt1']/text()").extract()
            title = "".join(title_arr).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, author, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #10
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body and title are extracted independently; a
        failed extraction is reported with code 8, 7 or 6 respectively.  The
        author is always the constant "石狮日报".  An item is yielded only when
        all fields were extracted and the body is longer than 50 characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://epaper.ssrb.com.cn/"
        author = "石狮日报"

        # None marks a field whose extraction failed.
        public_time = content = title = None

        try:
            # re.search() returns None when the page has no date, so
            # .group() raises and the failure is reported below.
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    text).group(0) + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//div[@id='ozoom']/founder-content/text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            title_arr = response.xpath(
                "//td[@class='font01']/founder-title/text()").extract()
            title = "".join(title_arr).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #11
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body, author and title are extracted independently;
        a failed extraction is reported with code 8, 7, 9 or 6 respectively.
        An item is yielded only when all four fields were extracted and the
        body is longer than 50 characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://124.224.204.62:8081/szb/pc/"

        # None marks a field whose extraction failed.
        public_time = content = author = title = None

        try:
            # re.search() returns None when the page has no date, so
            # .group() raises and the failure is reported below.
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    text).group(0) + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//founder-content//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            author_arr = response.xpath(
                "//h3[@class='title-author']/text()").extract()
            # No author node at all falls back to the default name; an
            # existing node is used verbatim, without stripping.
            author = author_arr[0] if author_arr else "中卫日报"
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            # Only the first text node is used; IndexError when none exists.
            title = response.xpath("//h2[@id='Title']/text()").extract()[0]
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, author, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item
예제 #12
0
    def parse(self, response):
        """Parse one article page and yield a ``PaperAllItem`` for it.

        A non-200 response is reported through ``spiderUtil.log_level`` with
        the HTTP status as the code and produces no item.  For a 200 response
        the publish time, body and title are extracted independently; a
        failed extraction is reported with code 8, 7 or 6 respectively.  The
        author is always the constant "河南工人日报".  An item is yielded only
        when all fields were extracted and the body is longer than 50
        characters.

        Args:
            response: The downloaded page; its ``status``, ``text``, ``url``
                and ``xpath()`` are used.

        Yields:
            PaperAllItem: At most one populated item.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        text = response.text
        html_size = sys.getsizeof(text)
        source = "http://www.hngrrb.cn/"
        author = "河南工人日报"

        # None marks a field whose extraction failed.
        public_time = content = title = None

        try:
            # re.search() returns None when the page has no date, so
            # .group() raises and the failure is reported below.
            matched = re.search(r"(\d{4}年\d{1,2}月\d{1,2})", text).group(0)
            # Turn "YYYY年M月D" into "YYYY-M-D" and add a midnight time.
            public_time = matched.replace("年", "-").replace(
                "月", "-") + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content_arr = response.xpath(
                "//founder-content//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            title_arr = response.xpath(
                "//table[3]/tbody/tr/td//text()").extract()
            title = "".join(title_arr).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        # Every failure was already reported above; an incomplete page
        # yields nothing.
        if None in (public_time, content, title):
            return
        if len(content) <= 50:
            return

        # Built outside any try block so that errors here, and exceptions
        # thrown into the generator at the yield, are not silently swallowed.
        item = PaperAllItem()
        item["source"] = source
        item["content"] = content
        item["public_time"] = public_time
        item["url"] = response.url
        item["title"] = title
        item["author"] = author
        item["crawl_time"] = spiderUtil.get_time()
        item["html_size"] = html_size
        yield item