Пример #1
0
    def parsebody(self, response):
        """Yield a NewsAllItem for an article page (source ``https://news.qq.com/``).

        The body text is extracted from the page; ``public_time``, ``title``
        and ``author`` are read from ``response.meta``.  An item is yielded
        only when the body text is non-empty and the publish time starts
        with the prefix returned by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  A failed body extraction is reported with code 7.
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        try:
            paragraphs = response.xpath(
                "//div[@class='content-article']/p//text()").extract()
            content = "".join(paragraphs).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)
            return  # without body text there is nothing to emit

        if not content:
            return

        try:
            meta = response.meta
            public_time = meta["public_time"]
            if not str(public_time).startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "https://news.qq.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = meta["title"]
            item["author"] = meta["author"]
            item["html_size"] = html_size
            item["crawl_time"] = spiderUtil.get_time()
        except Exception:
            # Best effort: a missing meta key or item error must not abort
            # the crawl, but it should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #2
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.xinhuanet.com/``).

        The body text is extracted from the page; ``public_time``, ``title``
        and ``author`` are read from ``response.meta``.  An item is yielded
        only when the body text is non-empty and the publish time starts
        with the prefix returned by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  A failed body extraction is reported with code 7.
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        try:
            paragraphs = response.xpath(
                """//div[contains(@id,'detail')]//p/text()""").extract()
            content = "".join(paragraphs)
        except Exception:
            spiderUtil.log_level(7, response.url)
            return  # without body text there is nothing to emit

        if not content:
            return

        try:
            meta = response.meta
            public_time = meta["public_time"]
            if not str(public_time).startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.xinhuanet.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = meta["title"]
            item["author"] = meta["author"]
            item["html_size"] = html_size
            item["crawl_time"] = spiderUtil.get_time()
        except Exception:
            # Best effort: a missing meta key or item error must not abort
            # the crawl, but it should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #3
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://news.sohu.com/``).

        The body text is extracted from the page; ``public_time``, ``title``
        and ``author`` are read from ``response.meta``.  ``url`` is taken
        from ``response.meta`` when present, otherwise from ``response.url``.
        An item is yielded only when the body text is non-empty and the
        publish time starts with the prefix returned by
        ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  A failed body extraction is reported with code 7.
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        try:
            paragraphs = response.xpath(
                """//article//p//text()""").extract()
            content = "".join(paragraphs)
        except Exception:
            spiderUtil.log_level(7, response.url)
            return  # without body text there is nothing to emit

        if not content:
            return

        try:
            # The request metadata lives on ``response.meta``.
            meta = response.meta
            public_time = meta["public_time"]
            if not str(public_time).startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["public_time"] = public_time
            item["url"] = meta.get("url", response.url)
            item["title"] = meta["title"]
            item["author"] = meta["author"]
            item["source"] = "http://news.sohu.com/"
            item["content"] = content
            item["html_size"] = html_size
            item["crawl_time"] = spiderUtil.get_time()
        except Exception:
            # Best effort: a missing meta key or item error must not abort
            # the crawl, but it should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #4
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.cyol.com/``).

        Body text, publish time, title and author are extracted from the
        page.  An item is yielded only when all four were obtained, the body
        has at least 100 characters, and the publish time starts with the
        prefix returned by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 7 (body),
        6 (title) and 9 (author); a missing publish time is not reported.
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        content = public_time = title = author = None

        try:
            content = "".join(response.xpath(
                "//div[@class='section-main']/p/text()").extract())
        except Exception:
            spiderUtil.log_level(7, response.url)

        # The page text only carries "M-D H:M"; the prefix comes from the
        # third-from-last URL segment minus its last two characters
        # (presumably a "YYYYMM" segment yielding the year - TODO confirm).
        stamp = re.search(r"(\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                          response.text)
        if stamp is not None:
            public_time = stamp.group(0) + ":00"
            segments = response.url.split("/")
            if len(segments) >= 3:
                public_time = segments[-3][:-2] + "-" + public_time

        try:
            title = "".join(
                response.xpath("//head/title//text()").extract()).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        try:
            author = "".join(response.xpath(
                "//span[@id='copyfrom']//text()").extract()).strip()
            if author == "":
                author = "中青在线"
        except Exception:
            spiderUtil.log_level(9, response.url)

        if None in (content, public_time, title, author):
            return  # a required field could not be extracted
        if len(content) < 100:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.cyol.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #5
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.china.com.cn/``).

        Publish time, body text, author and title are extracted from the
        page.  An item is yielded only when all four were obtained, the body
        is non-empty, and the publish time starts with the prefix returned
        by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 8
        (publish time), 7 (body), 9 (author) and 6 (title).
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = author = title = None

        stamp = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
            response.text)
        if stamp is None:
            spiderUtil.log_level(8, response.url)
        else:
            public_time = stamp.group(0)

        try:
            content = "".join(response.xpath(
                """//*[@id="articleBody"]/p/text()""").extract())
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            author = "".join(response.xpath(
                """//*[@id="source_baidu"]//text()""").extract())
            if author == "":
                author = "中国网"
            else:
                # Raises IndexError when the marker is absent; the raw text
                # assigned above is then kept as the author.
                author = author.split("来源:")[1].strip()
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            title = "".join(response.xpath(
                "/html/body/div/h1/text()").extract()).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        if None in (public_time, content, author, title):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.china.com.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #6
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.southcn.com/``).

        Publish time, body text, author and title are extracted from the
        page.  An item is yielded only when all four were obtained, the body
        is non-empty, and the publish time starts with the prefix returned
        by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 8
        (publish time), 7 (body), 9 (author) and 6 (title).
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = author = title = None

        stamp = re.search(
            r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})",
            response.text)
        if stamp is None:
            spiderUtil.log_level(8, response.url)
        else:
            # Rewrite the year/month/day markers into dashes and append
            # the seconds field.
            public_time = stamp.group(0).replace("年", "-").replace(
                "月", "-").replace("日", "") + ":00"

        try:
            content = "".join(response.xpath(
                """/html/body/div/div/div/p/text()""").extract()).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            author = "".join(response.xpath(
                """/html/body/div/div/div/div/span/i/a/text()"""
            ).extract()).strip()
            if author == "":
                # NOTE(review): this default does not match the southcn
                # source recorded below - possibly copied from another
                # spider; confirm the intended value.
                author = "央视网"
            else:
                author = author.replace("来源:", "")
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            title = "".join(response.xpath(
                """/html/body/div/div/div/h1/text()""").extract()).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        if None in (public_time, content, author, title):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.southcn.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #7
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.chinanews.com/``).

        Publish time, body text, title and author are extracted from the
        page.  An item is yielded only when all four were obtained, the body
        is non-empty, and the publish time starts with the prefix returned
        by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 7 (body),
        6 (title) and 9 (author); a missing publish time is not reported.
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = title = author = None

        stamp = re.search(
            r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})",
            response.text)
        if stamp is not None:
            # Rewrite the year/month/day markers into dashes and append
            # the seconds field.
            public_time = stamp.group(0).replace("年", "-").replace(
                "月", "-").replace("日", "") + ":00"

        try:
            content = "".join(response.xpath(
                "//div[@class='left_zw']/p//text()").extract()).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            title = "".join(response.xpath(
                "//div/div/div/div/h1//text()").extract())
        except Exception:
            spiderUtil.log_level(6, response.url)

        try:
            author = "".join(response.xpath(
                "//div[@class='left-t']//text()").extract())
            if author == "":
                author = "中国新闻网"
        except Exception:
            spiderUtil.log_level(9, response.url)

        if None in (public_time, content, title, author):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.chinanews.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #8
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.dzwww.com/``).

        Publish time, body text, title and author are extracted from the
        page.  An item is yielded only when all four were obtained, the body
        is non-empty, and the publish time starts with the prefix returned
        by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 7 (body),
        6 (title) and 9 (author); a missing publish time is not reported.
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = title = author = None

        stamp = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
            response.text)
        if stamp is not None:
            public_time = stamp.group(0)

        try:
            content = "".join(response.xpath(
                """//div/div/div/div/div/p/text()""").extract()).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            title = "".join(
                response.xpath("//div/div/h2/text()").extract())
        except Exception:
            spiderUtil.log_level(6, response.url)

        try:
            author = "".join(response.xpath(
                """//*[@id="xl-headline"]/div/div/text()""").extract()
            ).strip()
            if author == "":
                author = "大众网"
            else:
                # Raises IndexError when the marker is absent; the raw text
                # assigned above is then kept as the author.
                author = author.split("来源: ")[1].split("作者:")[0].strip()
        except Exception:
            spiderUtil.log_level(9, response.url)

        if None in (public_time, content, title, author):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.dzwww.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #9
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.rednet.cn/``).

        Publish time, body text, author and title are extracted from the
        page.  An item is yielded only when all four were obtained, the body
        is non-empty, and the publish time starts with the prefix returned
        by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 8
        (publish time), 7 (body), 9 (author) and 6 (title).
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = author = title = None

        try:
            time_nodes = response.xpath(
                "//main/section/section/div/span[4]/text()").extract()
            # IndexError here (no such span) is reported as code 8.
            public_time = str(time_nodes[0]) + ":00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content = "".join(response.xpath(
                "/html/body/main/section/section/article/section/p/text()"
            ).extract())
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            author = "".join(response.xpath(
                "//main/section/section/div/span[1]/text()").extract()
            ).strip()
            if author == "":
                author = "红网"
            else:
                # Raises IndexError when the marker is absent; the raw text
                # assigned above is then kept as the author.
                author = author.split("来源:")[1]
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            title = "".join(response.xpath(
                "//main/section/section/h1/text()").extract()).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        if None in (public_time, content, author, title):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.rednet.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        # Hand the item to the caller rather than printing it.
        yield item
Пример #10
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.71.cn/``).

        Publish time, body text, author and title are extracted from the
        page.  An item is yielded only when all four were obtained, the body
        is non-empty, and the publish time starts with the prefix returned
        by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 8
        (publish time), 7 (body), 9 (author) and 6 (title).
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = author = title = None

        try:
            time_nodes = response.xpath(
                """//*[@id="main"]/div/div/div/div/div/span[1]/text()"""
            ).extract()
            # IndexError here (no such span) is reported as code 8.
            public_time = str(time_nodes[0]) + ":00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content = "".join(response.xpath(
                """//*[@id="article-content"]/p/text()""").extract())
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            author = "".join(response.xpath(
                """//*[@id="main"]/div/div/div/div/div/span[2]/text()"""
            ).extract())
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            # IndexError here (no <h1> text) is reported as code 6.
            title = response.xpath(
                """//*[@id="main"]/div/div/div/div/h1/text()"""
            ).extract()[0]
        except Exception:
            spiderUtil.log_level(6, response.url)

        if None in (public_time, content, author, title):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.71.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #11
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.youth.cn/``).

        Body text, author, title and publish time are extracted from the
        page.  An item is yielded only when all four were obtained, the body
        is non-empty, and the publish time starts with the prefix returned
        by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 7 (body),
        9 (author), 6 (title) and 8 (publish time).
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        content = author = title = public_time = None

        try:
            # IndexError here (no TRS_Editor div) is reported as code 7.
            editor = response.xpath("//div[@class='TRS_Editor']")[0]
            content = "".join(editor.xpath('string(.)').extract())
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            authors = response.xpath(
                "//meta[@name='author']/@content").extract()
            author = authors[0] if authors else "中国青年网"
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            # Drop the last two "_"-separated parts of the <title> text and
            # join the remainder.
            raw_title = response.xpath("//head/title/text()").extract()[0]
            title = "".join(raw_title.split("_")[:-2]).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        stamp = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
            response.text)
        if stamp is None:
            spiderUtil.log_level(8, response.url)
        else:
            public_time = stamp.group(0)

        if None in (content, author, title, public_time):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.youth.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #12
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.comnews.cn/``).

        Publish time, author and title are read from ``<meta>`` tags
        (``PubDate``, ``ContentSource``, ``ArticleTitle``); the body text is
        extracted from the page.  An item is yielded only when all four were
        obtained, the body is non-empty, and the publish time starts with
        the prefix returned by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 8
        (publish time), 7 (body), 9 (author) and 6 (title).
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = author = title = None

        try:
            # IndexError here (no PubDate meta tag) is reported as code 8.
            public_time = str(response.xpath(
                """//meta[@name="PubDate"]//@content""").extract()[0])
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            content = "".join(response.xpath(
                """//div[@class="content"]//p//text()""").extract())
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            # IndexError here (no ContentSource meta tag) is reported as
            # code 9.
            author = response.xpath(
                """//meta[@name="ContentSource"]//@content"""
            ).extract()[0].strip()
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            title = "".join(response.xpath(
                """//meta[@name="ArticleTitle"]//@content""").extract())
        except Exception:
            spiderUtil.log_level(6, response.url)

        if None in (public_time, content, author, title):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.comnews.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        # Hand the item to the caller rather than printing it.
        yield item
Пример #13
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.wenming.cn/``).

        Publish date, body text, author and title are extracted from the
        page.  Only a date is available, so the publish time is that date
        followed by `` 00:00:00``.  An item is yielded only when all four
        fields were obtained, the body is non-empty, and the publish time
        starts with the prefix returned by
        ``spiderUtil.get_yesterday_date()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 8
        (publish time), 7 (body), 9 (author) and 6 (title).
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = author = title = None

        stamp = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", response.text)
        if stamp is None:
            spiderUtil.log_level(8, response.url)
        else:
            # The separating space keeps date and time apart
            # ("YYYY-MM-DD 00:00:00").
            public_time = stamp.group(0) + " 00:00:00"

        try:
            content = "".join(response.xpath(
                "//div[@class='TRS_Editor']/div/p//text()").extract()
            ).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            author = "".join(response.xpath(
                "//div[@class='box01']/div[@class='fl']/a//text()"
            ).extract()).strip()
            if author == "":
                author = "文明网"
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            title = "".join(response.xpath(
                "//div[@id='title_tex']//text()").extract()).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)

        if None in (public_time, content, author, title):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_yesterday_date()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.wenming.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #14
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.chinadaily.com.cn/``).

        Body text, title, author and publish time are extracted from the
        page.  An item is yielded only when all four were obtained, the body
        is non-empty, and the publish time starts with the prefix returned
        by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 7 (body),
        6 (title) and 9 (author); a missing publish time is not reported.
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        content = title = author = public_time = None

        try:
            content = "".join(response.xpath(
                "//div[@id='Content']//text()").extract()).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            # The last 8 characters of the stripped <title> text are cut off
            # (presumably a site-name suffix - TODO confirm).
            title = "".join(response.xpath(
                "//head/title//text()").extract()).strip()[:-8]
        except Exception:
            spiderUtil.log_level(6, response.url)

        try:
            authors = response.xpath(
                "//head/meta[@name='author']/@content").extract()
            author = authors[0] if authors else "中国日报网"
        except Exception:
            spiderUtil.log_level(9, response.url)

        stamp = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
            response.text)
        if stamp is not None:
            public_time = stamp.group(0) + ":00"

        if None in (content, title, author, public_time):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.chinadaily.com.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #15
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``http://www.xilu.com/``).

        Publish time, body text and title are extracted from the page; the
        author is picked at random from a fixed list of names.  An item is
        yielded only when the extracted fields were obtained, the body is
        non-empty, and the publish time starts with the prefix returned by
        ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 8
        (publish time), 7 (body) and 6 (title).
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = title = None

        stamp = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
            response.text)
        if stamp is None:
            spiderUtil.log_level(8, response.url)
        else:
            public_time = stamp.group(0)

        try:
            # IndexError here (no matching div) is reported as code 7.
            content = response.xpath("//div[@class='left newstext']")[
                0].xpath('string(.)').extract()[0]
        except Exception:
            spiderUtil.log_level(7, response.url)

        # NOTE(review): the author is not read from the page; it is drawn
        # at random from this list - confirm this is intended.
        author_list = [
            "三湘风纪", "搜狐", "英国报姐", "我们爱历史", "最爱历史", "慢青年", "伟人勘察",
            "大象公会", "中国历史文化网", "凤凰", "腾讯", "新浪", "解放日报", "参考消息", "新华网",
            "红瞰天下", "海外网", "人民网", "中华读书报", "今日头条", "中国新闻网"
        ]
        author = random.choice(author_list)

        try:
            # IndexError here (no description meta tag) is reported as code 6.
            title = response.xpath(
                "//head/meta[@name='description']/@content").extract()[0]
        except Exception:
            spiderUtil.log_level(6, response.url)

        if None in (public_time, content, title):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.xilu.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #16
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``https://www.iyiou.com/``).

        Publish time, body text, title and author are extracted from the
        page.  An item is yielded only when all four were obtained, the body
        is non-empty, and the publish time starts with the prefix returned
        by ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 8
        (publish time), 7 (body), 6 (title) and 9 (author).
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = title = author = None

        stamp = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{1,2})",
            response.text)
        if stamp is None:
            spiderUtil.log_level(8, response.url)
        else:
            # Replace the "T" separator with a space and append the
            # seconds field.
            public_time = stamp.group(0).replace("T", " ") + ":00"

        try:
            # IndexError here (no post_description div) is reported as code 7.
            body = response.xpath("//div[@id='post_description']")[0]
            content = "".join(body.xpath('string(.)').extract())
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            # The last 3 characters of the <title> text are cut off
            # (presumably a site suffix - TODO confirm).
            title = response.xpath("//head/title/text()").extract()[0][:-3]
        except Exception:
            spiderUtil.log_level(6, response.url)

        try:
            author = "".join(response.xpath(
                "//div[@id='post_author']/text()").extract())
            if author == "":
                author = "亿欧网"
        except Exception:
            spiderUtil.log_level(9, response.url)

        if None in (public_time, content, title, author):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "https://www.iyiou.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #17
0
    def parse(self, response):
        """Yield a NewsAllItem for an article page (source ``https://www.dahe.cn/``).

        Publish time, author and title are read from ``<meta>`` tags in the
        page head; the body text is extracted from the page.  An item is
        yielded only when all four were obtained, the body is non-empty, and
        the publish time starts with the prefix returned by
        ``spiderUtil.get_first_hour()``.

        Non-200 responses are reported through ``spiderUtil.log_level`` and
        yield nothing.  Extraction failures are reported with code 8
        (publish time), 7 (body), 9 (author) and 6 (title).
        """
        import logging  # function-scope: the file's import block is not visible here

        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # NOTE: sys.getsizeof is the in-memory size of the str object, not
        # the encoded byte length of the page.
        html_size = sys.getsizeof(response.text)

        # None marks "not extracted", so a failed step is detected explicitly
        # instead of surfacing later as a NameError.
        public_time = content = author = title = None

        try:
            # IndexError here (no publishdate meta tag) is reported as code 8.
            raw_time = response.xpath(
                "//head/meta[@name='publishdate']/@content").extract()[0]
            # Rewrite the year/month markers into dashes, the day marker
            # into a space, and append the seconds field.
            public_time = raw_time.replace("年", "-").replace(
                "月", "-").replace("日", " ") + ":00"
        except Exception:
            spiderUtil.log_level(8, response.url)

        try:
            # IndexError here (no div with class "cl") is reported as code 7.
            body = response.xpath("//div[@class='cl']")[0]
            content = "".join(body.xpath('string(.)').extract())
        except Exception:
            spiderUtil.log_level(7, response.url)

        try:
            # Takes the part after the first "-" of the source meta tag;
            # IndexError (tag missing or no "-") is reported as code 9.
            author = response.xpath(
                "//head/meta[@name='source']/@content"
            ).extract()[0].split("-")[1]
        except Exception:
            spiderUtil.log_level(9, response.url)

        try:
            # IndexError here (no itemprop="name" meta tag) is reported as
            # code 6.
            title = response.xpath(
                "//head/meta[@itemprop='name']/@content").extract()[0]
        except Exception:
            spiderUtil.log_level(6, response.url)

        if None in (public_time, content, author, title):
            return  # a required field could not be extracted
        if not content:
            return

        try:
            if not public_time.startswith(spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "https://www.dahe.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: an item error must not abort the crawl, but it
            # should not vanish silently either.
            logging.getLogger(__name__).exception(
                "failed to build item for %s", response.url)
            return

        yield item
Пример #18
0
    def parse(self, response):
        """Build a NewsAllItem from a cankaoxiaoxi.com article response.

        Nothing is yielded unless the status is 200, every field was
        extracted, the content is non-empty and the publish time starts with
        ``spiderUtil.get_first_hour()``. Extraction failures are reported via
        ``spiderUtil.log_level`` with the per-field codes used here
        (6 title, 7 content, 8 publish time, 9 author); any other status is
        reported with the status itself as the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be extracted

        try:
            public_time = str(response.xpath(
                """//*[@id="pubtime_baidu"]/text()""").extract()[0])
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        try:
            content = "".join(response.xpath(
                """//*[@id="allList"]/div/div/div/p/text()""").extract())
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            # Strip the "来源:" label that precedes the author text.
            author = str(
                response.xpath("""//*[@id="source_baidu"]/text()""").
                extract()[0].strip()).replace("来源:", "")
        except Exception:
            complete = False
            spiderUtil.log_level(9, response.url)

        try:
            title = response.xpath("//div/div/h1/text()").extract()[0]
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not public_time.startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.cankaoxiaoxi.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: a page whose item cannot be assembled is dropped.
            return

        yield item
Пример #19
0
    def parse(self, response):
        """Build a NewsAllItem from a stnn.cc article response.

        Nothing is yielded unless the status is 200, every field was
        extracted, the content is non-empty and the publish time starts with
        ``spiderUtil.get_first_hour()``. Extraction failures are reported via
        ``spiderUtil.log_level`` with the per-field codes used here
        (6 title, 7 content, 8 publish time); any other status is reported
        with the status itself as the code. The item URL is read from
        ``response.meta["url"]`` and the author is a fixed string.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be extracted

        # Parse the title, dropping tabs and line breaks.
        try:
            raw_title = response.xpath(
                "//h1[@class='article-title']/text()").extract()[0]
            title = raw_title.replace('\t', '').replace('\n', '').replace(
                '\r', '')
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        try:
            public_time = response.xpath(
                "//div[@class='article-infos']/span[@class='date']/text()"
            ).extract()[0]
            # A 16-character value gets ":00" appended as the seconds part.
            if len(public_time) == 16:
                public_time += ":00"
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        try:
            content = "".join(response.xpath(
                "//div[@class='article-content fontSizeSmall BSHARE_POP']/p/text()"
            ).extract())
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not public_time.startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.stnn.cc/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.meta["url"]
            item["title"] = title
            item["author"] = "星岛环球网"
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: a page whose item cannot be assembled (for
            # example when meta lacks "url") is dropped.
            return

        yield item
Пример #20
0
    def parse(self, response):
        """Build a NewsAllItem from a southcn.com article response.

        Nothing is yielded unless the status is 200, every field was
        extracted, the content is non-empty and the publish time starts with
        ``spiderUtil.get_first_hour()``. Extraction failures are reported via
        ``spiderUtil.log_level`` with the per-field codes used here
        (6 title, 7 content, 8 publish time, 9 author); any other status is
        reported with the status itself as the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be extracted

        try:
            # ":00" is appended as the seconds part of the extracted time.
            public_time = str(response.xpath(
                """//*[@id="pubtime_baidu"]/text()""").extract()[0].strip()
            ) + ":00"
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        try:
            content = "".join(response.xpath(
                """//*[@id="content"]/p/text()""").extract())
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            author = "".join(response.xpath(
                """//*[@id="source_baidu"]/text()""").extract()).strip()
            # NOTE(review): the fallback names a different outlet than the
            # `source` URL used below -- confirm this is intended.
            author = author.replace("来源:", "") if author else "澎湃新闻"
        except Exception:
            complete = False
            spiderUtil.log_level(9, response.url)

        try:
            title = "".join(response.xpath(
                """//*[@id="article_title"]/text()""").extract()).strip()
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not public_time.startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.southcn.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: a page whose item cannot be assembled is dropped.
            return

        yield item
Пример #21
0
    def parsebody(self, response):
        """Build a NewsAllItem from the JSON payload of an ifeng.com page.

        The payload is the text between ``allData = `` and ``var adData`` in
        the page's script text, minus its last character; its ``docData``
        entry supplies every field. That extraction step is not guarded, so
        whatever it raises (for example ``IndexError`` when the marker is
        absent, or a JSON decoding error) propagates to the caller.

        Nothing is yielded unless the status is 200, every field was read,
        the content is non-empty and the publish time starts with
        ``spiderUtil.get_first_hour()``. Field failures are reported via
        ``spiderUtil.log_level`` with the per-field codes used here
        (6 title, 7 content, 8 publish time, 9 author); any other status is
        reported with the status itself as the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)

        script_text = "".join(response.xpath("""//script//text()""").extract())
        payload = script_text.split("allData = ")[1].split(
            "var adData")[0].strip()[:-1]
        doc = json.loads(payload)['docData']

        complete = True  # cleared when any field cannot be read

        try:
            public_time = doc['newsTime']
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        try:
            # The first content entry holds an HTML fragment; keep the text
            # found under its <p> elements.
            fragment = doc['contentData']['contentList'][0]['data']
            content = "".join(
                etree.HTML(fragment).xpath("//p//text()")).strip()
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            author = doc['source']
        except Exception:
            complete = False
            spiderUtil.log_level(9, response.url)

        try:
            title = doc['title']
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not public_time.startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://news.ifeng.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["html_size"] = html_size
            item["crawl_time"] = spiderUtil.get_time()
        except Exception:
            # Best effort: a page whose item cannot be assembled (for
            # example a non-string publish time) is dropped.
            return

        yield item
Пример #22
0
    def parse(self, response):
        """Build a NewsAllItem from a wenweipo.com article response.

        The publish time comes from ``response.meta["public_time"]``; the
        other fields are extracted from the page. Nothing is yielded unless
        the status is 200, every field was obtained, the content is non-empty
        and the publish time starts with ``spiderUtil.get_first_hour()``.
        Failures are reported via ``spiderUtil.log_level`` with the per-field
        codes used here (6 title, 7 content, 8 publish time, 9 author); any
        other status is reported with the status itself as the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be obtained

        try:
            # extract()[0] is already one string, so no join is needed.
            content = response.xpath("//div[@id='main-content']")[0].xpath(
                'string(.)').extract()[0]
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            public_time = response.meta["public_time"]
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        try:
            # The text is split on ":" and the second part is kept.
            author = response.xpath(
                "//p[@class='fromInfo']/text()").extract()[0].split(":")[1]
        except Exception:
            complete = False
            spiderUtil.log_level(9, response.url)

        try:
            title = response.xpath(
                "//head/meta[@name='title']/@content").extract()[0]
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not public_time.startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.wenweipo.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: a page whose item cannot be assembled is dropped.
            return

        # Hand the item over to the pipelines.
        yield item
Пример #23
0
    def parse(self, response):
        """Build a NewsAllItem from a news.baidu.com article response.

        The publish time is assembled from the current local year plus the
        first two extracted span texts and a ":00" seconds suffix. Nothing is
        yielded unless the status is 200, every field was extracted, the
        content is non-empty and the publish time starts with
        ``spiderUtil.get_first_hour()``. Content, author and title failures
        are reported via ``spiderUtil.log_level`` (codes 7, 9 and 6); a
        publish-time failure is not logged. Any other status is reported with
        the status itself as the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be extracted

        try:
            stamp = response.xpath(
                """//*[@id="article"]/div/div/div/span/text()""").extract()
            public_time = (time.strftime('%Y') + "-" + str(stamp[0]) + " "
                           + str(stamp[1]) + ":00")
        except Exception:
            # This failure is deliberately not logged; the page is skipped.
            complete = False

        try:
            content = "".join(response.xpath(
                """//*[@id="article"]/div/p/span/text()""").extract())
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            author = "".join(response.xpath(
                """//*[@id="article"]/div/div/p/text()""").extract())
        except Exception:
            complete = False
            spiderUtil.log_level(9, response.url)

        try:
            title = "".join(response.xpath(
                """//*[@id="article"]/div/h2/text()""").extract())
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not public_time.startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://news.baidu.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: a page whose item cannot be assembled is dropped.
            return

        yield item
Пример #24
0
    def parsebody(self, response):
        """Build a NewsAllItem from a news.163.com article response.

        The publish time is the first ``YYYY-M-D H:M:S``-shaped match in the
        page text and the title comes from ``response.meta["title"]``.
        Nothing is yielded unless the status is 200, every field was
        extracted, the content is non-empty and the publish time starts with
        ``spiderUtil.get_first_hour()``. Extraction failures are reported via
        ``spiderUtil.log_level`` with the per-field codes used here
        (7 content, 8 publish time, 9 author); any other status is reported
        with the status itself as the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be extracted

        try:
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                response.text).group(0)
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        try:
            content = "".join(response.xpath(
                "//div[@id='endText']/p/text()").extract()).strip()
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            author = "".join(response.xpath(
                "//a[@id='ne_article_source']//text()").extract()).strip()
            if not author:
                author = "网易新闻"
        except Exception:
            complete = False
            spiderUtil.log_level(9, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not public_time.startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "https://news.163.com/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = response.meta["title"]
            item["author"] = author
            item["html_size"] = html_size
            item["crawl_time"] = spiderUtil.get_time()
        except Exception:
            # Best effort: a page whose item cannot be assembled (for
            # example when meta lacks "title") is dropped.
            return

        yield item
Пример #25
0
    def parse(self, response):
        """Build a NewsAllItem from a ce.cn article response.

        The publish time is the first ``YYYY年M月D日 H:M`` match in the page
        text, rewritten with "-" separators and a ":00" seconds suffix.
        Nothing is yielded unless the status is 200, every field was
        extracted, the publish time starts with
        ``spiderUtil.get_first_hour()`` and the content is non-empty.
        Extraction failures are reported via ``spiderUtil.log_level`` with
        the per-field codes used here (6 title, 7 content, 8 publish time,
        9 author); any other status is reported with the status itself as
        the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be extracted

        try:
            content = "".join(response.xpath(
                "//div[@id='articleText']//text()").extract()).strip()
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            stamp = re.search(
                r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})",
                response.text).group(0) + ":00"
            public_time = stamp.replace("年", "-").replace(
                "月", "-").replace("日", "")
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        try:
            # Keep only the part of <title> before the first "_".
            title = response.xpath(
                "//head/title/text()").extract()[0].split("_")[0]
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        try:
            author = response.xpath(
                "//head/meta[@name='author']/@content").extract()[0]
        except Exception:
            complete = False
            spiderUtil.log_level(9, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if not public_time.startswith(
                    spiderUtil.get_first_hour()) or content == "":
                return
            item = NewsAllItem()
            item["source"] = "http://www.ce.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: a page whose item cannot be assembled is dropped.
            return

        # Hand the item over to the pipelines.
        yield item
Пример #26
0
    def parse(self, response):
        """Build a NewsAllItem from a bjnews.com.cn article response.

        The publish time is the first ``YYYY-M-D H:M:S``-shaped match in the
        page text. Nothing is yielded unless the status is 200, every field
        was extracted, the content is non-empty and the publish time starts
        with ``spiderUtil.get_first_hour()``. Extraction failures are
        reported via ``spiderUtil.log_level`` with the per-field codes used
        here (6 title, 7 content, 8 publish time, 9 author); any other status
        is reported with the status itself as the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be extracted

        try:
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                response.text).group(0)
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        try:
            content = "".join(response.xpath(
                "//div[@class='content']/p/text()").extract())
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            author = response.xpath(
                "//span[@class='author']/text()").extract()[0].strip()
        except Exception:
            complete = False
            spiderUtil.log_level(9, response.url)

        try:
            title = response.xpath(
                "//div[@class='title']/h1/text()").extract()[0]
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not public_time.startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.bjnews.com.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: a page whose item cannot be assembled is dropped.
            return

        # Hand the item over to the pipelines.
        yield item
Пример #27
0
    def parse(self, response):
        """Build a NewsAllItem from a 188cf.net article response.

        The content is the joined paragraph text up to (not including) the
        first "推荐信息" marker. Nothing is yielded unless the status is 200,
        every field was extracted, the content is non-empty and the publish
        time starts with ``spiderUtil.get_first_hour()``. Extraction failures
        are reported via ``spiderUtil.log_level`` with the per-field codes
        used here (6 title, 7 content, 8 publish time); any other status is
        reported with the status itself as the code. The author is a fixed
        string.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be extracted

        try:
            info_texts = response.xpath(
                """//div[@class="info"]//text()""").extract()
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                info_texts[0]).group(0)
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        try:
            paragraphs = response.xpath(
                """//td[@class="content"]//p//text()""").extract()
            # extract() returns a list, which has no .split(); the previous
            # code therefore failed on every page. Join first, then cut the
            # text at the marker.
            content = "".join(paragraphs).split('推荐信息')[0]
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            title = response.xpath("""//h1//text()""").extract()[0]
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not str(public_time).startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.188cf.net/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = "188财富网"
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: a page whose item cannot be assembled is dropped.
            return

        yield item
Пример #28
0
    def parse(self, response):
        """Build a NewsAllItem from an e23.cn article response.

        Nothing is yielded unless the status is 200, every field was
        extracted, the content is non-empty and the publish time starts with
        ``spiderUtil.get_first_hour()``. Extraction failures are reported via
        ``spiderUtil.log_level`` with the per-field codes used here
        (6 title, 7 content, 8 publish time, 9 author); any other status is
        reported with the status itself as the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be extracted

        try:
            content = "".join(response.xpath(
                "//div[@class='post_text']/p/text()").extract())
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            author = response.xpath(
                "//head/meta[@name='author']/@content").extract()[0]
        except Exception:
            complete = False
            spiderUtil.log_level(9, response.url)

        try:
            # Drop the last two "-"-separated parts of <title> and glue the
            # remaining parts together without a separator.
            title_parts = response.xpath(
                "//head/title/text()").extract()[0].split("-")[:-2]
            title = "".join(title_parts).strip()
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        try:
            public_time = response.xpath(
                "//div[@class='post_time']/p/text()").extract()[0]
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not public_time.startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.e23.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: a page whose item cannot be assembled is dropped.
            return

        # Hand the item over to the pipelines.
        yield item
Пример #29
0
    def parse(self, response):
        """Build a NewsAllItem from a xuexi.cn data response.

        The body, minus its first 9 and last 1 characters, is evaluated as a
        Python expression in a namespace where every bare name resolves to
        its own spelling, then round-tripped through JSON. That step is not
        guarded, so whatever it raises propagates to the caller. Publish
        time, title and author come from ``response.meta``.

        Nothing is yielded unless the status is 200, the content was read and
        is non-empty, and the publish time starts with
        ``spiderUtil.get_first_hour()``. A content failure is reported via
        ``spiderUtil.log_level`` with code 7; any other status is reported
        with the status itself as the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)

        # SECURITY(review): eval() runs on text received from the network.
        # The name-echoing namespace only neutralises bare identifiers;
        # attribute access on literals is still evaluated. Consider replacing
        # this with a real parser for the payload format.
        name_echo = type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))()
        data = eval(response.text[9:-1], name_echo)
        # The JSON round trip normalises the evaluated object and raises if
        # it is not JSON-serialisable.
        data = json.loads(json.dumps(data))

        try:
            content = str(data['normalized_content'])
        except Exception:
            spiderUtil.log_level(7, response.url)
            # Missing content used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if content == "" or not str(
                    response.meta["public_time"]).startswith(
                        spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.xuexi.cn/"
            item["content"] = content
            item["public_time"] = response.meta["public_time"]
            item["url"] = response.url
            item["title"] = response.meta["title"]
            item["author"] = response.meta["author"]
            item["html_size"] = html_size
            item["crawl_time"] = spiderUtil.get_time()
        except Exception:
            # Best effort: a page whose item cannot be assembled (for
            # example when a meta key is missing) is dropped.
            return

        yield item
Пример #30
0
    def parse(self, response):
        """Build a NewsAllItem from a legaldaily.com.cn article response.

        The publish time is the first ``YYYY-M-D H:M:S``-shaped match in the
        page text. Nothing is yielded unless the status is 200, every field
        was extracted, the content is longer than 80 characters and the
        publish time starts with ``spiderUtil.get_first_hour()``. Extraction
        failures are reported via ``spiderUtil.log_level`` with the per-field
        codes used here (6 title, 7 content, 8 publish time, 9 author); any
        other status is reported with the status itself as the code.
        """
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return

        # sys.getsizeof reports the size of the str object in memory, which
        # is not the byte length of the downloaded body.
        html_size = sys.getsizeof(response.text)
        complete = True  # cleared when any field cannot be extracted

        try:
            # Non-breaking spaces are removed from the joined text.
            content = "".join(response.xpath(
                "//dl/dd[@id='CONTENT']/p//text()").extract()).replace(
                    "\xa0", "")
        except Exception:
            complete = False
            spiderUtil.log_level(7, response.url)

        try:
            title = "".join(response.xpath(
                "//dd[@class='f18 b black02 yh center']//text()"
            ).extract()).strip()
            if title == "":
                # Second selector, tried when the first yields no text.
                title = "".join(response.xpath(
                    "//td[@class='f22 b black02']//text()"
                ).extract()).strip()
        except Exception:
            complete = False
            spiderUtil.log_level(6, response.url)

        try:
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                response.text).group(0)
        except Exception:
            complete = False
            spiderUtil.log_level(8, response.url)

        try:
            # Fall back to a fixed author when the page provides none.
            author = "".join(response.xpath(
                "//dd[@class='f12 black02']//text()"
            ).extract()).strip() or "法制网"
        except Exception:
            complete = False
            spiderUtil.log_level(9, response.url)

        if not complete:
            # A missing field used to surface as a NameError swallowed by a
            # bare except; skip the page explicitly instead.
            return

        try:
            if len(content) <= 80 or not public_time.startswith(
                    spiderUtil.get_first_hour()):
                return
            item = NewsAllItem()
            item["source"] = "http://www.legaldaily.com.cn/"
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
        except Exception:
            # Best effort: a page whose item cannot be assembled is dropped.
            return

        yield item