Example #1
from pymongo import MongoClient  # MongoDB client used below
# settings and utilsModel are assumed to be provided by the project's own modules

class TempletePipeline(object):
    utils = utilsModel()

    def __init__(self):
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        user = settings["MONGODB_USER"]
        password = settings["MONGODB_PASS"]

        # Create the MongoDB connection
        # client = pymongo.MongoClient(host=host, port=port)
        uri = 'mongodb://{}:{}@{}:{}/{}'.format(user, password, host, port, dbname)
        # uri = 'mongodb://*****:*****@222.197.219.11:27017/zhdb_FaYun'
        client = MongoClient(uri)

        # Select the database
        mydb = client[dbname]
        # Collections (tables) used to store the scraped data
        self.collection = mydb['news']
        self.collection_url = mydb['urlid_Collection']
        # doc = self.collection.find()

    def process_item(self, item, spider):
        data = dict(item)
        # Insert only if the urlId has not been stored yet
        if self.utils.exists_urlid(item['urlId']) == 0:
            # If the news title is not empty, check whether the current
            # item['urlId'] already exists in the urlid_Collection collection
            self.collection.insert_one(data)
            self.collection_url.insert_one({"urlid": item['urlId']})
        else:
            print("已存在")
        return item
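
A pipeline like this one is only invoked when it is registered in the project's Scrapy settings. As a point of reference, a minimal settings sketch is shown below; the module path, database name and credentials are placeholders, not values taken from the original project.

# settings.py (sketch, assumed values)
ITEM_PIPELINES = {
    'templete.pipelines.TempletePipeline': 300,  # hypothetical module path
}
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'newsdb'   # placeholder database name
MONGODB_USER = 'user'       # placeholder credentials
MONGODB_PASS = 'password'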
Example #2
class SougouSearchSpider(scrapy.Spider):
    name = 'china_news'
    utils = utilsModel()
    allowed_domains = ['chinanews.com']
    keyIndex = 0
    keyword, keywordCount = utils.get_keyword(keyIndex)
    page = 1
    # base_url = 'http://news.163.com/latest/'.format(
    #     keyword, page)
    base_url = 'http://sou.chinanews.com/search.do?q=%E6%9B%B2%E9%9D%96'
    start_urls = [base_url]

    def parse(self, response):
        item = TempleteItem()
        try:
            elements = response.xpath('//div[@id="news_list"]/table')
            print(type(elements))
            for each in elements:
                # Use a path relative to the current table; the original absolute
                # XPath selects the same node on every loop iteration
                url = each.xpath(
                    './/tr[1]/td[2]/ul/li[1]/a/@href'
                ).extract()
                print(url)
                # seedUrl = str(url).replace('\']', '').replace('[\'', '')
                # print(seedUrl)

                # item['Id'] = '1801999'
                # item['indexFlag'] = False
                # item['originweb'] = '新浪新闻搜索'
                # item['seedUrl'] = seedUrl
                # item['urlId'] = self.utils.encrypt_url(str(seedUrl))
                # item['title'] = title
                # item['content'] = content
                # item['source'] = source
                # item['type'] = '新闻'
                # item['releaseTime'] = releaseTime
                # item['url_id'] = '1000'
                # item['url_key'] = '1801999'
                # item['pagehtml'] = str(pagehtml)
                # # item['releaseTimeLong'] = releaseTimeLong
                # item['collectionTime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                # yield item

            # if self.page < 2:
            #     self.page = self.page + 1
            #     yield scrapy.Request(
            #         'https://search.sina.com.cn/?q={}&c=news&time=&a=&page={}'.format(
            #             self.keyword, self.page))
            # else:
            #     if self.keyIndex < self.keywordCount:
            #         # When the current keyword is exhausted, request the next keyword
            #         self.page = 1
            #         self.keyIndex = self.keyIndex + 1
            #         self.keyword, _ = self.utils.get_keyword(self.keyIndex)
            #         yield scrapy.Request(
            #             'https://search.sina.com.cn/?q={}&c=news&time=&a=&page={}'.format(
            #                 self.keyword, self.page))
        except Exception as e:
            print(e)
Example #3
class SougouSearchSpider(scrapy.Spider):
    name = 'wangyi_news'
    utils = utilsModel()
    allowed_domains = ['news.163.com']
    keyIndex = 0
    keyword, keywordCount = utils.get_keyword(keyIndex)
    page = 1
    # base_url = 'http://news.163.com/latest/'.format(
    #     keyword, page)
    base_url = 'http://news.163.com/latest/'
    start_urls = [base_url]

    def parse(self, response):
        # item = TempleteItem()
        try:
            elements = response.xpath(
                '//div[@id="instantPanel"]/div[@class="cnt"]/ul/li')
            print(elements)

            # item['Id'] = '1801999'
            # item['indexFlag'] = False
            # item['originweb'] = '新浪新闻搜索'
            # item['seedUrl'] = seedUrl
            # item['urlId'] = self.utils.encrypt_url(str(seedUrl))
            # item['title'] = title
            # item['content'] = content
            # item['source'] = source
            # item['type'] = '新闻'
            # item['releaseTime'] = releaseTime
            # item['url_id'] = '1000'
            # item['url_key'] = '1801999'
            # item['pagehtml'] = str(pagehtml)
            # # item['releaseTimeLong'] = releaseTimeLong
            # item['collectionTime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # yield item

            # if self.page < 2:
            #     self.page = self.page + 1
            #     yield scrapy.Request(
            #         'https://search.sina.com.cn/?q={}&c=news&time=&a=&page={}'.format(
            #             self.keyword, self.page))
            # else:
            #     if self.keyIndex < self.keywordCount:
            #         # When the current keyword is exhausted, request the next keyword
            #         self.page = 1
            #         self.keyIndex = self.keyIndex + 1
            #         self.keyword, _ = self.utils.get_keyword(self.keyIndex)
            #         yield scrapy.Request(
            #             'https://search.sina.com.cn/?q={}&c=news&time=&a=&page={}'.format(
            #                 self.keyword, self.page))
        except Exception as e:
            print(e)
Example #4
import datetime
import re

import requests
import scrapy
# TempleteItem and utilsModel are assumed to come from the project's own modules

class SougouSearchSpider(scrapy.Spider):
    name = 'sina_search'
    utils = utilsModel()
    allowed_domains = ['search.sina.com.cn']
    keyIndex = 0
    keyword, keywordCount = utils.get_keyword(keyIndex)
    page = 1
    base_url = 'https://search.sina.com.cn/?q={}&c=news&time=&a=&page={}'.format(
        keyword, page)
    start_urls = [base_url]

    def parse(self, response):
        item = TempleteItem()
        try:
            elements = response.xpath('//div[@class="box-result clearfix"]')
            # print(elements)
            for each in elements:
                # Take the first matching link; extract_first() returns None if nothing matched
                seedUrl = each.xpath('./h2/a/@href | ./div/h2/a/@href').extract_first()
                # print(seedUrl)
                if seedUrl is not None:
                    pagehtml = requests.get(url=seedUrl)
                    # print(str(pagehtml.text))
                else:
                    # Skip results without a link instead of ending the whole parse
                    continue
                title = ''.join(
                    each.xpath('./h2/a//text() | ./div/h2/a//text()').extract(
                    )).strip()
                # print(title)
                sources = each.xpath(
                    './h2/span/text() | ./div/h2/span/text()').extract()
                source = ''.join(re.findall(r'[\u4e00-\u9fa5]', str(sources)))
                # print(source)
                time = ''.join(
                    re.findall(
                        r'\d{1,4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2}',
                        str(sources)))
                releaseTime = time
                # Normalize the time format
                # releaseTime, releaseTimeLong = DateFormatHelper.js_To_Date(time)
                # print(js_time)
                content = ''.join(
                    each.xpath('./div/p//text()').extract()).strip()
                # print(content)

                item['Id'] = '1801999'
                item['indexFlag'] = False
                item['originweb'] = '新浪新闻搜索'
                item['seedUrl'] = seedUrl
                item['urlId'] = self.utils.encrypt_url(str(seedUrl))
                item['title'] = title
                item['content'] = content
                item['source'] = source
                item['type'] = '新闻'
                item['releaseTime'] = releaseTime
                item['url_id'] = '1000'
                item['url_key'] = '1801999'
                item['pagehtml'] = str(pagehtml)
                # item['releaseTimeLong'] = releaseTimeLong
                item['collectionTime'] = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                yield item

            if self.page < 2:
                self.page = self.page + 1
                yield scrapy.Request(
                    'https://search.sina.com.cn/?q={}&c=news&time=&a=&page={}'.
                    format(self.keyword, self.page))
            else:
                if self.keyIndex < self.keywordCount:
                    # When the current keyword is exhausted, request the first page of the next keyword
                    self.page = 1
                    self.keyIndex = self.keyIndex + 1
                    self.keyword, _ = self.utils.get_keyword(self.keyIndex)
                    yield scrapy.Request(
                        'https://search.sina.com.cn/?q={}&c=news&time=&a=&page={}'
                        .format(self.keyword, self.page))
        except Exception as e:
            print(e)
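
The spiders in these examples all populate a TempleteItem, whose definition is not included here. Judging from the fields they assign, a minimal sketch of the item class could look like the following; the field set is inferred from usage, and the actual definition may differ.

import scrapy

class TempleteItem(scrapy.Item):
    # Field names inferred from the assignments in the spiders above
    Id = scrapy.Field()
    indexFlag = scrapy.Field()
    originweb = scrapy.Field()
    seedUrl = scrapy.Field()
    urlId = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    source = scrapy.Field()
    type = scrapy.Field()
    releaseTime = scrapy.Field()
    releaseTimeLong = scrapy.Field()
    url_id = scrapy.Field()
    url_key = scrapy.Field()
    pagehtml = scrapy.Field()
    collectionTime = scrapy.Field()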
Example #5
class SougouSearchSpider(scrapy.Spider):
    name = 'sougou_search'
    utils = utilsModel()
    allowed_domains = ['news.sogou.com']
    keyIndex = 0
    keyword, keywordCount = utils.get_keyword(keyIndex)
    # print(keyword)
    page = 1
    base_url = 'https://news.sogou.com/news?mode=1&query={}&page={}'.format(
        keyword, page)
    start_urls = [base_url]

    def parse(self, response):
        item = TempleteItem()
        try:
            elements = response.xpath('//div[@class="vrwrap"]/div')
            for each in elements:
                # Get the result URL; skip the entry if there is none
                seedUrl = each.xpath("./h3/a/@href").extract_first()
                if seedUrl is None:
                    print('无效网页!!!')  # invalid page: no link found
                    continue
                # print(seedUrl)
                pagehtml = requests.get(url=seedUrl)
                # print(str(pagehtml.text))

                # Get the title
                title = ''.join(each.xpath('./h3/a//text()').extract()).strip()
                # print(title)

                # Get the source: keep only the Chinese characters of the byline
                sources = each.xpath('.//div/div/p[1]/text()').extract()
                source = ''.join(re.findall(r'[\u4e00-\u9fa5]', str(sources))).strip()

                # Get the publication date from the same byline text
                js_time = re.findall(r'\d{1,4}-\d{1,2}-\d{1,2}',
                                     str(each.xpath('.//div/div/p[1]/text()').extract()))
                releaseTime = str(js_time).replace('[\'', '').replace('\']', '')
                # print(releaseTime)
                # Pass the string to a helper to normalize the time format if needed
                # releaseTime, releaseTimeLong = DateFormatHelper.js_To_Date(js_time)

                # Get the summary text of the result
                content = ''.join(each.xpath('.//p[2]/span//text()').extract()).strip()
                # print(content)

                item['Id'] = '1801999'
                item['indexFlag'] = False
                item['originweb'] = '搜狗新闻搜索'
                item['seedUrl'] = seedUrl
                item['urlId'] = self.utils.encrypt_url(str(seedUrl))
                item['title'] = title
                item['content'] = content
                item['source'] = source
                item['type'] = '新闻'
                item['releaseTime'] = releaseTime
                # item['releaseTimeLong'] = releaseTimeLong
                item['url_id'] = '1000'
                item['url_key'] = '1801999'
                item['pagehtml'] = str(pagehtml)
                item['collectionTime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                yield item

            # # Request the next page of results
            # if len(response.xpath('//*[@id="pagebar_container"]/a[@id="sogou_next"]/@href')) == 0:
            #     print("当前网站爬取结束!!!")
            # else:
            #     urlnext_page = response.xpath('//*[@id="pagebar_container"]/a[@id="sogou_next"]/@href').extract()[0]
            #     yield scrapy.Request(url="https://news.sogou.com/news" + urlnext_page, callback=self.parse)

            if self.page < 2:
                self.page = self.page + 1
                yield scrapy.Request(
                    'https://news.sogou.com/news?mode=1&query={}&page={}'.format(
                        self.keyword, self.page))
            else:
                if self.keyIndex < self.keywordCount:
                    # When the current keyword is exhausted, request the first page of the next keyword
                    self.page = 1
                    self.keyIndex = self.keyIndex + 1
                    self.keyword, _ = self.utils.get_keyword(self.keyIndex)
                    print(self.keyword)
                    yield scrapy.Request(
                        'https://news.sogou.com/news?mode=1&query={}&page={}'.format(
                            self.keyword, self.page))


        except Exception as e:
            print(e)
Example #6
class WeixinArticleSpider(scrapy.Spider):
    utils = utilsModel()
    name = 'weixin_article'
    # allowed_domains = ['weixin.sogou.com']
    keyIndex = 0
    keyword, keywordCount = utils.get_keyword(keyIndex)
    print(keyword)
    page = 1
    base_url = 'https://weixin.sogou.com/weixin?type=2&query={}&ie=utf8&s_from=input&_sug_=n&_sug_type_=1&page={}'.format(
        keyword, page)
    start_urls = [base_url]

    def parse(self, response):
        # Get all the title fragments and join them into a single string
        elements = response.xpath("//div[@class='txt-box']")
        item = TempleteItem()
        for each in elements:
            # Get the title and link for this entry
            title = ""
            content = ""
            seedUrl = each.xpath("./h3/a/@href").extract_first()
            # print(seedUrl)
            if seedUrl is not None:
                pagehtml = requests.get(url=seedUrl)
                # print(str(pagehtml.text))
            else:
                # Skip entries without a link instead of ending the whole parse
                continue
            # Source account that published the article
            source = each.xpath("./div[@class='s-p']/a/text()").extract_first()
            for sub_title in each.xpath("./h3/a//text()").extract():
                sub_title = sub_title.strip()
                title = title + sub_title
            # Get the content (summary text)
            for sub_content in each.xpath("./p//text()").extract():
                sub_content = sub_content.strip()
                content = content + sub_content
            # Parse the time: pass the raw string to a helper that returns a formatted date
            js_time = each.xpath(
                ".//span[@class='s2']/script/text()").extract()[0]
            # print(js_time)
            # Normalize the time format
            releaseTime, releaseTimeLong = DateFormatHelper.js_To_Date(js_time)
            item['Id'] = '1801999'
            item['indexFlag'] = False
            item['originweb'] = '搜狗微信搜索'
            item['seedUrl'] = seedUrl
            item['urlId'] = self.utils.encrypt_url(str(seedUrl))
            item['title'] = title
            item['content'] = content
            item['source'] = source
            item['type'] = '微信'
            item['releaseTime'] = releaseTime
            item['url_id'] = '1000'
            item['url_key'] = '1801999'
            item['pagehtml'] = str(pagehtml)
            item['releaseTimeLong'] = releaseTimeLong
            item['collectionTime'] = datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S")
            yield item
        if self.page < 2:
            self.page = self.page + 1
            yield scrapy.Request(
                'https://weixin.sogou.com/weixin?type=2&query={}&ie=utf8&s_from=input&_sug_=n&_sug_type_=1&page={}'
                .format(self.keyword, self.page))
        else:
            if self.keyIndex < self.keywordCount:
                # When the current keyword is exhausted, request the first page of the next keyword
                self.page = 1
                self.keyIndex = self.keyIndex + 1
                self.keyword, _ = self.utils.get_keyword(self.keyIndex)
                yield scrapy.Request(
                    'https://weixin.sogou.com/weixin?type=2&query={}&ie=utf8&s_from=input&_sug_=n&_sug_type_=1&page={}'
                    .format(self.keyword, self.page))
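
All of the examples rely on a shared utilsModel helper that is not shown in these snippets. The sketch below only illustrates the interface the code calls (get_keyword, encrypt_url, exists_urlid); the bodies are placeholder assumptions, not the project's real implementation.

import hashlib

class utilsModel(object):
    def __init__(self):
        # Placeholder keyword list; the project presumably loads its
        # keywords from a database or configuration file
        self.keywords = ['placeholder-keyword']

    def get_keyword(self, index):
        # Return the keyword at `index` plus the total keyword count,
        # matching the `keyword, keywordCount = utils.get_keyword(0)` usage above
        return self.keywords[index], len(self.keywords)

    def encrypt_url(self, url):
        # One plausible way to derive a stable urlId from a URL
        return hashlib.md5(url.encode('utf-8')).hexdigest()

    def exists_urlid(self, urlid):
        # Should return 0 when the urlid has not been stored yet; the
        # pipeline in Example #1 compares the return value to 0
        return 0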