class TempletePipeline(object):
    """Item pipeline that persists scraped news items into MongoDB.

    Items are written to the ``news`` collection; their ``urlId`` is
    additionally recorded in ``urlid_Collection`` so duplicates can be
    skipped on later runs.
    """

    # NOTE(review): the misspelled name "utlis" is kept — it is a class
    # attribute and therefore part of this class's public surface.
    utlis = utilsModel()

    def __init__(self):
        # Connection parameters come from the project settings.
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        user = settings["MONGODB_USER"]
        password = settings["MONGODB_PASS"]
        # Build the MongoDB connection URI (port must already be a string
        # in settings, since it is concatenated, not formatted).
        uri = 'mongodb://' + user + ':' + password + '@' + host + ':' + port + '/' + dbname
        client = MongoClient(uri)
        # Select the target database.
        mydb = client[dbname]
        # Collections: full news documents, and the set of seen url ids.
        self.collection = mydb['news']
        self.collection_url = mydb['urlid_Collection']

    def process_item(self, item, spider):
        """Insert *item* unless its ``urlId`` has been stored before.

        Always returns the item so downstream pipelines keep running.
        """
        data = dict(item)
        # exists_urlid() returns 0 when the urlId has not been seen yet.
        if self.utlis.exists_urlid(item['urlId']) == 0:
            # FIX: Collection.insert() was deprecated in pymongo 3 and
            # removed in pymongo 4 — use insert_one() instead.
            self.collection.insert_one(data)
            self.collection_url.insert_one({"urlid": item['urlId']})
        else:
            print("已存在")
        return item
class SougouSearchSpider(scrapy.Spider):
    """Spider for chinanews.com search results.

    Work in progress: only URL extraction is active — item population
    and pagination are not implemented yet, so parse() yields nothing.
    """
    name = 'china_news'
    utils = utilsModel()
    allowed_domains = ['chinanews.com']
    # Keyword rotation state (shared convention with the sibling spiders).
    keyIndex = 0
    keyword, keywordCount = utils.get_keyword(keyIndex)
    page = 1
    # Hard-coded query (URL-encoded "曲靖"); keyword is not wired in yet.
    base_url = 'http://sou.chinanews.com/search.do?q=%E6%9B%B2%E9%9D%96'
    start_urls = [base_url]

    def parse(self, response):
        item = TempleteItem()
        try:
            elements = response.xpath('//div[@id="news_list"]/table')
            print(type(elements))
            for each in elements:
                # FIX: the original selector was absolute
                # ('//div[@id="news_list"]/table/tbody/...'), so every
                # iteration extracted the URL of the FIRST table instead
                # of the current one. Select relative to `each`.
                url = each.xpath(
                    './tbody/tr[1]/td[2]/ul/li[1]/a/@href').extract()
                print(url)
        except Exception as e:
            # NOTE(review): broad catch mirrors the original; prefer
            # self.logger once this spider is completed.
            print(e)
class SougouSearchSpider(scrapy.Spider):
    """Spider for the NetEase (news.163.com) latest-news page.

    Work in progress: parse() only selects and prints the list entries;
    item population and pagination are not implemented yet.
    """
    name = 'wangyi_news'
    utils = utilsModel()
    allowed_domains = ['news.163.com']
    # Keyword rotation state (shared convention with the sibling spiders).
    keyIndex = 0
    keyword, keywordCount = utils.get_keyword(keyIndex)
    page = 1
    base_url = 'http://news.163.com/latest/'
    start_urls = [base_url]

    def parse(self, response):
        try:
            # FIX: the original XPath '//div[@id=instantPanel""]/...' is
            # malformed — the quotes sit after the attribute value, so the
            # expression can never match the intended element. Quote the
            # id value properly.
            elements = response.xpath(
                '//div[@id="instantPanel"]/div[@class="cnt"]/ul/li')
            print(elements)
        except Exception as e:
            print(e)
class SougouSearchSpider(scrapy.Spider):
    """Spider for the Sina news search engine.

    Walks the result list for each configured keyword, downloads every
    linked article page and yields one populated TempleteItem per
    result. Pagination: two pages per keyword, then the next keyword.
    """
    name = 'sina_search'
    utils = utilsModel()
    allowed_domains = ['search.sina.com.cn']
    # Keyword rotation state, advanced in parse() as pages are exhausted.
    keyIndex = 0
    keyword, keywordCount = utils.get_keyword(keyIndex)
    page = 1
    base_url = 'https://search.sina.com.cn/?q={}&c=news&time=&a=&page={}'.format(
        keyword, page)
    start_urls = [base_url]

    def parse(self, response):
        item = TempleteItem()
        try:
            elements = response.xpath('//div[@class="box-result clearfix"]')
            for each in elements:
                # FIX: extract_first() yields the URL string directly.
                # The original str(list).replace("[']", ...) trick turned
                # an empty match into the literal text '[]', so the
                # "is not None" guard never fired and requests.get was
                # called with garbage.
                seedUrl = each.xpath(
                    './h2/a/@href | ./div/h2/a/@href').extract_first()
                if not seedUrl:
                    # Skip link-less result blocks instead of aborting the
                    # whole page (the original did `return None`).
                    continue
                # FIX: add a timeout so one stalled article page cannot
                # hang the spider indefinitely.
                pagehtml = requests.get(url=seedUrl, timeout=10)
                title = ''.join(
                    each.xpath(
                        './h2/a//text() | ./div/h2/a//text()').extract()
                ).strip()
                sources = each.xpath(
                    './h2/span/text() | ./div/h2/span/text()').extract()
                # Source = the CJK characters of the "source / date" span.
                source = ''.join(re.findall(r'[\u4e00-\u9fa5]', str(sources)))
                # Release time = the "Y-m-d H:M:S" timestamp in that span.
                releaseTime = ''.join(
                    re.findall(
                        r'\d{1,4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2}',
                        str(sources)))
                content = ''.join(
                    each.xpath('./div/p//text()').extract()).strip()
                item['Id'] = '1801999'
                item['indexFlag'] = False
                item['originweb'] = '新浪新闻搜索'
                item['seedUrl'] = seedUrl
                item['urlId'] = self.utils.encrypt_url(str(seedUrl))
                item['title'] = title
                item['content'] = content
                item['source'] = source
                item['type'] = '新闻'
                item['releaseTime'] = releaseTime
                item['url_id'] = '1000'
                item['url_key'] = '1801999'
                # NOTE(review): str(pagehtml) is "<Response [200]>", not
                # the page body — pagehtml.text was probably intended;
                # kept as-is pending confirmation.
                item['pagehtml'] = str(pagehtml)
                item['collectionTime'] = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                yield item
            if self.page < 2:
                # Request the next result page for the current keyword.
                self.page = self.page + 1
                yield scrapy.Request(
                    'https://search.sina.com.cn/?q={}&c=news&time=&a=&page={}'
                    .format(self.keyword, self.page))
            else:
                if self.keyIndex < self.keywordCount:
                    # Current keyword exhausted: restart with the next one.
                    self.page = 1
                    self.keyIndex = self.keyIndex + 1
                    self.keyword, _ = self.utils.get_keyword(self.keyIndex)
                    yield scrapy.Request(
                        'https://search.sina.com.cn/?q={}&c=news&time=&a=&page={}'
                        .format(self.keyword, self.page))
        except Exception as e:
            # NOTE(review): broad catch mirrors the original; it hides
            # parse errors — consider self.logger.exception instead.
            print(e)
class SougouSearchSpider(scrapy.Spider):
    """Spider for the Sogou news search engine.

    Walks the result list for each configured keyword, downloads every
    linked article page and yields one populated TempleteItem per
    result. Pagination: two pages per keyword, then the next keyword.
    """
    name = 'sougou_search'
    utils = utilsModel()
    allowed_domains = ['news.sogou.com']
    # Keyword rotation state, advanced in parse() as pages are exhausted.
    keyIndex = 0
    keyword, keywordCount = utils.get_keyword(keyIndex)
    page = 1
    base_url = 'https://news.sogou.com/news?mode=1&query={}&page={}'.format(
        keyword, page)
    start_urls = [base_url]

    def parse(self, response):
        item = TempleteItem()
        try:
            elements = response.xpath('//div[@class="vrwrap"]/div')
            for each in elements:
                # Result link of the current entry (None when absent).
                Url = each.xpath("./h3/a/@href").extract_first()
                if Url is None:
                    # FIX: the original only printed and fell through, so
                    # str(None) became the URL, requests.get('None')
                    # raised, and the outer except aborted the remainder
                    # of the page. Skip the entry instead.
                    print('无效网页!!!')
                    continue
                seedUrl = str(Url).replace('\']', '').replace('[\'', '')
                # FIX: add a timeout so one stalled article page cannot
                # hang the spider indefinitely.
                pagehtml = requests.get(url=seedUrl, timeout=10)
                # (The repeated dead `if seedUrl == None` guards were
                # removed — seedUrl is always a str at this point.)
                title = ''.join(each.xpath('./h3/a//text()').extract()).strip()
                sources = each.xpath('.//div/div/p[1]/text()').extract()
                # Source = the CJK characters of the "source / date" line.
                source = ''.join(
                    re.findall(r'[\u4e00-\u9fa5]', str(sources))).strip()
                # Release date = the first Y-m-d found in the same line
                # (XPath hoisted: the original evaluated it twice).
                js_time = re.findall(r'\d{1,4}-\d{1,2}-\d{1,2}', str(sources))
                releaseTime = str(js_time).replace('[\'', '').replace('\']', '')
                content = ''.join(
                    each.xpath('.//p[2]/span//text()').extract()).strip()
                item['Id'] = '1801999'
                item['indexFlag'] = False
                item['originweb'] = '搜狗新闻搜索'
                item['seedUrl'] = seedUrl
                item['urlId'] = self.utils.encrypt_url(str(seedUrl))
                item['title'] = title
                item['content'] = content
                item['source'] = source
                item['type'] = '新闻'
                item['releaseTime'] = releaseTime
                item['url_id'] = '1000'
                item['url_key'] = '1801999'
                # NOTE(review): str(pagehtml) is "<Response [200]>", not
                # the page body — pagehtml.text was probably intended;
                # kept as-is pending confirmation.
                item['pagehtml'] = str(pagehtml)
                item['collectionTime'] = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                yield item
            if self.page < 2:
                # Request the next result page for the current keyword.
                self.page = self.page + 1
                yield scrapy.Request(
                    'https://news.sogou.com/news?mode=1&query={}&page={}'.format(
                        self.keyword, self.page))
            else:
                if self.keyIndex < self.keywordCount:
                    # Current keyword exhausted: restart with the next one.
                    self.page = 1
                    self.keyIndex = self.keyIndex + 1
                    self.keyword, _ = self.utils.get_keyword(self.keyIndex)
                    print(self.keyword)
                    yield scrapy.Request(
                        'https://news.sogou.com/news?mode=1&query={}&page={}'.format(
                            self.keyword, self.page))
        except Exception as e:
            # NOTE(review): broad catch mirrors the original; it hides
            # parse errors — consider self.logger.exception instead.
            print(e)
class WeixinArticleSpider(scrapy.Spider):
    """Spider for Sogou's WeChat-article search.

    Walks the result list for each configured keyword, downloads every
    linked article page and yields one populated TempleteItem per
    result. Pagination: two pages per keyword, then the next keyword.
    """
    utils = utilsModel()
    name = 'weixin_article'
    # Keyword rotation state, advanced in parse() as pages are exhausted.
    keyIndex = 0
    keyword, keywordCount = utils.get_keyword(keyIndex)
    print(keyword)
    page = 1
    base_url = 'https://weixin.sogou.com/weixin?type=2&query={}&ie=utf8&s_from=input&_sug_=n&_sug_type_=1&page={}'.format(
        keyword, page)
    start_urls = [base_url]

    def parse(self, response):
        elements = response.xpath("//div[@class='txt-box']")
        item = TempleteItem()
        for each in elements:
            # FIX: extract_first() instead of extract()[0] — the original
            # raised IndexError on a link-less result box, killing the
            # whole callback.
            Url = each.xpath("./h3/a/@href").extract_first()
            if not Url:
                continue
            seedUrl = str(Url).replace('\']', '').replace('[\'', '')
            # FIX: add a timeout so one stalled article page cannot hang
            # the spider indefinitely.
            pagehtml = requests.get(url=seedUrl, timeout=10)
            source = each.xpath(
                "./div[@class='s-p']/a/text()").extract_first(default='')
            # Title / snippet may span several text nodes; concatenate
            # the stripped pieces.
            title = ""
            content = ""
            for sub_title in each.xpath("./h3/a//text()").extract():
                title = title + sub_title.strip()
            for sub_content in each.xpath("./p//text()").extract():
                content = content + sub_content.strip()
            # The publish time is embedded in an inline <script>; the
            # project helper converts it to (string, long) forms.
            js_time = each.xpath(
                ".//span[@class='s2']/script/text()").extract_first()
            if js_time is None:
                # FIX: the original extract()[0] would have raised here.
                continue
            releaseTime, releaseTimeLong = DateFormatHelper.js_To_Date(js_time)
            item['Id'] = '1801999'
            item['indexFlag'] = False
            item['originweb'] = '搜狗微信搜索'
            item['seedUrl'] = seedUrl
            item['urlId'] = self.utils.encrypt_url(str(seedUrl))
            item['title'] = title
            item['content'] = content
            item['source'] = source
            item['type'] = '微信'
            item['releaseTime'] = releaseTime
            item['url_id'] = '1000'
            item['url_key'] = '1801999'
            # NOTE(review): str(pagehtml) is "<Response [200]>", not the
            # page body — pagehtml.text was probably intended; kept
            # as-is pending confirmation.
            item['pagehtml'] = str(pagehtml)
            item['releaseTimeLong'] = releaseTimeLong
            # FIX: every sibling spider in this file calls
            # datetime.datetime.now(), implying `import datetime`; under
            # that import the bare datetime.now() here raises
            # AttributeError. TODO(review): confirm the file's imports.
            item['collectionTime'] = datetime.datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S")
            yield item
        if self.page < 2:
            # Request the next result page for the current keyword.
            self.page = self.page + 1
            yield scrapy.Request(
                'https://weixin.sogou.com/weixin?type=2&query={}&ie=utf8&s_from=input&_sug_=n&_sug_type_=1&page={}'
                .format(self.keyword, self.page))
        else:
            if self.keyIndex < self.keywordCount:
                # Current keyword exhausted: restart with the next one.
                self.page = 1
                self.keyIndex = self.keyIndex + 1
                self.keyword, _ = self.utils.get_keyword(self.keyIndex)
                yield scrapy.Request(
                    'https://weixin.sogou.com/weixin?type=2&query={}&ie=utf8&s_from=input&_sug_=n&_sug_type_=1&page={}'
                    .format(self.keyword, self.page))