class B2bNewSpider(Spider):
    name = "b2bnew"
    domain_url = "http://b2b.10086.cn/"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(B2bNewSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log('---stopped---')

    # URL persistence
    def getStartUrl(self):
        # initialize the start URL (vendor notice list)
        url = 'http://b2b.10086.cn/b2b/main/listVendorNotice.html?noticeType=2'
        self.start_urls.append(url)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # extract and parse the notice list
        items = self.parse_items(response)
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)

    def parse_content(self, response):
        item = response.meta['item']
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            try:
                content = bsoup.select('div#mobanDiv')[0]
            except:
                content = self.dc.process(str(response.body))
            item['content'] = content
            print 'url: ' + item['url'] + ' is added'
            return item

    def parse_items(self, response):
        elem_list = []
        items = []
        url = "http://b2b.10086.cn/b2b/main/listVendorNoticeResult.html?noticeBean.noticeType=2"
        data = ("&page.currentPage=1&page.perPageSize=50&noticeBean.sourceCH="
                "&noticeBean.source=&noticeBean.title="
                "&noticeBean.startDate=&noticeBean.endDate=")
        elem_list = re.findall('<tr(.*?)</tr>', re.sub('\s', '', requests.post(url + data).text))
        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'news'
                item['source'] = '中国移动采购与招标'
                item['channel'] = 'Search engine'
                if elem.find("onmouseout") < 0:
                    continue
                itemID = re.search("selectResult\(\'([\d]+?)\'\)", elem).group(1)
                item['url'] = ('http://b2b.10086.cn/b2b/main/viewNoticeContent.html?'
                               + 'noticeBean.id=' + itemID)
                if self.r.exists(item['url']):
                    continue
                res = re.findall('<td.*?</td>', elem)
                item['medianame'] = re.sub('<.*?>', '', res[0])
                item['title'] = re.sub('<.*?>', '', res[2])
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                item['pubtime'] = re.sub('<.*?>', '', res[-1]) + item['collecttime'][-6:]
                if self.tool.old_news(item['pubtime']):
                    continue
                items.append(item)
        return items
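# All of the spiders in this listing skip a result when self.r.exists(item['url'])
# is true, i.e. they assume some pipeline (not shown here) writes each stored URL
# into Redis db 3 as a plain key. The helper below is only a sketch of that assumed
# convention; the key layout and TTL are guesses, not the project's actual pipeline.
def mark_url_seen(redis_conn, url, ttl_days=30):
    # hypothetical writer side of the r.exists(url) check used by the spiders
    redis_conn.set(url, 1)
    redis_conn.expire(url, ttl_days * 86400)
    # afterwards redis_conn.exists(url) is true, so the spiders skip this URL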
class TiebaBBSSpider(Spider):
    name = "tiebabbs"
    domain_url = "http://tieba.baidu.com"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(TiebaBBSSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()

    # URL persistence
    def getStartUrl(self):
        # initialize the query keywords from file
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                # sorted by time by default
                query_url = "/f/search/res?ie=utf-8&rn=20&qw=" + urllib.quote(query.encode('utf8')) + '&ct=0'
                self.start_urls.append(self.domain_url + query_url)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # extract and parse the search results
        items = self.parse_items(response)
        # build an XPath Selector for element extraction
        sel = Selector(response)
        # try to find the next page
        requests = []
        try:
            url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url + url))
        except:
            pass
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            # 'continue' skips every queued next-page request, so pagination is effectively disabled here
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            if bsoup.find('h1', class_='core_title_txt'):
                item['title'] = bsoup.find('h1', class_='core_title_txt')['title']
            elif bsoup.find('h3', class_='core_title_txt'):
                item['title'] = bsoup.find('h3', class_='core_title_txt')['title']
            else:
                return
            # keep the earliest timestamp found on the page as the publish time
            timeform = '%Y-%m-%d %H:%M'
            pubtimes = [time.strptime(item['pubtime'], timeform)]
            for pubtime in re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}', str(bsoup)):
                pubtimes.append(time.strptime(pubtime, timeform))
            item['pubtime'] = time.strftime(timeform, min(pubtimes))
            if self.tool.old_news(item['pubtime']):
                print item['url'] + ' ' + item['pubtime']
                return
            item['content'] = []
            for elem in bsoup.find_all('div', class_='d_post_content'):
                item['content'].append(str(elem.extract()))
                # only get the first floor
                break
            if item:
                item['content'] = ' '.join(item['content']).encode('utf8')
                item['content'] = self.dc.process(item['content'])
                print 'url: ' + item['url'] + ' is added'
                yield item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            main_content = bsoup.find('div', class_='s_post_list')
            items = []
            if main_content:
                elem_list = main_content.find_all('div', class_='s_post')
            else:
                return items
            if len(elem_list) > 0:
                for elem in elem_list:
                    item = DataItem()
                    item['dtype'] = 'forum'
                    item['source'] = '百度贴吧'
                    item['channel'] = 'Search engine'
                    try:
                        item['pubtime'] = elem.find('font', class_='p_date').get_text()
                        if self.tool.old_news(item['pubtime']):
                            continue
                        #item['title'] = elem.span.a.get_text()
                        item['medianame'] = elem.find('font', class_='p_violet').get_text()
                        item['abstract'] = elem.find('div', class_='p_content').get_text()
                    except:
                        continue
                    item['url'] = self.domain_url + re.findall('(/p/.*?)[^\d]', elem.span.a['href'])[0]
                    if self.r.exists(item['url']):
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                        continue
                    item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                    items.append(item)
            # de-duplicate by URL
            new_items = []
            url_list = []
            for item in items:
                if item['url'] not in url_list:
                    new_items.append(item)
                    url_list.append(item['url'])
            items = new_items
            return items
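# In TiebaBBSSpider.parse_content the publish time is taken as the earliest
# "YYYY-MM-DD HH:MM" string found on the thread page: struct_time tuples compare
# field by field (year, month, day, ...), so min() yields the oldest timestamp.
# A small standalone illustration with made-up values:
def earliest_timestamp(stamps, timeform='%Y-%m-%d %H:%M'):
    parsed = [time.strptime(s, timeform) for s in stamps]
    return time.strftime(timeform, min(parsed))
    # earliest_timestamp(['2016-03-02 10:00', '2016-02-28 09:30']) -> '2016-02-28 09:30'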
class SogouNewSpider(Spider):
    name = "sogounew"
    domain_url = "http://news.sogou.com/news"
    start_urls = []
    tool = Utools()
    dc = dataCleaner()
    test_hbase = True

    def __init__(self):
        super(SogouNewSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()

    # URL persistence
    def getStartUrl(self):
        # initialize the query keywords from file
        #sort_by_time = '&sort=1'
        sort_by_time = ''
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                query_url = '?query=' + urllib.quote(query.encode('utf8')) + sort_by_time
                self.start_urls.append(self.domain_url + query_url)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # the query was not passed through successfully
        if response.url == self.domain_url:
            print 'error of query'
            return
        # extract and parse the search results
        items = self.parse_items(response)
        # build an XPath Selector for element extraction
        sel = Selector(response)
        # extract next-page links from the result page
        requests = []
        for url in sel.xpath(u'//a[@class="np"]/@href').extract():
            requests.append(self.make_requests_from_url(self.domain_url + url))
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            # 'continue' skips every queued next-page request, so pagination is effectively disabled here
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        try:
            charset = response.encoding
        except:
            charset = 'utf-8'
        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
                item['content'] = self.dc.process(str(bsoup).decode(charset))
            except:
                bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
                item['content'] = self.dc.process(str(bsoup))
            # fall back to the abstract if the cleaned page text is shorter than it
            if len(item['content'].encode('utf8')) < len(item['abstract']):
                item['content'] = item['abstract'].replace('百度快照', '')
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self, response):
        if response.body:
            # strip interfering <!...> fragments
            res = re.sub(r'<!.*?>', '', response.body)
            bsoup = BeautifulSoup(res, from_encoding='utf8')
            main_content = bsoup.select('div#wrapper')[0]
            if main_content:
                elem_list = main_content.find_all('div', class_='rb')
                items = []
                if len(elem_list) > 0:
                    for elem in elem_list:
                        item = DataItem()
                        item['dtype'] = 'news'
                        item['source'] = '搜狗新闻'
                        item['channel'] = 'Search engine'
                        if elem.h3.a.get_text():
                            item['title'] = elem.h3.a.get_text()
                        else:
                            continue
                        item['url'] = elem.h3.a['href']
                        author = elem.cite.get_text()
                        if len(author.split()) > 1:
                            item['medianame'] = author.split()[0]
                            item['pubtime'] = ' '.join(author.split()[1:])
                            if self.tool.old_news(item['pubtime']):
                                continue
                        else:
                            item['source'] = author.split()[0]
                        if item['url'].find("html?") > 0 or item['url'].find("htm?") > 0:
                            item['url'] = "".join(item['url'].split("?")[0:-1])
                        if self.r.exists(item['url']):
                        #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                            continue
                        try:
                            item['source'] = self.tool.get_realname(item['medianame'])
                            item['medianame'] = ' '
                        except:
                            pass
                        item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                        item['abstract'] = elem.find('div', class_='ft').get_text()
                        items.append(item)
                return items
class BaiduNewSpider(Spider):
    name = "baidunew"
    domain_url = "http://news.baidu.com"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(BaiduNewSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log('---stopped---')

    # URL persistence
    def getStartUrl(self):
        # initialize the query keywords from file
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                # sorted by time by default
                query_url = "/ns?rn=20&word=" + urllib.quote(query.encode('utf8')) + '&ct=0'
                self.start_urls.append(self.domain_url + query_url)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # extract and parse the search results
        items = self.parse_items(response)
        # build an XPath Selector for element extraction
        sel = Selector(response)
        # try to find the next page
        requests = []
        try:
            url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url + url))
        except:
            pass
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            # 'continue' skips every queued next-page request, so pagination is effectively disabled here
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        try:
            charset = response.encoding
        except:
            charset = 'utf-8'
        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
                item['content'] = self.dc.process(str(bsoup).decode(charset))
            except:
                bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
                item['content'] = self.dc.process(str(bsoup))
            # fall back to the abstract if the cleaned page text is shorter than it
            if len(item['content'].encode('utf8')) < len(item['abstract']):
                item['content'] = item['abstract'].replace('百度快照', '')
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            main_content = None
            try:
                main_content = bsoup.select('div#container')[0].select('div#content_left')[0]
            except:
                print 'url: ' + response.url + ' is empty'
                return []
            if main_content:
                elem_list = main_content.find_all('div', class_='result')
                items = []
                if len(elem_list) > 0:
                    for elem in elem_list:
                        item = DataItem()
                        item['dtype'] = 'news'
                        item['source'] = '百度新闻'
                        item['channel'] = 'Search engine'
                        try:
                            item['title'] = elem.h3.a.get_text()
                        except:
                            continue
                        item['url'] = elem.h3.a['href']
                        author = elem.find('p', class_='c-author')
                        if author:
                            source_time = author.get_text().split()
                            if re.match(r'\d{4}.*?\d{1,2}.*?\d{1,2}', source_time[0].encode('utf8')):
                                item['medianame'] = 'None'
                                item['pubtime'] = self.normalize_time(str(' '.join(source_time)))
                            elif filter(str.isdigit, source_time[0].encode('utf8')) and len(source_time) == 1:
                                item['medianame'] = 'None'
                                item['pubtime'] = self.normalize_time(str(' '.join(source_time)))
                            else:
                                item['medianame'] = source_time[0]
                                item['pubtime'] = self.normalize_time(str(' '.join(source_time[1:])))
                            if self.tool.old_news(item['pubtime']):
                                continue
                        else:
                            print 'no element of author'
                            continue
                        if item['url'].find("html?") > 0 or item['url'].find("htm?") > 0:
                            item['url'] = "".join(item['url'].split("?")[0:-1])
                        if self.r.exists(item['url']):
                        #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                            continue
                        try:
                            item['source'] = self.tool.get_realname(item['medianame'])
                            item['medianame'] = ' '
                        except:
                            pass
                        item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                        if elem.find('div', class_='c-summary'):
                            item['abstract'] = elem.find('div', class_='c-summary').get_text()
                        items.append(item)
                return items

    def normalize_time(self, time_text):
        time_text = time_text.encode('utf8')
        if re.match('\d{4}.*?\d{1,2}.*?\d{1,2}.*?\d{1,2}:\d{1,2}', time_text):
            # absolute time: 2016年1月5日 09:30 -> 2016-1-5 09:30
            time_text = time_text.replace('年'.encode('utf8'), '-').replace(
                '月'.encode('utf8'), '-').replace('日'.encode('utf8'), '')
        else:
            # relative time: convert to a timestamp, then back to a standard time string
            time_digit = float(filter(str.isdigit, time_text))
            interval = 0
            if time_text.find('天'.encode('utf8')) > 0:
                interval = 86400
            elif time_text.find('时'.encode('utf8')) > 0:
                interval = 3600
            elif time_text.find('分'.encode('utf8')) > 0:
                interval = 60
            elif time_text.find('秒'.encode('utf8')) > 0:
                interval = 1
            else:
                return time_text
            time_true = time.time() - time_digit * interval
            time_text = time.strftime("%Y-%m-%d %H:%M", time.localtime(time_true))
        return time_text
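# BaiduNewSpider.normalize_time accepts either an absolute Chinese date
# ("2016年1月5日 09:30") or a relative phrase such as "3小时前" and returns a
# "YYYY-MM-DD HH:MM" string. The sketch below shows both paths on invented inputs;
# the relative branch just subtracts amount * unit seconds from the current clock.
def normalize_time_examples():
    absolute = "2016年1月5日 09:30".replace("年", "-").replace("月", "-").replace("日", "")
    # absolute -> "2016-1-5 09:30"
    relative = time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time() - 3 * 3600))
    # relative -> the wall-clock time three hours ago, e.g. "2016-01-05 06:30"
    return absolute, relative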
class BingNewSpider(Spider):
    name = "bingnew"
    domain_url = "http://cn.bing.com"
    start_urls = []
    tool = Utools()
    dc = dataCleaner()
    test_hbase = True

    def __init__(self):
        super(BingNewSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()

    # URL persistence
    def getStartUrl(self):
        # initialize the query keywords from file
        #sort_by_time = '&qft=sortbydate%3d"1"'
        sort_by_time = ''
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                query_url = '/news/search?q=' + urllib.quote(query.encode('utf8')) + sort_by_time
                self.start_urls.append(self.domain_url + query_url)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # extract and parse the search results
        items = self.parse_items(response)
        # build an XPath Selector for element extraction
        sel = Selector(response)
        requests = []
        for url in sel.xpath(u'//li/a[@class="sb_pagN"]/@href').extract():
            requests.append(self.make_requests_from_url(self.domain_url + url))
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            # 'continue' skips every queued next-page request, so pagination is effectively disabled here
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        try:
            charset = response.encoding
        except:
            charset = 'utf-8'
        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
                item['content'] = self.dc.process(str(bsoup).decode(charset))
            except:
                bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
                item['content'] = self.dc.process(str(bsoup))
            # fall back to the abstract if the cleaned page text is shorter than it
            if len(item['content'].encode('utf8')) < len(item['abstract']):
                item['content'] = item['abstract']
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            main_content = bsoup.select('div#SerpResult')[0]
            if main_content:
                elem_list = main_content.find_all('div', class_='sn_r')
                items = []
                if len(elem_list) > 0:
                    for elem in elem_list:
                        item = DataItem()
                        item['dtype'] = 'news'
                        item['source'] = '必应资讯'
                        item['channel'] = 'Search engine'
                        title = elem.find('div', 'newstitle')
                        if title and title.a.get_text():
                            item['title'] = title.a.get_text()
                        else:
                            continue
                        item['url'] = title.a['href']
                        author = elem.find('span', class_='sn_ST')
                        if author:
                            #m = re.search('(\d{4}\/\d{1,2}\/\d{1,2})',source_time[0])
                            item['medianame'] = author.cite.get_text()
                            item['pubtime'] = self.normalize_time(str(author.span.get_text()))
                            if self.tool.old_news(item['pubtime']):
                                continue
                        else:
                            print 'no element of author'
                            continue
                        if item['url'].find("html?") > 0 or item['url'].find("htm?") > 0:
                            item['url'] = "".join(item['url'].split("?")[0:-1])
                        if self.r.exists(item['url']):
                        #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                            continue
                        try:
                            item['source'] = self.tool.get_realname(item['medianame'])
                            item['medianame'] = ' '
                        except:
                            continue
                        item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                        if elem.find('span', class_='sn_snip'):
                            item['abstract'] = elem.find('span', class_='sn_snip').get_text()
                        else:
                            item['abstract'] = ' '
                        items.append(item)
                return items

    def normalize_time(self, time_text):
        time_text = time_text.encode('utf8')
        if re.match('\d{4}.*?\d{1,2}.*?\d{1,2}', time_text):
            # absolute time: 2016/1/5 -> 2016-1-5 00:00
            time_text = time_text.replace('/', '-') + ' 00:00'
        else:
            # relative time (Chinese or English units): convert to a timestamp, then back to a standard time string
            time_digit = float(filter(str.isdigit, time_text))
            interval = 0
            if time_text.find('天'.encode('utf8')) > 0 or time_text.find('day') > 0:
                interval = 86400
            elif time_text.find('时'.encode('utf8')) > 0 or time_text.find('hour') > 0:
                interval = 3600
            elif time_text.find('分'.encode('utf8')) > 0 or time_text.find('min') > 0:
                interval = 60
            elif time_text.find('秒'.encode('utf8')) > 0 or time_text.find('second') > 0:
                interval = 1
            else:
                return time_text
            time_true = time.time() - time_digit * interval
            time_text = time.strftime("%Y-%m-%d %H:%M", time.localtime(time_true))
        return time_text
class SogouWeixinSpider(Spider):
    name = "sogouwx"
    domain_url = "http://weixin.sogou.com/weixin"
    UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0'
    start_urls = []
    tool = Utools()
    dc = dataCleaner()
    time_interval = 0
    cookie = []
    test_hbase = True
    custom_settings = {
        "DOWNLOAD_DELAY": 0.2,
        "COOKIES_ENABLED": True,
    }

    def __init__(self):
        super(SogouWeixinSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log('---stopped---')

    # URL persistence
    def getStartUrl(self):
        # initialize the query keywords from file
        # last 24 hours
        timeTag = '&tsn=1'
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                query_url = '?type=2&query=' + urllib.quote(query.encode('utf8')) + timeTag
                self.start_urls.append(self.domain_url + query_url)

    def start_requests(self):
        # refresh the anti-spider cookies every five queries
        for i in range(len(self.start_urls)):
            if i % 5 == 0:
                self.cookie = self.update_cookies()
            yield Request(self.start_urls[i], cookies=self.cookie)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        print '====start %s==' % response.url
        #print response.body
        time.sleep(random.randint(self.time_interval, 2))
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # the query was not passed through successfully
        if response.url == self.domain_url:
            print 'error of query'
            return
        # extract and parse the search results
        items = self.parse_items(response)
        # build an XPath Selector for element extraction
        sel = Selector(response)
        requests = []
        for url in sel.xpath(u'//a[@class="np"]/@href').extract():
            requests.append(self.make_requests_from_url(self.domain_url + url))
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        for request in requests:
            # 'continue' skips every queued next-page request, so pagination is effectively disabled here
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        if response.body:
            res = re.sub('\n|\r|\t', '', response.body)
            res = re.sub('<script.*?</script>', '', res)
            bsoup = BeautifulSoup(res, from_encoding='utf8')
            try:
                item['content'] = str(bsoup.select('div#js_content')[0]).encode('utf8')
                print 'url:' + item['url'] + ' is added'
                return item
            except:
                print 'url:' + item['url'] + ' load failed'

    def parse_items(self, response):
        if response.body:
            # strip interfering <!...> fragments
            res = re.sub(r'<!.*?>', '', response.body)
            bsoup = BeautifulSoup(res, from_encoding='utf8')
            main_content = bsoup.select('div#wrapper')[0]
            if main_content:
                elem_list = main_content.find_all('div', class_='txt-box')
                items = []
                if len(elem_list) > 0:
                    for elem in elem_list:
                        item = DataItem()
                        item['dtype'] = 'weixin'
                        item['source'] = '搜狗微信'
                        item['channel'] = 'Search engine'
                        if elem.h4.a.get_text():
                            item['title'] = elem.h4.a.get_text()
                        else:
                            continue
                        item['url'] = elem.h4.a['href']
                        item['medianame'] = elem.div.a['title']
                        # convert the unix timestamp to a formatted time
                        item['pubtime'] = time.strftime('%Y-%m-%d %H:%M', time.localtime(float(elem.div['t'])))
                        if self.tool.old_news(item['pubtime']):
                            continue
                        if self.r.exists(item['url']):
                        #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                            continue
                        item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                        item['abstract'] = elem.p.get_text()
                        items.append(item)
                return items

    def update_cookies(self):
        # fetch fresh SNUID/SUV cookies from Sogou's anti-spider page
        s = requests.Session()
        s.headers = {"User-Agent": self.UA}
        r = s.post('http://weixin.sogou.com/antispider/thank.php')
        pcontent = re.search("setCookie\('SNUID'.*?\)", r.content).group(0)
        SNUID = eval(pcontent.split(',')[1])
        suv = ''.join([str(int(time.time() * 1000000) + random.randint(0, 1000))])
        s.cookies['SUV'] = suv
        s.cookies['SNUID'] = SNUID
        return dict(s.cookies)
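# SogouWeixinSpider.start_requests refreshes the anti-spider cookie jar every
# fifth query and attaches the current jar to each Request. The generator below
# is a stripped-down sketch of that rotation; fetch_cookies is a stand-in for
# update_cookies() and is purely hypothetical.
def rotate_cookies(urls, fetch_cookies, every=5):
    cookies = {}
    for i, url in enumerate(urls):
        if i % every == 0:
            cookies = fetch_cookies()   # e.g. a fresh {'SUV': ..., 'SNUID': ...} dict
        yield url, cookies              # the spider would yield Request(url, cookies=cookies)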
class XicibbsSpider(Spider):
    name = "xicibbs1"
    domain_url = "http://www.xici.net/"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    xici_dict = dict()
    test_hbase = True

    def __init__(self):
        super(XicibbsSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()

    # URL persistence
    def getStartUrl(self):
        # load the (board name, board url) pairs from xici.txt
        fp = open('xici.txt', 'rb')
        for line in fp.readlines():
            keys = line.split('\t')
            self.xici_dict.setdefault(keys[1], keys[0].decode('utf8'))
        fp.close()
        tag = '?sort=date'
        for key in self.xici_dict.keys():
            self.start_urls.append(key + tag)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # extract and parse the thread list
        items = self.parse_items(response)
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)

    def parse_content(self, response):
        item = response.meta['item']
        main_content = response.xpath('//head').extract()[0]
        content_list = re.findall('({"del_w".*?})', main_content)
        if len(content_list) > 0:
            try:
                # the floor data is JSON that still contains HTML tags; replace each tag
                # with a placeholder so the string can be parsed, then restore the tags
                content_list[0] = re.sub('<div.*?>', '<p>', content_list[0]).replace('</div>', '</p>')
                tags = re.findall('<.*?>', content_list[0].encode('utf8'))
                tagdict = dict()
                for i in range(len(tags)):
                    tagdict.setdefault('&tag_' + str(i) + ';', tags[i])
                for key in tagdict.keys():
                    content_list[0] = content_list[0].replace(tagdict[key], key)
                    tagdict[key] = str(tagdict[key].replace('\\"', ''))
                content_list[0] = content_list[0].replace('{', '').replace('}', '')
                maindict = json.loads('{' + content_list[0] + '}', encoding='utf8')
                item['medianame'] = maindict['UserName']
                item['pubtime'] = maindict['really_updated_at'][:-3]
                if self.tool.old_news(item['pubtime']):
                    return
                item['content'] = []
                for content in content_list:
                    content = re.sub('<.*?>', '', content).replace('{', '').replace('}', '')
                    content_dict = json.loads('{' + content + '}', encoding='utf8')
                    if content_dict.has_key('floorcontent'):
                        # restore the tags
                        for key in tagdict.keys():
                            content_dict['floorcontent'] = content_dict['floorcontent'].replace(key, tagdict[key])
                        item['content'].append(content_dict['floorcontent'])
                        # only get the first floor
                        break
                if item:
                    item['content'] = self.dc.process('<div>' + ' '.join(item['content']) + '</div>')
                    print 'url: ' + item['url'] + ' ' + str(item['pubtime']) + ' is added'
                    return item
            except:
                print item['url'] + ' load failed.'
        else:
            return

    def parse_items(self, response):
        elem_list = []
        items = []
        content = re.findall(r'"docinfo":\[.*?\]', response.body)
        if self.xici_dict.has_key(response.url.replace('?sort=date', '')):
            source_name = self.xici_dict[response.url.replace('?sort=date', '')]
        else:
            source_name = '西祠胡同'
        if len(content) > 0:
            elem_list = re.findall('\{\".*?visited\":[a-z]{4,5}\}', content[0])
        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'forum'
                elem = elem.decode('gb18030')
                try:
                    elem = json.loads(elem)
                except:
                    print elem
                    continue
                item['url'] = 'http://www.xici.net/d%s.htm' % elem['aDocs_i_0']
                if self.r.exists(item['url']):
                    continue
                item['title'] = elem['aDocs_i_1']
                item['source'] = source_name
                item['channel'] = 'Search engine'
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                item['pubtime'] = item['collecttime'][0:4] + '-' + elem['ShortDate']
                if self.tool.old_news(item['pubtime']):
                    continue
                items.append(item)
        return items
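# The tag-placeholder round trip in XicibbsSpider.parse_content is easier to see
# on a toy payload: HTML tags inside the embedded JSON are swapped for &tag_N;
# markers before json.loads and swapped back afterwards. The payload string below
# is invented for illustration only.
def placeholder_roundtrip_demo():
    payload = '"user":"abc","floorcontent":"<p class=\\"quote\\">hello</p>"'
    tags = re.findall(r'<.*?>', payload)
    tagdict = dict(('&tag_%d;' % i, t) for i, t in enumerate(tags))
    for ph, tag in tagdict.items():
        payload = payload.replace(tag, ph)       # hide the tags from the JSON parser
        tagdict[ph] = tag.replace('\\"', '"')    # drop the escaping before restoring
    doc = json.loads('{' + payload + '}')
    restored = doc['floorcontent']
    for ph, tag in tagdict.items():
        restored = restored.replace(ph, tag)
    return restored                              # '<p class="quote">hello</p>'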
class ToutiaoSpider(Spider):
    name = "toutiaonew"
    domain_url = "http://toutiao.com/search_content"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(ToutiaoSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log('---stopped---')

    # URL persistence
    def getStartUrl(self):
        # initialize the query keywords from file
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                # sorted by time by default
                query_url = "?offset=0&format=json&count=50&keyword=" + urllib.quote(query.encode('utf8'))
                self.start_urls.append(self.domain_url + query_url)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # extract and parse the search results
        items = self.parse_items(response)
        # build an XPath Selector for element extraction
        sel = Selector(response)
        # try to find the next page
        requests = []
        try:
            url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url + url))
        except:
            pass
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            # 'continue' skips every queued next-page request, so pagination is effectively disabled here
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        # read the charset from the page's http-equiv meta tag, defaulting to utf-8
        charset = 'utf-8'
        try:
            for meta_item in response.xpath('//meta[@http-equiv]').extract():
                is_exist = re.search('charset=(.*?)"', meta_item)
                if is_exist:
                    charset = is_exist.group(1)
                    break
        except:
            pass
        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
            except:
                bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            item['content'] = self.dc.process(str(bsoup))
            # fall back to the abstract if the cleaned page text is shorter than it
            if len(item['content'].encode('utf8')) < len(item['abstract']):
                item['content'] = item['abstract'].replace('百度快照', '')
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self, response):
        if response.body:
            itemdatas = json.loads(response.body)['data']
        else:
            return []
        items = []
        for itemdata in itemdatas:
            item = DataItem()
            item['dtype'] = 'news'
            item['source'] = '今日头条'
            item['channel'] = 'Search engine'
            item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
            item['pubtime'] = itemdata['datetime']
            if self.tool.old_news(item['pubtime']):
                continue
            item['url'] = itemdata['display_url'].encode('utf8')
            if item['url'].find("html?") > 0 or item['url'].find("htm?") > 0:
                item['url'] = "".join(item['url'].split("?")[0:-1])
            if self.r.exists(item['url']):
            #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                continue
            item['title'] = itemdata['title'].encode('utf8')
            item['medianame'] = itemdata['source'].encode('utf8')
            item['abstract'] = itemdata['abstract'].encode('utf8')
            items.append(item)
        return items
class XiciBBSSpider(Spider):
    name = "xicibbs"
    domain_url = "http://baidu.xici.net/cse"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(XiciBBSSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()

    # URL persistence
    def getStartUrl(self):
        # initialize the query keywords from file
        tag = '&s=11800334043319024933&srt=lds&sti=1440&nsid=0'
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                # sorted by time by default
                query_url = "/search?q=" + urllib.quote(query.encode('utf8')) + tag
                self.start_urls.append(self.domain_url + query_url)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # extract and parse the search results
        items = self.parse_items(response)
        # build an XPath Selector for element extraction
        sel = Selector(response)
        # try to find the next page
        requests = []
        try:
            url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url + url))
        except:
            pass
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            # 'continue' skips every queued next-page request, so pagination is effectively disabled here
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        main_content = response.xpath('//head').extract()[0]
        content_list = re.findall('({"del_w".*?})', main_content)
        if len(content_list) > 0:
            try:
                # swap HTML tags for placeholders so the embedded JSON can be parsed,
                # then restore them afterwards (same trick as XicibbsSpider)
                content_list[0] = re.sub('<div.*?>', '<p>', content_list[0]).replace('</div>', '</p>')
                tags = re.findall('<.*?>', content_list[0].encode('utf8'))
                tagdict = dict()
                for i in range(len(tags)):
                    tagdict.setdefault('&tag_' + str(i) + ';', tags[i])
                for key in tagdict.keys():
                    content_list[0] = content_list[0].replace(tagdict[key], key)
                    tagdict[key] = str(tagdict[key].replace('\\"', ''))
                content_list[0] = content_list[0].replace('{', '').replace('}', '')
                maindict = json.loads('{' + content_list[0] + '}', encoding='utf8')
                item['medianame'] = maindict['UserName']
                item['pubtime'] = maindict['really_updated_at'][:-3]
                if self.tool.old_news(item['pubtime']):
                    return
                item['content'] = []
                for content in content_list:
                    content = re.sub('<.*?>', '', content).replace('{', '').replace('}', '')
                    content_dict = json.loads('{' + content + '}', encoding='utf8')
                    if content_dict.has_key('floorcontent'):
                        # restore the tags
                        for key in tagdict.keys():
                            content_dict['floorcontent'] = content_dict['floorcontent'].replace(key, tagdict[key])
                        item['content'].append(content_dict['floorcontent'])
                        # only get the first floor
                        break
                if item:
                    item['content'] = self.dc.process('<div>' + ' '.join(item['content']) + '</div>')
                    print 'url: ' + item['url'] + ' is added'
                    return item
            except:
                print item['url'] + ' load failed.'
        else:
            return

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            main_content = bsoup.select('div#results')[0]
            if main_content:
                elem_list = main_content.find_all('div', class_='result')
                items = []
                if len(elem_list) > 0:
                    for elem in elem_list:
                        item = DataItem()
                        item['dtype'] = 'forum'
                        item['source'] = '西祠胡同'
                        item['channel'] = 'Search engine'
                        try:
                            item['title'] = elem.h3.a.get_text()
                        except:
                            continue
                        item['url'] = elem.h3.a['href'].replace('user', 'www')
                        if self.r.exists(item['url']):
                        #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                            continue
                        item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                        if elem.find('div', class_='c-summary'):
                            item['abstract'] = elem.find('div', class_='c-content').get_text()
                        items.append(item)
                # de-duplicate by URL
                new_items = []
                url_list = []
                for item in items:
                    if item['url'] not in url_list:
                        new_items.append(item)
                        url_list.append(item['url'])
                items = new_items
                return items
class LuChengBBSSpider(Spider):
    name = "luchengbbs"
    domain_url = "http://www.zjxslm.com/"
    combine_url = ("forum.php?mod=forumdisplay&fid=%d&orderby=lastpost"
                   + "&filter=dateline&dateline=86400")
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(LuChengBBSSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log('---stopped---')

    # URL persistence
    def getStartUrl(self):
        # build the board list: boards 195-208 plus board 193, last 24 hours only
        ids = range(195, 209)
        ids.append(193)
        for idx in ids:
            self.start_urls.append("http://www.zjxslm.com/forum.php?"
                                   + "mod=forumdisplay&orderby=lastpost"
                                   + "&filter=dateline&dateline=86400&fid=%d" % idx)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        print '====start %s==' % response.url
        # extract and parse the thread list
        items = self.parse_items(response)
        # try to find the remaining pages
        requests = []
        if response.url.find('page') < 0:
            # build an XPath Selector for element extraction
            sel = Selector(response)
            page_num = sel.xpath('//div[@class="pg"]/label/span')
            if page_num:
                page_num = re.sub("<.*?>", "", page_num.extract()[0])
                page_num = int(re.search("([\d]+)", page_num).group(1))
                for idx in range(2, page_num + 1):
                    url = response.url + ("&page=%d" % idx)
                    requests.append(self.make_requests_from_url(url))
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            item['pubtime'] = bsoup.find_all('div', class_="authi")[1].em.span['title']
            if self.tool.old_news(item['pubtime'][0:-3]):
                return
            item['content'] = str(bsoup.find('div', class_='pcb'))
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            main_content = bsoup.select('div#threadlist')[0]
            if main_content:
                elem_list = main_content.find_all('tbody')
                items = []
                if len(elem_list) > 0:
                    for elem in elem_list:
                        item = DataItem()
                        item['dtype'] = 'forum'
                        item['source'] = '鹿城论坛'
                        item['channel'] = 'Search engine'
                        # build the thread URL from the tbody id
                        try:
                            tid = elem['id']
                        except:
                            continue
                        if tid.find('_') < 0:
                            continue
                        else:
                            tid = tid.split('_')[1]
                        item['url'] = self.domain_url + 'thread-' + tid + '-1-1.html'
                        if self.r.exists(item['url']):
                        #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                            continue
                        item['title'] = elem.find('th').get_text().split('\n')[2]
                        item['medianame'] = elem.tr.find('td', class_='by').cite.get_text().replace('\n', '')
                        item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                        items.append(item)
                return items

    def normalize_time(self, time_text):
        time_text = time_text.encode('utf8')
        if re.match('\d{4}.*?\d{1,2}.*?\d{1,2}.*?\d{1,2}:\d{1,2}', time_text):
            # absolute time: 2016年1月5日 09:30 -> 2016-1-5 09:30
            time_text = time_text.replace('年'.encode('utf8'), '-').replace(
                '月'.encode('utf8'), '-').replace('日'.encode('utf8'), '')
        else:
            # relative time: convert to a timestamp, then back to a standard time string
            time_digit = float(filter(str.isdigit, time_text))
            interval = 0
            if time_text.find('天'.encode('utf8')) > 0:
                interval = 86400
            elif time_text.find('时'.encode('utf8')) > 0:
                interval = 3600
            elif time_text.find('分'.encode('utf8')) > 0:
                interval = 60
            elif time_text.find('秒'.encode('utf8')) > 0:
                interval = 1
            else:
                return time_text
            time_true = time.time() - time_digit * interval
            time_text = time.strftime("%Y-%m-%d %H:%M", time.localtime(time_true))
        return time_text
class TianyaBBSSpider(Spider):
    name = "tianyabbs"
    domain_url = "http://search.tianya.cn/"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(TianyaBBSSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()

    # URL persistence
    def getStartUrl(self):
        # initialize the query keywords from file
        # sort by post time
        pageTag = '&s=4'
        # sort by reply time
        #pageTag = '&s=6'
        # default is relevance ranking
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                query_url = '/bbs?q=' + urllib.quote(query.encode('utf8')) + pageTag
                self.start_urls.append(self.domain_url + query_url)

    # example of returning several Requests and Items from one callback
    def parse(self, response):
        #print '====start %s==' %response.url
        # test the status of the HBase and Thrift servers
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # extract and parse the search results
        items = self.parse_items(response)
        # build an XPath Selector for element extraction
        sel = Selector(response)
        # extract next-page links from the result page
        requests = []
        for url in sel.xpath(u'//div[@class="long-pages"]/a[text()="下一页"]/@href').re('go\(([\d]*?)\)'):
            tp_url = re.sub('&pn=[\d]+?', '', response.url)
            requests.append(self.make_requests_from_url(tp_url + '&pn=' + url))
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            # 'continue' skips every queued next-page request, so pagination is effectively disabled here
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        if response.body:
            bsoup = BeautifulSoup(response.body)
            item_content_list = bsoup.find_all('div', class_='bbs-content')
            # only get the first floor
            if len(item_content_list) > 0:
                item['content'] = item_content_list[0].extract().encode('utf8')
                #item['content'] = ' '.join(v.get_text().encode('utf8') for v in item_content_list)
                item['content'] = re.sub(r'\n|\t|\r', '', item['content'])
                item['content'] = self.dc.process(item['content'])
                if item['content']:
                    print 'url: ' + item['url'] + ' is added'
                    return item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body)
            main_content = bsoup.select('div#main')[0]
            # the result list contains one extra entry when a search message is shown
            if main_content:
                if main_content.select('li#search_msg'):
                    elem_list = main_content.find_all('li')[:-1]
                else:
                    elem_list = main_content.find_all('li')
                items = []
                if len(elem_list) > 0:
                    for elem in elem_list:
                        item = DataItem()
                        item['dtype'] = 'forum'
                        item['source'] = '天涯论坛'
                        item['channel'] = 'Search engine'
                        try:
                            item['title'] = elem.div.h3.a.get_text()
                        except:
                            continue
                        item['url'] = elem.div.h3.a['href']
                        author = elem.find('p', class_='source')
                        if author:
                            item['medianame'] = author.a.get_text()
                            #item['author'] = author.a.get_text()
                            if author.span.get_text().find('-') > 0:
                                item['pubtime'] = author.span.get_text()
                            else:
                                item['pubtime'] = author.find_all('span')[-2].get_text()
                            if self.tool.old_news(item['pubtime']):
                                continue
                        else:
                            print 'element of author not found!\n'
                            return
                        if self.r.exists(item['url']):
                        #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                            continue
                        item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                        item['abstract'] = elem.div.p.get_text()
                        items.append(item)
                return items
class TiebaBBSSpider(Spider): name = "tiebabbs" domain_url = "http://tieba.baidu.com" tool = Utools() dc = dataCleaner() start_urls = [] test_hbase = True def __init__ (self): super(TiebaBBSSpider,self).__init__() #将final绑定到爬虫结束的事件上 dispatcher.connect(self.initial,signals.engine_started) dispatcher.connect(self.finalize,signals.engine_stopped) def initial(self): self.log('---started----') self.getStartUrl() self.r = Redis(host = self.tool.HOST_REDIS1, port = 6379, db = 3) #self.htable=HBaseTest(table = 'origin') def finalize(self): self.log('---stopped---') #self.htable.close_trans() #url持久化 def getStartUrl(self): #从文件初始化查询关键词 qlist = GetQuery().get_data() for query in qlist: if query: #默认时间排序 query_url = "/f/search/res?ie=utf-8&rn=20&qw=" + urllib.quote(query.encode('utf8')) + '&ct=0' self.start_urls.append(self.domain_url + query_url) #一个回调函数中返回多个Request以及Item的例子 def parse(self,response): # test the status of hbase and thrift server if self.test_hbase: try: self.htable=HBaseTest(host = self.tool.HOST_HBASE1, table = 'origin') self.htable.close_trans() self.test_hbase = False except: raise CloseSpider('no thrift or hbase server!') #print '====start %s==' %response.url #抽取并解析新闻网页内容 items = self.parse_items(response) #构造一个Xpath的select对象,用来进行网页元素抽取 sel = Selector(response) #尝试寻找下一页 requests = [] try: url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1] requests.append(self.make_requests_from_url(self.domain_url+url)) except: pass for item in items: yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content) #return requests for request in requests: continue yield request def parse_content(self,response): item = response.meta['item'] if response.body: bsoup = BeautifulSoup(response.body,from_encoding='utf-8') if bsoup.find('h1', class_='core_title_txt'): item['title'] = bsoup.find('h1', class_='core_title_txt')['title'] elif bsoup.find('h3', class_='core_title_txt'): item['title'] = bsoup.find('h3', class_='core_title_txt')['title'] else: return timeform = '%Y-%m-%d %H:%M' pubtimes = [time.strptime(item['pubtime'], timeform)] for pubtime in re.findall('/d{4}-/d{2}-/d{2} /d{2}:/d{2}', str(bsoup)): pubtimes.append(time.strptime(pubtime, timeform)) item['pubtime'] = time.strftime(timeform, min(pubtimes)) if self.tool.old_news(item['pubtime']): print item['utl'] + ' ' + item['pubtime'] return item['content'] = [] for elem in bsoup.find_all('div', class_='d_post_content'): item['content'].append(str(elem.extract())) #onlt get the first floor break if item: item['content'] = ' '.join(item['content']).encode('utf8') item['content'] = self.dc.process(item['content']) print 'url: ' + item['url'] + ' is added' yield item def parse_items(self,response): if response.body: bsoup = BeautifulSoup(response.body,from_encoding='utf-8') main_content = bsoup.find('div', class_='s_post_list') items = [] if main_content: elem_list = main_content.find_all('div', class_='s_post') else: return items if len(elem_list)>0: for elem in elem_list: item = DataItem() item['dtype'] = 'forum' item['source'] = '百度贴吧' item['channel'] = 'Search engine' try: item['pubtime'] = elem.find('font', class_='p_date').get_text() if self.tool.old_news(item['pubtime']): continue #item['title'] = elem.span.a.get_text() item['medianame'] = elem.find('font', class_='p_violet').get_text() item['abstract'] = elem.find('div',class_='p_content').get_text() except: continue item['url'] = self.domain_url + re.findall('(/p/.*?)[^\d]', elem.span.a['href'])[0] if self.r.exists(item['url']): #if 
self.htable.getRowByColumns(item['url'], ['indexData:url']): continue item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime()) items.append(item) #去重 new_items = [] url_list = [] for item in items: if item['url'] not in url_list: new_items.append(item) url_list.append(item['url']) items = new_items; return items
class BaiduNewSpider(Spider): name = "baidunew" domain_url = "http://news.baidu.com" tool = Utools() dc = dataCleaner() start_urls = [] test_hbase = True def __init__(self): super(BaiduNewSpider, self).__init__() # 将final绑定到爬虫结束的事件上 dispatcher.connect(self.initial, signals.engine_started) dispatcher.connect(self.finalize, signals.engine_stopped) def initial(self): self.log("---started----") self.getStartUrl() self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3) def finalize(self): self.log("---stopped---") # url持久化 def getStartUrl(self): # 从文件初始化查询关键词 qlist = GetQuery().get_data() for query in qlist: if query: # 默认时间排序 query_url = "/ns?rn=20&word=" + urllib.quote(query.encode("utf8")) + "&ct=0" self.start_urls.append(self.domain_url + query_url) # 一个回调函数中返回多个Request以及Item的例子 def parse(self, response): # test the status of hbase and thrift server if self.test_hbase: try: self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table="origin") self.htable.close_trans() self.test_hbase = False except: raise CloseSpider("no thrift or hbase server!") # print '====start %s==' %response.url # 抽取并解析新闻网页内容 items = self.parse_items(response) # 构造一个Xpath的select对象,用来进行网页元素抽取 sel = Selector(response) # 尝试寻找下一页 requests = [] try: url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1] requests.append(self.make_requests_from_url(self.domain_url + url)) except: pass for item in items: yield Request(url=item["url"], meta={"item": item}, callback=self.parse_content) # return requests for request in requests: continue yield request def parse_content(self, response): item = response.meta["item"] try: charset = response.encoding except: charset = "utf-8" if response.body: try: bsoup = BeautifulSoup(response.body, from_encoding=charset) item["content"] = self.dc.process(str(bsoup).decode(charset)) except: bsoup = BeautifulSoup(response.body, from_encoding="utf-8") item["content"] = self.dc.process(str(bsoup)) if len(item["content"].encode("utf8")) < len(item["abstract"]): item["content"] = item["abstract"].replace("百度快照", "") if item["content"]: print "url: " + item["url"] + " is added" return item def parse_items(self, response): if response.body: bsoup = BeautifulSoup(response.body, from_encoding="utf-8") main_content = 0 try: main_content = bsoup.select("div#container")[0].select("div#content_left")[0] except: print "url: " + response.url + " is empty" return [] if main_content: elem_list = main_content.find_all("div", class_="result") items = [] if len(elem_list) > 0: for elem in elem_list: item = DataItem() item["dtype"] = "news" item["source"] = "百度新闻" item["channel"] = "Search engine" try: item["title"] = elem.h3.a.get_text() except: continue item["url"] = elem.h3.a["href"] author = elem.find("p", class_="c-author") if author: source_time = author.get_text().split() if re.match(r"\d{4}.*?\d{1,2}.*?\d{1,2}", source_time[0].encode("utf8")): item["medianame"] = "None" item["pubtime"] = self.normalize_time(str(" ".join(source_time))) elif filter(str.isdigit, source_time[0].encode("utf8")) and len(source_time) == 1: item["medianame"] = "None" item["pubtime"] = self.normalize_time(str(" ".join(source_time))) else: item["medianame"] = source_time[0] item["pubtime"] = self.normalize_time(str(" ".join(source_time[1:]))) if self.tool.old_news(item["pubtime"]): continue else: print "no element of author" continue if item["url"].find("html?") > 0 or item["url"].find("htm?") > 0: item["url"] = "".join(item["url"].split("?")[0:-1]) if self.r.exists(item["url"]): # if 
self.htable.getRowByColumns(item['url'], ['indexData:url']): continue try: item["source"] = self.tool.get_realname(item["medianame"]) item["medianame"] = " " except: pass item["collecttime"] = time.strftime("%Y-%m-%d %H:%M", time.localtime()) if elem.find("div", class_="c-summary"): item["abstract"] = elem.find("div", class_="c-summary").get_text() items.append(item) return items def normalize_time(self, time_text): time_text = time_text.encode("utf8") if re.match("\d{4}.*?\d{1,2}.*?\d{1,2}.*?\d{1,2}:\d{1,2}", time_text): time_text = ( time_text.replace("年".encode("utf8"), "-") .replace("月".encode("utf8"), "-") .replace("日".encode("utf8"), "") ) else: # 非标准时间转换为时间戳,再转为标准时间 time_digit = float(filter(str.isdigit, time_text)) interval = 0 if time_text.find("天".encode("utf8")) > 0: interval = 86400 elif time_text.find("时".encode("utf8")) > 0: interval = 3600.0 elif time_text.find("分".encode("utf8")) > 0: interval = 60 elif time_text.find("秒".encode("utf8")) > 0: interval = 1 else: return time_text time_true = time.time() - time_digit * interval time_text = time.strftime("%Y-%m-%d %H:%M", time.localtime(time_true)) return time_text
class ToutiaoSpider(Spider):
    name = "toutiaonew"
    domain_url = "http://toutiao.com/search_content"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(ToutiaoSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log('---stopped---')
        # URL persistence happens elsewhere

    def getStartUrl(self):
        # initialize the query keywords from file
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                # default ordering: by time
                query_url = "?offset=0&format=json&count=50&keyword=" + urllib.quote(query.encode('utf8'))
                self.start_urls.append(self.domain_url + query_url)

    # example of returning several Requests as well as Items from one callback
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        #print '====start %s==' %response.url
        # extract and parse the search results (a JSON body)
        items = self.parse_items(response)
        # leftover from the HTML-based spiders: the JSON response contains no such
        # pagination element, so the xpath lookup falls through to the except branch
        sel = Selector(response)
        requests = []
        try:
            url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url + url))
        except:
            pass
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            # NOTE: 'continue' skips the yield below, so pagination requests are never followed
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        charset = 'utf-8'
        try:
            # look for a charset declaration among the meta tags
            for meta_item in response.xpath('//meta[@http-equiv]').extract():
                is_exist = re.search('charset=(.*?)"', meta_item)
                if is_exist:
                    charset = is_exist.group(1)
                    break
        except:
            pass
        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
            except:
                bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            item['content'] = self.dc.process(str(bsoup))
            if len(item['content'].encode('utf8')) < len(item['abstract']):
                item['content'] = item['abstract'].replace('百度快照', '')
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self, response):
        if response.body:
            itemdatas = json.loads(response.body)['data']
        else:
            return []
        items = []
        for itemdata in itemdatas:
            item = DataItem()
            item['dtype'] = 'news'
            item['source'] = '今日头条'
            item['channel'] = 'Search engine'
            item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
            item['pubtime'] = itemdata['datetime']
            if self.tool.old_news(item['pubtime']):
                continue
            item['url'] = itemdata['display_url'].encode('utf8')
            if item['url'].find("html?") > 0 or item['url'].find("htm?") > 0:
                item['url'] = "".join(item['url'].split("?")[0:-1])
            if self.r.exists(item['url']):
                #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                continue
            item['title'] = itemdata['title'].encode('utf8')
            item['medianame'] = itemdata['source'].encode('utf8')
            item['abstract'] = itemdata['abstract'].encode('utf8')
            items.append(item)
        return items
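# Hedged sketch of the JSON shape ToutiaoSpider.parse_items() expects from the
# search_content endpoint: a top-level 'data' list whose records carry at least the
# keys read above. The record below is made up for illustration; real responses
# contain more fields.
def _demo_toutiao_record():
    sample = {
        'data': [{
            'datetime': '2015-06-01 12:30',
            'display_url': 'http://toutiao.com/a0000000000/',
            'title': u'示例标题',
            'source': u'示例来源',
            'abstract': u'示例摘要',
        }]
    }
    for record in sample['data']:
        print record['display_url'], record['datetime']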
class TianyaBBSSpider(Spider):
    name = "tianyabbs"
    domain_url = "http://search.tianya.cn/"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(TianyaBBSSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()
        # URL persistence happens elsewhere

    def getStartUrl(self):
        # initialize the query keywords from file
        # sort by post time
        pageTag = '&s=4'
        # sort by reply time
        #pageTag = '&s=6'
        # default: sort by relevance
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                query_url = '/bbs?q=' + urllib.quote(query.encode('utf8')) + pageTag
                self.start_urls.append(self.domain_url + query_url)

    # example of returning several Requests as well as Items from one callback
    def parse(self, response):
        #print '====start %s==' %response.url
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        # extract and parse the search result page
        items = self.parse_items(response)
        # build an XPath selector for element extraction
        sel = Selector(response)
        # extract links to the following result pages
        requests = []
        for url in sel.xpath(u'//div[@class="long-pages"]/a[text()="下一页"]/@href').re('go\(([\d]*?)\)'):
            tp_url = re.sub('&pn=[\d]+?', '', response.url)
            requests.append(self.make_requests_from_url(tp_url + '&pn=' + url))
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            # NOTE: 'continue' skips the yield below, so next-page requests are never followed
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        if response.body:
            bsoup = BeautifulSoup(response.body)
            item_content_list = bsoup.find_all('div', class_='bbs-content')
            # only keep the first floor of the thread
            if len(item_content_list) > 0:
                item['content'] = item_content_list[0].extract().encode('utf8')
                #item['content'] = ' '.join(v.get_text().encode('utf8') for v in item_content_list)
                item['content'] = re.sub(r'\n|\t|\r', '', item['content'])
                item['content'] = self.dc.process(item['content'])
                if item['content']:
                    print 'url: ' + item['url'] + ' is added'
                    return item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body)
            main_content = bsoup.select('div#main')[0]
            # the result list contains one extra entry (a search notice) to drop
            if main_content:
                if main_content.select('li#search_msg'):
                    elem_list = main_content.find_all('li')[:-1]
                else:
                    elem_list = main_content.find_all('li')
            items = []
            if len(elem_list) > 0:
                for elem in elem_list:
                    item = DataItem()
                    item['dtype'] = 'forum'
                    item['source'] = '天涯论坛'
                    item['channel'] = 'Search engine'
                    try:
                        item['title'] = elem.div.h3.a.get_text()
                    except:
                        continue
                    item['url'] = elem.div.h3.a['href']
                    author = elem.find('p', class_='source')
                    if author:
                        item['medianame'] = author.a.get_text()
                        #item['author'] = author.a.get_text()
                        if author.span.get_text().find('-') > 0:
                            item['pubtime'] = author.span.get_text()
                        else:
                            item['pubtime'] = author.find_all('span')[-2].get_text()
                        if self.tool.old_news(item['pubtime']):
                            continue
                    else:
                        print 'element of author not found!\n'
                        # return what was collected so far so the caller can still iterate
                        return items
                    if self.r.exists(item['url']):
                        #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                        continue
                    item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                    item['abstract'] = elem.div.p.get_text()
                    items.append(item)
            return items
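# Hedged illustration of the next-page URL rewrite in TianyaBBSSpider.parse(): the
# current '&pn=' parameter is stripped from response.url and the page number captured
# from the "下一页" link's go(N) handler is appended. The URL and page number below
# are made-up example values.
def _demo_tianya_next_page():
    import re
    current = 'http://search.tianya.cn/bbs?q=test&s=4&pn=2'
    next_pn = '3'  # as captured by .re('go\(([\d]*?)\)')
    tp_url = re.sub('&pn=[\d]+?', '', current)
    print tp_url + '&pn=' + next_pn  # -> http://search.tianya.cn/bbs?q=test&s=4&pn=3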
class LuChengBBSSpider(Spider):
    name = "luchengbbs"
    domain_url = "http://www.zjxslm.com/"
    combine_url = ("forum.php?mod=forumdisplay&fid=%d&orderby=lastpost" +
                   "&filter=dateline&dateline=86400")
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(LuChengBBSSpider, self).__init__()
        # bind initial/finalize to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log('---stopped---')
        # URL persistence happens elsewhere

    def getStartUrl(self):
        # build one start URL per board id (threads updated within the last 24 hours)
        ids = range(195, 209)
        ids.append(193)
        for idx in ids:
            self.start_urls.append("http://www.zjxslm.com/forum.php?"
                                   + "mod=forumdisplay&orderby=lastpost"
                                   + "&filter=dateline&dateline=86400&fid=%d" % idx)

    # example of returning several Requests as well as Items from one callback
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        print '====start %s==' % response.url
        # extract and parse the thread list
        items = self.parse_items(response)
        # try to locate the following pages
        requests = []
        if response.url.find('page') < 0:
            # build an XPath selector for element extraction
            sel = Selector(response)
            page_num = sel.xpath('//div[@class="pg"]/label/span')
            if page_num:
                page_num = re.sub("<.*?>", "", page_num.extract()[0])
                page_num = int(re.search("([\d]+)", page_num).group(1))
                for idx in range(2, page_num + 1):
                    url = response.url + ("&page=%d" % idx)
                    requests.append(self.make_requests_from_url(url))
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            # publication time comes from the title attribute of the first post's time element
            item['pubtime'] = bsoup.find_all('div', class_="authi")[1].em.span['title']
            if self.tool.old_news(item['pubtime'][0:-3]):
                return
            item['content'] = str(bsoup.find('div', class_='pcb'))
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            main_content = bsoup.select('div#threadlist')[0]
            if main_content:
                elem_list = main_content.find_all('tbody')
            items = []
            if len(elem_list) > 0:
                for elem in elem_list:
                    item = DataItem()
                    item['dtype'] = 'forum'
                    item['source'] = '鹿城论坛'
                    item['channel'] = 'Search engine'
                    # the thread id is taken from the row's id attribute and used to build the URL
                    try:
                        tid = elem['id']
                    except:
                        continue
                    if tid.find('_') < 0:
                        continue
                    else:
                        tid = tid.split('_')[1]
                    item['url'] = self.domain_url + 'thread-' + tid + '-1-1.html'
                    if self.r.exists(item['url']):
                        #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                        continue
                    item['title'] = elem.find('th').get_text().split('\n')[2]
                    item['medianame'] = elem.tr.find('td', class_='by').cite.get_text().replace('\n', '')
                    item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                    items.append(item)
            return items

    def normalize_time(self, time_text):
        time_text = time_text.encode('utf8')
        if re.match('\d{4}.*?\d{1,2}.*?\d{1,2}.*?\d{1,2}:\d{1,2}', time_text):
            # absolute date: replace the Chinese year/month/day markers
            time_text = (time_text.replace('年'.encode('utf8'), '-')
                                  .replace('月'.encode('utf8'), '-')
                                  .replace('日'.encode('utf8'), ''))
        else:
            # relative time: convert to a timestamp offset, then back to the standard format
            time_digit = float(filter(str.isdigit, time_text))
            interval = 0
            if time_text.find('天'.encode('utf8')) > 0:
                interval = 86400
            elif time_text.find('时'.encode('utf8')) > 0:
                interval = 3600.0
            elif time_text.find('分'.encode('utf8')) > 0:
                interval = 60
            elif time_text.find('秒'.encode('utf8')) > 0:
                interval = 1
            else:
                return time_text
            time_true = time.time() - time_digit * interval
            time_text = time.strftime("%Y-%m-%d %H:%M", time.localtime(time_true))
        return time_text
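# Hedged illustration of how LuChengBBSSpider.parse_items() derives a thread URL from
# a Discuz thread-list row: the <tbody> id is split on '_' and the numeric part becomes
# thread-<tid>-1-1.html. 'normalthread_12345' is a made-up example id.
def _demo_lucheng_thread_url():
    tbody_id = 'normalthread_12345'
    tid = tbody_id.split('_')[1]
    print 'http://www.zjxslm.com/' + 'thread-' + tid + '-1-1.html'
    # -> http://www.zjxslm.com/thread-12345-1-1.html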