def execute(self, newUrl, isFirst=False):
    i = 1
    self.urlManager.addOneUrl(newUrl)
    while True:
        newUrlList = []
        newDataDict = {}
        try:
            if self.urlManager.hasMoreUrls() > 0:
                getOneUrl = self.urlManager.getOneUrl()
                content = self.htmlLoader.htmlDown(getOneUrl)
                if content is None:
                    continue
                if i == 1:
                    # parse the index page
                    newUrlList = self.htmlParser.urlParse(content, getOneUrl)
                else:
                    # parse an article page
                    newUrlList, newDataDict = self.htmlParser.perPageParse(content, getOneUrl)
                if len(newUrlList) > 0:
                    self.urlManager.addUrls(newUrlList)
                if len(newDataDict) > 0:
                    self.dataHandler.insert(newDataDict)
            else:
                loggerSpider.log('has no more url')
                break
        except Exception as e:
            loggerSpider.log(e)
        i += 1
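For context, a minimal driver wiring these collaborators together might look like the sketch below. The class names (SpiderMain, UrlManager, HtmlLoader, HtmlParser, DataHandler) are assumptions; only the attribute names urlManager, htmlLoader, htmlParser and dataHandler appear in execute() itself, and the seed URL is borrowed from the Referer header used by the downloader.

class SpiderMain(object):
    def __init__(self):
        # hypothetical class names -- only the attribute names come from execute()
        self.urlManager = UrlManager()    # Redis-backed URL queue
        self.htmlLoader = HtmlLoader()    # urllib2 downloader
        self.htmlParser = HtmlParser()    # BeautifulSoup parsers
        self.dataHandler = DataHandler()  # MySQL writer

    # execute() as shown above lives on this class

if __name__ == '__main__':
    spider = SpiderMain()
    spider.execute('http://blog.jobbole.com/')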
def urlParse(self, content, newUrl):
    hrefList = []
    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    aList = soup.find_all('a', href=re.compile(r'http://[a-z-./]+/\d+/$', re.I))
    if len(aList) == 0:
        loggerSpider.log('find_all is empty [%s]' % (newUrl))
    else:
        for a in aList:
            if (a.get('href') is not None) and (a.get('title') is not None):
                hrefList.append(a['href'])
    return hrefList
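The href filter relies on article permalinks ending in a numeric id followed by a slash. A quick illustration of what the pattern accepts and rejects (the example URLs are made up for demonstration):

import re

pattern = re.compile(r'http://[a-z-./]+/\d+/$', re.I)

assert pattern.match('http://blog.jobbole.com/114000/')         # article permalink: matched
assert not pattern.match('http://blog.jobbole.com/all-posts/')  # listing page, no numeric id: skipped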
def __init__(self):
    try:
        self.conn = MySQLdb.Connect(
            host='localhost',
            user='******',
            passwd='123456',
            port=3306,
            db='spider',
            charset='utf8',
        )
        self.conn.autocommit(False)
    except MySQLdb.Error as e:
        loggerSpider.log("Mysql connect error %d: %s" % (e.args[0], e.args[1]))
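The dataHandler.insert() called from execute() is not shown. A minimal sketch of what it might look like on top of this connection; the article table and its title/href/content columns are assumptions matching the dict keys produced by perPageParse() below, not the author's actual schema:

def insert(self, newDataDict):
    # hypothetical sketch: table and column names are assumed;
    # parameterized query so title/content text cannot break the SQL
    cursor = self.conn.cursor()
    try:
        cursor.execute(
            'INSERT INTO article (title, href, content) VALUES (%s, %s, %s)',
            (newDataDict.get('title'),
             newDataDict.get('href'),
             newDataDict.get('content')))
        self.conn.commit()  # autocommit is off, so commit explicitly
    except MySQLdb.Error as e:
        self.conn.rollback()
        loggerSpider.log("Mysql insert error %d: %s" % (e.args[0], e.args[1]))
    finally:
        cursor.close()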
def htmlDown(self, newUrl):
    content = None
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'http://blog.jobbole.com/',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
        # no request body: passing data (even an empty urlencoded string)
        # would turn this into a POST, but a crawler wants plain GETs
        request = urllib2.Request(newUrl, headers=headers)
        # empty dict means a direct connection; to go through a proxy:
        # proxy_handler = urllib2.ProxyHandler({"http": 'http://192.168.0.101:3128'})
        proxy_handler = urllib2.ProxyHandler({})
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener)
        response = opener.open(request)
        if response.getcode() != 200:
            return content
        content = response.read()
        if content is None:
            loggerSpider.log('content is None [%s]' % (newUrl))
        return content
    finally:
        # the return in finally swallows any network error and
        # hands back whatever we have (None on failure)
        return content
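A quick smoke test for the downloader (HtmlLoader as the enclosing class name is an assumption; only htmlDown() appears above):

if __name__ == '__main__':
    loader = HtmlLoader()  # assumed wrapper class for htmlDown()
    html = loader.htmlDown('http://blog.jobbole.com/')
    print('downloaded %d bytes' % (len(html) if html else 0))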
def perPageParse(self, content, newUrl):
    newDataDict = {}
    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    div_node = soup.find('div', attrs={'class': 'entry-header'})
    if div_node is None:
        # guard here too: calling find() on None would raise AttributeError
        loggerSpider.log('div_node is None [%s]' % (newUrl))
    else:
        h1_node = div_node.find('h1')
        if h1_node is None:
            loggerSpider.log('h1_node is None [%s]' % (newUrl))
        else:
            newDataDict['title'] = h1_node.get_text()
            newDataDict['href'] = newUrl
    entry_node = soup.find('div', attrs={'class': 'entry'})
    if entry_node is None:
        loggerSpider.log('entry_node is None [%s]' % (newUrl))
    else:
        newDataDict['content'] = entry_node.get_text()
    newUrlList = self.urlParse(content, newUrl)
    return newUrlList, newDataDict
def addOneUrl(self, newUrl):
    if newUrl is None:
        loggerSpider.log('newUrl is None')
        return
    # only queue URLs we have neither queued nor crawled before
    if (not self.conn.sismember(self.newUrlSetName, newUrl)) and \
       (not self.conn.sismember(self.oldUrlSetName, newUrl)):
        if self.conn.sadd(self.newUrlSetName, newUrl) != 1:
            loggerSpider.log('newUrl add redis fail [%s]' % (newUrl))
    else:
        loggerSpider.log('newUrl already exists [%s]' % (newUrl))
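execute() and addOneUrl() also rely on hasMoreUrls(), getOneUrl() and addUrls(), which are not shown. Plausible sketches on the same class, using only the two Redis sets already referenced (self.newUrlSetName for pending URLs, self.oldUrlSetName for crawled ones); these are assumptions about the implementation, not the original code:

def hasMoreUrls(self):
    # number of URLs still waiting to be crawled
    return self.conn.scard(self.newUrlSetName)

def getOneUrl(self):
    # pop a pending URL and mark it as seen so it is never queued again
    newUrl = self.conn.spop(self.newUrlSetName)
    if newUrl is not None:
        self.conn.sadd(self.oldUrlSetName, newUrl)
    return newUrl

def addUrls(self, newUrlList):
    # deduplication is delegated to addOneUrl() above
    for newUrl in newUrlList:
        self.addOneUrl(newUrl)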