Exemplo n.º 1
0
def main():
	"""Download the CSDN blog index page and print what the downloader returns.

	An earlier flow that crawled a single blog through Spider/ArticleDB is
	disabled; only the index-page fetch below is active.
	"""
	index_url = 'https://blog.csdn.net/'
	# save=True is forwarded to Download.down_html unchanged; presumably it
	# keeps a local copy of the page -- confirm against Download.
	page = Download().down_html(index_url, save=True)
	print(page)
Exemplo n.º 2
0
class CSDN(object):
	"""Collect blog-author links from the CSDN blog home page.

	The crawl runs in three steps (see ``start``): fetch the home page,
	extract the category links from its ``nav_com`` block, then fetch every
	category page and gather the author links found in its
	``feedlist_mod`` list.

	Attributes:
		download: ``Download`` instance used for every HTTP fetch.
		home: base URL that relative links are resolved against.
		catetories: absolute category URLs found on the home page. The name
			is a misspelling of "categories"; it is kept because code
			outside this class may read it.
		blog_user: unique author links, in discovery order.
		queue: the same author links, waiting to be processed by ``start``.

	Progress is reported with single-argument ``print(...)`` calls, which
	produce the same output under the Python 2 print statement and the
	Python 3 print function.
	"""

	def __init__(self):
		self.download = Download()
		self.home = 'https://blog.csdn.net'
		self.catetories = []

		self.blog_user = []
		self.queue = Queue.Queue()

	def visit_home(self):
		"""Return whatever the downloader yields for the home page."""
		return self.download.down_html(self.home)

	def _absolute_url(self, href):
		"""Resolve *href* against ``self.home`` and return the absolute URL."""
		# Imported here so the class works with either standard-library
		# layout without depending on the file's top-level imports.
		try:
			from urllib.parse import urljoin  # Python 3
		except ImportError:
			from urlparse import urljoin  # Python 2
		# Plain concatenation mangles absolute hrefs and hrefs without a
		# leading slash; urljoin handles both.
		return urljoin(self.home, href)

	def parse_category(self, html):
		"""Append the category URLs found in *html* to ``self.catetories``.

		Args:
			html: markup of the home page. A falsy value (empty string or
				None) is ignored.
		"""
		# What down_html returns on failure is not visible from here; skip
		# the parse rather than hand BeautifulSoup a non-document.
		if not html:
			return

		soup = BeautifulSoup(html, 'lxml')
		div = soup.find('div', class_='nav_com')
		if div is None:
			return

		a_tags = div.find_all('a')
		print(len(a_tags))
		for a_tag in a_tags:
			# Anchors without an href cannot be turned into a category URL.
			href = a_tag.get('href')
			if not href:
				continue
			self.catetories.append(self._absolute_url(href))
		print(self.catetories)

	def visit_category(self):
		"""Download every known category page and harvest its author links."""
		for category in self.catetories:
			html = self.download.down_html(category, save=True)
			self.parse_blog_user(html)
		print(self.blog_user)
		print(len(self.blog_user))

	def parse_blog_user(self, html):
		"""Record the author links found in one category page.

		Each new link is appended to ``self.blog_user`` and put on
		``self.queue``. Links that were already recorded are skipped, so an
		author listed under several categories is queued only once.

		Args:
			html: markup of a category page. A falsy value is ignored.
		"""
		print('parse blog user')
		if not html:
			return

		soup = BeautifulSoup(html, 'lxml')
		ul = soup.find('ul', class_='feedlist_mod')
		if ul is None:
			return

		for dd in ul.find_all('dd', class_='name'):
			a_tag = dd.find('a')
			href = a_tag.get('href') if a_tag is not None else None
			# Skip entries with no usable link and links seen before.
			if not href or href in self.blog_user:
				continue
			self.blog_user.append(href)
			self.queue.put(href)

	def start(self):
		"""Run the crawl, drain the author queue and print how many were taken."""
		html = self.visit_home()
		self.parse_category(html)
		self.visit_category()

		i = 0
		while not self.queue.empty():
			# TODO: download the articles of the dequeued blog; for now the
			# link is only removed from the queue and counted.
			self.queue.get()
			i += 1
		# Same text the original two-item print statement produced.
		print('%s %s' % ('run times ', i))