示例#1
0
def booktag(url_content, path='web/booktag.xlsx'):
    """Parse a tag-overview page and write the extracted tag list to Excel.

    Args:
        url_content: Raw HTML of the tag overview page.
        path: Destination xlsx path (default ``'web/booktag.xlsx'``).

    Each category section is expected to contain one title anchor
    (``a.tag-title-wrapper``) followed by tag anchors paired with ``<b>``
    elements holding the book count — TODO confirm against a live page.
    """
    soup = BeautifulSoup(url_content, 'html.parser')  # parse the page once
    sections = soup.select('div#content div.article div div')
    taglist = [['标签类别', '标签名', '链接', '图书数']]
    for section in sections:
        # Search the Tag directly — re-parsing str(section) into a new
        # BeautifulSoup object (as the original did) is redundant.
        title = section.find('a', attrs={'class': 'tag-title-wrapper'})
        category = title['name']  # renamed: original shadowed builtin `type`
        anchors = section.find_all('a')  # find_all: findAll is a legacy alias
        counts = section.find_all('b')
        # anchors[0] is the section title, so anchors[i + 1] pairs with
        # counts[i]; iterating over counts bounds us to complete pairs.
        for i, count in enumerate(counts):
            tag = anchors[i + 1].string       # tag name
            taglink = anchors[i + 1]['href']  # tag link
            taglist.append([category, tag, taglink, count.string])
    print(taglist)
    writeexcel(path, taglist)
    print("写入EXCEL成功")
示例#2
0
def booktag(url_content, path='web/booktag.xlsx'):
    """Parse a tag-overview page and write the extracted tag list to Excel.

    Args:
        url_content: Raw HTML of the tag overview page.
        path: Destination xlsx path (default ``'web/booktag.xlsx'``).

    Each category section is expected to contain one title anchor
    (``a.tag-title-wrapper``) followed by tag anchors paired with ``<b>``
    elements holding the book count — TODO confirm against a live page.
    """
    soup = BeautifulSoup(url_content, 'html.parser')  # parse the page once
    sections = soup.select('div#content div.article div div')
    taglist = [['标签类别', '标签名', '链接', '图书数']]
    for section in sections:
        # Search the Tag directly — re-parsing str(section) into a new
        # BeautifulSoup object (as the original did) is redundant.
        title = section.find('a', attrs={'class': 'tag-title-wrapper'})
        category = title['name']  # renamed: original shadowed builtin `type`
        anchors = section.find_all('a')  # find_all: findAll is a legacy alias
        counts = section.find_all('b')
        # anchors[0] is the section title, so anchors[i + 1] pairs with
        # counts[i]; iterating over counts bounds us to complete pairs.
        for i, count in enumerate(counts):
            tag = anchors[i + 1].string       # tag name
            taglink = anchors[i + 1]['href']  # tag link
            taglist.append([category, tag, taglink, count.string])
    print(taglist)
    writeexcel(path, taglist)
    print("写入EXCEL成功")
示例#3
0
def dealbooklist():
    """Extract book lists from locally cached HTML pages and write one
    Excel file per tag under ``books/<category>/<tag>.xlsx``.

    Reads the tag list from ``web/booktag.xlsx``, skips tags whose output
    file already exists, and reports total elapsed time at the end.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for wall-clock interval timing.
    start = time.perf_counter()
    putplace = 'books'
    # Create the output root if it does not exist yet.
    if not os.path.exists(putplace):
        print('新建图书提取存放excel处:' + putplace)
        os.makedirs(putplace)
    taglist = readexcel('web/booktag.xlsx')  # read the tag list
    del taglist[0]  # drop the header row
    for tag in taglist:
        # Books are grouped into one folder per tag category.
        mulu = putplace + '/' + tag[0]
        os.makedirs(mulu, exist_ok=True)

        excelpath = mulu + '/' + tag[1] + '.xlsx'
        # Skip tags that were already processed on a previous run.
        if os.path.exists(excelpath):
            print(excelpath + '已经存在')
            continue

        tagbooks = []  # accumulator for every book under this tag
        path = 'web/' + tag[0] + '/' + tag[1]  # cached-HTML directory
        print('本地提取:' + path)
        # Find the HTML files already fetched for this tag.
        files = listfiles(path)
        for name in files:
            file = path + '/' + name
            print('提取:' + file)
            # `with` closes the handle deterministically; the original
            # open(...).read() leaked a file descriptor per page.
            with open(file, 'rb') as f:
                content = f.read()
            book = bookdeal.manybook(content)  # extract the book rows
            tagbooks.extend(book)

        # Write the collected rows to disk, header row first.
        booksattr = ['书籍名', 'URL入口', '图片地址', '出版信息', '评价星数']
        tagbooks.insert(0, booksattr)
        writeexcel(excelpath, tagbooks)
        print('写入成功:' + excelpath)
    end = time.perf_counter()
    print("提取图书列表总共运行时间 : %.03f 秒" % (end - start))
示例#4
0
def dealbooklist():
    """Extract book lists from locally cached HTML pages and write one
    Excel file per tag under ``books/<category>/<tag>.xlsx``.

    Reads the tag list from ``web/booktag.xlsx``, skips tags whose output
    file already exists, and reports total elapsed time at the end.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for wall-clock interval timing.
    start = time.perf_counter()
    putplace = 'books'
    # Create the output root if it does not exist yet.
    if not os.path.exists(putplace):
        print('新建图书提取存放excel处:' + putplace)
        os.makedirs(putplace)
    taglist = readexcel('web/booktag.xlsx')  # read the tag list
    del taglist[0]  # drop the header row
    for tag in taglist:
        # Books are grouped into one folder per tag category.
        mulu = putplace + '/' + tag[0]
        os.makedirs(mulu, exist_ok=True)

        excelpath = mulu + '/' + tag[1] + '.xlsx'
        # Skip tags that were already processed on a previous run.
        if os.path.exists(excelpath):
            print(excelpath + '已经存在')
            continue

        tagbooks = []  # accumulator for every book under this tag
        path = 'web/' + tag[0] + '/' + tag[1]  # cached-HTML directory
        print('本地提取:' + path)
        # Find the HTML files already fetched for this tag.
        files = listfiles(path)
        for name in files:
            file = path + '/' + name
            print('提取:' + file)
            # `with` closes the handle deterministically; the original
            # open(...).read() leaked a file descriptor per page.
            with open(file, 'rb') as f:
                content = f.read()
            book = bookdeal.manybook(content)  # extract the book rows
            tagbooks.extend(book)

        # Write the collected rows to disk, header row first.
        booksattr = ['书籍名', 'URL入口', '图片地址', '出版信息', '评价星数']
        tagbooks.insert(0, booksattr)
        writeexcel(excelpath, tagbooks)
        print('写入成功:' + excelpath)
    end = time.perf_counter()
    print("提取图书列表总共运行时间 : %.03f 秒" % (end - start))
示例#5
0
def booktag(url_content, path='web/booktag.xlsx'):
    """Parse a tag-overview page and write category/name/link rows to Excel.

    Args:
        url_content: Raw HTML of the tag overview page.
        path: Destination xlsx path (default ``'web/booktag.xlsx'``).
    """
    soup = BeautifulSoup(url_content, 'html.parser')  # parse the page once
    sections = soup.select('div#content div.article div div')
    taglist = [['标签类别', '标签名', '链接']]
    for section in sections:
        # Search the Tag directly — re-parsing str(section) into a new
        # BeautifulSoup object (as the original did) is redundant.
        title = section.find('a', attrs={'class': 'tag-title-wrapper'})
        category = title['name']  # renamed: original shadowed builtin `type`
        # find_all: findAll is a legacy bs4 alias.
        for anchor in section.find_all('a', attrs={'class': 'tag'}):
            taglist.append([category, anchor.string, anchor['href']])
    print(taglist)
    writeexcel(path, taglist)
    print("写入EXCEL成功")