Пример #1
0
def dealbooklist():
	start = time.clock()
	putplace = 'books'
	# 判断存放位置是否存在
	if os.path.exists(putplace):
		pass
	else: # 否则新建
		print('新建图书提取存放excel处:'+putplace)
		os.makedirs(putplace)
	taglist = readexcel('web/booktag.xlsx') # 读取标签列表
	del taglist[0]
	# 对于每个标签
	for tag in taglist:

		# 图书按照标签存放于文件夹中
		mulu=putplace+'/'+tag[0]
		if os.path.exists(mulu):
			pass
		else:
			os.makedirs(mulu)

		excelpath = mulu+'/'+tag[1]+'.xlsx'
		# 存在处理过的excel文件则跳过
		if os.path.exists(excelpath):
			print(excelpath+'已经存在')
			continue

		tagbooks = [] # 该标签所有书存放处
		path = 'web/'+tag[0]+'/'+tag[1] # 构造读取文件夹入口
		print('本地提取:'+path)
		# 查找目录下已经抓取的Html
		files = listfiles(path)
		# 遍历分析
		for i in files:
			file = path+'/'+i
			print('提取:'+file)
			content = open(file,'rb').read()
			book = bookdeal.manybook(content) # 提取图书列表
			for j in book: # 重新包装图书
				# print('提取:'+','.join(j))
				tagbooks.append(j)

		# 将信息写入本地文件中
		booksattr=['书籍名','URL入口','图片地址','出版信息','评价星数']
		tagbooks.insert(0,booksattr)
		writeexcel(excelpath,tagbooks)
		print('写入成功:'+excelpath)
	end = time.clock()
	print("提取图书列表总共运行时间 : %.03f 秒" %(end-start))
Пример #2
0
def dealbooklist():
    start = time.clock()
    putplace = 'books'
    # 判断存放位置是否存在
    if os.path.exists(putplace):
        pass
    else:  # 否则新建
        print('新建图书提取存放excel处:' + putplace)
        os.makedirs(putplace)
    taglist = readexcel('web/booktag.xlsx')  # 读取标签列表
    del taglist[0]
    # 对于每个标签
    for tag in taglist:

        # 图书按照标签存放于文件夹中
        mulu = putplace + '/' + tag[0]
        if os.path.exists(mulu):
            pass
        else:
            os.makedirs(mulu)

        excelpath = mulu + '/' + tag[1] + '.xlsx'
        # 存在处理过的excel文件则跳过
        if os.path.exists(excelpath):
            print(excelpath + '已经存在')
            continue

        tagbooks = []  # 该标签所有书存放处
        path = 'web/' + tag[0] + '/' + tag[1]  # 构造读取文件夹入口
        print('本地提取:' + path)
        # 查找目录下已经抓取的Html
        files = listfiles(path)
        # 遍历分析
        for i in files:
            file = path + '/' + i
            print('提取:' + file)
            content = open(file, 'rb').read()
            book = bookdeal.manybook(content)  # 提取图书列表
            for j in book:  # 重新包装图书
                # print('提取:'+','.join(j))
                tagbooks.append(j)

        # 将信息写入本地文件中
        booksattr = ['书籍名', 'URL入口', '图片地址', '出版信息', '评价星数']
        tagbooks.insert(0, booksattr)
        writeexcel(excelpath, tagbooks)
        print('写入成功:' + excelpath)
    end = time.clock()
    print("提取图书列表总共运行时间 : %.03f 秒" % (end - start))
Пример #3
0
def catchbooklist(requreip = 0, v=0, lockprefix= 'lock'):
	"""
	输入参数为:
	是否使用代理,默认否
	是否限制爬虫速度,默认否,时间为1秒仿人工
	文件加锁后缀
	"""
	# 进行计时
	start = time.clock()
	taglist = readexcel('web/booktag.xlsx') # 读取标签
	daili0 = daili()   # 代理IP数组
	changeip = 0  # 代理ip下标
	# 循环对标签进行抓取
	for i in range(1,len(taglist)):
		kinds = taglist[i][0] # 大分类
		tagname = taglist[i][1] # 标签名
		tag = urllib.parse.quote(tagname) # url中文转码
		mulu0 = 'web/'+kinds
		# 存在大分类文件夹则跳过
		if os.path.exists(mulu0):
			pass
		else: # 否则新建
			print('新建大分类:'+mulu0)
			os.makedirs(mulu0)

		mulu = mulu0+'/'+tagname
		# 存在标签文件夹则跳过
		if os.path.exists(mulu):
			pass
		else: # 否则新建方便网页存放
			print('新建标签文件夹'+mulu)
			os.makedirs(mulu)

		# 网络中断后重新抓取时判断是否加锁
		ok = listfiles(mulu, '.'+lockprefix)
		if ok:
			print('类别:'+kinds+'----标签:'+tagname+'----已经抓完') # 抓完
			continue
		url = 'http://www.douban.com/tag/'+tag+'/book?start=' 	# 基础网址
		pagesize = 15									# 每页15本
		i = 0   # 翻页助手
		while(True):
			# 需要爬取的网页
			site = url+str(i*pagesize)

			# 开始爬取
			# 构造文件名称
			# web/小说/0.html
			src = mulu+'/'+str(i*15)+'.html'

			# 判断文件是否存在,存在则不抓取节省时间
			if(os.path.exists(src) == True):
				pass
			else:
				# 写入本地文件
				print('准备抓取:'+site+'类别:'+kinds+'----标签:'+tagname)
				iprefuse = 1  # 如果抓取成功设为0
				# 如果抓取出错那重新抓取
				while iprefuse:
					try:
						daili1= daili0[changeip]  # 代理ip
						# 爬虫速度控制
						if v:
							a = time.clock()
							time.sleep(v)
							b = time.clock()
							print('时间暂停:'+str(b-a))
						# 不需要代理
						if requreip==0:
							webcontent = getHtml(site).encode('utf-8') # 爬取
							# print(webcontent.decode('utf-8','ignore'))
							notnull = re.search(r'<dl>',webcontent.decode('utf-8','ignore')) # 匹配看是否抓取到末页
							iprefuse = 0 # 抓完设置0
						else: # 需要代理
							print('代理:'+daili1)
							webcontent = getBinaryHtml(site, daili1)
							# print(webcontent.decode('utf-8','ignore'))
							notnull = re.search(r'<dl>',webcontent.decode('utf-8','ignore'))
							print(notnull)
							iprefuse = 0
					except Exception as e:
						print(e)
						if requreip:
							changeip = changeip+1 # 更换ip下标
							if changeip==len(daili0): # 到达ip数组末循环再来
								changeip = 0
							print('更换代理:'+daili0[changeip])
						else:
							print("IP被封")
							raise
							return
						# break

				# 如果抓不到<dl>标签,证明已经抓取完
				if notnull:
					webfile = open(src, 'wb')
					webfile.write(webcontent)
					webfile.close()
					print("已经抓取:"+site+'类别:'+kinds+'----标签:'+tagname)
				else:
					lock = open(src.replace('html',lockprefix),'w') # 加锁证明抓完
					# 日期:http://blog.csdn.net/caisini_vc/article/details/5619954
					finish = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
					lock.write('抓取完成时间:'+finish)
					print("抓取完毕:"+tagname)
					break
			i =i + 1  # 加页
	# 计时
	end = time.clock()
	print("爬取总共运行时间 : %.03f 秒" %(end-start))
Пример #4
0
def mergeboolist():
	start = time.clock()
	taglist = readexcel('web/booktag.xlsx') # 读取标签列表
	del taglist[0]
	database = Mysql(host="localhost", user="******", pwd="6833066", db="doubanbook")
	for tag in taglist: # 遍历所有标签
		kind = tag[0] # 大类
		tagname = tag[1] # 标签
		excelpath = 'books/'+kind+'/'+tagname+'.xlsx' # 本地文件
		try:
			datas = readexcel(excelpath)
		except Exception as e:
			print(e)
			continue
		del datas[0] # 去掉标题
		#print(datas)
		# 提取图书插入数据库
		for data in datas:
			bookname = data[0].replace("'","\\'").replace('"','\\"')
			bookurl = data[1].replace("'","\\'").replace('"','\\"')
			bookimage = data[2].replace("'","\\'").replace('"','\\"')
			bookno = bookurl.split('/')[-2].replace("'","\\'").replace('"','\\"')
			try:
				bookinfo = data[3].replace("'","\\'").replace('"','\\"')
			except:
				bookinfo = ''
				pass
			try:
				bookstar = data[4]
			except:
				bookstar = '0'
				pass
			# select * from `book` where `bookno`='dc'
			searchsql1 = "select * from `book` where `bookno`='"+bookno+"'"
			print(searchsql1)
			try:
				isexist1 = database.ExecQuery(searchsql1)
			except Exception as e:
				print(e)
				pass
			# 如果图书记录存在,插Booktag表
			if isexist1:
				print(bookname+':'+bookurl+'已经存在')

			else:
				insertbooksql = "INSERT INTO `book` (`bookname`, `bookurl`, `bookimg`, `bookinfo`, `bookstar`, `bookno`) VALUES ('" \
							"{bookname}', '{bookurl}', '{bookimg}', '{bookinfo}', '{bookstar}', '{bookno}')"
				insert1 = insertbooksql.format(bookname=bookname, bookurl=bookurl, bookimg=bookimage, bookinfo=bookinfo, bookstar=bookstar, bookno=bookno)
				print(insert1)
				try:
					database.ExecNonQuery(insert1)
				except Exception as e:
					print(e)
					pass
			# 如果图书标签存在,则不插入
			searchsql = "select * from `booktag` where `bookno`='{bookno}' and `booktag`='{booktag}' and `bookkind`='{bookkind}'"
			searchsql2 = searchsql.format(bookno=bookno,booktag=tagname,bookkind=kind)
			print(searchsql2)
			try:
				isexist2 = database.ExecQuery(searchsql2)
			except Exception as e:
				print(e)
				pass
			if isexist2.__len__()==0:
				inserttag = "INSERT INTO `booktag`(`bookname`,`bookno`,`booktag`,`bookkind`) VALUES ('" \
							"{bookname}', '{bookno}', '{booktag}', '{bookkind}')"
				insert2 = inserttag.format(bookname=bookname, bookno=bookno, booktag=tagname, bookkind=kind)
				print(insert2)
				try:
					database.ExecNonQuery(insert2)
				except Exception as e:
					print(e)
					pass
			print('-'*100)
	print("插入数据库结束")
	end = time.clock()
	print("合并图书列表进数据库总共运行时间 : %.03f 秒" %(end-start))
Пример #5
0
def catchbooklist(requreip=0, v=0, lockprefix='lock'):
    """
	输入参数为:
	requireip 是否使用代理,默认否
	v 是否限制爬虫速度,默认否,时间为1秒仿人工
	lockprefix 文件加锁后缀
	"""
    # 进行计时
    start = time.clock()
    taglist = readexcel('web/booktag.xlsx')  # 读取标签
    daili0 = daili()  # 代理IP数组
    changeip = 0  # 代理ip下标
    # 循环对标签进行抓取
    for i in range(1, len(taglist)):
        kinds = taglist[i][0]  # 大分类
        tagname = taglist[i][1]  # 标签名
        tag = urllib.parse.quote(tagname)  # url中文转码
        mulu0 = 'web/' + kinds
        # 存在大分类文件夹则跳过
        if os.path.exists(mulu0):
            pass
        else:  # 否则新建
            print('新建大分类:' + mulu0)
            os.makedirs(mulu0)

        mulu = mulu0 + '/' + tagname
        # 存在标签文件夹则跳过
        if os.path.exists(mulu):
            pass
        else:  # 否则新建方便网页存放
            print('新建标签文件夹' + mulu)
            os.makedirs(mulu)

        # 网络中断后重新抓取时判断是否加锁
        ok = listfiles(mulu, '.' + lockprefix)
        if ok:
            print('类别:' + kinds + '----标签:' + tagname + '----已经抓完')  # 抓完
            continue
        url = 'http://www.douban.com/tag/' + tag + '/book?start='  # 基础网址
        pagesize = 15  # 每页15本
        i = 0  # 翻页助手
        while (True):
            # 需要爬取的网页
            site = url + str(i * pagesize)

            # 开始爬取
            # 构造文件名称
            # web/小说/0.html
            src = mulu + '/' + str(i * 15) + '.html'

            # 判断文件是否存在,存在则不抓取节省时间
            if (os.path.exists(src) == True):
                pass
            else:
                # 写入本地文件
                print('准备抓取:' + site + '类别:' + kinds + '----标签:' + tagname)
                iprefuse = 1  # 如果抓取成功设为0
                # 如果抓取出错那重新抓取
                while iprefuse:
                    try:
                        daili1 = daili0[changeip]  # 代理ip
                        # 爬虫速度控制
                        if v:
                            a = time.clock()
                            time.sleep(v)
                            b = time.clock()
                            print('时间暂停:' + str(v))
                            print('真实时间暂停(Unix CPU时间,Windows 真实时间):' +
                                  str(b - a))
                        # 不需要代理
                        if requreip == 0:
                            # webcontent = getHtml(site).encode('utf-8') # 爬取
                            webcontent = getHtml(site).encode('utf-8')  # 爬取
                            # print(webcontent.decode('utf-8','ignore'))
                            notnull = re.search(r'<dl>',
                                                webcontent.decode(
                                                    'utf-8',
                                                    'ignore'))  # 匹配看是否抓取到末页
                            iprefuse = 0  # 抓完设置0
                        else:  # 需要代理
                            print('代理:' + daili1)
                            webcontent = getBinaryHtml(site, daili1)
                            # print(webcontent.decode('utf-8','ignore'))
                            notnull = re.search(
                                r'<dl>', webcontent.decode('utf-8', 'ignore'))
                            print(notnull)
                            iprefuse = 0
                    except Exception as e:
                        print(e)
                        if requreip:
                            changeip = changeip + 1  # 更换ip下标
                            if changeip == len(daili0):  # 到达ip数组末循环再来
                                changeip = 0
                            print('更换代理:' + daili0[changeip])
                        else:
                            print("IP被封")
                            raise
                            return
                        # break

                # 如果抓不到<dl>标签,证明已经抓取完
                if notnull:
                    webfile = open(src, 'wb')
                    webfile.write(webcontent)
                    webfile.close()
                    print("已经抓取:" + site + '类别:' + kinds + '----标签:' + tagname)
                else:
                    lock = open(src.replace('html', lockprefix), 'w')  # 加锁证明抓完
                    # 日期:http://blog.csdn.net/caisini_vc/article/details/5619954
                    finish = time.strftime("%Y-%m-%d %H:%M:%S",
                                           time.localtime())
                    lock.write('抓取完成时间:' + finish)
                    print("抓取完毕:" + tagname)
                    break
            i = i + 1  # 加页
    # 计时
    end = time.clock()
    print("爬取总共运行时间 : %.03f 秒" % (end - start))
Пример #6
0
def mergeboolist():
    start = time.clock()
    taglist = readexcel('web/booktag.xlsx')  # 读取标签列表
    del taglist[0]
    database = Mysql(host="localhost",
                     user="******",
                     pwd="6833066",
                     db="doubanbook")
    for tag in taglist:  # 遍历所有标签
        kind = tag[0]  # 大类
        tagname = tag[1]  # 标签
        excelpath = 'books/' + kind + '/' + tagname + '.xlsx'  # 本地文件
        try:
            datas = readexcel(excelpath)
        except Exception as e:
            print(e)
            continue
        del datas[0]  # 去掉标题
        #print(datas)
        # 提取图书插入数据库
        for data in datas:
            bookname = data[0].replace("'", "\\'").replace('"', '\\"')
            bookurl = data[1].replace("'", "\\'").replace('"', '\\"')
            bookimage = data[2].replace("'", "\\'").replace('"', '\\"')
            bookno = bookurl.split('/')[-2].replace("'",
                                                    "\\'").replace('"', '\\"')
            try:
                bookinfo = data[3].replace("'", "\\'").replace('"', '\\"')
            except:
                bookinfo = ''
                pass
            try:
                bookstar = data[4]
            except:
                bookstar = '0'
                pass
            # select * from `book` where `bookno`='dc'
            searchsql1 = "select * from `book` where `bookno`='" + bookno + "'"
            print(searchsql1)
            try:
                isexist1 = database.ExecQuery(searchsql1)
            except Exception as e:
                print(e)
                pass
            # 如果图书记录存在,插Booktag表
            if isexist1:
                print(bookname + ':' + bookurl + '已经存在')

            else:
                insertbooksql = "INSERT INTO `book` (`bookname`, `bookurl`, `bookimg`, `bookinfo`, `bookstar`, `bookno`) VALUES ('" \
                   "{bookname}', '{bookurl}', '{bookimg}', '{bookinfo}', '{bookstar}', '{bookno}')"
                insert1 = insertbooksql.format(bookname=bookname,
                                               bookurl=bookurl,
                                               bookimg=bookimage,
                                               bookinfo=bookinfo,
                                               bookstar=bookstar,
                                               bookno=bookno)
                print(insert1)
                try:
                    database.ExecNonQuery(insert1)
                except Exception as e:
                    print(e)
                    pass
            # 如果图书标签存在,则不插入
            searchsql = "select * from `booktag` where `bookno`='{bookno}' and `booktag`='{booktag}' and `bookkind`='{bookkind}'"
            searchsql2 = searchsql.format(bookno=bookno,
                                          booktag=tagname,
                                          bookkind=kind)
            print(searchsql2)
            try:
                isexist2 = database.ExecQuery(searchsql2)
            except Exception as e:
                print(e)
                pass
            if isexist2.__len__() == 0:
                inserttag = "INSERT INTO `booktag`(`bookname`,`bookno`,`booktag`,`bookkind`) VALUES ('" \
                   "{bookname}', '{bookno}', '{booktag}', '{bookkind}')"
                insert2 = inserttag.format(bookname=bookname,
                                           bookno=bookno,
                                           booktag=tagname,
                                           bookkind=kind)
                print(insert2)
                try:
                    database.ExecNonQuery(insert2)
                except Exception as e:
                    print(e)
                    pass
            print('-' * 100)
    print("插入数据库结束")
    end = time.clock()
    print("合并图书列表进数据库总共运行时间 : %.03f 秒" % (end - start))