os.makedirs(dirName.decode('utf8')) # 创建多层目录 fileResult = re.findall(filePattern, data) for fileName, fileID in fileResult: fileURL = fileBaseUrl + fileID if fileURL in downloaded: continue else: downloaded.add(fileURL) print fileName + ' downloading...................................................' # 在 downloadSingleFile 函数中处理异常 # 假如下载失败会再次尝试,最多三次 maxTry = 3 for i in xrange(maxTry): tryBest = False if i < maxTry - 1 else True if downloadSingleFile.downLoadSingleFile( fileURL, fileName, dirName, downloadLog, tryBest): break print '==========retrying to download file %s of url %s' % ( fileName, fileURL) # 控制爬虫爬取速度,爬完一个文件睡眠一定时间 #time.sleep(random.randint(1,10)) # 控制爬虫爬取速度,爬完一个页面的文件睡眠一定时间 #time.sleep(random.randint(10,20)) # 打印出下载某一类别所访问过的页面 """ for visit in visited: print visit """ return True
# NOTE(review): collapsed fragment — the top of the worker thread's run() method
# (including the page fetch that binds `data`) is outside this view. Layout
# restored; code tokens unchanged.
fileResult = re.findall(FilePattern, data)
for later in fileResult:
    fileURL = FileBaseURL+later
    lock.acquire()  # take the lock before reading/mutating the shared DOWNLOADED list
    try:
        if fileURL in DOWNLOADED:
            continue  # another thread already claimed this URL (finally still releases)
        else:
            DOWNLOADED.append(fileURL)
    finally:
        lock.release()
    print self.name + ' is downloading' + fileURL+'.......'
    downloadSingleFile.downLoadSingleFile(fileURL, DIR, DOWNLOADLOG)
# Queue.join() blocks until task_done() has been reported for every item taken
# from QUEUE
QUEUE.task_done()


def downloadSingleCate(caterotyID,downloadDIR):
    """Configure the download of one dictionary category.

    Builds the category's download parameters from its ID, sets the download
    directory, and initialises this category's queue; worker threads pick up the
    changes by reading the mutated module globals.

    :param caterotyID: ID of the lexicon category to download, used to build the
        correct url  (NOTE(review): "caterotyID" looks like a typo for categoryID,
        but renaming would change the interface)
    :param downloadDIR: directory the downloaded lexicons are stored in
    :return: None
    """
    global CATEID, DIR, PageBaseURL, FileBaseURL, PagePattern, FilePattern, QUEUE
    CATEID = caterotyID
    DIR = downloadDIR
    # NOTE(review): function body continues past the end of this fragment
# 查找并下载文件 # 指定下载目录,目录不存在时自动创建,需要在前面加上u,指定编码为utf-8 if not os.path.exists(dir.decode('utf8')): # dir 为str类型,但是创建目录# 必须要用 os.makedirs(dir.decode('utf8')) # 创建多层目录 fileResult = re.findall(filePattern, data) for later in fileResult: fileURL = fileBaseUrl+later if fileURL in downloaded: continue else: downloaded.append(fileURL) print fileURL+' downloading.......' downloadSingleFile.downLoadSingleFile(fileURL, dir, logFile) for visit in visited: print visit if __name__ == '__main__': start = time.time() bigCateDict, smallCateDict = getCategory.getSogouDictCate() baseDir = 'G:/搜狗词库/单线程下载' logFile = baseDir+'/download.log' for i in bigCateDict: for j in smallCateDict[i]: downloadDir = baseDir+'/%s/%s/' %(bigCateDict[i],smallCateDict[i][j]) downloadSingleCate(int(j), downloadDir, logFile) print 'process time:%s' % (time.time() - start)
# NOTE(review): collapsed fragment — near-duplicate of another chunk in this file;
# the top of the worker thread's run() method (which binds `data`) is outside this
# view. Layout restored; code tokens unchanged.
fileResult = re.findall(FilePattern, data)
for later in fileResult:
    fileURL = FileBaseURL + later
    lock.acquire()  # take the lock before reading/mutating the shared DOWNLOADED list
    try:
        if fileURL in DOWNLOADED:
            continue  # another thread already claimed this URL (finally still releases)
        else:
            DOWNLOADED.append(fileURL)
    finally:
        lock.release()
    print self.name + ' is downloading' + fileURL + '.......'
    downloadSingleFile.downLoadSingleFile(fileURL, DIR, DOWNLOADLOG)
# Queue.join() blocks until task_done() has been reported for every item taken
# from QUEUE
QUEUE.task_done()


def downloadSingleCate(caterotyID, downloadDIR):
    """Configure the download of one dictionary category.

    Builds the category's download parameters from its ID, sets the download
    directory, and initialises this category's queue; worker threads pick up the
    changes by reading the mutated module globals.

    :param caterotyID: ID of the lexicon category to download, used to build the
        correct url
    :param downloadDIR: directory the downloaded lexicons are stored in
    :return: None
    """
    global CATEID, DIR, PageBaseURL, FileBaseURL, PagePattern, FilePattern, QUEUE
    CATEID = caterotyID
    # NOTE(review): function body continues past the end of this fragment
# NOTE(review): collapsed fragment — the enclosing run() method and the loop that
# binds fileName/fileID are outside this view. Layout restored; code tokens
# unchanged.
fileURL = self.fileBaseURL + fileID
THREAD_LOCK.acquire()  # take the lock before reading/mutating the shared DOWNLOADED set
try:
    if fileURL in DOWNLOADED:
        continue  # another thread already claimed this URL (finally still releases)
    else:
        DOWNLOADED.add(fileURL)
finally:
    THREAD_LOCK.release()
print self.name + ' is downloading' + fileName + '.......'
# guard against transient 500/502 errors: retry, at most three attempts
maxTry = 3
for m in xrange(maxTry):
    # tryBest is set only on the final attempt — presumably so the downloader
    # records the failure; confirm in downloadSingleFile
    tryBest = False if m < maxTry - 1 else True
    if downloadSingleFile.downLoadSingleFile(fileURL, fileName,
                                             DOWNLOAD_DIR, DOWNLOAD_LOG, tryBest):
        break
    print '==========retrying to download file %s of url %s'%(fileName, fileURL)
# PAGE_QUEUE.join() blocks until task_done() has been reported for every item
# taken from PAGE_QUEUE
PAGE_QUEUE.task_done()


def getCategoryPages(caterotyID,downloadDIR):
    """Find how many pages a category has and enqueue them all.

    Reads the category's initial page to get the total page count, then puts
    every page into PAGE_QUEUE for the worker threads to download.

    :param caterotyID: ID of the lexicon category to download, used to build the
        correct url
    :param downloadDIR: directory the downloaded lexicons are stored in
    :return:
    """
    global CATEID, DOWNLOAD_DIR, PAGE_BASE_URL, THREAD_LOCK
    CATEID = caterotyID
    # NOTE(review): function body continues past the end of this fragment
os.makedirs(dirName.decode('utf8')) # 创建多层目录 fileResult = re.findall(filePattern, data) for fileName, fileID in fileResult: fileURL = fileBaseUrl+fileID if fileURL in downloaded: continue else: downloaded.add(fileURL) print fileName+' downloading...................................................' # 在 downloadSingleFile 函数中处理异常 # 假如下载失败会再次尝试,最多三次 maxTry = 3 for i in xrange(maxTry): tryBest = False if i < maxTry-1 else True if downloadSingleFile.downLoadSingleFile(fileURL, fileName, dirName, downloadLog, tryBest): break print '==========retrying to download file %s of url %s'%(fileName, fileURL) # 控制爬虫爬取速度,爬完一个文件睡眠一定时间 #time.sleep(random.randint(1,10)) # 控制爬虫爬取速度,爬完一个页面的文件睡眠一定时间 #time.sleep(random.randint(10,20)) # 打印出下载某一类别所访问过的页面 """ for visit in visited: print visit """ return True
THREAD_LOCK.acquire() # 获取锁来修改DOWNLOADED内容 try: if fileURL in DOWNLOADED: continue else: DOWNLOADED.add(fileURL) finally: THREAD_LOCK.release() print self.name + ' is downloading' + fileName + '.......' # 防止500,502错误,最大尝试三次 maxTry = 3 for m in xrange(maxTry): tryBest = False if m < maxTry - 1 else True if downloadSingleFile.downLoadSingleFile( fileURL, fileName, DOWNLOAD_DIR, DOWNLOAD_LOG, tryBest): break print '==========retrying to download file %s of url %s' % ( fileName, fileURL) PAGE_QUEUE.task_done( ) # PAGE_QUEUE.join()阻塞直到所有任务完成,也就是说要收到从 PAGE_QUEUE 中取出的每个item的task_done消息 def getCategoryPages(caterotyID, downloadDIR): """通过类别的初始页面得到该类别的总页数,并将所有的页数放到 PAGE_QUEUE 中供所有线程下载 :param caterotyID: 下载的词库类型的 ID,用于找到正确 url :param downloadDIR: 下载词库的存放目录 :return:
# 查找并下载文件 # 指定下载目录,目录不存在时自动创建,需要在前面加上u,指定编码为utf-8 if not os.path.exists(dir.decode('utf8')): # dir 为str类型 os.makedirs(dir.decode('utf8')) # 创建多层目录 fileResult = re.findall(filePattern, data) for fileID, fileName in fileResult: fileURL = fileBaseUrl+fileID if fileURL in downloaded: continue else: downloaded.add(fileURL) print fileName+' downloading.......' try: downloadSingleFile.downLoadSingleFile(fileURL, fileName, dir, downloadLog) except: with open(downloadLog, 'a') as f: f.write(fileURL+' is not DOWNLOADED successfully\n') # 打印出下载某一类别所访问过的页面 for visit in visited: print visit if __name__ == '__main__': start = time.time() ''' # test data downloadDir = 'G:/各大输入法词库/百度/' downloadLog = 'G:/各大输入法词库/百度/download.log'