os.makedirs(dirName.decode('utf8'))  # create the directory tree (may be multi-level)

        fileResult = re.findall(filePattern, data)
        for fileName, fileID in fileResult:
            fileURL = fileBaseUrl + fileID
            if fileURL in downloaded:
                continue
            else:
                downloaded.add(fileURL)
                print fileName + ' downloading...................................................'
                # exceptions are handled inside downloadSingleFile;
                # if a download fails it is retried, at most three times
                maxTry = 3
                for i in xrange(maxTry):
                    tryBest = False if i < maxTry - 1 else True
                    if downloadSingleFile.downLoadSingleFile(
                            fileURL, fileName, dirName, downloadLog, tryBest):
                        break
                    print '==========retrying to download file %s of url %s' % (
                        fileName, fileURL)
                # throttle the crawler: sleep for a while after each downloaded file
                #time.sleep(random.randint(1,10))

        # throttle the crawler: sleep for a while after finishing a page of files
        #time.sleep(random.randint(10,20))

    # print the pages visited while downloading this category
    """
    for visit in visited:
        print visit
    """
    return True
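The retry loop above expects downloadSingleFile.downLoadSingleFile to handle its own exceptions, return True on success and False on failure, and only report the error on the final ("tryBest") attempt. The project's implementation is not included in this excerpt; a minimal sketch of a compatible helper, assuming urllib2 and the same parameter order, could look like the following (everything in it is an assumption, not the project's code):

# -*- coding: utf-8 -*-
# Hypothetical sketch of a downLoadSingleFile helper compatible with the retry
# loop above: it handles its own exceptions, returns True on success and False
# on failure, and only writes to the log on the final ("tryBest") attempt.
import os
import urllib2


def downLoadSingleFile(fileURL, fileName, dirName, downloadLog, tryBest=False):
    try:
        response = urllib2.urlopen(fileURL, timeout=30)
        data = response.read()
        with open(os.path.join(dirName, fileName), 'wb') as f:
            f.write(data)
        return True
    except (urllib2.HTTPError, urllib2.URLError) as e:
        if tryBest:  # last attempt: record the failure in the download log
            with open(downloadLog, 'a') as log:
                log.write('%s download failed: %s\n' % (fileURL, e))
        return False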
Example No. 2
            fileResult = re.findall(FilePattern, data)
            for later in fileResult:
                fileURL = FileBaseURL + later

                lock.acquire()  # acquire the lock before modifying DOWNLOADED
                try:
                    if fileURL in DOWNLOADED:
                        continue
                    else:
                        DOWNLOADED.append(fileURL)
                finally:
                    lock.release()

                print self.name + ' is downloading ' + fileURL + '.......'
                downloadSingleFile.downLoadSingleFile(fileURL, DIR, DOWNLOADLOG)
            QUEUE.task_done()   # Queue.join() blocks until every item taken from QUEUE has had task_done() called for it


def downloadSingleCate(caterotyID, downloadDIR):
    """
    Build the download links for one dictionary category from its category ID,
    set parameters such as the download directory, and initialize the queue for
    this category; the worker threads pick up the changes by reading the
    modified global variables.

    :param caterotyID: ID of the lexicon category to download, used to build the correct url
    :param downloadDIR: directory in which the downloaded lexicons are stored
    :return: None
    """
    global CATEID, DIR, PageBaseURL, FileBaseURL, PagePattern, FilePattern, QUEUE
    CATEID = caterotyID
    DIR = downloadDIR   
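The docstring above describes the threading layout: downloadSingleCate sets module-level globals and fills the queue, worker threads consume it, and a lock protects the shared DOWNLOADED collection. A self-contained sketch of that producer/consumer pattern is shown below; the class and variable names are illustrative rather than the project's:

# -*- coding: utf-8 -*-
# Illustrative producer/consumer skeleton: the main thread enqueues page
# numbers, worker threads consume them, and a lock guards the shared
# DOWNLOADED set, mirroring the structure described in the docstring.
import threading
import Queue

PAGE_QUEUE = Queue.Queue()
DOWNLOADED = set()
LOCK = threading.Lock()


class Worker(threading.Thread):
    def run(self):
        while True:
            page = PAGE_QUEUE.get()
            try:
                url = 'page-%d' % page   # placeholder for the real page URL
                with LOCK:               # guard the shared DOWNLOADED set
                    if url in DOWNLOADED:
                        continue
                    DOWNLOADED.add(url)
                print '%s handling %s' % (self.name, url)
            finally:
                PAGE_QUEUE.task_done()   # always acknowledge, or join() never returns


if __name__ == '__main__':
    for _ in range(4):
        worker = Worker()
        worker.setDaemon(True)           # let the process exit once join() returns
        worker.start()
    for pageNum in range(1, 6):
        PAGE_QUEUE.put(pageNum)
    PAGE_QUEUE.join()                    # blocks until every page has been acknowledged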
Example No. 3
        # find and download the files
        # specify the download directory; it is created automatically if it does not
        # exist, and the str path has to be decoded as utf-8 first

        if not os.path.exists(dir.decode('utf8')):   # dir is a str; it must be decoded before creating the directory
            os.makedirs(dir.decode('utf8'))          # create the directory tree

        fileResult = re.findall(filePattern, data)
        for later in fileResult:
            fileURL = fileBaseUrl + later
            if fileURL in downloaded:
                continue
            else:
                downloaded.append(fileURL)
            print fileURL + ' downloading.......'
            downloadSingleFile.downLoadSingleFile(fileURL, dir, logFile)
            

    for visit in visited:
        print visit

if __name__ == '__main__':
    start = time.time()
    bigCateDict, smallCateDict = getCategory.getSogouDictCate()
    baseDir = 'G:/搜狗词库/单线程下载'
    logFile = baseDir+'/download.log'
    for i in bigCateDict:
        for j in smallCateDict[i]:
            downloadDir = baseDir + '/%s/%s/' % (bigCateDict[i], smallCateDict[i][j])
            downloadSingleCate(int(j), downloadDir, logFile)
    print 'process time:%s' % (time.time() - start)
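Several of these examples decode the byte-string download directory as utf-8 before handing it to os.path.exists and os.makedirs, because the Python 2 str paths contain Chinese characters. Pulled out into a helper, the idiom looks roughly like this (the helper name is made up for illustration):

# -*- coding: utf-8 -*-
# Sketch of the decode-before-makedirs idiom used above: on Python 2 a utf-8
# byte-string path with Chinese characters is safer handled as unicode before
# it reaches the os module, especially on Windows.
import os


def ensureDir(path):
    if isinstance(path, str):        # byte string -> unicode
        path = path.decode('utf8')
    if not os.path.exists(path):
        os.makedirs(path)            # create the whole directory tree
    return path


if __name__ == '__main__':
    ensureDir('G:/搜狗词库/单线程下载/')   # mirrors the baseDir layout above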
Example No. 4
            fileResult = re.findall(FilePattern, data)
            for later in fileResult:
                fileURL = FileBaseURL + later

                lock.acquire()  # acquire the lock before modifying DOWNLOADED
                try:
                    if fileURL in DOWNLOADED:
                        continue
                    else:
                        DOWNLOADED.append(fileURL)
                finally:
                    lock.release()

                print self.name + ' is downloading ' + fileURL + '.......'
                downloadSingleFile.downLoadSingleFile(fileURL, DIR,
                                                      DOWNLOADLOG)
            QUEUE.task_done()  # Queue.join() blocks until every item taken from QUEUE has had task_done() called for it


def downloadSingleCate(caterotyID, downloadDIR):
    """
    Build the download links for one dictionary category from its category ID,
    set parameters such as the download directory, and initialize the queue for
    this category; the worker threads pick up the changes by reading the
    modified global variables.

    :param caterotyID: ID of the lexicon category to download, used to build the correct url
    :param downloadDIR: directory in which the downloaded lexicons are stored
    :return: None
    """
    global CATEID, DIR, PageBaseURL, FileBaseURL, PagePattern, FilePattern, QUEUE
    CATEID = caterotyID
Example No. 5
                fileURL = self.fileBaseURL + fileID
                THREAD_LOCK.acquire()  # acquire the lock before modifying DOWNLOADED
                try:
                    if fileURL in DOWNLOADED:
                        continue
                    else:
                        DOWNLOADED.add(fileURL)
                finally:
                    THREAD_LOCK.release()
                print self.name + ' is downloading ' + fileName + '.......'

                # guard against 500/502 errors: try at most three times
                maxTry = 3
                for m in xrange(maxTry):
                    tryBest = False if m < maxTry - 1 else True
                    if downloadSingleFile.downLoadSingleFile(fileURL, fileName, DOWNLOAD_DIR, DOWNLOAD_LOG, tryBest):
                        break
                    print '==========retrying to download file %s of url %s' % (fileName, fileURL)

            PAGE_QUEUE.task_done()   # PAGE_QUEUE.join() blocks until every item taken from PAGE_QUEUE has had task_done() called for it


def getCategoryPages(caterotyID, downloadDIR):
    """通过类别的初始页面得到该类别的总页数,并将所有的页数放到 PAGE_QUEUE 中供所有线程下载

    :param caterotyID: 下载的词库类型的 ID,用于找到正确 url
    :param downloadDIR: 下载词库的存放目录
    :return:
    """
    global CATEID, DOWNLOAD_DIR, PAGE_BASE_URL, THREAD_LOCK
    CATEID = caterotyID
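Only the signature and docstring of getCategoryPages appear in this excerpt. Going by the docstring, its body fetches the category's first page, parses the total page count, and puts every page number onto PAGE_QUEUE. A hedged sketch under those assumptions follows; explicit parameters stand in for the module globals, and the URL template and page-count regex are guesses rather than the project's actual values:

# -*- coding: utf-8 -*-
# Hypothetical body for getCategoryPages, following its docstring: fetch the
# first page of the category, parse the total page count, and enqueue every
# page number for the worker threads.  URL template and regex are assumptions.
import re
import urllib2


def getCategoryPages(caterotyID, pageBaseURL, pageQueue):
    firstPageURL = pageBaseURL % (caterotyID, 1)    # assumed ".../cate/%d/page/%d"-style template
    data = urllib2.urlopen(firstPageURL, timeout=30).read()
    match = re.search('共(\d+)页', data)             # assumed "total N pages" marker in the HTML
    totalPages = int(match.group(1)) if match else 1
    for pageNum in xrange(1, totalPages + 1):
        pageQueue.put(pageNum)
    return totalPages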
Example No. 6
            os.makedirs(dirName.decode('utf8'))          # create the directory tree

        fileResult = re.findall(filePattern, data)
        for fileName, fileID in fileResult:
            fileURL = fileBaseUrl + fileID
            if fileURL in downloaded:
                continue
            else:
                downloaded.add(fileURL)
                print fileName + ' downloading...................................................'
                # exceptions are handled inside downloadSingleFile;
                # if a download fails it is retried, at most three times
                maxTry = 3
                for i in xrange(maxTry):
                    tryBest = False if i < maxTry - 1 else True
                    if downloadSingleFile.downLoadSingleFile(fileURL, fileName, dirName, downloadLog, tryBest):
                        break
                    print '==========retrying to download file %s of url %s' % (fileName, fileURL)
                # throttle the crawler: sleep for a while after each downloaded file
                #time.sleep(random.randint(1,10)) 

        # throttle the crawler: sleep for a while after finishing a page of files
        #time.sleep(random.randint(10,20)) 
        

    # print the pages visited while downloading this category
    """
    for visit in visited:
        print visit
    """
    return True
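The commented-out time.sleep calls in these loops are the crawl-rate throttle. If they were switched on, a tiny helper along these lines (the name is illustrative) would express the same idea:

# -*- coding: utf-8 -*-
# Illustrative throttle helper matching the commented-out sleep calls above:
# pause for a random whole number of seconds to slow the crawler down.
import random
import time


def throttle(low, high):
    time.sleep(random.randint(low, high))

# throttle(1, 10) after each file, throttle(10, 20) after each page of files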
Example No. 7
                THREAD_LOCK.acquire()  # acquire the lock before modifying DOWNLOADED
                try:
                    if fileURL in DOWNLOADED:
                        continue
                    else:
                        DOWNLOADED.add(fileURL)
                finally:
                    THREAD_LOCK.release()
                print self.name + ' is downloading ' + fileName + '.......'

                # guard against 500/502 errors: try at most three times
                maxTry = 3
                for m in xrange(maxTry):
                    tryBest = False if m < maxTry - 1 else True
                    if downloadSingleFile.downLoadSingleFile(
                            fileURL, fileName, DOWNLOAD_DIR, DOWNLOAD_LOG,
                            tryBest):
                        break
                    print '==========retrying to download file %s of url %s' % (
                        fileName, fileURL)

            PAGE_QUEUE.task_done()  # PAGE_QUEUE.join() blocks until every item taken from PAGE_QUEUE has had task_done() called for it


def getCategoryPages(caterotyID, downloadDIR):
    """通过类别的初始页面得到该类别的总页数,并将所有的页数放到 PAGE_QUEUE 中供所有线程下载

    :param caterotyID: ID of the lexicon category to download, used to build the correct url
    :param downloadDIR: directory in which the downloaded lexicons are stored
    :return:
Example No. 8
        # find and download the files
        # specify the download directory; it is created automatically if it does not
        # exist, and the str path has to be decoded as utf-8 first
        if not os.path.exists(dir.decode('utf8')):   # dir is a str; it must be decoded first
            os.makedirs(dir.decode('utf8'))          # create the directory tree

        fileResult = re.findall(filePattern, data)
        for fileID, fileName in fileResult:
            fileURL = fileBaseUrl + fileID
            if fileURL in downloaded:
                continue
            else:
                downloaded.add(fileURL)
                print fileName + ' downloading.......'
                try:
                    downloadSingleFile.downLoadSingleFile(fileURL, fileName, dir, downloadLog)
                except Exception:
                    with open(downloadLog, 'a') as f:
                        f.write(fileURL + ' is not DOWNLOADED successfully\n')


    # print the pages visited while downloading this category
    for visit in visited:
        print visit

if __name__ == '__main__':
    start = time.time()
    '''
    # test data
    downloadDir = 'G:/各大输入法词库/百度/'
    downloadLog = 'G:/各大输入法词库/百度/download.log'