def downloadArchivesList(aList, container, extension='.txt.gz', numThreads=5):
    '''Queue the archives of every mailing list named in a CSV file.

    <aList> is the path of a CSV file; the first column of each row is
    the start URL of one mailing list. Each start URL is crawled and the
    collected links whose names end with <extension> are queued for
    download, with the folder <container>/<list name> as their store.
    The list name is the second-to-last '/'-separated component of the
    start URL (i.e. the last path segment when the URL ends with '/').
    <numThreads> is passed to the downloader set-up and tear-down helpers.
    '''
    import csv

    # Set up downloader
    queue = initDownloader(numThreads)
    try:
        # 'rb' is the mode the Python 2 csv module expects; the with-block
        # closes the file even when crawling fails part-way through.
        with open(aList, 'rb') as f:
            for row in csv.reader(f):
                if not row:
                    # csv.reader yields [] for blank lines; nothing to crawl
                    continue
                startURL = row[0]
                mlName = startURL.split('/')[-2]

                spider = Spider(startURL)
                spider.process_page(startURL)

                # Only the links to archive files are interesting:
                # mailing list archive file names end with <extension>
                urlList = [x for x in sorted(spider.URLs)
                           if x.endswith(extension)]
                if not urlList:
                    continue

                print('%s: %d archives' % (mlName, len(urlList)))

                # Create the folder for this mailing list without going
                # through a shell, so names with spaces or shell
                # metacharacters are handled literally.
                store = os.path.join(container, mlName)
                if not os.path.isdir(store):
                    os.makedirs(store)

                # Download each archive
                addToQ(queue, urlList, store)
    finally:
        # Always stop the worker threads, including when an error
        # interrupts the crawl, so they are not left running.
        stopDownloader(queue, numThreads)
def downloadArchives(startURL, container, lookInsideSubfolders=False, extension='.txt.gz', numThreads=5):
    '''Crawl <startURL> and find all mailing list archives (given the
    filename <extension>).

    Store the files in the folder with the path <container>.
    If <lookInsideSubfolders>, then go one level deeper (crawl all
    first-order links as well); archives found behind each link are
    stored in <container>/<list name>, where the list name is the
    second-to-last '/'-separated component of that link.
    <numThreads> is passed to the downloader set-up and tear-down helpers.
    '''
    # Set up downloader
    queue = initDownloader(numThreads)
    try:
        print('Downloading archives from %s' % startURL)

        # The start page is crawled in both modes
        spider = Spider(startURL)
        spider.process_page(startURL)

        if not lookInsideSubfolders:
            # Only the links to archive files are interesting:
            # mailing list archive file names end with <extension>
            urlList = [x for x in sorted(spider.URLs)
                       if x.endswith(extension)]
            print('%d archives' % len(urlList))
            addToQ(queue, urlList, container)
        else:
            for link in sorted(spider.URLs):
                subspider = Spider(link)
                subspider.process_page(link)

                urlList = [x for x in sorted(subspider.URLs)
                           if x.endswith(extension)]
                if not urlList:
                    continue

                # Derive the name only for links that actually carry
                # archives, so unrelated links cannot fail here.
                mlName = link.split('/')[-2]
                print('%s: %d archives' % (mlName, len(urlList)))

                # Create a folder for the mailing list without going
                # through a shell, so names with spaces or shell
                # metacharacters are handled literally.
                store = os.path.join(container, mlName)
                if not os.path.isdir(store):
                    os.makedirs(store)

                addToQ(queue, urlList, store)
    finally:
        # Always stop the worker threads, including when an error
        # interrupts the crawl, so they are not left running.
        stopDownloader(queue, numThreads)