def canVisit(self, url):
    topDomain = utils.getTopDomain(url)

    retrRules = self.__rulesDict__.get(topDomain, None)
    if retrRules is None:  # Cache miss
        robotsUrl = utils.robotsTxt(url)
        roboFileBuf = utils.dlAndDecode(robotsUrl)
        if not self.parseRobotFile(topDomain, roboFileBuf):
            return False

    retr = self.__rulesDict__[topDomain]

    sp = tuple(filter(lambda a: a, url.split(topDomain)))
    if sp:
        firstCh = firstLetterCompile.search(sp[0])
        if firstCh:  # Time to probe
            fCh = firstCh.groups(1)[0]
            retr = self.__rulesDict__[topDomain]['disallow']
            compList = retr.get(fCh, None)
            if compList:
                for comp in compList:
                    if comp.search(sp[0]):
                        return False

            return True

    return True
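# For comparison only: a minimal, standalone sketch of the same robots.txt
# gate built on the standard library's urllib.robotparser. This is NOT how
# canVisit works internally (canVisit caches parsed rules per top domain in
# __rulesDict__); it only illustrates the same "may I fetch this URL?"
# decision. The function name and the default user agent are assumptions.
def stdlibCanVisitSketch(url, userAgent='*'):
    from urllib import robotparser
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    robotsUrl = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))

    rp = robotparser.RobotFileParser()
    rp.set_url(robotsUrl)
    rp.read()  # Fetches and parses /robots.txt over the network
    return rp.can_fetch(userAgent, url)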
def extractFileUrls(url, extCompile, router, depth=5, httpDomain=utils.HTTPS_DOMAIN):
    # Args:
    #   url        => The URL whose page will be scraped for file links
    #   extCompile => A compiled regex of the extension(s) to match
    #   depth      => An integer that indicates how deep to scrape
    # Note: A negative recursion depth indicates that you want to keep
    #   crawling as far as the program can go
    if not depth:
        return
    elif not restDriver.isCallableAttr(extCompile, 'search'):
        utils.streamPrintFlush(
            "Expecting a regex compiled object/result as arg 'extCompile'\n", sys.stderr
        )
        return

    if not utils.httpHeadCompile.search(url):
        url = "%s%s" % (httpDomain, url)

    if not robotParser.canVisit(url):
        print('Cannot visit %s due to /robots.txt rules' % (url))
        return

    decodedData = utils.dlAndDecode(url)
    if not decodedData:
        return

    urls = utils.urlCompile.findall(decodedData)
    urls = [utils.repeatHttpHeadCompile.sub(utils.HTTP_HEAD_REGEX, s) for s in urls]

    plainUrls = []
    matchedFileUrls = []

    for u in urls:
        pathSelector = plainUrls
        regSearch = extCompile.search(u)
        if regSearch:
            rGroup = regSearch.groups(1)
            u = '%s.%s' % (rGroup[0], rGroup[1])
            pathSelector = matchedFileUrls

        pathSelector.append(u)

    uniqFileUrls = set(matchedFileUrls)
    dlResults = [pushUpJob(eachUrl, router, url) for eachUrl in uniqFileUrls]
    resultsList = [val for val in dlResults if val]

    depth -= 1
    for eachUrl in plainUrls:
        extractFileUrls(eachUrl, extCompile, router, depth)
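# Hedged usage sketch (not from the original source). extractFileUrls pushes
# each matched file URL to 'router' via pushUpJob, so the router argument
# below stands in for whatever routing/queueing object the surrounding
# project supplies. The URL and the two-group extension pattern are
# illustrative assumptions only; the two groups are required because a match
# is rejoined as '%s.%s' % (rGroup[0], rGroup[1]) above.
def extractPdfUrlsSketch(router):
    import re

    pdfCompile = re.compile(r'([^\s<>]+)\.(pdf)', re.IGNORECASE)
    extractFileUrls('https://example.org/papers', pdfCompile, router, depth=2)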
def getFiles(url, extCompile, recursionDepth=5, httpDomain=utils.HTTPS_DOMAIN, baseDir=None):
    # Args:
    #   url            => The URL whose page will be scraped for file links
    #   extCompile     => A pattern object of the extension(s) to match
    #   recursionDepth => An integer that indicates how deep to scrape
    # Note: A negative recursion depth indicates that you want to keep
    #   crawling as far as the program can go
    if not recursionDepth:
        return
    elif not hasattr(extCompile, 'search'):
        utils.streamPrintFlush(
            "Expecting a pattern object/result of re.compile(..) for arg 'extCompile'\n",
            sys.stderr
        )
        return

    if not utils.httpHeadCompile.search(url):
        url = "%s%s" % (httpDomain, url)

    decodedData = utils.dlAndDecode(url)
    if not decodedData:
        return

    urls = utils.urlCompile.findall(decodedData)
    urls = [utils.repeatHttpHeadCompile.sub(utils.HTTP_HEAD_REGEX, s) for s in urls]

    if not urls:
        capableUrls = utils.urlCapableCompile.findall(decodedData)
        trimmedHeadUrl = url.strip('/')

        for capableUrl in capableUrls:
            trimmed = capableUrl.strip('/')
            fixedUrl = '%s/%s' % (trimmedHeadUrl, trimmed)
            urls.append(fixedUrl)

    plainUrls = []
    matchedFileUrls = []

    for u in urls:
        pathSelector = plainUrls
        regSearch = extCompile.search(u)
        if regSearch:
            g = regSearch.groups(1)
            u = '%s.%s' % (g[0], g[1])
            pathSelector = matchedFileUrls

        pathSelector.append(u)

    if not baseDir:
        baseDir = os.path.abspath(".")

    fullUrlToMemPath = os.path.join(baseDir, utils.pathCleanseCompile.sub('_', url))
    utils.createDir(fullUrlToMemPath)

    # Time to download all the matched files
    dlResults = []
    for eachUrl in matchedFileUrls:
        dlResults.append(dlData(eachUrl, fullUrlToMemPath))

    resultsList = list(filter(lambda val: val, dlResults))

    # Report to user successful saves
    downloadCount = len(resultsList)
    if not downloadCount:
        # Mark this url as a bad one/miss and, for the sake of crawling
        # not hitting dead ends, we won't crawl it anymore unless otherwise specified
        urlHash = getHash(url)
        urlScoreTuple = missesDict.get(urlHash, None)
        badCrawlCount = 0

        # Entries are stored as (url, badCrawlCount, timestamp)
        if urlScoreTuple and len(urlScoreTuple) != 2:
            badCrawlCount = urlScoreTuple[1] + 1  # Increment the bad crawl score

        missesDict[urlHash] = (url, badCrawlCount, time.time())
        return  # Cut this journey short
    else:
        utils.streamPrintFlush(
            "For url %s downloaded %d files\n" % (url, downloadCount), sys.stderr
        )

    recursionDepth -= 1
    for eachUrl in plainUrls:
        getFiles(eachUrl, extCompile, recursionDepth, baseDir=fullUrlToMemPath)
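# Hedged usage sketch (not part of the original module). getFiles expects a
# compiled pattern whose first two groups capture a file's base name and
# extension, since a match is rejoined as '%s.%s' % (g[0], g[1]) above.
# The URL, extension list and recursion depth below are illustrative
# assumptions, not the project's canonical values. A URL without an http
# head is accepted because getFiles prepends httpDomain itself.
def getFilesUsageSketch():
    import re

    imgAndPdfCompile = re.compile(r'([^\s<>]+)\.(jpe?g|png|pdf)', re.IGNORECASE)
    getFiles('example.org/downloads', imgAndPdfCompile, recursionDepth=2)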