def extractFileUrls(url, extCompile, router, depth=5, httpDomain=utils.HTTPS_DOMAIN):
    # Args: url, extCompile => A pattern object of the extension(s) to match
    #       depth => An integer that indicates how deep to scrape
    # Note: A negative recursion depth indicates that you want
    #       to keep crawling as far as the program can go
    if not depth:
        return
    elif not restDriver.isCallableAttr(extCompile, 'search'):
        utils.streamPrintFlush(
            "Expecting a regex compiled object/result as arg 'extCompile'\n", sys.stderr)
        return

    if not utils.httpHeadCompile.search(url):
        url = "%s%s" % (httpDomain, url)

    if not robotParser.canVisit(url):
        print('Cannot visit %s due to /robots.txt rules' % (url))
        return

    decodedData = utils.dlAndDecode(url)
    if not decodedData:
        return
    else:
        urls = utils.urlCompile.findall(decodedData)
        urls = [utils.repeatHttpHeadCompile.sub(utils.HTTP_HEAD_REGEX, s) for s in urls]

        plainUrls = []
        matchedFileUrls = []

        for u in urls:
            pathSelector = plainUrls
            regSearch = extCompile.search(u)
            if regSearch:
                rGroup = regSearch.groups(1)
                u = '%s.%s' % (rGroup[0], rGroup[1])
                pathSelector = matchedFileUrls

            pathSelector.append(u)

        uniqFileUrls = set(matchedFileUrls)
        dlResults = [pushUpJob(eachUrl, router, url) for eachUrl in uniqFileUrls]
        resultsList = [val for val in dlResults if val]

        depth -= 1
        for eachUrl in plainUrls:
            extractFileUrls(eachUrl, extCompile, router, depth)
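
# A minimal, self-contained sketch (not used by the crawler) of the URL classification
# step above: URLs matching the compiled extension pattern are rebuilt as name.ext from
# the captured groups and collected as file URLs; everything else stays crawlable.
# The pattern and sample URLs below are hypothetical, purely for illustration.
def _sketchClassifyUrls():
    import re
    sampleExtCompile = re.compile(r'(.*)\.(png|pdf)', re.IGNORECASE)
    sampleUrls = [
        'http://example.org/docs/report.pdf',
        'http://example.org/gallery/cat.png?cache=1',
        'http://example.org/about',
    ]

    plain, matched = [], []
    for u in sampleUrls:
        regSearch = sampleExtCompile.search(u)
        if regSearch:
            g = regSearch.groups(1)
            matched.append('%s.%s' % (g[0], g[1]))  # query strings etc. are trimmed off
        else:
            plain.append(u)

    # matched -> ['http://example.org/docs/report.pdf', 'http://example.org/gallery/cat.png']
    # plain   -> ['http://example.org/about']
    return plain, matched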
def getFiles(url, extCompile, recursionDepth=5, httpDomain=HTTPS_DOMAIN, baseDir=None):
    # Args: url, extCompile => A pattern object of the extension(s) to match
    #       recursionDepth => An integer that indicates how deep to scrape
    # Note: A negative recursion depth indicates that you want
    #       to keep crawling as far as the program can go
    if not recursionDepth:
        return
    if not hasattr(extCompile, 'search'):
        streamPrintFlush(
            "Expecting a pattern object/result of re.compile(..) for arg 'extCompile'\n",
            sys.stderr)
        return

    if not re.search(HTTP_HEAD_REGEX, url):
        url = "%s%s" % (httpDomain, url)

    print("URL ", url)
    try:
        data = urlGetter.urlopen(url)  # , timeout=DEFAULT_TIMEOUT)
        if pyVersion >= 3:
            decodedData = data.read().decode()
        else:
            decodedData = data.read()
    except Exception:
        pass
    else:
        urls = re.findall(URL_REGEX, decodedData, re.MULTILINE)
        urls = list(map(lambda s: re.sub(REPEAT_HTTP, HTTP_HEAD_REGEX, s), urls))

        # Materialize both groups as lists: on Python 3, filter(..) returns a
        # single-use iterator, so the membership test and the download pass below
        # would otherwise exhaust matchedFileUrls prematurely
        matchedFileUrls = list(filter(lambda s: extCompile.search(s), urls))
        plainUrls = [s for s in urls if s not in matchedFileUrls]
        # print(matchedFileUrls)

        # First create that directory
        if not baseDir:
            baseDir = os.path.abspath(".")

        cleanedPath = re.sub('[/:]+', '_', url)
        fullUrlToMemPath = os.path.join(baseDir, cleanedPath)
        # print("FULLURL to Mem ", fullUrlToMemPath)
        createDir(fullUrlToMemPath)

        # Time to download all the matched files
        dlResults = map(lambda eachUrl: dlData(eachUrl, fullUrlToMemPath), matchedFileUrls)
        resultsList = list(filter(lambda val: val, dlResults))

        # Report successful saves to the user
        downloadCount = len(resultsList)
        # print(downloadCount)
        if not downloadCount:
            # Mark this url as a bad one/miss so that, for the sake of crawling, we
            # don't hit dead ends; it won't be crawled again unless otherwise specified
            urlHash = getHash(url)
            urlScoreTuple = missesDict.get(urlHash, None)
            badCrawlCount = 0

            if urlScoreTuple and len(urlScoreTuple) != 2:
                badCrawlCount = urlScoreTuple[1] + 1  # Increment the bad crawl score

            missesDict[urlHash] = (url, badCrawlCount, time.time())
            return  # Cut this journey short
        else:
            streamPrintFlush(
                "For url %s downloaded %d files\n" % (url, downloadCount), sys.stderr)

        recursionDepth -= 1
        for eachUrl in plainUrls:
            getFiles(eachUrl, extCompile, recursionDepth, baseDir=fullUrlToMemPath)
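
# A small, self-contained illustration (not used by the crawler) of how the storage
# directory name for a crawled page is derived above: runs of '/' and ':' in the URL
# are collapsed to single underscores before joining onto baseDir. The URL below is a
# hypothetical example.
def _sketchUrlToDirName():
    import os
    import re
    url = 'http://example.org/photos/2014'
    cleanedPath = re.sub('[/:]+', '_', url)   # -> 'http_example.org_photos_2014'
    return os.path.join(os.path.abspath('.'), cleanedPath)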
def createDir(dirPath):
    # print("CreateDir:: ", dirPath)
    if dirPath and not os.path.exists(dirPath):
        os.mkdir(dirPath)
        if DEBUG:
            streamPrintFlush("Done creating %s\n" % (dirPath), sys.stderr)
def main():
    while True:
        try:
            streamPrintFlush(
                "\nTarget Url: eg [www.example.org or http://www.h.com] ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            baseUrl = lineIn.strip("\n")

            streamPrintFlush(
                "Your extensions separated by '|' eg png|html: ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            extensions = lineIn.strip("\n")

            streamPrintFlush(
                "\nRecursion Depth (a negative depth indicates you want the script to go as far as it can): ",
                sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            rDepth = int(lineIn.strip("\n"))

            formedRegex = r"\.(%s)" % (extensions) if extensions else DEFAULT_EXTENSIONS_REGEX
            extCompile = regexCompile(formedRegex)
        except ValueError:
            streamPrintFlush("Recursion depth must be an integer\n", sys.stderr)
        except KeyboardInterrupt:
            streamPrintFlush("Ctrl-C applied. Exiting now..\n", sys.stderr)
            break
        except Exception:
            continue
        else:
            if not baseUrl:
                continue

            if extCompile:
                getFiles(baseUrl, extCompile, rDepth)

    streamPrintFlush("Bye..\n", sys.stderr)
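
# A self-contained sketch of how the extension regex above is built from user input:
# e.g. "png|html" becomes r"\.(png|html)", which matches the dot-prefixed extension
# inside a URL. The inputs here are hypothetical examples.
def _sketchFormedRegex():
    import re
    extensions = 'png|html'
    formedRegex = r"\.(%s)" % (extensions)      # -> r"\.(png|html)"
    extCompile = re.compile(formedRegex)
    assert extCompile.search('http://example.org/index.html')
    assert extCompile.search('logo.png')
    assert not extCompile.search('http://example.org/about')
    return extCompile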
def dlData(url, dirStoragePath=None):
    # Args: A url
    # Download the data from the url and write it to memory
    # Returns: True iff the data was successfully written, else: False
    if not (url and re.search(HTTP_HEAD_REGEX, url)):
        return None

    # Let's check the cache first: compute the url's hash
    urlStrHash = getHash(url)
    if not urlStrHash:
        streamPrintFlush("Cannot hash the provided URL")
        return

    isMiss = missesDict.get(urlStrHash, None)
    if isMiss:
        if DEBUG:
            streamPrintFlush("Uncrawlable link: %s" % (url))
        return None

    alreadyIn = hitsDict.get(urlStrHash, None)
    if alreadyIn:
        if DEBUG:
            streamPrintFlush("\033[32mAlready downloaded %s\033[00m\n" % (url))
        return None

    try:
        data = urlGetter.urlopen(url)
    except Exception:
        return False
    else:
        fileSearch = re.findall(END_NAME, url)
        if not fileSearch:
            return False

        fileName = fileSearch[0]
        fnameExtensionSeparate = re.findall(r"(.*)\.(\w+)$", fileName, re.UNICODE)
        if not fnameExtensionSeparate:
            return False  # Possibly raise an error here

        proposedName, extension = fnameExtensionSeparate[0]
        # availableName = fileNameTrie.getSuggestion(proposedName)
        # if not availableName:
        #     print("Sorry no alternate suggestions for %s could be proposed" % (fileName))
        #     return False

        fileName = "%s.%s" % (proposedName, extension)
        # fileNameTrie.addSeq(availableName, 0, len(availableName))  # Mark this entry as taken

        if dirStoragePath and os.path.exists(dirStoragePath):
            fileName = os.path.join(dirStoragePath, fileName)

        streamPrintFlush("From url %s\n" % (url), sys.stderr)

        try:
            f = open(fileName, 'wb')
            f.write(data.read())
            f.close()
        except:
            streamPrintFlush("Failed to write %s to memory\n" % (fileName), sys.stderr)
            return False
        else:
            streamPrintFlush("Wrote %s to memory\n" % (fileName), sys.stderr)

            # Let's now cache that url and mark its content as already visited,
            # where the url string hash is the key and downloaded urls are the values
            markedContent = hitsDict.get(urlStrHash, [])
            markedContent.append(url)
            hitsDict[urlStrHash] = markedContent
            return True
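
# A self-contained illustration of the filename handling above: the trailing path
# component is split into a proposed name and extension with r"(.*)\.(\w+)$" before
# being re-joined and written out. The file name here is a hypothetical example.
def _sketchSplitFileName():
    import re
    fileName = 'screenshot-2014.png'
    proposedName, extension = re.findall(r"(.*)\.(\w+)$", fileName, re.UNICODE)[0]
    # proposedName -> 'screenshot-2014', extension -> 'png'
    return "%s.%s" % (proposedName, extension)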
def main():
    args, options = restDriver.cliParser()

    # Route manager
    router = Router([
        'http://192.168.1.117:8000', 'http://192.168.1.110:8008', 'http://127.0.0.1:8009'
    ])

    while True:
        try:
            utils.streamPrintFlush(
                "\nTarget Url: eg [www.example.org or http://www.h.com] ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break

            if lineIn:
                baseUrl = lineIn.strip("\n")
            else:
                continue

            utils.streamPrintFlush(
                "Your extensions separated by '|' eg png|html: ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            extensions = lineIn.strip("\n")

            utils.streamPrintFlush(
                "\nRecursion Depth (a negative depth indicates you want the script to go as far as it can): ",
                sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            elif lineIn:
                rDepth = int(lineIn.strip("\n") or 1)
            else:
                rDepth = 1

            formedRegex = utils.extensionify(extensions or utils.DEFAULT_EXTENSIONS_REGEX)
            extCompile = utils.regexCompile(formedRegex)
        except ValueError:
            utils.streamPrintFlush("Recursion depth must be an integer\n", sys.stderr)
        except KeyboardInterrupt:
            utils.streamPrintFlush("Ctrl-C applied. Exiting now..\n", sys.stderr)
            break
        except Exception:
            # TODO: [Informative exceptions]:
            #   + Find a way to report the traceback via sys, since
            #     'except Exception as e' is invalid syntax for Python <= 2.5
            print('Generic exception encountered')
            continue
        else:
            if not baseUrl:
                continue

            if extCompile:
                extractFileUrls(baseUrl, extCompile, router, rDepth)

    utils.streamPrintFlush("Bye..\n", sys.stderr)
def main():
    while True:
        try:
            utils.streamPrintFlush(
                "\nTarget Url: eg [www.example.org or http://www.h.com] ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            baseUrl = lineIn.strip("\n")

            utils.streamPrintFlush(
                "Your extensions separated by '|' eg png|html: ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            extensions = lineIn.strip("\n")

            utils.streamPrintFlush(
                "\nRecursion Depth (a negative depth indicates you want the script to go as far as it can): ",
                sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            rDepth = int(lineIn.strip("\n"))

            formedRegex = utils.extensionify(extensions or utils.DEFAULT_EXTENSIONS_REGEX)
            extCompile = utils.regexCompile(formedRegex)
        except ValueError:
            utils.streamPrintFlush("Recursion depth must be an integer\n", sys.stderr)
        except KeyboardInterrupt:
            utils.streamPrintFlush("Ctrl-C applied. Exiting now..\n", sys.stderr)
            break
        except Exception:
            continue
        else:
            if not baseUrl:
                continue

            if extCompile:
                getFiles(baseUrl, extCompile, rDepth)

    utils.streamPrintFlush("Bye..\n", sys.stderr)
def getFiles(url, extCompile, recursionDepth=5, httpDomain=utils.HTTPS_DOMAIN, baseDir=None):
    # Args: url, extCompile => A pattern object of the extension(s) to match
    #       recursionDepth => An integer that indicates how deep to scrape
    # Note: A negative recursion depth indicates that you want
    #       to keep crawling as far as the program can go
    if not recursionDepth:
        return
    elif not hasattr(extCompile, 'search'):
        utils.streamPrintFlush(
            "Expecting a pattern object/result of re.compile(..) for arg 'extCompile'\n",
            sys.stderr)
        return

    if not utils.httpHeadCompile.search(url):
        url = "%s%s" % (httpDomain, url)

    decodedData = utils.dlAndDecode(url)
    if not decodedData:
        return
    else:
        urls = utils.urlCompile.findall(decodedData)
        urls = list(
            map(lambda s: utils.repeatHttpHeadCompile.sub(utils.HTTP_HEAD_REGEX, s), urls))

        if not urls:
            capableUrls = utils.urlCapableCompile.findall(decodedData)
            trimmedHeadUrl = url.strip('/')
            for capableUrl in capableUrls:
                trimmed = capableUrl.strip('/')
                fixedUrl = '%s/%s' % (trimmedHeadUrl, trimmed)
                urls.append(fixedUrl)

        plainUrls = []
        matchedFileUrls = []

        for u in urls:
            pathSelector = plainUrls
            regSearch = extCompile.search(u)
            if regSearch:
                g = regSearch.groups(1)
                u = '%s.%s' % (g[0], g[1])
                pathSelector = matchedFileUrls

            pathSelector.append(u)

        if not baseDir:
            baseDir = os.path.abspath(".")

        fullUrlToMemPath = os.path.join(baseDir, utils.pathCleanseCompile.sub('_', url))
        utils.createDir(fullUrlToMemPath)

        # Time to download all the matched files
        dlResults = []
        for eachUrl in matchedFileUrls:
            dlResults.append(dlData(eachUrl, fullUrlToMemPath))

        resultsList = list(filter(lambda val: val, dlResults))

        # Report successful saves to the user
        downloadCount = len(resultsList)
        # print(downloadCount)
        if not downloadCount:
            # Mark this url as a bad one/miss so that, for the sake of crawling, we
            # don't hit dead ends; it won't be crawled again unless otherwise specified
            urlHash = getHash(url)
            urlScoreTuple = missesDict.get(urlHash, None)
            badCrawlCount = 0

            if urlScoreTuple and len(urlScoreTuple) != 2:
                badCrawlCount = urlScoreTuple[1] + 1  # Increment the bad crawl score

            missesDict[urlHash] = (url, badCrawlCount, time.time())
            return  # Cut this journey short
        else:
            utils.streamPrintFlush(
                "For url %s downloaded %d files\n" % (url, downloadCount), sys.stderr)

        recursionDepth -= 1
        for eachUrl in plainUrls:
            getFiles(eachUrl, extCompile, recursionDepth, baseDir=fullUrlToMemPath)
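
# A self-contained sketch of the relative-link fix-up above: when no absolute URLs are
# found, path fragments are stripped of surrounding slashes and glued onto the page's
# URL. The page and fragment below are hypothetical.
def _sketchFixRelativeUrl():
    pageUrl = 'http://example.org/photos/'
    capableUrl = '/thumbs/cat.jpg'
    fixedUrl = '%s/%s' % (pageUrl.strip('/'), capableUrl.strip('/'))
    # -> 'http://example.org/photos/thumbs/cat.jpg'
    return fixedUrl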
def dlData(url, dirStoragePath=None):
    # Args: A url
    # Download the data from the url and write it to memory
    # Returns: True iff the data was successfully written, else: False
    if not (url and utils.httpHeadCompile.search(url)):
        return None

    urlStrHash = getHash(url)
    if not urlStrHash:
        utils.streamPrintFlush("Cannot hash the provided URL")
        return

    isMiss = missesDict.get(urlStrHash, None)
    if isMiss:
        if DEBUG:
            utils.streamPrintFlush("Uncrawlable link: %s" % (url))
        return None

    alreadyIn = hitsDict.get(urlStrHash, None)
    if alreadyIn:
        if DEBUG:
            utils.streamPrintFlush("\033[32mAlready downloaded %s\033[00m\n" % (url))
        return None

    try:
        data = utils.urlGetter.urlopen(url)
    except Exception:
        return False
    else:
        fileSearch = utils.endNameCompile.findall(url)
        if not fileSearch:
            return False

        fileName = fileSearch[0]
        fnameExtensionSeparate = utils.fnameCompile.findall(fileName)
        if not fnameExtensionSeparate:
            return False  # Possibly raise an error here

        proposedName, extension = fnameExtensionSeparate[0]
        # availableName = fileNameTrie.getSuggestion(proposedName)
        # if not availableName:
        #     print("Sorry no alternate suggestions for %s could be proposed" % (fileName))
        #     return False

        fileName = "%s.%s" % (proposedName, extension)
        # fileNameTrie.addSeq(availableName, 0, len(availableName))  # Mark this entry as taken

        if dirStoragePath and os.path.exists(dirStoragePath):
            fileName = os.path.join(dirStoragePath, fileName)

        utils.streamPrintFlush("From url %s\n" % (url), sys.stderr)

        try:
            f = open(fileName, 'wb')
            f.write(data.read())
            f.close()
        except:
            utils.streamPrintFlush("Failed to write %s to memory\n" % (fileName), sys.stderr)
            return False
        else:
            utils.streamPrintFlush("Wrote %s to memory\n" % (fileName), sys.stderr)

            # Let's now cache that url and mark its content as already visited,
            # where the url string hash is the key and downloaded urls are the values
            markedContent = hitsDict.get(urlStrHash, [])
            markedContent.append(url)
            hitsDict[urlStrHash] = markedContent
            return True
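
# A minimal, self-contained sketch of the hash-keyed bookkeeping above. getHash is part
# of this project; an md5 hex digest stands in for it here purely as an assumption, to
# show how repeated URLs are detected via hitsDict-style lookups.
def _sketchHitBookkeeping():
    import hashlib

    def _hashUrl(url):
        return hashlib.md5(url.encode('utf-8')).hexdigest()

    hits = {}
    url = 'http://example.org/logo.png'

    urlStrHash = _hashUrl(url)
    if hits.get(urlStrHash, None):
        return False  # already downloaded; dlData would bail out here

    markedContent = hits.get(urlStrHash, [])
    markedContent.append(url)
    hits[urlStrHash] = markedContent
    return True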