def crawlingScan(self, url, apiCalls=[], allFoundURLs=[]):
    self.count = self.count - 1
    if self.count < 0:
        return
    harParser = HarParser(self.harDirectory, searchString=self.searchString,
                          removeParams=self.removeParams)
    #If uncommented, will return as soon as a matching call is found
    #if self.searchString is not None and len(apiCalls) > 0:
    #    return apiCalls
    try:
        print("Scanning URL: " + url)
        html = self.openURL(url)
        if html is not None:
            bsObj = BeautifulSoup(html, "lxml")
            harObj = harParser.getSingleHarFile()
            apiCalls = harParser.scanHarfile(harObj, apiCalls=apiCalls)
            allFoundURLs, newUrls = self.findInternalURLs(bsObj, url, allFoundURLs)
            shuffle(newUrls)
            for newUrl in newUrls:
                self.crawlingScan(newUrl, apiCalls, allFoundURLs)
    except (KeyboardInterrupt, SystemExit):
        print("Stopping crawl")
        self.browser.close()
        apiWriter = APIWriter(apiCalls)
        apiWriter.outputAPIs()
        exit(1)
    return apiCalls
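For orientation, a minimal sketch of driving this crawler programmatically, modeled on the Flask and CLI entry points further below; the target URL and search term are placeholders, and the assumption that start() dispatches into crawlingScan() when a URL is supplied is inferred rather than confirmed here.

from apiFinder import APIFinder
from apicall import APIWriter

# Placeholder target and search term; start() presumably ends up in
# crawlingScan() when a URL (rather than only a HAR directory) is given.
finder = APIFinder(url="http://example.com", harDirectory="hars",
                   searchString="userID", removeParams=False, count=3)
apiCalls = finder.start()
APIWriter(apiCalls).outputAPIs()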
from flask import request

from apicall import APIWriter
from apiFinder import APIFinder


def search():
    # (self, url=None, harDirectory=None, searchString=None, removeParams=False, count=1)
    # Flask view: the 'search' and 'url' query parameters supply the search
    # term and the target URL for a one-off scan.
    searchStr = request.args.get('search')
    urlStr = request.args.get('url')
    finder = APIFinder(url=urlStr, searchString=searchStr)
    apiCalls = finder.start()
    writer = APIWriter(apiCalls)
    return writer.outputJSON()
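A minimal sketch of serving this view with Flask; the route path, host, and port are assumptions for illustration rather than details taken from the project.

from flask import Flask

app = Flask(__name__)

# Assumed route: GET /search?search=<term>&url=<target>
app.add_url_rule("/search", view_func=search)

if __name__ == "__main__":
    app.run(host="127.0.0.1", port=5000)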
def crawling_scan(self, url, api_calls=None, all_found_urls=None):
    # None defaults so each top-level call gets fresh lists instead of
    # sharing a single mutable default across calls.
    if api_calls is None:
        api_calls = []
    if all_found_urls is None:
        all_found_urls = []
    self.count = self.count - 1
    if self.count < 0:
        return
    har_parser = HarParser(self.har_directory, search_string=self.search_string,
                           remove_params=self.remove_params)
    # If uncommented, will return as soon as a matching call is found
    # if self.search_string is not None and len(api_calls) > 0:
    #     return api_calls
    try:
        print("Scanning URL: " + url)
        html = self.open_url(url)
        if html is not None:
            soup = BeautifulSoup(html, "lxml")
            har_obj = har_parser.get_single_har_file()
            api_calls = har_parser.scan_har_file(har_obj, api_calls=api_calls)
            all_found_urls, new_urls = self.find_internal_urls(soup, url, all_found_urls)
            shuffle(new_urls)
            for new_url in new_urls:
                self.crawling_scan(new_url, api_calls, all_found_urls)
    except (KeyboardInterrupt, SystemExit):
        print("Stopping crawl")
        self.browser.close()
        api_writer = APIWriter(api_calls)
        api_writer.output_apis()
        sys.exit(1)  # requires `import sys` at module level
    return api_calls
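The None defaults above avoid the shared-mutable-default pitfall that the apiCalls=[] and allFoundURLs=[] defaults in the camelCase version are exposed to. A small self-contained illustration of that Python behavior, separate from the crawler itself:

def append_bad(item, bucket=[]):
    # The default list is created once and shared by every call.
    bucket.append(item)
    return bucket

def append_good(item, bucket=None):
    # A fresh list is created on each call that omits `bucket`.
    if bucket is None:
        bucket = []
    bucket.append(item)
    return bucket

print(append_bad(1), append_bad(2))    # [1, 2] [1, 2]
print(append_good(1), append_good(2))  # [1] [2]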
help= "File containing JSON formatted cookies to set in driver (with target URL only)", nargs='?') parser.add_argument("-i", help="Count of pages to crawl (with target URL only)", nargs='?') parser.add_argument( '--p', help= "Flag, remove unnecessary parameters (may dramatically increase run time)", action='store_true') args = parser.parse_args() if not (args.u or args.d): print("Need to provide either a URL or directory or both. Use -h for help") sys.exit(1) #Default to directory name "hars" and count of 1 directory = "hars" if args.d is None else args.d count = 1 if args.i is None else int(args.i) finder = APIFinder(url=args.u, harDirectory=directory, searchString=args.s, removeParams=args.p, count=count, cookies=args.c) apiCalls = finder.start() apiWriter = APIWriter(apiCalls) apiWriter.outputAPIs()
"File containing JSON formatted cookies to set in driver (with target URL only)", nargs='?') parser.add_argument("-i", help="Count of pages to crawl (with target URL only)", nargs='?') parser.add_argument( '--p', help= "Flag, remove unnecessary parameters (may dramatically increase run time)", action='store_true') args = parser.parse_args() if not (args.u or args.d): print("Need to provide either a URL or directory or both. Use -h for help") sys.exit(1) # Default to directory name "hars" and count of 1 directory = "hars" if args.d is None else args.d count = 1 if args.i is None else int(args.i) finder = APIFinder(url=args.u, har_directory=directory, search_string=args.s, remove_params=args.p, count=count, cookies=args.c) apiCalls = finder.start() apiWriter = APIWriter(apiCalls) apiWriter.output_apis()
from apicall import APIWriter
from apiFinder import APIFinder
import sys
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-u",
                    help="Target URL. If not provided, target directory will be scanned for har files.",
                    nargs='?')
parser.add_argument("-d",
                    help="Target directory (default is \"hars\"). If URL is provided, directory will store har files. If URL is not provided, directory will be scanned.",
                    nargs='?')
parser.add_argument("-s", help="Search term", nargs='?')
parser.add_argument("-c",
                    help="File containing JSON formatted cookies to set in driver (with target URL only)",
                    nargs='?')
parser.add_argument("-i", help="Count of pages to crawl (with target URL only)", nargs='?')
parser.add_argument('--p',
                    help="Flag, remove unnecessary parameters (may dramatically increase run time)",
                    action='store_true')
args = parser.parse_args()

if not (args.u or args.d):
    print("Need to provide either a URL or directory or both. Use -h for help")
    sys.exit(1)

#Default to directory name "hars" and count of 1
directory = "hars" if args.d is None else args.d
count = 1 if args.i is None else int(args.i)

finder = APIFinder(url=args.u, harDirectory=directory, searchString=args.s,
                   removeParams=args.p, count=count, cookies=args.c)
apiCalls = finder.start()
apiWriter = APIWriter(apiCalls)
apiWriter.outputAPIs()
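Assuming the script above is saved as apiscraper.py (the file name, target URL, and search term below are placeholders), typical invocations might look like the following: crawl three pages from a start URL while stripping unnecessary parameters, or scan an already-captured HAR directory for a term.

python apiscraper.py -u http://example.com -d hars -s "userID" -i 3 --p
python apiscraper.py -d hars -s "userID"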