class mySpider(object):
    """Breadth-first web crawler that saves fetched pages to disk.

    Starting from a single URL, the spider repeatedly takes the oldest URL
    from its frontier, downloads it, stores the raw bytes as an ``.html``
    file, and enqueues the links found on the page, subject to the
    ``robots.txt`` rules of the starting domain.

    Attributes:
        will_crawl: FIFO list of URLs still waiting to be fetched.
        visted: Set of URLs already processed (successfully or not). The
            misspelled name is kept because it is part of the public
            interface.
        parser: Link extractor; must provide ``findLinks(url)``.
    """

    def __init__(self):
        self.will_crawl = []
        self.visted = set()
        self.parser = HTMLParser()

    @staticmethod
    def _normalize_url(url):
        """Return *url* without fragment or one trailing '/', with a scheme.

        Returns an empty string when nothing is left after stripping, so
        the caller can skip crawling instead of indexing into an empty
        string.
        """
        fragment_at = url.find('#')
        if fragment_at != -1:
            url = url[:fragment_at]
        if url.endswith('/'):
            url = url[:-1]
        if not url:
            return ""
        if not url.startswith("http"):
            url = "http://" + url
        return url

    @staticmethod
    def _save_page(directory, url, source_code):
        """Write *source_code* to a file named after *url*; return success."""
        # Percent-encode the URL (including '/' and '.') so it is usable as
        # a single file name.
        encoded = urllib.parse.quote(str(url), safe='.')
        encoded = encoded.replace(".", "%2E")
        name = '{0}/{1}.html'.format(directory, encoded)
        try:
            # The context manager closes the file even when write() fails.
            with open(name, 'wb') as out_file:
                out_file.write(source_code)
        except Exception:
            # Best effort: a page that cannot be stored must not stop the
            # crawl, but Ctrl-C / SystemExit are no longer swallowed.
            print("\tFile Exception\t{0}".format(name))
            return False
        return True

    def crawl(self, initial_url, maxFiles):
        """Crawl outward from *initial_url*, saving at most *maxFiles* pages.

        Args:
            initial_url: Starting address. A ``#fragment`` and one trailing
                ``/`` are removed and ``http://`` is prepended when the
                value does not already start with ``http``.
            maxFiles: Maximum number of pages to write to disk. Pages whose
                file could not be written do not count towards the limit.

        Returns:
            None. Pages are written below
            ``../file_cache/unprocessed/newly_crawled/`` and progress is
            reported with ``print``.

        Raises:
            Whatever ``RobotFileParser.read()`` raises when ``robots.txt``
            cannot be fetched; errors for individual pages are caught and
            printed.
        """
        initial_url = self._normalize_url(initial_url)
        if not initial_url:
            # Nothing usable was supplied (empty string or only a fragment).
            print("No URL to crawl.")
            return

        # Consult robots.txt of the starting domain so the crawl is polite.
        initdomain = urllib.parse.urlparse(initial_url).netloc
        print("domain is " + initdomain)
        robot = urllib.robotparser.RobotFileParser()
        robot.set_url("http://" + initdomain + "/robots.txt")
        print("check 1")
        robot.read()
        print("check 2")
        if robot.can_fetch("*", initial_url):
            self.will_crawl.append(initial_url)

        filenum = 1  # 1-based count of the next file to be written
        directory = "../file_cache/unprocessed/newly_crawled/"
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(directory, exist_ok=True)

        while self.will_crawl and filenum <= maxFiles:
            url = self.will_crawl.pop(0)
            try:
                print("Spider at:", url)
                response = urlopenFun(url)
                try:
                    source_code = response.read()
                finally:
                    # Release the connection even if reading fails; the
                    # guard tolerates response objects without close().
                    close = getattr(response, "close", None)
                    if callable(close):
                        close()

                if self._save_page(directory, url, source_code):
                    filenum += 1

                links = self.parser.findLinks(url)
                self.visted.add(url)

                # Enqueue unseen links that robots.txt allows. A distinct
                # loop variable keeps `url` bound to the current page so the
                # outer handler marks the right URL as visited.
                for link in links:
                    try:
                        if (link not in self.visted
                                and link not in self.will_crawl
                                and robot.can_fetch("*", link)):
                            self.will_crawl.append(link)
                    except Exception:
                        self.visted.add(link)
                        print("Some error occurred adding to frontier.")
            except Exception as e:
                # Any failure for this page: mark it done and move on.
                self.visted.add(url)
                print(e)