def __init__(self, siteURL, doHTMLValidation, verboseOutput, checkLinkReference, checkForBrokenImages, robotsAndFavicon):
    """Store the crawl options and create the empty bookkeeping containers.

    siteURL              -- URL the crawl starts from; also kept in split form.
    doHTMLValidation     -- stored as-is and handed to the SiteReporter.
    verboseOutput        -- stored as-is.
    checkLinkReference   -- stored as-is and handed to the SiteReporter.
    checkForBrokenImages -- stored as-is.
    robotsAndFavicon     -- stored as-is.
    """
    # Options, kept exactly as they were passed in.
    self.siteURL = siteURL
    self.verboseOutput = verboseOutput
    self.doHTMLValidation = doHTMLValidation
    self.checkLinkReference = checkLinkReference
    self.checkForBrokenImages = checkForBrokenImages
    self.robotsAndFavicon = robotsAndFavicon

    # Bookkeeping containers; all of them start out empty.
    self.linksToBeChecked = {}
    self.checkedLinks = {}
    self.uncheckedExternalLinks = []

    # Derived data and collaborators.
    self.siteURLParts = urlparse.urlsplit(siteURL)
    self.sitemap = SitemapGenerator()
    self.siteReporter = SiteReporter(self.doHTMLValidation, self.checkLinkReference)

    # No thread manager yet.
    self.threadManager = None
class RetrieveLinks(): def __init__(self, siteURL, doHTMLValidation, verboseOutput, checkLinkReference, checkForBrokenImages, robotsAndFavicon): self.linksToBeChecked = {} self.checkedLinks = {} self.uncheckedExternalLinks = [] self.doHTMLValidation = doHTMLValidation self.checkLinkReference = checkLinkReference self.checkForBrokenImages = checkForBrokenImages self.robotsAndFavicon = robotsAndFavicon self.siteURL = siteURL self.siteURLParts = urlparse.urlsplit(siteURL) self.sitemap = SitemapGenerator() self.verboseOutput = verboseOutput self.siteReporter = SiteReporter(self.doHTMLValidation, self.checkLinkReference) self.threadManager = None def run(self): parseHTMLPage = HTMLParseRules() pathname = "reports/"+self.siteURLParts.netloc imgsURLsToCheck = [] validationDataToCheck = [] startTime = time.time() # Make a new directory if it doesn't exist # Else throw all error's except for when the directory already exists (errno.EEXIST) try: os.makedirs(pathname) except OSError as exception: if exception.errno != errno.EEXIST: raise realSiteURL = self.siteURL try: requestOutput = urllib2.urlopen(self.siteURL) self.sitemap.setHomePage(requestOutput.geturl()) realSiteURL = requestOutput.geturl() self.siteURLParts = urlparse.urlsplit(realSiteURL) self.linksToBeChecked[realSiteURL] = None except urllib2.URLError: sys.stderr.write("Unable to connect to '"+self.siteURL+"'\n") sys.exit(0) self.threadManager = Threader.ThreadManagementThread() self.threadManager.start() # Create queue and threads for uber speed page requests. 
linksToBeCheckedQueue = Queue.Queue() jobsOutput = [] requestPageThreads = [] requestPageExceptions = [] for i in range(5): thread = Threader.RequestPageThread(linksToBeCheckedQueue, jobsOutput, self.verboseOutput, requestPageExceptions) thread.start() requestPageThreads.append(thread) self.threadManager.registerThreads(requestPageThreads, linksToBeCheckedQueue, requestPageExceptions) if (self.robotsAndFavicon): if (self.checkForRobots(self.siteURL)): if (self.verboseOutput): print "Robots.txt is present" else: if (self.verboseOutput): print "Check if robots.txt is present" if (self.checkForFavicon(self.siteURL)): if (self.verboseOutput): print "Favicon.ico is present" else: if (self.verboseOutput): print "Check if favicon.ico is present" else: if (self.verboseOutput): print "Checking for robots.txt and favicon.ico has been switched off for this test!" # Run each link through the HTMLParser and HTMLvalidator while (len(self.linksToBeChecked) > 0): jobsOutput[:] = [] if (self.verboseOutput): print "Checking URLs ..." while (len(self.linksToBeChecked) > 0): currentURL, parentURL = self.linksToBeChecked.popitem() if (currentURL not in self.checkedLinks): linksToBeCheckedQueue.put({'currentURL': currentURL, 'parentURL' : parentURL}) linksToBeCheckedQueue.join() if (self.verboseOutput): print "Done threading!\n" if (len(requestPageExceptions) > 0): self.threadManager.exit() raise requestPageExceptions[0][0], requestPageExceptions[0][1], requestPageExceptions[0][2] # lijst van alle jobs met HTML output van iedere job # for output in outputs: for jobOutput in jobsOutput: if (self.verboseOutput): print " "+jobOutput['currentURL']+" ... 
("+str(round(jobOutput['elapsedTime'], 3))+")" if (jobOutput['requestOutput'] == False or not hasattr(jobOutput['requestOutput'], "geturl")): self.siteReporter.addLinkToBrokenLinks(jobOutput['currentURL'], jobOutput['parentURL']) self.checkedLinks[jobOutput['currentURL']] = jobOutput['parentURL'] continue elif (jobOutput['requestOutput'].geturl() != jobOutput['currentURL']): self.checkedLinks[jobOutput['currentURL']] = jobOutput['parentURL'] jobOutput['currentURL'] = jobOutput['requestOutput'].geturl() if (jobOutput['currentURL'] in self.checkedLinks): # Happens if the site redirects us to a page that we've already done before. continue if ("text/html" not in Utilities.getContentType(jobOutput['requestOutput'])): #print "Request: " + str(requestOutput) + " Parent: ", parentURL, "\n" # The request file is not an HTML page, so we skip it. self.checkedLinks[jobOutput['currentURL']] = jobOutput['parentURL'] continue # Set encoding for UTF-8 pages. encoding = jobOutput['requestOutput'].headers.getparam('charset') html = jobOutput['requestOutput'].read().decode(encoding) parseHTMLPage.cleanCurrentData() parseHTMLPage.feed(html) parseHTMLPage.close() urlsInCurrentPage = parseHTMLPage.getAllURLs() imgsInCurrentPage = parseHTMLPage.getAllImgURLs() self.collectURLs(urlsInCurrentPage, jobOutput['currentURL']) self.sitemap.addURL(jobOutput['currentURL'], jobOutput['parentURL']) if (jobOutput['parentURL'] is None): parentURL = realSiteURL else: parentURL = jobOutput['parentURL'] for imgURL in imgsInCurrentPage: imgsURLsToCheck.append({'imgURL' : imgURL, 'parentURL': parentURL}) # Perform HTML validation if (self.doHTMLValidation): validationDataToCheck.append({'currentURL': jobOutput['currentURL'], 'html' : html, 'time' : jobOutput['elapsedTime']}) # Print if a link has been checked. 
# Reference: def run() -> while -> try -> print #if (self.verboseOutput): # print "" #print " Parent: ", parentURL #print "Done checking "+jobOutput['currentURL']+"\n" self.checkedLinks[jobOutput['currentURL']] = jobOutput['parentURL'] if (self.verboseOutput): print "" # End of while loop nrOfLivingThreads = 0 for thread in requestPageThreads: if (thread.isAlive()): nrOfLivingThreads += 1 for i in range(nrOfLivingThreads): linksToBeCheckedQueue.put('die') linksToBeCheckedQueue.join() # Process html validation in a thread-class if (self.doHTMLValidation): if (self.verboseOutput): print "Sending validation request for ..." self.executeValidation(validationDataToCheck) if (self.verboseOutput): print "Requests has been validated!\n" else: if (self.verboseOutput): print "Validating URLs has been switched off for this test!" # process external links in a thread-class if (self.verboseOutput): print "Checking external links ..." self.executeJobs(self.uncheckedExternalLinks, "externalLinks", 20) if (self.verboseOutput): print "Done checking external links!\n" # Process image links in a thread-class if (self.checkForBrokenImages): if (self.verboseOutput): print "Checking for broken image links ..." self.executeJobs(imgsURLsToCheck, "imageLinks") if (self.verboseOutput): print "Done checking for broken image links!\n " else: if (self.verboseOutput): print "Checking for broken image links has been switched off for this test!\n" self.threadManager.exit() currentDate = datetime.datetime.now() date = currentDate.strftime("%d-%m-%Y") self.siteReporter.generateReport(pathname+"/"+self.siteURLParts.netloc+"-problemReport-"+date+".html") self.sitemap.generateSitemap(pathname+"/"+self.siteURLParts.netloc+"-sitemap-"+date+".xml") if (self.verboseOutput): print str(time.time() - startTime)+" seconds!" if (self.verboseOutput): print "Check finished!" 
def collectURLs(self, urls, currentURL): for url in urls: urlParts = urlparse.urlsplit(url) urlToBeChecked = None if (urlParts.netloc == self.siteURLParts.netloc): if (url not in self.linksToBeChecked and url not in self.checkedLinks): urlToBeChecked = Utilities.removeTokenAndAddMissingSlash(url) elif (urlParts.scheme == "" and urlParts.netloc == ""): if (len(urlParts.fragment) > 0): pass elif (urlParts.path == "" and len(urlParts.query) > 0): # Only a query part in the URL. urlToBeChecked = currentURL+urlParts.query elif (len(urlParts.path) > 0 and urlParts.path[0] == "/"): # Absolute URL to the URL we're now currently at. currentURLParts = urlparse.urlsplit(currentURL) urlParts = urlparse.urlsplit(Utilities.removeTokenAndAddMissingSlash(url)) if (len(urlParts.query) > 0): query = u"?"+urlParts.query else: query = u"" urlToBeChecked = currentURLParts.scheme+"://"+currentURLParts.netloc+urlParts.path+query elif (len(urlParts.path) > 0 and urlParts.path[0] != "/"): # Relative URL to the URL we're now currently at. 
urlParts = urlparse.urlsplit(Utilities.removeTokenAndAddMissingSlash(url)) if (len(urlParts.query) > 0): query = u"?"+urlParts.query else: query = u"" urlToBeChecked = currentURL + urlParts.path + query elif (urlParts.netloc != self.siteURLParts.netloc): self.uncheckedExternalLinks.append({'urlToCheck': url, 'parentURL' : currentURL}) elif (len(urlParts.fragment) > 0): continue if (type(urlToBeChecked) == unicode and urlToBeChecked not in self.checkedLinks and urlToBeChecked not in self.linksToBeChecked and currentURL != urlToBeChecked): self.linksToBeChecked[urlToBeChecked] = currentURL self.siteReporter.addLinkToAllURLParents(urlToBeChecked, currentURL) if (type(urlToBeChecked) == unicode and self.siteReporter.isURLInBrokenLinks(urlToBeChecked)): self.siteReporter.addLinkToBrokenLinks(urlToBeChecked, currentURL) elif (self.siteReporter.isURLInBrokenLinks(url)): self.siteReporter.addLinkToBrokenLinks(url, currentURL) if (type(urlToBeChecked) == unicode and self.siteReporter.isURLInAllURLParents(urlToBeChecked)): self.siteReporter.addLinkToAllURLParents(urlToBeChecked, currentURL) elif (self.siteReporter.isURLInAllURLParents(url)): self.siteReporter.addLinkToAllURLParents(url, currentURL) #print "-----------------------------------------------------\n" def executeJobs(self, jobs, jobType, nrOfThreads = 5): queue = Queue.Queue() checkedLinks = {} exceptions = [] threads = [] for i in range(nrOfThreads): if (jobType == "imageLinks"): thread = Threader.ImageLinkThread(queue, self.siteReporter, checkedLinks, self.verboseOutput, exceptions, self.siteURL) elif (jobType == "externalLinks"): thread = Threader.ExternalLinkThread(queue, self.siteReporter, checkedLinks, self.verboseOutput, exceptions) thread.start() threads.append(thread) threadDataID = self.threadManager.registerThreads(threads, queue, exceptions) for job in jobs: queue.put(job) queue.join() nrOfLivingThreads = 0 for thread in threads: if (thread.isAlive()): nrOfLivingThreads += 1 for i in 
range(nrOfLivingThreads): queue.put('die') queue.join() if (len(exceptions) > 0): self.threadManager.exit() raise exceptions[0][0], exceptions[0][1], exceptions[0][2] else: self.threadManager.unregisterThreads(threadDataID) def executeValidation(self, jobs): queue = Queue.Queue() exceptions = [] threads = [] for i in range(5): thread = Threader.HTMLValidationThread(queue, self.siteReporter, self.verboseOutput, exceptions) thread.start() threads.append(thread) self.threadManager.registerThreads(threads, queue, exceptions) for job in jobs: queue.put(job) queue.join() nrOfLivingThreads = 0 for thread in threads: if (thread.isAlive()): nrOfLivingThreads += 1 for i in range(nrOfLivingThreads): queue.put('die') queue.join() if (len(exceptions) > 0): self.threadManager.exit() raise exceptions[0][0], exceptions[0][1], exceptions[0][2] # Check if robots.txt exists in the root directory def checkForRobots(self, siteURL): robots = urllib2.urlopen(self.siteURLParts.scheme + "://" + self.siteURLParts.netloc + "/robots.txt") if (robots): return True else: pass # Check if favicon.ico exists in the root directory def checkForFavicon(self, siteURL): favicon = urllib2.urlopen(self.siteURLParts.scheme + "://" + self.siteURLParts.netloc + "/favicon.ico") if (favicon): return True else: pass