def main(start, d):
    """Crawl outward from a single seed URL and store the results.

    The seed is packaged as the tuple ``(start, d)`` and handed to a
    ``crawler`` instance: first ``crawlDomain`` (yielding a DomainDetail
    object), then ``crawlFilePath`` (yielding a DomainFile object plus the
    child links found on the page).  Child links are queued through
    ``updateUncrawledLinks`` and processed first-in/first-out until the queue
    is empty.  Every crawled page is grouped under the DomainDetail whose
    ``getDomainName()`` equals the network-location part of the page URL.
    Finally each DomainDetail receives its collected files via
    ``setDomainFiles`` and is passed to ``SSEDAO.storeCrawlerInformation``.

    Parameters:
        start -- seed URL to begin crawling from.
        d     -- depth value paired with the seed.  NOTE(review): this
                 function never inspects it; any depth limiting presumably
                 happens inside the crawler / updateUncrawledLinks -- confirm.

    Returns nothing; progress messages are printed to stdout.
    """
    pending_seeds = [start]

    #################LOOP FOR EACH SEED#################
    for _ in range(len(pending_seeds)):
        seed = (pending_seeds.pop(0), d)

        # domain_details[i] and files_by_domain[i] always describe the same
        # domain: the two lists are kept index-aligned throughout.
        domain_details = []
        files_by_domain = []
        frontier = []   # links waiting to be crawled
        visited = []    # URLs that have already been handed to the crawler

        spider = crawler()

        # The seed's domain is crawled first, then the seed page itself.
        domain_details.append(spider.crawlDomain(seed))
        seed_file, found_links = spider.crawlFilePath(seed)
        files_by_domain.append([seed_file])

        visited.append(seed[0])
        updateUncrawledLinks(found_links, frontier, visited)

        #################CRAWL THE CHILD LINKS OF THE SEED#################
        while frontier:
            current = frontier.pop(0)
            # Queue entries are indexed with [0] to obtain the URL --
            # presumably (url, depth) tuples shaped like the seed; confirm
            # against updateUncrawledLinks.
            visited.append(current[0])
            _scheme, host, _path, _params, _query, _fragment = urlparse(current[0])

            # Find the slot of an already-known domain, if there is one.
            slot = None
            for position, known in enumerate(domain_details):
                if known.getDomainName() == host:
                    slot = position
                    break

            if slot is None:
                # First page seen for this domain: crawl the domain itself
                # and open a new, index-aligned file bucket for it.
                slot = len(domain_details)
                domain_details.append(spider.crawlDomain(current))
                files_by_domain.append([])

            # Known or new, the page itself is crawled and its links queued.
            page_file, found_links = spider.crawlFilePath(current)
            files_by_domain[slot].append(page_file)
            updateUncrawledLinks(found_links, frontier, visited)

        print("seed '" + seed[0] + "' done")
        print("commencing storage Of DomainDetail objects into database...")

        dao = SSEDAO()
        for detail, files in zip(domain_details, files_by_domain):
            detail.setDomainFiles(files)
            dao.storeCrawlerInformation(detail)
            print("DomainDetail object for '" + detail.getDomainName() + "' send to database")
found = True if(found == False): CLIndex = CLIndex + 1 while (len(childLinks) > 0): uncrawledLinks.append(childLinks.pop(0)) return childLinks, uncrawledLinks, crawledLinks #################PROGRAM BEGINS HERE!################# seeds = [] seeds.append("http://www.bofa.com/") seeds = SSEDAO.getDomainsForCrawler() for x in range (0, len(seeds)): seed[x] = "http://" + seed[x] #################LOOP FOR EACH SEED################# for x in range(0, len(seeds)): depth = 5 seed = seeds.pop(0), depth domainDetailList = [] domainFileList = [] uncrawledLinks = [] crawledLinks = [] crawlerObj = crawler() #initialize crawler object domainDetail = crawlerObj.crawlDomain(seed) #first crawls the domain of the seed URL. returns a DomainDetail object