Example #1
from urlparse import urlparse  #Python 2 import used by the crawl loop below; crawler, SSEDAO and updateUncrawledLinks are assumed to come from the project's own modules

def main(start, d):
    #crawls the seed URL and its child links, then stores the results in the database via SSEDAO
    seeds = []
    seeds.append(start)

#################LOOP FOR EACH SEED#################
    for x in range(0, len(seeds)):
        depth = d
        seed = seeds.pop(0), depth
        
        domainDetailList = []
        domainFileList = []
        uncrawledLinks = []
        crawledLinks = []
        crawlerObj = crawler()  #initialize crawler object
        domainDetail = crawlerObj.crawlDomain(seed)  #first crawls the domain of the seed URL. returns a DomainDetail object
        domainDetailList.append(domainDetail) #stores the returned DomainDetail object in the domainDetailList[]
        domainFile, childLinks = crawlerObj.crawlFilePath(seed)  #second crawls the file path of the seed URL. returns childLinks and a DomainFile object
        domainFileList.append([])  #appends a list at the first index of the domainFileList
        domainFileList[0].append(domainFile)  #appends the domainFile retrieved from the seed URL to the same row index which is equal to the index of the domain in the domainDetailList
        crawledLinks.append(seed[0]) #adds the seed URL to the list of crawled Links
        updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)  #appends the childLinks obtained from the first crawl to the list of uncrawled Links
                                                                        #**may need to have updateUncrawledLinks() return new version of uncrawledLinks and crawledLinks
        
    #################WHILE LOOP FOR CRAWLING CHILDLINKS OF SEED#################
        while(len(uncrawledLinks) > 0):
            currUrl = uncrawledLinks.pop(0) #removes the link at the beginning of the uncrawledLinks list and sets currUrl to that link
            crawledLinks.append(currUrl[0])  #adds the link that was just removed from uncrawledLinks to crawledLinks
            scheme, domain, filePath, params, query, fragment = urlparse(currUrl[0])
            
            domainExists = False  #flag indicating whether the domain of the currUrl already exists in the domainDetailList
            y = 0  #index into the domainDetailList
            
            #loops while the domain is not found in the domainDetailList and we are within the bounds of the domainDetailList array
            while((domainExists == False) and (y < len(domainDetailList))):
                if(domainDetailList[y].getDomainName() == domain):  #checks if the domain already exists in the domainDetailList
                    domainFile, childLinks = crawlerObj.crawlFilePath(currUrl)  #since the domain has already been crawled, just crawl the page source of the URL
                    domainFileList[y].append(domainFile)  #append the domainFile to the same row index which is equal to the index of the domain in the domainDetailList
                    domainExists = True  #domain was found in the domainDetailList, so set domainExists to True
                    updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)
                y = y + 1
            
            if(domainExists != True):
                domainDetail = crawlerObj.crawlDomain(currUrl)
                ddIndex = len(domainDetailList)
                domainDetailList.append(domainDetail)
                domainFile, childLinks = crawlerObj.crawlFilePath(currUrl)
                domainFileList.append([])
                domainFileList[ddIndex].append(domainFile)
                updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)
    #################WHILE LOOP FOR CRAWLING CHILDLINKS OF SEED#################
    print "seed '" + seed[0] + "' done"
    
    print "commencing storage Of DomainDetail objects into database..."
    ssedaoObj = SSEDAO()
    for index in range(0, len(domainDetailList)):
        domainDetailList[index].setDomainFiles(domainFileList[index])
        ssedaoObj.storeCrawlerInformation(domainDetailList[index])
        print"DomainDetail object for '" + domainDetailList[index].getDomainName() + "' send to database"
Example #2
            found = True
        
        if(found == False):
            CLIndex = CLIndex + 1
    
    while (len(childLinks) > 0):
        uncrawledLinks.append(childLinks.pop(0))
       
    return childLinks, uncrawledLinks, crawledLinks
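#The return above matches the note in Example #1 that updateUncrawledLinks() may
#need to hand back new versions of uncrawledLinks and crawledLinks. A hypothetical
#call site consuming that return could look like:
#    childLinks, uncrawledLinks, crawledLinks = updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)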
  

#################PROGRAM BEGINS HERE!#################        
seeds = []
seeds.append("http://www.bofa.com/")  

seeds = SSEDAO.getDomainsForCrawler()  #overwrites the hard-coded seed with the domain list from the database
for x in range(0, len(seeds)):
    seeds[x] = "http://" + seeds[x]  #prepend the scheme to each domain returned from the database


#################LOOP FOR EACH SEED#################
for x in range(0, len(seeds)):
    depth = 5
    seed = seeds.pop(0), depth
    
    domainDetailList = []
    domainFileList = []
    uncrawledLinks = []
    crawledLinks = []
    crawlerObj = crawler()  #initialize crawler object
    domainDetail = crawlerObj.crawlDomain(seed)  #first crawls the domain of the seed URL. returns a DomainDetail object
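For reference, a minimal sketch of the URL decomposition used inside the crawl loop above. The example URL is illustrative only, and the import location differs between Python 2 (which these examples target) and Python 3.

try:
    from urlparse import urlparse      #Python 2, as used by the examples above
except ImportError:
    from urllib.parse import urlparse  #Python 3 location of the same function

#urlparse() splits a URL into the six fields unpacked in the crawl loop
scheme, domain, filePath, params, query, fragment = urlparse("http://www.bofa.com/index.html?id=1#top")
print(scheme)    #http
print(domain)    #www.bofa.com
print(filePath)  #/index.html
print(query)     #id=1
print(fragment)  #top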