def saveAllFileExtensions(self):
    """Save the crawler's file extensions under the 'FileExtensions' section.

    Builds a ``CrawlerFile`` for ``self.url`` and passes it the list
    returned by ``self._getCrawlerFileExts()``.

    Best-effort: any ``Exception`` raised along the way is printed to
    stdout with an ``'Exception:\\t'`` prefix and swallowed, so the method
    returns ``None`` instead of propagating the error.
    """
    try:
        cf = CrawlerFile(url=self.url)
        contentlist = self._getCrawlerFileExts()
        cf.saveSection('FileExtensions', contentlist)
    except Exception as e:
        # A single pre-joined argument gives identical output under the
        # Python 2 print statement and the Python 3 print function.
        print('Exception:\t' + str(e))
def saveAllHrefsToFile(self, nonehtml=True):
    """Save the visited and unvisited hrefs under the 'Hrefs' section.

    The hrefs are taken from ``self.visitedHrefs`` followed by
    ``self.unvisitedHrefs`` and handed to a ``CrawlerFile`` built for
    ``self.url``, with ``coverfile=True``.

    Args:
        nonehtml: When true (the default), hrefs ending in '.html' are
            left out; when false, every href is saved.

    Best-effort: any ``Exception`` is printed to stdout with an
    ``'Exception:\\t'`` prefix and swallowed, matching the other save
    methods, instead of being dropped silently. ``KeyboardInterrupt`` and
    ``SystemExit`` are no longer intercepted.
    """
    try:
        cf = CrawlerFile(url=self.url)
        hrefs = list(self.visitedHrefs) + list(self.unvisitedHrefs)
        contentlist = [
            href for href in hrefs
            if not (href.endswith('.html') and nonehtml)
        ]
        cf.saveSection('Hrefs', contentlist, coverfile=True)
    except Exception as e:
        # A single pre-joined argument gives identical output under the
        # Python 2 print statement and the Python 3 print function.
        print('Exception:\t' + str(e))
def saveAllFileExtensions(self):
    """Save the crawler's file extensions under the 'FileExtensions' section.

    Builds a ``CrawlerFile`` for ``self.url`` and passes it the list
    returned by ``self._getCrawlerFileExts()``.

    Best-effort: any ``Exception`` raised along the way is printed to
    stdout with an ``'Exception:\\t'`` prefix and swallowed, so the method
    returns ``None`` instead of propagating the error.
    """
    try:
        cf = CrawlerFile(url=self.url)
        contentlist = self._getCrawlerFileExts()
        cf.saveSection('FileExtensions', contentlist)
    except Exception as e:
        # A single pre-joined argument gives identical output under the
        # Python 2 print statement and the Python 3 print function.
        print('Exception:\t' + str(e))
def saveAllHrefsToFile(self, nonehtml=True):
    """Save the visited and unvisited hrefs under the 'Hrefs' section.

    The hrefs are taken from ``self.visitedHrefs`` followed by
    ``self.unvisitedHrefs`` and handed to a ``CrawlerFile`` built for
    ``self.url``, with ``coverfile=True``.

    Args:
        nonehtml: When true (the default), hrefs ending in '.html' are
            left out; when false, every href is saved.

    Best-effort: any ``Exception`` is printed to stdout with an
    ``'Exception:\\t'`` prefix and swallowed, matching the other save
    methods, instead of being dropped silently. ``KeyboardInterrupt`` and
    ``SystemExit`` are no longer intercepted.
    """
    try:
        cf = CrawlerFile(url=self.url)
        hrefs = list(self.visitedHrefs) + list(self.unvisitedHrefs)
        contentlist = [
            href for href in hrefs
            if not (href.endswith('.html') and nonehtml)
        ]
        cf.saveSection('Hrefs', contentlist, coverfile=True)
    except Exception as e:
        # A single pre-joined argument gives identical output under the
        # Python 2 print statement and the Python 3 print function.
        print('Exception:\t' + str(e))
def saveAllPaths(self):
    """Save the crawler's paths under the 'Paths' section.

    Builds a ``CrawlerFile`` for ``self.url`` and passes it the list
    returned by ``self._getCrawlerPaths(self.url)``.

    Best-effort: any ``Exception`` raised along the way is printed to
    stdout with an ``'Exception:\\t'`` prefix and swallowed, so the method
    returns ``None`` instead of propagating the error.
    """
    try:
        cf = CrawlerFile(url=self.url)
        contentlist = self._getCrawlerPaths(self.url)
        cf.saveSection('Paths', contentlist)
    except Exception as e:
        # A single pre-joined argument gives identical output under the
        # Python 2 print statement and the Python 3 print function.
        print('Exception:\t' + str(e))
def saveAllPaths(self):
    """Save the crawler's paths under the 'Paths' section.

    Builds a ``CrawlerFile`` for ``self.url`` and passes it the list
    returned by ``self._getCrawlerPaths(self.url)``.

    Best-effort: any ``Exception`` raised along the way is printed to
    stdout with an ``'Exception:\\t'`` prefix and swallowed, so the method
    returns ``None`` instead of propagating the error.
    """
    try:
        cf = CrawlerFile(url=self.url)
        contentlist = self._getCrawlerPaths(self.url)
        cf.saveSection('Paths', contentlist)
    except Exception as e:
        # A single pre-joined argument gives identical output under the
        # Python 2 print statement and the Python 3 print function.
        print('Exception:\t' + str(e))