示例#1
0
 def saveAllFileExtensions(self):
     """Save the file extensions seen among the crawled hrefs.

     The list returned by ``self._getCrawlerFileExts()`` is handed to
     ``saveSection('FileExtensions', ...)`` on a ``CrawlerFile`` built for
     ``self.url``.

     Best effort: any ``Exception`` is reported on stdout and not re-raised.
     """
     try:
         cf = CrawlerFile(url=self.url)
         cf.saveSection('FileExtensions', self._getCrawlerFileExts())
     except Exception as e:
         # A single-argument print(...) parses and prints the same way on
         # Python 2 and Python 3.
         print('Exception:\t' + str(e))
示例#2
0
 def saveAllHrefsToFile(self, nonehtml=True):
     """Save the visited and unvisited hrefs through ``CrawlerFile``.

     Hrefs from ``self.visitedHrefs`` followed by ``self.unvisitedHrefs`` are
     passed to ``saveSection('Hrefs', ..., coverfile=True)`` on a
     ``CrawlerFile`` built for ``self.url``.

     Args:
         nonehtml: when true (the default), hrefs ending in '.html' are
             left out of the saved list.

     Best effort: any ``Exception`` is reported on stdout and not re-raised,
     so a failed save is visible instead of being silently dropped.
     """
     try:
         cf = CrawlerFile(url=self.url)
         hrefs = list(self.visitedHrefs) + list(self.unvisitedHrefs)
         contentlist = [
             href for href in hrefs
             if not (nonehtml and href.endswith('.html'))
         ]
         cf.saveSection('Hrefs', contentlist, coverfile=True)
     except Exception as e:
         # Catch Exception rather than using a bare except so that
         # KeyboardInterrupt / SystemExit still propagate.
         print('Exception:\t' + str(e))
示例#3
0
 def _getCrawlerFileExts(self):
     """Return the distinct, lower-cased file extensions of the saved hrefs.

     The hrefs are read with ``getSection('Hrefs')`` from a ``CrawlerFile``
     built for ``self.url``. An extension is everything from the last '.'
     of the final path segment onwards (leading dot included, e.g. '.php').
     Extensions are returned in order of first appearance.

     Returns:
         A list of extension strings, or ``[]`` if any ``Exception`` occurs
         (the exception is reported on stdout).
     """
     try:
         exts = []
         seen = set()  # O(1) duplicate check; ``exts`` keeps the order
         cf = CrawlerFile(url=self.url)
         # NOTE(review): _getCrawlerPaths strips CR/LF from each href before
         # parsing; confirm whether getSection can return line endings here.
         for eachurl in cf.getSection('Hrefs'):
             # Only the last path segment can carry a file extension; a dot
             # inside a directory name ('/v1.2/index') must not count.
             filename = urlparse(eachurl).path.rsplit('/', 1)[-1]
             pos = filename.rfind('.')
             if pos == -1:
                 continue
             ext = filename[pos:].lower()
             if ext not in seen:
                 seen.add(ext)
                 exts.append(ext)
         return exts
     except Exception as e:
         print('Exception:\t' + str(e))
         return []
示例#4
0
    def saveAllPaths(self):
        """Save the directory-level URLs derived from the crawled hrefs.

        The list returned by ``self._getCrawlerPaths(self.url)`` is handed to
        ``saveSection('Paths', ...)`` on a ``CrawlerFile`` built for
        ``self.url``.

        Best effort: any ``Exception`` is reported on stdout and not
        re-raised.
        """
        try:
            cf = CrawlerFile(url=self.url)
            cf.saveSection('Paths', self._getCrawlerPaths(self.url))
        except Exception as e:
            # A single-argument print(...) parses and prints the same way on
            # Python 2 and Python 3.
            print('Exception:\t' + str(e))
示例#5
0
    def _getCrawlerPaths(self, url):
        """Return the distinct directory-level URLs of the saved hrefs.

        The hrefs are read with ``getSection('Hrefs')`` from a
        ``CrawlerFile`` built for ``url``. Only hrefs whose scheme and netloc
        match ``url`` are considered. For each of them, every prefix of the
        path that ends just before a '/' is emitted as
        ``scheme://netloc<prefix>``; e.g. ``http://h/a/b/c.php`` yields
        ``http://h``, ``http://h/a`` and ``http://h/a/b``. A path without any
        '.' is treated as a directory, so its last segment is emitted too.

        Args:
            url: base URL; also used to locate the ``CrawlerFile``.

        Returns:
            A list of URLs in order of first appearance, or ``[url]`` if any
            ``Exception`` occurs (the exception is reported on stdout).
        """
        try:
            paths = []
            seen = set()  # O(1) duplicate check; ``paths`` keeps the order
            baseulp = urlparse(url)

            cf = CrawlerFile(url=url)
            for eachline in cf.getSection('Hrefs'):
                eachline = eachline.replace('\r', '').replace('\n', '')
                eachulp = urlparse(eachline)
                if (baseulp.scheme != eachulp.scheme
                        or baseulp.netloc != eachulp.netloc):
                    continue

                fullpath = eachulp.path
                # No dot anywhere in the path: treat the last segment as a
                # directory by giving it a trailing slash.
                if '.' not in fullpath and not fullpath.endswith('/'):
                    fullpath += '/'

                root = eachulp.scheme + '://' + eachulp.netloc
                pos = fullpath.find('/')
                while pos != -1:
                    tmppth = root + fullpath[:pos]
                    # A candidate ending in '/' (consecutive slashes, or an
                    # empty netloc) is skipped, not emitted.
                    if not tmppth.endswith('/') and tmppth not in seen:
                        seen.add(tmppth)
                        paths.append(tmppth)
                    # Always advance, even for skipped candidates; otherwise
                    # the same '/' is found again and the loop never ends.
                    pos = fullpath.find('/', pos + 1)

            return paths
        except Exception as e:
            print('Exception:\t' + str(e))
            return [url]