def _getCrawlerFileExts(self):
    """Collect the distinct, lower-cased file extensions found in the
    crawler result file's 'Hrefs' section for self.url.

    Returns:
        list[str]: extensions including the leading dot (e.g. '.html'),
        in first-seen order; [] on any error (best-effort, errors are
        printed rather than raised).
    """
    try:
        exts = []
        cf = CrawlerFile(url=self.url)
        urls = cf.getSection('Hrefs')
        for eachurl in urls:
            path = urlparse(eachurl).path
            pos = path.rfind('.')
            # A dot before the last '/' belongs to a directory name
            # (e.g. '/a.b/c'), not to a file extension — skip those.
            if pos != -1 and pos > path.rfind('/'):
                ext = path[pos:].lower()
                if ext not in exts:
                    exts.append(ext)
        return exts
    except Exception as e:
        # Best-effort: report and fall back to an empty result.
        print('Exception:\t', e)
        return []
def _getCrawlerFileExts(self):
    """Collect the distinct, lower-cased file extensions found in the
    crawler result file's 'Hrefs' section for self.url.

    Returns:
        list[str]: extensions with their leading dot, in first-seen
        order; [] on any error (errors are printed, not raised).
    """
    try:
        exts = []
        cf = CrawlerFile(url=self.url)
        urls = cf.getSection('Hrefs')
        for eachurl in urls:
            path = urlparse(eachurl).path
            pos = path.rfind('.')
            # Only treat the dot as an extension separator when it falls
            # after the last '/'; '/a.b/c' has no extension.
            if pos != -1 and pos > path.rfind('/'):
                ext = path[pos:].lower()
                if ext not in exts:
                    exts.append(ext)
        return exts
    except Exception as e:
        # Best-effort: report and fall back to an empty result.
        print('Exception:\t', e)
        return []
def _getCrawlerPaths(self, url):
    """Build the list of ancestor directory URLs for every same-origin
    link in the crawler result file's 'Hrefs' section.

    For each href with the same scheme and netloc as ``url``, every
    prefix of its path up to (but excluding) each '/' is emitted as
    'scheme://netloc/prefix', deduplicated, in first-seen order.

    Args:
        url: base URL whose scheme/netloc define "same origin".

    Returns:
        list[str]: the collected path URLs; [url] on any error
        (best-effort, errors are printed rather than raised).
    """
    try:
        paths = []
        baseulp = urlparse(url)
        cf = CrawlerFile(url=url)
        urls = cf.getSection('Hrefs')
        for eachline in urls:
            eachline = eachline.replace('\r', '').replace('\n', '')
            eachulp = urlparse(eachline)
            # Only same-origin links contribute paths.
            if baseulp.scheme != eachulp.scheme or baseulp.netloc != eachulp.netloc:
                continue
            fullpath = eachulp.path
            # An extension-less path that doesn't end in '/' is treated
            # as a directory, so its own name also becomes a path.
            if '.' not in fullpath and not fullpath.endswith('/'):
                fullpath += '/'
            pos = 0
            while True:
                pos = fullpath.find('/', pos)
                if pos == -1:
                    break
                tmppth = eachulp.scheme + '://' + eachulp.netloc + fullpath[:pos]
                # Advance BEFORE any skip: the original code's `continue`
                # here left `pos` unchanged, looping forever on '//'.
                pos += 1
                if tmppth.endswith('/'):
                    continue
                if tmppth not in paths:
                    paths.append(tmppth)
        return paths
    except Exception as e:
        # Best-effort: report and fall back to the base URL only.
        print('Exception:\t', e)
        return [url]
def _getCrawlerPaths(self, url):
    """Build the list of ancestor directory URLs for every same-origin
    link in the crawler result file's 'Hrefs' section.

    For each href sharing ``url``'s scheme and netloc, every path
    prefix ending just before a '/' is emitted as
    'scheme://netloc/prefix', deduplicated, in first-seen order.

    Args:
        url: base URL whose scheme/netloc define "same origin".

    Returns:
        list[str]: collected path URLs; [url] on any error (errors are
        printed, not raised).
    """
    try:
        paths = []
        baseulp = urlparse(url)
        cf = CrawlerFile(url=url)
        urls = cf.getSection('Hrefs')
        for eachline in urls:
            eachline = eachline.replace('\r', '').replace('\n', '')
            eachulp = urlparse(eachline)
            # Skip cross-origin links.
            if baseulp.scheme != eachulp.scheme or baseulp.netloc != eachulp.netloc:
                continue
            fullpath = eachulp.path
            # Extension-less paths not ending in '/' are treated as
            # directories so the leaf segment is also collected.
            if '.' not in fullpath and not fullpath.endswith('/'):
                fullpath += '/'
            pos = 0
            while True:
                pos = fullpath.find('/', pos)
                if pos == -1:
                    break
                tmppth = eachulp.scheme + '://' + eachulp.netloc + fullpath[:pos]
                # Advance unconditionally; the original skipped this on
                # the trailing-slash branch and spun forever on '//'.
                pos += 1
                if tmppth.endswith('/'):
                    continue
                if tmppth not in paths:
                    paths.append(tmppth)
        return paths
    except Exception as e:
        # Best-effort: report and fall back to the base URL only.
        print('Exception:\t', e)
        return [url]
def _getCrawlerPaths(self, url):
    """Build the list of ancestor directory URLs for every same-origin
    link in the crawler result file's 'Hrefs' section.

    For each href sharing ``url``'s scheme and netloc, every path
    prefix ending just before a '/' becomes 'scheme://netloc/prefix';
    results are deduplicated and kept in first-seen order.

    Args:
        url: base URL whose scheme/netloc define "same origin".

    Returns:
        list[str]: collected path URLs; [url] on any error (errors are
        printed, not raised).
    """
    try:
        paths = []
        baseulp = urlparse(url)
        cf = CrawlerFile(url=url)
        urls = cf.getSection("Hrefs")
        for eachline in urls:
            eachline = eachline.replace("\r", "").replace("\n", "")
            eachulp = urlparse(eachline)
            # Skip cross-origin links.
            if baseulp.scheme != eachulp.scheme or baseulp.netloc != eachulp.netloc:
                continue
            fullpath = eachulp.path
            # Extension-less paths not ending in '/' are treated as
            # directories so the leaf segment is also collected.
            if "." not in fullpath and not fullpath.endswith("/"):
                fullpath += "/"
            pos = 0
            while True:
                pos = fullpath.find("/", pos)
                if pos == -1:
                    break
                tmppth = eachulp.scheme + "://" + eachulp.netloc + fullpath[:pos]
                # Advance unconditionally; the original skipped the
                # increment on this branch and looped forever on '//'.
                pos += 1
                if tmppth.endswith("/"):
                    continue
                if tmppth not in paths:
                    paths.append(tmppth)
        return paths
    except Exception as e:
        # Best-effort: report and fall back to the base URL only.
        print("Exception:\t", e)
        return [url]