def crawl_url(self, url, level=0):
    """Fetch ``url`` and queue the crawlable links found on that page.

    Every ``<a href>`` on the fetched page is resolved to an absolute URL.
    A link is appended to ``self.urlpool`` as ``(link, level + 1)`` when:

    * it is relative, or it is an absolute http(s) URL that starts with
      the value returned by ``self.getDomain(url, True)`` (compared
      case-insensitively);
    * ``self.isURLinPool(link)`` is false;
    * the link, with any query string removed, ends with one of
      ``self.goodTypes``;
    * ``level + 1`` does not exceed ``self.config["p_depth"]``.

    Args:
        url: Absolute URL of the page to crawl.
        level: Depth of ``url``; links found on it are queued one deeper.

    Returns:
        None. Results are delivered by appending to ``self.urlpool``.
    """
    # Imported locally so the block stays self-contained.  posixpath always
    # joins with "/", whereas os.path would put "\\" into URLs on Windows.
    import posixpath

    if url.count("/") == 2:
        # If the user provides 'http://www.google.com' append an / to it.
        url += "/"

    code = self.__simpleGetRequest(url)
    # NOTE(review): getDomain(url, True) is presumably the scheme+host
    # prefix of url -- confirm against its definition.
    domain = self.getDomain(url, True)
    if code is None:
        return

    next_level = level + 1
    if next_level > self.config["p_depth"]:
        # No link from this page could be queued, so skip the parsing.
        return

    try:
        soup = BeautifulSoup(code)
    except Exception:
        # Best effort: a page that cannot be parsed contributes no links.
        return

    domain_prefix = domain.lower()
    good_suffixes = tuple(self.goodTypes)
    scheme, has_scheme, _ = url.partition("://")

    for tag in soup.findAll('a'):
        try:
            link = tag['href']
        except KeyError:
            continue
        # Fragments and non-navigable pseudo links are never crawled.
        if link is None or link.startswith(("#", "javascript:", "mailto:")):
            continue

        if link.startswith("//"):
            # Scheme-relative reference: inherit the scheme of the current
            # page so the same-domain check below applies to it.
            if not has_scheme:
                continue
            link = scheme + ":" + link

        if link.startswith(("http://", "https://")):
            # Absolute link: only follow it when it stays on this domain.
            if not link.lower().startswith(domain_prefix):
                continue
        elif link.startswith("/"):
            # Root-relative link.
            link = posixpath.join(domain, link[1:])
        else:
            # Link relative to the directory of the current page.
            link = posixpath.join(posixpath.dirname(url), link)

        if self.isURLinPool(link):
            continue

        # The file-type filter looks at the path only, not the query string.
        path = link.split("?", 1)[0]
        if path.endswith(good_suffixes):
            self.urlpool.append((link, next_level))
def crawl_url(self, url, level=0):
    """Fetch ``url`` and queue the crawlable links found on that page.

    Every ``<a href>`` on the fetched page is resolved to an absolute URL.
    A link is appended to ``self.urlpool`` as ``(link, level + 1)`` when:

    * it is relative, or it is an absolute http(s) URL that starts with
      the value returned by ``self.getDomain(url, True)`` (compared
      case-insensitively);
    * ``self.isURLinPool(link)`` is false;
    * the link, with any query string removed, ends with one of
      ``self.goodTypes``;
    * ``level + 1`` does not exceed ``self.config["p_depth"]``.

    Args:
        url: Absolute URL of the page to crawl.
        level: Depth of ``url``; links found on it are queued one deeper.

    Returns:
        None. Results are delivered by appending to ``self.urlpool``.
    """
    # Imported locally so the block stays self-contained.  posixpath always
    # joins with "/", whereas os.path would put "\\" into URLs on Windows.
    import posixpath

    if url.count("/") == 2:
        # If the user provides 'http://www.google.com' append an / to it.
        url += "/"

    code = self.__simpleGetRequest(url)
    # NOTE(review): getDomain(url, True) is presumably the scheme+host
    # prefix of url -- confirm against its definition.
    domain = self.getDomain(url, True)
    if code is None:
        return

    next_level = level + 1
    if next_level > self.config["p_depth"]:
        # No link from this page could be queued, so skip the parsing.
        return

    try:
        soup = BeautifulSoup(code)
    except Exception:
        # Best effort: a page that cannot be parsed contributes no links.
        return

    domain_prefix = domain.lower()
    good_suffixes = tuple(self.goodTypes)
    scheme, has_scheme, _ = url.partition("://")

    for tag in soup.findAll('a'):
        try:
            link = tag['href']
        except KeyError:
            continue
        # Fragments and non-navigable pseudo links are never crawled.
        if link is None or link.startswith(("#", "javascript:", "mailto:")):
            continue

        if link.startswith("//"):
            # Scheme-relative reference: inherit the scheme of the current
            # page so the same-domain check below applies to it.
            if not has_scheme:
                continue
            link = scheme + ":" + link

        if link.startswith(("http://", "https://")):
            # Absolute link: only follow it when it stays on this domain.
            if not link.lower().startswith(domain_prefix):
                continue
        elif link.startswith("/"):
            # Root-relative link.
            link = posixpath.join(domain, link[1:])
        else:
            # Link relative to the directory of the current page.
            link = posixpath.join(posixpath.dirname(url), link)

        if self.isURLinPool(link):
            continue

        # The file-type filter looks at the path only, not the query string.
        path = link.split("?", 1)[0]
        if path.endswith(good_suffixes):
            self.urlpool.append((link, next_level))