Example #1
0
    def crawl_url(self, url, level=0):
        """Fetch *url*, extract its anchor links and queue crawlable ones.

        Each <a href> is normalized to an absolute URL, restricted to the
        page's own domain, de-duplicated against the URL pool, and — if its
        path ends with one of the configured suffixes (self.goodTypes) and
        the depth limit (self.config["p_depth"]) allows — appended to
        self.urlpool as (url, level + 1).

        url   -- page to fetch and scan for links.
        level -- current crawl depth; queued links get level + 1.
        """
        # If the user provides 'http://www.google.com' append a '/' to it.
        if url.count("/") == 2:
            url += "/"

        code = self.__simpleGetRequest(url)
        domain = self.getDomain(url, True)

        if code is None:
            return  # request failed; nothing to parse

        try:
            soup = BeautifulSoup(code)
        except Exception:
            # HTML too broken for BeautifulSoup; skip this page
            # (deliberate best-effort, as in the original).
            return

        for tag in soup.findAll('a'):
            new_url = tag.get('href')  # None when the anchor has no href
            if new_url is None:
                continue
            # Skip in-page anchors and javascript pseudo-links.
            if new_url.startswith("#") or new_url.startswith("javascript:"):
                continue

            is_cool = False
            if new_url.startswith("http://") or new_url.startswith("https://"):
                # Absolute link: only follow it if it stays on our domain.
                if new_url.lower().startswith(domain.lower()):
                    is_cool = True
            else:
                # Relative link: make it absolute against the domain root
                # (leading '/') or the current page's directory.
                if new_url.startswith("/"):
                    new_url = os.path.join(domain, new_url[1:])
                else:
                    new_url = os.path.join(os.path.dirname(url), new_url)
                is_cool = True

            # Never queue a URL twice.
            if is_cool and self.isURLinPool(new_url):
                is_cool = False
            if not is_cool:
                continue

            # Match suffixes against the path only (strip the query string).
            tmp_url = new_url.split("?", 1)[0]
            if tmp_url.endswith(tuple(self.goodTypes)):
                if level + 1 <= self.config["p_depth"]:
                    self.urlpool.append((new_url, level + 1))
Example #2
0
File: crawler.py  Project: bupt007/fimap
    def crawl_url(self, url, level=0):
        """Fetch *url*, extract its anchor links and queue crawlable ones.

        Links are made absolute, limited to the same domain, de-duplicated
        against the URL pool and filtered by suffix (self.goodTypes) and
        maximum crawl depth (self.config["p_depth"]); accepted links are
        appended to self.urlpool as (url, level + 1).

        url   -- page to fetch and scan for links.
        level -- current crawl depth; queued links get level + 1.
        """
        # If the user provides 'http://www.google.com' append a '/' to it.
        if url.count("/") == 2:
            url += "/"

        code = self.__simpleGetRequest(url)
        domain = self.getDomain(url, True)

        if code is None:
            return  # request failed; nothing to parse

        try:
            soup = BeautifulSoup(code)
        except Exception:
            # Unparseable HTML; skip this page (deliberate best-effort,
            # matching the original's silent skip).
            return

        for tag in soup.findAll('a'):
            new_url = tag.get('href')  # None when the anchor has no href
            if new_url is None:
                continue
            # Skip in-page anchors and javascript pseudo-links.
            if new_url.startswith("#") or new_url.startswith("javascript:"):
                continue

            is_cool = False
            if new_url.startswith("http://") or new_url.startswith("https://"):
                # Absolute link: only follow it if it stays on our domain.
                if new_url.lower().startswith(domain.lower()):
                    is_cool = True
            else:
                # Relative link: resolve against the domain root
                # (leading '/') or the current page's directory.
                if new_url.startswith("/"):
                    new_url = os.path.join(domain, new_url[1:])
                else:
                    new_url = os.path.join(os.path.dirname(url), new_url)
                is_cool = True

            # Never queue a URL twice.
            if is_cool and self.isURLinPool(new_url):
                is_cool = False
            if not is_cool:
                continue

            # Match suffixes against the path only (strip the query string).
            tmp_url = new_url.split("?", 1)[0]
            if tmp_url.endswith(tuple(self.goodTypes)):
                if level + 1 <= self.config["p_depth"]:
                    self.urlpool.append((new_url, level + 1))