Пример #1
0
    def extract(self, content):
        """Collect subdomains of the target from one response page.

        Every ``<cite>`` entry whose text contains the target domain is
        parsed as a URL.  Hosts that are proper subdomains of
        ``self.target.netloc`` and not yet in ``self.subdomains`` are logged
        and added to that set.  ``self.find_new_domain`` is reset on every
        call and set to True when at least one new subdomain was added.

        :param content: HTML text of the page to scan.
        :return: True if the page contains the next-page marker, else False.
        """
        domain = self.target.netloc
        # Escape the domain so that "." only matches a literal dot.
        pattern = re.compile(
            '<cite class=".*?">(.*?{domain})/.*?</cite>'.format(
                domain=re.escape(domain)))
        next_page = "下一页"
        # Require the dot separator: a bare suffix test would also accept
        # unrelated hosts such as "notexample.com" for "example.com".
        suffix = "." + domain

        self.find_new_domain = False
        for link in pattern.findall(content):
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            try:
                subdomain = urlparse.urlparse(link).netloc
            except ValueError:
                # urlparse rejects malformed hosts (e.g. an unbalanced "[");
                # skip this link instead of discarding the rest of the page.
                continue

            if subdomain.endswith(suffix) and subdomain not in self.subdomains:
                self.logger.info("{engine} Found {subdomain}".format(
                    engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
                self.find_new_domain = True

        # Tell the engine whether there is still a next page.
        return next_page in content
Пример #2
0
    def extract(self, content):
        """Collect subdomains of the target from one response page.

        Links of the form ``<a href="http(s)://..." rel="nofollow">`` that
        mention the target domain are parsed as URLs.  Hosts that are proper
        subdomains of ``self.target.netloc`` and not yet in
        ``self.subdomains`` are logged and added to that set.

        ``self.last_domain`` is reset to the target domain on every call and
        then updated to the host of each successfully parsed link, so after
        the call it holds the host of the last parsed link (or the target
        domain when no link was parsed).

        :param content: HTML text of the page to scan.
        :return: True if the page contains a "Next page" link, else False.
        """
        domain = self.target.netloc
        # Raw string: "\s" is a regex escape, not a Python string escape.
        # The opening tag is matched as upper-case "<A" exactly as in the
        # original expression.
        next_page = re.compile(r'<A.*?>\s*<b>Next page</b>\s*</a>')
        # Escape the domain so that "." only matches a literal dot.
        pattern = re.compile(
            '<a href="http[s]*://(.*{domain}).*?" rel="nofollow">'.format(
                domain=re.escape(domain)))
        # Require the dot separator: a bare suffix test would also accept
        # unrelated hosts such as "fakeexample.com" for "example.com".
        suffix = "." + domain

        self.last_domain = domain
        for link in pattern.findall(content):
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            try:
                subdomain = urlparse.urlparse(link).netloc
            except ValueError:
                # urlparse rejects malformed hosts (e.g. an unbalanced "[");
                # skip this link instead of discarding the rest of the page.
                continue

            if subdomain.endswith(suffix) and subdomain not in self.subdomains:
                self.logger.info("{engine} Found {subdomain}".format(
                    engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
            self.last_domain = subdomain

        # Tell the engine whether there is still a next page.
        return next_page.search(content) is not None
Пример #3
0
    def extract(self, content):
        """Collect subdomains of the target from one response page.

        The text of every ``window.open`` anchor that ends with the target
        domain is parsed as a URL.  Hosts that are proper subdomains of
        ``self.target.netloc`` and not yet in ``self.subdomains`` are logged
        and added to that set.

        :param content: HTML text of the page to scan.
        :return: True if the page contains the next-page marker, else False.
        """
        domain = self.target.netloc
        # Escape the domain so that "." only matches a literal dot.
        pattern = re.compile(
            '<a href="javascript:" onclick="window.open.*?" target="_blank">(.*?{domain})</a>'
            .format(domain=re.escape(domain)))
        next_page = "下一页"
        # Require the dot separator: a bare suffix test would also accept
        # unrelated hosts such as "notexample.com" for "example.com".
        suffix = "." + domain

        for link in pattern.findall(content):
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            try:
                subdomain = urlparse.urlparse(link).netloc
            except ValueError:
                # urlparse rejects malformed hosts (e.g. an unbalanced "[");
                # skip this link instead of discarding the rest of the page.
                continue

            if subdomain.endswith(suffix) and subdomain not in self.subdomains:
                self.logger.info("{engine} Found {subdomain}".format(
                    engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])

        # Tell the engine whether there is still a next page.
        return next_page in content
Пример #4
0
    def extract(self, content):
        """Collect subdomains of the target from one response page.

        Each ``<span>`` whose text ends with the target domain wrapped in
        ``<b>...</b>`` is stripped of markup and parsed as a URL.  Hosts that
        are proper subdomains of ``self.target.netloc`` and not yet in
        ``self.subdomains`` are logged and added to that set.
        ``self.find_new_domain`` is reset on every call and set to True when
        at least one new subdomain was added.

        :param content: HTML text of the page to scan.
        :return: True if the page contains a "Next" link, else False.
        """
        domain = self.target.netloc
        next_page = re.compile('<a class="next".*?>Next</a>')
        # Escape the domain so that "." only matches a literal dot.
        pattern = re.compile(
            '<span class=.{1,100}?>(.{0,100}?<b.{0,100}?>'
            + re.escape(domain) + '</b>)')
        # Require the dot separator: a bare suffix test would also accept
        # unrelated hosts such as "notexample.com" for "example.com".
        suffix = "." + domain

        self.find_new_domain = False
        for link in pattern.findall(content):
            # Drop the highlighting tags so only the URL text remains.
            link = re.sub('<.*?>', '', link)

            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            try:
                subdomain = urlparse.urlparse(link).netloc
            except ValueError:
                # urlparse rejects malformed hosts (e.g. an unbalanced "[");
                # skip this link instead of discarding the rest of the page.
                continue

            if subdomain.endswith(suffix) and subdomain not in self.subdomains:
                self.logger.info(
                    "{engine} Found {subdomain}".format(
                        engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
                self.find_new_domain = True

        # Tell the engine whether there is still a next page.
        return next_page.search(content) is not None
Пример #5
0
    def extract(self, content):
        """Collect subdomains of the target from one response page.

        Each ``PartialSearchResults-item-url`` paragraph whose text contains
        ``.<target domain>`` is parsed as a URL.  Hosts that are proper
        subdomains of ``self.target.netloc`` and not yet in
        ``self.subdomains`` are logged and added to that set.
        ``self.find_new_domain`` is reset on every call and set to True when
        at least one new subdomain was added.

        :param content: HTML text of the page to scan.
        :return: True if the page contains the next-page element, else False.
        """
        domain = self.target.netloc
        next_page = '<li class="PartialWebPagination-next">Next</li>'
        # Raw string keeps "\." a regex escape; the domain itself is escaped
        # so that its dots only match literal dots.
        pattern = re.compile(
            r'<p class="PartialSearchResults-item-url">(.*?\.{domain}).*?</p>'
            .format(domain=re.escape(domain)))
        # Require the dot separator: a bare suffix test would also accept
        # unrelated hosts such as "notexample.com" for "example.com".
        suffix = "." + domain

        self.find_new_domain = False
        for link in pattern.findall(content):
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            try:
                subdomain = urlparse.urlparse(link).netloc
            except ValueError:
                # urlparse rejects malformed hosts (e.g. an unbalanced "[");
                # skip this link instead of discarding the rest of the page.
                continue

            if subdomain.endswith(suffix) and subdomain not in self.subdomains:
                self.logger.info(
                    "{engine} Found {subdomain}".format(
                        engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
                self.find_new_domain = True

        # Tell the engine whether there is still a next page.
        return next_page in content
Пример #6
0
    def extract(self, content):
        """Collect subdomains of the target from one response page.

        The text following each ``class="c-showurl"`` anchor, up to the
        target domain, is stripped of markup and parsed as a URL.  Hosts that
        are proper subdomains of ``self.target.netloc`` and not yet in
        ``self.subdomains`` are logged and added to that set.
        ``self.find_new_domain`` is reset on every call and set to True when
        at least one new subdomain was added.

        :param content: HTML text of the page to scan.
        :return: True if the page contains a ``class="n"`` anchor (the
            next-page link), else False.
        """
        domain = self.target.netloc
        # Escape the domain so that "." only matches a literal dot.
        pattern = re.compile('<a.*?class="c-showurl".*?>(.*?{domain})'.format(
            domain=re.escape(domain)))
        next_page = re.compile('<a.*?class="n">(.*?)</a>')
        # Require the dot separator: a bare suffix test would also accept
        # unrelated hosts such as "notexample.com" for "example.com".
        suffix = "." + domain

        self.find_new_domain = False
        for link in pattern.findall(content):
            # Remove tags, stray angle brackets and non-breaking spaces so
            # only the URL text remains.
            link = re.sub('<.*?>|>|<|&nbsp;', '', link)
            if not link.startswith(('http://', 'https://')):
                link = "http://" + link
            try:
                subdomain = urlparse.urlparse(link).netloc
            except ValueError:
                # urlparse rejects malformed hosts (e.g. an unbalanced "[");
                # skip this link instead of discarding the rest of the page.
                continue

            if subdomain.endswith(suffix) and subdomain not in self.subdomains:
                self.logger.info("{engine} Found {subdomain}".format(
                    engine=self.engine_name, subdomain=subdomain))
                self.subdomains.update([subdomain])
                self.find_new_domain = True

        # Tell the engine whether there is still a next page.
        return next_page.search(content) is not None