Exemplo n.º 1
0
    def _work(self, entry_url):
        """Fetch ``entry_url`` and collect the same-site URLs it links to.

        A 301/302/307 response is followed once when the redirect target
        passes ``utils.same_netloc`` against ``entry_url`` and is not yet in
        ``self.done_url_list``. The response is parsed only when its
        ``Content-Type`` header contains "html". Each entry of
        ``self.tagMap`` (read here via its ``name``, ``attr`` and ``type``
        keys) selects elements and names the attribute that holds the link.

        Args:
            entry_url: URL of the page to request.

        Returns:
            A ``URLsimilarList`` of ``URLinfo`` objects for the links that
            pass ``utils.same_netloc`` against the requested page. An empty
            ``URLsimilarList`` is returned for ignored extensions, redirect
            targets that ``utils.normal_url`` rejects, non-HTML responses,
            and whenever an exception is raised (the error is logged).

        Side effects:
            Every collected link is also added to ``self.all_url_list``. A
            followed redirect target is added to ``self.done_url_list`` and
            ``self.all_url_list``.
        """
        try:
            logger.info("[{}] req = > {}".format(len(self.done_url_list),
                                                 entry_url))
            if utils.url_ext(entry_url) in self.ignore_ext:
                return URLsimilarList()

            conn = utils.http_req(entry_url)
            if conn.status_code in [301, 302, 307]:
                location = conn.headers.get("Location", "")
                _url = utils.normal_url(urljoin(entry_url, location).strip())
                if _url is None:
                    return URLsimilarList()

                url_info = URLinfo(entry_url, _url, URLTYPE.document)
                if utils.same_netloc(entry_url,
                                     _url) and (url_info
                                                not in self.done_url_list):
                    # From here on the redirect target is the page being
                    # crawled: relative links resolve against it.
                    entry_url = _url
                    logger.info("[{}] req 302 = > {}".format(
                        len(self.done_url_list), entry_url))
                    conn = utils.http_req(_url)
                    self.done_url_list.add(url_info)
                    self.all_url_list.add(url_info)

            html = conn.content
            if "html" not in conn.headers.get("Content-Type", "").lower():
                return URLsimilarList()

            dom = pq(html)
            ret_url = URLsimilarList()
            for tag in self.tagMap:
                _type = tag["type"]
                for node in dom(tag['name']).items():
                    link = node.attr(tag['attr'])
                    if link is None:
                        # The element lacks the attribute (e.g. an inline
                        # <script> without src). urljoin() would return
                        # entry_url unchanged, which would record the page
                        # as a link to itself under this tag's type.
                        continue

                    _url = utils.normal_url(urljoin(entry_url, link).strip())
                    if _url is None:
                        continue

                    if utils.same_netloc(_url, entry_url):
                        url_info = URLinfo(entry_url, _url, _type)
                        ret_url.add(url_info)
                        self.all_url_list.add(url_info)

            return ret_url
        except Exception as e:
            # Best-effort crawl: a failing page must not abort the caller,
            # so the error is logged and an empty result is returned.
            logger.error("error on {} {}".format(entry_url, e))
            return URLsimilarList()
Exemplo n.º 2
0
    def run(self):
        """Query each distinct domain once and map the found URLs to sites.

        First pass: every site in ``self.sites`` is filed under its domain
        (the hostname with any ``:port`` suffix removed) in
        ``self.domain_map_site``. ``self.work(domain)`` is called for each
        domain that has no entry in ``self.domain_map_url`` yet, and its
        result is stored there.

        Second pass: for every site, the URLs stored for its domain that
        pass ``utils.same_netloc`` against the site and whose path is
        neither empty nor "/" are appended to ``self.site_map_url[site]``.

        Returns:
            ``self.site_map_url``, mapping each site to its list of URLs.
        """
        for position, site in enumerate(self.sites, start=1):
            host = utils.get_hostname(site).partition(":")[0]
            self.domain_map_site.setdefault(host, []).append(site)

            if host in self.domain_map_url:
                # This domain was already searched; reuse its stored result.
                continue

            logger.info("[{}/{}] start SearchEngines  work on {}".format(
                position, len(self.sites), site))
            found = self.work(host)
            logger.info("found url {}, by {}".format(len(found), host))
            self.domain_map_url[host] = found

        for site in self.sites:
            host = utils.get_hostname(site).partition(":")[0]
            for candidate in self.domain_map_url.get(host):
                if not utils.same_netloc(site, candidate):
                    continue

                path = urlparse(candidate).path
                if path == "/" or not path:
                    # Skip URLs whose path is empty or just the root.
                    continue

                self.site_map_url.setdefault(site, []).append(candidate)

        return self.site_map_url