Exemplo n.º 1
0
    def save_static_by_type(self, host, soup, link_items, attr, is_css=False,
                            num_workers=5):
        """Download the static files referenced by ``link_items`` and relink them.

        Each element's ``attr`` value is resolved against ``host.get_url()``.
        References whose hash is not yet in ``self.static_set`` are queued and
        handed to a pool of ``self.static_worker`` threads; the call blocks
        until the queue has been fully processed.  Every element's ``attr`` is
        rewritten to the file's path under ``self.workspace``, whether the
        file is fetched now or was already cached.

        Args:
            host: object exposing ``get_url()``; its URL is the base used to
                resolve each reference.
            soup: parsed document the items belong to.  Not used here; kept
                so existing callers keep working.
            link_items: iterable of elements supporting ``item[attr]`` reads
                and writes.
            attr: name of the attribute that holds the reference.
            is_css: forwarded unchanged to ``static_worker``.
            num_workers: number of download threads to start (at least one
                is always started).
        """
        base_url = host.get_url()
        pending = Queue.Queue()
        # URLs queued by *this* call.  The workers only record a hash in
        # self.static_set after the download, so without this a reference
        # repeated in link_items would be fetched more than once.
        queued_urls = set()

        for link in link_items:
            static_url_obj = HostURLParse(link[attr], base_url)
            static_url = static_url_obj.get_url()
            if self.static_set.in_set(static_url_obj.get_url_hash()):
                logging.info('Found Static file in cache: ' + static_url)
            elif static_url not in queued_urls:
                queued_urls.add(static_url)
                pending.put(static_url_obj)
            # Relink unconditionally: a cached file lives at the same local
            # path, so the document must point there as well.
            link[attr] = os.path.join(self.workspace,
                                      static_url_obj.get_diskrelpath())

        if pending.empty():
            return

        stop_event = threading.Event()
        for _ in range(max(1, num_workers)):
            worker = threading.Thread(target=self.static_worker,
                                      args=(pending, stop_event, is_css))
            # Daemon threads so a stuck download cannot block interpreter exit.
            worker.daemon = True
            worker.start()

        # Wait until every queued item has been marked done, then tell the
        # workers to stop.
        pending.join()
        stop_event.set()
Exemplo n.º 2
0
    def parse_and_enque(self, host, html):
        """Parse a page, enqueue its unseen links and rewrite them locally.

        Every ``<a href=...>`` is resolved against the document's
        ``<base href>`` (which is then removed from the tree) or, when there
        is none, against ``host.get_url()``.  Links whose hash is not yet in
        ``self.hash_set`` are put on ``self.queue`` and recorded.  Each
        processed link's ``href`` is replaced by its path under
        ``self.workspace``.  Finally ``self.save_static_files(host, soup)`` is
        invoked on the modified tree.

        Links that cannot be fetched as pages (empty, in-page ``#fragment``,
        ``mailto:``, ``javascript:``, ``tel:``, ``data:``) are left untouched;
        rewriting them to a disk path would break them.

        Args:
            host: object exposing ``get_url()`` for the page being parsed.
            html: markup of the page.

        Returns:
            ``str(soup)`` of the document after the rewrites above.
        """
        soup = BeautifulSoup.BeautifulSoup(html)
        # Collect the anchors before touching the tree.
        anchors = soup.findAll('a', href=True)

        base = soup.find('base', href=True)
        if base:
            base_url = base['href']
            # Links become local paths below, so the <base> must not
            # re-anchor them.
            base.extract()
        else:
            base_url = host.get_url()

        non_fetchable = ('#', 'mailto:', 'javascript:', 'tel:', 'data:')
        for a in anchors:
            normalized = a['href'].strip().lower()
            if not normalized or normalized.startswith(non_fetchable):
                continue

            urlhostp = HostURLParse(a['href'], base_url)
            h = urlhostp.get_url_hash()
            if not self.hash_set.in_set(h):
                logging.info('Enqueuing %s (base %s)', a['href'], base_url)
                self.queue.put(urlhostp)
                self.hash_set.add(h)
            a['href'] = os.path.join(self.workspace,
                                     urlhostp.get_diskrelpath())

        self.save_static_files(host, soup)
        return str(soup)
Exemplo n.º 3
0
    def static_worker(self, static_queue, stop_event, is_css=False):
        """Fetch and save queued static files until ``stop_event`` is set.

        Items are taken from ``static_queue``, downloaded with
        ``self.fetch_file`` and, when content came back, recorded in
        ``self.static_set`` and written with ``self.save_file``.  With
        ``is_css`` set, ``url(...)`` references in the content are rewritten
        to local paths and the referenced files are queued as well.

        ``task_done()`` is called for every item taken, even when processing
        fails, so a producer blocked in ``static_queue.join()`` is always
        released.

        Args:
            static_queue: queue of URL objects exposing ``get_url()``,
                ``get_url_hash()`` and ``get_diskrelpath()``.
            stop_event: ``threading.Event``; the worker returns soon after it
                is set.
            is_css: whether fetched content should be scanned for ``url(...)``
                references.
        """
        css_url_re = re.compile(r'url\(([a-zA-Z0-9_./]+)\)')
        # Poll with a timeout instead of blocking forever, otherwise an idle
        # worker never gets to see that stop_event was set.
        poll_interval = 0.2

        while not stop_event.is_set():
            try:
                url_obj = static_queue.get(timeout=poll_interval)
            except Queue.Empty:
                continue

            try:
                content = self.fetch_file(url_obj.get_url())
                if content:
                    self.static_set.add(url_obj.get_url_hash())
                    if is_css:
                        content = self._relink_css_urls(
                            content, url_obj.get_url(), static_queue,
                            css_url_re)
                    logging.info("Saved static file: " + url_obj.get_url())
                    self.save_file(url_obj, content)
            except Exception:
                # One bad download must not kill the worker; log and move on.
                logging.exception("Failed to save static file: %s",
                                  url_obj.get_url())
            finally:
                static_queue.task_done()

    def _relink_css_urls(self, content, base_url, static_queue, css_url_re):
        """Point each ``url(...)`` in ``content`` at its local copy; queue new files."""
        queued_urls = set()

        def relink(match):
            ref_obj = HostURLParse(match.group(1), base_url)
            ref_url = ref_obj.get_url()
            if (ref_url not in queued_urls
                    and not self.static_set.in_set(ref_obj.get_url_hash())):
                queued_urls.add(ref_url)
                static_queue.put(ref_obj)
            local_path = os.path.join(self.workspace,
                                      ref_obj.get_diskrelpath())
            return 'url(' + local_path + ')'

        # A single callback pass replaces exactly the matched reference, with
        # no regex or escape interpretation of the URL or the local path.
        return css_url_re.sub(relink, content)