示例#1
0
 def fetch_resource(self, r, key, is_binary, is_attrib_url):
     if is_attrib_url:
         resource_path = get_attribs_key_url(r.attrib, key)
     else:
         resource_path = r.attrib[key]
     if not (resource_path.startswith('http://') or resource_path.startswith('https://')):
         parsed = urlparse(self.page_url)
         resource_path = parsed.scheme + '://' + adjoin_paths(parsed.hostname, resource_path)
     for res in re.findall('/<!--.*?-->', resource_path): resource_path = resource_path.replace(res, '')
     resource_path = resource_path.replace('http:///', 'http://')
     resource_path = resource_path.replace('https:///', 'https://')
     print 'fetching :', resource_path
     file_content = get_markup(resource_path, source_url=self.page_url)
     print 'done     : %s' % resource_path
     parsed = urlparse(resource_path)
     internal_path = adjoin_paths(self.site_path, parsed.hostname, parsed.path)
     ensure_dir_exists(os.path.dirname(internal_path))
     if file_content: write_file(internal_path, file_content, is_binary)
     markup_path = self.markup_path(internal_path)
     if is_attrib_url:
         set_attribs_key_url(r.attrib, key, markup_path)
     else:
         r.attrib[key] = markup_path
     if internal_path.endswith('.css'):
         self.process_css_urls(internal_path, resource_path)
示例#2
0
 def _cl(r):
     if r.startswith('http://'): raise Exception('Unhandled css parse case')
     if r.startswith('https://'): raise Exception('Unhandled css parse case')
     online_path_file = adjoin_paths(os.path.dirname(online_css_path_file.split('?')[0]), r)
     online_file = os.path.basename(online_path_file).split('?')[0]
     local_path_file = adjoin_paths(os.path.dirname(local_css_path_file), r)
     ensure_dir_exists(os.path.dirname(local_path_file))
     print 'fetching :', online_file
     for res in re.findall('/<!--.*?-->', online_file): online_file = online_file.replace(res, '')
     file_content = get_markup(online_path_file, source_url=self.page_url)
     print 'done     : %s' % online_file
     if file_content: write_file(local_path_file, file_content, True)
示例#3
0
 def crawl_page(self, page_url, depth):
     if depth == 0: return
     is_home_page = page_url == self.home_page_url
     page_url = (page_url + '/') if not page_url.endswith('/') else page_url
     if page_url in self.being_crawled: return
     print '\ncrawling page               : %s' % page_url
     self.being_crawled.add(page_url)
     p = SitePage(page_url, self.site_path, 
                  process_inline_js=self.process_inline_js, 
                  process_embedded_css=self.process_embedded_css,
                  fetch_resources=self.fetch_resources,
                  remove_comments=self.remove_comments,
                  remove_ns_tags=self.remove_ns_tags,
                  randomize_text=self.randomize_text)
     ast = p.parse_markup()
     for a in ast.xpath('//a'):
         if not a.attrib.has_key('href'): continue
         a_url = a.attrib['href'].strip()
         if urlparse(page_url).hostname == urlparse(a_url).hostname:
             self.pool.spawn(self.crawl_page, a_url, depth-1)
             parsed = urlparse(a_url)
             p1 = adjoin_paths(self.site_path, parsed.hostname, parsed.path, 'index.html')
             a.attrib['href'] = p.markup_path(p1)
         else:
             a.attrib['href'] = '#'
         #print '>>> ', a.attrib['href']
     self.pool.spawn(p.main_logic, open_when_done=self.open_home_page_in_browser and is_home_page) 
     return
示例#4
0
    def main_logic(self, open_when_done=False):
        """Run the full processing pipeline for this page and write the
        final HTML to self.index_path.

        open_when_done -- when True, open the written file in a browser.
        """
        print 'getting resources for page  : %s' % self.page_url
        # process links, script sources and images, flash and other media
        # NOTE(review): 'fetch_resouces' (sic) matches the misspelled
        # attribute set in __init__ -- both spellings must change together
        if self.fetch_resouces:
            self.fetch_external_resources()

        # wait for all blocking IO threads to finish fetching external resources
        self.pool.waitall()

        # lorem ipsify text
        if self.randomize_text:
            self.process_text()

        # collect embedded css into one external stylesheet ('imbedded'
        # spelling below is part of the on-disk path; leave it as-is)
        if self.process_embedded_css:
            self.process_internal_asset('//style', adjoin_paths(os.path.dirname(self.index_path), 
                                                            'imbedded_css'), 'imbedded.css', 'head', create_css_link)     
        
        # collect inline js into one external script file
        if self.process_inline_js:
            self.process_internal_asset('//body//script', adjoin_paths(os.path.dirname(self.index_path), 
                                                            'inline_js'), 'inline.js', 'body', create_js_tag)
        
        # remove comments
        if self.remove_comments:
            self.filter_comments(True)
        
        # process noscript tags
        if self.remove_ns_tags:
            self.process_noscript_tags()
        
        # render html
        html_output = render_html_element(self.tree_root, format_html5=self.use_html5)
        
        ensure_dir_exists(os.path.dirname(self.index_path))
        write_file(self.index_path, html_output)

        print 'done getting resources      : %s' % self.page_url
        if open_when_done: open_in_browser(self.index_path)
示例#5
0
 def __init__(self, home_page_url, site_name, www_path, depth=1,
              process_inline_js=True, process_embedded_css=True, 
              fetch_resources=True, remove_comments=True, remove_ns_tags=True,
              open_home_page_in_browser=False, randomize_text=False):
     """Hold crawler configuration and shared state for one site mirror."""
     # identity and on-disk destination
     self.home_page_url = home_page_url
     self.site_name = site_name
     self.site_path = adjoin_paths(os.path.realpath(www_path), self.site_name)
     self.depth = depth
     # crawl machinery: green-thread pool plus the visited-URL set
     self.pool = eventlet.GreenPool(10000)
     self.being_crawled = set()
     # per-page processing switches, passed through to each SitePage
     self.process_inline_js = process_inline_js
     self.process_embedded_css = process_embedded_css
     self.fetch_resources = fetch_resources
     self.remove_comments = remove_comments
     self.remove_ns_tags = remove_ns_tags
     self.open_home_page_in_browser = open_home_page_in_browser
     self.randomize_text = randomize_text
示例#6
0
 def __init__(self, page_url, site_path, use_html5=False, 
              process_inline_js=True, process_embedded_css=True,
              fetch_resources=True, remove_comments=True, remove_ns_tags=True,
              randomize_text=False):
     """Hold per-page state: parse tree, output path, processing switches."""
     self.pool = eventlet.GreenPool(10000)
     self.page_url = page_url
     self.site_path = site_path
     self.tree_root = None
     self.use_html5 = use_html5
     # local output path mirrors the URL's host/path layout
     parsed = urlparse(self.page_url)
     self.index_path = adjoin_paths(self.site_path, parsed.hostname, parsed.path, 'index.html')
     self.process_inline_js = process_inline_js
     self.process_embedded_css = process_embedded_css
     # 'fetch_resouces' (sic) is the name main_logic reads; keep it for
     # compatibility, and also expose the correctly spelled attribute so
     # new code does not have to repeat the typo
     self.fetch_resouces = fetch_resources
     self.fetch_resources = fetch_resources
     self.remove_comments = remove_comments
     self.remove_ns_tags = remove_ns_tags
     self.randomize_text = randomize_text
示例#7
0
 def process_internal_asset(self, selector, asset_path, asset_file, t_tag, new_cl): 
     """Collect the text of all elements matching *selector* into a single
     external asset file and append one reference tag in its place.

     selector   -- xpath selecting the inline elements (e.g. '//style')
     asset_path -- directory the combined asset file is written into
     asset_file -- base file name; uniquified before writing
     t_tag      -- tag name the new reference element is appended to
     new_cl     -- callable building the reference element from a path
     """
     res = self.tree_root.xpath(selector)
     # concatenate each element's stripped text, blank-line separated
     # (the original split/join round-trip reproduced the text verbatim,
     # so it is dropped)
     file_content = u'\n\n'.join((r.text if r.text else u'').strip() for r in res)
     for r in res: 
         remove_from_parent(r)
         r.text = None
         # elements with an external 'src' are kept (moved to <head>);
         # purely inline ones stay removed -- their text now lives in the
         # combined asset file.  (was: assert + bare except, which also
         # swallowed real errors from append_to_tag and redundantly
         # re-removed an already-detached element)
         if 'src' in r.attrib:
             append_to_tag(self.tree_root, 'head', r)
     unique_file_name = uniquify_file_name(asset_path, asset_file)
     internal_path = adjoin_paths(asset_path, unique_file_name)
     ensure_dir_exists(asset_path)
     write_file(internal_path, file_content)
     append_to_tag(self.tree_root, t_tag, new_cl(self.markup_path(internal_path)))
     # a combined stylesheet may itself reference further resources
     if asset_file.endswith('.css'):
         self.process_css_urls(internal_path, self.page_url)
示例#8
0
def open_in_browser(path):
    """Open the local file at *path* in the default web browser."""
    webbrowser.open(adjoin_paths('file://', path))