Example #1
File: mirror.py  Project: hyades/hyades-a
    def fetch_and_store(key_name, base_url, translated_address, mirrored_url):
        """Fetch and cache a page.

        Args:
          key_name: Hash to use to store the cached page.
          base_url: The hostname of the page that's being mirrored.
          translated_address: The URL of the mirrored page on this site.
          mirrored_url: The URL of the original page. Hostname should match
            the base_url.

        Returns:
          A new MirroredContent object, if the page was successfully retrieved.
          None if any errors occurred or the content could not be retrieved.
        """
        # Check for the X-Mirrorrr header to ignore potential loops.
        if base_url in MIRROR_HOSTS:
            logging.warning('Encountered recursive request for "%s"; ignoring',
                            mirrored_url)
            return None

        logging.debug("Fetching '%s'", mirrored_url)
        try:
            response = urlfetch.fetch(mirrored_url)
        except (urlfetch.Error, apiproxy_errors.Error):
            logging.exception("Could not fetch URL")
            return None

        adjusted_headers = {}
        for key, value in response.headers.iteritems():
            adjusted_key = key.lower()
            if adjusted_key not in IGNORE_HEADERS:
                adjusted_headers[adjusted_key] = value

        content = response.content
        page_content_type = adjusted_headers.get("content-type", "")
        for content_type in TRANSFORMED_CONTENT_TYPES:
            # Startswith() because there could be a 'charset=UTF-8' in the header.
            if page_content_type.startswith(content_type):
                content = transform_content.TransformContent(
                    base_url, mirrored_url, content)
                break

        # If the transformed content is over 1MB, truncate it (yikes!)
        if len(content) > MAX_CONTENT_SIZE:
            logging.warning('Content is over 1MB; truncating')
            content = content[:MAX_CONTENT_SIZE]

        new_content = MirroredContent(base_url=base_url,
                                      original_address=mirrored_url,
                                      translated_address=translated_address,
                                      status=response.status_code,
                                      headers=adjusted_headers,
                                      data=content)
        if not memcache.add(
                key_name, new_content, time=EXPIRATION_DELTA_SECONDS):
            logging.error(
                'memcache.add failed: key_name = "%s", '
                'original_url = "%s"', key_name, mirrored_url)

        return new_content
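
A caller usually derives key_name from the mirrored URL and checks the cache before fetching. The sketch below is only illustrative: the SHA-256 key scheme, the lookup_or_fetch helper name, and the assumption that fetch_and_store is exposed as a static method of MirroredContent are mine, not part of the project above.

    # Hypothetical caller for Example #1; key scheme and helper name are assumptions.
    import hashlib

    from google.appengine.api import memcache


    def lookup_or_fetch(base_url, translated_address, mirrored_url):
        # Derive a stable cache key from the original URL (assumed scheme).
        key_name = hashlib.sha256(mirrored_url).hexdigest()

        # Serve from memcache when possible; otherwise fetch and cache the page.
        cached = memcache.get(key_name)
        if cached is not None:
            return cached
        return MirroredContent.fetch_and_store(
            key_name, base_url, translated_address, mirrored_url)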
Example #2
    def fetch_and_store(key_name, base_url, translated_address, mirrored_url):
        """Fetch and cache a page.

        Args:
          key_name: Hash to use to store the cached page.
          base_url: The hostname of the page that's being mirrored.
          translated_address: The URL of the mirrored page on this site.
          mirrored_url: The URL of the original page. Hostname should match
            the base_url.

        Returns:
          A new MirroredContent object, if the page was successfully retrieved.
          None if any errors occurred or the content could not be retrieved.
        """
        logging.debug("Fetching '%s'", mirrored_url)
        try:
            # response = urlfetch.fetch(mirrored_url)
            response = requests.get(mirrored_url)
        # except (urlfetch.Error, apiproxy_errors.Error):
        except Exception:
            logging.exception("Could not fetch URL")
            return None

        adjusted_headers = {}
        for key, value in response.headers.items():
            adjusted_key = key.lower()
            if adjusted_key not in IGNORE_HEADERS:
                adjusted_headers[adjusted_key] = value

        content = response.content
        if isinstance(content, bytes):
            content = content.decode('utf-8', 'ignore')
        page_content_type = adjusted_headers.get("content-type", "")
        for content_type in TRANSFORMED_CONTENT_TYPES:
            # startswith() because there could be a 'charset=UTF-8' in the header.
            if page_content_type.startswith(content_type):
                content = transform_content.TransformContent(
                    base_url, mirrored_url, content)
                break

        new_content = MirroredContent(base_url=base_url,
                                      original_address=mirrored_url,
                                      translated_address=translated_address,
                                      status=response.status_code,
                                      headers=adjusted_headers,
                                      data=content)

        # Do not memcache content over 1MB
        if len(content) < MAX_CONTENT_SIZE:
            # if not memcache.add(key_name, new_content, time=EXPIRATION_DELTA_SECONDS):
            testaaa = json.dumps(new_content.__dict__)
            # if not r.set(key_name, json.dumps(new_content)):
            if not r.set(key_name, testaaa):
                logging.error(
                    'memcache.add failed: key_name = "%s", '
                    'original_url = "%s"', key_name, mirrored_url)
        else:
            logging.warning("Content is over 1MB; not memcached")

        return new_content
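
Example #2 swaps App Engine's urlfetch and memcache for requests and a Redis client r, caching the object as JSON via json.dumps(new_content.__dict__). A matching read path might look like the sketch below; the client setup and the assumption that MirroredContent can be rebuilt from those same keyword fields are mine, not the project's.

    # Hypothetical read path for Example #2's Redis cache; client setup and the
    # MirroredContent keyword fields are assumptions.
    import json

    import redis

    r = redis.StrictRedis(host='localhost', port=6379, db=0)


    def load_cached_content(key_name):
        raw = r.get(key_name)
        if raw is None:
            return None
        fields = json.loads(raw)
        # Assumes MirroredContent accepts the same keyword fields that
        # Example #2 serialized through new_content.__dict__.
        return MirroredContent(**fields)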
Example #3
 def _RunTransformTest(self, base_url, accessed_url, original, expected):
     tag_tests = [
         '<img src="%s"/>',
         "<img src='%s'/>",
         "<img src=%s/>",
         "<img src=\"%s'/>",
         "<img src='%s\"/>",
         "<img src  \t=  '%s'/>",
         "<img src  \t=  \t '%s'/>",
         "<img src = '%s'/>",
         '<a href="%s">',
         "<a href='%s'>",
         "<a href=%s>",
         "<a href=\"%s'>",
         "<a href='%s\">",
         "<a href \t = \t'%s'>",
         "<a href \t  = '%s'>",
         "<a href =  \t'%s'>",
         "<td background=%s>",
         "<td background='%s'>",
         '<td background="%s">',
         '<form action="%s">',
         "<form action='%s'>",
         "<form action=%s>",
         "<form action=\"%s'>",
         "<form action='%s\">",
         "<form action \t = \t'%s'>",
         "<form action \t  = '%s'>",
         "<form action =  \t'%s'>",
         "@import '%s';",
         "@import '%s'\nnext line here",
         "@import \t '%s';",
         "@import %s;",
         "@import %s",
         '@import "%s";',
         '@import "%s"\nnext line here',
         "@import url(%s)",
         "@import url('%s')",
         '@import url("%s")',
         "background: transparent url(%s) repeat-x left;",
         'background: transparent url("%s") repeat-x left;',
         "background: transparent url('%s') repeat-x left;",
         '<meta http-equiv="Refresh" content="0; URL=%s">',
     ]
     for tag in tag_tests:
         test = tag % original
         correct = tag % expected
         result = transform_content.TransformContent(
             base_url, accessed_url, test)
         logging.info(
             "Test with\n"
             "Accessed: %s\n"
             "Input   : %s\n"
             "Received: %s\n"
             "Expected: %s", accessed_url, test, result, correct)
         if result != correct:
             logging.info("FAIL")
         self.assertEquals(correct, result)
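
A concrete test built on this helper might look like the following, placed inside the same TestCase class; the URLs and the expected mirror-relative rewrite are illustrative and are not taken from the project's own test suite.

    # Hypothetical test method using _RunTransformTest; URLs and the expected
    # rewrite are illustrative only.
    def testAbsoluteUrlIsRewrittenToMirrorPath(self):
        self._RunTransformTest(
            'example.com',                     # base_url being mirrored
            'http://example.com/index.html',   # accessed_url
            'http://example.com/styles.css',   # original reference in the page
            '/example.com/styles.css')         # expected mirror-relative form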
Example #4
    def fetch_and_store(base_url, translated_address, mirrored_url, user_agent,
                        referer, ip):
        """Fetch and cache a page.

        Args:
          base_url: The hostname of the page that's being mirrored.
          translated_address: The URL of the mirrored page on this site.
          mirrored_url: The URL of the original page. Hostname should match
            the base_url.

        Returns:
          A new MirroredContent object, if the page was successfully retrieved.
          None if any errors occurred or the content could not be retrieved.
        """

        # logging.info('Base_url = "%s", mirrored_url = "%s"', base_url, mirrored_url)

        # logging.info("Fetching '%s'", mirrored_url)

        try:
            response = urlfetch.fetch(mirrored_url,
                                      headers={
                                          'User-Agent': user_agent,
                                          'App-Engine': 'true',
                                          'Real-Ip': ip,
                                          'Referer': referer
                                      })
        except (urlfetch.Error, apiproxy_errors.Error):
            logging.info("Could not fetch URL")
            return None

        adjusted_headers = {}
        for key, value in response.headers.iteritems():
            adjusted_key = key.lower()
            if adjusted_key not in IGNORE_HEADERS:
                adjusted_headers[adjusted_key] = value

        content = response.content
        # logging.info("content '%s'", content)
        page_content_type = adjusted_headers.get("content-type", "")
        for content_type in TRANSFORMED_CONTENT_TYPES:
            # startswith() because there could be a 'charset=UTF-8' in the header.
            if page_content_type.startswith(content_type):
                content = transform_content.TransformContent(
                    base_url, mirrored_url, content)
                break

        new_content = MirroredContent(base_url=base_url,
                                      original_address=mirrored_url,
                                      translated_address=translated_address,
                                      status=response.status_code,
                                      headers=adjusted_headers,
                                      data=content)

        return new_content
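
Example #4 expects the caller to pass along the client's User-Agent, Referer, and IP. A handler-side call could look like the sketch below; the webapp2 handler, route layout, and response handling are assumptions rather than the project's actual code.

    # Hypothetical webapp2 handler calling Example #4's fetch_and_store; the route
    # shape and the response handling are assumptions.
    import webapp2


    class MirrorHandler(webapp2.RequestHandler):
        def get(self, base_url, path):
            mirrored_url = 'http://%s/%s' % (base_url, path)
            content = fetch_and_store(
                base_url,
                self.request.url,                           # translated_address
                mirrored_url,
                self.request.headers.get('User-Agent', ''),
                self.request.headers.get('Referer', ''),
                self.request.remote_addr)                   # client ip
            if content is None:
                self.error(404)
                return
            self.response.set_status(content.status)
            self.response.write(content.data)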
Example #5
    def fetch_and_store(key_name,
                        base_url,
                        translated_address,
                        mirrored_url,
                        postdata=None,
                        ChineseWordsencoding=True,
                        whitelist=''):
        """Fetch and cache a page.
        
        Args:
          key_name: Hash to use to store the cached page.
          base_url: The hostname of the page that's being mirrored.
          translated_address: The URL of the mirrored page on this site.
          mirrored_url: The URL of the original page. Hostname should match
            the base_url.
        
        Returns:
          A new MirroredContent object, if the page was successfully retrieved.
          None if any errors occurred or the content could not be retrieved.
        """
        # Check for the X-Mirrorrr header to ignore potential loops.
        if base_url in mirror_const.MIRROR_HOSTS:
            logging.warning(
                u'Encountered recursive request for "%s"; ignoring',
                mirrored_url)
            return None

        logging.debug(u"Fetching '%s'", mirrored_url)
        try:
            response = post_and_get_content(mirrored_url, postdata)
        except (urlfetch.Error, apiproxy_errors.Error):
            logging.exception("Could not fetch URL")
            return None

        adjusted_headers = {}
        #for key, value in response['headers']:
        for key, value in response['headers'].iteritems():
            adjusted_key = key.lower()
            if adjusted_key not in mirror_const.IGNORE_HEADERS:
                adjusted_headers[adjusted_key] = value

        #logging.error(adjusted_headers)

        content = response['content']
        page_content_type = adjusted_headers.get("content-type", "")
        for content_type in mirror_const.TRANSFORMED_CONTENT_TYPES:
            # Startswith() because there could be a 'charset=UTF-8' in the header.
            if page_content_type.startswith(content_type):
                is_html = page_content_type.startswith('text/html')
                if is_html:
                    logging.error(u'transform:%s' %
                                  response['geturl'])  #mirrored_url)
                content = transform_content.TransformContent(
                    base_url, response['geturl'], content, is_html,
                    ChineseWordsencoding, whitelist)
                break

        # If the transformed content is over 1MB, truncate it (yikes!)
        if len(content) > mirror_const.MAX_CONTENT_SIZE:
            logging.warning('Content is over 1MB; truncating')
            content = content[:mirror_const.MAX_CONTENT_SIZE]

        new_content = MirroredContent(base_url=base_url,
                                      original_address=mirrored_url,
                                      translated_address=translated_address,
                                      status=response['status_code'],
                                      headers=adjusted_headers,
                                      data=content)

        #=======================================================================
        # if memcache.get(key_name):
        #    if memcache.set(key_name, new_content, time=mirror_const.EXPIRATION_DELTA_SECONDS):
        #        logging.error('memcache.set failed: key_name = "%s", '
        #                'original_url = "%s"', key_name, mirrored_url)
        # else:
        #    if memcache.set(key_name, new_content, time=mirror_const.EXPIRATION_DELTA_SECONDS):
        #        logging.error('memcache.add2 failed: key_name = "%s", '
        #                'original_url = "%s"', key_name, mirrored_url)
        #=======================================================================

        return new_content
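
Example #5 relies on a post_and_get_content helper that is not shown. Judging from the keys the caller reads ('headers', 'content', 'status_code', 'geturl'), a minimal sketch of such a helper could look like the following; the real project's implementation may well differ.

    # Minimal sketch of the undefined post_and_get_content helper used in Example #5,
    # inferred from the dict keys the caller reads; the project's real version may differ.
    from google.appengine.api import urlfetch


    def post_and_get_content(mirrored_url, postdata=None):
        if postdata is None:
            response = urlfetch.fetch(mirrored_url)
        else:
            response = urlfetch.fetch(mirrored_url,
                                      payload=postdata,
                                      method=urlfetch.POST)
        return {
            'headers': response.headers,
            'content': response.content,
            'status_code': response.status_code,
            # final_url is only set when a redirect was followed.
            'geturl': response.final_url or mirrored_url,
        }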
Example #6
 def _RunTransformTest(self, base_url, accessed_url, original, expected):
   tag_tests = [
     '<img src="%s"/>',
     "<img src='%s'/>",
     "<img src=%s/>",
     "<img src=\"%s'/>",
     "<img src='%s\"/>",
     "<img src  \t=  '%s'/>",
     "<img src  \t=  \t '%s'/>",
     "<img src = '%s'/>",
     '<a href="%s">',
     "<a href='%s'>",
     "<a href=%s>",
     "<a href=\"%s'>",
     "<a href='%s\">",
     "<a href \t = \t'%s'>",
     "<a href \t  = '%s'>",
     "<a href =  \t'%s'>",
     "<td background=%s>",
     "<td background='%s'>",
     '<td background="%s">',
     '<form action="%s">',
     "<form action='%s'>",
     "<form action=%s>",
     "<form action=\"%s'>",
     "<form action='%s\">",
     "<form action \t = \t'%s'>",
     "<form action \t  = '%s'>",
     "<form action =  \t'%s'>",      
     "@import '%s';",
     "@import '%s'\nnext line here",
     "@import \t '%s';",
     "@import %s;",
     "@import %s",
     '@import "%s";',
     '@import "%s"\nnext line here',
     "@import url(%s)",
     "@import url('%s')",
     '@import url("%s")',
     "background: transparent url(%s) repeat-x left;",
     'background: transparent url("%s") repeat-x left;',
     "background: transparent url('%s') repeat-x left;",
     '<meta http-equiv="Refresh" content="0; URL=%s">',
     'url(%s)',
     'src="%s" ',
     'style="background:url(%s)'
   ]
   
   No_b64_encoding = expected.lstrip().startswith('#') or  expected.lstrip().lower().startswith('javascript')
   
   scheme =  original.startswith('https') and 'https' or urlparse.urlparse(accessed_url).scheme
   
   expected = No_b64_encoding and expected or  ('/'+b64.uri_b64encode('%s:/'%scheme +expected))
   for tag in tag_tests:   
     #logging.error("\n\n\n tag begin:%s "%tag)     
     test = tag % original
     correct = tag % expected
     result = transform_content.TransformContent(base_url, accessed_url, test)
     logging.error("Test with\n"
                  "Accessed: %s\n"
                  "Input   : %s\n"
                  "Received: %s\n"
                  "Expected: %s",
                  accessed_url, test, result, correct)
     if result != correct:
       logging.info("FAIL")
     self.assertEquals(correct, result)
Example #7
    def fetch_and_store(key_name, base_url, translated_address, mirrored_url,
                        host, handler, shorturl):
        """Fetch and cache a page.

        Args:
          key_name: Hash to use to store the cached page.
          base_url: The hostname of the page that's being mirrored.
          translated_address: The URL of the mirrored page on this site.
          mirrored_url: The URL of the original page. Hostname should match
            the base_url.

        Returns:
          A new MirroredContent object, if the page was successfully retrieved.
          None if any errors occurred or the content could not be retrieved.
        """
        logging.debug(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        logging.debug("Fetching '%s' , base_url is '%s' ,UA is %s ",
                      mirrored_url, base_url,
                      handler.request.headers["User-Agent"])

        headers = {
            # 'content-type': 'application/json',
            'User-Agent':
            handler.request.headers["User-Agent"] or
            'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
            # 'Referer': 'http://'+base_url,
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        try:
            if handler.request.method == 'GET':
                response = requests.get(mirrored_url, headers=headers)

            if handler.request.method == 'POST':
                logging.debug('handler.request postitems : %s' %
                              handler.request.POST.items())
                response = requests.post(mirrored_url,
                                         headers=headers,
                                         data=handler.request.POST.items())
        except Exception:
            logging.exception("Could not fetch URL")
            return None

        adjusted_headers = {}
        for key, value in response.headers.iteritems():
            adjusted_key = key.lower()
            if adjusted_key not in IGNORE_HEADERS:
                adjusted_headers[adjusted_key] = value

        content = response.content
        page_content_type = adjusted_headers.get("content-type", "")

        logging.info('page_content_type is %s' % page_content_type)
        for content_type in TRANSFORMED_CONTENT_TYPES:
            # startswith() because there could be a 'charset=UTF-8' in the header.
            if page_content_type.startswith(content_type):
                content = transform_content.TransformContent(
                    base_url, mirrored_url, content, shorturl)

                if page_content_type.startswith(
                        "text/html"):  # intercept all requests and rewrite ajax addresses
                    content = content.replace('document.domain="qq.com";',
                                              'void(0);')
                    # WeChat's messy code
                    content = content.replace(
                        '</body>',
                        '<script type="text/javascript" src="http://pingjs.qq.com/h5/stats.js" name="MTAH5" sid="500324497" cid="500331564" opts="{&quot;senseHash&quot;:false}" ></script></body>'
                    )
                    # WeChat's messy code
                    content = content.replace(
                        'location.href.indexOf("safe=0") == -1 ',
                        'false')  # WeChat's messy code
                    # content = content.replace('"2",','"3",')  # eqx messy code
                    content = content.replace(
                        "<head>", """<head>
                        <meta name="referrer" content="never">
                        <script>

                            function do_poster_script_onload(el){
                                console.log('script el:',el);
                                if (el.outerHTML.indexOf('http://')<0 && el.outerHTML.indexOf('https://')<0 && el.src.indexOf(base_url)<0){
                                    var path = el.src.replace('http://""" +
                        host + """','');//
                                    el.src = '/""" + shorturl + """/""" +
                        base_url + """'+ path;
                                    //el.onloadstart  = null;
                                }
                            }

                            (function() { 
                                var base_url = '""" + base_url + """';
                                var proxied = window.XMLHttpRequest.prototype.open;
                                window.XMLHttpRequest.prototype.open = function() {
                                    
                                    //console.log( arguments );
                                    if (arguments[1].indexOf('http://')<0 && arguments[1].indexOf('https://')<0) {arguments[1]='http://'+base_url+arguments[1]}
                                    
                                    arguments[1] = arguments[1].replace('http://','/"""
                        + shorturl + """/')
                                    //console.log( 'arguments xhr:',arguments );
                                    return proxied.apply(this, [].slice.call(arguments));
                                };

                                var proxied_append = HTMLElement.prototype.appendChild;
                                HTMLElement.prototype.appendChild = function() {
                                    
                                    //console.log( 'appendChild:', arguments );
                                    for (var i in arguments){
                                        var el = arguments[i];
                                        //debugger;
                                        if (el.tagName==='SCRIPT'){
                                            //debugger;
                                            if (el.outerHTML.indexOf('http://')<0 && el.outerHTML.indexOf('https://')<0 && el.src.indexOf(base_url)<0){
                                                var path = el.src.replace('http://"""
                        + host + """','');//
                                                console.log('path:',path);
                                                if (path==='') {
                                                   el.onbeforeonload = function(){
                                                        //console.log('onbeforeonloadNew:',el);
                                                   }
                                                   el.onerror = function(e){
                                                        //console.log('onerror',e)
                                                   }
                                                }else{
                                                    el.src = '/""" + shorturl +
                        """/""" + base_url + """'+ path;
                                                }
                                                
                                                
                                            }
                                        }
                                    }
                                    //if (arguments[1].indexOf('http://')<0) {arguments[1]='http://'+arguments[1]}
                                    
                                    //arguments[1] = arguments[1].replace('http://','/')
                                    //console.log( 'arguments append:',arguments );
                                    return proxied_append.apply(this, [].slice.call(arguments));
                                };

                                /*
                                var proxied_onload = HTMLElement.prototype.onload; 
                                HTMLElement.prototype.onload  = function(){
                                    var result = proxied_onload.apply(this, [].slice.call(arguments));
                                    //do_poster_script_onload(this);
                                    //console.log('poster onload');
                                    return result;
                                }*/


                                var D = window.localStorage.getItem;
                                window.localStorage.getItem = function() {
                                    var result = D.call(window.localStorage, [].slice.call(arguments));
                                    //console.log('getItem',result);
                                    if (result)
                                        result = result.replace('document.domain="qq.com";','void(0);'); // WeChat's messy code

                                    return result;
                                };


                            })();
                            
                        </script>
                        """)
                break

        new_content = MirroredContent(base_url=base_url,
                                      original_address=mirrored_url,
                                      translated_address=translated_address,
                                      status=response.status_code,
                                      headers=adjusted_headers,
                                      data=content)

        # Do not memcache content over 1MB
        if len(content) < MAX_CONTENT_SIZE:
            if not memcache.set(key_name, new_content):
                logging.error(
                    'memcache.add failed: key_name = "%s", '
                    'original_url = "%s"', key_name, mirrored_url)
        else:
            logging.warning("Content is over %s ; not memcached" %
                            MAX_CONTENT_SIZE)

        return new_content