示例#1
0
 def _clean_output(self, output):
     '''Drop everything up to and including the magic string.

     phantomjs may emit unrelated text before the real payload; see
     https://groups.google.com/forum/?fromgroups#!topic/phantomjs/LwRGJXpPsZA

     output
         Raw text to clean.

     Returns the part of ``output`` that follows the first occurrence of
     ``self._magic_string``, or ``output`` unchanged when the magic string
     does not occur.  When ``self._debug`` is set, the length of the text
     preceding the magic string is logged, followed by that text itself if
     it is non-empty.
     '''
     marker = self._magic_string
     start = output.find(marker)
     if start < 0:
         # No marker present: nothing is treated as garbage.
         prefix, payload = '', output
     else:
         prefix = output[:start]
         payload = output[start + len(marker):]
     if self._debug:
         logger.debug('garbage at output: {}B'.format(len(prefix)))
         if prefix:
             logger.debug(prefix)
     return payload
示例#2
0
    def fetch(self, url, post_params=None, capture_screen=None):
        """Run the phantomjs script against ``url`` and return its JSON result.

        url
            URL to access. Expected to start with "http[s]://"; it is
            rejected when ``_has_accepted_protocol`` returns a false value.

        post_params
            If None, then GET method is used. Otherwise, POST is used with
            those parameters.

        capture_screen
            Filename where the screenshot should be stored. No screenshot
            option is passed when this is empty or None.

        Returns:
            The process output returned by ``command.call``, cleaned by
            ``_clean_output`` and decoded with ``json.loads``.

        Raises:
            PhantomCurlError
                When the URL protocol is not accepted, or when the cleaned
                output cannot be decoded as JSON.

        """
        if not _has_accepted_protocol(url):
            message = 'Unknown protocol for "{}"'.format(repr(url))
            raise PhantomCurlError(message)
        logger.info(u'fetching {}'.format(repr(url)))

        # Switches for the phantomjs binary itself.  (The disk-cache switches
        # '--disk-cache=true' / '--max-disk-cache-size=10000' exist only as a
        # disabled DEBUG aid and are not passed.)
        bin_args = [
            '--ignore-ssl-errors=true',
            '--local-to-remote-url-access=true',
            '--web-security=false',
        ]

        # The timeout handed to command.call is the script timeout plus
        # TIMEOUT_JS_TO_JOIN_DELTA_SEC; both stay None when no timeout is set.
        script_timeout = self._timeout_sec
        join_timeout = None
        if script_timeout is not None:
            join_timeout = script_timeout + self.TIMEOUT_JS_TO_JOIN_DELTA_SEC

        cookie_jar = self._cookie_jar
        if cookie_jar:
            jar_path = os.path.normpath(cookie_jar)
            bin_args.append(u'--cookies-file={:s}'.format(jar_path))
        proxy = self._proxy
        if proxy:
            bin_args.append(u'--proxy={:s}'.format(proxy))
        proxy_type = self._proxy_type
        if proxy_type:
            bin_args.append(u'--proxy-type={:s}'.format(proxy_type))
        proxy_auth = self._proxy_auth
        if proxy_auth:
            bin_args.append(u'--proxy-auth={:s}'.format(proxy_auth))
        if self._debug:
            bin_args.append(u'--debug=true')

        # Arguments for the controlling JS script.  The URL is passed through
        # unmodified (an earlier urllib.quote step is disabled).
        js_args = [_OPT_MAGIC_STRING, _MAGIC_STRING, _OPT_URL, url]
        if self._user_agent:
            js_args.extend((_OPT_USER_AGENT, self._user_agent))
        if capture_screen:
            js_args.extend((_OPT_CAPTURE_SCREEN, capture_screen))
        if self._inspect_iframes:
            js_args.append(_OPT_INSPECT_IFRAMES)
        if script_timeout is not None:
            js_args.extend((_OPT_TIMEOUT_SEC, script_timeout))
        if self._delay is not None:
            js_args.extend((_OPT_DELAY_SEC, str(self._delay)))
        if not self._with_content:
            js_args.append(_OPT_NO_CONTENT)
        if self._with_request_response:
            js_args.append(_OPT_REQUEST_RESPONSE)
        if post_params is not None:
            js_args.extend(
                (_OPT_POST_FULL, _get_full_post_string(post_params)))
        if self._headers is not None:
            js_args.extend(
                (_OPT_CUSTOM_HEADERS_JSON, json.dumps(self._headers)))
        if self._cookie is not None:
            js_args.extend((_OPT_COOKIE, json.dumps(self._cookie)))

        # Every script argument is stringified before building the command
        # line (the timeout, for instance, is still a number at this point).
        js_arg_strings = list(map(str, js_args))
        command_line = [PHANTOMJS_BIN]
        command_line.extend(bin_args)
        command_line.append(PHANTOMJS_JS)
        command_line.extend(js_arg_strings)
        logger.debug('call {}'.format(command_line))

        out, err = command.call(command_line, timeout=join_timeout)
        out_kb = len(out) / 1000.0
        err_kb = len(err) / 1000.0
        logger.debug('out: {:.1f}KB, err: {:.1f}KB'.format(out_kb, err_kb))
        if self._debug:
            logger.debug('stderr from phantomjs:')
            logger.debug(err)

        # A ValueError from either the cleaning step or the JSON decoder is
        # reported as invalid output, carrying both raw streams.
        try:
            return json.loads(self._clean_output(out))
        except ValueError:
            raise PhantomCurlError('Invalid output', out=out, err=err)