def _clean_output(self, output):
    """Strip everything up to and including the magic string.

    PhantomJS may emit unrelated text before the script's real payload;
    the script marks the start of the payload with ``self._magic_string``.
    Background:
    https://groups.google.com/forum/?fromgroups#!topic/phantomjs/LwRGJXpPsZA

    output
        Raw text captured from the process.

    Returns the text after the first occurrence of the magic string, or
    ``output`` unchanged when the magic string is absent.
    """
    marker = self._magic_string
    start = output.find(marker)
    if start == -1:
        # No marker present: nothing to discard.
        payload = output
        garbage = ''
    else:
        payload = output[start + len(marker):]
        garbage = output[:start]

    # The discarded prefix is only reported when debugging is enabled.
    if self._debug:
        logger.debug('garbage at output: {}B'.format(len(garbage)))
        if garbage:
            logger.debug(garbage)
    return payload
def fetch(self, url, post_params=None, capture_screen=None):
    """Run the PhantomJS script against ``url`` and return its JSON result.

    url
        URL to access. Must start with "http[s]://"; it is checked with
        ``_has_accepted_protocol`` before anything is launched.
    post_params
        ``None`` selects GET. Any other value selects POST, with the
        body built by ``_get_full_post_string``.
    capture_screen
        Filename where the screenshot should be stored (optional).

    Returns the object decoded by ``json.loads`` from the cleaned
    standard output of the process.

    Raises:
        PhantomCurlError -- the URL protocol is not accepted, or the
        process output could not be decoded as JSON.
    """
    if not _has_accepted_protocol(url):
        raise PhantomCurlError('Unknown protocol for "{}"'
                               .format(repr(url)))
    logger.info(u'fetching {}'.format(repr(url)))

    # The timeout handed to the script is shorter than the one used when
    # waiting for the process, by TIMEOUT_JS_TO_JOIN_DELTA_SEC.
    js_timeout = self._timeout_sec
    if js_timeout is None:
        join_timeout = None
    else:
        join_timeout = js_timeout + self.TIMEOUT_JS_TO_JOIN_DELTA_SEC

    # --- switches for the phantomjs executable itself ---
    # (for debugging, '--disk-cache=true' and
    #  '--max-disk-cache-size=10000' can be added here)
    binary_args = [
        '--ignore-ssl-errors=true',
        '--local-to-remote-url-access=true',
        '--web-security=false',
    ]
    if self._cookie_jar:
        jar_path = os.path.normpath(self._cookie_jar)
        binary_args.append(u'--cookies-file={:s}'.format(jar_path))
    proxy_flags = (
        (u'--proxy={:s}', self._proxy),
        (u'--proxy-type={:s}', self._proxy_type),
        (u'--proxy-auth={:s}', self._proxy_auth),
    )
    for template, value in proxy_flags:
        if value:
            binary_args.append(template.format(value))
    if self._debug:
        binary_args.append(u'--debug=true')

    # --- arguments consumed by the JavaScript driver ---
    # The URL is handed over as given, without extra quoting.
    script_args = [_OPT_MAGIC_STRING, _MAGIC_STRING, _OPT_URL, url]
    if self._user_agent:
        script_args.extend((_OPT_USER_AGENT, self._user_agent))
    if capture_screen:
        script_args.extend((_OPT_CAPTURE_SCREEN, capture_screen))
    if self._inspect_iframes:
        script_args.append(_OPT_INSPECT_IFRAMES)
    if js_timeout is not None:
        script_args.extend((_OPT_TIMEOUT_SEC, js_timeout))
    if self._delay is not None:
        script_args.extend((_OPT_DELAY_SEC, str(self._delay)))
    if not self._with_content:
        script_args.append(_OPT_NO_CONTENT)
    if self._with_request_response:
        script_args.append(_OPT_REQUEST_RESPONSE)
    if post_params is not None:
        script_args.extend((_OPT_POST_FULL,
                            _get_full_post_string(post_params)))
    if self._headers is not None:
        script_args.extend((_OPT_CUSTOM_HEADERS_JSON,
                            json.dumps(self._headers)))
    if self._cookie is not None:
        script_args.extend((_OPT_COOKIE, json.dumps(self._cookie)))

    # Full command: binary, its switches, the script, then the script's
    # arguments, each coerced with str().
    command_line = [PHANTOMJS_BIN]
    command_line.extend(binary_args)
    command_line.append(PHANTOMJS_JS)
    command_line.extend(str(arg) for arg in script_args)
    logger.debug('call {}'.format(command_line))

    out, err = command.call(command_line, timeout=join_timeout)
    logger.debug('out: {:.1f}KB, err: {:.1f}KB'.format(
        len(out)/1000.0, len(err)/1000.0))
    if self._debug:
        logger.debug('stderr from phantomjs:')
        logger.debug(err)

    # A ValueError from cleaning or JSON decoding is reported as a
    # PhantomCurlError carrying both raw streams.
    try:
        return json.loads(self._clean_output(out))
    except ValueError:
        raise PhantomCurlError('Invalid output', out=out, err=err)