def _update_sample(self, sample=None, project=None, data=None):
    """Recompile sample with latest annotations.

    Loads the sample (from ``data`` if not given), then ensures both the
    ``original_body`` and ``rendered_body`` html blobs are present: read
    from storage when available, otherwise re-rendered from the live tab
    and persisted back.

    :param sample: sample object or dict; loaded from ``data`` when None.
    :param project: project identifier forwarded to ``_load_sample``.
    :param data: raw message payload used when ``sample`` is None.
    :returns: the sample dict with both body fields populated.
    :raises IOError: when a body is missing/empty and no tab is attached.
    """
    if sample is None:
        sample = self._load_sample(data, project)
        # `{{}}` survives the first .format() as a `{}` placeholder that
        # is filled with the body name below.
        path = 'spiders/{}/{}/{{}}.html'.format(
            self.data['spider'], self.data['sample'])
    else:
        path = _html_path(sample)
    if hasattr(sample, 'dump'):
        sample = sample.dump()
    html_path = path.format
    for name, type_ in (('original_body', 'raw'), ('rendered_body', None)):
        try:
            path = html_path(name)
            html = decode(self.storage.open(path).read())
            if not html:
                # Raise explicitly instead of `assert html`: asserts are
                # stripped under `python -O`, which would silently skip
                # the re-render fallback below for empty files.
                raise IOError('Empty html file: {}'.format(path))
        except IOError:
            if not self.tab:
                # No live tab to re-render from; propagate the error.
                six.reraise(*sys.exc_info())
            html = None
            if type_ == 'raw':
                html = self.tab._raw_html
            if not html:
                html = self.tab.html()
            if html:
                # Persist the freshly rendered body for future loads.
                self.storage.save(path, ContentFile(encode(html), path))
                html = decode(html)
            else:
                html = '<html></html>'
        sample[name] = decode(html)
    return sample
def _update_sample(self, sample=None, project=None, data=None):
    """Recompile sample with latest annotations.

    Guarantees ``original_body`` and ``rendered_body`` are populated on
    the returned sample, reading stored html when present and falling
    back to a live re-render via ``self.tab`` otherwise.

    :raises IOError: when a body file is missing or empty and there is
        no tab to re-render it from.
    """
    if sample is None:
        sample = self._load_sample(data, project)
        path = 'spiders/{}/{}/{{}}.html'.format(self.data['spider'],
                                                self.data['sample'])
    else:
        path = _html_path(sample)
    if hasattr(sample, 'dump'):
        sample = sample.dump()
    html_path = path.format
    for name, type_ in (('original_body', 'raw'), ('rendered_body', None)):
        try:
            path = html_path(name)
            html = decode(self.storage.open(path).read())
            if not html:
                # Explicit raise instead of `assert html`: under
                # `python -O` the assert would vanish and an empty file
                # would never trigger the re-render fallback.
                raise IOError('Empty html file: {}'.format(path))
        except IOError:
            if not self.tab:
                six.reraise(*sys.exc_info())
            html = None
            if type_ == 'raw':
                html = self.tab._raw_html
            if not html:
                html = self.tab.html()
            if html:
                # Cache the rendered body so future loads hit storage.
                self.storage.save(path, ContentFile(encode(html), path))
                html = decode(html)
            else:
                html = '<html></html>'
        sample[name] = decode(html)
    return sample
def open(self, *args, **kwargs):
    """Read the file addressed by ``rel_path(*args)``.

    With ``raw=True`` the decoded text is returned verbatim; otherwise
    the content is parsed as JSON, defaulting to ``{}`` for a missing
    file (via ``open_with_default``).
    """
    target = self.rel_path(*args)
    if kwargs.get('raw'):
        return decode(self.storage.open(target).read())
    handle = self.storage.open_with_default(target, {})
    return json.loads(handle.read())
def start_scrapy_project(project_name):
    """Bootstrap a portia project with default scrapy files."""
    camel_name = string_camelcase(project_name)
    rendered = {}
    for path, raw in find_files(project_name).items():
        # Fill template placeholders with both snake_case and CamelCase
        # forms of the project name.
        body = string.Template(decode(raw)).substitute(
            project_name=project_name, ProjectName=camel_name)
        if path.endswith('.tmpl'):
            # Drop the template suffix so the file is written under its
            # real name.
            path = path[:-len('.tmpl')]
        if path.endswith('scrapy.cfg'):
            # scrapy.cfg must sit at the project root.
            path = 'scrapy.cfg'
        rendered[path] = body
    rendered['setup.py'] = SETUP(project_name)
    return rendered
def start_scrapy_project(project_name):
    """Bootstrap a portia project with default scrapy files."""
    # find_files presumably yields {relative_path: raw template bytes}
    # -- TODO confirm against its definition.
    files = find_files(project_name)
    out_files = {}
    for path, contents in files.items():
        # Substitute both snake_case and CamelCase project-name forms
        # into the template body.
        contents = string.Template(decode(contents)).substitute(
            project_name=project_name,
            ProjectName=string_camelcase(project_name)
        )
        if path.endswith('.tmpl'):
            # Strip the template suffix so the file deploys under its
            # real name.
            path = path[:-len('.tmpl')]
        if path.endswith('scrapy.cfg'):
            # scrapy.cfg belongs at the project root.
            path = 'scrapy.cfg'
        out_files[path] = contents
    out_files['setup.py'] = SETUP(project_name)
    return out_files
def process_css(css_source, tabid, base_uri):
    """ Wraps urls in css source.

    >>> url = 'http://scrapinghub.com/style.css'
    >>> process_css('@import "{}"'.format(url), 0, url) # doctest: +ELLIPSIS
    '@import "/proxy?..."'
    """
    def _proxied(raw_url):
        # Route the url through the proxy and escape double quotes so
        # it can sit inside a quoted CSS string.
        return wrap_url(raw_url, tabid, base_uri).replace('"', '%22')

    def _rewrite_import(match):
        return '@import "{}"'.format(_proxied(match.group(1)))

    def _rewrite_url(match):
        return 'url("{}")'.format(_proxied(match.group(1).strip("\"'")))

    text = decode(css_source)
    text = CSS_IMPORT.sub(_rewrite_import, text)
    text = CSS_URL.sub(_rewrite_url, text)
    text = BAD_CSS.sub('portia-blocked', text)
    return encode(text)
def process_css(css_source, tabid, base_uri):
    """ Wraps urls in css source.

    >>> url = 'http://scrapinghub.com/style.css'
    >>> process_css('@import "{}"'.format(url), 0, url) # doctest: +ELLIPSIS
    '@import "/proxy?..."'
    """
    def _absolutize_css_import(match):
        # Rewrite @import targets through the proxy; embedded double
        # quotes are %-escaped so the CSS string stays valid.
        return '@import "{}"'.format(
            wrap_url(match.group(1), tabid, base_uri).replace('"', '%22'))

    def _absolutize_css_url(match):
        # url(...) values may be quoted with either quote style.
        url = match.group(1).strip("\"'")
        return 'url("{}")'.format(
            wrap_url(url, tabid, base_uri).replace('"', '%22'))

    css_source = decode(css_source)
    css_source = CSS_IMPORT.sub(_absolutize_css_import, css_source)
    css_source = CSS_URL.sub(_absolutize_css_url, css_source)
    # Neutralise disallowed CSS constructs matched by BAD_CSS
    # (pattern defined elsewhere -- TODO confirm exact scope).
    css_source = BAD_CSS.sub('portia-blocked', css_source)
    return encode(css_source)
def _update_sample(data, socket, sample=None, project=None):
    """Recompile sample with latest annotations"""
    if sample is None:
        sample = _load_sample(data, socket, project)
        # `{{}}` survives the first .format() as a `{}` placeholder that
        # is later filled with the body name.
        path = 'spiders/{}/{}/{{}}.html'.format(data['spider'],
                                                data['sample'])
    else:
        path = _html_path(sample)
    if hasattr(sample, 'dump'):
        sample = sample.dump()
    html_path = path.format
    for name, type_ in (('original_body', 'raw'), ('rendered_body', None)):
        try:
            path = html_path(name)
            html = decode(socket.storage.open(path).read())
        except IOError:
            # No stored copy: re-render from the live tab when one is
            # attached, otherwise propagate the original IOError.
            if not socket.tab:
                six.reraise(*sys.exc_info())
            html = decoded_html(socket.tab, type_)
            if html:
                # Persist the freshly rendered body for future loads.
                socket.storage.save(path, ContentFile(html, path))
            else:
                html = '<html></html>'
        sample[name] = html
    return sample
def decoded_html(tab, type_=None):
    """Return the tab's html as text.

    For ``type_ == 'raw'`` the raw response body is decoded using the
    document's declared charset, falling back to the rendered DOM when
    no raw body is available. Any other ``type_`` returns the rendered
    html directly.
    """
    if type_ != 'raw':
        return tab.html()
    declared = tab.evaljs('document.characterSet')
    return decode(tab._raw_html or tab.html(), default=declared)
def decoded_html(tab, type_=None):
    """Return the tab's html as text.

    With ``type_ == 'raw'`` the network manager's raw body is decoded
    using the document's declared charset (falling back to the rendered
    DOM when the raw body is empty); otherwise the rendered html is
    returned as-is.
    """
    if type_ != 'raw':
        return tab.html()
    declared = tab.evaljs('document.characterSet')
    raw_body = tab.network_manager._raw_html
    return decode(raw_body or tab.html(), default=declared)
def _set_tab_html(self, reply, har, content):
    """Store the raw html/url on the tab for the finished reply.

    Only replies whose url matches the tab's current url are recorded.
    """
    # Decode BEFORE comparing: `self.tab.url` is a decoded str while
    # QUrl.toString() may not be, so comparing the undecoded value can
    # spuriously fail and the raw body would never be captured.
    url = decode(reply.url().toString())
    if content is not None and url == self.tab.url:
        self.tab._raw_html = decode(content)
        # Already decoded above; no second decode needed.
        self.tab._raw_url = url
def url(self):
    """ Current URL """
    # During shutdown the underlying page may already be invalid, so
    # report an empty url instead of touching it.
    return '' if self._closing else decode(
        self.web_page.mainFrame().url().toString())
def _set_tab_html(self, reply, har, content):
    """Record the raw html/url on the tab for a finished reply.

    Replies with no content, or whose url does not match the tab's
    current url, are ignored.
    """
    page_url = decode(reply.url().toString())
    if content is None or page_url != self.tab.url:
        return
    self.tab._raw_html = decode(content)
    self.tab._raw_url = page_url