def handle(self, response, log, browser, video_id):
    """Scrape a YouTube watch page.

    Always records ``video_id``; when the response parsed as soup, also
    tries to pull the page title and (if ``self.extra_attr`` is set) a
    handful of extra fields out of the markup.
    """
    attr = {'video_id': video_id}
    title = None
    if response.data_type == 'soup':
        soup = response.data
        with trapped:
            title = strip_site_name(render_node(soup.head.title), response.url)
        if self.extra_attr:
            desc = soup.find('div', id='watch-description-clip')
            # Each entry: (attr name, search root, tag, attr key, attr value,
            # child tag to collect or None for a single node).
            # This describes what we need to scrape.. youtube is awfully
            # structured.
            # NOTE: this is stupidly slow.. disable if doing any volume.
            scrape_plan = (
                ('uploader', desc, 'p', 'id', 'watch-uploader-info', None),
                ('summary', desc, 'p', 'id', 'eow-description', None),
                ('category', desc, 'p', 'id', 'eow-category', None),
                ('license', desc, 'p', 'id', 'eow-reuse', None),
                ('views', soup, 'span', 'class', 'watch-view-count', None),
                ('extras', soup, 'ul', 'id', 'watch-description-extra-info', 'li'),
                ('tags', desc, 'ul', 'id', 'eow-tags', 'a'),
            )
            for name, parent, tag, key, val, multi in scrape_plan:
                # ``trapped`` swallows lookup failures so one missing node
                # doesn't abort the rest of the scrape.
                with trapped:
                    node = parent.find(tag, **{key: val})
                    if multi:
                        attr[name] = [render_node(child) for child in node(multi)]
                    else:
                        attr[name] = render_node(node)
    return ScanResult(response=response, override_url=None, title=title,
                      content_type=None, content=None, attr=attr)
def handle(self, response, log, browser):
    """Resolve an imgur page to its underlying image.

    Follows the ``image_src`` link in the page head, re-fetches it, and
    delegates to the parent scanner; the page's own title (when present)
    overrides whatever the parent produced.  Raises InvalidContent when
    the response isn't HTML or no image link can be found.
    """
    if response.data_type != 'soup':
        raise InvalidContent(response, 'Not an HTML file')
    soup = response.data
    title = None
    with trapped:
        title = strip_site_name(render_node(soup.head.title), response.url)
    # ``trapped`` suppresses any failure below, letting us fall through to
    # the final raise instead of blowing up mid-scrape.
    with trapped:
        image_url = soup.head.find('link', rel='image_src')['href']
        response = browser.open(image_url, follow_meta_redirect=True)
        inner = super(IMGurScanner, self).handle(response, log, browser)
        final_title = title if title is not None else inner.title
        return ScanResult(response=inner.response,
                          override_url=inner.override_url,
                          title=final_title,
                          content_type=inner.content_type,
                          content=inner.content,
                          attr=inner.attr)
    raise InvalidContent(response, "Couldn't find the image")
def handle(self, response, log, browser):
    """Generic HTML handler: extract the page title and a text summary.

    Raises InvalidContent when the response is not HTML, or when neither
    a title nor a summary could be extracted.
    """
    if response.data_type != 'soup':
        raise InvalidContent(response, 'Not an HTML file')
    soup = response.data
    title = summary = content_type = None
    with trapped:
        title = strip_site_name(render_node(soup.head.title), response.url)
    with trapped:
        summary = self.summarize_soup(soup)
        # Only set once summarize_soup succeeded; stays None on failure.
        content_type = 'text/plain'
    if title is None and summary is None and content_type is None:
        # FIX: InvalidContent is constructed as (response, message)
        # everywhere else in this file; the response argument was missing.
        raise InvalidContent(response, "couldn't get anything useful out of that..")
    return ScanResult(response=response, override_url=None, title=title,
                      content_type=content_type, content=summary, attr=None)
def handle(self, response, log, browser, video_id):
    """Scrape a YouTube watch page for its title and optional metadata.

    The returned ScanResult always carries ``video_id`` in ``attr``; the
    remaining fields are filled in on a best-effort basis.
    """

    def grab(target, root, tag_name, attr_key, attr_val, child_tag):
        # Best-effort extraction of one field; ``trapped`` swallows any
        # failure (e.g. ``root`` is None or the node is absent).
        with trapped:
            found = root.find(tag_name, **{attr_key: attr_val})
            if child_tag:
                attr[target] = [render_node(c) for c in found(child_tag)]
            else:
                attr[target] = render_node(found)

    attr = {'video_id': video_id}
    title = None
    if response.data_type == 'soup':
        soup = response.data
        with trapped:
            title = strip_site_name(render_node(soup.head.title), response.url)
        if self.extra_attr:
            desc = soup.find('div', id='watch-description-clip')
            # this describes what we need to scrape.. youtube is awfully
            # structured.
            # NOTE: this is stupidly slow.. disable if doing any volume.
            grab('uploader', desc, 'p', 'id', 'watch-uploader-info', None)
            grab('summary', desc, 'p', 'id', 'eow-description', None)
            grab('category', desc, 'p', 'id', 'eow-category', None)
            grab('license', desc, 'p', 'id', 'eow-reuse', None)
            grab('views', soup, 'span', 'class', 'watch-view-count', None)
            grab('extras', soup, 'ul', 'id', 'watch-description-extra-info', 'li')
            grab('tags', desc, 'ul', 'id', 'eow-tags', 'a')
    return ScanResult(response=response, override_url=None, title=title,
                      content_type=None, content=None, attr=attr)