def _url_from_selector(sel):
    # type: (parsel.Selector) -> str
    if isinstance(sel.root, six.string_types):
        # e.g. ::attr(href) result
        return strip_html5_whitespace(sel.root)
    if not hasattr(sel.root, 'tag'):
        raise ValueError("Unsupported selector: %s" % sel)
    if sel.root.tag != 'a':
        raise ValueError("Only <a> elements are supported; got <%s>" %
                         sel.root.tag)
    href = sel.root.get('href')
    if href is None:
        raise ValueError("<a> element has no href attribute: %s" % sel)
    return strip_html5_whitespace(href)
def _get_form_url(form, url):
    if url is None:
        action = form.get('action')
        if action is None:
            return form.base_url
        return urljoin(form.base_url, strip_html5_whitespace(action))
    return urljoin(form.base_url, url)
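# A minimal usage sketch, not part of the original module: it assumes lxml.html,
# urljoin and strip_html5_whitespace (w3lib.html) are importable, since
# _get_form_url only needs a form object exposing .get('action') and .base_url.
import lxml.html
from urllib.parse import urljoin
from w3lib.html import strip_html5_whitespace

doc = lxml.html.fromstring(
    '<html><body><form action=" /search "></form></body></html>',
    base_url='https://example.com/page',
)
form = doc.forms[0]

print(_get_form_url(form, None))        # -> 'https://example.com/search'
print(_get_form_url(form, 'override'))  # -> 'https://example.com/override'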
def _extract_links(self, selector, response_url, response_encoding, base_url):
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            if self.strip:
                attr_val = strip_html5_whitespace(attr_val)
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.process_attr(attr_val)
            if url is None:
                continue
        url = safe_url_string(url, encoding=response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(
            url,
            _collect_string_content(el) or "",
            nofollow=rel_has_nofollow(el.get("rel")),
        )
        links.append(link)
    return self._deduplicate_if_needed(links)
def _url_from_selector(sel):
    # type: (parsel.Selector) -> str
    if isinstance(sel.root, six.string_types):
        # e.g. ::attr(href) result
        return strip_html5_whitespace(sel.root)
    if not hasattr(sel.root, 'tag'):
        raise ValueError("Unsupported selector: %s" % sel)
    if sel.root.tag not in ('a', 'link'):
        raise ValueError(
            "Only <a> and <link> elements are supported; got <%s>" %
            sel.root.tag)
    href = sel.root.get('href')
    if href is None:
        raise ValueError(
            "<%s> element has no href attribute: %s" % (sel.root.tag, sel))
    return strip_html5_whitespace(href)
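# A hedged sketch, not from the original code base: assuming parsel and six are
# available (the function above relies on both), this shows the two selector
# flavours _url_from_selector accepts -- an <a>/<link> element selector and the
# plain-string result of a ::attr(href) query.
import parsel

sel = parsel.Selector(text='<html><body><a href="  /page?id=1 ">next</a></body></html>')

print(_url_from_selector(sel.css('a')[0]))              # -> '/page?id=1'
print(_url_from_selector(sel.css('a::attr(href)')[0]))  # -> '/page?id=1'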
def _extract_property_value(self, node, items_seen, base_url, itemids, force=False):
    # http://www.w3.org/TR/microdata/#values
    if not force and node.get("itemscope") is not None:
        if self.nested:
            return self._extract_item(
                node, items_seen=items_seen, base_url=base_url, itemids=itemids)
        else:
            return {"iid_ref": self.get_docid(node, itemids)}
    elif node.tag == "meta":
        return node.get("content", "")
    elif node.tag in ("audio", "embed", "iframe", "img", "source", "track", "video"):
        return urljoin(base_url, strip_html5_whitespace(node.get("src", "")))
    elif node.tag in ("a", "area", "link"):
        return urljoin(base_url, strip_html5_whitespace(node.get("href", "")))
    elif node.tag in ("object",):
        return urljoin(base_url, strip_html5_whitespace(node.get("data", "")))
    elif node.tag in ("data", "meter"):
        return node.get("value", "")
    elif node.tag in ("time",):
        return node.get("datetime", "")
    # not in W3C specs but used in schema.org examples
    elif node.get("content"):
        return node.get("content")
    else:
        return self._extract_textContent(node)
def handle_starttag(self, tag, attrs):
    if tag == 'base':
        self.base_url = dict(attrs).get('href')
    if self.scan_tag(tag):
        for attr, value in attrs:
            if self.scan_attr(attr):
                if self.strip:
                    value = strip_html5_whitespace(value)
                url = self.process_attr(value)
                link = Link(url=url)
                self.links.append(link)
                self.current_link = link
def process_item(self, item, spider):
    adapter = ItemAdapter(item)
    if adapter.get('html'):
        cleaner = Cleaner(safe_attrs_only=True,
                          safe_attrs={'src', 'alt', 'href', 'title'})
        adapter['html'] = cleaner.clean_html(adapter['html'])
        adapter['html'] = w3lib_cleaner(adapter['html'])
        if adapter.get('images'):
            for img in adapter.get('images'):
                adapter['html'] = adapter['html'].replace(img['url'], img['path'])
    if adapter.get('h1'):
        adapter['h1'] = w3lib_html.strip_html5_whitespace(adapter['h1'])
    if adapter.get('title'):
        adapter['title'] = w3lib_html.strip_html5_whitespace(adapter['title'])
    if adapter.get('author'):
        adapter['author'] = w3lib_html.strip_html5_whitespace(adapter['author'])
    return item
def unknown_starttag(self, tag, attrs):
    if tag == 'base':
        self.base_url = dict(attrs).get('href')
    if self.scan_tag(tag):
        for attr, value in attrs:
            if self.scan_attr(attr):
                if self.strip and value is not None:
                    value = strip_html5_whitespace(value)
                url = self.process_value(value)
                if url is not None:
                    link = Link(url=url,
                                nofollow=rel_has_nofollow(dict(attrs).get('rel')))
                    self.links.append(link)
                    self.current_link = link
def _extract_links(self, json_path, response):
    # Extract the actual URLs via json_path; to add codes here
    try:
        # Use a regular expression to pull out the relevant JSON content
        json_re = response.meta.get('json_re', None)
        response_text = response.text
        if json_re:
            mo = re.search(pattern=json_re, string=response_text,
                           flags=re.S | re.M | re.I)
            if mo:
                response_text = mo.group(1)
        # The response body is JSON, so decode it first;
        # this may fail with an exception
        j = json.loads(response_text, encoding='utf-8')
    except Exception as e:
        log.error(e)
        return []
    json_func = SelectJmes(json_path)
    results = json_func(j)
    if not results:
        log.warning("json_path:{0} matched no links in the response, "
                    "giving up!".format(json_path))
        return []
    links = []
    base_url = get_base_url(response)
    results = arg_to_iter(results)
    for url_texts in results:
        try:
            url = str(url_texts.get('url', ''))
            if not url:
                continue
            url = strip_html5_whitespace(url)
            url = urljoin(base_url, url)
            url = self.process_attr(url)
            if not url:
                continue
            url = urljoin(response.url, url)
            text = url_texts.get('text', '')
            fragment = str(url_texts.get("fragment", ""))
            link = Link(url=url, text=text, fragment=fragment)
            links.append(link)
        except Exception as e:
            log.error(e)
    return self._deduplicate_if_needed(links)
def parse_feed(self, response: TextResponse):
    """ Parse a feed XML. """
    if not isinstance(response, TextResponse):
        self.logger.warning('Invalid Feed response: %s', response)
        self.crawler.stats.inc_value('error/invalid_feed_response')
        return

    feed = feedparser.parse(response.text)
    if not feed:
        self.crawler.stats.inc_value('error/rss_initially_empty')
        return

    seen = set()
    for entry in feed.get('entries', []):
        url = strip_html5_whitespace(entry.get('link'))
        if not is_valid_url(url):
            self.logger.warning('Ignoring invalid article URL: %s', url)
            continue
        if url not in seen:
            seen.add(url)

    if not seen:
        self.crawler.stats.inc_value('error/rss_finally_empty')
        return

    self.logger.info('Links extracted from <%s> feed = %d', response.url, len(seen))
    source_url = response.meta['source_url']
    feed_url = response.url
    for url in seen:
        self.crawler.stats.inc_value('links/rss')
        # Make a request to fetch the full page HTML
        # Risk of being banned
        self.crawler.stats.inc_value('x_request/discovery')
        yield Request(url,
                      meta={'source_url': source_url, 'feed_url': feed_url},
                      callback=self.parse_page,
                      errback=self.errback_page,
                      dont_filter=self.dont_filter)
def extract_items(self, document, base_url=None):
    elements = []
    terms = []

    def attrib_to_dict(attribs):
        # convert _attrib type to dict
        return dict(attribs.items())

    def populate_results(node, main_attrib):
        # fill list with DC Elements or DC Terms
        node_attrib = node.attrib
        if main_attrib not in node_attrib:
            return
        name = node.attrib[main_attrib]
        lower_name = get_lower_attrib(name)
        if lower_name in _DC_ELEMENTS:
            node.attrib.update({'URI': _DC_ELEMENTS[lower_name]})
            elements.append(attrib_to_dict(node.attrib))
        elif lower_name in _DC_TERMS:
            node.attrib.update({'URI': _DC_TERMS[lower_name]})
            terms.append(attrib_to_dict(node.attrib))

    namespaces_nodes = document.xpath('//link[contains(@rel,"schema")]')
    namespaces = {}
    for i in namespaces_nodes:
        url = strip_html5_whitespace(i.attrib['href'])
        if url in _URL_NAMESPACES:
            namespaces.update(
                {re.sub(r"schema\.", "", i.attrib['rel']): url})

    list_meta_node = document.xpath('//meta')
    for meta_node in list_meta_node:
        populate_results(meta_node, 'name')

    list_link_node = document.xpath('//link')
    for link_node in list_link_node:
        populate_results(link_node, 'rel')

    yield {'namespaces': namespaces, 'elements': elements, 'terms': terms}
def w3lib_cleaner(el):
    # Normalize characters and the like
    el = unicodedata.normalize('NFKC', el)
    # Remove escape characters
    el = w3lib_html.replace_escape_chars(el)
    # Strip all leading and trailing whitespace
    el = w3lib_html.strip_html5_whitespace(el)
    # Remove wide spaces
    el = el.replace(' ', '')
    el = w3lib_html.replace_entities(el, remove_illegal=True, encoding='utf-8')
    # Remove these tags together with their content
    el = w3lib_html.remove_tags_with_content(
        el, which_ones=('noidex', 'iframe', 'form'))
    # Keep the allowed tags and their content
    # (somehow, as if by magic, text left without an enclosing tag gets wrapped
    # in <p>, and that is a good thing)
    allowed_tag = ('p', 'img', 'a', 'b', 'i', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                   'ol', 'ul', 'li', 'ins')
    el = w3lib_html.remove_tags(el, keep=allowed_tag)
    return el
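# A hedged usage sketch (an assumption, not part of the original pipeline): the
# imports below mirror what w3lib_cleaner itself needs, and the fragment is
# hypothetical input chosen to show the cleaning steps.
import unicodedata
from w3lib import html as w3lib_html

raw = '<div><h2>Title</h2><form><input name="q"></form><p>Body&amp;text</p></div>'
print(w3lib_cleaner(raw))
# The <form> is dropped together with its content, &amp; is replaced, the
# disallowed <div> wrapper is stripped, and whitelisted tags like <h2>/<p> are kept.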
def urls_in_rendered_page(self) -> List[str]:
    if not self.rendered_html:
        raise Har2TreeError('Not the node of a page rendered, invalid request.')
    urls: Set[str] = set()
    soup = BeautifulSoup(self.rendered_html.getvalue(), "lxml")
    for a_tag in soup.find_all(["a", "area"]):
        href = a_tag.attrs.get("href")
        if not href:
            continue
        href = strip_html5_whitespace(href)
        href = safe_url_string(href)
        href = urljoin(self.name, href)
        href = canonicalize_url(href, keep_fragments=True)
        parsed = urlparse(href)
        if not parsed.netloc:
            continue
        urls.add(href)
    return sorted(urls)
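# A condensed sketch of the same href-normalisation chain applied to a single
# value outside the class (an illustration under assumptions; the helpers come
# from w3lib and the standard library, as in the method above).
from urllib.parse import urljoin, urlparse
from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string

href = '\n  ./article?id=42#intro  '
href = strip_html5_whitespace(href)                     # drop HTML5 whitespace
href = safe_url_string(href)                            # percent-escape unsafe characters
href = urljoin('https://example.com/index.html', href)  # resolve against the page
href = canonicalize_url(href, keep_fragments=True)      # normalise, keep the #fragment
print(href)  # -> 'https://example.com/article?id=42#intro'
if urlparse(href).netloc:
    print('kept')  # only absolute URLs with a host would be collected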
def parse_feed(self, response: TextResponse):
    """ Parse a feed XML. """
    if not isinstance(response, TextResponse):
        self.logger.warning('Invalid Feed response: %s', response)
        self.crawler.stats.inc_value('error/invalid_feed_response')
        return

    feed = feedparser.parse(response.text)
    if not feed:
        self.crawler.stats.inc_value('error/rss_initially_empty')
        return

    seen = set()
    for entry in feed.get('entries', []):
        url = strip_html5_whitespace(entry.get('link'))
        if not is_valid_url(url):
            self.logger.warning('Ignoring invalid article URL: %s', url)
            continue
        if url not in seen:
            seen.add(url)

    if not seen:
        self.crawler.stats.inc_value('error/rss_finally_empty')
        return

    self.logger.info('Links extracted from <%s> feed = %d', response.url, len(seen))
    source_url = response.meta['source_url']
    feed_url = response.url
    for url in seen:
        self.crawler.stats.inc_value('links/rss')
        yield self.make_extract_request(url,
                                        meta={'source_url': source_url,
                                              'feed_url': feed_url,
                                              'dont_filter': self.dont_filter},
                                        check_page_type=False)
def _extract_links(self, selector, response_url, response_encoding, base_url):
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            if self.strip:
                attr_val = strip_html5_whitespace(attr_val)
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.process_attr(attr_val)
            if url is None:
                continue
        url = to_native_str(url, encoding=response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(el) or u'',
                    nofollow=rel_has_nofollow(el.get('rel')))
        links.append(link)
    return self._deduplicate_if_needed(links)
def parse_page(self, response):
    """ Parse the spider response. """
    if not isinstance(response, TextResponse):
        return
    response_url = strip_html5_whitespace(response.url)

    # Try to parse the AutoExtract response (if available) and return the correct Item
    if not self.only_discovery:
        if is_autoextract_request(response):
            yield from self.parse_item(response)
    else:
        # For discovery-only mode, return only the URLs
        item = {'url': response_url}
        item['scraped_at'] = utc_iso_date()
        if response.meta.get('source_url'):
            item['source_url'] = response.meta['source_url']
        if response.meta.get('link_text'):
            item['link_text'] = response.meta['link_text'].strip()
        yield item

    # Cycle and follow links
    # Currently AutoExtract responses don't contain the full page HTML,
    # so there are no links and nothing to follow
    if response.body:
        for request in self._requests_to_follow(response):
            yield crawlera_session.init_request(request)
    elif is_autoextract_request(response):
        # Make another request to fetch the full page HTML
        # Risk of being banned
        self.crawler.stats.inc_value('x_request/discovery')
        request = Request(response_url,
                          meta={'source_url': response.meta['source_url']},
                          callback=self.main_callback,
                          errback=self.main_errback,
                          dont_filter=True)
        yield crawlera_session.init_request(request)
def format_date(text: str):
    return parse(strip_html5_whitespace(text))
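# A minimal sketch (an assumption, not from the original module): the bare
# `parse` call above is taken here to be dateutil.parser.parse, which is what
# this example imports.
from dateutil.parser import parse
from w3lib.html import strip_html5_whitespace

print(format_date('\t 2021-03-04T10:15:00Z \n'))
# -> datetime.datetime(2021, 3, 4, 10, 15, tzinfo=tzutc())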
def get_absolute_url(relative_url, base_url):
    try:
        url = strip_html5_whitespace(relative_url)
        return urljoin(base_url, url)
    except ValueError:
        return None
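# A brief usage sketch (assumption: urljoin from urllib.parse and
# strip_html5_whitespace from w3lib.html, as the function body implies).
from urllib.parse import urljoin
from w3lib.html import strip_html5_whitespace

print(get_absolute_url('  /docs/latest/ ', 'https://example.com/base/'))
# -> 'https://example.com/docs/latest/'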
def _extract_link_dicts(selector: Selector, base_url: str, only_urls: bool = False):
    """ Extract dicts with link information::

        {
            'url': '<absolute URL>',
            'attrs': {'<attribute name>': '<value>', ...},
            'inside_text': '<text inside link>',
            # 'before_text': '<text preceding this link>',
        }

    If only_urls is true, extract only links as strings.

    Note that the ``base_url`` argument must contain the page base URL, which
    can be different from the page URL. Use w3lib.html.get_base_url to get it::

        from w3lib.html import get_base_url
        base_url = get_base_url(html[:4096], page_url)
        links = list(extract_link_dicts(Selector(html), base_url))

    If you're using Scrapy and a Response object is available, then
    scrapy.utils.response.get_base_url should be faster::

        from scrapy.utils.response import get_base_url
        base_url = get_base_url(response)
        links = list(extract_link_dicts(response.selector, base_url))
    """
    selector.remove_namespaces()

    for a in selector.xpath('//a'):
        link = {}  # type: Dict

        attrs = a.root.attrib
        if 'href' not in attrs:
            continue

        href = strip_html5_whitespace(attrs['href'])
        if 'mailto:' in href:
            continue

        js_link = extract_js_link(href)
        if js_link:
            href = js_link
            link['js'] = True

        if href.startswith(('tel:', 'skype:', 'fb:', 'javascript:')):
            continue

        url = urljoin(base_url, href)
        if url_has_any_extension(url, _IGNORED):
            continue

        if only_urls:
            yield url
        else:
            link['url'] = url
            link['attrs'] = dict(attrs)

            link_text = a.xpath('normalize-space()').extract_first(default='')
            img_link_text = a.xpath('./img/@alt').extract_first(default='')
            link['inside_text'] = ' '.join([link_text, img_link_text]).strip()
            # TODO: fix before_text and add after_text
            # link['before_text'] = a.xpath('./preceding::text()[1]').extract_first(default='').strip()[-100:]

            yield link
def parse_ad(self, response):
    il = AdLoader(item=Ad(), response=response)

    reserved = strip_html5_whitespace(
        response.xpath(
            '/html/body/div[4]/div/section/div/div[2]/main/aside/div[1]/div/div/text()'
        ).get())
    if reserved == self.properties_name['reserved']:
        self.offset = 1
    else:
        self.offset = 0

    first_div = '/html/body/div[4]/div/section/div/div[2]/main/aside/div[%d]' % (
        1 + self.offset)
    second_div = '/html/body/div[4]/div/section/div/div[2]/main/aside/div[%d]' % (
        2 + self.offset)

    # Scraping the properties of the announce
    property_loader = il.nested_xpath(first_div + '/div[1]/div[2]')
    for div in range(1, 9):
        current_property = property_loader.get_xpath(
            './/div[%d]/div[1]/text()' % div)
        if current_property == []:
            break
        elif current_property[0].find(self.properties_name['brand']) != -1:
            property_loader.add_xpath(
                'brand', './/div[%d]/div[2]/a/span/text()' % div)
        elif current_property[0].find(self.properties_name['size']) != -1:
            property_loader.add_xpath('size', './/div[%d]/div[2]/text()' % div)
        elif current_property[0].find(self.properties_name['condition']) != -1:
            property_loader.add_xpath('condition',
                                      './/div[%d]/div[2]/text()' % div)
        elif current_property[0].find(self.properties_name['color']) != -1:
            property_loader.add_xpath('color', './/div[%d]/div[2]/text()' % div)
        elif current_property[0].find(self.properties_name['location']) != -1:
            location = property_loader.get_xpath(
                './/div[%d]/div[2]/text()' % div)[0]
            location = strip_html5_whitespace(location).split(',')
            if len(location) == 2:
                property_loader.add_value('city', location[0])
                property_loader.add_value('country', location[1])
            else:
                property_loader.add_value('city', None)
                property_loader.add_value('country', location[0])
        elif current_property[0].find(self.properties_name['views']) != -1:
            property_loader.add_xpath('views', './/div[%d]/div[2]/text()' % div)
        elif current_property[0].find(self.properties_name['interested']) != -1:
            property_loader.add_xpath('interested',
                                      './/div[%d]/div[2]/text()' % div,
                                      re=r'\d+')
        elif current_property[0].find(
                self.properties_name['uploadedDatetime']) != -1:
            property_loader.add_xpath(
                'uploadedDatetime', './/div[%d]/div[2]/time/@datetime' % div)

    il.add_xpath('price',
                 first_div + '/div[1]/div[1]/div[1]/span/div/text()',
                 re=r'\d+,\d+')

    # Scraping title and description
    description = response.xpath(first_div + '/div[2]/script/text()').get()
    description = json.loads(description)
    il.add_value('title', description['content']['title'])
    il.add_value('description', description['content']['description'])
    il.add_value('itemId', description['itemId'])

    # Scraping user information
    user_loader = il.nested_xpath(second_div)
    user_url = user_loader.get_xpath('.//div/a/@href')[0]
    user_loader.add_value('userId', user_url.split('/')[2])
    user_loader.add_xpath('userName',
                          './/div[1]/div[2]/div[1]/h4/span/span/a/text()')
    user_loader.add_xpath('lastSeen',
                          './/div[1]/div[2]/div[3]/div/span/time/@datetime')

    # Scraping ratings information
    ratings_loader = user_loader.nested_xpath('.//div[1]/div[2]/div[1]/a/div')
    nbRating = ratings_loader.get_xpath('.//div[6]/div/text()')
    if nbRating == []:
        ratings_loader.add_value('nbRating', 0)
    else:
        ratings_loader.add_value('nbRating', nbRating[0])

    # Counting the number of stars
    rate = 0
    for i in range(1, 6):
        star = ratings_loader.get_xpath('.//div[%d]/@class' % i)[0]
        if star == 'c-rating__star c-rating__star--full':
            rate = rate + 1
        elif star == 'c-rating__star c-rating__star--half-full':
            rate = rate + 0.5
            break
        else:
            break
    ratings_loader.add_value('rate', rate)

    il.add_value('url', response.request.url)

    # Scraping images
    if self.download_images == 'True':
        il.add_xpath(
            'image_urls',
            '/html/body/div[4]/div/section/div/div[2]/main/div/section/div/figure/a/@href'
        )

    yield il.load_item()